diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 39480995..4dda9426 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -16,6 +16,8 @@ // For web display "ghcr.io/devcontainers/features/node:1": {}, // For scripting - "ghcr.io/va-h/devcontainers-features/uv:1": {} + "ghcr.io/va-h/devcontainers-features/uv:1": {}, + // For paxtools + "ghcr.io/devcontainers/features/java:1": {} } } diff --git a/cache/__init__.py b/cache/__init__.py index 2f15fe4d..900e94cf 100644 --- a/cache/__init__.py +++ b/cache/__init__.py @@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False): Path(output).unlink(missing_ok=True) - # Re-download if the directive has expired. + # Re-download if the directive has expired / the artifact mysteriously disappeared. cache_item = get_cache_item(directive) - if has_expired(directive): + if has_expired(directive) or not (artifacts_dir / artifact_name).exists(): (artifacts_dir / artifact_name).unlink(missing_ok=True) cache_item.download(artifacts_dir / artifact_name) diff --git a/cache/directory.py b/cache/directory.py index b308f1b5..70f0a827 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -2,26 +2,61 @@ from typing import Union from os import PathLike from tempfile import NamedTemporaryFile -from typing import Optional -import urllib.request +from typing import Optional, Mapping import filecmp -import urllib.parse -import os from pathlib import Path +from enum import Enum +import warnings +import requests +import shutil +import urllib.parse import gdown -dir_path = Path(os.path.dirname(os.path.realpath(__file__))) +dir_path = Path(__file__).parent.resolve() + +@dataclass +class Service: + url: str + headers: Optional[Mapping[str, str]] = None + def download(self, output: str | PathLike) -> requests.Response: + """ + Downloads a URL, returning the response (to be used with `with`) and modifying the output path. + """ + # As per https://stackoverflow.com/a/39217788/7589775 to enable download streaming. + with requests.get(self.url, stream=True, headers=self.headers) as response: + response.raw.decode_content = True + with open(output, 'wb') as f: + shutil.copyfileobj(response.raw, f) + return response -def fetch_biomart_url(xml: str) -> str: +def fetch_biomart_service(xml: str) -> Service: """ Access BioMart data through the BioMart REST API: https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml """ ROOT = "http://www.ensembl.org/biomart/martservice?query=" - return ROOT + urllib.parse.quote_plus(xml) + return Service(ROOT + urllib.parse.quote_plus(xml)) +class OnlineStatus(Enum): + ONLINE = 1 + """ + Services that are always online. If these fail, we fail the workflow and + log this. + """ + + INTERMITTENT_ERROR_CODE = 2 + """ + Services that error often (not go down!) + these will be logged when they fail, but we continue with the cached option. + """ + + # (we choose to do this over arbitrary lambdas because its nicer. For now.) + INTERMITTENT_HTML = 3 + """ + Like INTERMITTENT_ERROR_CODE, but errors when HTML is returned. 
+ """ @dataclass class CacheItem: @@ -35,41 +70,46 @@ class CacheItem: name: str """The display name of the artifact, used for human-printing.""" cached: str - online: str - online_headers: Optional[list[tuple[str, str]]] = None + online: Optional[Service] = None + status: OnlineStatus = OnlineStatus.ONLINE + """How much to care about errors from downloading the online file.""" @classmethod + @warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.") def cache_only(cls, name: str, cached: str) -> "CacheItem": """Wrapper method to explicitly declare a CacheItem as cached only.""" - return cls(name=name, online=cached, cached="") - - def download_online(self, output: str | PathLike): - # https://stackoverflow.com/a/45313194/7589775: this is to add optional headers to requests. - # We remove the opener at the end by re-installing the default opener. - opener = urllib.request.build_opener() - if self.online_headers: - opener.addheaders = self.online_headers - urllib.request.install_opener(opener) - urllib.request.urlretrieve(self.online, output) - urllib.request.install_opener(urllib.request.build_opener()) + return cls(name=name, cached=cached, online=None) def download(self, output: str | PathLike): print(f"Fetching {self.name}...") - print(f"Downloading {self.online}...") - - if self.cached == "": - # From CacheItem.cached_only - # (gdown doesn't take in Paths for the output_file, so we must stringify it here) - gdown.download(self.online, str(output)) - return - - self.download_online(output) with NamedTemporaryFile() as cached_file: print(f"Downloading cache {self.cached}...") gdown.download(self.cached, cached_file) - print("Checking that downloaded artifact matches with cached artifact...") - filecmp.cmp(output, cached_file.name) + + if self.online is None: + return + + print(f"Downloading {self.online}...") + with self.online.download(output) as response: + + print("Checking that downloaded artifact matches with cached artifact...") + if filecmp.cmp(output, cached_file.name): + return # It does! + + # For debug purposes, we allow the output artifact to be viewed in some kind of temporary folder. + debug_file_path = Path(NamedTemporaryFile(prefix="spras-benchmarking-debug-artifact", delete=False).name) + # (and we pedantically use this over Path#rename since temporary directories can be mounted to a different file system.) + shutil.move(output, debug_file_path) + if (self.status == OnlineStatus.INTERMITTENT_ERROR_CODE and not response.ok) \ + or (self.status == OnlineStatus.INTERMITTENT_HTML and Path(debug_file_path).read_text().strip().startswith("")): + warnings.warn(f"Online url {self.online} erroring with status code {response.status_code}. " \ + f"See {debug_file_path} for the online output. Using the cached file instead...") + # Back up to the cached_file + shutil.move(cached_file.name, output) + else: + raise RuntimeError(f"Cached and online files did not match with status code {response.status_code}! 
" \ + f"See {debug_file_path} for the online output.") CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] @@ -78,15 +118,15 @@ def download(self, output: str | PathLike): directory: CacheDirectory = { "STRING": { "9606": { - "9606.protein.links.txt.gz": CacheItem( - name="STRING 9606 protein links", - cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj", - online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", + "9606.protein.links.full.txt.gz": CacheItem( + name="STRING 9606 full links", + cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", + online=Service("http://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz"), ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", - online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", + online=Service("https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz"), ), } }, @@ -98,19 +138,19 @@ def download(self, output: str | PathLike): "SwissProt_9606.tsv": CacheItem( name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", + online=Service("https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"), ), # idmapping FTP files. 
See the associated README: # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"), ), "HUMAN_9606_idmapping.dat.gz": CacheItem( name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"), ), } }, @@ -120,56 +160,56 @@ def download(self, output: str | PathLike): "tiga_gene-trait_stats.tsv": CacheItem( name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", - online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", + online=Service("https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv"), ), "HumanDO.tsv": CacheItem( name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", - online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv", + online=Service("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv"), ), "human_disease_textmining_filtered.tsv": CacheItem( name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", - online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_textmining_filtered.tsv"), ), "human_disease_knowledge_filtered.tsv": CacheItem( name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", - online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_knowledge_filtered.tsv"), ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()), + online=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"), ), 
"CRISPRGeneDependency.csv": CacheItem( name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"), ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"), ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"), ), "OmicsCNGeneWGS.csv": CacheItem( name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"), ), }, "iRefIndex": { @@ -190,30 +230,166 @@ def download(self, output: str | PathLike): # The following files are from https://github.com/gitter-lab/osmotic-stress "prizes.txt": CacheItem( name="Osmotic Stress Prizes", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt"), cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg" ), "ChasmanNetwork-DirUndir.txt": CacheItem( name="Network Input", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt"), cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH" ), "dummy.txt": CacheItem( name="Dummy Nodes File", - 
online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt"), cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU" ), "_edgeFreq.eda ": CacheItem( name="Case Study Omics Integrator Edge Frequencies", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda"), cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR" ), "goldStandardUnionDetailed.txt": CacheItem( name="Gold Standard Reference Pathways", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt"), cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" ), }, + "Surfaceome": { + "table_S3_surfaceome.xlsx": CacheItem( + name="Human surfaceome", + online=Service("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx"), + cached="https://docs.google.com/uc?id=1cBXYbDnAJVet0lv3BRrizV5FuqfMbBr0" + ) + }, + "TranscriptionFactors": { + "Homo_sapiens_TF.tsv": CacheItem.cache_only( + name="Human transcription factors", + # This server has anti-bot protection, so to respect their wishes, we don't download from the server. + # The original URL is https://guolab.wchscu.cn/AnimalTFDB4_static/download/TF_list_final/Homo_sapiens_TF, + # which is accessible from https://guolab.wchscu.cn/AnimalTFDB4//#/Download -> Homo sapiens + # (also under the Internet Archive as of Feb 2nd, 2026. If the original artifact disappears, the drive link below should suffice.) + cached="https://drive.google.com/uc?id=1fVi18GpudUlquRPHgUJl3H1jy54gO-uz", + ) + }, + "PathwayCommons": { + # TODO: all of these share the same common URL: can we make this API a little nicer? 
+ "PANTHER": { + "Apoptosis_signaling_pathway.txt": CacheItem( + name="Apoptosis Signaling Pathway", + cached="https://drive.google.com/uc?id=1BPcnvqHrGMQeX4oQx2ow3OribgPxzwhG", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00006"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "B_cell_activation.txt": CacheItem( + name="B cell activation", + cached="https://drive.google.com/uc?id=1iWcb5AfdobGncRB6xQ6T5qunXzb6Gxd-", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00010"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Beta3_adrenergic_receptor_signaling_pathway.txt": CacheItem( + name="Beta3_adrenergic_receptor_signaling_pathway", + cached="https://drive.google.com/uc?id=1jrJzrDvhDAs818wYjQ_dm1irOz8Bv4lk", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP04379"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Cadherin_signaling_pathway.txt": CacheItem( + name="Cadherin signaling pathway", + cached="https://drive.google.com/uc?id=14Of-6mwIpul_QciyJ-Xb9f7t-IrVcIna", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00012"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Fas_signaling_pathway.txt": CacheItem( + name="FAS signaling_pathway", + cached="https://drive.google.com/uc?id=121cHJf0ZtglQHvy9xuEpYSjwBbJV9Fju", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00020"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "FGF_signaling_pathway.txt": CacheItem( + name="FGF signaling pathway", + cached="https://drive.google.com/uc?id=1PIiWK1-ImXE1YHdDh1hGUVB01Ye8brQg", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00021"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Hedgehog_signaling_pathway.txt": CacheItem( + name="Hedgehog signaling pathway", + cached="https://drive.google.com/uc?id=1i7HKn4nlJQcaXUDXpbpDFBxbkBXZC0xQ", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00025"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Insulin_IGF_pathway_protein_kinase_B_signaling_cascade.txt": CacheItem( + name="Insulin/IGF pathway-protein kinase B signaling cascade", + cached="https://drive.google.com/uc?id=1Xkxcm0ngrE8otau9ccyPeCg7KZUdhJf7", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00033"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Interferon_gamma_signaling_pathway.txt": CacheItem( + name="Interferon-gamma signaling pathway", + cached="https://drive.google.com/uc?id=1aPqi0A5ZIOA5kKELVUI_NvC8taiHll5z", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00035"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Interleukin_signaling_pathway.txt": CacheItem( + name="Interleukin signaling pathway", + cached="https://drive.google.com/uc?id=1IOv14pRJ8aN9LRnkZ4BQXf3QGUAashku", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00036"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "JAK_STAT_signaling_pathway.txt": CacheItem( + name="JAK/STAT signaling pathway", 
+ cached="https://drive.google.com/uc?id=1QzMEMUZzeoxUYZZRGcm6Al_HzH6pmwED", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00038"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Nicotinic_acetylcholine_receptor_signaling_pathway.txt": CacheItem( + name="Nicotinic acetylcholine receptor signaling pathway", + cached="https://drive.google.com/uc?id=1SdnKr4TthfmZWgMA_FOlTmf-EEpNsdzx", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00044"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Notch_signaling_pathway.txt": CacheItem( + name="Notch signaling pathway", + cached="https://drive.google.com/uc?id=1qfyxuc1EomOKGRyI7QyQ7LUUhLPZytz5", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00045"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "PDGF_signaling_pathway.txt": CacheItem( + name="PDGF signaling pathway", + cached="https://drive.google.com/uc?id=1A9hl340XKnZeNfd3hiiX7lxOVV94lQ5s", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00047"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Ras_pathway.txt": CacheItem( + name="Ras pathway", + cached="https://drive.google.com/uc?id=1wNizL5wDh48E5YxHcZjURa9UeKMONrgr", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP04393"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "T_cell_activation.txt": CacheItem( + name="T cell activation", + cached="https://drive.google.com/uc?id=1t5G_jN8QSOiVceQGAmKvbYebkV1G5oJy", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00053"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Toll_receptor_signaling_pathway.txt": CacheItem( + name="Toll receptor signaling pathway", + cached="https://drive.google.com/uc?id=1nFix8mMvuU_Vu9tExwgaS279nynqM_Oo", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00054"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "VEGF_signaling_pathway.txt": CacheItem( + name="VEGF signaling pathway", + cached="https://drive.google.com/uc?id=1W1G0TmA6-JLF9pIZD0TR4w95IwG2IALs", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00056"), + status=OnlineStatus.INTERMITTENT_HTML + ), + "Wnt_signaling_pathway.txt": CacheItem( + name="Wnt signaling pathway", + cached="https://drive.google.com/uc?id=1diaacbik5hcA9Fo7vMXFAP_wXRe0xCLB", + online=Service("https://www.pathwaycommons.org/pc2/get?format=TXT&uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00057"), + status=OnlineStatus.INTERMITTENT_HTML + ), + } + } } diff --git a/datasets/README.md b/datasets/README.md index a53730c9..c26dc862 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -11,3 +11,8 @@ Many of the datasets here have been stripped of their extra post-analysis. 
Here, - [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases) - [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap) - [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress) + +## `explore` folders + +To motivate certain decisions made in-code, such as `synthetic-data`'s PANTHER pathway choices, we provide scripts that use live data +to assist in data curation. These folders can also contain exploratory CLIs for motivating e.g. magic constants. diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index 0455b57a..93de50d5 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -13,7 +13,7 @@ produce_fetch_rules({ "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"], "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"], "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"], - "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True), "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True), }) @@ -42,7 +42,7 @@ rule files: input: "data/inputs.csv", "data/gold_standard.csv", - "raw/9606.protein.links.txt" + "raw/9606.protein.links.full.txt" output: # These are the two we use for the SPRAS run for now "GS_files/Alopecia_areata_GS.txt", diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index dc5a949b..f8704461 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -42,7 +42,7 @@ def main(): # See /cache/directory.py for information on how this was grabbed. # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None) # Threshold anything above a confidence score of 900 to trim down the background interactome string = string[string.iloc[:, 2] > 900] diff --git a/datasets/synthetic-data/.gitignore b/datasets/synthetic-data/.gitignore new file mode 100644 index 00000000..ca5e16da --- /dev/null +++ b/datasets/synthetic-data/.gitignore @@ -0,0 +1,3 @@ +intermediate +processed +raw \ No newline at end of file diff --git a/datasets/synthetic-data/README.md b/datasets/synthetic-data/README.md new file mode 100644 index 00000000..1d1bf1b4 --- /dev/null +++ b/datasets/synthetic-data/README.md @@ -0,0 +1,67 @@ +# Synthetic Data + +## Download STRING Human Interactome +1. Download the STRING *Homo sapiens* `9606.protein.links.full.v12.0.txt.gz` database file from [STRING](https://string-db.org/cgi/download?sessionId=bL9sRTdIaUEt&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt). +2. Move the downloaded file into the `raw/human-interactome/` folder. +3. From the `raw/synthetic-data/` directory, extract the file using: + + ```sh + gunzip human-interactome/9606.protein.links.full.v12.0.txt.gz + ``` + +## Download New PANTHER Pathways +1. Visit [Pathway Commons](https://www.pathwaycommons.org/). +2. 
Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.
+   Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
+3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
+4. In the `raw/pathway-data/` folder, create a new subfolder named after the pathway you downloaded.
+5. Move the downloaded Extended SIF file to this new folder (as a `.txt` file). Rename the file to match the subfolder name exactly.
+
+## Sources and Targets
+
+[Sources](http://wlab.ethz.ch/surfaceome/), or `table_S3_surfaceome.xlsx`, (see [original paper](https://doi.org/10.1073/pnas.1808790115))
+are receptors from the in silico human surfaceome.
+
+[Targets](https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv`, (see [original paper](https://doi.org/10.1093/nar/gkac907))
+are human transcription factors.
+
+## Steps to Generate SPRAS-Compatible Pathways
+
+This entire workflow can also be done with `uv run snakemake --cores 1` inside this directory.
+
+### 1. Process PANTHER Pathways
+
+1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
+2. Run the command:
+   ```sh
+   uv run scripts/process_panther_pathway.py
+   ```
+3. This will create five new files in the respective `pathway` subfolder of the `pathway-data/` directory:
+- `edges.txt`
+- `nodes.txt`
+- `prizes-100.txt`
+- `sources.txt`
+- `targets.txt`
+
+### 2. Convert Pathways to SPRAS-Compatible Format
+1. In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
+2. From the `synthetic-data/` directory, run the command:
+```
+python scripts/panther_spras_formatting.py
+```
+3. This will create a new folder named `spras-compatible-pathway-data`, containing subfolders for each PANTHER pathway in SPRAS-compatible format.
+Each subfolder will include the following three files:
+- `<pathway>_gs_edges.txt`
+- `<pathway>_gs_nodes.txt`
+- `<pathway>_node_prizes.txt`
+
+# Pilot Data
+For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
+- the list in `combine.py`
+- the list in `overlap_analytics.py`
+
+Make sure these pathways are also added to:
+- the `pathways` vector in `ProcessPantherPathway.R`
+- the list in `panther_spras_formatting.py`
+
+**Once you’ve updated the pathway lists in all relevant scripts, run all the steps above to generate the Pilot dataset.**
diff --git a/datasets/synthetic-data/Snakefile b/datasets/synthetic-data/Snakefile
new file mode 100644
index 00000000..88bbe33c
--- /dev/null
+++ b/datasets/synthetic-data/Snakefile
@@ -0,0 +1,91 @@
+include: "../../cache/Snakefile"
+
+pathways = [
+    "Apoptosis_signaling_pathway",
+    "B_cell_activation",
+    "Beta3_adrenergic_receptor_signaling_pathway",
+    "Cadherin_signaling_pathway",
+    "Fas_signaling_pathway",
+    "FGF_signaling_pathway",
+    "Hedgehog_signaling_pathway",
+    "Insulin_IGF_pathway_protein_kinase_B_signaling_cascade",
+    "Interferon_gamma_signaling_pathway",
+    "Interleukin_signaling_pathway",
+    "JAK_STAT_signaling_pathway",
+    "Nicotinic_acetylcholine_receptor_signaling_pathway",
+    "Notch_signaling_pathway",
+    "PDGF_signaling_pathway",
+    "Ras_pathway",
+    "T_cell_activation",
+    "Toll_receptor_signaling_pathway",
+    "VEGF_signaling_pathway",
+    "Wnt_signaling_pathway",
+]
+
+# TODO: deduplicate from sampling.py
+thresholds = list(map(str, map(lambda x: (x + 1) / 10, range(10))))
+
+rule all:
+    input:
+        "raw/9606.protein.links.full.v12.0.txt",
+        expand([
+            "thresholded/{threshold}/{pathway}/interactome.txt",
+            "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
+        ], pathway=pathways, threshold=thresholds)
+
+produce_fetch_rules({
+    **{
+        "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
+        "raw/human-interactome/table_S3_surfaceome.xlsx": ["Surfaceome", "table_S3_surfaceome.xlsx"],
+        "raw/human-interactome/Homo_sapiens_TF.tsv": ["TranscriptionFactors", "Homo_sapiens_TF.tsv"],
+        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
+    },
+    # See directory.py for the online/cached location of all pathways from PathwayCommons.
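+    # (For instance, the comprehension below expands "Wnt_signaling_pathway" into
+    #  "raw/pathway-data/Wnt_signaling_pathway.txt": ["PathwayCommons", "PANTHER", "Wnt_signaling_pathway.txt"].)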
+ **{f"raw/pathway-data/{k}.txt": ["PathwayCommons", "PANTHER", f"{k}.txt"] for k in pathways} +}) +rule process_tfs: + input: + "raw/human-interactome/Homo_sapiens_TF.tsv", + "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv" + output: + "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv" + shell: + "uv run scripts/map_transcription_factors.py" + +rule process_panther_pathway: + input: + "raw/pathway-data/{pathway}.txt", + "raw/human-interactome/table_S3_surfaceome.xlsx", + "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv" + output: + "intermediate/{pathway}/edges.txt", + "intermediate/{pathway}/nodes.txt", + "intermediate/{pathway}/sources.txt", + "intermediate/{pathway}/targets.txt", + "intermediate/{pathway}/prizes.txt" + shell: + "uv run scripts/process_panther_pathway.py {wildcards.pathway}" + +rule make_spras_compatible: + input: + "intermediate/{pathway}/edges.txt", + "intermediate/{pathway}/nodes.txt", + "intermediate/{pathway}/sources.txt", + "intermediate/{pathway}/targets.txt", + "intermediate/{pathway}/prizes.txt" + output: + "processed/{pathway}/{pathway}_node_prizes.txt", + "processed/{pathway}/{pathway}_gs_edges.txt", + "processed/{pathway}/{pathway}_gs_nodes.txt" + shell: + "uv run scripts/panther_spras_formatting.py {wildcards.pathway}" + +rule threshold: + input: + "processed/{pathway}/{pathway}_node_prizes.txt", + "processed/{pathway}/{pathway}_gs_edges.txt" + output: + expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds), + expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds) + shell: + "uv run scripts/sampling.py {wildcards.pathway}" diff --git a/datasets/synthetic-data/explore/README.md b/datasets/synthetic-data/explore/README.md new file mode 100644 index 00000000..50faeeaa --- /dev/null +++ b/datasets/synthetic-data/explore/README.md @@ -0,0 +1,5 @@ +# explore + +See [the datasets readme](../../README.md) for the motivation for the `explore` folder. + +This folder contains `candidates.py`, which is a CLI for finding all viable pathways with our custom filtering criteria. diff --git a/datasets/synthetic-data/explore/candidates.py b/datasets/synthetic-data/explore/candidates.py new file mode 100644 index 00000000..3d96ccb4 --- /dev/null +++ b/datasets/synthetic-data/explore/candidates.py @@ -0,0 +1,64 @@ +""" +Utility CLI for finding pathway critetia from PathwayCommons based on our desired participant count. +This is meant to be interactive for easily examining the available pathways from PathwayCommons over PANTHER +(and perhaps more later!). + +See https://www.pathwaycommons.org/pc2/swagger-ui/index.html#/api-controller-v-2 for the API. +""" + +import requests + +from pydantic import BaseModel + +SEARCH_URL = "https://www.pathwaycommons.org/pc2/v2/search" + +# These schemas were manually examined from the API response, and are thus not exhaustive. 
+class SearchHit(BaseModel): + uri: str + name: str + biopaxClass: str + numParticipants: int + numProcesses: int + +class SearchResponse(BaseModel): + numHits: int + maxHitsPerPage: int + searchHit: list[SearchHit] + +def request(page: int) -> SearchResponse: + return SearchResponse.model_validate(requests.post( + 'https://www.pathwaycommons.org/pc2/v2/search', + headers={ + 'accept': 'application/json', + 'Content-Type': 'application/json', + }, json={ + # Indicates a BioPAX pathway + 'q': 'xrefid:P*', + 'type': 'pathway', + 'organism': [ + '9606', + ], + 'datasource': [ + 'panther', + ], + 'page': page, + } + ).json()) + +def main(): + # TODO: weirdly constructed loop? could be nicer if we use numHits and maxHitsPerPage + hits: list[SearchHit] = [] + page = 0 + response = request(page) + print(f"Paginating {page}...") + while len(response.searchHit) != 0: + hits.extend(response.searchHit) + page += 1 + response = request(page) + print(f"Paginating {page}...") + + for hit in hits: + print(f"({hit.numParticipants}) {hit.name}") + +if __name__ == "__main__": + main() diff --git a/datasets/synthetic-data/scripts/interactome.py b/datasets/synthetic-data/scripts/interactome.py new file mode 100644 index 00000000..ac8bc399 --- /dev/null +++ b/datasets/synthetic-data/scripts/interactome.py @@ -0,0 +1,81 @@ +import pandas +from pathlib import Path + +current_directory = Path(__file__).parent.resolve() + + +def main(): + # Convert the interactome to SPRAS format + print("Reading interactome...") + interactome_df = pandas.read_csv( + current_directory / ".." / "raw" / "9606.protein.links.full.v12.0.txt", sep=" ", usecols=["protein1", "protein2", "combined_score"] + ) + interactome_df.columns = ["Protein1", "Protein2", "Weight"] + + # We also want to representatively remove a certain percentage of elements from the interactome, + # to make sure our interactome downsampling preserves edge weight distributions + # (we don't care to preserve other major topological properties just yet.) + # since this file is large, we opt for streaming the interactome for removing edges instead + + print("Initially processing interactome...") + interactome_df["Weight"] = interactome_df["Weight"].div(1000) # scores are from 1-1000: we normalize from 0-1. + interactome_df["Direction"] = "U" + print("Sorting interactome...") + interactome_df = interactome_df.sort_values("Weight", kind="stable") + + print("Mapping interactome...") + # STRINGDB -> UniProt accession ID pairings + UniProt_AC = pandas.read_csv(current_directory / ".." 
/ "raw" / "human-interactome" / "String_to_Uniprot_ids_2025_04_06.tsv", sep="\t", header=0) + one_to_many_dict = UniProt_AC.groupby("From")["Entry"].apply(list).to_dict() + + def get_aliases(protein_id): + return one_to_many_dict.get(protein_id, []) + + interactome_df["Protein1_uniprot"] = interactome_df["Protein1"].apply(get_aliases) + interactome_df["Protein2_uniprot"] = interactome_df["Protein2"].apply(get_aliases) + + interactome_df = interactome_df.explode("Protein1_uniprot").explode("Protein2_uniprot") + + missing_alias_edges = interactome_df[(interactome_df["Protein1_uniprot"].isna()) | (interactome_df["Protein2_uniprot"].isna())] + + proteins_without_aliases = ( + pandas.concat( + [ + missing_alias_edges.loc[missing_alias_edges["Protein1_uniprot"].isna(), "Protein1"], + missing_alias_edges.loc[missing_alias_edges["Protein2_uniprot"].isna(), "Protein2"], + ], + ignore_index=True, + ) + .drop_duplicates() + .reset_index(drop=True) + ) + proteins_without_aliases = proteins_without_aliases.to_frame(name="protein") + + removed_edges = missing_alias_edges[["Protein1", "Protein2"]] + removed_edges = removed_edges.drop_duplicates().reset_index(drop=True) + + (current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes").mkdir(exist_ok=True, parents=True) + proteins_without_aliases.to_csv( + current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "proteins_missing_aliases.csv", + sep="\t", + index=False, + header=True, + ) + removed_edges.to_csv( + current_directory / ".." / "processed" / "interactomes" / "uniprot-threshold-interactomes" / "removed_edges.txt", + sep="\t", + index=False, + header=True, + ) + interactome_df = interactome_df.dropna(subset=["Protein1_uniprot", "Protein2_uniprot"]).reset_index(drop=True) + interactome_df = interactome_df[["Protein1_uniprot", "Protein2_uniprot", "Weight", "Direction"]] + + print("Counting weight counts...") + interactome_df["Weight"].value_counts(sort=False).to_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t") + + print("Saving interactome...") + interactome_df.to_csv(current_directory / ".." / "processed" / "interactome.tsv", sep="\t", header=False, index=False) + + +if __name__ == "__main__": + main() diff --git a/datasets/synthetic-data/scripts/map_transcription_factors.py b/datasets/synthetic-data/scripts/map_transcription_factors.py new file mode 100644 index 00000000..31741bc8 --- /dev/null +++ b/datasets/synthetic-data/scripts/map_transcription_factors.py @@ -0,0 +1,35 @@ +import pandas +from pathlib import Path + +current_directory = Path(__file__).parent.resolve() + +interactome_folder = current_directory / ".." / "raw" / "human-interactome" + + +def main(): + tf_df = pandas.read_csv(interactome_folder / "Homo_sapiens_TF.tsv", sep="\t", header=0) + # The very powerful UniProt-provided mapping file: its Ensembl mappings are a semicolon-delimeted list of Emsembl IDs containing + # attached isoforms (and not all UniProtKB-AC identifiers have those!) so we'll need to do some extra post-processing. + idmapping_selected_df = pandas.read_csv( + interactome_folder / "HUMAN_9606_idmapping_selected.tsv", + header=None, + # See directory.py for the README associated with this mapping file. 
+ usecols=[0, 18], + names=["UniProtKB-AC", "Ensembl"], + sep="\t", + ) + idmapping_selected_df = idmapping_selected_df[idmapping_selected_df["Ensembl"].notnull()] + # Handle our ;-delimited list + idmapping_selected_df["Ensembl"] = idmapping_selected_df["Ensembl"].str.split("; ") + idmapping_selected_df = idmapping_selected_df.explode("Ensembl") + # Drop isoforms + idmapping_selected_df["Ensembl"] = idmapping_selected_df["Ensembl"].str.split(".").str[0] + + tf_df = tf_df.merge(idmapping_selected_df, on="Ensembl", how="inner") + tf_df = tf_df.explode("UniProtKB-AC") + tf_df = tf_df.fillna("NA") + tf_df.to_csv(interactome_folder / "Homo_sapiens_TF_Uniprot.tsv", header=True, sep="\t", index=False) + + +if __name__ == "__main__": + main() diff --git a/datasets/synthetic-data/scripts/panther_spras_formatting.py b/datasets/synthetic-data/scripts/panther_spras_formatting.py new file mode 100644 index 00000000..56cda99b --- /dev/null +++ b/datasets/synthetic-data/scripts/panther_spras_formatting.py @@ -0,0 +1,98 @@ +import pandas as pd +from pathlib import Path +import sys + +current_directory = Path(__file__).parent.resolve() + +spras_compatible_dir = Path(current_directory, "..", "processed") +directory = Path(current_directory, "..", "intermediate") + +directed = [ + "controls-state-change-of", + "controls-transport-of", + "controls-phosphorylation-of", + "controls-expression-of", + "catalysis-precedes", + "consumption-controlled-by", + "controls-production-of", + "controls-transport-of-chemical", + "chemical-affects", + "used-to-produce", + "consumption-controled-by", +] + +undirected = ["in-complex-with", "interacts-with", "neighbor-of", "reacts-with"] + + +def raise_unknown_direction(dir: str): + raise ValueError(f"Unknown direction {dir}") + + +def main(): + spras_compatible_dir.mkdir(exist_ok=True) + + pathway = sys.argv[1] + pathway_folder = directory / pathway + + # Create the output folder "uniprot" within the pathway directory + out_folder = spras_compatible_dir / pathway + out_folder.mkdir(exist_ok=True) + + nodes_file = pathway_folder / "nodes.txt" + nodes_df = pd.read_csv(nodes_file, sep="\t") + + # a dictionary mapping gene -> Uniprot accession ID + gene_to_uniprot = pd.Series(nodes_df["uniprot"].values, index=nodes_df["NODE"]).to_dict() + + # nodes + nodes_uniprot = nodes_df[["uniprot"]] + nodes_uniprot.to_csv(out_folder / f"{pathway}_gs_nodes.txt", sep="\t", index=False, header=False) + + # edges + edges_file = pathway_folder / "edges.txt" + edges_df = pd.read_csv(edges_file, sep="\t", header=0) + edges_df["NODE1"] = edges_df["NODE1"].map(gene_to_uniprot) + edges_df["NODE2"] = edges_df["NODE2"].map(gene_to_uniprot) + edges_df["Rank"] = 1 + edges_df["Direction"] = edges_df["INTERACTION_TYPE"].apply( + lambda x: "D" if x in directed else ("U" if x in undirected else raise_unknown_direction(x)) + ) + edges_df = edges_df.drop(columns="INTERACTION_TYPE") + + # remove duplicate rows + # sort by (node1 and node2) to ensure deterministic sorting + edges_df = edges_df.sort_values(by=["NODE1", "NODE2"], ascending=True, ignore_index=True) + undirected_mask = edges_df["Direction"] == "U" + min_nodes = edges_df.loc[undirected_mask, ["NODE1", "NODE2"]].min(axis=1) + max_nodes = edges_df.loc[undirected_mask, ["NODE1", "NODE2"]].max(axis=1) + edges_df.loc[undirected_mask, "NODE1"] = min_nodes + edges_df.loc[undirected_mask, "NODE2"] = max_nodes + + # keep 1 directed and 1 undirected edge if both exist + # since rank is 1, we don't need to sort by rank. 
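+    # (Note that "Direction" is part of the whole-row duplicate key, so a directed and an undirected
+    #  copy of the same node pair both survive drop_duplicates, while exact repeats collapse to one row.)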
+ edges_df = edges_df.sort_values(by=["NODE1", "NODE2", "Direction"], ascending=True, ignore_index=True) + edges_df = edges_df.drop_duplicates(keep="first", ignore_index=True) + + edges_df.to_csv(out_folder / f"{pathway}_gs_edges.txt", sep="\t", index=False, header=False) + + # prizes, targets, sources + prizes_file = pathway_folder / "prizes.txt" + prizes_df = pd.read_csv(prizes_file, sep="\t") + + target_file = pathway_folder / "targets.txt" + target_df = pd.read_csv(target_file, sep="\t") + + source_file = pathway_folder / "sources.txt" + source_df = pd.read_csv(source_file, sep="\t") + + # final resulting df combining all the sources, targets, and prizes + prizes_df["sources"] = prizes_df["uniprot"].isin(source_df["uniprot"]) + prizes_df["targets"] = prizes_df["uniprot"].isin(target_df["uniprot"]) + prizes_df["dummy"] = "" + prizes_df.rename(columns={"uniprot": "NODEID", "prizes": "prize"}, inplace=True) + result_df = prizes_df[["NODEID", "prize", "sources", "targets", "active", "dummy"]] + result_df.to_csv(out_folder / f"{pathway}_node_prizes.txt", sep="\t", index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/datasets/synthetic-data/scripts/process_panther_pathway.py b/datasets/synthetic-data/scripts/process_panther_pathway.py new file mode 100644 index 00000000..a7879948 --- /dev/null +++ b/datasets/synthetic-data/scripts/process_panther_pathway.py @@ -0,0 +1,82 @@ +import argparse +import io +import pandas as pd +from pathlib import Path + +current_directory = Path(__file__).parent.resolve() + +data_directory = current_directory / ".." / "raw" / "pathway-data" +interactome_folder = current_directory / ".." / "raw" / "human-interactome" + + +def process_pathway(file: Path, folder: Path): + file_content = file.read_text() + # This file has two csv files stacked on top of each other. + # This is the header that we are looking for + needle = "PARTICIPANT\tPARTICIPANT_TYPE\tPARTICIPANT_NAME\tUNIFICATION_XREF\tRELATIONSHIP_XREF" + + edges, nodes = file_content.split(needle) + # Re-add the header + nodes = needle + nodes + # https://stackoverflow.com/a/65018984/7589775 read the text + # as a file. + edges_df = pd.read_csv(io.StringIO(edges), header=0, sep="\t") + nodes_df = pd.read_csv(io.StringIO(nodes), header=0, sep="\t") + + # First, get the relevant info from the edges + edges_df = edges_df[["PARTICIPANT_A", "INTERACTION_TYPE", "PARTICIPANT_B"]] + edges_df.columns = ["NODE1", "INTERACTION_TYPE", "NODE2"] + # removing ChEBI identifiers: these aren't proteins and we therefore are not interested in them. 
+ edges_df = edges_df[~edges_df["NODE1"].str.startswith("chebi:")] + edges_df = edges_df[~edges_df["NODE2"].str.startswith("chebi:")] + + # Do the same for the nodes + nodes_df = nodes_df[["PARTICIPANT", "UNIFICATION_XREF"]] + nodes_df.columns = ["NODE", "uniprot"] + # removing the chebi: prefix + nodes_df = nodes_df[~nodes_df["NODE"].str.startswith("chebi:")] + # and remove the uniprot: prefix + nodes_df["uniprot"] = nodes_df["uniprot"].str.removeprefix("uniprot:") + + # Save edges and nodes + edges_df.to_csv(folder / "edges.txt", header=True, index=False, sep="\t") + nodes_df.to_csv(folder / "nodes.txt", header=True, index=False, sep="\t") + + # Then, we need to get the sources and targets, save them, + # and mark them with 1.0 prizes: + + # First, for our targets, or transcription factors + human_tfs = pd.read_csv(interactome_folder / "Homo_sapiens_TF_Uniprot.tsv", sep="\t") + human_tfs = nodes_df.merge(human_tfs, how="inner", left_on="uniprot", right_on="UniProtKB-AC") + human_tfs = human_tfs[["NODE", "uniprot"]] + human_tfs.to_csv(folder / "targets.txt", sep="\t", index=False) + + # Then, for our receptors. NOTE: we skip the first row since it's empty in the XLSX, so this might break if the surfaceome authors fix this. + human_receptors = pd.read_excel(interactome_folder / "table_S3_surfaceome.xlsx", sheet_name="in silico surfaceome only", skiprows=1) + human_receptors = human_receptors[["UniProt accession", "Ensembl gene", "Membranome Almen main-class"]] + human_receptors = human_receptors[human_receptors["Membranome Almen main-class"] == "Receptors"] + human_receptors = nodes_df.merge(human_receptors, how="inner", left_on="uniprot", right_on="UniProt accession") + human_receptors = human_receptors[["NODE", "uniprot"]] + human_receptors.to_csv(folder / "sources.txt", sep="\t", index=False) + + # Finally, scores + scores = pd.concat([human_tfs, human_receptors]).drop_duplicates() + scores["prizes"] = 1 + scores["active"] = "true" + scores.to_csv(folder / "prizes.txt", sep="\t", index=False) + + +def parser(): + parser = argparse.ArgumentParser(prog="PANTHER pathway parser") + + parser.add_argument("pathway", choices=[file.stem for file in data_directory.iterdir()]) + + return parser + + +if __name__ == "__main__": + pathway = parser().parse_args().pathway + pathway_file = data_directory / Path(pathway).with_suffix(".txt") + intermediate_folder = current_directory / ".." / "intermediate" / pathway + intermediate_folder.mkdir(parents=True, exist_ok=True) + process_pathway(pathway_file, intermediate_folder) diff --git a/datasets/synthetic-data/scripts/sampling.py b/datasets/synthetic-data/scripts/sampling.py new file mode 100644 index 00000000..71f52065 --- /dev/null +++ b/datasets/synthetic-data/scripts/sampling.py @@ -0,0 +1,112 @@ +import argparse +import pandas +from pathlib import Path +import collections +from typing import OrderedDict, NamedTuple +from tools.sample import attempt_sample +from tools.trim import trim_data_file + +current_directory = Path(__file__).parent.resolve() + + +# From SPRAS. 
TODO: import once SPRAS uses pixi +def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame: + mask = df["Direction"] == "U" + new_df = df[mask].copy(deep=True) + new_df["Interactor1"], new_df["Interactor2"] = new_df["Interactor2"], new_df["Interactor1"] + new_df["Direction"] = "D" + df.loc[mask, "Direction"] = "D" + df = pandas.concat([df, new_df], ignore_index=True) + return df + + +def parser(): + parser = argparse.ArgumentParser(prog="PANTHER pathway parser") + + parser.add_argument("pathway", choices=[file.stem for file in (current_directory / ".." / "raw" / "pathway-data").iterdir()]) + + return parser + + +def count_weights() -> OrderedDict[int, int]: + """Returns an ordered map (lowest to highest weight) from the weight to the number of elements the weight has""" + weight_counts = pandas.read_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t") + return collections.OrderedDict(sorted({int(k * 1000): int(v) for k, v in dict(weight_counts.values).items()}.items())) + + +def read_pathway(pathway_name: str) -> pandas.DataFrame: + """ + Returns the directed-only pathway from a pathway name, + with columns Interactor1 -> Interactor2. + """ + pathway_df = pandas.read_csv( + current_directory / ".." / "processed" / pathway_name / f"{pathway_name}_gs_edges.txt", + sep="\t", + names=["Interactor1", "Interactor2", "Weight", "Direction"], + ) + # We consider an undirected edge to be two directed edges + pathway_df = convert_undirected_to_directed(pathway_df) + return pathway_df[["Interactor1", "Interactor2"]] + + +class SourcesTargets(NamedTuple): + sources: list[str] + targets: list[str] + +def get_node_data(pathway_name: str) -> pandas.DataFrame: + return pandas.read_csv( + current_directory / ".." / "processed" / pathway_name / f"{pathway_name}_node_prizes.txt", sep="\t", usecols=["NODEID", "sources", "targets"] + ) + +def sources_and_targets(pathway_node_prizes_df: pandas.DataFrame) -> SourcesTargets: + """ + Returns the sources and targets associated with a particular pathway + """ + sources: list[str] = list(pathway_node_prizes_df[pathway_node_prizes_df["sources"] is True]["NODEID"]) + targets: list[str] = list(pathway_node_prizes_df[pathway_node_prizes_df["targets"] is True]["NODEID"]) + + return SourcesTargets(sources, targets) + + +def main(): + pathway_name = parser().parse_args().pathway + print("Reading interactome...") + interactome_df = pandas.read_csv( + current_directory / ".." / "processed" / "interactome.tsv", + header=None, + sep="\t", + names=["Interactor1", "Interactor2", "Weight", "Direction"], + usecols=[0, 1], + ) + + # For performance reasons (groupby is quite slow), we sample in the interactome using the pre-computed weight-counts.tsv file + weight_mapping = count_weights() + + # Get information about the pathway + pathway_df = read_pathway(pathway_name) + node_data_df = get_node_data(pathway_name) + sources, targets = sources_and_targets(node_data_df) + + # TODO: isolate percentage constant (this currently builds up 0%, 10%, ..., 100%) + for percentage in map(lambda x: (x + 1) / 10, range(10)): + output_directory = current_directory / '..' 
/ 'thresholded' / str(percentage) / pathway_name + output_interactome = output_directory / 'interactome.txt' + output_gold_standard = output_directory / 'gold_standard_edges.txt' + + print(f"Sampling with {percentage * 100:.1f}% of edges...") + attempt_number = 1 + while attempt_sample( + pathway_name, pathway_df, percentage, + weight_mapping, interactome_df, sources, targets, + output_interactome=output_interactome, + output_gold_standard=output_gold_standard) is None: + attempt_number += 1 + print(f"Attempt number {attempt_number}") + + # We're done sampling: + (output_directory / 'attempt-number.txt').write_text(attempt_number) + # we need to trim our data file as well. + trim_data_file(data_df=node_data_df, gold_standard_df=pathway_df).to_csv(output_directory / 'node_prizes.tsv', sep='\t', index=False) + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 9a071ecd..c0decbba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,11 @@ dependencies = [ "gdown>=5.2.0", "more-itertools>=10.7.0", "networkx>=3.6.1", + "openpyxl>=3.1.5", "pandas>=2.3.0", + "paxtools>=6.0.0.post1", + "pydantic>=2.12.5", + "requests>=2.32.5", ] [dependency-groups] diff --git a/run_snakemake.sh b/run_snakemake.sh index 24305244..cd1a6773 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -9,7 +9,7 @@ set -o errexit set -o nounset -# Forcibly use the current CWD +# Forcibly use the CWD cd "$(dirname "$0")" main() { @@ -18,6 +18,7 @@ main() { uv run snakemake --cores 1 -d datasets/diseases -s datasets/diseases/Snakefile uv run snakemake --cores 1 -d datasets/rn-muscle-skeletal -s datasets/rn-muscle-skeletal/Snakefile uv run snakemake --cores 1 -d datasets/depmap -s datasets/depmap/Snakefile + uv run snakemake --cores 1 -d datasets/synthetic-data -s datasets/synthetic-data/Snakefile } main "$@" diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 00000000..45fb1104 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,8 @@ +# Dataset Processing Tools + +This includes common tools for doing dataset processing, which take in SPRAS-compatible file formats. This currently includes: + +- `trim.py`: this may be included in SPRAS later, but this contains utilities for trimming a gold standard with its respective interactome, +and the gold standard data with the interactome and the gold standard itself. +- `sample.py`: this samples an interactome and downstream samples the gold standard, preserving a percentage of the associated data in the largest +connected component of the gold standard. _These tools require a gold standard_ diff --git a/tools/sample.py b/tools/sample.py new file mode 100644 index 00000000..c5c56eb9 --- /dev/null +++ b/tools/sample.py @@ -0,0 +1,100 @@ +""" +Tools for sampling interactomes. +""" + +import collections +import networkx +import itertools +import pandas +import random +from typing import OrderedDict, Optional +import os + +def count_weights(weights: dict[float, int]) -> OrderedDict[float, int]: + """ + Returns an ordered map (lowest to highest weight) from the + weight to the number of elements the weight has. 
+ + The full workflow for this function should be: + ```python + count_weights(dict(interactome_df["Weight"].value_counts(sort=False).values)) + ``` + """ + return collections.OrderedDict(sorted({k: int(v) for k, v in weights.items()}.items())) + +def find_connected_sources_targets( + sources: list[str], + targets: list[str], + graph: networkx.Graph +) -> list[tuple[str, str]]: + connections: list[tuple[str, str]] = [] + for source, target in itertools.product(sources, targets): + if graph.has_node(source) and graph.has_node(target) and networkx.has_path(graph, source, target): + connections.append((source, target)) + return connections + +def attempt_sample( + pathway_name: str, + pathway_df: pandas.DataFrame, + percentage: float, + weight_mapping: OrderedDict[int, int], + interactome_df: pandas.DataFrame, + sources: list[str], + targets: list[str], + output_interactome: str | os.PathLike, + output_gold_standard: str | os.PathLike +) -> Optional[list[tuple[str, str]]]: + # TODO: generalize to node prizes/actives + """ + Samples a {pathway_df} (logged as {pathway_name}) along with its backing {interactome_df} + with a certain {percentage} backed by a {weight_mapping} while preserving some {sources} and {targets}, + outputting to {output_interactome} and {output_gold_standard}, + returning the connections between {sources} and {targets}, + or None if the target percentage failed. + """ + interactome_df = sample_interactome(interactome_df, weight_mapping, percentage) + + print(f"Merging {pathway_name} with interactome...") + # While we are merging this graph, we are preparing to compare the connectedness of the prev[ious] and curr[ent] (merged) graph. + prev_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2") + prev_connections = find_connected_sources_targets(sources, targets, prev_graph) + + print("Checking for pathway connectedness...") + pathway_df = pathway_df.merge(interactome_df, how="inner", on=["Interactor1", "Interactor2"]) + curr_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2") + curr_connections = find_connected_sources_targets(sources, targets, curr_graph) + + # We ask that at least `percentage` of the sources and targets are connected with one another. + connection_percentage = float(len(curr_connections)) / float(len(prev_connections)) + + if percentage < connection_percentage: + print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage * 100:.1f}% threshold.") + pathway_df.to_csv(output_gold_standard, sep="\t", index=False, header=False) + interactome_df.to_csv(output_interactome, sep='\t', index=False, header=False) + return curr_connections + print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage * 100:.1f}% threshold.") + return None + +def sample_interactome( + interactome_df: pandas.DataFrame, + weight_mapping: OrderedDict[int, int], + percentage: float +): + """ + Samples an interactome with its weight_counts dictionary. (See `count_weights` for generating `weight_counts`.) + """ + if percentage > 1: + raise RuntimeError(f"Got a percentage above 1 ({percentage})?") + if percentage == 1: + return interactome_df + # Using a list then creating the set is faster because of the sets rather than the gets. 
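+    # (How this works, assuming the interactome rows are written in ascending-Weight order as the
+    #  synthetic-data interactome.py does: weight_mapping holds the bucket size for each weight in
+    #  that same order, so sampling offsets within each bucket and shifting them by the running
+    #  total yields row positions that preserve the original weight distribution.)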
+    print("Creating item samples...")
+    full_list: list[int] = []
+    curr_v = 0
+    for k, v in weight_mapping.items():
+        full_list.extend(map(lambda x: x + curr_v, random.sample(range(v), round(percentage * v))))
+        curr_v += v
+    full_set = set(full_list)
+
+    print("Sampling interactome...")
+    return interactome_df.iloc[list(full_set)]
diff --git a/tools/trim.py b/tools/trim.py
new file mode 100644
index 00000000..64b89ca0
--- /dev/null
+++ b/tools/trim.py
@@ -0,0 +1,9 @@
+import pandas
+
+def trim_data_file(data_df: pandas.DataFrame, gold_standard_df: pandas.DataFrame) -> pandas.DataFrame:
+    """
+    Trims a SPRAS node data file down to the nodes present in the gold standard file.
+    """
+    # We just want the set of all nodes present in the gold standard.
+    gold_standard_nodes = set(gold_standard_df["Interactor1"]).union(set(gold_standard_df["Interactor2"]))
+    return data_df[data_df["NODEID"].isin(gold_standard_nodes)]
diff --git a/uv.lock b/uv.lock
index a00522cf..ecb46fa6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2,6 +2,15 @@ version = 1
 revision = 2
 requires-python = ">=3.13"
 
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
 [[package]]
 name = "appdirs"
 version = "1.4.4"
@@ -154,6 +163,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/05/d1/8952806fbf9583004ab479d8f58a9496c3d35f6b6009ddd458bdd9978eaf/dpath-2.2.0-py3-none-any.whl", hash = "sha256:b330a375ded0a0d2ed404440f6c6a715deae5313af40bbb01c8a41d891900576", size = 17618, upload-time = "2024-06-12T22:08:01.881Z" },
 ]
 
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
 [[package]]
 name = "fastjsonschema"
 version = "2.21.2"
@@ -432,6 +450,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" },
 ]
 
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "et-xmlfile" },
+]
+sdist = { url = 
"https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -468,6 +498,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/d7/612123674d7b17cf345aad0a10289b2a384bff404e0463a83c4a3a59d205/pandas-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d2c3554bd31b731cd6490d94a28f3abb8dd770634a9e06eb6d2911b9827db370", size = 13186141, upload-time = "2025-08-21T10:28:05.377Z" }, ] +[[package]] +name = "paxtools" +version = "6.0.0.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/76/e65fc320494d07339a7f6b83c23deeb7337e3b7e3814880093114b6f488a/paxtools-6.0.0.post1.tar.gz", hash = "sha256:2fddd9155e92e5a8d5cc4b83427f8e804e6957aa8362d4abbf7656c6f858b9a8", size = 13690012, upload-time = "2026-02-11T07:29:45.375Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/fb/5299ab3d3e4bfb686c36a017c098bfaafd5f3b8e3fa81a8a2e618a50638a/paxtools-6.0.0.post1-py3-none-any.whl", hash = "sha256:c10318fc2a7767c4d39dd7365bfbcf1c1e8052201315024f7bf1c62cca2ef8fe", size = 13693939, upload-time = "2026-02-11T07:29:42.783Z" }, +] + [[package]] name = "platformdirs" version = "4.4.0" @@ -517,6 +556,74 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/8d/a6a9d58c929a869f7f1b99b3d37b3f14ef63e2826eef581416338d686c3f/pulp-3.2.2-py3-none-any.whl", hash = "sha256:d3ca5ff11a28b3e7b2508a992d7e51f3533471d89305f0560b5fe3b6cc821043", size = 16385354, upload-time = "2025-07-29T11:42:01.829Z" }, ] +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, + { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, + { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = 
"https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, +] + [[package]] name = "pyreadline3" version = "3.5.4" @@ -848,7 +955,11 @@ dependencies = [ { name = "gdown" }, { name = "more-itertools" }, { name = "networkx" }, + { name = "openpyxl" }, { name = "pandas" }, + { name = "paxtools" }, + { name = "pydantic" }, + { name = "requests" }, ] 
[package.dev-dependencies] @@ -862,7 +973,11 @@ requires-dist = [ { name = "gdown", specifier = ">=5.2.0" }, { name = "more-itertools", specifier = ">=10.7.0" }, { name = "networkx", specifier = ">=3.6.1" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.0" }, + { name = "paxtools", specifier = ">=6.0.0.post1" }, + { name = "pydantic", specifier = ">=2.12.5" }, + { name = "requests", specifier = ">=2.32.5" }, ] [package.metadata.requires-dev] @@ -919,6 +1034,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + [[package]] name = "tzdata" version = "2025.2"