Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions pridepy/files/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,20 +725,24 @@ def download_all_category_files(
protocol: str,
aspera_maximum_bandwidth: str,
checksum_check: bool,
category: str,
categories: List[str] = None,
category: str = None,
):
"""
Download all files of a specified category from a PRIDE project.
Download all files of specified categories from a PRIDE project.

:param accession: The PRIDE project accession identifier.
:param output_folder: The directory where the files will be downloaded.
:param skip_if_downloaded_already: If True, skips downloading files that already exist.
:param protocol: The transfer protocol to use (e.g., ftp, aspera, globus, s3).
:param aspera_maximum_bandwidth: Maximum bandwidth for Aspera transfers.
:param checksum_check: If True, downloads the checksum file for the project.
:param category: The category of files to download.
:param categories: List of file categories to download.
:param category: Single file category (deprecated, use categories instead).
"""
raw_files = self.get_all_category_file_list(accession, category)
if categories is None:
categories = [category] if category else ["RAW"]
raw_files = self.get_all_category_file_list(accession, categories)
self.download_files(
raw_files,
accession,
Expand All @@ -749,17 +753,22 @@ def download_all_category_files(
checksum_check=checksum_check,
)

def get_all_category_file_list(self, accession: str, category: str):
def get_all_category_file_list(
self, accession: str, categories: "str | List[str]"
) -> List[Dict]:
Comment on lines +756 to +758
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Action required

1. Py3.9 union annotation 🐞 Bug ⛯ Reliability

get_all_category_file_list uses a quoted PEP 604 union ("str | List[str]"). While it won’t
  crash at import time, any tooling that evaluates annotations (e.g., typing.get_type_hints, some
  doc generators) can raise TypeError on Python 3.9, where the | operator is not yet supported between types.
• The project declares Python ^3.9, so this compatibility risk is in-scope for supported runtimes.
• This can break downstream users in otherwise valid environments, especially in type-aware
  frameworks and documentation builds.
Agent Prompt
### Issue description
`get_all_category_file_list` uses a quoted PEP 604 union annotation (`"str | List[str]"`). On Python 3.9, this can break any runtime/tooling that evaluates annotations (e.g., `typing.get_type_hints`), causing a `TypeError` because the `|` operator is not supported between types before Python 3.10.

### Issue Context
The project declares `python = "^3.9"`, so Python 3.9 is a supported runtime.

### Fix Focus Areas
- pridepy/files/files.py[756-758]

### Suggested change
- Import `Union` from `typing`.
- Change annotation to `Union[str, List[str]]` (or `Sequence[str]` if you want to accept more iterables).

ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools

"""
Retrieve a list of files from a specific project that belong to a given category.
Retrieve a list of files from a specific project that belong to given categories.

:param accession: The PRIDE project accession identifier.
:param category: The category of files to filter by.
:return: A list of files in the specified category.
:param categories: A single category string or list of categories to filter by.
:return: A list of files matching the specified categories.
"""
record_files = self.stream_all_files_by_project(accession)
if isinstance(categories, str):
categories = [categories]
category_set = set(categories)
category_files = [
file for file in record_files if file["fileCategory"]["value"] == category
file for file in record_files if file["fileCategory"]["value"] in category_set
]
return category_files

Expand Down
17 changes: 13 additions & 4 deletions pridepy/pridepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ def download_all_public_raw_files(
"-c",
"--category",
required=True,
help="Category of the files to be downloaded",
type=click.Choice("RAW,PEAK,SEARCH,RESULT,SPECTRUM_LIBRARY,OTHER,FASTA".split(",")),
help="Comma-separated categories of files to download (e.g. RAW or RAW,SEARCH). "
"Valid values: RAW, PEAK, SEARCH, RESULT, SPECTRUM_LIBRARY, OTHER, FASTA",
)
def download_all_public_category_files(
accession: str,
Expand All @@ -146,9 +146,18 @@ def download_all_public_category_files(
skip_if_downloaded_already (bool): If True, skips downloading files that already exist. Default is False.
aspera_maximum_bandwidth (str): Maximum bandwidth for Aspera transfers.
checksum_check (bool): If True, downloads the checksum file for the project.
category (str): The category of files to download.
category (str): Comma-separated categories of files to download (e.g. RAW or RAW,SEARCH).
"""

valid_categories = {"RAW", "PEAK", "SEARCH", "RESULT", "SPECTRUM_LIBRARY", "OTHER", "FASTA"}
categories = [c.strip().upper() for c in category.split(",")]
invalid = set(categories) - valid_categories
if invalid:
raise click.BadParameter(
f"Invalid category: {', '.join(invalid)}. "
f"Valid values: {', '.join(sorted(valid_categories))}"
)

raw_files = Files()
logging.info("accession: " + accession)
logging.info(f"Data will be downloaded from {protocol}")
Expand All @@ -163,7 +172,7 @@ def download_all_public_category_files(
protocol,
aspera_maximum_bandwidth=aspera_maximum_bandwidth,
checksum_check=checksum_check,
category=category,
categories=categories,
)


Expand Down
13 changes: 13 additions & 0 deletions pridepy/tests/test_raw_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,16 @@ def test_get_all_category_file_list(self):

result = raw.get_all_category_file_list("PXD008644", "SEARCH")
assert len(result) == 2

def test_get_all_category_file_list_multiple(self):
"""
Test filtering by multiple categories at once.
PXD008644 has 2 RAW + 2 SEARCH = 4 files combined.
"""
raw = Files()
result = raw.get_all_category_file_list("PXD008644", ["RAW", "SEARCH"])
assert len(result) == 4

# Verify both categories are present
categories = {file["fileCategory"]["value"] for file in result}
assert categories == {"RAW", "SEARCH"}
Loading