5 changes: 5 additions & 0 deletions .gitignore
@@ -15,3 +15,8 @@ paper/jats/
venv

.qodo


#Ignore cursor AI rules
.cursor/rules/codacy.mdc
.DS_Store
238 changes: 238 additions & 0 deletions pridepy/files/files.py
@@ -12,6 +12,8 @@
from ftplib import FTP
from typing import Dict, List
import socket
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
Comment on lines +15 to +16
Contributor
⚠️ Potential issue | 🔴 Critical

Use defusedxml to prevent XML attacks.

The standard xml.etree.ElementTree is vulnerable to XML bomb attacks such as billion laughs and quadratic blowup, as well as external entity expansion. ProteomeXchange XML comes from an external source and should be treated as untrusted.

Install and use defusedxml:

 from urllib.parse import urlparse
-import xml.etree.ElementTree as ET
+import defusedxml.ElementTree as ET

Then add defusedxml to your project dependencies (e.g., requirements.txt or setup.py).

📝 Committable suggestion


Suggested change
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
import defusedxml.ElementTree as ET
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 15 to 16, the code imports
xml.etree.ElementTree which is unsafe for untrusted XML; replace that import
with defusedxml.ElementTree (e.g., from defusedxml import ElementTree as ET) so
parsing uses the secure implementation, update any references to ET accordingly,
and add defusedxml to the project dependencies (requirements.txt or setup.py)
and run tests to confirm no API mismatches.
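
For reference, a minimal sketch (not part of the PR) of how the drop-in replacement behaves once defusedxml is installed: well-formed PX-style XML parses exactly as before, while input that declares entities is rejected with defusedxml's EntitiesForbidden.

import defusedxml.ElementTree as ET
from defusedxml import EntitiesForbidden

# Well-formed PX-style XML parses exactly as with xml.etree.ElementTree
snippet = '<DatasetFile><cvParam name="Associated raw file URI" value="ftp://example.org/a.raw"/></DatasetFile>'
root = ET.fromstring(snippet)
print(root.find("cvParam").attrib["value"])  # ftp://example.org/a.raw

# An entity declaration (billion-laughs style) is refused before any expansion happens
bomb = '<?xml version="1.0"?><!DOCTYPE d [<!ENTITY a "aa"><!ENTITY b "&a;&a;">]><d>&b;</d>'
try:
    ET.fromstring(bomb)
except EntitiesForbidden:
    print("rejected untrusted entity expansion")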


import boto3
import botocore
@@ -760,3 +762,239 @@ def get_all_category_file_list(self, accession: str, category: str):
file for file in record_files if file["fileCategory"]["value"] == category
]
return category_files

# -------------------------------
# ProteomeXchange support
# -------------------------------

@staticmethod
def _normalize_px_xml_url(px_id_or_url: str) -> str:
"""
Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL.
Examples accepted:
- PXD039236
- https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236
- https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything
"""
if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"):
parsed = urlparse(px_id_or_url)
# keep the ID param value if present; otherwise fallback to the path tail
query = parsed.query or ""
if "ID=" in query:
id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")]
if id_value:
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no"
)
# If the input URL already requests XML, just ensure flags
if parsed.path.endswith("/cgi/GetDataset"):
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no"
Comment on lines +791 to +792
Copilot AI Oct 16, 2025
[nitpick] If query is empty, this builds a URL with ?&outputMode=XML.... Consider constructing the query string conditionally, e.g., qs = f'{query}&' if query else '' and then ...GetDataset?{qs}outputMode=XML&test=no.

Suggested change
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no"
qs = f"{query}&" if query else ""
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{qs}outputMode=XML&test=no"

)
# Assume it's a plain accession if not a URL
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no"
)

@staticmethod
def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]:
"""
Parse the PX XML and return a list of associated raw file URIs.
We extract cvParam with name "Associated raw file URI" under each DatasetFile.
"""
headers = {"Accept": "application/xml"}
response = Util.get_api_call(px_xml_url, headers)
response.raise_for_status()
root = ET.fromstring(response.content)

urls: List[str] = []
# The XML namespace is often absent in PX XML; access elements directly
for dataset_file in root.iter("DatasetFile"):
for cv in dataset_file.findall("cvParam"):
name = cv.attrib.get("name")
value = cv.attrib.get("value")
if name == "Associated raw file URI" and value:
urls.append(value)
return urls

def download_px_raw_files(
self,
px_id_or_url: str,
output_folder: str,
skip_if_downloaded_already: bool = True,
) -> None:
"""
Download all raw files referenced by a ProteomeXchange dataset.
Prefer FTP when the URL is ftp://, otherwise use HTTP(S). Supports resume and skip.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

px_xml_url = self._normalize_px_xml_url(px_id_or_url)
logging.info(f"Fetching PX XML: {px_xml_url}")
urls = self._parse_px_xml_for_raw_file_urls(px_xml_url)
if not urls:
logging.info("No Associated raw file URIs found in PX XML")
return

ftp_urls = [u for u in urls if u.lower().startswith("ftp://")]
http_urls = [u for u in urls if u.lower().startswith("http://") or u.lower().startswith("https://")]

if ftp_urls:
self.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already)
if http_urls:
self.download_http_urls(http_urls, output_folder, skip_if_downloaded_already)

@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
return os.path.join(output_folder, filename)
Comment on lines +848 to +851
Contributor
⚠️ Potential issue | 🟡 Minor

Handle URLs with trailing slashes.

os.path.basename() returns an empty string if the URL path ends with a slash, which would result in an invalid local path.

Add validation:

 @staticmethod
 def _local_path_for_url(download_url: str, output_folder: str) -> str:
     filename = os.path.basename(urlparse(download_url).path)
+    if not filename:
+        raise ValueError(f"Unable to extract filename from URL: {download_url}")
     return os.path.join(output_folder, filename)
📝 Committable suggestion


Suggested change
@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
return os.path.join(output_folder, filename)
@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
if not filename:
raise ValueError(f"Unable to extract filename from URL: {download_url}")
return os.path.join(output_folder, filename)
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 848 to 851, the method can return an
empty filename when the URL path ends with a slash; change it to strip trailing
slashes and pick the last non-empty path segment as the filename, and if that
still yields an empty string fall back to a safe name (e.g., netloc or a
uuid-based name, optionally with a default extension). Also unquote and sanitize
the chosen filename to remove unsafe characters before joining with
output_folder so the resulting local path is always valid.
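
As a rough sketch of the more defensive variant the prompt describes (strip trailing slashes, unquote, sanitize, fall back to a generated name), illustrative only and not what the committable suggestion above implements:

import os
import re
import uuid
from urllib.parse import urlparse, unquote

def _local_path_for_url(download_url: str, output_folder: str) -> str:
    # Take the last non-empty path segment, even when the URL ends with "/"
    path = urlparse(download_url).path.rstrip("/")
    filename = unquote(os.path.basename(path))
    # Keep only characters that are safe in a local filename
    filename = re.sub(r"[^A-Za-z0-9._-]", "_", filename)
    if not filename:
        # Nothing usable in the path: fall back to a generated name
        filename = f"download-{uuid.uuid4().hex}"
    return os.path.join(output_folder, filename)

print(_local_path_for_url("https://host/files/run%2001.raw/", "/tmp/out"))  # /tmp/out/run_01.raw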


@staticmethod
def download_ftp_urls(
ftp_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
max_connection_retries: int = 3,
max_download_retries: int = 3,
) -> None:
"""
Download a list of FTP URLs using a single connection, with retries and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

def connect_ftp(host: str):
ftp = FTP(host, timeout=30)
ftp.login()
ftp.set_pasv(True)
logging.info(f"Connected to FTP host: {host}")
return ftp

# Group URLs by host to reuse connections efficiently
host_to_paths: Dict[str, List[str]] = {}
for url in ftp_urls:
parsed = urlparse(url)
host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/"))

for host, paths in host_to_paths.items():
connection_attempt = 0
while connection_attempt < max_connection_retries:
try:
ftp = connect_ftp(host)
for ftp_path in paths:
try:
local_path = os.path.join(output_folder, os.path.basename(ftp_path))
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue

logging.info(f"Starting FTP download: {host}/{ftp_path}")
download_attempt = 0
while download_attempt < max_download_retries:
try:
total_size = ftp.size(ftp_path)
# Try to resume using REST if partial file exists
if os.path.exists(local_path):
current_size = os.path.getsize(local_path)
mode = "ab"
else:
current_size = 0
mode = "wb"

with open(local_path, mode) as f, tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=current_size,
) as pbar:
def callback(data):
f.write(data)
pbar.update(len(data))

if current_size:
try:
ftp.sendcmd(f"REST {current_size}")
except Exception:
# If REST not supported, fall back to full download
current_size = 0
f.seek(0)
f.truncate()
ftp.retrbinary(f"RETR {ftp_path}", callback)
logging.info(f"Successfully downloaded {local_path}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
download_attempt += 1
logging.error(
f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}"
)
if download_attempt >= max_download_retries:
logging.error(
Comment on lines +927 to +933
Copilot AI Oct 16, 2025
ftplib is not imported as a module, but its names (error_temp, error_perm) are referenced via the ftplib namespace. Either add import ftplib at the top of the file (near other imports) or import the specific exceptions: from ftplib import error_temp, error_perm and then remove the ftplib. qualifier in the except clauses. This occurs in both exception handlers within download_ftp_urls.
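
A short sketch of the two import options mentioned, purely illustrative:

# Option 1: import the module and keep the ftplib.-qualified names used in the diff
import ftplib
FTP_ERRORS = (ftplib.error_temp, ftplib.error_perm)

# Option 2: import the exceptions directly and drop the qualifier in the except clauses
from ftplib import FTP, error_temp, error_perm
FTP_ERRORS_UNQUALIFIED = (error_temp, error_perm)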

f"Giving up on {local_path} after {max_download_retries} attempts."
)
break
except Exception as e:
logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}")
ftp.quit()
logging.info(f"Disconnected from FTP host: {host}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
connection_attempt += 1
logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}")
Comment on lines +942 to +944
Copilot AI Oct 16, 2025
Same issue as above: ftplib is not imported as a module but is referenced here. Add import ftplib or import the specific exceptions (from ftplib import error_temp, error_perm) and update the clause accordingly.

if connection_attempt < max_connection_retries:
logging.info("Retrying connection...")
time.sleep(5)
else:
logging.error(
f"Giving up after {max_connection_retries} failed connection attempts to {host}."
)
Comment on lines +853 to +951
Contributor

⚠️ Potential issue | 🟠 Major

Fix resume logic and improve error handling.

Several issues:

  1. Resume fallback bug (lines 916-923): If REST is not supported and the code falls back to a full download by truncating the file, the progress bar's initial=current_size is not reset to 0, causing incorrect progress display.

  2. Blind exception catching (lines 919, 937): Catching Exception is too broad. Be specific about what failures to handle.

  3. Use logging.exception (lines 929, 933, 938, 944, 949): In exception handlers, use logging.exception() instead of logging.error() to include the traceback.

Apply these fixes:

                                     if current_size:
                                         try:
                                             ftp.sendcmd(f"REST {current_size}")
-                                        except Exception:
+                                        except ftplib.error_perm:
                                             # If REST not supported, fall back to full download
                                             current_size = 0
                                             f.seek(0)
                                             f.truncate()
+                                            pbar.reset()  # Reset progress bar
                                         ftp.retrbinary(f"RETR {ftp_path}", callback)
                             except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
                                 download_attempt += 1
-                                logging.error(
-                                    f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}"
+                                logging.exception(
+                                    f"Download failed for {local_path} (attempt {download_attempt})"
                                 )
                                 if download_attempt >= max_download_retries:
-                                    logging.error(
-                                        f"Giving up on {local_path} after {max_download_retries} attempts."
+                                    logging.exception(
+                                        f"Giving up on {local_path} after {max_download_retries} attempts"
                                     )
-                    except Exception as e:
-                        logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}")
+                    except Exception:
+                        logging.exception(f"Unexpected error while processing FTP path {ftp_path}")
             except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
                 connection_attempt += 1
-                logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}")
+                logging.exception(f"FTP connection failed (attempt {connection_attempt})")
                 if connection_attempt < max_connection_retries:
                     logging.info("Retrying connection...")
                     time.sleep(5)
                 else:
-                    logging.error(
-                        f"Giving up after {max_connection_retries} failed connection attempts to {host}."
+                    logging.exception(
+                        f"Giving up after {max_connection_retries} failed connection attempts to {host}"
                     )
📝 Committable suggestion


Suggested change
@staticmethod
def download_ftp_urls(
ftp_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
max_connection_retries: int = 3,
max_download_retries: int = 3,
) -> None:
"""
Download a list of FTP URLs using a single connection, with retries and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)
def connect_ftp(host: str):
ftp = FTP(host, timeout=30)
ftp.login()
ftp.set_pasv(True)
logging.info(f"Connected to FTP host: {host}")
return ftp
# Group URLs by host to reuse connections efficiently
host_to_paths: Dict[str, List[str]] = {}
for url in ftp_urls:
parsed = urlparse(url)
host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/"))
for host, paths in host_to_paths.items():
connection_attempt = 0
while connection_attempt < max_connection_retries:
try:
ftp = connect_ftp(host)
for ftp_path in paths:
try:
local_path = os.path.join(output_folder, os.path.basename(ftp_path))
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue
logging.info(f"Starting FTP download: {host}/{ftp_path}")
download_attempt = 0
while download_attempt < max_download_retries:
try:
total_size = ftp.size(ftp_path)
# Try to resume using REST if partial file exists
if os.path.exists(local_path):
current_size = os.path.getsize(local_path)
mode = "ab"
else:
current_size = 0
mode = "wb"
with open(local_path, mode) as f, tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=current_size,
) as pbar:
def callback(data):
f.write(data)
pbar.update(len(data))
if current_size:
try:
ftp.sendcmd(f"REST {current_size}")
except ftplib.error_perm:
# If REST not supported, fall back to full download
current_size = 0
f.seek(0)
f.truncate()
pbar.reset() # Reset progress bar
ftp.retrbinary(f"RETR {ftp_path}", callback)
logging.info(f"Successfully downloaded {local_path}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
download_attempt += 1
logging.exception(
f"Download failed for {local_path} (attempt {download_attempt})"
)
if download_attempt >= max_download_retries:
logging.exception(
f"Giving up on {local_path} after {max_download_retries} attempts"
)
break
except Exception:
logging.exception(f"Unexpected error while processing FTP path {ftp_path}")
ftp.quit()
logging.info(f"Disconnected from FTP host: {host}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
connection_attempt += 1
logging.exception(f"FTP connection failed (attempt {connection_attempt})")
if connection_attempt < max_connection_retries:
logging.info("Retrying connection...")
time.sleep(5)
else:
logging.exception(
f"Giving up after {max_connection_retries} failed connection attempts to {host}"
)
🧰 Tools
🪛 Ruff (0.14.0)

868-868: FTP-related functions are being called. FTP is considered insecure. Use SSH/SFTP/SCP or some other encrypted protocol. (S321)
919-919: Do not catch blind exception: Exception (BLE001)
929-931: Use logging.exception instead of logging.error (TRY400)
930-930: Use explicit conversion flag (RUF010)
933-935: Use logging.exception instead of logging.error (TRY400)
937-937: Do not catch blind exception: Exception (BLE001)
938-938: Use logging.exception instead of logging.error (TRY400)
938-938: Use explicit conversion flag (RUF010)
944-944: Use logging.exception instead of logging.error (TRY400)
944-944: Use explicit conversion flag (RUF010)
949-951: Use logging.exception instead of logging.error (TRY400)
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 853-951, fix resume and error handling:
when REST fails, reset current_size to 0, switch to write mode ('wb'), and
recreate the tqdm progress bar (or reset its initial) so the progress display
reflects a full download rather than the old initial value; replace the broad
"except Exception" around per-path processing with specific exceptions (e.g.,
OSError, ftplib.all_errors, socket.timeout) so only expected I/O/FTP errors are
caught; and replace logging.error(...) calls in all exception handlers at the
indicated lines with logging.exception(...) to include tracebacks. Ensure REST
fallback truncates the file, sets mode='wb' and current_size=0 before
re-creating the retrbinary callback/progress loop, and narrow all catch clauses
to explicit FTP/IO/socket errors.
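
For the recurring TRY400 and RUF010 findings above, the change Ruff asks for is mechanical; a hypothetical snippet (not taken from the PR) showing the pattern:

import logging

try:
    raise OSError("simulated download failure")
except OSError as e:
    # TRY400: logging.exception keeps the traceback, unlike logging.error
    # RUF010: use the !s conversion flag rather than wrapping in str()
    logging.exception(f"Download failed: {e!s}")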


@staticmethod
def download_http_urls(
http_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
) -> None:
"""
Download a list of HTTP(S) URLs with resume support and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

session = Util.create_session_with_retries()
for url in http_urls:
try:
local_path = Files._local_path_for_url(url, output_folder)
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue

if os.path.exists(local_path):
resume_size = os.path.getsize(local_path)
headers = {"Range": f"bytes={resume_size}-"}
mode = "ab"
else:
resume_size = 0
headers = {}
mode = "wb"

with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0)) + resume_size
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=resume_size,
Comment on lines +976 to +991
Copilot AI Oct 16, 2025
[nitpick] When resuming, if the server ignores the Range header and returns 200 (full content), adding resume_size to content-length will overcount and may duplicate data. Guard by checking for r.status_code == 206 or a Content-Range header; only add resume_size when the response is partial. Otherwise, fall back to rewriting from the start.

Suggested change
mode = "ab"
else:
resume_size = 0
headers = {}
mode = "wb"
with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0)) + resume_size
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=resume_size,
else:
resume_size = 0
headers = {}
with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
# Check if server honored Range request
is_partial = r.status_code == 206 or "content-range" in r.headers
if is_partial:
mode = "ab"
total_size = int(r.headers.get("content-length", 0)) + resume_size
initial = resume_size
else:
# Server ignored Range, start from scratch
mode = "wb"
resume_size = 0
total_size = int(r.headers.get("content-length", 0))
initial = 0
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=initial,

) as pbar:
with open(local_path, mode) as f:
for chunk in r.iter_content(chunk_size=block_size):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
logging.info(f"Successfully downloaded {local_path}")
except Exception as e:
logging.error(f"HTTP download failed for {url}: {str(e)}")
38 changes: 35 additions & 3 deletions pridepy/pridepy.py
@@ -253,6 +253,38 @@ def download_file_by_name(
)


@main.command(
"download-px-raw-files",
help="Download all raw files referenced by a ProteomeXchange dataset (PX URL or accession)",
)
@click.option(
"-a",
"--accession",
"--px",
"accession",
required=True,
help="ProteomeXchange accession (e.g. PXD039236). --px is deprecated.",
)
@click.option(
"-o",
"--output_folder",
required=True,
help="output folder to download files",
)
@click.option(
"-skip",
"--skip_if_downloaded_already",
required=False,
default=True,
help="Boolean to skip a file if it already exists",
Comment on lines +275 to +279
Copilot AI Oct 16, 2025
Click short options must be a single character (e.g., -s), so -skip will not work. Also, for a boolean flag, prefer is_flag=True and provide both forms. Suggest: @click.option('-s', '--skip-if-downloaded-already/--no-skip-if-downloaded-already', 'skip_if_downloaded_already', is_flag=True, default=True, help='Skip a file if it already exists').

Suggested change
"-skip",
"--skip_if_downloaded_already",
required=False,
default=True,
help="Boolean to skip a file if it already exists",
"-s",
"--skip-if-downloaded-already/--no-skip-if-downloaded-already",
"skip_if_downloaded_already",
is_flag=True,
default=True,
help="Skip a file if it already exists",

)
def download_px_raw_files(accession: str, output_folder: str, skip_if_downloaded_already: bool):
"""CLI wrapper to download raw files via ProteomeXchange XML."""
files = Files()
logging.info(f"PX accession/URL: {accession}")
files.download_px_raw_files(accession, output_folder, skip_if_downloaded_already)


@main.command("list-private-files", help="List private files by project accession")
@click.option("-a", "--accession", required=True, help="accession of the project")
@click.option("-u", "--user", required=True, help="PRIDE login username")
@@ -320,7 +352,7 @@ def stream_files_metadata(accession, output_file):
)
@click.option(
"-f",
"--filter",
"--filters",
required=False,
help="Parameters to filter the search results. The structure of the "
"filter is: field1==value1, field2==value2. Example "
Comment on lines 356 to 358
Copilot AI Oct 16, 2025
The help text still refers to 'filter' (singular) but the option was renamed to --filters. Update to 'filters' in both sentences for consistency and clarity.

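A sketch of the corresponding help-text fix; the exact final wording and the truncated example value are not visible in this view, so the text below is an assumption of intent:

import click

@click.command()
@click.option(
    "-f",
    "--filters",
    required=False,
    help="Parameters to filter the search results. The structure of the "
    "filters is: field1==value1, field2==value2.",
)
def demo(filters):
    """Hypothetical command used only to illustrate the option wording."""
    click.echo(filters)
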
@@ -364,7 +396,7 @@ def stream_files_metadata(accession, output_file):
),
)
def search_projects_by_keywords_and_filters(
keyword, filter, page_size, page, sort_direction, sort_fields
keyword, filters, page_size, page, sort_direction, sort_fields
):
"""
Search all projects by keywords and filters
@@ -380,7 +412,7 @@ def search_projects_by_keywords_and_filters(
sf = ", ".join(sort_fields)
logging.info(
project.search_by_keywords_and_filters(
keyword, filter, page_size, page, sort_direction, sf
keyword, filters, page_size, page, sort_direction, sf
)
)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pridepy"
version = "0.0.8"
version = "0.0.9"
description = "Python Client library for PRIDE Rest API"
authors = [
"PRIDE Team <pride-support@ebi.ac.uk>",