5 changes: 5 additions & 0 deletions .gitignore
@@ -15,3 +15,8 @@ paper/jats/
venv

.qodo


#Ignore cursor AI rules
.cursor/rules/codacy.mdc
.DS_Store
238 changes: 238 additions & 0 deletions pridepy/files/files.py
@@ -12,6 +12,8 @@
from ftplib import FTP
from typing import Dict, List
import socket
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
Comment on lines +15 to +16
Contributor
⚠️ Potential issue | 🔴 Critical

Use defusedxml to prevent XML attacks.

The standard xml.etree.ElementTree is vulnerable to XML bomb attacks such as billion laughs and quadratic blowup, as well as external entity expansion. ProteomeXchange XML comes from an external source and should be treated as untrusted.

Install and use defusedxml:

 from urllib.parse import urlparse
-import xml.etree.ElementTree as ET
+import defusedxml.ElementTree as ET

Then add defusedxml to your project dependencies (e.g., requirements.txt or setup.py).

📝 Committable suggestion


Suggested change
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
import defusedxml.ElementTree as ET
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 15 to 16, the code imports
xml.etree.ElementTree which is unsafe for untrusted XML; replace that import
with defusedxml.ElementTree (e.g., from defusedxml import ElementTree as ET) so
parsing uses the secure implementation, update any references to ET accordingly,
and add defusedxml to the project dependencies (requirements.txt or setup.py)
and run tests to confirm no API mismatches.
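
For reference, a minimal sketch (not part of the PR) of how the drop-in replacement behaves once defusedxml is installed: well-formed PX-style XML parses exactly as before, while input that declares entities is rejected with defusedxml's EntitiesForbidden.

import defusedxml.ElementTree as ET
from defusedxml import EntitiesForbidden

# Well-formed PX-style XML parses exactly as with xml.etree.ElementTree
snippet = '<DatasetFile><cvParam name="Associated raw file URI" value="ftp://example.org/a.raw"/></DatasetFile>'
root = ET.fromstring(snippet)
print(root.find("cvParam").attrib["value"])  # ftp://example.org/a.raw

# An entity declaration (billion-laughs style) is refused before any expansion happens
bomb = '<?xml version="1.0"?><!DOCTYPE d [<!ENTITY a "aa"><!ENTITY b "&a;&a;">]><d>&b;</d>'
try:
    ET.fromstring(bomb)
except EntitiesForbidden:
    print("rejected untrusted entity expansion")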


import boto3
import botocore
@@ -760,3 +762,239 @@ def get_all_category_file_list(self, accession: str, category: str):
file for file in record_files if file["fileCategory"]["value"] == category
]
return category_files

# -------------------------------
# ProteomeXchange support
# -------------------------------

@staticmethod
def _normalize_px_xml_url(px_id_or_url: str) -> str:
"""
Build the ProteomeXchange XML endpoint from a dataset accession or a dataset web URL.
Examples accepted:
- PXD039236
- https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236
- https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD039236&anything
"""
if px_id_or_url.startswith("http://") or px_id_or_url.startswith("https://"):
parsed = urlparse(px_id_or_url)
# keep the ID param value if present; otherwise fallback to the path tail
query = parsed.query or ""
if "ID=" in query:
id_value = [q.split("=", 1)[1] for q in query.split("&") if q.startswith("ID=")]
if id_value:
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id_value[0]}&outputMode=XML&test=no"
)
# If the input URL already requests XML, just ensure flags
if parsed.path.endswith("/cgi/GetDataset"):
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no"
Comment on lines +791 to +792
Copilot AI Oct 16, 2025
[nitpick] If query is empty, this builds a URL with ?&outputMode=XML.... Consider constructing the query string conditionally, e.g., qs = f'{query}&' if query else '' and then ...GetDataset?{qs}outputMode=XML&test=no.

Suggested change
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{query}&outputMode=XML&test=no"
qs = f"{query}&" if query else ""
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?{qs}outputMode=XML&test=no"

)
# Assume it's a plain accession if not a URL
return (
f"https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={px_id_or_url}&outputMode=XML&test=no"
)

@staticmethod
def _parse_px_xml_for_raw_file_urls(px_xml_url: str) -> List[str]:
"""
Parse the PX XML and return a list of associated raw file URIs.
We extract cvParam with name "Associated raw file URI" under each DatasetFile.
"""
headers = {"Accept": "application/xml"}
response = Util.get_api_call(px_xml_url, headers)
response.raise_for_status()
root = ET.fromstring(response.content)

urls: List[str] = []
# The XML namespace is often absent in PX XML; access elements directly
for dataset_file in root.iter("DatasetFile"):
for cv in dataset_file.findall("cvParam"):
name = cv.attrib.get("name")
value = cv.attrib.get("value")
if name == "Associated raw file URI" and value:
urls.append(value)
return urls

def download_px_raw_files(
self,
px_id_or_url: str,
output_folder: str,
skip_if_downloaded_already: bool = True,
) -> None:
"""
Download all raw files referenced by a ProteomeXchange dataset.
Prefer FTP when the URL is ftp://, otherwise use HTTP(S). Supports resume and skip.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

px_xml_url = self._normalize_px_xml_url(px_id_or_url)
logging.info(f"Fetching PX XML: {px_xml_url}")
urls = self._parse_px_xml_for_raw_file_urls(px_xml_url)
if not urls:
logging.info("No Associated raw file URIs found in PX XML")
return

ftp_urls = [u for u in urls if u.lower().startswith("ftp://")]
http_urls = [u for u in urls if u.lower().startswith("http://") or u.lower().startswith("https://")]

if ftp_urls:
self.download_ftp_urls(ftp_urls, output_folder, skip_if_downloaded_already)
if http_urls:
self.download_http_urls(http_urls, output_folder, skip_if_downloaded_already)

@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
return os.path.join(output_folder, filename)
Comment on lines +848 to +851
Contributor
⚠️ Potential issue | 🟡 Minor

Handle URLs with trailing slashes.

os.path.basename() returns an empty string if the URL path ends with a slash, which would result in an invalid local path.

Add validation:

 @staticmethod
 def _local_path_for_url(download_url: str, output_folder: str) -> str:
     filename = os.path.basename(urlparse(download_url).path)
+    if not filename:
+        raise ValueError(f"Unable to extract filename from URL: {download_url}")
     return os.path.join(output_folder, filename)
📝 Committable suggestion


Suggested change
@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
return os.path.join(output_folder, filename)
@staticmethod
def _local_path_for_url(download_url: str, output_folder: str) -> str:
filename = os.path.basename(urlparse(download_url).path)
if not filename:
raise ValueError(f"Unable to extract filename from URL: {download_url}")
return os.path.join(output_folder, filename)
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 848 to 851, the method can return an
empty filename when the URL path ends with a slash; change it to strip trailing
slashes and pick the last non-empty path segment as the filename, and if that
still yields an empty string fall back to a safe name (e.g., netloc or a
uuid-based name, optionally with a default extension). Also unquote and sanitize
the chosen filename to remove unsafe characters before joining with
output_folder so the resulting local path is always valid.
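
As a rough sketch of the more defensive variant the prompt describes (strip trailing slashes, unquote, sanitize, fall back to a generated name), illustrative only and not what the committable suggestion above implements:

import os
import re
import uuid
from urllib.parse import urlparse, unquote

def _local_path_for_url(download_url: str, output_folder: str) -> str:
    # Take the last non-empty path segment, even when the URL ends with "/"
    path = urlparse(download_url).path.rstrip("/")
    filename = unquote(os.path.basename(path))
    # Keep only characters that are safe in a local filename
    filename = re.sub(r"[^A-Za-z0-9._-]", "_", filename)
    if not filename:
        # Nothing usable in the path: fall back to a generated name
        filename = f"download-{uuid.uuid4().hex}"
    return os.path.join(output_folder, filename)

print(_local_path_for_url("https://host/files/run%2001.raw/", "/tmp/out"))  # /tmp/out/run_01.raw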


@staticmethod
def download_ftp_urls(
ftp_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
max_connection_retries: int = 3,
max_download_retries: int = 3,
) -> None:
"""
Download a list of FTP URLs using a single connection, with retries and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

def connect_ftp(host: str):
ftp = FTP(host, timeout=30)
ftp.login()
ftp.set_pasv(True)
logging.info(f"Connected to FTP host: {host}")
return ftp

# Group URLs by host to reuse connections efficiently
host_to_paths: Dict[str, List[str]] = {}
for url in ftp_urls:
parsed = urlparse(url)
host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/"))

for host, paths in host_to_paths.items():
connection_attempt = 0
while connection_attempt < max_connection_retries:
try:
ftp = connect_ftp(host)
for ftp_path in paths:
try:
local_path = os.path.join(output_folder, os.path.basename(ftp_path))
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue

logging.info(f"Starting FTP download: {host}/{ftp_path}")
download_attempt = 0
while download_attempt < max_download_retries:
try:
total_size = ftp.size(ftp_path)
# Try to resume using REST if partial file exists
if os.path.exists(local_path):
current_size = os.path.getsize(local_path)
mode = "ab"
else:
current_size = 0
mode = "wb"

with open(local_path, mode) as f, tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=current_size,
) as pbar:
def callback(data):
f.write(data)
pbar.update(len(data))

if current_size:
try:
ftp.sendcmd(f"REST {current_size}")
except Exception:
# If REST not supported, fall back to full download
current_size = 0
f.seek(0)
f.truncate()
ftp.retrbinary(f"RETR {ftp_path}", callback)
logging.info(f"Successfully downloaded {local_path}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
download_attempt += 1
logging.error(
f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}"
)
if download_attempt >= max_download_retries:
logging.error(
Comment on lines +927 to +933
Copilot AI Oct 16, 2025
ftplib is not imported as a module, but its names (error_temp, error_perm) are referenced via the ftplib namespace. Either add import ftplib at the top of the file (near other imports) or import the specific exceptions: from ftplib import error_temp, error_perm and then remove the ftplib. qualifier in the except clauses. This occurs in both exception handlers within download_ftp_urls.
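
A short sketch of the two import options mentioned, purely illustrative:

# Option 1: import the module and keep the ftplib.-qualified names used in the diff
import ftplib
FTP_ERRORS = (ftplib.error_temp, ftplib.error_perm)

# Option 2: import the exceptions directly and drop the qualifier in the except clauses
from ftplib import FTP, error_temp, error_perm
FTP_ERRORS_UNQUALIFIED = (error_temp, error_perm)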

f"Giving up on {local_path} after {max_download_retries} attempts."
)
break
except Exception as e:
logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}")
ftp.quit()
logging.info(f"Disconnected from FTP host: {host}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
connection_attempt += 1
logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}")
Comment on lines +942 to +944
Copilot AI Oct 16, 2025
Same issue as above: ftplib is not imported as a module but is referenced here. Add import ftplib or import the specific exceptions (from ftplib import error_temp, error_perm) and update the clause accordingly.

if connection_attempt < max_connection_retries:
logging.info("Retrying connection...")
time.sleep(5)
else:
logging.error(
f"Giving up after {max_connection_retries} failed connection attempts to {host}."
)
Comment on lines +853 to +951
Contributor

⚠️ Potential issue | 🟠 Major

Fix resume logic and improve error handling.

Several issues:

  1. Resume fallback bug (lines 916-923): If REST is not supported and the code falls back to a full download by truncating the file, the progress bar's initial=current_size is not reset to 0, causing incorrect progress display.

  2. Blind exception catching (lines 919, 937): Catching Exception is too broad. Be specific about what failures to handle.

  3. Use logging.exception (lines 929, 933, 938, 944, 949): In exception handlers, use logging.exception() instead of logging.error() to include the traceback.

Apply these fixes:

                                     if current_size:
                                         try:
                                             ftp.sendcmd(f"REST {current_size}")
-                                        except Exception:
+                                        except ftplib.error_perm:
                                             # If REST not supported, fall back to full download
                                             current_size = 0
                                             f.seek(0)
                                             f.truncate()
+                                            pbar.reset()  # Reset progress bar
                                         ftp.retrbinary(f"RETR {ftp_path}", callback)
                             except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
                                 download_attempt += 1
-                                logging.error(
-                                    f"Download failed for {local_path} (attempt {download_attempt}): {str(e)}"
+                                logging.exception(
+                                    f"Download failed for {local_path} (attempt {download_attempt})"
                                 )
                                 if download_attempt >= max_download_retries:
-                                    logging.error(
-                                        f"Giving up on {local_path} after {max_download_retries} attempts."
+                                    logging.exception(
+                                        f"Giving up on {local_path} after {max_download_retries} attempts"
                                     )
-                    except Exception as e:
-                        logging.error(f"Unexpected error while processing FTP path {ftp_path}: {str(e)}")
+                    except Exception:
+                        logging.exception(f"Unexpected error while processing FTP path {ftp_path}")
             except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
                 connection_attempt += 1
-                logging.error(f"FTP connection failed (attempt {connection_attempt}): {str(e)}")
+                logging.exception(f"FTP connection failed (attempt {connection_attempt})")
                 if connection_attempt < max_connection_retries:
                     logging.info("Retrying connection...")
                     time.sleep(5)
                 else:
-                    logging.error(
-                        f"Giving up after {max_connection_retries} failed connection attempts to {host}."
+                    logging.exception(
+                        f"Giving up after {max_connection_retries} failed connection attempts to {host}"
                     )
📝 Committable suggestion


Suggested change
@staticmethod
def download_ftp_urls(
ftp_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
max_connection_retries: int = 3,
max_download_retries: int = 3,
) -> None:
"""
Download a list of FTP URLs using a single connection, with retries and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)
def connect_ftp(host: str):
ftp = FTP(host, timeout=30)
ftp.login()
ftp.set_pasv(True)
logging.info(f"Connected to FTP host: {host}")
return ftp
# Group URLs by host to reuse connections efficiently
host_to_paths: Dict[str, List[str]] = {}
for url in ftp_urls:
parsed = urlparse(url)
host_to_paths.setdefault(parsed.hostname, []).append(parsed.path.lstrip("/"))
for host, paths in host_to_paths.items():
connection_attempt = 0
while connection_attempt < max_connection_retries:
try:
ftp = connect_ftp(host)
for ftp_path in paths:
try:
local_path = os.path.join(output_folder, os.path.basename(ftp_path))
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue
logging.info(f"Starting FTP download: {host}/{ftp_path}")
download_attempt = 0
while download_attempt < max_download_retries:
try:
total_size = ftp.size(ftp_path)
# Try to resume using REST if partial file exists
if os.path.exists(local_path):
current_size = os.path.getsize(local_path)
mode = "ab"
else:
current_size = 0
mode = "wb"
with open(local_path, mode) as f, tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=current_size,
) as pbar:
def callback(data):
f.write(data)
pbar.update(len(data))
if current_size:
try:
ftp.sendcmd(f"REST {current_size}")
except ftplib.error_perm:
# If REST not supported, fall back to full download
current_size = 0
f.seek(0)
f.truncate()
pbar.reset() # Reset progress bar
ftp.retrbinary(f"RETR {ftp_path}", callback)
logging.info(f"Successfully downloaded {local_path}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm) as e:
download_attempt += 1
logging.exception(
f"Download failed for {local_path} (attempt {download_attempt})"
)
if download_attempt >= max_download_retries:
logging.exception(
f"Giving up on {local_path} after {max_download_retries} attempts"
)
break
except Exception:
logging.exception(f"Unexpected error while processing FTP path {ftp_path}")
ftp.quit()
logging.info(f"Disconnected from FTP host: {host}")
break
except (socket.timeout, ftplib.error_temp, ftplib.error_perm, socket.error) as e:
connection_attempt += 1
logging.exception(f"FTP connection failed (attempt {connection_attempt})")
if connection_attempt < max_connection_retries:
logging.info("Retrying connection...")
time.sleep(5)
else:
logging.exception(
f"Giving up after {max_connection_retries} failed connection attempts to {host}"
)
🧰 Tools
🪛 Ruff (0.14.0)

868-868: FTP-related functions are being called. FTP is considered insecure. Use SSH/SFTP/SCP or some other encrypted protocol. (S321)
919-919: Do not catch blind exception: Exception (BLE001)
929-931: Use logging.exception instead of logging.error (TRY400)
930-930: Use explicit conversion flag (RUF010)
933-935: Use logging.exception instead of logging.error (TRY400)
937-937: Do not catch blind exception: Exception (BLE001)
938-938: Use logging.exception instead of logging.error (TRY400)
938-938: Use explicit conversion flag (RUF010)
944-944: Use logging.exception instead of logging.error (TRY400)
944-944: Use explicit conversion flag (RUF010)
949-951: Use logging.exception instead of logging.error (TRY400)
🤖 Prompt for AI Agents
In pridepy/files/files.py around lines 853-951, fix resume and error handling:
when REST fails, reset current_size to 0, switch to write mode ('wb'), and
recreate the tqdm progress bar (or reset its initial) so the progress display
reflects a full download rather than the old initial value; replace the broad
"except Exception" around per-path processing with specific exceptions (e.g.,
OSError, ftplib.all_errors, socket.timeout) so only expected I/O/FTP errors are
caught; and replace logging.error(...) calls in all exception handlers at the
indicated lines with logging.exception(...) to include tracebacks. Ensure REST
fallback truncates the file, sets mode='wb' and current_size=0 before
re-creating the retrbinary callback/progress loop, and narrow all catch clauses
to explicit FTP/IO/socket errors.
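
For the recurring TRY400 and RUF010 findings above, the change Ruff asks for is mechanical; a hypothetical snippet (not taken from the PR) showing the pattern:

import logging

try:
    raise OSError("simulated download failure")
except OSError as e:
    # TRY400: logging.exception keeps the traceback, unlike logging.error
    # RUF010: use the !s conversion flag rather than wrapping in str()
    logging.exception(f"Download failed: {e!s}")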


@staticmethod
def download_http_urls(
http_urls: List[str],
output_folder: str,
skip_if_downloaded_already: bool,
) -> None:
"""
Download a list of HTTP(S) URLs with resume support and progress bars.
"""
if not os.path.isdir(output_folder):
os.makedirs(output_folder, exist_ok=True)

session = Util.create_session_with_retries()
for url in http_urls:
try:
local_path = Files._local_path_for_url(url, output_folder)
if skip_if_downloaded_already and os.path.exists(local_path):
logging.info("Skipping download as file already exists")
continue

if os.path.exists(local_path):
resume_size = os.path.getsize(local_path)
headers = {"Range": f"bytes={resume_size}-"}
mode = "ab"
else:
resume_size = 0
headers = {}
mode = "wb"

with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0)) + resume_size
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=resume_size,
Comment on lines +976 to +991
Copilot AI Oct 16, 2025
[nitpick] When resuming, if the server ignores the Range header and returns 200 (full content), adding resume_size to content-length will overcount and may duplicate data. Guard by checking for r.status_code == 206 or a Content-Range header; only add resume_size when the response is partial. Otherwise, fall back to rewriting from the start.

Suggested change
mode = "ab"
else:
resume_size = 0
headers = {}
mode = "wb"
with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0)) + resume_size
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=resume_size,
else:
resume_size = 0
headers = {}
with session.get(url, stream=True, headers=headers, timeout=(10, 60)) as r:
r.raise_for_status()
# Check if server honored Range request
is_partial = r.status_code == 206 or "content-range" in r.headers
if is_partial:
mode = "ab"
total_size = int(r.headers.get("content-length", 0)) + resume_size
initial = resume_size
else:
# Server ignored Range, start from scratch
mode = "wb"
resume_size = 0
total_size = int(r.headers.get("content-length", 0))
initial = 0
block_size = 1024 * 1024
with tqdm(
total=total_size,
unit="B",
unit_scale=True,
desc=local_path,
initial=initial,

) as pbar:
with open(local_path, mode) as f:
for chunk in r.iter_content(chunk_size=block_size):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
logging.info(f"Successfully downloaded {local_path}")
except Exception as e:
logging.error(f"HTTP download failed for {url}: {str(e)}")
38 changes: 35 additions & 3 deletions pridepy/pridepy.py
@@ -253,6 +253,38 @@ def download_file_by_name(
)


@main.command(
"download-px-raw-files",
help="Download all raw files referenced by a ProteomeXchange dataset (PX URL or accession)",
)
@click.option(
"-a",
"--accession",
"--px",
"accession",
required=True,
help="ProteomeXchange accession (e.g. PXD039236). --px is deprecated.",
)
@click.option(
"-o",
"--output_folder",
required=True,
help="output folder to download files",
)
@click.option(
"-skip",
"--skip_if_downloaded_already",
required=False,
default=True,
help="Boolean to skip a file if it already exists",
Comment on lines +275 to +279
Copilot AI Oct 16, 2025
Click short options must be a single character (e.g., -s), so -skip will not work. Also, for a boolean flag, prefer is_flag=True and provide both forms. Suggest: @click.option('-s', '--skip-if-downloaded-already/--no-skip-if-downloaded-already', 'skip_if_downloaded_already', is_flag=True, default=True, help='Skip a file if it already exists').

Suggested change
"-skip",
"--skip_if_downloaded_already",
required=False,
default=True,
help="Boolean to skip a file if it already exists",
"-s",
"--skip-if-downloaded-already/--no-skip-if-downloaded-already",
"skip_if_downloaded_already",
is_flag=True,
default=True,
help="Skip a file if it already exists",

)
def download_px_raw_files(accession: str, output_folder: str, skip_if_downloaded_already: bool):
"""CLI wrapper to download raw files via ProteomeXchange XML."""
files = Files()
logging.info(f"PX accession/URL: {accession}")
files.download_px_raw_files(accession, output_folder, skip_if_downloaded_already)


@main.command("list-private-files", help="List private files by project accession")
@click.option("-a", "--accession", required=True, help="accession of the project")
@click.option("-u", "--user", required=True, help="PRIDE login username")
@@ -320,7 +352,7 @@ def stream_files_metadata(accession, output_file):
)
@click.option(
"-f",
"--filter",
"--filters",
required=False,
help="Parameters to filter the search results. The structure of the "
"filter is: field1==value1, field2==value2. Example "
Comment on lines 356 to 358
Copilot AI Oct 16, 2025
The help text still refers to 'filter' (singular) but the option was renamed to --filters. Update to 'filters' in both sentences for consistency and clarity.

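A sketch of the corresponding help-text fix; the exact final wording and the truncated example value are not visible in this view, so the text below is an assumption of intent:

import click

@click.command()
@click.option(
    "-f",
    "--filters",
    required=False,
    help="Parameters to filter the search results. The structure of the "
    "filters is: field1==value1, field2==value2.",
)
def demo(filters):
    """Hypothetical command used only to illustrate the option wording."""
    click.echo(filters)
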
@@ -364,7 +396,7 @@ def stream_files_metadata(accession, output_file):
),
)
def search_projects_by_keywords_and_filters(
keyword, filter, page_size, page, sort_direction, sort_fields
keyword, filters, page_size, page, sort_direction, sort_fields
):
"""
Search all projects by keywords and filters
@@ -380,7 +412,7 @@ def search_projects_by_keywords_and_filters(
sf = ", ".join(sort_fields)
logging.info(
project.search_by_keywords_and_filters(
keyword, filter, page_size, page, sort_direction, sf
keyword, filters, page_size, page, sort_direction, sf
)
)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pridepy"
version = "0.0.8"
version = "0.0.9"
description = "Python Client library for PRIDE Rest API"
authors = [
"PRIDE Team <pride-support@ebi.ac.uk>",