Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 40 additions & 33 deletions generate_eupmc_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import os
from ftplib import FTP
from pathlib import Path
import os


BATCH_SIZE = 5000 # number of links per XML file


def setup_logging():
Expand All @@ -28,8 +30,7 @@ def read_tsv(file_path: str) -> List[Dict[str, str]]:


def build_xml(data: List[Dict[str, str]]) -> ET.Element:
"""Build the XML structure from the TSV data."""
logging.info("Building XML structure")
"""Build XML for a batch of TSV rows."""
root = ET.Element("links")
for row in data:
emdb_id = row["EMDB_ID"]
Expand All @@ -49,12 +50,10 @@ def build_xml(data: List[Dict[str, str]]) -> ET.Element:


def prettify_xml(elem: ET.Element) -> str:
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring was removed from this function. While the function is simple, it's a best practice to maintain docstrings for all functions, especially in a codebase that previously had them. Consider adding a brief docstring like """Return a pretty-printed XML string for the Element."""

Suggested change
def prettify_xml(elem: ET.Element) -> str:
def prettify_xml(elem: ET.Element) -> str:
"""Return a pretty-printed XML string for the Element."""

Copilot uses AI. Check for mistakes.
"""Return a pretty-printed XML string for the Element."""
return minidom.parseString(ET.tostring(elem, 'utf-8')).toprettyxml(indent=" ")


def write_xml(xml_root: ET.Element, output_file: str) -> None:
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring was removed from this function. Consider restoring it or adding a brief one like """Write the XML tree to a file with pretty formatting.""" to maintain consistency with the codebase's documentation standards.

Suggested change
def write_xml(xml_root: ET.Element, output_file: str) -> None:
def write_xml(xml_root: ET.Element, output_file: str) -> None:
"""Write the XML tree to a file with pretty formatting."""

Copilot uses AI. Check for mistakes.
"""Write the XML tree to a file with pretty formatting."""
logging.info(f"Writing XML to file: {output_file}")
pretty_xml = prettify_xml(xml_root)
with open(output_file, "w", encoding='utf-8') as f:
Expand All @@ -69,17 +68,6 @@ def upload_file_via_ftp(
remote_dir: str = ".",
remote_filename: str = None
) -> None:
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comprehensive docstring was removed from this function. Given that this function has multiple parameters with specific purposes and interacts with external FTP servers, it would be beneficial to restore the docstring to help future maintainers understand the function's purpose, parameters, and behavior.

Suggested change
) -> None:
) -> None:
"""
Upload a local file to a remote FTP server.
Connects to the specified FTP server using the provided credentials,
changes to the specified remote directory, and uploads the given local file.
Parameters:
server (str): The FTP server address.
username (str): Username for FTP authentication.
password (str): Password for FTP authentication.
local_file_path (str): Path to the local file to upload.
remote_dir (str, optional): Remote directory to upload the file to. Defaults to ".".
remote_filename (str, optional): Name to use for the file on the server. Defaults to the local file's basename.
Returns:
None
Logs errors if the local file does not exist or if the upload fails.
"""

Copilot uses AI. Check for mistakes.
"""
Upload a file to an FTP server. It uses port 21 by default.

Args:
server (str): FTP server address.
username (str): FTP username.
password (str): FTP password.
local_file_path (str): Path to the local file to upload.
remote_dir (str): Remote directory to upload the file to. Default is root.
remote_filename (str): Optional name to give the uploaded file. Defaults to same as local file.
"""
if not os.path.exists(local_file_path):
logging.error(f"Local file does not exist: {local_file_path}")
return
Expand All @@ -97,26 +85,28 @@ def upload_file_via_ftp(

with open(local_file_path, "rb") as file:
ftp.storbinary(f"STOR {remote_filename}", file)
logging.info(f"File uploaded successfully as {remote_filename}")
logging.info(f"Uploaded: {remote_filename}")
except Exception as e:
logging.error(f"FTP upload failed: {e}")
return


def split_into_batches(data: List[Dict[str, str]], batch_size: int):
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is missing a return type hint. For consistency with other functions in this file (which use type hints), consider adding -> Generator[List[Dict[str, str]], None, None] as the return type. You'll also need to import Generator from the typing module.

Copilot uses AI. Check for mistakes.
"""Yield chunks of data of size batch_size."""
for i in range(0, len(data), batch_size):
yield data[i:i + batch_size]


def main():
setup_logging()

input_tsv = "/nfs/ftp/public/databases/em_ebi/emdb_related/emicss/resources/emdb_pubmed.tsv"
output_xml = "/hps/nobackup/gerard/emdb/annotations/output/EMDB_linkFile_providerID_2057.xml"
output_dir = "/hps/nobackup/gerard/emdb/annotations/output/EPMC"
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output directory is not validated or created before use. If the directory doesn't exist, the script will fail when attempting to write the first batch XML file. Consider adding directory creation logic (e.g., os.makedirs(output_dir, exist_ok=True)) before the batch processing loop.

Suggested change
output_dir = "/hps/nobackup/gerard/emdb/annotations/output/EPMC"
output_dir = "/hps/nobackup/gerard/emdb/annotations/output/EPMC"
os.makedirs(output_dir, exist_ok=True)

Copilot uses AI. Check for mistakes.

data = read_tsv(input_tsv)
xml_root = build_xml(data)
write_xml(xml_root, output_xml)

logging.info(f"XML file generated: {output_xml}")
total = len(data)
logging.info(f"Total links: {total}")

# Upload the XML file to the FTP server
logging.info("Uploading XML file to FTP server")
# Load FTP configuration
config = configparser.ConfigParser()
env_file = os.path.join(Path(__file__).parent.absolute(), "config.ini")
config.read(env_file)
Expand All @@ -125,15 +115,32 @@ def main():
ftp_pass = config.get("epmc_ftp", "password")
ftp_dir = config.get("epmc_ftp", "directory")

upload_file_via_ftp(
server=ftp_server,
username=ftp_user,
password=ftp_pass,
local_file_path=output_xml,
remote_dir=ftp_dir
)
# Process and upload batches
part = 1
for batch in split_into_batches(data, BATCH_SIZE):
logging.info(f"Processing batch {part} ({len(batch)} records)")

xml_root = build_xml(batch)

output_file = os.path.join(
output_dir,
f"EMDB_linkFile_providerID_2057_part{part}.xml"
)

write_xml(xml_root, output_file)

upload_file_via_ftp(
server=ftp_server,
username=ftp_user,
password=ftp_pass,
local_file_path=output_file,
remote_dir=ftp_dir
)
Comment on lines +132 to +138
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The FTP connection is established and torn down for each batch. For better performance, consider establishing a single FTP connection before the loop and reusing it for all uploads, then closing it after all batches are processed. This would significantly reduce connection overhead, especially when processing many batches.

Copilot uses AI. Check for mistakes.

logging.info(f"Batch {part} completed.")
part += 1

Comment on lines +122 to 142
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

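The batch processing loop lacks error handling. If an exception occurs during XML generation or file writing for any batch, the entire script will fail and subsequent batches won't be processed. FTP upload failures behave differently: as far as the visible code shows, upload_file_via_ftp logs the error ("Local file does not exist" or "FTP upload failed") and returns without raising, so a failed upload is still followed by "Batch N completed." and the loop cannot tell that the batch was not delivered. Consider adding try-except blocks around the batch processing operations to log failures and continue with remaining batches, or at minimum, provide clear error context about which batch failed; to make upload failures visible to the loop, upload_file_via_ftp would also need to re-raise or return a success flag.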

Suggested change
xml_root = build_xml(batch)
output_file = os.path.join(
output_dir,
f"EMDB_linkFile_providerID_2057_part{part}.xml"
)
write_xml(xml_root, output_file)
upload_file_via_ftp(
server=ftp_server,
username=ftp_user,
password=ftp_pass,
local_file_path=output_file,
remote_dir=ftp_dir
)
logging.info(f"Batch {part} completed.")
part += 1
try:
xml_root = build_xml(batch)
output_file = os.path.join(
output_dir,
f"EMDB_linkFile_providerID_2057_part{part}.xml"
)
write_xml(xml_root, output_file)
upload_file_via_ftp(
server=ftp_server,
username=ftp_user,
password=ftp_pass,
local_file_path=output_file,
remote_dir=ftp_dir
)
logging.info(f"Batch {part} completed.")
except Exception as e:
logging.error(f"Error processing batch {part}: {e}", exc_info=True)
part += 1

Copilot uses AI. Check for mistakes.
logging.info("Processing complete.")
logging.info("All batches processed successfully.")


if __name__ == "__main__":
Expand Down
Loading