From 35d10a7f2fe2556289848797aaf838710f3ae73b Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Wed, 30 Apr 2025 11:45:33 -0500 Subject: [PATCH 1/6] chore: make ruff not yell at me --- tests/projects/project-1/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/projects/project-1/pyproject.toml b/tests/projects/project-1/pyproject.toml index 1ce14d2..a5f3853 100644 --- a/tests/projects/project-1/pyproject.toml +++ b/tests/projects/project-1/pyproject.toml @@ -5,7 +5,7 @@ description = "project-1" authors = [{ name = "Brandon White", email = "brandonlwhite@gmail.com" }] license = "MIT" readme = "README.md" -requires-python = "^3.10" +requires-python = ">=3.10,<4.0" [build-system] From 375a8fa987b9f267e8c1902699f3a8fa4cd6b11c Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Wed, 30 Apr 2025 11:46:35 -0500 Subject: [PATCH 2/6] feat: impl ReproducibleZipFile --- package_python_function/packager.py | 15 ++-- package_python_function/reproducible_zip.py | 88 +++++++++++++++++++++ tests/test_package_python_function.py | 2 +- 3 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 package_python_function/reproducible_zip.py diff --git a/package_python_function/packager.py b/package_python_function/packager.py index b3d1d7d..780bb57 100644 --- a/package_python_function/packager.py +++ b/package_python_function/packager.py @@ -1,15 +1,14 @@ +import logging +import shutil +import zipfile from pathlib import Path from tempfile import NamedTemporaryFile -import zipfile -import shutil -import logging from .python_project import PythonProject - +from .reproducible_zip import ReproducibleZipFile logger = logging.getLogger(__name__) - class Packager: AWS_LAMBDA_MAX_UNZIP_SIZE = 262144000 @@ -40,7 +39,7 @@ def package(self) -> None: def zip_all_dependencies(self, target_path: Path) -> None: logger.info(f"Zipping to {target_path}...") - with zipfile.ZipFile(target_path, 'w', zipfile.ZIP_DEFLATED) as zip_file: + with ReproducibleZipFile(target_path, 'w', zipfile.ZIP_DEFLATED) as zip_file: def zip_dir(path: Path) -> None: for item in path.iterdir(): if item.is_dir(): @@ -61,7 +60,7 @@ def zip_dir(path: Path) -> None: logger.info(f"The compressed size ({compressed_bytes:,}) is less than the AWS limit, so the nested-zip strategy will be used.") self.generate_nested_zip(target_path) else: - print(f"TODO Error. The unzipped size it too large for AWS Lambda.") + print("TODO Error. The unzipped size it too large for AWS Lambda.") else: logger.info(f"Copying '{target_path}' to '{self.output_file}'") shutil.copy(str(target_path), str(self.output_file)) @@ -80,4 +79,4 @@ def generate_nested_zip(self, inner_zip_path: Path) -> None: str(entrypoint_dir / "__init__.py"), Path(__file__).parent.joinpath("nested_zip_loader.py").read_text(), compresslevel=zipfile.ZIP_DEFLATED - ) \ No newline at end of file + ) diff --git a/package_python_function/reproducible_zip.py b/package_python_function/reproducible_zip.py new file mode 100644 index 0000000..b51774e --- /dev/null +++ b/package_python_function/reproducible_zip.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import os +import shutil +import time +from copy import copy +from typing import TYPE_CHECKING +from zipfile import ZipFile, ZipInfo + +if TYPE_CHECKING: + from typing import Tuple + +FILE_MODE = 0o644 +DIR_MODE = 0o755 + +def date_time() -> Tuple[int, int, int, int, int, int]: + """Returns date_time value used to force overwrite on all ZipInfo objects. Defaults to + 1980-01-01 00:00:00. You can set this with the environment variable SOURCE_DATE_EPOCH as an + integer value representing seconds since Epoch. + """ + source_date_epoch = os.environ.get("SOURCE_DATE_EPOCH", None) + if source_date_epoch is not None: + return time.gmtime(int(source_date_epoch))[:6] + return (1980, 1, 1, 0, 0, 0) + +def clean_zip_info(zinfo: ZipInfo) -> ZipInfo: + """ + Cleans the ZipInfo object, overwriting file-modified timestamps and file/directory permissions modes in write mode in order to create a reproducible ZIP archive. + + Parameters: + zinfo (ZipInfo): A ZipInfo object from zipfile.ZipInfo. + + Returns: + ZipInfo: The ZipInfo for the file, with the proper file permissions and date. + """ + zinfo = copy(zinfo) + zinfo.date_time = date_time() + if zinfo.is_dir(): + zinfo.external_attr = (0o40000 | DIR_MODE) << 16 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.external_attr = FILE_MODE << 16 + return zinfo + +class ReproducibleZipFile(ZipFile): + """Open a ZIP file, where file can be a path to a file (a string), a file-like object or a + path-like object. + + This is a replacement for the Python standard library zipfile.ZipFile that overwrites + file-modified timestamps and file/directory permissions modes in write mode in order to create + a reproducible ZIP archive. + """ + + # Following method modified from Python 3.12.9 + # https://github.com/python/cpython/blob/fdb81425a9ad683f8c24bf5cbedc9b96baf00cd2/Lib/zipfile/__init__.py#L1834-L1865 + # Copyright Python Software Foundation, licensed under PSF License Version 2 + # See LICENSE file for full license agreement and notice of copyright + def write(self, filename, arcname=None, compress_type=None, compresslevel=None): + """Put the bytes from filename into the archive under the name + arcname.""" + if not self.fp: + raise ValueError("Attempt to write to ZIP archive that was already closed") + if self._writing: + raise ValueError("Can't write to ZIP archive while an open writing handle exists") + + zinfo = ZipInfo.from_file(filename, arcname, strict_timestamps=self._strict_timestamps) + + ###### BEGIN ADDED CODE ###### + zinfo = clean_zip_info(zinfo) + ###### END ADDED CODE ###### + + if zinfo.is_dir(): + zinfo.compress_size = 0 + zinfo.CRC = 0 + self.mkdir(zinfo) + else: + if compress_type is not None: + zinfo.compress_type = compress_type + else: + zinfo.compress_type = self.compression + + if compresslevel is not None: + zinfo._compresslevel = compresslevel + else: + zinfo._compresslevel = self.compresslevel + + with open(filename, "rb") as src, self.open(zinfo, "w") as dest: + shutil.copyfileobj(src, dest, 1024 * 8) diff --git a/tests/test_package_python_function.py b/tests/test_package_python_function.py index 5d0ba3a..8254da6 100644 --- a/tests/test_package_python_function.py +++ b/tests/test_package_python_function.py @@ -34,4 +34,4 @@ def test_package_python_function(tmp_path: Path) -> None: ] main() - assert (output_dir_path / 'project_1.zip').exists() \ No newline at end of file + assert (output_dir_path / 'project_1.zip').exists() From 8ad98bfe3b3ab7ce9173244a121be9d65ee85b82 Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Wed, 30 Apr 2025 12:12:40 -0500 Subject: [PATCH 3/6] feat: add repro zip to tests --- tests/test_package_python_function.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/test_package_python_function.py b/tests/test_package_python_function.py index 8254da6..ea85d15 100644 --- a/tests/test_package_python_function.py +++ b/tests/test_package_python_function.py @@ -1,12 +1,15 @@ -from pathlib import Path import sys -from package_python_function.main import main +import zipfile +from pathlib import Path +from package_python_function.main import main PROJECTS_DIR_PATH = Path(__file__).parent / 'projects' - def test_package_python_function(tmp_path: Path) -> None: + EXPECTED_FILE_MODE = 0o644 + EXPECTED_FILE_DATE_TIME = (1980, 1, 1, 0, 0, 0) + project_file_path = PROJECTS_DIR_PATH / 'project-1' / 'pyproject.toml' venv_dir_path = tmp_path / 'venv' @@ -34,4 +37,15 @@ def test_package_python_function(tmp_path: Path) -> None: ] main() - assert (output_dir_path / 'project_1.zip').exists() + zip_file = output_dir_path / "project_1.zip" + assert zip_file.exists() + + verify_dir_path = tmp_path / "verify" + verify_dir_path.mkdir() + with zipfile.ZipFile(zip_file, "r") as zip: + zip.extractall(verify_dir_path) + for file_info in zip.infolist(): + assert file_info.date_time == EXPECTED_FILE_DATE_TIME + + mode = (file_info.external_attr >> 16) & 0xFFFF + assert mode == EXPECTED_FILE_MODE From 1cba515d36b7126eafb46fd2e05d308ea45ef56e Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 1 May 2025 10:25:53 -0500 Subject: [PATCH 4/6] refactor: use ZipFile.open() instead of overwriting ZipFile --- package_python_function/packager.py | 31 +++++++- package_python_function/reproducible_zip.py | 88 --------------------- 2 files changed, 28 insertions(+), 91 deletions(-) delete mode 100644 package_python_function/reproducible_zip.py diff --git a/package_python_function/packager.py b/package_python_function/packager.py index 780bb57..aff8a86 100644 --- a/package_python_function/packager.py +++ b/package_python_function/packager.py @@ -1,11 +1,18 @@ +from __future__ import annotations + import logging +import os import shutil +import time import zipfile from pathlib import Path from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING from .python_project import PythonProject -from .reproducible_zip import ReproducibleZipFile + +if TYPE_CHECKING: + from typing import Tuple logger = logging.getLogger(__name__) @@ -39,14 +46,32 @@ def package(self) -> None: def zip_all_dependencies(self, target_path: Path) -> None: logger.info(f"Zipping to {target_path}...") - with ReproducibleZipFile(target_path, 'w', zipfile.ZIP_DEFLATED) as zip_file: + def date_time() -> Tuple[int, int, int, int, int, int]: + """Returns date_time value used to force overwrite on all ZipInfo objects. Defaults to + 1980-01-01 00:00:00. You can set this with the environment variable SOURCE_DATE_EPOCH as an + integer value representing seconds since Epoch. + """ + source_date_epoch = os.environ.get("SOURCE_DATE_EPOCH", None) + if source_date_epoch is not None: + return time.gmtime(int(source_date_epoch))[:6] + return (1980, 1, 1, 0, 0, 0) + + with zipfile.ZipFile(target_path, "w", zipfile.ZIP_DEFLATED) as zip_file: + def zip_dir(path: Path) -> None: for item in path.iterdir(): if item.is_dir(): zip_dir(item) else: + zinfo = zipfile.ZipInfo.from_file(item) + zinfo.date_time = date_time() + zinfo.external_attr = 0o644 << 16 self._uncompressed_bytes += item.stat().st_size - zip_file.write(item, item.relative_to(self.input_path)) + with ( + open(item, "rb") as src, + zip_file.open(zinfo, "w") as dest, + ): + shutil.copyfileobj(src, dest, 1024 * 8) zip_dir(self.input_path) diff --git a/package_python_function/reproducible_zip.py b/package_python_function/reproducible_zip.py deleted file mode 100644 index b51774e..0000000 --- a/package_python_function/reproducible_zip.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import annotations - -import os -import shutil -import time -from copy import copy -from typing import TYPE_CHECKING -from zipfile import ZipFile, ZipInfo - -if TYPE_CHECKING: - from typing import Tuple - -FILE_MODE = 0o644 -DIR_MODE = 0o755 - -def date_time() -> Tuple[int, int, int, int, int, int]: - """Returns date_time value used to force overwrite on all ZipInfo objects. Defaults to - 1980-01-01 00:00:00. You can set this with the environment variable SOURCE_DATE_EPOCH as an - integer value representing seconds since Epoch. - """ - source_date_epoch = os.environ.get("SOURCE_DATE_EPOCH", None) - if source_date_epoch is not None: - return time.gmtime(int(source_date_epoch))[:6] - return (1980, 1, 1, 0, 0, 0) - -def clean_zip_info(zinfo: ZipInfo) -> ZipInfo: - """ - Cleans the ZipInfo object, overwriting file-modified timestamps and file/directory permissions modes in write mode in order to create a reproducible ZIP archive. - - Parameters: - zinfo (ZipInfo): A ZipInfo object from zipfile.ZipInfo. - - Returns: - ZipInfo: The ZipInfo for the file, with the proper file permissions and date. - """ - zinfo = copy(zinfo) - zinfo.date_time = date_time() - if zinfo.is_dir(): - zinfo.external_attr = (0o40000 | DIR_MODE) << 16 - zinfo.external_attr |= 0x10 # MS-DOS directory flag - else: - zinfo.external_attr = FILE_MODE << 16 - return zinfo - -class ReproducibleZipFile(ZipFile): - """Open a ZIP file, where file can be a path to a file (a string), a file-like object or a - path-like object. - - This is a replacement for the Python standard library zipfile.ZipFile that overwrites - file-modified timestamps and file/directory permissions modes in write mode in order to create - a reproducible ZIP archive. - """ - - # Following method modified from Python 3.12.9 - # https://github.com/python/cpython/blob/fdb81425a9ad683f8c24bf5cbedc9b96baf00cd2/Lib/zipfile/__init__.py#L1834-L1865 - # Copyright Python Software Foundation, licensed under PSF License Version 2 - # See LICENSE file for full license agreement and notice of copyright - def write(self, filename, arcname=None, compress_type=None, compresslevel=None): - """Put the bytes from filename into the archive under the name - arcname.""" - if not self.fp: - raise ValueError("Attempt to write to ZIP archive that was already closed") - if self._writing: - raise ValueError("Can't write to ZIP archive while an open writing handle exists") - - zinfo = ZipInfo.from_file(filename, arcname, strict_timestamps=self._strict_timestamps) - - ###### BEGIN ADDED CODE ###### - zinfo = clean_zip_info(zinfo) - ###### END ADDED CODE ###### - - if zinfo.is_dir(): - zinfo.compress_size = 0 - zinfo.CRC = 0 - self.mkdir(zinfo) - else: - if compress_type is not None: - zinfo.compress_type = compress_type - else: - zinfo.compress_type = self.compression - - if compresslevel is not None: - zinfo._compresslevel = compresslevel - else: - zinfo._compresslevel = self.compresslevel - - with open(filename, "rb") as src, self.open(zinfo, "w") as dest: - shutil.copyfileobj(src, dest, 1024 * 8) From e5ae6255cc846120f8c3055d516891d58a4971e5 Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 1 May 2025 11:46:27 -0500 Subject: [PATCH 5/6] feat: improve tests --- tests/test_package_python_function.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_package_python_function.py b/tests/test_package_python_function.py index ea85d15..91a935e 100644 --- a/tests/test_package_python_function.py +++ b/tests/test_package_python_function.py @@ -40,12 +40,16 @@ def test_package_python_function(tmp_path: Path) -> None: zip_file = output_dir_path / "project_1.zip" assert zip_file.exists() - verify_dir_path = tmp_path / "verify" - verify_dir_path.mkdir() + verify_dir = tmp_path / "verify" + verify_dir.mkdir() with zipfile.ZipFile(zip_file, "r") as zip: - zip.extractall(verify_dir_path) + zip.extractall(verify_dir) for file_info in zip.infolist(): - assert file_info.date_time == EXPECTED_FILE_DATE_TIME - mode = (file_info.external_attr >> 16) & 0xFFFF assert mode == EXPECTED_FILE_MODE + assert file_info.date_time == EXPECTED_FILE_DATE_TIME + + assert verify_dir / "project_1" / "__init__.py" + assert verify_dir / "project_1" / "project1.py" + assert verify_dir / "small_dependency" / "__init__.py" + assert verify_dir / "small_dependency" / "small_dependency.py" From 625da8a14f7ff009e2dd916dd25c6838e78d795a Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 1 May 2025 14:49:51 -0500 Subject: [PATCH 6/6] fix: relative pathing and tests --- package_python_function/packager.py | 4 +++- tests/test_package_python_function.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/package_python_function/packager.py b/package_python_function/packager.py index aff8a86..20687d0 100644 --- a/package_python_function/packager.py +++ b/package_python_function/packager.py @@ -63,7 +63,9 @@ def zip_dir(path: Path) -> None: if item.is_dir(): zip_dir(item) else: - zinfo = zipfile.ZipInfo.from_file(item) + zinfo = zipfile.ZipInfo.from_file( + item, item.relative_to(self.input_path) + ) zinfo.date_time = date_time() zinfo.external_attr = 0o644 << 16 self._uncompressed_bytes += item.stat().st_size diff --git a/tests/test_package_python_function.py b/tests/test_package_python_function.py index 91a935e..32a30a5 100644 --- a/tests/test_package_python_function.py +++ b/tests/test_package_python_function.py @@ -49,7 +49,7 @@ def test_package_python_function(tmp_path: Path) -> None: assert mode == EXPECTED_FILE_MODE assert file_info.date_time == EXPECTED_FILE_DATE_TIME - assert verify_dir / "project_1" / "__init__.py" - assert verify_dir / "project_1" / "project1.py" - assert verify_dir / "small_dependency" / "__init__.py" - assert verify_dir / "small_dependency" / "small_dependency.py" + assert (verify_dir / "project_1" / "__init__.py").exists() + assert (verify_dir / "project_1" / "project1.py").exists() + assert (verify_dir / "small_dependency" / "__init__.py").exists() + assert (verify_dir / "small_dependency" / "small_dependency.py").exists()