From c5613ab7466c580bb16c5aa572fa8ad95d51dc04 Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 8 May 2025 14:14:24 -0500 Subject: [PATCH 1/4] chore: ignore tests in coverage --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 61a800b..9f08c3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,10 @@ addopts = """ --capture=tee-sys """ +[tool.coverage.run] +branch = true +omit = ["*/tests/**"] + [tool.pipx-install] poetry = "==2.1.1" poethepoet = "==0.33.1" From 373357b147cbbd545173b2e43dfb200b0717f1d7 Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 8 May 2025 11:27:37 -0500 Subject: [PATCH 2/4] feat: implement strip mode Strips all `.py{c,o}`, `RECORD`, `direct_url.json`, and `__pycache__` files to ensure reproducible builds. --- package_python_function/packager.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/package_python_function/packager.py b/package_python_function/packager.py index 0805b1f..77ea856 100644 --- a/package_python_function/packager.py +++ b/package_python_function/packager.py @@ -13,6 +13,9 @@ class Packager: AWS_LAMBDA_MAX_UNZIP_SIZE = 262_144_000 + DIRS_TO_EXCLUDE = ["__pycache__"] + DIST_INFO_FILES_TO_EXCLUDE = ["RECORD", "direct_url.json"] + EXTENSIONS_TO_EXCLUDE = [".pyc", ".pyo"] def __init__(self, venv_path: Path, project_path: Path, output_dir: Path, output_file: Path | None): self.project = PythonProject(project_path) @@ -45,10 +48,18 @@ def zip_all_dependencies(self, target_path: Path) -> None: def zip_dir(path: Path) -> None: for item in path.iterdir(): if item.is_dir(): - zip_dir(item) + if item.name not in self.DIRS_TO_EXCLUDE: + zip_dir(item) else: - self._uncompressed_bytes += item.stat().st_size - zip_file.write_reproducibly(item, item.relative_to(self.input_path)) + is_excluded_by_extension = item.suffix in self.EXTENSIONS_TO_EXCLUDE + is_excluded_dist_info_file = ( + item.name in 
self.DIST_INFO_FILES_TO_EXCLUDE + if item.parent.name.endswith(".dist-info") + else False + ) + if not (is_excluded_by_extension or is_excluded_dist_info_file): + self._uncompressed_bytes += item.stat().st_size + zip_file.write_reproducibly(item, item.relative_to(self.input_path)) zip_dir(self.input_path) From 417a5f702b0a306880779ba62661c169cd77e2d0 Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 8 May 2025 13:54:11 -0500 Subject: [PATCH 3/4] feat: add tests for new `strip` mode --- tests/conftest.py | 46 +++++++++++++++++++++------ tests/test_package_python_function.py | 11 +++++-- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9ce2653..133ff2f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import json from dataclasses import dataclass from pathlib import Path from typing import Self @@ -24,6 +25,7 @@ def new(cls, path: str, contents: str = "") -> Self: @dataclass class Data: + files_excluded_from_bundle: list[File] # relative to packages_dir project_files: list[File] # relative to packages_dir pyproject: PythonProject python_version: str @@ -34,10 +36,12 @@ def new( cls, project_name: str, project_files: list[File], + files_excluded_from_bundle: list[File], python_version: str = "3.13", ) -> Self: pyproject = _new_python_project(name=project_name) return cls( + files_excluded_from_bundle=files_excluded_from_bundle, project_files=project_files, pyproject=pyproject, python_version=python_version, @@ -86,25 +90,47 @@ def verify_file_reproducibility(file_info: list[ZipInfo], expected_file_date_tim assert info.date_time == expected_file_date_time @pytest.fixture -def test_data(tmp_path: Path): +def test_files(tmp_path: Path): + files_excluded_from_bundle = [ + File.new("__pycache__/_virtualenv.cpython-313.pyc"), + File.new("project_1.dist-info/RECORD"), + File.new("project_1.dist-info/direct_url.json", contents=json.dumps({"url": str(tmp_path)})), + ] files = [ 
File.new("project_1/__init__.py"), File.new("project_1/project1.py"), + File.new("project_1.dist-info/METADATA"), File.new("small_dependency/__init__.py"), File.new("small_dependency/small_dependency.py", "# This is a small dependency"), + *files_excluded_from_bundle, ] - data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path) - yield data + yield files, files_excluded_from_bundle, tmp_path @pytest.fixture -def test_data_nested(tmp_path: Path): - files = [ - File.new("project_1/__init__.py"), - File.new("project_1/project1.py"), - File.new("small_dependency/__init__.py"), - File.new("small_dependency/small_dependency.py", "# This is a small dependency"), +def test_files_nested(test_files): + files, files_excluded_from_bundle, tmp_path = test_files + big_files = [ File.new("gigantic_dependency/__init__.py"), File.new("gigantic_dependency/gigantic.py", "a" * Packager.AWS_LAMBDA_MAX_UNZIP_SIZE), ] - data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path) + yield [*files, *big_files], files_excluded_from_bundle, tmp_path + +@pytest.fixture +def test_data(test_files): + files, files_excluded_from_bundle, loc = test_files + data = Data.new( + project_name="project-1", + project_files=files, + files_excluded_from_bundle=files_excluded_from_bundle, + ).commit(loc=loc) + yield data + +@pytest.fixture +def test_data_nested(test_files_nested): + files, files_excluded_from_bundle, loc = test_files_nested + data = Data.new( + project_name="project-1-nested", + project_files=files, + files_excluded_from_bundle=files_excluded_from_bundle, + ).commit(loc=loc) yield data diff --git a/tests/test_package_python_function.py b/tests/test_package_python_function.py index 1ca8fd7..1714ecf 100644 --- a/tests/test_package_python_function.py +++ b/tests/test_package_python_function.py @@ -67,7 +67,10 @@ def test_package_python_function( verify_file_reproducibility(zip.infolist(), expected_file_date_time=expected_date_time) for file in 
test_data.project_files: - assert (verify_dir / file.path).exists() + if file in test_data.files_excluded_from_bundle: + assert not (verify_dir / file.path).exists() + else: + assert (verify_dir / file.path).exists() @pytest.mark.parametrize( "src_epoch, expected_exception, expected_date_time", @@ -131,5 +134,9 @@ def test_package_python_function_nested( with zipfile.ZipFile(inner_zip, "r") as izip: izip.extractall(verify_dir) verify_file_reproducibility(izip.infolist(), expected_file_date_time=expected_date_time) + for file in test_data_nested.project_files: - assert (verify_dir / file.path).exists() + if file in test_data_nested.files_excluded_from_bundle: + assert not (verify_dir / file.path).exists() + else: + assert (verify_dir / file.path).exists() From eb420836836d430231b5fb339154ba6e7a31103f Mon Sep 17 00:00:00 2001 From: Alex Jackson Date: Thu, 8 May 2025 14:30:39 -0500 Subject: [PATCH 4/4] chore: update README --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c759ba..95ed3ea 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,26 @@ One of the following must be specified: - `--output-dir`: The output directory for the final zip file. The name of the zip file will be based on the project's name in the `pyproject.toml` file (with dashes replaced with underscores). -## A Note on Reproducibility +## Notes on Reproducibility + +### Timestamps The ZIP files generated adhere with [reproducible builds](https://reproducible-builds.org/docs/archives/). This means that file permissions and timestamps are modified inside the ZIP, such that the ZIP will have a deterministic hash. By default, the date is set to `1980-01-01`. Additionally, the tool respects the standardized `$SOURCE_DATE_EPOCH` [environment variable](https://reproducible-builds.org/docs/source-date-epoch/), which will allow you to set that date as needed. 
One important caveat is that ZIP files do not support files with timestamps earlier than `1980-01-01` inside them, due to MS-DOS compatibility. Therefore, the tool will throw a `SourceDateEpochError` is `$SOURCE_DATE_EPOCH` is below `315532800`. + +### Files with embedded full paths + +In testing, we found that several file types can leak information from the machine that generated the virtual environment. + +To get around this, the tool removes the following files: + +```gitignore +**/__pycache__/ +**/*.dist-info/direct_url.json +**/*.dist-info/RECORD +**/*.pyc +**/*.pyo +```