Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,26 @@ One of the following must be specified:
- `--output-dir`: The output directory for the final zip file. The name of the zip file will be based on the project's
name in the `pyproject.toml` file (with dashes replaced with underscores).

## A Note on Reproducibility
## Notes on Reproducibility

### Timestamps

The generated ZIP files adhere to the [reproducible builds](https://reproducible-builds.org/docs/archives/) guidelines. This means that file permissions and timestamps are modified inside the ZIP, such that the ZIP will have a deterministic hash. By default, the date is set to `1980-01-01`.

Additionally, the tool respects the standardized `$SOURCE_DATE_EPOCH` [environment variable](https://reproducible-builds.org/docs/source-date-epoch/), which will allow you to set that date as needed.

One important caveat is that ZIP files do not support files with timestamps earlier than `1980-01-01` inside them, due to MS-DOS compatibility. Therefore, the tool will throw a `SourceDateEpochError` if `$SOURCE_DATE_EPOCH` is below `315532800`.

### Files with embedded full paths

In testing, we found that several file types can leak information from the machine that generated the virtual environment.

To get around this, the tool removes the following files:

```gitignore
**/__pycache__/
**/*.dist-info/direct_url.json
**/*.dist-info/RECORD
**/*.pyc
**/*.pyo
```
17 changes: 14 additions & 3 deletions package_python_function/packager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

class Packager:
AWS_LAMBDA_MAX_UNZIP_SIZE = 262_144_000
DIRS_TO_EXCLUDE = ["__pycache__"]
DIST_INFO_FILES_TO_EXCLUDE = ["RECORD", "direct_url.json"]
EXTENSIONS_TO_EXCLUDE = [".pyc", ".pyo"]

def __init__(self, venv_path: Path, project_path: Path, output_dir: Path, output_file: Path | None):
self.project = PythonProject(project_path)
Expand Down Expand Up @@ -45,10 +48,18 @@ def zip_all_dependencies(self, target_path: Path) -> None:
def zip_dir(path: Path) -> None:
for item in path.iterdir():
if item.is_dir():
zip_dir(item)
if item.name not in self.DIRS_TO_EXCLUDE:
zip_dir(item)
else:
self._uncompressed_bytes += item.stat().st_size
zip_file.write_reproducibly(item, item.relative_to(self.input_path))
is_excluded_by_extension = item.suffix in self.EXTENSIONS_TO_EXCLUDE
is_excluded_dist_info_file = (
item.name in self.DIST_INFO_FILES_TO_EXCLUDE
if item.parent.name.endswith(".dist-info")
else False
)
if not (is_excluded_by_extension or is_excluded_dist_info_file):
self._uncompressed_bytes += item.stat().st_size
zip_file.write_reproducibly(item, item.relative_to(self.input_path))

zip_dir(self.input_path)

Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ addopts = """
--capture=tee-sys
"""

[tool.coverage.run]
branch = true
omit = ["*/tests/**"]

[tool.pipx-install]
poetry = "==2.1.1"
poethepoet = "==0.33.1"
Expand Down
46 changes: 36 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Self
Expand All @@ -24,6 +25,7 @@ def new(cls, path: str, contents: str = "") -> Self:

@dataclass
class Data:
files_excluded_from_bundle: list[File] # relative to packages_dir
project_files: list[File] # relative to packages_dir
pyproject: PythonProject
python_version: str
Expand All @@ -34,10 +36,12 @@ def new(
cls,
project_name: str,
project_files: list[File],
files_excluded_from_bundle: list[File],
python_version: str = "3.13",
) -> Self:
pyproject = _new_python_project(name=project_name)
return cls(
files_excluded_from_bundle=files_excluded_from_bundle,
project_files=project_files,
pyproject=pyproject,
python_version=python_version,
Expand Down Expand Up @@ -86,25 +90,47 @@ def verify_file_reproducibility(file_info: list[ZipInfo], expected_file_date_tim
assert info.date_time == expected_file_date_time

@pytest.fixture
def test_data(tmp_path: Path):
def test_files(tmp_path: Path):
files_excluded_from_bundle = [
File.new("__pycache__/_virtualenv.cpython-313.pyc"),
File.new("project_1.dist-info/RECORD"),
File.new("project_1.dist-info/direct_url.json", contents=json.dumps({"url": str(tmp_path)})),
]
files = [
File.new("project_1/__init__.py"),
File.new("project_1/project1.py"),
File.new("project_1.dist-info/METADATA"),
File.new("small_dependency/__init__.py"),
File.new("small_dependency/small_dependency.py", "# This is a small dependency"),
*files_excluded_from_bundle,
]
data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path)
yield data
yield files, files_excluded_from_bundle, tmp_path

@pytest.fixture
def test_data_nested(tmp_path: Path):
files = [
File.new("project_1/__init__.py"),
File.new("project_1/project1.py"),
File.new("small_dependency/__init__.py"),
File.new("small_dependency/small_dependency.py", "# This is a small dependency"),
def test_files_nested(test_files):
files, files_excluded_from_bundle, tmp_path = test_files
big_files = [
File.new("gigantic_dependency/__init__.py"),
File.new("gigantic_dependency/gigantic.py", "a" * Packager.AWS_LAMBDA_MAX_UNZIP_SIZE),
]
data = Data.new(project_name="project-1", project_files=files).commit(loc=tmp_path)
yield [*files, *big_files], files_excluded_from_bundle, tmp_path

# Fixture: yields the `Data` object used by the standard (non-nested) packaging test.
# It unpacks the (files, files_excluded_from_bundle, tmp_path) tuple yielded by the
# `test_files` fixture. Note that `files` already contains every entry of
# `files_excluded_from_bundle`, so tests must check membership in the excluded list
# to decide whether a project file is expected in the bundle.
@pytest.fixture
def test_data(test_files):
files, files_excluded_from_bundle, loc = test_files
data = Data.new(
project_name="project-1",
project_files=files,
files_excluded_from_bundle=files_excluded_from_bundle,
).commit(loc=loc)
# NOTE(review): `commit` presumably materializes the files under `loc`; its body is not visible here — confirm.
yield data

# Fixture: yields the `Data` object used by the nested-zip packaging test.
# It unpacks the tuple yielded by `test_files_nested`, whose file list is the base
# `test_files` list plus a `gigantic_dependency` package containing a file of
# `Packager.AWS_LAMBDA_MAX_UNZIP_SIZE` characters.
# The project name differs from the one used by `test_data` ("project-1-nested").
@pytest.fixture
def test_data_nested(test_files_nested):
files, files_excluded_from_bundle, loc = test_files_nested
data = Data.new(
project_name="project-1-nested",
project_files=files,
files_excluded_from_bundle=files_excluded_from_bundle,
).commit(loc=loc)
# NOTE(review): `commit` presumably materializes the files under `loc`; its body is not visible here — confirm.
yield data
11 changes: 9 additions & 2 deletions tests/test_package_python_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ def test_package_python_function(
verify_file_reproducibility(zip.infolist(), expected_file_date_time=expected_date_time)

for file in test_data.project_files:
assert (verify_dir / file.path).exists()
if file in test_data.files_excluded_from_bundle:
assert not (verify_dir / file.path).exists()
else:
assert (verify_dir / file.path).exists()

@pytest.mark.parametrize(
"src_epoch, expected_exception, expected_date_time",
Expand Down Expand Up @@ -131,5 +134,9 @@ def test_package_python_function_nested(
with zipfile.ZipFile(inner_zip, "r") as izip:
izip.extractall(verify_dir)
verify_file_reproducibility(izip.infolist(), expected_file_date_time=expected_date_time)

for file in test_data_nested.project_files:
assert (verify_dir / file.path).exists()
if file in test_data_nested.files_excluded_from_bundle:
assert not (verify_dir / file.path).exists()
else:
assert (verify_dir / file.path).exists()