From 0cf98873652c28e4aed81b460718e496513ad6b7 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 17 Mar 2026 18:06:48 +0000 Subject: [PATCH 1/3] dumpfile: Canonicalize parsed entries per composefs-dump(5) The composefs-dump(5) spec leaves several fields unspecified or explicitly ignored. Canonicalize them at parse time so that parsed entries have a single canonical representation regardless of which implementation produced them: - **Directory sizes**: "This is ignored for directories." Drop the size field from Item::Directory, always emit 0. - **Hardlink metadata**: "We ignore all the fields except the payload." Zero uid/gid/mode/mtime and skip xattrs, matching the C parser which bails out early (mkcomposefs.c:477-491). - **Xattr ordering**: The spec doesn't define an order. Sort lexicographically so output is deterministic regardless of on-disk ordering. The parser still accepts any input values for backward compatibility. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/dumpfile.rs | 6 +- crates/composefs/src/dumpfile_parse.rs | 114 ++++++++++++++---- crates/composefs/src/erofs/reader.rs | 82 +++++++------ .../composefs/src/tests/assets/special.dump | 4 +- 4 files changed, 138 insertions(+), 68 deletions(-) diff --git a/crates/composefs/src/dumpfile.rs b/crates/composefs/src/dumpfile.rs index 88159bcc..ad6c645e 100644 --- a/crates/composefs/src/dumpfile.rs +++ b/crates/composefs/src/dumpfile.rs @@ -561,7 +561,7 @@ mod tests { use super::*; use crate::fsverity::Sha256HashValue; - const SIMPLE_DUMP: &str = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + const SIMPLE_DUMP: &str = r#"/ 0 40755 2 0 0 0 1000.0 - - - /empty_file 0 100644 1 0 0 0 1000.0 - - - /small_file 5 100644 1 0 0 0 1000.0 - hello - /symlink 7 120777 1 0 0 0 1000.0 /target - - @@ -592,10 +592,10 @@ mod tests { // The nlink/uid/gid/rdev fields on hardlink lines use `-` here, // matching the C composefs writer convention. The parser must // accept these without trying to parse them as integers. - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /original 11 100644 2 0 0 0 1000.0 - hello_world - /hardlink1 0 @120000 - - - - 0.0 /original - - -/dir1 4096 40755 2 0 0 0 1000.0 - - - +/dir1 0 40755 2 0 0 0 1000.0 - - - /dir1/hardlink2 0 @120000 - - - - 0.0 /original - - "#; diff --git a/crates/composefs/src/dumpfile_parse.rs b/crates/composefs/src/dumpfile_parse.rs index ea4902c3..7e3e436b 100644 --- a/crates/composefs/src/dumpfile_parse.rs +++ b/crates/composefs/src/dumpfile_parse.rs @@ -33,7 +33,7 @@ const XATTR_LIST_MAX: usize = u16::MAX as usize; // See above const XATTR_SIZE_MAX: usize = u16::MAX as usize; -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] /// An extended attribute entry pub struct Xattr<'k> { /// key @@ -122,8 +122,6 @@ pub enum Item<'p> { }, /// A directory Directory { - /// Size of a directory is not necessarily meaningful - size: u64, /// Number of links nlink: u32, }, @@ -375,16 +373,14 @@ impl<'p> Entry<'p> { (false, u32::from_str_radix(modeval, 8)?) }; - // For hardlinks, the C parser skips the remaining numeric fields - // (nlink, uid, gid, rdev, mtime) and only reads the payload (target - // path). We match that: consume the tokens without parsing them as - // integers, so values like `-` are accepted. + // Per composefs-dump(5): for hardlinks "we ignore all the fields + // except the payload." The C parser does the same (mkcomposefs.c + // bails out early). Skip everything and zero ignored fields. if is_hardlink { let ty = FileType::from_raw_mode(mode); if ty == FileType::Directory { anyhow::bail!("Invalid hardlinked directory"); } - // Skip nlink, uid, gid, rdev, mtime for field in ["nlink", "uid", "gid", "rdev", "mtime"] { next(field)?; } @@ -395,7 +391,7 @@ impl<'p> Entry<'p> { path, uid: 0, gid: 0, - mode, + mode: 0, mtime: Mtime { sec: 0, nsec: 0 }, item: Item::Hardlink { target }, xattrs: Vec::new(), @@ -410,7 +406,7 @@ impl<'p> Entry<'p> { let payload = optional_str(next("payload")?); let content = optional_str(next("content")?); let fsverity_digest = optional_str(next("digest")?); - let xattrs = components + let mut xattrs = components .try_fold((Vec::new(), 0usize), |(mut acc, total_namelen), line| { let xattr = Xattr::parse(line)?; // Limit the total length of keys. @@ -422,6 +418,10 @@ impl<'p> Entry<'p> { Ok((acc, total_namelen)) })? .0; + // Canonicalize xattr ordering — the composefs-dump(5) spec doesn't + // define an order, and different implementations emit them differently + // (C uses EROFS on-disk order, Rust uses BTreeMap order). + xattrs.sort(); let ty = FileType::from_raw_mode(mode); let item = { @@ -474,8 +474,9 @@ impl<'p> Entry<'p> { FileType::Directory => { Self::check_nonregfile(content, fsverity_digest)?; Self::check_rdev(rdev)?; - - Item::Directory { size, nlink } + // Per composefs-dump(5): "SIZE: The size of the file. + // This is ignored for directories." We discard it. + Item::Directory { nlink } } FileType::Socket => { anyhow::bail!("sockets are not supported"); @@ -512,8 +513,10 @@ impl<'p> Entry<'p> { impl Item<'_> { pub(crate) fn size(&self) -> u64 { match self { - Item::Regular { size, .. } | Item::Directory { size, .. } => *size, + Item::Regular { size, .. } => *size, Item::RegularInline { content, .. } => content.len() as u64, + // Directories always report 0; the spec says size is ignored. + Item::Directory { .. } => 0, _ => 0, } } @@ -556,9 +559,14 @@ impl Display for Mtime { impl Display for Entry<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { escape(f, self.path.as_os_str().as_bytes(), EscapeMode::Standard)?; + let hardlink_prefix = if matches!(self.item, Item::Hardlink { .. }) { + "@" + } else { + "" + }; write!( f, - " {} {:o} {} {} {} {} {} ", + " {} {hardlink_prefix}{:o} {} {} {} {} {} ", self.item.size(), self.mode, self.item.nlink(), @@ -858,18 +866,78 @@ mod tests { fn test_parse() { const CONTENT: &str = include_str!("tests/assets/special.dump"); for line in CONTENT.lines() { - // Test a full round trip by parsing, serialize, parsing again + // Test a full round trip by parsing, serializing, parsing again. + // The serialized form may differ from the input (e.g. xattr + // ordering is canonicalized), so we check structural equality + // and that serialization is idempotent. let e = Entry::parse(line).unwrap(); let serialized = e.to_string(); - if line != serialized { - dbg!(&line, &e, &serialized); - } - similar_asserts::assert_eq!(line, serialized); let e2 = Entry::parse(&serialized).unwrap(); similar_asserts::assert_eq!(e, e2); + // Serialization must be idempotent + similar_asserts::assert_eq!(serialized, e2.to_string()); } } + #[test] + fn test_canonicalize_directory_size() { + // Directory size should be discarded — any input value becomes 0 + let e = Entry::parse("/ 4096 40755 2 0 0 0 1000.0 - - -").unwrap(); + assert_eq!(e.item.size(), 0); + assert!(e.to_string().starts_with("/ 0 40755")); + + let e = Entry::parse("/ 99999 40755 2 0 0 0 1000.0 - - -").unwrap(); + assert_eq!(e.item.size(), 0); + } + + #[test] + fn test_canonicalize_hardlink_metadata() { + // Hardlink metadata fields should all be zeroed — only path and + // target (payload) are meaningful per composefs-dump(5). + let e = Entry::parse( + "/link 259 @100644 3 1000 1000 0 1695368732.385062094 /original - \ + 35d02f81325122d77ec1d11baba655bc9bf8a891ab26119a41c50fa03ddfb408 \ + security.selinux=foo", + ) + .unwrap(); + + // All metadata zeroed + assert_eq!(e.uid, 0); + assert_eq!(e.gid, 0); + assert_eq!(e.mode, 0); + assert_eq!(e.mtime, Mtime { sec: 0, nsec: 0 }); + assert!(e.xattrs.is_empty()); + + // Target preserved + match &e.item { + Item::Hardlink { target } => assert_eq!(target.as_ref(), Path::new("/original")), + other => panic!("Expected Hardlink, got {other:?}"), + } + + // Serialization uses @0 for mode, zeroed fields + let s = e.to_string(); + assert!(s.contains("@0 0 0 0 0 0.0"), "got: {s}"); + } + + #[test] + fn test_canonicalize_xattr_ordering() { + // Xattrs should be sorted by key regardless of input order + let e = Entry::parse("/ 0 40755 2 0 0 0 0.0 - - - user.z=1 security.ima=2 trusted.a=3") + .unwrap(); + + let keys: Vec<&[u8]> = e.xattrs.iter().map(|x| x.key.as_bytes()).collect(); + assert_eq!( + keys, + vec![b"security.ima".as_slice(), b"trusted.a", b"user.z"], + "xattrs should be sorted by key" + ); + + // Re-serialization preserves sorted order + let s = e.to_string(); + let e2 = Entry::parse(&s).unwrap(); + assert_eq!(e, e2); + } + fn parse_all(name: &str, s: &str) -> Result<()> { for line in s.lines() { if line.is_empty() { @@ -886,10 +954,10 @@ mod tests { const CASES: &[(&str, &str)] = &[ ( "content in fifo", - "/ 4096 40755 2 0 0 0 0.0 - - -\n/fifo 0 10777 1 0 0 0 0.0 - foobar -", + "/ 0 40755 2 0 0 0 0.0 - - -\n/fifo 0 10777 1 0 0 0 0.0 - foobar -", ), - ("root with rdev", "/ 4096 40755 2 0 0 42 0.0 - - -"), - ("root with fsverity", "/ 4096 40755 2 0 0 0 0.0 - - 35d02f81325122d77ec1d11baba655bc9bf8a891ab26119a41c50fa03ddfb408"), + ("root with rdev", "/ 0 40755 2 0 0 42 0.0 - - -"), + ("root with fsverity", "/ 0 40755 2 0 0 0 0.0 - - 35d02f81325122d77ec1d11baba655bc9bf8a891ab26119a41c50fa03ddfb408"), ]; for (name, case) in CASES.iter().copied() { assert!( @@ -919,7 +987,7 @@ mod tests { #[test] fn test_load_cfs_filtered() -> Result<()> { const FILTERED: &str = - "/ 4096 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2\n\ + "/ 0 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2\n\ /blockdev 0 60777 1 0 0 107690 1633950376.0 - - - trusted.bar=bar-2\n\ /inline 15 100777 1 0 0 0 1633950376.0 - FOOBAR\\nINAFILE\\n - user.foo=bar-2\n"; let mut tmpf = tempfile::tempfile()?; diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index c52f275e..5d6a049a 100644 --- a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -1448,8 +1448,8 @@ mod tests { #[test] fn test_empty_directory() { // Create filesystem with empty directory - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/empty_dir 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/empty_dir 0 40755 2 0 0 0 1000.0 - - - "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); @@ -1491,8 +1491,8 @@ mod tests { #[test] fn test_directory_with_inline_entries() { // Create filesystem with directory that has a few entries (should be inline) - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/dir1 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/dir1 0 40755 2 0 0 0 1000.0 - - - /dir1/file1 5 100644 1 0 0 0 1000.0 - hello - /dir1/file2 5 100644 1 0 0 0 1000.0 - world - "#; @@ -1532,8 +1532,8 @@ mod tests { #[test] fn test_directory_with_many_entries() { // Create a directory with many entries to force block storage - let mut dumpfile = String::from("/ 4096 40755 2 0 0 0 1000.0 - - -\n"); - dumpfile.push_str("/bigdir 4096 40755 2 0 0 0 1000.0 - - -\n"); + let mut dumpfile = String::from("/ 0 40755 2 0 0 0 1000.0 - - -\n"); + dumpfile.push_str("/bigdir 0 40755 2 0 0 0 1000.0 - - -\n"); // Add many files to force directory blocks for i in 0..100 { @@ -1585,10 +1585,10 @@ mod tests { #[test] fn test_nested_directories() { // Test deeply nested directory structure - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/a 4096 40755 2 0 0 0 1000.0 - - - -/a/b 4096 40755 2 0 0 0 1000.0 - - - -/a/b/c 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/a 0 40755 2 0 0 0 1000.0 - - - +/a/b 0 40755 2 0 0 0 1000.0 - - - +/a/b/c 0 40755 2 0 0 0 1000.0 - - - /a/b/c/file.txt 5 100644 1 0 0 0 1000.0 - hello - "#; @@ -1622,12 +1622,12 @@ mod tests { #[test] fn test_mixed_entry_types() { // Test directory with various file types - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/mixed 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/mixed 0 40755 2 0 0 0 1000.0 - - - /mixed/regular 10 100644 1 0 0 0 1000.0 - content123 - /mixed/symlink 7 120777 1 0 0 0 1000.0 /target - - /mixed/fifo 0 10644 1 0 0 0 1000.0 - - - -/mixed/subdir 4096 40755 2 0 0 0 1000.0 - - - +/mixed/subdir 0 40755 2 0 0 0 1000.0 - - - "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); @@ -1668,11 +1668,11 @@ mod tests { #[test] fn test_collect_objects_traversal() { // Test that object collection properly traverses all directories - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/dir1 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/dir1 0 40755 2 0 0 0 1000.0 - - - /dir1/file1 5 100644 1 0 0 0 1000.0 - hello - -/dir2 4096 40755 2 0 0 0 1000.0 - - - -/dir2/subdir 4096 40755 2 0 0 0 1000.0 - - - +/dir2 0 40755 2 0 0 0 1000.0 - - - +/dir2/subdir 0 40755 2 0 0 0 1000.0 - - - /dir2/subdir/file2 5 100644 1 0 0 0 1000.0 - world - "#; @@ -1705,8 +1705,8 @@ mod tests { // . and .. entries). // Generate a C-generated erofs image using mkcomposefs - let dumpfile_content = r#"/ 4096 40755 2 0 0 0 1000.0 - - - -/empty_dir 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile_content = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/empty_dir 0 40755 2 0 0 0 1000.0 - - - "#; // Create temporary files for dumpfile and erofs output @@ -1745,10 +1745,10 @@ mod tests { #[test] fn test_round_trip_basic() { // Full round-trip: dumpfile -> tree -> erofs -> read back -> validate - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /file1 5 100644 1 0 0 0 1000.0 - hello - /file2 6 100644 1 0 0 0 1000.0 - world! - -/dir1 4096 40755 2 0 0 0 1000.0 - - - +/dir1 0 40755 2 0 0 0 1000.0 - - - /dir1/nested 8 100644 1 0 0 0 1000.0 - content1 - "#; @@ -1812,14 +1812,14 @@ mod tests { #[test] fn test_erofs_to_filesystem_empty_root() { - let dumpfile = "/ 4096 40755 2 0 0 0 1000.0 - - -\n"; + let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; let (orig, rt) = round_trip_dumpfile(dumpfile); assert_eq!(orig, rt); } #[test] fn test_erofs_to_filesystem_inline_files() { - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /empty 0 100644 1 0 0 0 1000.0 - - - /hello 5 100644 1 0 0 0 1000.0 - hello - /world 6 100644 1 0 0 0 1000.0 - world! - @@ -1830,7 +1830,7 @@ mod tests { #[test] fn test_erofs_to_filesystem_symlinks() { - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /link1 7 120777 1 0 0 0 1000.0 /target - - /link2 11 120777 1 0 0 0 1000.0 /other/path - - "#; @@ -1840,10 +1840,10 @@ mod tests { #[test] fn test_erofs_to_filesystem_nested_dirs() { - let dumpfile = r#"/ 4096 40755 3 0 0 0 1000.0 - - - -/a 4096 40755 3 0 0 0 1000.0 - - - -/a/b 4096 40755 3 0 0 0 1000.0 - - - -/a/b/c 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 3 0 0 0 1000.0 - - - +/a 0 40755 3 0 0 0 1000.0 - - - +/a/b 0 40755 3 0 0 0 1000.0 - - - +/a/b/c 0 40755 2 0 0 0 1000.0 - - - /a/b/c/file.txt 5 100644 1 0 0 0 1000.0 - hello - /a/b/other 3 100644 1 0 0 0 1000.0 - abc - "#; @@ -1853,7 +1853,7 @@ mod tests { #[test] fn test_erofs_to_filesystem_devices_and_fifos() { - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /blk 0 60660 1 0 0 2049 1000.0 - - - /chr 0 20666 1 0 0 1025 1000.0 - - - /fifo 0 10644 1 0 0 0 1000.0 - - - @@ -1865,7 +1865,7 @@ mod tests { #[test] fn test_erofs_to_filesystem_xattrs() { let dumpfile = - "/ 4096 40755 2 0 0 0 1000.0 - - - security.selinux=system_u:object_r:root_t:s0\n\ + "/ 0 40755 2 0 0 0 1000.0 - - - security.selinux=system_u:object_r:root_t:s0\n\ /file 5 100644 1 0 0 0 1000.0 - hello - user.myattr=myvalue\n"; let (orig, rt) = round_trip_dumpfile(dumpfile); assert_eq!(orig, rt); @@ -1875,7 +1875,7 @@ mod tests { fn test_erofs_to_filesystem_escaped_overlay_xattrs() { // The writer escapes trusted.overlay.X to trusted.overlay.overlay.X. // Round-tripping must preserve the original xattr name. - let dumpfile = "/ 4096 40755 2 0 0 0 1000.0 - - -\n\ + let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n\ /file 5 100644 1 0 0 0 1000.0 - hello - trusted.overlay.custom=val\n"; let (orig, rt) = round_trip_dumpfile(dumpfile); assert_eq!(orig, rt); @@ -1891,7 +1891,7 @@ mod tests { let digest = "a".repeat(64); let pathname = format!("{}/{}", &digest[..2], &digest[2..]); let dumpfile = format!( - "/ 4096 40755 2 0 0 0 1000.0 - - -\n\ + "/ 0 40755 2 0 0 0 1000.0 - - -\n\ /ext 1000000000 100644 1 0 0 0 1000.0 {pathname} - {digest}\n" ); let (orig, rt) = round_trip_dumpfile(&dumpfile); @@ -1900,7 +1900,7 @@ mod tests { #[test] fn test_erofs_to_filesystem_hardlinks() { - let dumpfile = r#"/ 4096 40755 2 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - /original 11 100644 2 0 0 0 1000.0 - hello_world - /hardlink 0 @120000 2 0 0 0 0.0 /original - - "#; @@ -1933,10 +1933,10 @@ mod tests { #[test] fn test_erofs_to_filesystem_mixed_types() { - let dumpfile = r#"/ 4096 40755 3 0 0 0 1000.0 - - - + let dumpfile = r#"/ 0 40755 3 0 0 0 1000.0 - - - /blk 0 60660 1 0 6 259 1000.0 - - - /chr 0 20666 1 0 6 1025 1000.0 - - - -/dir 4096 40755 2 42 42 0 2000.0 - - - +/dir 0 40755 2 42 42 0 2000.0 - - - /dir/nested 3 100644 1 42 42 0 2000.0 - abc - /fifo 0 10644 1 0 0 0 1000.0 - - - /hello 5 100644 1 1000 1000 0 1500.0 - hello - @@ -1949,7 +1949,7 @@ mod tests { #[test] fn test_restrict_to_composefs_rejects_unsupported_features() { // Build a minimal valid composefs image (just a root directory). - let dumpfile = "/ 4096 40755 2 0 0 0 1000.0 - - -\n"; + let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); let base_image = mkfs_erofs(&fs); @@ -2079,7 +2079,7 @@ mod tests { // The corrupted size must be a multiple of block_size so that // additional_bytes() (which uses `size % block_size` for FlatInline) // stays the same and the inode still parses successfully. - let dumpfile = "/ 4096 40755 1 0 0 0 0.0 - - -\n"; + let dumpfile = "/ 0 40755 1 0 0 0 0.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); let mut image = mkfs_erofs(&fs); @@ -2124,8 +2124,7 @@ mod tests { use crate::test::proptest_strategies::{build_filesystem, filesystem_spec}; use proptest::prelude::*; - /// Round-trip a FileSystem through erofs with a given ObjectID type - /// and compare dumpfile output before and after. + /// Round-trip a FileSystem through erofs and compare dumpfile output. fn round_trip_filesystem( fs_orig: &tree::FileSystem, ) { @@ -2138,7 +2137,10 @@ mod tests { let mut rt_output = Vec::new(); write_dumpfile(&mut rt_output, &fs_rt).unwrap(); - assert_eq!(orig_output, rt_output); + similar_asserts::assert_eq!( + String::from_utf8_lossy(&orig_output), + String::from_utf8_lossy(&rt_output) + ); } proptest! { diff --git a/crates/composefs/src/tests/assets/special.dump b/crates/composefs/src/tests/assets/special.dump index 7665fdd7..2afe62f4 100644 --- a/crates/composefs/src/tests/assets/special.dump +++ b/crates/composefs/src/tests/assets/special.dump @@ -1,7 +1,7 @@ -/ 4096 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2 +/ 0 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2 /blockdev 0 60777 1 0 0 107690 1633950376.0 - - - trusted.bar=bar-2 /chardev 0 20777 1 0 0 10769 1633950376.0 - - - trusted.foo=bar-2 -/escaped-xattr 0 100777 1 0 0 0 1633950376.0 - - - trusted.overlay.redirect=/foo\n user.overlay.redirect=/foo\n user.foo=bar-2 +/escaped-xattr 0 100777 1 0 0 0 1633950376.0 - - - trusted.overlay.redirect=/foo\n user.foo=bar-2 user.overlay.redirect=/foo\n /external 42 100755 1 0 0 0 1731497312.0 70/a9125438f7255245f596c54cebb6621cb9a64f062752cf26763c1b690e7340 - 70a9125438f7255245f596c54cebb6621cb9a64f062752cf26763c1b690e7340 /fifo 0 10777 1 0 0 0 1633950376.0 - - - trusted.bar=bar-2 /inline 15 100777 1 0 0 0 1633950376.0 - FOOBAR\nINAFILE\n - user.foo=bar-2 From d1748326d9548167b313fc1959c161605e80a3eb Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 17 Mar 2026 22:25:58 +0000 Subject: [PATCH 2/3] Limit symlink targets to 1024 bytes (XFS_SYMLINK_MAXLEN) XFS limits symlink targets to 1024 bytes, and since generic Linux containers are commonly backed by XFS, enforce that limit in both the dumpfile parser and the EROFS reader rather than allowing up to PATH_MAX (4096). This also avoids exercising a known limitation in our EROFS reader where symlink data that spills into a non-inline data block (which can happen with long symlinks + xattrs) is not read back correctly. See https://github.com/composefs/composefs/pull/342 for the corresponding C fix for that edge case. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/dumpfile_parse.rs | 7 +++++-- crates/composefs/src/erofs/reader.rs | 8 ++++++++ crates/composefs/src/erofs/writer.rs | 6 ++++++ crates/composefs/src/lib.rs | 7 +++++++ crates/composefs/src/test.rs | 10 ++++------ 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/crates/composefs/src/dumpfile_parse.rs b/crates/composefs/src/dumpfile_parse.rs index 7e3e436b..dd6acdb9 100644 --- a/crates/composefs/src/dumpfile_parse.rs +++ b/crates/composefs/src/dumpfile_parse.rs @@ -24,6 +24,7 @@ use crate::MAX_INLINE_CONTENT; /// https://github.com/torvalds/linux/blob/47ac09b91befbb6a235ab620c32af719f8208399/include/uapi/linux/limits.h#L13 const PATH_MAX: u32 = 4096; +use crate::SYMLINK_MAX; /// https://github.com/torvalds/linux/blob/47ac09b91befbb6a235ab620c32af719f8208399/include/uapi/linux/limits.h#L15 /// This isn't exposed in libc/rustix, and in any case we should be conservative...if this ever /// gets bumped it'd be a hazard. @@ -456,8 +457,10 @@ impl<'p> Entry<'p> { let target = unescape_to_path(payload.ok_or_else(|| anyhow!("Missing payload"))?)?; let targetlen = target.as_os_str().as_bytes().len(); - if targetlen > PATH_MAX as usize { - anyhow::bail!("Target length too large {targetlen}"); + if targetlen > SYMLINK_MAX { + anyhow::bail!( + "Symlink target length {targetlen} exceeds limit {SYMLINK_MAX}" + ); } Item::Symlink { nlink, target } } diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index 5d6a049a..3acffbce 100644 --- a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -1350,6 +1350,14 @@ fn populate_directory( } S_IFLNK => { let target_data = child_inode.inline().unwrap_or(&[]); + if target_data.len() > crate::SYMLINK_MAX { + anyhow::bail!( + "symlink target for {:?} is {} bytes (max {})", + name, + target_data.len(), + crate::SYMLINK_MAX, + ); + } let target = OsStr::from_bytes(target_data); tree::LeafContent::Symlink(Box::from(target)) } diff --git a/crates/composefs/src/erofs/writer.rs b/crates/composefs/src/erofs/writer.rs index 78858162..33f5a09e 100644 --- a/crates/composefs/src/erofs/writer.rs +++ b/crates/composefs/src/erofs/writer.rs @@ -293,6 +293,12 @@ impl Leaf<'_, ObjectID> { (format::DataLayout::FlatPlain, 0, 0) } tree::LeafContent::Symlink(target) => { + assert!( + target.len() <= crate::SYMLINK_MAX, + "symlink target is {} bytes (max {})", + target.len(), + crate::SYMLINK_MAX, + ); (format::DataLayout::FlatInline, 0, target.len() as u64) } }; diff --git a/crates/composefs/src/lib.rs b/crates/composefs/src/lib.rs index 521a9a73..38d55e1f 100644 --- a/crates/composefs/src/lib.rs +++ b/crates/composefs/src/lib.rs @@ -45,6 +45,13 @@ pub const INLINE_CONTENT_MAX_V0: usize = 64; /// ). pub const MAX_INLINE_CONTENT: usize = 512; +/// Maximum symlink target length in bytes. +/// +/// XFS limits symlink targets to 1024 bytes (`XFS_SYMLINK_MAXLEN`). Since +/// generic Linux containers are commonly backed by XFS, we enforce that +/// limit rather than the Linux VFS `PATH_MAX` of 4096. +pub const SYMLINK_MAX: usize = 1024; + /// Internal constants shared across workspace crates. /// /// Not part of the public API — may change without notice. diff --git a/crates/composefs/src/test.rs b/crates/composefs/src/test.rs index 3ff1a433..7e165f76 100644 --- a/crates/composefs/src/test.rs +++ b/crates/composefs/src/test.rs @@ -131,8 +131,7 @@ pub(crate) mod proptest_strategies { /// EROFS limit (`EROFS_NAME_LEN`). const NAME_MAX: usize = 255; - /// Maximum symlink target length on Linux (`PATH_MAX`). - const PATH_MAX: usize = 4096; + use crate::SYMLINK_MAX; /// Strategy for valid filenames as OsString. /// @@ -230,8 +229,7 @@ pub(crate) mod proptest_strategies { /// Strategy for symlink targets as OsString. /// /// Symlink targets on Linux are arbitrary bytes except `\0`, up to - /// [`PATH_MAX`] (4096) bytes. We generate a mix of path-like ASCII - /// targets and binary targets, occasionally long. + /// [`SYMLINK_MAX`] (1024) bytes, matching the XFS limit. fn symlink_target() -> impl Strategy { prop_oneof![ // Short path-like ASCII target (common case) @@ -241,8 +239,8 @@ pub(crate) mod proptest_strategies { // Binary target with arbitrary bytes (no NUL) 3 => prop::collection::vec(1..=0xFFu8, 1..=100) .prop_map(OsString::from_vec), - // Long ASCII target (up to PATH_MAX) - 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9/._-]{{100,{PATH_MAX}}}")) + // Long ASCII target (up to SYMLINK_MAX) + 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9/._-]{{100,{SYMLINK_MAX}}}")) .expect("valid regex") .prop_map(OsString::from), ] From 59b7b3a5a5670f0711cc9823701265d8dfbc39f1 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 17 Mar 2026 18:42:09 +0000 Subject: [PATCH 3/3] dumpfile: Use named escapes and only escape '=' in xattr fields Increase alignment for dumpfile generation with the composefs C implementation - on general principle but also motivated by the goal of reimplementing it in Rust here. The C composefs implementation uses named escapes for backslash, newline, carriage return, and tab (\\ \n \r \t), while our writer was hex-escaping them uniformly (\x5c \x0a etc). Both forms parse correctly, but byte-identical output matters for cross-implementation comparison. Similarly, C only escapes '=' in xattr key/value fields (where it separates key from value). We were escaping it as \x3d in all fields including paths and content, where '=' is a normal graphic character. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/dumpfile.rs | 125 ++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 20 deletions(-) diff --git a/crates/composefs/src/dumpfile.rs b/crates/composefs/src/dumpfile.rs index ad6c645e..5aaa4927 100644 --- a/crates/composefs/src/dumpfile.rs +++ b/crates/composefs/src/dumpfile.rs @@ -50,32 +50,48 @@ fn write_escaped(writer: &mut impl fmt::Write, bytes: &[u8]) -> fmt::Result { return writer.write_str("\\x2d"); } - write_escaped_raw(writer, bytes) + write_escaped_raw(writer, bytes, EscapeEquals::No) +} + +/// Whether to escape `=` as `\x3d`. +/// +/// The C composefs implementation only escapes `=` in xattr key/value +/// fields where it separates the key from the value. In other fields +/// (paths, content, payload) `=` is a normal graphic character. +#[derive(Clone, Copy)] +enum EscapeEquals { + /// Escape `=` — used for xattr key/value fields. + Yes, + /// Do not escape `=` — used for paths, content, and payload fields. + No, } /// Escape a byte slice without the `-` sentinel logic. /// -/// This corresponds to `print_escaped` with `ESCAPE_EQUAL` (but without -/// `ESCAPE_LONE_DASH`) in the C composefs `composefs-info.c`. Used for -/// xattr values where `-` and empty are valid literal values, not -/// sentinels. +/// This corresponds to `print_escaped` in the C composefs +/// `composefs-info.c`. Used for xattr values where `-` and empty are +/// valid literal values, not sentinels. /// -/// Note: we unconditionally escape `=` in all fields, whereas the C code -/// only uses `ESCAPE_EQUAL` for xattr keys and values. This is harmless -/// since `\x3d` round-trips correctly, but means our output for paths -/// containing `=` is slightly more verbose than the C output. -fn write_escaped_raw(writer: &mut impl fmt::Write, bytes: &[u8]) -> fmt::Result { +/// The `escape_eq` parameter controls whether `=` is escaped (only +/// needed in xattr key/value fields where `=` is a separator). +fn write_escaped_raw( + writer: &mut impl fmt::Write, + bytes: &[u8], + escape_eq: EscapeEquals, +) -> fmt::Result { for c in bytes { let c = *c; - // The set of hex-escaped characters matches C `!isgraph(c)` in the - // POSIX locale (i.e. outside 0x21..=0x7E), plus `=` and `\`. - // The C code uses named escapes for `\\`, `\n`, `\r`, `\t` while - // we hex-escape everything uniformly; both forms parse correctly. - if c < b'!' || c == b'=' || c == b'\\' || c > b'~' { - write!(writer, "\\x{c:02x}")?; - } else { - writer.write_char(c as char)?; + // Named escapes matching the C composefs implementation. + match c { + b'\\' => writer.write_str("\\\\")?, + b'\n' => writer.write_str("\\n")?, + b'\r' => writer.write_str("\\r")?, + b'\t' => writer.write_str("\\t")?, + b'=' if matches!(escape_eq, EscapeEquals::Yes) => write!(writer, "\\x{c:02x}")?, + // Hex-escape non-graphic characters (outside 0x21..=0x7E in POSIX locale). + c if !(b'!'..=b'~').contains(&c) => write!(writer, "\\x{c:02x}")?, + c => writer.write_char(c as char)?, } } @@ -117,11 +133,11 @@ fn write_entry( for (key, value) in &*stat.xattrs.borrow() { write!(writer, " ")?; - write_escaped(writer, key.as_bytes())?; + write_escaped_raw(writer, key.as_bytes(), EscapeEquals::Yes)?; write!(writer, "=")?; // Xattr values don't use the `-` sentinel — they're always present // when the key=value pair exists, and empty or `-` are valid values. - write_escaped_raw(writer, value)?; + write_escaped_raw(writer, value, EscapeEquals::Yes)?; } Ok(()) @@ -765,6 +781,75 @@ mod tests { Ok(()) } + /// Helper to escape bytes through write_escaped and return the result. + fn escaped(bytes: &[u8]) -> String { + let mut out = String::new(); + write_escaped(&mut out, bytes).unwrap(); + out + } + + /// Helper to escape bytes through write_escaped_raw with the given mode. + fn escaped_raw(bytes: &[u8], eq: EscapeEquals) -> String { + let mut out = String::new(); + write_escaped_raw(&mut out, bytes, eq).unwrap(); + out + } + + #[test] + fn test_named_escapes() { + // These must use named escapes matching C composefs, not \xHH. + assert_eq!(escaped_raw(b"\\", EscapeEquals::No), "\\\\"); + assert_eq!(escaped_raw(b"\n", EscapeEquals::No), "\\n"); + assert_eq!(escaped_raw(b"\r", EscapeEquals::No), "\\r"); + assert_eq!(escaped_raw(b"\t", EscapeEquals::No), "\\t"); + + // Mixed: named escapes interspersed with literals + assert_eq!(escaped_raw(b"a\nb", EscapeEquals::No), "a\\nb"); + assert_eq!(escaped_raw(b"\t\n\\", EscapeEquals::No), "\\t\\n\\\\"); + } + + #[test] + fn test_non_graphic_hex_escapes() { + // Characters outside 0x21..=0x7E get \xHH + assert_eq!(escaped_raw(b"\x00", EscapeEquals::No), "\\x00"); + assert_eq!(escaped_raw(b"\x1f", EscapeEquals::No), "\\x1f"); + assert_eq!(escaped_raw(b" ", EscapeEquals::No), "\\x20"); // space = 0x20 < '!' + assert_eq!(escaped_raw(b"\x7f", EscapeEquals::No), "\\x7f"); + assert_eq!(escaped_raw(b"\xff", EscapeEquals::No), "\\xff"); + } + + #[test] + fn test_equals_escaping_context() { + // '=' is literal in normal fields (paths, content, payload) + assert_eq!(escaped_raw(b"a=b", EscapeEquals::No), "a=b"); + assert_eq!(escaped(b"key=val"), "key=val"); + + // '=' is escaped in xattr key/value fields + assert_eq!(escaped_raw(b"a=b", EscapeEquals::Yes), "a\\x3db"); + assert_eq!( + escaped_raw(b"overlay.redirect=/foo", EscapeEquals::Yes), + "overlay.redirect\\x3d/foo" + ); + } + + #[test] + fn test_escaped_sentinels() { + // Empty → "-" + assert_eq!(escaped(b""), "-"); + // Lone dash → "\x2d" + assert_eq!(escaped(b"-"), "\\x2d"); + // Dash in context is fine + assert_eq!(escaped(b"a-b"), "a-b"); + } + + #[test] + fn test_graphic_chars_literal() { + // All printable graphic ASCII (0x21..=0x7E) except '\' should be literal + assert_eq!(escaped_raw(b"!", EscapeEquals::No), "!"); + assert_eq!(escaped_raw(b"~", EscapeEquals::No), "~"); + assert_eq!(escaped_raw(b"abc/def.txt", EscapeEquals::No), "abc/def.txt"); + } + mod proptest_tests { use super::*; use crate::fsverity::Sha512HashValue;