diff --git a/CHANGELOG.md b/CHANGELOG.md index eebde94..f92cbcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v2.0.0] - 2025-08-10 + +### Changed + +- **BREAKING:** Map encoding is now deterministic by default + - `Msgpack.encode/2` sorts map keys according to Elixir's standard term + ordering before serialization + - This guarantees that identical maps produce identical binary output, but it + alters the output compared to previous versions of this library + +### Added + +- Added a `:deterministic` option to `Msgpack.encode/2` + - You can set this to `false` to disable key sorting for higher performance in + contexts where deterministic output is not required. + ## [v1.1.1] - 2025-08-09 ### Fixed diff --git a/README.md b/README.md index ff314ea..892d3e3 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,36 @@ iex> encoded_stream |> Stream.take(3) |> Enum.to_list() ] ``` +### Map Encoding + +By default, `Msgpack.encode/2` serializes Elixir maps in a **deterministic** +manner. + +It achieves this by sorting the map keys according to Elixir's standard term +ordering before encoding. This ensures that encoding the same map will always +produce the exact same binary output, which is critical for tasks like +generating signatures or comparing hashes. + +```elixir +iex> map1 = %{a: 1, b: 2} +iex> map2 = %{b: 2, a: 1} + +# Both produce the same output because their keys are sorted [:a, :b] +iex> Msgpack.encode!(map1) == Msgpack.encode!(map2) +true +``` + +#### Performance Opt-Out + +Sorting keys has a performance cost (O(N log N)). + +If you are working in a performance-critical context where byte-for-byte +determinism is not required, you can disable it: + +```elixir +Msgpack.encode(map, deterministic: false) +``` + ## Full Documentation For detailed information on all features, options, and functions, see the [full diff --git a/lib/msgpack.ex b/lib/msgpack.ex index 1898d3a..ddd0e8a 100644 --- a/lib/msgpack.ex +++ b/lib/msgpack.ex @@ -87,6 +87,12 @@ defmodule Msgpack do payload with non-UTF-8 strings, which may be incompatible with other MessagePack decoders. + * `:deterministic` - Controls whether map keys are sorted before encoding. + * `true` (default) - Enables key sorting, which ensures that encoding the + same map always produces the same binary. + * `false` - Disables key sorting, which can provide a performance gain in + cases where determinism is not required. + ## Examples ### Standard Encoding diff --git a/lib/msgpack/encoder.ex b/lib/msgpack/encoder.ex index 4d7a1a8..7830019 100644 --- a/lib/msgpack/encoder.ex +++ b/lib/msgpack/encoder.ex @@ -15,7 +15,8 @@ defmodule Msgpack.Encoder do def default_opts() do [ atoms: :string, - string_validation: true + string_validation: true, + deterministic: true ] end @@ -160,6 +161,15 @@ defmodule Msgpack.Encoder do # ==== Maps ==== defp do_encode(map, opts) when is_map(map) do + enumerable = + if Keyword.get(opts, :deterministic, true) == false do + map + else + map + |> Map.to_list() + |> Enum.sort_by(fn {key, _value} -> key end) + end + acc = {:ok, []} reducer = fn {key, value}, {:ok, acc_list} -> @@ -172,7 +182,7 @@ defmodule Msgpack.Encoder do end end - case Enum.reduce(map, acc, reducer) do + case Enum.reduce(enumerable, acc, reducer) do {:ok, encoded_pairs} -> size = map_size(map) {:ok, [encode_map_header(size), Enum.reverse(encoded_pairs)]} diff --git a/mix.exs b/mix.exs index f5699db..9386637 100644 --- a/mix.exs +++ b/mix.exs @@ -1,7 +1,7 @@ defmodule MsgpackElixir.MixProject do use Mix.Project - @version "1.1.1" + @version "2.0.0" @source_url "https://github.com/nrednav/msgpack_elixir" def project do diff --git a/test/msgpack_test.exs b/test/msgpack_test.exs index 59dbd70..b65cdf7 100644 --- a/test/msgpack_test.exs +++ b/test/msgpack_test.exs @@ -76,6 +76,55 @@ defmodule MsgpackTest do string_32 = String.duplicate("a", 32) assert_encode(string_32, <<0xD9, 32, string_32::binary>>) end + + test "produces identical output for maps with different key orders, by default" do + map1 = %{c: 3, b: 2, a: 1} + map2 = %{a: 1, c: 3, b: 2} + + expected_binary = <<0x83, 0xA1, "a", 1, 0xA1, "b", 2, 0xA1, "c", 3>> + + assert_encode(map1, expected_binary) + assert_encode(map2, expected_binary) + end + + test "correctly sorts maps with mixed key types, by default" do + map = %{"a" => 1, 100 => 2, :z => 3, nil => 4} + expected_binary = <<0x84, 100, 2, 0xC0, 4, 0xA1, "z", 3, 0xA1, "a", 1>> + + assert_encode(map, expected_binary) + end + + test "applies sorting to nested maps, by default" do + map1 = %{b: %{y: 2, x: 1}, a: 10} + map2 = %{a: 10, b: %{x: 1, y: 2}} + + {:ok, expected_binary} = Msgpack.encode(map2) + + assert_encode(map1, expected_binary) + end + + test "with `deterministic: false` opts out of sorted key encoding" do + # Per the Erlang docs: + # https://www.erlang.org/doc/system/maps.html#how-large-maps-are-implemented, + # maps with 32 or fewer elements are internally stored with sorted keys. + # To reliably test the non-deterministic path, a large map (33+ elements) + # must be used, which uses a HAMT implementation and does not iterate in + # key-sorted order. + large_map = + Enum.into(1..33, %{}, fn i -> + key = String.to_atom(<<123 - i>> <> "_#{i}") + {key, i} + end) + + assert map_size(large_map) == 33 + + {:ok, sorted_binary} = Msgpack.encode(large_map) + {:ok, unsorted_binary} = Msgpack.encode(large_map, deterministic: false) + + refute unsorted_binary == sorted_binary, + "Expected binaries to be different, but both were identical. The + non-deterministic path may be producing sorted output." + end end describe "decode/2" do @@ -357,8 +406,8 @@ defmodule MsgpackTest do # ==== Helpers ==== - defp assert_encode(input, expected_binary) do - assert Msgpack.encode(input) == {:ok, expected_binary} + defp assert_encode(input, expected_binary, opts \\ []) do + assert Msgpack.encode(input, opts) == {:ok, expected_binary} end defp assert_encode_error(input, expected_reason, opts \\ []) do