From b9e8e05a936191063edfbda6b23669efda5dc827 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 6 Jul 2025 16:54:19 +0200 Subject: [PATCH 01/17] Upgrade statistex Calculates more percentiles now, that is an acceptable upgrade side effect. We want to use the new outlier functionality, so depending on the new version is required. --- mix.exs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index b8a61227..06f02519 100644 --- a/mix.exs +++ b/mix.exs @@ -50,7 +50,7 @@ defmodule Benchee.Mixfile do defp deps do deps = [ {:deep_merge, "~> 1.0"}, - {:statistex, "~> 1.0"}, + {:statistex, "~> 1.1"}, {:ex_guard, "~> 1.3", only: :dev}, {:credo, "~> 1.7.7-rc.0", only: :dev, runtime: false}, {:ex_doc, ">= 0.0.0", only: :dev, runtime: false}, From 6ba20816cd8288216c50df789cee42909db57152 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 6 Jul 2025 17:09:54 +0200 Subject: [PATCH 02/17] Actually map over new statistics and explain why we do the mapping --- lib/benchee/statistics.ex | 31 +++++++++++++++++++++++++++---- test/fixtures/escript/mix.lock | 2 +- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex index 9fcabe88..436ff753 100644 --- a/lib/benchee/statistics.ex +++ b/lib/benchee/statistics.ex @@ -25,6 +25,9 @@ defmodule Benchee.Statistics do :relative_more, :relative_less, :absolute_difference, + :outliers, + :lower_outlier_bound, + :upper_outlier_bound, sample_size: 0 ] @@ -85,6 +88,9 @@ defmodule Benchee.Statistics do relative_more: float | nil | :infinity, relative_less: float | nil | :infinity, absolute_difference: float | nil, + outliers: [number], + lower_outlier_bound: number, + upper_outlier_bound: number, sample_size: integer } @@ -115,7 +121,7 @@ defmodule Benchee.Statistics do ...> input: "Input" ...> } ...> ] - ...> + ...> ...> suite = %Benchee.Suite{scenarios: scenarios} ...> statistics(suite, Benchee.Test.FakeProgressPrinter) %Benchee.Suite{ @@ 
-137,7 +143,10 @@ defmodule Benchee.Statistics do mode: [500, 400], minimum: 200, maximum: 900, - sample_size: 9 + sample_size: 9, + outliers: [], + lower_outlier_bound: 100.0, + upper_outlier_bound: 900.0 } }, memory_usage_data: %Benchee.CollectionData{ @@ -153,7 +162,10 @@ defmodule Benchee.Statistics do mode: [500, 400], minimum: 200, maximum: 900, - sample_size: 9 + sample_size: 9, + outliers: [], + lower_outlier_bound: 100.0, + upper_outlier_bound: 900.0 } } } @@ -247,6 +259,14 @@ defmodule Benchee.Statistics do |> convert_from_statistex end + # It might seem silly to maintain and map statistex to our own struct, + # but this gives benchee more control and makes it safer to upgrade and change. + # Also, we don't expose changes in statistex versions automatically to plugins. + # + # As an example right now it's being discussed in statistex to add an `m2` statistic that holds + # no value for benchee (as it's only used to calculate variance). + # + # We also manually add `ips` related stats (see `add_ips/1`) so differences are sufficient. 
defp convert_from_statistex(statistex_statistics) do %__MODULE__{ average: statistex_statistics.average, @@ -257,7 +277,10 @@ defmodule Benchee.Statistics do mode: statistex_statistics.mode, minimum: statistex_statistics.minimum, maximum: statistex_statistics.maximum, - sample_size: statistex_statistics.sample_size + sample_size: statistex_statistics.sample_size, + outliers: statistex_statistics.outliers, + lower_outlier_bound: statistex_statistics.lower_outlier_bound, + upper_outlier_bound: statistex_statistics.upper_outlier_bound } end diff --git a/test/fixtures/escript/mix.lock b/test/fixtures/escript/mix.lock index 78753591..d6d37ea2 100644 --- a/test/fixtures/escript/mix.lock +++ b/test/fixtures/escript/mix.lock @@ -1,4 +1,4 @@ %{ "deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm", "ce708e5f094b9cd4e8f2be4f00d2f4250c4095be93f8cd6d018c753894885430"}, - "statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"}, + "statistex": {:hex, :statistex, "1.1.0", "7fec1eb2f580a0d2c1a05ed27396a084ab064a40cfc84246dbfb0c72a5c761e5", [:mix], [], "hexpm", "f5950ea26ad43246ba2cce54324ac394a4e7408fdcf98b8e230f503a0cba9cf5"}, } From 87b16978abefeead71bfec7aa4832cbb69fa6c8d Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 6 Jul 2025 19:55:40 +0200 Subject: [PATCH 03/17] Basic implementation of excluding outliers Seems to all work fine. But do add a test - although I figure that one will be hard to do "properly". 
--- README.md | 18 ++++++++++++++++++ lib/benchee/configuration.ex | 11 +++++++++-- lib/benchee/statistics.ex | 30 +++++++++++++++++++----------- samples/outlier_removal.exs | 21 +++++++++++++++++++++ 4 files changed, 67 insertions(+), 13 deletions(-) create mode 100644 samples/outlier_removal.exs diff --git a/README.md b/README.md index 9117ae64..09f901db 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,8 @@ In addition, you can optionally output an extended set of statistics: * **sample size** - the number of measurements taken * **mode** - the measured values that occur the most. Often one value, but can be multiple values if they occur exactly as often. If no value occurs at least twice, this value will be `nil`. +Benchee can also [remove outliers](#remove-outliers). + ## Installation Add `:benchee` to your list of dependencies in `mix.exs`: @@ -303,6 +305,7 @@ So, what happens if a function executes too fast for Benchee to measure? If Benc * essentially every single measurement is now an average across 10 runs making lots of statistics less meaningful Benchee will print a big warning when this happens. + #### Measuring Memory Consumption Starting with version 0.13, users can now get measurements of how much memory their benchmarked scenarios use. The measurement is **limited to the process that Benchee executes your provided code in** - i.e. other processes (like worker pools)/the whole BEAM isn't taken into account. @@ -542,6 +545,21 @@ Enum."-map/2-lists^map/1-0-"/2 10001 26.38 2282 0.23 **Note about after_each hooks:** `after_each` hooks currently don't work when profiling a function, as they are not passed the return value of the function after the profiling run. It's already fixed on the elixir side and is waiting for release, likely in 1.14. It should then just work. +### Remove Outliers + +Benchee can remove outliers from the gathered samples. 
+That is, as determined by percentiles/quantiles (we follow [this approach](https://en.wikipedia.org/wiki/Interquartile_range#Outliers)). + +You can simply pass `exclude_outliers: true` to Benchee to trigger the removal of outliers. + +```elixir +Benchee.run(jobs, exclude_outliers: true) +``` + +The outliers themselves (aka the samples that have been determined to be outliers) +as well as the lower/upper bound after which samples are considered outliers are accessible +in the `Benchee.Statistics` struct. + ### Saving, loading and comparing previous runs Benchee can store the results of previous runs in a file and then load them again to compare them. For example this is useful to compare what was recorded on the main branch against a branch with performance improvements. You may also use this to benchmark across different exlixir/erlang versions. diff --git a/lib/benchee/configuration.ex b/lib/benchee/configuration.ex index 5c5f2cd1..a5118d55 100644 --- a/lib/benchee/configuration.ex +++ b/lib/benchee/configuration.ex @@ -48,7 +48,8 @@ defmodule Benchee.Configuration do # It also generates less than 1GB in data (some of which is garbage collected/ # not necessarily all in RAM at the same time) - which seems reasonable enough. # see `samples/statistics_performance.exs` and also maybe run it yourself. - max_sample_size: 1_000_000 + max_sample_size: 1_000_000, + exclude_outliers: false @typedoc """ The configuration supplied by the user as either a map or a keyword list @@ -152,6 +153,11 @@ defmodule Benchee.Configuration do This is used to limit memory consumption and unnecessary processing - 1 Million samples is plenty. This limit also applies to number of iterations done during warmup. You can set your own number or set it to `nil` if you don't want any limit. + * `exclude_outliers` - whether or not statistical outliers should be removed for the calculated statistics. + Defaults to `false`. 
+ This means that values that are far outside the usual range (as determined by the percentiles/quantiles) will + be removed from the gathered samples and the calculated statistics. You might want to enable this if you + don't want things like the garbage collection triggering to influence your results as much. """ @type user_configuration :: map | keyword @@ -183,7 +189,8 @@ defmodule Benchee.Configuration do measure_function_call_overhead: boolean, title: String.t() | nil, profile_after: boolean | atom | {atom, keyword}, - max_sample_size: pos_integer() + max_sample_size: pos_integer(), + exclude_outliers: boolean() } @time_keys [:time, :warmup, :memory_time, :reduction_time] diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex index 436ff753..8c0d38df 100644 --- a/lib/benchee/statistics.ex +++ b/lib/benchee/statistics.ex @@ -121,7 +121,7 @@ defmodule Benchee.Statistics do ...> input: "Input" ...> } ...> ] - ...> + ...> ...> suite = %Benchee.Suite{scenarios: scenarios} ...> statistics(suite, Benchee.Test.FakeProgressPrinter) %Benchee.Suite{ @@ -179,15 +179,17 @@ defmodule Benchee.Statistics do printer.calculating_statistics(suite.configuration) percentiles = suite.configuration.percentiles + exclude_outliers? = suite.configuration.exclude_outliers update_in(suite.scenarios, fn scenarios -> - scenario_statistics = compute_statistics_in_parallel(scenarios, percentiles) + scenario_statistics = + compute_statistics_in_parallel(scenarios, percentiles, exclude_outliers?) update_scenarios_with_statistics(scenarios, scenario_statistics) end) end - defp compute_statistics_in_parallel(scenarios, percentiles) do + defp compute_statistics_in_parallel(scenarios, percentiles, exclude_outliers?) 
do scenarios |> Enum.map(fn scenario -> # we filter down the data here to avoid sending the input and benchmarking function to @@ -200,7 +202,7 @@ defmodule Benchee.Statistics do # async_stream as we might run a ton of scenarios depending on the benchmark |> Task.async_stream( fn scenario_collection_data -> - calculate_scenario_statistics(scenario_collection_data, percentiles) + calculate_scenario_statistics(scenario_collection_data, percentiles, exclude_outliers?) end, timeout: :infinity, ordered: true @@ -235,27 +237,33 @@ defmodule Benchee.Statistics do end) end - defp calculate_scenario_statistics({run_time_data, memory_data, reductions_data}, percentiles) do + defp calculate_scenario_statistics( + {run_time_data, memory_data, reductions_data}, + percentiles, + exclude_outliers? + ) do run_time_stats = run_time_data.samples - |> calculate_statistics(percentiles) + |> calculate_statistics(percentiles, exclude_outliers?) |> add_ips - memory_stats = calculate_statistics(memory_data.samples, percentiles) - reductions_stats = calculate_statistics(reductions_data.samples, percentiles) + memory_stats = calculate_statistics(memory_data.samples, percentiles, exclude_outliers?) + + reductions_stats = + calculate_statistics(reductions_data.samples, percentiles, exclude_outliers?) {run_time_stats, memory_stats, reductions_stats} end - defp calculate_statistics([], _) do + defp calculate_statistics([], _, _) do %__MODULE__{ sample_size: 0 } end - defp calculate_statistics(samples, percentiles) do + defp calculate_statistics(samples, percentiles, exclude_outliers?) do samples - |> Statistex.statistics(percentiles: percentiles) + |> Statistex.statistics(percentiles: percentiles, exclude_outliers: exclude_outliers?) 
|> convert_from_statistex end diff --git a/samples/outlier_removal.exs b/samples/outlier_removal.exs new file mode 100644 index 00000000..103bae7e --- /dev/null +++ b/samples/outlier_removal.exs @@ -0,0 +1,21 @@ +list = Enum.to_list(1..10_000) +map_fun = fn i -> [i, i * i] end + +suite = + Benchee.run( + %{ + "flat_map" => fn -> Enum.flat_map(list, map_fun) end, + "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end + }, + formatters: [{Benchee.Formatters.Console, extended_statistics: true}], + exclude_outliers: true + ) + +suite.scenarios +|> Enum.map(fn scenario -> + statistics = scenario.run_time_data.statistics + + {scenario.name, length(statistics.outliers), statistics.outliers, + statistics.lower_outlier_bound, statistics.upper_outlier_bound} +end) +|> IO.inspect() From 7958fbc2100a894138e3014754841285bd71697d Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 12:52:10 +0200 Subject: [PATCH 04/17] Upgrade credo to work nicely with elixir 1.19 --- mix.exs | 2 +- mix.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mix.exs b/mix.exs index 06f02519..5beead03 100644 --- a/mix.exs +++ b/mix.exs @@ -52,7 +52,7 @@ defmodule Benchee.Mixfile do {:deep_merge, "~> 1.0"}, {:statistex, "~> 1.1"}, {:ex_guard, "~> 1.3", only: :dev}, - {:credo, "~> 1.7.7-rc.0", only: :dev, runtime: false}, + {:credo, "~> 1.7.13", only: :dev, runtime: false}, {:ex_doc, ">= 0.0.0", only: :dev, runtime: false}, {:excoveralls, "~> 0.13", only: :test}, {:dialyxir, "~> 1.0", only: :dev, runtime: false}, diff --git a/mix.lock b/mix.lock index 70a3bf88..40998776 100644 --- a/mix.lock +++ b/mix.lock @@ -1,6 +1,6 @@ %{ "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, - "credo": {:hex, :credo, "1.7.12", "9e3c20463de4b5f3f23721527fcaf16722ec815e70ff6c60b86412c695d426c1", [:mix], [{:bunt, "~> 0.2.1 or ~> 
1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "8493d45c656c5427d9c729235b99d498bd133421f3e0a683e5c1b561471291e5"}, + "credo": {:hex, :credo, "1.7.13", "126a0697df6b7b71cd18c81bc92335297839a806b6f62b61d417500d1070ff4e", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "47641e6d2bbff1e241e87695b29f617f1a8f912adea34296fb10ecc3d7e9e84f"}, "deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm", "ce708e5f094b9cd4e8f2be4f00d2f4250c4095be93f8cd6d018c753894885430"}, "dialyxir": {:hex, :dialyxir, "1.4.6", "7cca478334bf8307e968664343cbdb432ee95b4b68a9cba95bdabb0ad5bdfd9a", [:mix], [{:erlex, ">= 0.2.7", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "8cf5615c5cd4c2da6c501faae642839c8405b49f8aa057ad4ae401cb808ef64d"}, "doctest_formatter": {:hex, :doctest_formatter, "0.4.1", "c69bf93853d1ec5785cbd22dcf0c2bd4dd357cc53f2e89d05850eed7e985462a", [:mix], [], "hexpm", "c1b07495a524126de133be4e077b28c4a2d8e1a14c9eeca962482e2067b5b068"}, From 234f6261fbac8e58ab1df2c8b9b44a2f9944d2f3 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 13:42:36 +0200 Subject: [PATCH 05/17] layout changes and typo fixes in statistics_test.exs --- test/benchee/statistics_test.exs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/benchee/statistics_test.exs b/test/benchee/statistics_test.exs index 2eb105f2..8ce7b04a 100644 --- a/test/benchee/statistics_test.exs +++ b/test/benchee/statistics_test.exs @@ -12,6 +12,7 @@ defmodule Benchee.StatistcsTest do @sample_4 [100, 100, 100, 100] @sample_5 [5, 10, 15] 
@sample_6 [10, 20] + describe ".statistics" do test "computes the statistics for all jobs correctly" do scenarios = [ @@ -176,7 +177,7 @@ defmodule Benchee.StatistcsTest do end @nothing [] - test "doesn't blow up whenthere are no measurements" do + test "doesn't blow up when there are no measurements" do scenarios = [ %Scenario{ run_time_data: %CollectionData{samples: @nothing}, @@ -204,7 +205,7 @@ defmodule Benchee.StatistcsTest do assert reductions_stats.average == nil end - test "lets you know it's benchmarking" do + test "lets you know it's calculating statistics" do Statistics.statistics(%Suite{}, FakeProgressPrinter) assert_received :calculating_statistics From 8ae3f4ca86432ca18180e461b83221bfda3d315c Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:00:35 +0200 Subject: [PATCH 06/17] Unit test for outliers in statistics --- test/benchee/statistics_test.exs | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/test/benchee/statistics_test.exs b/test/benchee/statistics_test.exs index 8ce7b04a..b29c5728 100644 --- a/test/benchee/statistics_test.exs +++ b/test/benchee/statistics_test.exs @@ -211,6 +211,62 @@ defmodule Benchee.StatistcsTest do assert_received :calculating_statistics end + test "determinism and outlier removal" do + scenarios = [ + %Scenario{ + input: "Input", + input_name: "Input", + job_name: "Job 1", + run_time_data: %CollectionData{samples: @sample_1} + } + ] + + suite = %Suite{scenarios: scenarios} + new_suite = Statistics.statistics(suite, FakeProgressPrinter) + new_suite2 = Statistics.statistics(suite, FakeProgressPrinter) + + # deterministic + assert new_suite == new_suite2 + + stats_1 = run_time_stats_for(new_suite, "Job 1", "Input") + # Statistex tests these values itself + assert stats_1.outliers == [] + assert stats_1.lower_outlier_bound <= 100 + assert stats_1.upper_outlier_bound >= 900 + + sample_1_asserts(stats_1) + + outlier = 2000 + + outlier_scenarios = [ + %Scenario{ + input: 
"Input", + input_name: "Input", + job_name: "Job 1", + run_time_data: %CollectionData{samples: [outlier | @sample_1]} + } + ] + + outlier_suite = %Suite{ + scenarios: outlier_scenarios, + configuration: %Benchee.Configuration{exclude_outliers: true} + } + + outlier_suite = Statistics.statistics(outlier_suite, FakeProgressPrinter) + + # we're not the same + refute outlier_suite == new_suite + + stats_1_outlier = run_time_stats_for(outlier_suite, "Job 1", "Input") + assert stats_1_outlier.outliers == [outlier] + + # However thanks to the outlier removal, our stats (minus outliers) are, + # down to the sample_size even + sample_1_asserts(stats_1_outlier) + assert stats_1.lower_outlier_bound <= 100 + assert stats_1.upper_outlier_bound <= 2000 + end + defp run_time_stats_for(suite, job_name, input_name) do stats_for(suite, job_name, input_name, :run_time_data) end From 630e9886f0377adf9e766d28fdc6c88687d3eeaf Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:09:43 +0200 Subject: [PATCH 07/17] Reword statistic explanations Away from calling them run times (we got multiple now) and others. --- lib/benchee/statistics.ex | 45 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex index 8c0d38df..c1a4a13b 100644 --- a/lib/benchee/statistics.ex +++ b/lib/benchee/statistics.ex @@ -39,40 +39,43 @@ defmodule Benchee.Statistics do @typedoc """ All the statistics `statistics/1` computes from the samples. + This used for run times, memory and reductions. Generally with these, + the lower the better (less run time, memory consumption or reductions). + + These values mostly correspond to their cousins in `Statistex`. 
+ Overview of all the statistics Benchee currently provides: - * average - average run time of the job in μs (the lower the better) + * average - average of all the samples (the lower the better) * ips - iterations per second, how often can the given function be - executed within one second (the higher the better) - * std_dev - standard deviation, a measurement how much results vary - (the higher the more the results vary) + executed within one second, used only for run times (the higher the better) + * std_dev - standard deviation, how much results vary among the samples + (the higher the more the results vary) * std_dev_ratio - standard deviation expressed as how much it is relative to - the average + the average * std_dev_ips - the absolute standard deviation of iterations per second - (= ips * std_dev_ratio) * median - when all measured times are sorted, this is the middle - value (or average of the two middle values when the number of times is - even). More stable than the average and somewhat more likely to be a - typical value you see. + value (or average of the two middle values when the number of times is + even). More stable than the average and somewhat more likely to be a + typical value you see. * percentiles - a map of percentile ranks. These are the values below - which x% of the run times lie. For example, 99% of run times are shorter - than the 99th percentile (99th %) rank. - is a value for which 99% of the run times are shorter. - * mode - the run time(s) that occur the most. Often one value, but + which x% of the samples lie. For example, 99% of samples are less than + is a value for which 99% of the run times are less than it. + * mode - the samples that occur the most. Often one value, but can be multiple values if they occur the same amount of times. If no value - occurs at least twice, this value will be nil. + occurs at least twice, this value will be `nil`. 
* minimum - the smallest sample measured for the scenario * maximum - the biggest sample measured for the scenario * relative_more - relative to the reference (usually the fastest scenario) how much more - was the average of this scenario. E.g. for reference at 100, this scenario 200 then it - is 2.0. + was the average of this scenario. E.g. for reference at 100, this scenario 200 then it + is 2.0. * relative_less - relative to the reference (usually the fastest scenario) how much less - was the average of this scenario. E.g. for reference at 100, this scenario 200 then it - is 0.5. + was the average of this scenario. E.g. for reference at 100, this scenario 200 then it + is 0.5. * absolute_difference - relative to the reference (usually the fastest scenario) what is - the difference of the averages of the scenarios. e.g. for reference at 100, this - scenario 200 then it is 100. - * sample_size - the number of run time measurements taken + the difference of the averages of the scenarios. e.g. for reference at 100, this + scenario 200 then it is 100. 
+ * sample_size - the number of measurements/samples taken into account for calculating statistics """ @type t :: %__MODULE__{ average: float, From df9d0af58632bbe255ff066a1eac3cda817c9525 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:12:35 +0200 Subject: [PATCH 08/17] Shed unnecessary white space in list, gets swallowed during doc rendering anyhow --- lib/benchee/statistics.ex | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex index c1a4a13b..0590e188 100644 --- a/lib/benchee/statistics.ex +++ b/lib/benchee/statistics.ex @@ -46,26 +46,26 @@ defmodule Benchee.Statistics do Overview of all the statistics Benchee currently provides: - * average - average of all the samples (the lower the better) - * ips - iterations per second, how often can the given function be + * average - average of all the samples (the lower the better) + * ips - iterations per second, how often can the given function be executed within one second, used only for run times (the higher the better) - * std_dev - standard deviation, how much results vary among the samples + * std_dev - standard deviation, how much results vary among the samples (the higher the more the results vary) * std_dev_ratio - standard deviation expressed as how much it is relative to the average - * std_dev_ips - the absolute standard deviation of iterations per second - * median - when all measured times are sorted, this is the middle + * std_dev_ips - the absolute standard deviation of iterations per second + * median - when all measured times are sorted, this is the middle value (or average of the two middle values when the number of times is even). More stable than the average and somewhat more likely to be a typical value you see. - * percentiles - a map of percentile ranks. These are the values below + * percentiles - a map of percentile ranks. These are the values below which x% of the samples lie. 
For example, 99% of samples are less than is a value for which 99% of the run times are less than it. - * mode - the samples that occur the most. Often one value, but + * mode - the samples that occur the most. Often one value, but can be multiple values if they occur the same amount of times. If no value occurs at least twice, this value will be `nil`. - * minimum - the smallest sample measured for the scenario - * maximum - the biggest sample measured for the scenario + * minimum - the smallest sample measured for the scenario + * maximum - the biggest sample measured for the scenario * relative_more - relative to the reference (usually the fastest scenario) how much more was the average of this scenario. E.g. for reference at 100, this scenario 200 then it is 2.0. @@ -75,7 +75,7 @@ defmodule Benchee.Statistics do * absolute_difference - relative to the reference (usually the fastest scenario) what is the difference of the averages of the scenarios. e.g. for reference at 100, this scenario 200 then it is 100. 
- * sample_size - the number of measurements/samples taken into account for calculating statistics + * sample_size - the number of measurements/samples taken into account for calculating statistics """ @type t :: %__MODULE__{ average: float, From 2b123ff472530c8e48d60945bd7522a704884c49 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:17:22 +0200 Subject: [PATCH 09/17] Doc the new outlier related values in Benchee.Statistics --- CHANGELOG.md | 6 ++++++ lib/benchee/statistics.ex | 3 +++ 2 files changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2fe3d37..c8f37153 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * fixed a bug where if times were supplied as `0` instead of `0.0` we'd sometimes gather a single measurement * elixir `1.19` compilation warnings have been removed +### Features (Plugins) +* The `%Benchee.Statistics{}` struct now comes with values to accompany the outlier exclusion feature: + * outliers - if outlier exclusion was enabled, may include any samples of outliers that were found, empty list otherwise + * lower_outlier_bound - value below which values are considered an outlier + * upper_outlier_bound - value above which values are considered an outlier + ## 1.4.0 (2025-04-14) Some nice features (`pre_check: :all_same` is cool) along with adding support for some new stuff (`tprof`) and fixing some bugs. diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex index 0590e188..7c6e8f70 100644 --- a/lib/benchee/statistics.ex +++ b/lib/benchee/statistics.ex @@ -76,6 +76,9 @@ defmodule Benchee.Statistics do the difference of the averages of the scenarios. e.g. for reference at 100, this scenario 200 then it is 100. 
* sample_size - the number of measurements/samples taken into account for calculating statistics + * outliers - if outlier exclusion was enabled, may include any samples of outliers that were found, empty list otherwise + * lower_outlier_bound - value below which values are considered an outlier + * upper_outlier_bound - value above which values are considered an outlier """ @type t :: %__MODULE__{ average: float, From 8353dad96328d04fa4bfe42af2f85bd668006b12 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:22:51 +0200 Subject: [PATCH 10/17] Implement exclude outliers feature as a whole in the changelog --- CHANGELOG.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8f37153..ac8f8c38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ## Features (User Facing) -* Introduce `max_sample_size` which guides how many samples will be gathered at most for a given scenario. This avoids a variety of issues when scenarios gather too many samples (memory consumption etc). Defaults to `1_000_000`, setting it to `nil` gathers unlimited samples again (behavior before this version). +* Introduce `max_sample_size` which guides how many samples will be gathered at most for a given scenario. +This avoids a variety of issues when scenarios gather too many samples (memory consumption, statistics taking long to calculate, formatters hanging/not working). +Defaults to `1_000_000`, setting it to `nil` gathers unlimited samples again (behavior before this version). +* Introduce `exclude_outliers` option which when set to `true` will automatically exclude outliers from the samples gathered. +Especially important for run time, you can remove samples caused by garbage collection or external factors. +Defaults to `false`. 
+Shout out to [@NickNeck](https://github.com/NickNeck) who implemented this long wished for feature over in `Statistex`. ### Bugfixes (User Facing) * fixed a bug where if times were supplied as `0` instead of `0.0` we'd sometimes gather a single measurement From cb13cd793dfee73099d150d23df047b64b19bf0b Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:25:34 +0200 Subject: [PATCH 11/17] Fix credo which magically started failing :thinking-face: --- lib/benchee/benchmark/runner.ex | 4 +++- lib/benchee/formatters/console.ex | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/benchee/benchmark/runner.ex b/lib/benchee/benchmark/runner.ex index 40302dec..d90471a8 100644 --- a/lib/benchee/benchmark/runner.ex +++ b/lib/benchee/benchmark/runner.ex @@ -6,8 +6,10 @@ defmodule Benchee.Benchmark.Runner do # This module actually runs our benchmark scenarios, adding information about # run time and memory usage to each scenario. + alias Benchee.Benchmark alias Benchee.Benchmark.BenchmarkConfig - alias Benchee.{Benchmark, Scenario, Utility.Parallel} + alias Benchee.Scenario + alias Benchee.Utility.Parallel alias Benchmark.{ Collect, diff --git a/lib/benchee/formatters/console.ex b/lib/benchee/formatters/console.ex index d906c5a1..37660aef 100644 --- a/lib/benchee/formatters/console.ex +++ b/lib/benchee/formatters/console.ex @@ -33,8 +33,8 @@ defmodule Benchee.Formatters.Console do @behaviour Benchee.Formatter - alias Benchee.Suite alias Benchee.Formatters.Console.{Memory, Reductions, RunTime} + alias Benchee.Suite @doc """ Formats the benchmark statistics to a report suitable for output on the CLI. 
From 79b1e6b40f37c5dfba2184d77d309c8b58e7532b Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 14:33:51 +0200 Subject: [PATCH 12/17] adjust test script to have all the output and more readable --- samples/outlier_removal.exs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/outlier_removal.exs b/samples/outlier_removal.exs index 103bae7e..f00bbfb1 100644 --- a/samples/outlier_removal.exs +++ b/samples/outlier_removal.exs @@ -8,14 +8,16 @@ suite = "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end }, formatters: [{Benchee.Formatters.Console, extended_statistics: true}], - exclude_outliers: true + exclude_outliers: true, + warmup: 0, + time: 1 ) suite.scenarios |> Enum.map(fn scenario -> statistics = scenario.run_time_data.statistics - {scenario.name, length(statistics.outliers), statistics.outliers, - statistics.lower_outlier_bound, statistics.upper_outlier_bound} + {scenario.name, length(statistics.outliers), statistics.lower_outlier_bound, + statistics.upper_outlier_bound, statistics.outliers} end) -|> IO.inspect() +|> IO.inspect(printable_limit: :infinity, limit: :infinity) From fcf93e6a8cf89cac302d511dbebefb8b7bcb5110 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 15:26:37 +0200 Subject: [PATCH 13/17] Benchee integration test for outlier exclusion --- test/benchee_test.exs | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/test/benchee_test.exs b/test/benchee_test.exs index 88366db2..275b9df8 100644 --- a/test/benchee_test.exs +++ b/test/benchee_test.exs @@ -1106,6 +1106,70 @@ defmodule BencheeTest do end end + describe "exclude_outliers" do + test "even with it the high level README example still passes its asserts" do + output = + capture_io(fn -> + list = Enum.to_list(1..10_000) + map_fun = fn i -> [i, i * i] end + + Benchee.run( + %{ + "flat_map" => fn -> Enum.flat_map(list, map_fun) end, + "map.flatten" => fn -> list 
|> Enum.map(map_fun) |> List.flatten() end + }, + Keyword.merge(@test_configuration, exclude_outliers: true) + ) + end) + + readme_sample_asserts(output) + end + + # The easiest way to create an outlier is to just run something once + # and then take a "stable" measurement like reductions or memory + test "removes outliers" do + {:ok, agent} = Agent.start(fn -> 0 end) + + output = + capture_io(fn -> + suite = + Benchee.run( + %{ + "flawed" => fn -> + # Produce some garbage but only once + # can't use process dictionary as it's a different process every time + if Agent.get(agent, & &1) < 1 do + Enum.map(1..100, fn i -> "garbage #{i}" end) + Agent.update(agent, &(&1 + 1)) + end + end + }, + time: 0, + warmup: 0, + reduction_time: 0.005, + exclude_outliers: true + ) + + %{scenarios: [%{reductions_data: %{samples: samples, statistics: stats}}]} = suite + + assert [outlier] = stats.outliers + assert outlier >= stats.upper_outlier_bound + # since the outlier is removed, all values are the same + assert stats.std_dev == 0 + assert stats.minimum == stats.maximum + + # It's a big outlier! 
+ assert 10 * stats.average < outlier + + # The outlier is only removed from the stats, but not from the samples + assert outlier in samples + end) + + # As the outlier is removed, all measurements are the same + assert output =~ ~r/all.*reduction.*same/i + end + end + describe "warn when functions are evaluated" do test "warns when run in iex" do # test env to avoid repeated compilation on CI From 271c96b7ce63662c2754cfa30125c0fa8b4fa835 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 15:37:33 +0200 Subject: [PATCH 14/17] Print whether or not outliers are being excluded --- lib/benchee/output/benchmark_printer.ex | 4 +++- .../benchee/output/benchmark_printer_test.exs | 23 ++++++++++++++++++- test/benchee_test.exs | 1 + 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/lib/benchee/output/benchmark_printer.ex b/lib/benchee/output/benchmark_printer.ex index 7db2ee58..2b24cac6 100644 --- a/lib/benchee/output/benchmark_printer.ex +++ b/lib/benchee/output/benchmark_printer.ex @@ -69,7 +69,8 @@ defmodule Benchee.Output.BenchmarkPrinter do warmup: warmup, inputs: inputs, memory_time: memory_time, - reduction_time: reduction_time + reduction_time: reduction_time, + exclude_outliers: exclude_outliers }) do scenario_count = length(scenarios) exec_time = warmup + time + memory_time + reduction_time @@ -84,6 +85,7 @@ defmodule Benchee.Output.BenchmarkPrinter do parallel: #{parallel} inputs: #{inputs_out(inputs)} Estimated total run time: #{Duration.format_human(total_time)} + Excluding outliers: #{exclude_outliers} """) end diff --git a/test/benchee/output/benchmark_printer_test.exs b/test/benchee/output/benchmark_printer_test.exs index 67172b14..3a90cf11 100644 --- a/test/benchee/output/benchmark_printer_test.exs +++ b/test/benchee/output/benchmark_printer_test.exs @@ -55,6 +55,26 @@ defmodule Benchee.Output.BenchmarkPrintertest do assert output =~ "memory time: 0 ns" assert output =~ "parallel: 2" assert output =~ "Estimated total run time: 
20 μs" + assert output =~ "Excluding outliers: false" + end + + test "tells you when outliers are being excluded" do + output = + capture_io(fn -> + %{ + configuration: %Configuration{ + time: 10_000, + warmup: 0, + inputs: nil, + exclude_outliers: true + }, + scenarios: [%Scenario{job_name: "one"}], + system: @system_info + } + |> configuration_information + end) + + assert output =~ "Excluding outliers: true" end test "it scales times appropriately" do @@ -86,7 +106,7 @@ defmodule Benchee.Output.BenchmarkPrintertest do output = capture_io(fn -> %{ - configuration: %{ + configuration: %Configuration{ parallel: 2, time: 10_000, warmup: 0, @@ -111,6 +131,7 @@ defmodule Benchee.Output.BenchmarkPrintertest do assert output =~ "parallel: 2" assert output =~ "inputs: Arg 1, Arg 2" assert output =~ "Estimated total run time: 44 μs" + assert output =~ "Excluding outliers: false" end test "does not print if disabled" do diff --git a/test/benchee_test.exs b/test/benchee_test.exs index 275b9df8..70d9f5e7 100644 --- a/test/benchee_test.exs +++ b/test/benchee_test.exs @@ -1167,6 +1167,7 @@ defmodule BencheeTest do # As the outlier is removed, all measurements are the same assert output =~ ~r/all.*reduction.*same/i + assert output =~ ~r/exclud.*outlier.*true/i end end From 33e006a7a5d0b60fbe6ed0d059882c427484fa0f Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 15:45:03 +0200 Subject: [PATCH 15/17] Include excluding outliers more in the README --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 09f901db..dbfa8cb4 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ The aforementioned [plugins](#plugins) like [benchee_html](https://github.com/be - [Formatters](#formatters) - [Console Formatter options](#console-formatter-options) - [Profiling after a run](#profiling-after-a-run) + - [Remove Outliers](#remove-outliers) - [Saving, loading and comparing previous 
runs](#saving-loading-and-comparing-previous-runs) - [Hooks (Setup, Teardown etc.)](#hooks-setup-teardown-etc) - [Suite hooks](#suite-hooks) @@ -115,6 +116,7 @@ The aforementioned [plugins](#plugins) like [benchee_html](https://github.com/be * as precise as it can get, measure with up to nanosecond precision (Operating System dependent) * nicely formatted console output with units scaled to appropriately (nanoseconds to minutes) * (optionally) measures the overhead of function calls so that the measured/reported times really are the execution time of _your_code_ without that overhead. +* (optionally) [removes outliers](#remove-outliers) * [hooks](#hooks-setup-teardown-etc) to execute something before/after a benchmarking invocation, without it impacting the measured time * execute benchmark jobs in parallel to gather more results in the same time, or simulate a system under load * well documented & well tested @@ -265,6 +267,11 @@ The available options are the following (also documented in [hexdocs](https://he This is used to limit memory consumption and unnecessary processing - 1 Million samples is plenty. This limit also applies to number of iterations done during warmup. You can set your own number or set it to `nil` if you don't want any limit. +* `exclude_outliers` - whether or not statistical outliers should be removed for the calculated statistics. +Defaults to `false`. +This means that values that are far outside the usual range (as determined by the percentiles/quantiles) will +be excluded when calculating statistics (the gathered samples themselves are kept unchanged). You might want to enable this if you +don't want things like the garbage collection triggering to influence your results as much. ### Metrics to measure @@ -547,9 +554,11 @@ Enum."-map/2-lists^map/1-0-"/2 10001 26.38 2282 0.23 ### Remove Outliers -Benchee can remove outliers from the gathered samples. +Benchee can remove outliers from the gathered samples while calculating statistics.
That is, as determined by percentiles/quantiles (we follow [this approach](https://en.wikipedia.org/wiki/Interquartile_range#Outliers)). +You might consider excluding outliers for extreme micro/nano-benchmarks where individual results can be skewed a lot by the Garbage Collection. + You can simply pass `exclude_outliers: true` to Benchee to trigger the removal of outliers. ```elixir @@ -560,6 +569,8 @@ The outliers themselves (aka the samples that have been determined to be outlier as well as the lower/upper bound after which samples are considered outliers are accessible in the `Benchee.Statistics` struct. +The samples themselves still include the outliers, they are only removed for calculating statistics. + ### Saving, loading and comparing previous runs Benchee can store the results of previous runs in a file and then load them again to compare them. For example this is useful to compare what was recorded on the main branch against a branch with performance improvements. You may also use this to benchmark across different exlixir/erlang versions. From 738fd2d61675d1851a3aed7b49754f31b85ad8b4 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 15:46:03 +0200 Subject: [PATCH 16/17] Note about not printing outliers yet --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index dbfa8cb4..1819a1ed 100644 --- a/README.md +++ b/README.md @@ -571,6 +571,8 @@ in the `Benchee.Statistics` struct. The samples themselves still include the outliers, they are only removed for calculating statistics. +Right now Benchee doesn't print the outliers yet, but you can inspect the resulting data structures if you're interested (or send a PR :) ) + ### Saving, loading and comparing previous runs Benchee can store the results of previous runs in a file and then load them again to compare them. For example this is useful to compare what was recorded on the main branch against a branch with performance improvements. 
You may also use this to benchmark across different exlixir/erlang versions. From a8c355e9b97753b67a9d29568c3c363dfc15fe35 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sun, 19 Oct 2025 15:50:35 +0200 Subject: [PATCH 17/17] Redo mainline usage example --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1819a1ed..24c7cc58 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ Produces the following output on the console: Operating System: Linux CPU Information: AMD Ryzen 9 5900X 12-Core Processor Number of Available Cores: 24 -Available memory: 31.25 GB -Elixir 1.16.0-rc.1 -Erlang 26.1.2 +Available memory: 31.26 GB +Elixir 1.19.0 +Erlang 28.1 JIT enabled: true Benchmark suite executing with the following configuration: @@ -39,6 +39,7 @@ reduction time: 0 ns parallel: 1 inputs: none specified Estimated total run time: 28 s +Excluding outliers: false Benchmarking flat_map ... Benchmarking map.flatten ... @@ -46,18 +47,18 @@ Calculating statistics... Formatting results... Name ips average deviation median 99th % -flat_map 3.79 K 263.87 μs ±15.49% 259.47 μs 329.29 μs -map.flatten 1.96 K 509.19 μs ±51.36% 395.23 μs 1262.27 μs +flat_map 3.96 K 252.74 μs ±15.64% 247.61 μs 321.85 μs +map.flatten 1.84 K 543.57 μs ±44.18% 414.16 μs 1223.92 μs Comparison: -flat_map 3.79 K -map.flatten 1.96 K - 1.93x slower +245.32 μs +flat_map 3.96 K +map.flatten 1.84 K - 2.15x slower +290.83 μs Memory usage statistics: Name Memory usage -flat_map 625 KB -map.flatten 781.25 KB - 1.25x memory usage +156.25 KB +flat_map 624.97 KB +map.flatten 781.25 KB - 1.25x memory usage +156.28 KB **All measurements for memory usage were the same** ```