From 08a1f98eef45b0e69b437130800fe3bae273b257 Mon Sep 17 00:00:00 2001 From: ingyukoh Date: Mon, 2 Feb 2026 12:46:23 +0900 Subject: [PATCH] fix: use physical cores for --threads auto-detect (#19110) Replace std::thread::hardware_concurrency() with cpu_get_num_math() when --threads is set to -1 or 0 (auto-detect mode). hardware_concurrency() returns logical cores (includes hyperthreads), causing thread oversubscription and performance degradation: - 100% CPU usage instead of optimal ~50% - 3.6x slower (2.5 tok/s vs 9 tok/s reported) cpu_get_num_math() returns physical cores and also handles Intel hybrid CPUs by skipping efficiency cores for math workloads. Fixes #19110 --- common/arg.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c02..43909265b3b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1111,7 +1111,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = cpu_get_num_math(); } } ).set_env("LLAMA_ARG_THREADS")); @@ -1121,7 +1121,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = cpu_get_num_math(); } } )); @@ -3216,7 +3216,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.cpuparams.n_threads = value; if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + params.speculative.cpuparams.n_threads = cpu_get_num_math(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); @@ -3226,7 +3226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.speculative.cpuparams_batch.n_threads = value; if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + params.speculative.cpuparams_batch.n_threads = cpu_get_num_math(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));