From ddb0dce0515555a5af7f4c0116e77c7cab9847f2 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 26 Jan 2026 16:23:56 -0800
Subject: [PATCH] Run sampler as a method if available

With this PR (https://github.com/huggingface/optimum-executorch/pull/207)
we add a new method, "sampler", to ASR models, alongside "encoder" and
"text_decoder". The flow becomes: if temperature is 0 and the sampler
method is available, run that method; otherwise fall back to the old
path.

This change should significantly improve performance on CUDA, since we
no longer have to copy logits from device to CPU for sampling.

Benchmark result:
---
 .../ci_commit_pins/optimum-executorch.txt |  2 +-
 backends/cuda/runtime/cuda_backend.cpp    | 49 +++++++++++++--
 extension/asr/runner/runner.cpp           | 62 ++++++++++++++-----
 extension/asr/runner/runner.h             |  2 +
 4 files changed, 94 insertions(+), 21 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt
index 70fda8ab3fe..c58d8f9977b 100644
--- a/.ci/docker/ci_commit_pins/optimum-executorch.txt
+++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-732b11313b2006b4d8649500eaf5567ec6ac1e49
+f8aa919593cc51301ade73a2ee5491582521ab80
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index cd1c6b96f02..7c94d67fabe 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <cctype>
 #include
 #include
 
@@ -19,6 +20,7 @@
 #include
 #include
 #include
+#include <string_view>
 #include
 
 // Include our shim layer headers
@@ -60,6 +62,41 @@ constexpr char kSkipCopyOutputToCpuForMethod[] =
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // Trim leading/trailing whitespace from a view of the string.
+  static std::string_view trim(std::string_view s) {
+    size_t start = 0;
+    while (start < s.size() &&
+           std::isspace(static_cast<unsigned char>(s[start]))) {
+      ++start;
+    }
+    size_t end = s.size();
+    while (end > start &&
+           std::isspace(static_cast<unsigned char>(s[end - 1]))) {
+      --end;
+    }
+    return s.substr(start, end - start);
+  }
+
+  // Check if method_name appears in a comma-separated list.
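+  // For example, "encoder, text_decoder" matches both "encoder" and
+  // "text_decoder"; whitespace around each token is ignored.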
+  static bool method_in_csv(
+      const std::string& method_name,
+      const std::string& csv) {
+    size_t pos = 0;
+    while (pos <= csv.size()) {
+      const size_t comma = csv.find(',', pos);
+      const std::string_view token =
+          trim(std::string_view(csv).substr(pos, comma - pos));
+      if (!token.empty() && token == method_name) {
+        return true;
+      }
+      if (comma == std::string::npos) {
+        break;
+      }
+      pos = comma + 1;
+    }
+    return false;
+  }
+
   void set_skip_copy_method(
       const std::array& raw) {
     std::lock_guard guard(skip_copy_method_mutex_);
@@ -83,7 +120,7 @@ class ET_EXPERIMENTAL CudaBackend final
       return false;
     }
     std::lock_guard guard(skip_copy_method_mutex_);
-    return method_name == skip_copy_method_;
+    return method_in_csv(method_name, skip_copy_method_);
   }
 
   Error load_function_pointers_into_handle(
@@ -316,7 +353,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for input %d",
+          "Failed to create GPU tensor for input %" ET_PRIsize_t,
           i);
 
       gpu_inputs[i] = gpu_input_handle;
@@ -325,7 +362,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
           Internal,
-          "Failed to copy input %d from CPU to GPU",
+          "Failed to copy input %" ET_PRIsize_t " from CPU to GPU",
           i);
     }
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -352,7 +389,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for output %d",
+          "Failed to create GPU tensor for output %" ET_PRIsize_t,
           i);
 
       gpu_outputs[i] = gpu_output_handle;
@@ -382,11 +419,11 @@ class ET_EXPERIMENTAL CudaBackend final
         // For DYNAMIC_BOUND tensors we try to resize
         ET_CHECK_OK_OR_RETURN_ERROR(
             resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-            "Error resizing tensor at output index %d",
+            "Error resizing tensor at output index %" ET_PRIsize_t,
             i);
         ET_CHECK_OK_OR_RETURN_ERROR(
             aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-            "Failed to copy GPU output %d back to CPU",
+            "Failed to copy GPU output %" ET_PRIsize_t " back to CPU",
             i);
       }
     } else {
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
index 6c4be57d193..21ff276bb82 100644
--- a/extension/asr/runner/runner.cpp
+++ b/extension/asr/runner/runner.cpp
@@ -27,6 +27,7 @@ namespace {
 
 constexpr const char* kEncoderMethodName = "encoder";
 constexpr const char* kDecoderMethodName = "text_decoder";
+constexpr const char* kSamplerMethodName = "sampler";
 
 } // namespace
 
@@ -47,7 +48,8 @@ AsrRunner::AsrRunner(
 
 bool AsrRunner::is_loaded() const {
   return module_ && encoder_method_loaded_ && decoder_method_loaded_ &&
-      tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
+      (!sampler_method_present_ || sampler_method_loaded_) && tokenizer_ &&
+      tokenizer_->is_loaded() && !eos_token_ids_.empty();
 }
 
 Error AsrRunner::load_tokenizer() {
@@ -96,6 +98,8 @@ Error AsrRunner::load() {
   ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
   const auto& method_names = method_names_result.get();
 
+  sampler_method_present_ = method_names.count(kSamplerMethodName);
+
   ET_CHECK_OR_RETURN_ERROR(
       method_names.count(kEncoderMethodName) &&
           method_names.count(kDecoderMethodName),
@@ -109,13 +113,21 @@ Error AsrRunner::load() {
 
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
   decoder_method_loaded_ = true;
+
+  if (sampler_method_present_) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kSamplerMethodName));
+    sampler_method_loaded_ = true;
+  }
 
 #ifdef CUDA_AVAILABLE
+  // Skip copying outputs to CPU. When a sampler exists, keep both encoder and
+  // decoder outputs on device and pass decoder logits directly into the sampler.
   executorch::runtime::BackendOptions<1> backend_options;
-  // For decoder still copy output from GPU to CPU for sampling.
-  // TODO: change sampler to use a CUDA kernel to sample and then skip copying
-  // decoder output as well
+  std::string skip_methods = kEncoderMethodName;
+  if (sampler_method_present_) {
+    skip_methods.append(",").append(kDecoderMethodName);
+  }
   ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
-      "skip_copy_output_to_cpu_for_method", kEncoderMethodName));
+      "skip_copy_output_to_cpu_for_method", skip_methods.c_str()));
   const auto opt_err =
       executorch::runtime::set_option("CudaBackend", backend_options.view());
   if (opt_err != ::executorch::runtime::Error::Ok) {
@@ -264,6 +276,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
   decoder_inputs.emplace_back(cache_position_ptr);
   // Add some green coloring for the first generated token
   // token_callback("\033[1;32m");
+  const bool use_sampler_method = sampler_method_loaded_;
   while (generated_tokens < config.max_new_tokens) {
     input_id = tokens.back();
     auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
@@ -276,15 +289,36 @@
         "Decoder returned %zu outputs; expected a single tensor.",
         decoder_outputs.size());
 
-    ::executorch::aten::Tensor logits_tensor =
-        std::move(decoder_outputs[0]).toTensor();
-    const int64_t vocab_size = logits_tensor.numel();
-    ET_CHECK_OR_RETURN_ERROR(
-        vocab_size > 0, Internal, "Decoder logits tensor is empty.");
-
-    const int64_t next_token =
-        static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
-            logits_tensor, config.temperature));
+    int64_t next_token = 0;
+    if (!use_sampler_method || config.temperature != 0.0f) {
+      ::executorch::aten::Tensor logits_tensor =
+          std::move(decoder_outputs[0]).toTensor();
+      const int64_t vocab_size = logits_tensor.numel();
+      ET_CHECK_OR_RETURN_ERROR(
+          vocab_size > 0, Internal, "Decoder logits tensor is empty.");
+      next_token =
+          static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
+              logits_tensor, config.temperature));
+    } else {
+      auto sampler_result =
+          module_->execute(kSamplerMethodName, decoder_outputs);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampler_result.error());
+
+      auto sampler_outputs = std::move(*sampler_result);
+      ET_CHECK_OR_RETURN_ERROR(
+          sampler_outputs.size() == 1 && sampler_outputs[0].isTensor(),
+          Internal,
+          "Sampler returned %zu outputs; expected a single tensor.",
+          sampler_outputs.size());
+
+      ::executorch::aten::Tensor token_tensor =
+          std::move(sampler_outputs[0]).toTensor();
+      ET_CHECK_OR_RETURN_ERROR(
+          token_tensor.numel() > 0,
+          Internal,
+          "Sampler token tensor is empty.");
+      next_token = token_tensor.mutable_data_ptr<int64_t>()[0];
+    }
 
     if (!first_token_generated) {
       stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();
diff --git a/extension/asr/runner/runner.h b/extension/asr/runner/runner.h
index a9f8ce3edda..077fdb69fe4 100644
--- a/extension/asr/runner/runner.h
+++ b/extension/asr/runner/runner.h
@@ -108,6 +108,8 @@ class ET_EXPERIMENTAL AsrRunner {
 
   bool encoder_method_loaded_ = false;
   bool decoder_method_loaded_ = false;
+  bool sampler_method_loaded_ = false;
+  bool sampler_method_present_ = false;
 
   Stats stats_;
 };
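
Note for reviewers: with this patch, the "skip_copy_output_to_cpu_for_method"
backend option accepts a comma-separated list of method names, matched by the
trim/method_in_csv helpers above. Below is a minimal standalone sketch of that
matching behavior: the two helpers are copied from the patch, while the main()
harness is illustrative only and not part of the change.

#include <cassert>
#include <cctype>
#include <string>
#include <string_view>

static std::string_view trim(std::string_view s) {
  size_t start = 0;
  while (start < s.size() &&
         std::isspace(static_cast<unsigned char>(s[start]))) {
    ++start;
  }
  size_t end = s.size();
  while (end > start &&
         std::isspace(static_cast<unsigned char>(s[end - 1]))) {
    --end;
  }
  return s.substr(start, end - start);
}

static bool method_in_csv(const std::string& method_name,
                          const std::string& csv) {
  size_t pos = 0;
  while (pos <= csv.size()) {
    const size_t comma = csv.find(',', pos);
    const std::string_view token =
        trim(std::string_view(csv).substr(pos, comma - pos));
    if (!token.empty() && token == method_name) {
      return true;
    }
    if (comma == std::string::npos) {
      break;
    }
    pos = comma + 1;
  }
  return false;
}

int main() {
  // With a sampler method the runner sets "encoder,text_decoder"; without
  // one it sets just "encoder", so decoder logits still come back to CPU.
  assert(method_in_csv("encoder", "encoder,text_decoder"));
  assert(method_in_csv("text_decoder", "encoder,text_decoder"));
  // Whitespace around tokens is trimmed before comparison.
  assert(method_in_csv("encoder", " encoder , text_decoder "));
  // Methods not in the list do not match.
  assert(!method_in_csv("text_decoder", "encoder"));
  return 0;
}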