diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt
index 70fda8ab3fe..c58d8f9977b 100644
--- a/.ci/docker/ci_commit_pins/optimum-executorch.txt
+++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-732b11313b2006b4d8649500eaf5567ec6ac1e49
+f8aa919593cc51301ade73a2ee5491582521ab80
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index cd1c6b96f02..7c94d67fabe 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <cctype>
 #include
 #include
@@ -19,6 +20,7 @@
 #include
 #include
 #include
+#include <string_view>
 #include

 // Include our shim layer headers
@@ -60,6 +62,41 @@ constexpr char kSkipCopyOutputToCpuForMethod[] =
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // Trim leading/trailing whitespace from a view of the string.
+  static std::string_view trim(std::string_view s) {
+    size_t start = 0;
+    while (start < s.size() &&
+           std::isspace(static_cast<unsigned char>(s[start]))) {
+      ++start;
+    }
+    size_t end = s.size();
+    while (end > start &&
+           std::isspace(static_cast<unsigned char>(s[end - 1]))) {
+      --end;
+    }
+    return s.substr(start, end - start);
+  }
+
+  // Check if method_name appears in a comma-separated list.
+  static bool method_in_csv(
+      const std::string& method_name,
+      const std::string& csv) {
+    size_t pos = 0;
+    while (pos <= csv.size()) {
+      const size_t comma = csv.find(',', pos);
+      const std::string_view token =
+          trim(std::string_view(csv).substr(pos, comma - pos));
+      if (!token.empty() && token == method_name) {
+        return true;
+      }
+      if (comma == std::string::npos) {
+        break;
+      }
+      pos = comma + 1;
+    }
+    return false;
+  }
+
   void set_skip_copy_method(
       const std::array& raw) {
     std::lock_guard guard(skip_copy_method_mutex_);
@@ -83,7 +120,7 @@ class ET_EXPERIMENTAL CudaBackend final
       return false;
     }
     std::lock_guard guard(skip_copy_method_mutex_);
-    return method_name == skip_copy_method_;
+    return method_in_csv(method_name, skip_copy_method_);
   }

   Error load_function_pointers_into_handle(
@@ -316,7 +353,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for input %d",
+          "Failed to create GPU tensor for input %" ET_PRIsize_t,
           i);

       gpu_inputs[i] = gpu_input_handle;
@@ -325,7 +362,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
           Internal,
-          "Failed to copy input %d from CPU to GPU",
+          "Failed to copy input %" ET_PRIsize_t " from CPU to GPU",
           i);
     }
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -352,7 +389,7 @@ class ET_EXPERIMENTAL CudaBackend final
       ET_CHECK_OR_RETURN_ERROR(
           create_err == Error::Ok,
           Internal,
-          "Failed to create GPU tensor for output %d",
+          "Failed to create GPU tensor for output %" ET_PRIsize_t,
           i);

       gpu_outputs[i] = gpu_output_handle;
@@ -382,11 +419,11 @@ class ET_EXPERIMENTAL CudaBackend final
         // For DYNAMIC_BOUND tensors we try to resize
         ET_CHECK_OK_OR_RETURN_ERROR(
             resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
-            "Error resizing tensor at output index %d",
+            "Error resizing tensor at output index %" ET_PRIsize_t,
             i);
         ET_CHECK_OK_OR_RETURN_ERROR(
             aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
-            "Failed to copy GPU output %d back to CPU",
+            "Failed to copy GPU output %" ET_PRIsize_t " back to CPU",
             i);
       }
     } else {
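Note: the `method_in_csv` helper above lets the existing `skip_copy_output_to_cpu_for_method` option accept a comma-separated list of method names; tokens are whitespace-trimmed and empty tokens are ignored. A minimal illustrative sketch (not part of the diff) of how a client could set such a value follows, reusing the `BackendOptions` and `set_option` calls that appear in the runner change below. The wrapper function name and the method names are placeholders, and the snippet assumes the same ExecuTorch headers that runner.cpp already includes.

// Sketch only: configure the CUDA backend to skip device-to-host output
// copies for more than one method at once.
executorch::runtime::Error skip_gpu_output_copies() {
  executorch::runtime::BackendOptions<1> backend_options;
  // Comma-separated list; the backend trims whitespace around each name.
  ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
      "skip_copy_output_to_cpu_for_method", "encoder, text_decoder"));
  const auto opt_err =
      executorch::runtime::set_option("CudaBackend", backend_options.view());
  return opt_err;
}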
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
index 6c4be57d193..21ff276bb82 100644
--- a/extension/asr/runner/runner.cpp
+++ b/extension/asr/runner/runner.cpp
@@ -27,6 +27,7 @@ namespace {

 constexpr const char* kEncoderMethodName = "encoder";
 constexpr const char* kDecoderMethodName = "text_decoder";
+constexpr const char* kSamplerMethodName = "sampler";

 } // namespace
@@ -47,7 +48,8 @@ AsrRunner::AsrRunner(

 bool AsrRunner::is_loaded() const {
   return module_ && encoder_method_loaded_ && decoder_method_loaded_ &&
-      tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
+      (!sampler_method_present_ || sampler_method_loaded_) && tokenizer_ &&
+      tokenizer_->is_loaded() && !eos_token_ids_.empty();
 }

 Error AsrRunner::load_tokenizer() {
@@ -96,6 +98,8 @@ Error AsrRunner::load() {
   ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
   const auto& method_names = method_names_result.get();

+  sampler_method_present_ = method_names.count(kSamplerMethodName);
+
   ET_CHECK_OR_RETURN_ERROR(
       method_names.count(kEncoderMethodName) &&
           method_names.count(kDecoderMethodName),
@@ -109,13 +113,21 @@ Error AsrRunner::load() {
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
   decoder_method_loaded_ = true;
+
+  if (sampler_method_present_) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kSamplerMethodName));
+    sampler_method_loaded_ = true;
+  }

 #ifdef CUDA_AVAILABLE
+  // Skip copying outputs to CPU. When a sampler exists, keep both encoder and
+  // decoder outputs on device and pass decoder logits directly into the sampler.
   executorch::runtime::BackendOptions<1> backend_options;
-  // For decoder still copy output from GPU to CPU for sampling.
-  // TODO: change sampler to use a CUDA kernel to sample and then skip copying
-  // decoder output as well
+  std::string skip_methods = kEncoderMethodName;
+  if (sampler_method_present_) {
+    skip_methods.append(",").append(kDecoderMethodName);
+  }
   ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
-      "skip_copy_output_to_cpu_for_method", kEncoderMethodName));
+      "skip_copy_output_to_cpu_for_method", skip_methods.c_str()));
   const auto opt_err =
       executorch::runtime::set_option("CudaBackend", backend_options.view());
   if (opt_err != ::executorch::runtime::Error::Ok) {
@@ -264,6 +276,7 @@ Result> AsrRunner::transcribe(
   decoder_inputs.emplace_back(cache_position_ptr);
   // Add some green coloring for the first generated token
   // token_callback("\033[1;32m");
+  const bool use_sampler_method = sampler_method_loaded_;
   while (generated_tokens < config.max_new_tokens) {
     input_id = tokens.back();
     auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
@@ -276,15 +289,36 @@ Result> AsrRunner::transcribe(
         "Decoder returned %zu outputs; expected a single tensor.",
         decoder_outputs.size());

-    ::executorch::aten::Tensor logits_tensor =
-        std::move(decoder_outputs[0]).toTensor();
-    const int64_t vocab_size = logits_tensor.numel();
-    ET_CHECK_OR_RETURN_ERROR(
-        vocab_size > 0, Internal, "Decoder logits tensor is empty.");
-
-    const int64_t next_token =
-        static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
-            logits_tensor, config.temperature));
+    int64_t next_token = 0;
+    if (!use_sampler_method || config.temperature != 0.0f) {
+      ::executorch::aten::Tensor logits_tensor =
+          std::move(decoder_outputs[0]).toTensor();
+      const int64_t vocab_size = logits_tensor.numel();
+      ET_CHECK_OR_RETURN_ERROR(
+          vocab_size > 0, Internal, "Decoder logits tensor is empty.");
+      next_token =
+          static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
+              logits_tensor, config.temperature));
+    } else {
+      auto sampler_result =
+          module_->execute(kSamplerMethodName, decoder_outputs);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampler_result.error());
+
+      auto sampler_outputs = std::move(*sampler_result);
+      ET_CHECK_OR_RETURN_ERROR(
+          sampler_outputs.size() == 1 && sampler_outputs[0].isTensor(),
+          Internal,
+          "Sampler returned %zu outputs; expected a single tensor.",
+          sampler_outputs.size());
+
+      ::executorch::aten::Tensor token_tensor =
+          std::move(sampler_outputs[0]).toTensor();
+      ET_CHECK_OR_RETURN_ERROR(
+          token_tensor.numel() > 0,
+          Internal,
+          "Sampler token tensor is empty.");
+      next_token = token_tensor.mutable_data_ptr<int64_t>()[0];
+    }

     if (!first_token_generated) {
       stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();
diff --git a/extension/asr/runner/runner.h b/extension/asr/runner/runner.h
index a9f8ce3edda..077fdb69fe4 100644
--- a/extension/asr/runner/runner.h
+++ b/extension/asr/runner/runner.h
@@ -108,6 +108,8 @@ class ET_EXPERIMENTAL AsrRunner {

   bool encoder_method_loaded_ = false;
   bool decoder_method_loaded_ = false;
+  bool sampler_method_loaded_ = false;
+  bool sampler_method_present_ = false;

   Stats stats_;
 };
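Usage note: the runner treats the optional `sampler` method as taking the decoder's output EValues (the logits tensor) and returning a single tensor whose first element is the next token id, and it only uses it for greedy decoding (`config.temperature == 0.0f`). Below is a minimal, self-contained sketch (not part of the diff) of that contract exercised through the Module API; the helper name, the `int64_t` element type, and the error-handling choices are assumptions based on the code above rather than a documented interface.

// Sketch only: mirrors how transcribe() calls the optional "sampler" method.
#include <executorch/extension/module/module.h>

#include <cstdint>
#include <utility>
#include <vector>

using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::EValue;
using ::executorch::runtime::Result;

Result<int64_t> sample_next_token(
    Module& module,
    const std::vector<EValue>& decoder_outputs) {
  // Run the exported sampler on the decoder outputs (kept on device when the
  // CUDA backend is told to skip the device-to-host copy).
  auto sampler_result = module.execute("sampler", decoder_outputs);
  if (!sampler_result.ok()) {
    return sampler_result.error();
  }
  auto sampler_outputs = std::move(*sampler_result);
  if (sampler_outputs.size() != 1 || !sampler_outputs[0].isTensor()) {
    return Error::Internal;
  }
  auto token_tensor = sampler_outputs[0].toTensor();
  if (token_tensor.numel() == 0) {
    return Error::Internal;
  }
  // Element type assumed to be int64 token ids, matching the runner's read.
  return token_tensor.const_data_ptr<int64_t>()[0];
}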