Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/optimum-executorch.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
732b11313b2006b4d8649500eaf5567ec6ac1e49
f8aa919593cc51301ade73a2ee5491582521ab80
49 changes: 43 additions & 6 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <cctype>
#include <cstdio>

#include <array>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <string>
#include <string_view>
#include <vector>

// Include our shim layer headers
Expand Down Expand Up @@ -60,6 +62,41 @@ constexpr char kSkipCopyOutputToCpuForMethod[] =
class ET_EXPERIMENTAL CudaBackend final
: public ::executorch::runtime::BackendInterface {
private:
// Return `s` with any leading and trailing whitespace removed.
// Operates purely on the view; no allocation.
static std::string_view trim(std::string_view s) {
  const auto is_space = [](char c) {
    // Cast to unsigned char: passing a negative char to isspace is UB.
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  while (!s.empty() && is_space(s.front())) {
    s.remove_prefix(1);
  }
  while (!s.empty() && is_space(s.back())) {
    s.remove_suffix(1);
  }
  return s;
}

// Return true iff `method_name` equals one of the comma-separated entries
// in `csv`, ignoring whitespace around each entry. Empty entries never match.
static bool method_in_csv(
    const std::string& method_name,
    const std::string& csv) {
  const std::string_view all(csv);
  for (size_t begin = 0; begin <= all.size();) {
    const size_t comma = all.find(',', begin);
    const size_t stop =
        (comma == std::string_view::npos) ? all.size() : comma;
    // Strip surrounding whitespace from the current entry in place.
    size_t lo = begin;
    size_t hi = stop;
    while (lo < hi && std::isspace(static_cast<unsigned char>(all[lo]))) {
      ++lo;
    }
    while (hi > lo && std::isspace(static_cast<unsigned char>(all[hi - 1]))) {
      --hi;
    }
    const std::string_view token = all.substr(lo, hi - lo);
    if (!token.empty() && token == method_name) {
      return true;
    }
    if (comma == std::string_view::npos) {
      break;
    }
    begin = comma + 1;
  }
  return false;
}

void set_skip_copy_method(
const std::array<char, kMaxOptionValueLength>& raw) {
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
Expand All @@ -83,7 +120,7 @@ class ET_EXPERIMENTAL CudaBackend final
return false;
}
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
return method_name == skip_copy_method_;
return method_in_csv(method_name, skip_copy_method_);
}

Error load_function_pointers_into_handle(
Expand Down Expand Up @@ -316,7 +353,7 @@ class ET_EXPERIMENTAL CudaBackend final
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for input %d",
"Failed to create GPU tensor for input %" ET_PRIsize_t,
i);

gpu_inputs[i] = gpu_input_handle;
Expand All @@ -325,7 +362,7 @@ class ET_EXPERIMENTAL CudaBackend final
ET_CHECK_OR_RETURN_ERROR(
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
Internal,
"Failed to copy input %d from CPU to GPU",
"Failed to copy input %" ET_PRIsize_t " from CPU to GPU",
i);
}
// Process output tensors: create GPU counterparts for ExecuTorch CPU
Expand All @@ -352,7 +389,7 @@ class ET_EXPERIMENTAL CudaBackend final
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for output %d",
"Failed to create GPU tensor for output %" ET_PRIsize_t,
i);

gpu_outputs[i] = gpu_output_handle;
Expand Down Expand Up @@ -382,11 +419,11 @@ class ET_EXPERIMENTAL CudaBackend final
// For DYNAMIC_BOUND tensors we try to resize
ET_CHECK_OK_OR_RETURN_ERROR(
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
"Error resizing tensor at output index %d",
"Error resizing tensor at output index %" ET_PRIsize_t,
i);
ET_CHECK_OK_OR_RETURN_ERROR(
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
"Failed to copy GPU output %d back to CPU",
"Failed to copy GPU output %" ET_PRIsize_t " back to CPU",
i);
}
} else {
Expand Down
62 changes: 48 additions & 14 deletions extension/asr/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace {

constexpr const char* kEncoderMethodName = "encoder";
constexpr const char* kDecoderMethodName = "text_decoder";
constexpr const char* kSamplerMethodName = "sampler";

} // namespace

Expand All @@ -47,7 +48,8 @@ AsrRunner::AsrRunner(

// A runner is usable once the module, the encoder/decoder methods, the
// (optional) sampler method, and a tokenizer with known EOS ids are ready.
bool AsrRunner::is_loaded() const {
  if (!module_ || !encoder_method_loaded_ || !decoder_method_loaded_) {
    return false;
  }
  // The sampler method only gates readiness when the model exports one.
  if (sampler_method_present_ && !sampler_method_loaded_) {
    return false;
  }
  return tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
}

Error AsrRunner::load_tokenizer() {
Expand Down Expand Up @@ -96,6 +98,8 @@ Error AsrRunner::load() {
ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
const auto& method_names = method_names_result.get();

sampler_method_present_ = method_names.count(kSamplerMethodName);

ET_CHECK_OR_RETURN_ERROR(
method_names.count(kEncoderMethodName) &&
method_names.count(kDecoderMethodName),
Expand All @@ -109,13 +113,21 @@ Error AsrRunner::load() {

ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
decoder_method_loaded_ = true;

if (sampler_method_present_) {
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kSamplerMethodName));
sampler_method_loaded_ = true;
}
#ifdef CUDA_AVAILABLE
// Skip copying outputs to CPU. When a sampler exists, keep both encoder and
// decoder outputs on device and pass decoder logits directly into sampler.
executorch::runtime::BackendOptions<1> backend_options;
// For decoder still copy output from GPU to CPU for sampling.
// TODO: change sampler to use a CUDA kernel to sample and then skip copying
// decoder output as well
std::string skip_methods = kEncoderMethodName;
if (sampler_method_present_) {
skip_methods.append(",").append(kDecoderMethodName);
}
ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
"skip_copy_output_to_cpu_for_method", kEncoderMethodName));
"skip_copy_output_to_cpu_for_method", skip_methods.c_str()));
const auto opt_err =
executorch::runtime::set_option("CudaBackend", backend_options.view());
if (opt_err != ::executorch::runtime::Error::Ok) {
Expand Down Expand Up @@ -264,6 +276,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
decoder_inputs.emplace_back(cache_position_ptr);
// Add some green coloring for the first generated token
// token_callback("\033[1;32m");
const bool use_sampler_method = sampler_method_loaded_;
while (generated_tokens < config.max_new_tokens) {
input_id = tokens.back();
auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
Expand All @@ -276,15 +289,36 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
"Decoder returned %zu outputs; expected a single tensor.",
decoder_outputs.size());

::executorch::aten::Tensor logits_tensor =
std::move(decoder_outputs[0]).toTensor();
const int64_t vocab_size = logits_tensor.numel();
ET_CHECK_OR_RETURN_ERROR(
vocab_size > 0, Internal, "Decoder logits tensor is empty.");

const int64_t next_token =
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
logits_tensor, config.temperature));
int64_t next_token = 0;
if (!use_sampler_method || config.temperature != 0.0f) {
::executorch::aten::Tensor logits_tensor =
std::move(decoder_outputs[0]).toTensor();
const int64_t vocab_size = logits_tensor.numel();
ET_CHECK_OR_RETURN_ERROR(
vocab_size > 0, Internal, "Decoder logits tensor is empty.");
next_token =
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
logits_tensor, config.temperature));
} else {
auto sampler_result =
module_->execute(kSamplerMethodName, decoder_outputs);
ET_CHECK_OK_OR_RETURN_ERROR(sampler_result.error());

auto sampler_outputs = std::move(*sampler_result);
ET_CHECK_OR_RETURN_ERROR(
sampler_outputs.size() == 1 && sampler_outputs[0].isTensor(),
Internal,
"Sampler returned %zu outputs; expected a single tensor.",
sampler_outputs.size());

::executorch::aten::Tensor token_tensor =
std::move(sampler_outputs[0]).toTensor();
ET_CHECK_OR_RETURN_ERROR(
token_tensor.numel() > 0,
Internal,
"Sampler logits tensor is empty.");
next_token = token_tensor.mutable_data_ptr<int64_t>()[0];
}

if (!first_token_generated) {
stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();
Expand Down
2 changes: 2 additions & 0 deletions extension/asr/runner/runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class ET_EXPERIMENTAL AsrRunner {

bool encoder_method_loaded_ = false;
bool decoder_method_loaded_ = false;
bool sampler_method_loaded_ = false;
bool sampler_method_present_ = false;

Stats stats_;
};
Expand Down
Loading