diff --git a/Makefile b/Makefile
index fe0236238fa..3bc172f1d72 100644
--- a/Makefile
+++ b/Makefile
@@ -88,7 +88,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cpu parakeet-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cpu parakeet-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma3-text-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -106,6 +106,7 @@ help:
 	@echo "  llava-cpu        - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda      - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu       - Build Gemma3 runner with CPU backend"
+	@echo "  gemma3-text-cpu  - Build Gemma3 text-only runner with CPU backend"
 	@echo "  clean            - Clean build artifacts"
 
 voxtral-cuda:
@@ -234,6 +235,15 @@ gemma3-cpu:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/gemma3/gemma3_e2e_runner"
 
+gemma3-text-cpu:
+	@echo "==> Building and installing ExecuTorch..."
+	cmake --workflow --preset llm-release
+	@echo "==> Building Gemma3 text runner (CPU)..."
+	cd examples/models/gemma3 && cmake --workflow --preset gemma3-text-cpu
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/gemma3/gemma3_text_runner"
+
 clean:
 	rm -rf cmake-out \
 		extension/llm/tokenizers/build \
diff --git a/examples/models/gemma3/CMakeLists.txt b/examples/models/gemma3/CMakeLists.txt
index d228ca53c46..12d53a8f6c6 100644
--- a/examples/models/gemma3/CMakeLists.txt
+++ b/examples/models/gemma3/CMakeLists.txt
@@ -37,7 +37,6 @@ find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
 executorch_target_link_options_shared_lib(executorch)
 
 set(link_libraries executorch gflags)
-set(_srcs e2e_runner.cpp)
 
 list(
   APPEND
@@ -109,7 +108,8 @@ endif()
 # Add tokenizers
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(gemma3_e2e_runner ${_srcs})
+# Executable for multimodal e2e runner (with image support)
+add_executable(gemma3_e2e_runner e2e_runner.cpp)
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(gemma3_e2e_runner)
   if(NOT APPLE)
@@ -122,3 +122,18 @@ target_include_directories(
 )
 target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries})
 target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options})
+
+# Executable for text-only runner (no image support)
+add_executable(gemma3_text_runner text_runner.cpp)
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(gemma3_text_runner)
+  if(NOT APPLE)
+    target_link_options(gemma3_text_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(
+  gemma3_text_runner PUBLIC ${_common_include_directories}
+)
+target_link_libraries(gemma3_text_runner PUBLIC ${link_libraries})
+target_compile_options(gemma3_text_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/gemma3/CMakePresets.json b/examples/models/gemma3/CMakePresets.json
index dcfeceba1cd..376930f32f8 100644
--- a/examples/models/gemma3/CMakePresets.json
+++ b/examples/models/gemma3/CMakePresets.json
@@ -36,6 +36,12 @@
       "configurePreset": "gemma3-cpu",
       "targets": ["gemma3_e2e_runner"]
     },
+    {
+      "name": "gemma3-text-cpu",
+      "displayName": "Build Gemma3 text runner (CPU)",
"configurePreset": "gemma3-cpu", + "targets": ["gemma3_text_runner"] + }, { "name": "gemma3-cuda", "displayName": "Build Gemma3 runner (CUDA)", @@ -58,6 +64,20 @@ } ] }, + { + "name": "gemma3-text-cpu", + "displayName": "Configure and build Gemma3 text runner (CPU)", + "steps": [ + { + "type": "configure", + "name": "gemma3-cpu" + }, + { + "type": "build", + "name": "gemma3-text-cpu" + } + ] + }, { "name": "gemma3-cuda", "displayName": "Configure and build Gemma3 runner (CUDA)", diff --git a/examples/models/gemma3/README.md b/examples/models/gemma3/README.md index 9d36ae2b625..a64133b3c41 100644 --- a/examples/models/gemma3/README.md +++ b/examples/models/gemma3/README.md @@ -1,27 +1,96 @@ # Summary -This example demonstrates how to export and run Google's [Gemma 3](https://huggingface.co/google/gemma-3-4b-it) vision-language multimodal model locally on ExecuTorch with CUDA backend support. +This example demonstrates how to export and run Google's Gemma 3 models on ExecuTorch: +- [Gemma 3 4B](https://huggingface.co/google/gemma-3-4b-it) - Vision-language multimodal model (CUDA/CPU) +- [Gemma 3 1B](https://huggingface.co/google/gemma-3-1b-it) - Text-only instruction-tuned model (CPU) -# Exporting the model -To export the model, we use [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch), a repo that enables exporting models straight from the source - from HuggingFace's Transformers repo. +# Prerequisites ## Setting up Optimum ExecuTorch -Install through pip package: -``` +To export the models, we use [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch), which enables exporting models from HuggingFace's Transformers. + +Install through pip: +```bash pip install optimum-executorch ``` Or install from source: -``` +```bash git clone https://github.com/huggingface/optimum-executorch.git cd optimum-executorch python install_dev.py ``` -## CUDA Support -This guide focuses on CUDA backend support for Gemma3, which provides accelerated performance on NVIDIA GPUs. +## Obtaining the Tokenizer +Both Gemma 3 models share the same tokenizer. Download `tokenizer.json` from HuggingFace: +```bash +mkdir -p gemma-3 +curl -L https://huggingface.co/google/gemma-3-1b-it/resolve/main/tokenizer.json -o gemma-3/tokenizer.json +``` + +--- + +# Gemma 3 1B Text-Only Model (CPU) + +This section covers running the lightweight Gemma 3 1B instruction-tuned model for text-only inference on CPU. + +## Exporting Gemma 3 1B + +```bash +optimum-cli export executorch \ + --model "google/gemma-3-1b-it" \ + --task "text-generation" \ + --recipe "xnnpack" \ + --use_custom_sdpa \ + --use_custom_kv_cache \ + --output_dir="gemma-3/gemma-3-1b-it" +``` + +This will generate: +- `model.pte` - The exported model + +## Building the Text Runner + +```bash +make gemma3-text-cpu +``` + +## Running the Text Model -### Exporting with CUDA +```bash +./cmake-out/examples/models/gemma3/gemma3_text_runner \ + --model_path=gemma-3/gemma-3-1b-it/model.pte \ + --tokenizer_path=gemma-3/tokenizer.json \ + --prompt="What is the capital of France?" 
\ + --max_new_tokens=100 +``` + +### Available Options +| Flag | Description | Default | +|------|-------------|---------| +| `--model_path` | Path to the exported model.pte | `model.pte` | +| `--tokenizer_path` | Path to tokenizer.json | `tokenizer.json` | +| `--prompt` | Text prompt for generation | `Hello, world!` | +| `--temperature` | Sampling temperature (0 = greedy) | `0.0` | +| `--max_new_tokens` | Maximum tokens to generate | `100` | +| `--cpu_threads` | Number of CPU threads (-1 = auto) | `-1` | +| `--warmup` | Run warmup before generation | `false` | + +### Example Output +``` +The capital of France is **Paris**. +PyTorchObserver {"prompt_tokens":15,"generated_tokens":12,...} +``` + +--- + +# Gemma 3 4B Multimodal Model (CUDA) + +This section covers running the Gemma 3 4B vision-language multimodal model with CUDA backend support. + +## Exporting Gemma 3 4B + +### Standard Export ```bash optimum-cli export executorch \ --model "google/gemma-3-4b-it" \ @@ -29,15 +98,15 @@ optimum-cli export executorch \ --recipe "cuda" \ --dtype bfloat16 \ --device cuda \ - --output_dir="path/to/output/dir" + --output_dir="gemma-3/gemma-3-4b-it" ``` This will generate: - `model.pte` - The exported model - `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime -### Exporting with INT4 Quantization (Tile Packed) -For improved performance and reduced memory footprint, you can export Gemma3 with INT4 weight quantization using tile-packed format: +### Export with INT4 Quantization (Tile Packed) +For improved performance and reduced memory footprint: ```bash optimum-cli export executorch \ @@ -50,59 +119,45 @@ optimum-cli export executorch \ --qlinear_encoder 4w \ --qlinear_packing_format tile_packed_to_4d \ --qlinear_encoder_packing_format tile_packed_to_4d \ - --output_dir="path/to/output/dir" -``` - -This will generate the same files (`model.pte` and `aoti_cuda_blob.ptd`) in the `int4` directory. - -See the "Building the Gemma3 runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions. - -# Running the model -To run the model, we will use the Gemma3 runner, which utilizes ExecuTorch's MultiModal runner API. -The Gemma3 runner will do the following: - -- **Image Input**: Load image files (PNG, JPG, etc.) and format them as input tensors for the model -- **Text Input**: Process text prompts using the tokenizer -- **Feed the formatted inputs** to the multimodal runner for inference - -## Obtaining the tokenizer -You can download the `tokenizer.json` file from [Gemma 3's HuggingFace repo](https://huggingface.co/unsloth/gemma-3-1b-it): -```bash -curl -L https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json -o tokenizer.json + --output_dir="gemma-3/gemma-3-4b-it-int4" ``` -## Building the Gemma3 runner +## Building the Multimodal Runner ### Prerequisites -Ensure you have a CUDA-capable GPU and CUDA toolkit installed on your system. +Ensure you have a CUDA-capable GPU and CUDA toolkit installed. -### Building for CUDA +### Build Commands ```bash -# Build the Gemma3 runner with CUDA enabled +# Build with CUDA backend make gemma3-cuda -# Build the Gemma3 runner with CPU enabled +# Build with CPU backend make gemma3-cpu ``` -## Running the model -You need to provide the following files to run Gemma3: -- `model.pte` - The exported model file -- `aoti_cuda_blob.ptd` - The CUDA kernel blob -- `tokenizer.json` - The tokenizer file -- An image file (PNG, JPG, etc.) 
+## Running the Multimodal Model
+
+The multimodal runner processes both image and text inputs:
 
-### Example usage
 ```bash
 ./cmake-out/examples/models/gemma3/gemma3_e2e_runner \
-  --model_path path/to/model.pte \
-  --data_path path/to/aoti_cuda_blob.ptd \
-  --tokenizer_path path/to/tokenizer.json \
-  --image_path docs/source/_static/img/et-logo.png \ # here we use the ExecuTorch logo as an example
-  --temperature 0
+  --model_path=gemma-3/gemma-3-4b-it/model.pte \
+  --data_path=gemma-3/gemma-3-4b-it/aoti_cuda_blob.ptd \
+  --tokenizer_path=gemma-3/tokenizer.json \
+  --image_path=docs/source/_static/img/et-logo.png \
+  --temperature=0
 ```
 
-# Example output
+### Required Files
+| File | Description |
+|------|-------------|
+| `model.pte` | The exported model file |
+| `aoti_cuda_blob.ptd` | CUDA kernel blob (CUDA only) |
+| `tokenizer.json` | Shared tokenizer |
+| Image file | PNG, JPG, or other supported format |
+
+### Example Output
 ```
 Okay, let's break down what's in the image!
 
@@ -111,5 +166,5 @@ It appears to be a stylized graphic combining:
 * **A Microchip:** The core shape is a representation of a microchip (the integrated circuit).
 * **An "On" Symbol:** There's an "On" symbol (often represented as a circle with a vertical line) incorporated into the microchip design.
 * **Color Scheme:** The microchip is colored in gray, and
-PyTorchObserver {"prompt_tokens":271,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1761118126790,"inference_end_ms":1761118128385,"prompt_eval_end_ms":1761118127175,"first_token_ms":1761118127175,"aggregate_sampling_time_ms":86,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
+PyTorchObserver {"prompt_tokens":271,"generated_tokens":99,...}
 ```
diff --git a/examples/models/gemma3/text_runner.cpp b/examples/models/gemma3/text_runner.cpp
new file mode 100644
index 00000000000..6474a0d3ea5
--- /dev/null
+++ b/examples/models/gemma3/text_runner.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <memory>
+#include <string>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/llm/runner/irunner.h>
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <executorch/runtime/platform/log.h>
+
+#if defined(ET_USE_THREADPOOL)
+#include <executorch/extension/threadpool/cpuinfo_utils.h>
+#include <executorch/extension/threadpool/threadpool.h>
+#endif
+
+DEFINE_string(
+    model_path,
+    "model.pte",
+    "Model serialized in flatbuffer format.");
+
+DEFINE_string(tokenizer_path, "tokenizer.json", "Tokenizer path.");
+
+DEFINE_string(prompt, "Hello, world!", "Text prompt.");
+
+DEFINE_double(
+    temperature,
+    0.0f,
+    "Temperature; Default is 0. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
+
+DEFINE_int32(
+    max_new_tokens,
+    100,
+    "Maximum number of tokens to generate.");
+
+DEFINE_int32(
+    cpu_threads,
+    -1,
+    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
+
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
+int32_t main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  const char* model_path = FLAGS_model_path.c_str();
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
+  const char* prompt = FLAGS_prompt.c_str();
+  float temperature = FLAGS_temperature;
+  int32_t max_new_tokens = FLAGS_max_new_tokens;
+  int32_t cpu_threads = FLAGS_cpu_threads;
+  bool warmup = FLAGS_warmup;
+
+#if defined(ET_USE_THREADPOOL)
+  uint32_t num_performant_cores = cpu_threads == -1
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif
+
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create text LLM runner
+  std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner =
+      ::executorch::extension::llm::create_text_llm_runner(
+          model_path, std::move(tokenizer));
+
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create text LLM runner");
+    return 1;
+  }
+
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load text LLM runner");
+    return 1;
+  }
+
+  // Format prompt with Gemma3 chat template
+  std::string formatted_prompt = std::string("<start_of_turn>user\n") +
+      std::string(prompt) +
+      std::string("<end_of_turn>\n<start_of_turn>model\n");
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.max_new_tokens = max_new_tokens;
+  config.temperature = temperature;
+
+  // Run warmup if requested
+  if (warmup) {
+    ET_LOG(Info, "Running warmup...");
+    auto warmup_error = runner->warmup(formatted_prompt, max_new_tokens);
+    if (warmup_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(Error, "Failed to run warmup");
+      return 1;
+    }
+    runner->reset();
+  }
+
+  ET_LOG(Info, "Generating response...");
+
+  // Note: TextLLMRunner::generate() already handles printing tokens and stats
+  // internally, so we don't need to pass callbacks for printing
+  auto error = runner->generate(formatted_prompt, config);
+
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with text LLM runner\n");
+    return 1;
+  }
+
+  return 0;
+}
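
A note on the final comment in `text_runner.cpp`: because `TextLLMRunner::generate()` prints tokens and stats itself, the runner above passes no callbacks. If a caller instead wants to capture the generated text, the sketch below shows one way to do it. It is not part of the diff; it assumes `generate()` accepts an optional per-token `std::function` callback (verify the signature in `extension/llm/runner/text_llm_runner.h` of your ExecuTorch checkout) and it reuses the `runner`, `formatted_prompt`, and `config` variables defined in `text_runner.cpp` above.

```cpp
// Sketch only, not part of the diff above. Assumes generate() takes an
// optional std::function<void(const std::string&)> token callback; check
// extension/llm/runner/text_llm_runner.h before relying on this.
std::string output;
auto gen_error = runner->generate(
    formatted_prompt,
    config,
    [&output](const std::string& token) {
      output += token; // accumulate tokens instead of relying on stdout
    });
if (gen_error != ::executorch::runtime::Error::Ok) {
  ET_LOG(Error, "Failed to generate with text LLM runner");
  return 1;
}
ET_LOG(Info, "Captured %zu bytes of generated text", output.size());
```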