12 changes: 11 additions & 1 deletion Makefile
@@ -88,7 +88,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cpu parakeet-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cpu parakeet-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma3-text-cpu clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -106,6 +106,7 @@ help:
@echo " llava-cpu - Build Llava runner with CPU backend"
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@echo " gemma3-text-cpu - Build Gemma3 text-only runner with CPU backend"
@echo " clean - Clean build artifacts"

voxtral-cuda:
@@ -234,6 +235,15 @@ gemma3-cpu:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/gemma3/gemma3_e2e_runner"

gemma3-text-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
@echo "==> Building Gemma3 text runner (CPU)..."
cd examples/models/gemma3 && cmake --workflow --preset gemma3-text-cpu
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/gemma3/gemma3_text_runner"

clean:
rm -rf cmake-out \
extension/llm/tokenizers/build \
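For reference, the new target is invoked from the repository root like any other runner target; a minimal sketch, assuming the usual ExecuTorch build prerequisites (CMake, a C++ toolchain) are already set up:

```bash
# Build ExecuTorch and the text-only Gemma3 runner with the CPU backend
make gemma3-text-cpu

# The Makefile echoes the binary location on success; confirm it exists
ls cmake-out/examples/models/gemma3/gemma3_text_runner
```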
19 changes: 17 additions & 2 deletions examples/models/gemma3/CMakeLists.txt
@@ -37,7 +37,6 @@ find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

set(link_libraries executorch gflags)
set(_srcs e2e_runner.cpp)

list(
APPEND
@@ -109,7 +108,8 @@ endif()
# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(gemma3_e2e_runner ${_srcs})
# Executable for multimodal e2e runner (with image support)
add_executable(gemma3_e2e_runner e2e_runner.cpp)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(gemma3_e2e_runner)
if(NOT APPLE)
@@ -122,3 +122,18 @@ target_include_directories(
)
target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries})
target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options})

# Executable for text-only runner (no image support)
add_executable(gemma3_text_runner text_runner.cpp)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(gemma3_text_runner)
if(NOT APPLE)
target_link_options(gemma3_text_runner PRIVATE "LINKER:-s")
endif()
endif()

target_include_directories(
gemma3_text_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(gemma3_text_runner PUBLIC ${link_libraries})
target_compile_options(gemma3_text_runner PUBLIC ${_common_compile_options})
20 changes: 20 additions & 0 deletions examples/models/gemma3/CMakePresets.json
@@ -36,6 +36,12 @@
"configurePreset": "gemma3-cpu",
"targets": ["gemma3_e2e_runner"]
},
{
"name": "gemma3-text-cpu",
"displayName": "Build Gemma3 text runner (CPU)",
"configurePreset": "gemma3-cpu",
"targets": ["gemma3_text_runner"]
},
{
"name": "gemma3-cuda",
"displayName": "Build Gemma3 runner (CUDA)",
@@ -58,6 +64,20 @@
}
]
},
{
"name": "gemma3-text-cpu",
"displayName": "Configure and build Gemma3 text runner (CPU)",
"steps": [
{
"type": "configure",
"name": "gemma3-cpu"
},
{
"type": "build",
"name": "gemma3-text-cpu"
}
]
},
{
"name": "gemma3-cuda",
"displayName": "Configure and build Gemma3 runner (CUDA)",
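The new workflow preset simply chains the existing `gemma3-cpu` configure preset with the new `gemma3-text-cpu` build preset, so it can also be driven with CMake directly, exactly as the Makefile target does (workflow presets require CMake 3.25+):

```bash
# Equivalent to the second step of the gemma3-text-cpu Makefile target
cd examples/models/gemma3
cmake --workflow --preset gemma3-text-cpu
```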
155 changes: 105 additions & 50 deletions examples/models/gemma3/README.md
@@ -1,43 +1,112 @@
# Summary

This example demonstrates how to export and run Google's [Gemma 3](https://huggingface.co/google/gemma-3-4b-it) vision-language multimodal model locally on ExecuTorch with CUDA backend support.
This example demonstrates how to export and run Google's Gemma 3 models on ExecuTorch:
- [Gemma 3 4B](https://huggingface.co/google/gemma-3-4b-it) - Vision-language multimodal model (CUDA/CPU)
- [Gemma 3 1B](https://huggingface.co/google/gemma-3-1b-it) - Text-only instruction-tuned model (CPU)

# Exporting the model
To export the model, we use [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch), a repo that enables exporting models straight from the source - from HuggingFace's Transformers repo.
# Prerequisites

## Setting up Optimum ExecuTorch
Install through pip package:
```
To export the models, we use [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch), which enables exporting models directly from HuggingFace's Transformers.

Install through pip:
```bash
pip install optimum-executorch
```

Or install from source:
```
```bash
git clone https://github.com/huggingface/optimum-executorch.git
cd optimum-executorch
python install_dev.py
```

## CUDA Support
This guide focuses on CUDA backend support for Gemma3, which provides accelerated performance on NVIDIA GPUs.
## Obtaining the Tokenizer
Both Gemma 3 models share the same tokenizer. Download `tokenizer.json` from HuggingFace:
```bash
mkdir -p gemma-3
curl -L https://huggingface.co/google/gemma-3-1b-it/resolve/main/tokenizer.json -o gemma-3/tokenizer.json
```

---

# Gemma 3 1B Text-Only Model (CPU)

This section covers running the lightweight Gemma 3 1B instruction-tuned model for text-only inference on CPU.

## Exporting Gemma 3 1B

```bash
optimum-cli export executorch \
--model "google/gemma-3-1b-it" \
--task "text-generation" \
--recipe "xnnpack" \
--use_custom_sdpa \
--use_custom_kv_cache \
--output_dir="gemma-3/gemma-3-1b-it"
```

This will generate:
- `model.pte` - The exported model
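A quick way to confirm the export succeeded is to check for the generated file (path per the `--output_dir` above):

```bash
ls -lh gemma-3/gemma-3-1b-it/model.pte
```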

## Building the Text Runner

```bash
make gemma3-text-cpu
```

## Running the Text Model

### Exporting with CUDA
```bash
./cmake-out/examples/models/gemma3/gemma3_text_runner \
--model_path=gemma-3/gemma-3-1b-it/model.pte \
--tokenizer_path=gemma-3/tokenizer.json \
--prompt="What is the capital of France?" \
--max_new_tokens=100
```

### Available Options
| Flag | Description | Default |
|------|-------------|---------|
| `--model_path` | Path to the exported model.pte | `model.pte` |
| `--tokenizer_path` | Path to tokenizer.json | `tokenizer.json` |
| `--prompt` | Text prompt for generation | `Hello, world!` |
| `--temperature` | Sampling temperature (0 = greedy) | `0.0` |
| `--max_new_tokens` | Maximum tokens to generate | `100` |
| `--cpu_threads` | Number of CPU threads (-1 = auto) | `-1` |
| `--warmup` | Run warmup before generation | `false` |
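As an illustration, a greedy-decoding run pinned to four threads might look like this (a sketch combining the flags above; paths assume the export layout from earlier in this guide):

```bash
./cmake-out/examples/models/gemma3/gemma3_text_runner \
  --model_path=gemma-3/gemma-3-1b-it/model.pte \
  --tokenizer_path=gemma-3/tokenizer.json \
  --prompt="Write a haiku about autumn." \
  --temperature=0 \
  --max_new_tokens=64 \
  --cpu_threads=4 \
  --warmup=true
```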

### Example Output
```
The capital of France is **Paris**.
PyTorchObserver {"prompt_tokens":15,"generated_tokens":12,...}
```

---

# Gemma 3 4B Multimodal Model (CUDA)

This section covers running the Gemma 3 4B vision-language multimodal model with CUDA backend support.

## Exporting Gemma 3 4B

### Standard Export
```bash
optimum-cli export executorch \
--model "google/gemma-3-4b-it" \
--task "multimodal-text-to-text" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
--output_dir="path/to/output/dir"
--output_dir="gemma-3/gemma-3-4b-it"
```

This will generate:
- `model.pte` - The exported model
- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime

### Exporting with INT4 Quantization (Tile Packed)
For improved performance and reduced memory footprint, you can export Gemma3 with INT4 weight quantization using tile-packed format:
### Export with INT4 Quantization (Tile Packed)
For improved performance and reduced memory footprint:

```bash
optimum-cli export executorch \
@@ -50,59 +119,45 @@ optimum-cli export executorch \
--qlinear_encoder 4w \
--qlinear_packing_format tile_packed_to_4d \
--qlinear_encoder_packing_format tile_packed_to_4d \
--output_dir="path/to/output/dir"
```

This will generate the same files (`model.pte` and `aoti_cuda_blob.ptd`) in the `int4` directory.

See the "Building the Gemma3 runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.

# Running the model
To run the model, we will use the Gemma3 runner, which utilizes ExecuTorch's MultiModal runner API.
The Gemma3 runner will do the following:

- **Image Input**: Load image files (PNG, JPG, etc.) and format them as input tensors for the model
- **Text Input**: Process text prompts using the tokenizer
- **Feed the formatted inputs** to the multimodal runner for inference

## Obtaining the tokenizer
You can download the `tokenizer.json` file from [Gemma 3's HuggingFace repo](https://huggingface.co/unsloth/gemma-3-1b-it):
```bash
curl -L https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json -o tokenizer.json
--output_dir="gemma-3/gemma-3-4b-it-int4"
```
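As with the standard export, this should produce `model.pte` and `aoti_cuda_blob.ptd`, here under the `gemma-3/gemma-3-4b-it-int4` directory:

```bash
ls gemma-3/gemma-3-4b-it-int4/
# Expected: aoti_cuda_blob.ptd  model.pte
```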

## Building the Gemma3 runner
## Building the Multimodal Runner

### Prerequisites
Ensure you have a CUDA-capable GPU and CUDA toolkit installed on your system.
Ensure you have a CUDA-capable GPU and the CUDA toolkit installed.

### Building for CUDA
### Build Commands
```bash
# Build the Gemma3 runner with CUDA enabled
# Build with CUDA backend
make gemma3-cuda

# Build the Gemma3 runner with CPU enabled
# Build with CPU backend
make gemma3-cpu
```

## Running the model
You need to provide the following files to run Gemma3:
- `model.pte` - The exported model file
- `aoti_cuda_blob.ptd` - The CUDA kernel blob
- `tokenizer.json` - The tokenizer file
- An image file (PNG, JPG, etc.)
## Running the Multimodal Model

The multimodal runner processes both image and text inputs:

### Example usage
```bash
./cmake-out/examples/models/gemma3/gemma3_e2e_runner \
--model_path path/to/model.pte \
--data_path path/to/aoti_cuda_blob.ptd \
--tokenizer_path path/to/tokenizer.json \
--image_path docs/source/_static/img/et-logo.png \ # here we use the ExecuTorch logo as an example
--temperature 0
--model_path=gemma-3/gemma-3-4b-it/model.pte \
--data_path=gemma-3/gemma-3-4b-it/aoti_cuda_blob.ptd \
--tokenizer_path=gemma-3/tokenizer.json \
--image_path=docs/source/_static/img/et-logo.png \
--temperature=0
```
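The same invocation works with the INT4 export; a sketch pointing the runner at the quantized artifacts produced above:

```bash
./cmake-out/examples/models/gemma3/gemma3_e2e_runner \
  --model_path=gemma-3/gemma-3-4b-it-int4/model.pte \
  --data_path=gemma-3/gemma-3-4b-it-int4/aoti_cuda_blob.ptd \
  --tokenizer_path=gemma-3/tokenizer.json \
  --image_path=docs/source/_static/img/et-logo.png \
  --temperature=0
```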

# Example output
### Required Files
| File | Description |
|------|-------------|
| `model.pte` | The exported model file |
| `aoti_cuda_blob.ptd` | CUDA kernel blob (CUDA only) |
| `tokenizer.json` | Shared tokenizer |
| Image file | PNG, JPG, or other supported format |

### Example Output
```
Okay, let's break down what's in the image!

@@ -111,5 +166,5 @@ It appears to be a stylized graphic combining:
* **A Microchip:** The core shape is a representation of a microchip (the integrated circuit).
* **An "On" Symbol:** There's an "On" symbol (often represented as a circle with a vertical line) incorporated into the microchip design.
* **Color Scheme:** The microchip is colored in gray, and
PyTorchObserver {"prompt_tokens":271,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1761118126790,"inference_end_ms":1761118128385,"prompt_eval_end_ms":1761118127175,"first_token_ms":1761118127175,"aggregate_sampling_time_ms":86,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
PyTorchObserver {"prompt_tokens":271,"generated_tokens":99,...}
```