10 changes: 8 additions & 2 deletions .github/workflows/test_models.yml
@@ -52,11 +52,17 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies for ExecuTorch
run: |
# Clean up cache to save space
pip cache purge || true
rm -rf ~/.cache/huggingface/hub/* || true

if [ "${{ matrix.executorch-version }}" == "nightly" ]; then
python install_dev.py
else
pip install '.[dev]'
pip install executorch==${{ matrix.executorch-version }}
# Use CPU-only torch to avoid CUDA dependencies (saves ~5GB)
pip install --no-cache-dir '.[dev]' \
--extra-index-url https://download.pytorch.org/whl/cpu
pip install --no-cache-dir executorch==${{ matrix.executorch-version }}
fi
pip list
- name: Run tests
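The `--extra-index-url https://download.pytorch.org/whl/cpu` switch is what keeps CUDA wheels off the runner. A quick, non-authoritative way to confirm the CPU-only build landed (assuming a Python step is available in the job) is:

```python
# Sanity-check sketch: CPU-only torch wheels report no CUDA build.
import torch

print(torch.__version__)          # typically ends in "+cpu" for wheels from the cpu index
print(torch.version.cuda)         # None for CPU-only builds
print(torch.cuda.is_available())  # False on the CPU-only CI runner
```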
13 changes: 7 additions & 6 deletions install_dev.py
@@ -5,7 +5,7 @@

def install_torch_nightly_deps():
"""Install torch related dependencies from pinned nightly"""
EXECUTORCH_NIGHTLY_VERSION = "dev20251003"
EXECUTORCH_NIGHTLY_VERSION = "dev20251104"
TORCHAO_NIGHTLY_VERSION = "dev20251104"
# Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/torch_pin.py#L2
TORCH_NIGHTLY_VERSION = "dev20251104"
@@ -15,6 +15,7 @@ def install_torch_nightly_deps():
"-m",
"pip",
"install",
"--no-cache-dir", # Prevent cached CUDA packages
f"executorch==1.1.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.10.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.25.0.{TORCH_NIGHTLY_VERSION}",
@@ -34,7 +35,7 @@ def install_dep_from_source():
"-m",
"pip",
"install",
"git+https://github.com/huggingface/transformers@91393fe4cc3266a05bc0d129e34ff5f761bb46e2#egg=transformers", # 4.56.1
"git+https://github.com/huggingface/transformers@cbc6716945cff1d8e124d344ba0150e6e27f8b6e#egg=transformers", # v5.0.0rc0
]
)
subprocess.check_call(
@@ -58,13 +59,13 @@ def main():
)
args = parser.parse_args()

# Install package with dev extras
subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])

# Install nightly dependencies
# Install nightly torch dependencies FIRST to avoid pulling CUDA versions
if not args.skip_override_torch:
install_torch_nightly_deps()

# Install package with dev extras
subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])

# Install source dependencies
install_dep_from_source()

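The reordering is the point here: the pinned nightly torch stack goes in before `.[dev]`, so the extras resolve against it instead of pulling CUDA builds. A rough post-install check along these lines can confirm the pins survived; the exact package set below is an assumption:

```python
# Sketch of a post-install verification, assuming the packages pinned above.
import importlib.metadata as md

for pkg in ("torch", "torchvision", "torchao", "executorch", "transformers"):
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")
```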
5 changes: 3 additions & 2 deletions optimum/commands/export/executorch.py
@@ -17,7 +17,8 @@
from pathlib import Path
from typing import TYPE_CHECKING

from ...exporters import TasksManager
from transformers.pipelines import get_supported_tasks

from ..base import BaseOptimumCLICommand, CommandInfo


@@ -46,7 +47,7 @@ def parse_args_executorch(parser):
default="text-generation",
help=(
"The task to export the model for. Available tasks depend on the model, but are among:"
f" {str(TasksManager.get_all_tasks())}."
f" {str(get_supported_tasks())}."
),
)
required_group.add_argument(
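`get_supported_tasks()` comes from Transformers' pipeline registry, so the help text now tracks whatever tasks that registry exposes; a small illustrative check (output varies by Transformers version):

```python
from transformers.pipelines import get_supported_tasks

# Prints the task names the installed Transformers version supports,
# e.g. "text-generation", "image-classification", ...
print(get_supported_tasks())
```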
4 changes: 2 additions & 2 deletions optimum/commands/register/register_export.py
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from ..export import ExportCommand
from ..export.executorch import ExecuTorchExportCommand
from optimum.commands.export.base import ExportCommand
from optimum.commands.export.executorch import ExecuTorchExportCommand


REGISTER_COMMANDS = [(ExecuTorchExportCommand, ExportCommand)]
4 changes: 2 additions & 2 deletions optimum/executorch/attentions/custom_kv_cache.py
@@ -45,8 +45,8 @@ def __init__(
device=device,
dtype=dtype,
)
num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
num_heads = getattr(config, "num_key_value_heads", None) or config.num_attention_heads
self.early_initialization(
batch_size=max_batch_size, num_heads=num_heads, head_dim=head_dim, dtype=dtype, device=device
)
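The swap to `getattr(..., None) or fallback` matters because some configs carry these attributes explicitly set to `None`, which the plain `getattr` default never catches. A small illustration with hypothetical config values:

```python
# Illustrative only; Cfg stands in for a model config with head_dim present but unset.
class Cfg:
    hidden_size = 2048
    num_attention_heads = 16
    head_dim = None
    num_key_value_heads = None

cfg = Cfg()

# Old lookup: the default is ignored because the attribute exists, so None leaks through.
print(getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads))  # None

# New lookup: `or` falls back whenever the attribute is missing *or* None.
print(getattr(cfg, "head_dim", None) or cfg.hidden_size // cfg.num_attention_heads)  # 128
print(getattr(cfg, "num_key_value_heads", None) or cfg.num_attention_heads)          # 16
```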
58 changes: 54 additions & 4 deletions optimum/executorch/attentions/custom_sdpa.py
@@ -18,12 +18,59 @@
from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa # noqa


def sdpa_mask_passthrough(
batch_size: int,
cache_position: torch.Tensor,
kv_length: int,
kv_offset: int = 0,
mask_function: Optional[Callable] = None,
attention_mask: Optional[torch.Tensor] = None,
local_size: Optional[int] = None,
allow_is_causal_skip: bool = True,
allow_torch_fix: bool = True,
**kwargs,
) -> Optional[torch.Tensor]:
"""
Pass-through for attention mask creation, since the mask is never used:
- For regular attention, the custom sdpa op in causal mode builds its own attention mask
- For sliding window attention, the mask produced by the attention-mask API is discarded and re-created inside the attention call, which needs access to cache internals

Additionally, sliding window attention mask creation in Transformers has had vmap-related export issues.

Args:
batch_size (`int`):
The batch size of the input sequence.
cache_position (`torch.Tensor`):
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
kv_length (`int`):
The size that the key and value states will have during the attention computation.
kv_offset (`int`, optional):
An optional offset indicating the first position the key and value states refer to.
mask_function (`Callable`):
The mask factory function describing the mask pattern.
attention_mask (`torch.Tensor`, optional):
The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
local_size (`int`, optional):
The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
to try to skip mask creation if possible.
allow_is_causal_skip (`bool`, optional):
Whether to allow returning `None` for the mask when the `is_causal` argument of
`torch.sdpa` can be used instead. Defaults to `True`.
allow_torch_fix (`bool`, optional):
Whether to update the mask when a query attends to no tokens, working around a bug in older torch
versions. An argument is needed to skip this when using eager attention. Defaults to `True`.

"""
return None
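A minimal usage sketch (the import path follows this file's location; the key point is simply that no mask tensor is ever materialized):

```python
import torch

from optimum.executorch.attentions.custom_sdpa import sdpa_mask_passthrough

# Whatever the mask API passes in, the passthrough yields no mask; the custom sdpa op
# handles causality itself, and sliding-window masks are rebuilt inside the attention call.
mask = sdpa_mask_passthrough(batch_size=1, cache_position=torch.arange(4), kv_length=16)
assert mask is None
```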


def custom_sdpa_with_start_pos_forward(
module: torch.nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"], # noqa
position_ids: Optional[torch.Tensor] = None,
scaling: Optional[float] = None,
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
@@ -56,10 +103,10 @@ def custom_sdpa_with_start_pos_forward(
# Calculate the input pos from attention mask.
# Branch out for float vs bool mask
# assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
first_row_mask = attention_mask[0, :]
# [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
start_pos = torch.argmin(first_row_mask.to(torch.long)).item() - 1
assert (
position_ids is not None
), "position_ids must be provided to find start position for causal attention"
start_pos = position_ids[0][0].item()
else:
start_pos = 0

@@ -95,6 +142,7 @@ def _custom_sdpa_for_ring_kv_cache(
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"], # noqa
position_ids: Optional[torch.Tensor] = None,
scaling: Optional[float] = None,
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
@@ -122,6 +170,7 @@ def _custom_sdpa_for_ring_kv_cache(
key,
value,
attention_mask,
position_ids,
scaling,
softcap,
head_mask,
@@ -134,6 +183,7 @@ def _custom_sdpa_for_ring_kv_cache(
key,
value,
attention_mask,
position_ids,
scaling,
softcap,
head_mask,
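The other change in this file threads `position_ids` through so the start position is read directly instead of being recovered from the first attention-mask row; an illustrative comparison with hypothetical decode-time values:

```python
import torch

# Removed approach: infer start_pos from the mask row, e.g. [0, 0, 0, 0, -inf, ...] -> 3.
# New approach: read it straight from position_ids, which callers must now provide.
position_ids = torch.tensor([[3, 4, 5]])  # hypothetical positions for a 3-token chunk
start_pos = position_ids[0][0].item()
print(start_pos)  # 3
```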
21 changes: 8 additions & 13 deletions optimum/executorch/modeling.py
@@ -34,9 +34,9 @@
AutoModelForSeq2SeqLM,
AutoModelForSpeechSeq2Seq,
PreTrainedTokenizer,
add_start_docstrings,
)
from transformers.configuration_utils import PretrainedConfig
from transformers.pipelines import get_task
from transformers.processing_utils import ProcessorMixin
from transformers.utils import is_offline_mode

@@ -46,13 +46,11 @@
)
from executorch.kernels import quantized # noqa

from ..exporters import TasksManager
from ..exporters.executorch import main_export
from ..exporters.executorch.utils import (
process_conversation_inputs,
verify_eos_tokens_in_pretrained_tokenizer,
)
from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
from ..utils.file_utils import find_files_matching_pattern
from .stats import Stats

@@ -63,7 +61,7 @@
logger = logging.getLogger(__name__)


class ExecuTorchModelBase(OptimizedModel, ABC):
class ExecuTorchModelBase(ABC):
"""
ExecuTorch model for inference using the ExecuTorch Runtime.

@@ -99,8 +97,6 @@ def __init__(
models: Dict[str, "ExecuTorchModule"],
config: "PretrainedConfig",
):
super().__init__(model=None, config=config)

if self.__class__.auto_model_class is None:
raise ValueError(
f"Class {self.__class__.__name__} must set auto_model_class. "
@@ -268,6 +264,7 @@ def _export(
cls,
model_id: str,
recipe: str,
task: Optional[str] = None,
config: Optional[PretrainedConfig] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -278,9 +275,8 @@
local_files_only: bool = False,
**kwargs,
) -> Dict[str, "ExecuTorchModule"]:
task = kwargs.pop("task", None)
inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) if not task else task
logging.info(f"Inferred task from model class: {inferred_task}")
inferred_task = get_task(model_id) if not task else task
logging.info(f"Using task: {inferred_task}")

save_dir = TemporaryDirectory(prefix="executorch_export_")
save_dir_path = Path(save_dir.name)
@@ -316,7 +312,6 @@ def _save_pretrained(self, save_directory):
raise NotImplementedError

@classmethod
@add_start_docstrings(FROM_PRETRAINED_START_DOCSTRING)
def from_pretrained(
cls,
model_id: Union[str, Path],
@@ -1322,9 +1317,9 @@ def generate(

def text_generation(
self,
processor: ProcessorMixin,
tokenizer: "PreTrainedTokenizer",
input_conversation: List[Dict],
processor: Optional[ProcessorMixin] = None,
tokenizer: Optional["PreTrainedTokenizer"] = None,
echo: bool = True,
max_seq_len: Optional[int] = None,
):
@@ -1362,9 +1357,9 @@ def text_generation(
self.stats.on_inference_start()

inputs = process_conversation_inputs(
input_conversation,
processor,
tokenizer,
input_conversation,
)

self.stats.on_token_encode_end()
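With the reordered `text_generation` signature, `processor` and `tokenizer` become optional keyword arguments after `input_conversation`. A hypothetical call site (the `model` and `tokenizer` names are placeholders, not defined in this diff):

```python
# Hypothetical usage sketch; `model` is an exported ExecuTorch causal-LM wrapper and
# `tokenizer` a matching PreTrainedTokenizer. Pass processor/tokenizer by keyword.
output = model.text_generation(
    input_conversation=[{"role": "user", "content": "Hello!"}],
    tokenizer=tokenizer,
    max_seq_len=128,
)
print(output)
```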
5 changes: 2 additions & 3 deletions optimum/exporters/executorch/convert.py
@@ -19,8 +19,7 @@
from pathlib import Path
from typing import Union

from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface

from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
@@ -29,7 +28,7 @@


AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)
AttentionMaskInterface.register("custom_sdpa", ALL_MASK_ATTENTION_FUNCTIONS["sdpa"])


def export_to_executorch(
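These two `register` calls make the implementation selectable by name. In Transformers, a registered attention interface can then be requested when loading a model, roughly like the hedged sketch below (the model id is a placeholder, and the registrations above must already have run, e.g. by importing this module):

```python
# Hedged sketch: after the registrations above, "custom_sdpa" becomes a valid
# attn_implementation name for models loaded through Transformers.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-LlamaForCausalLM",  # placeholder model id
    attn_implementation="custom_sdpa",
)
```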