10 changes: 8 additions & 2 deletions .github/workflows/test_models.yml
@@ -52,11 +52,17 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies for ExecuTorch
run: |
# Clean up cache to save space
pip cache purge || true
rm -rf ~/.cache/huggingface/hub/* || true

if [ "${{ matrix.executorch-version }}" == "nightly" ]; then
python install_dev.py
else
pip install '.[dev]'
pip install executorch==${{ matrix.executorch-version }}
# Use CPU-only torch to avoid CUDA dependencies (saves ~5GB)
pip install --no-cache-dir '.[dev]' \
--extra-index-url https://download.pytorch.org/whl/cpu
pip install --no-cache-dir executorch==${{ matrix.executorch-version }}
fi
pip list
- name: Run tests
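The `--extra-index-url https://download.pytorch.org/whl/cpu` switch is what keeps CUDA wheels off the runner. A quick, non-authoritative way to confirm the CPU-only build landed (assuming a Python step is available in the job) is:

```python
# Sanity-check sketch: CPU-only torch wheels report no CUDA build.
import torch

print(torch.__version__)          # typically ends in "+cpu" for wheels from the cpu index
print(torch.version.cuda)         # None for CPU-only builds
print(torch.cuda.is_available())  # False on the CPU-only CI runner
```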
13 changes: 7 additions & 6 deletions install_dev.py
@@ -5,7 +5,7 @@

def install_torch_nightly_deps():
"""Install torch related dependencies from pinned nightly"""
EXECUTORCH_NIGHTLY_VERSION = "dev20251003"
EXECUTORCH_NIGHTLY_VERSION = "dev20251104"
TORCHAO_NIGHTLY_VERSION = "dev20251104"
# Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/torch_pin.py#L2
TORCH_NIGHTLY_VERSION = "dev20251104"
@@ -15,6 +15,7 @@ def install_torch_nightly_deps():
"-m",
"pip",
"install",
"--no-cache-dir", # Prevent cached CUDA packages
f"executorch==1.1.0.{EXECUTORCH_NIGHTLY_VERSION}",
f"torch==2.10.0.{TORCH_NIGHTLY_VERSION}",
f"torchvision==0.25.0.{TORCH_NIGHTLY_VERSION}",
@@ -34,7 +35,7 @@ def install_dep_from_source():
"-m",
"pip",
"install",
"git+https://github.com/huggingface/transformers@91393fe4cc3266a05bc0d129e34ff5f761bb46e2#egg=transformers", # 4.56.1
"git+https://github.com/huggingface/transformers@cbc6716945cff1d8e124d344ba0150e6e27f8b6e#egg=transformers", # v5.0.0rc0
]
)
subprocess.check_call(
@@ -58,13 +59,13 @@ def main():
)
args = parser.parse_args()

# Install package with dev extras
subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])

# Install nightly dependencies
# Install nightly torch dependencies FIRST to avoid pulling CUDA versions
if not args.skip_override_torch:
install_torch_nightly_deps()

# Install package with dev extras
subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])

# Install source dependencies
install_dep_from_source()

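The reordering is the point here: the pinned nightly torch stack goes in before `.[dev]`, so the extras resolve against it instead of pulling CUDA builds. A rough post-install check along these lines can confirm the pins survived; the exact package set below is an assumption:

```python
# Sketch of a post-install verification, assuming the packages pinned above.
import importlib.metadata as md

for pkg in ("torch", "torchvision", "torchao", "executorch", "transformers"):
    try:
        print(f"{pkg}=={md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")
```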
5 changes: 3 additions & 2 deletions optimum/commands/export/executorch.py
@@ -17,7 +17,8 @@
from pathlib import Path
from typing import TYPE_CHECKING

from ...exporters import TasksManager
from transformers.pipelines import get_supported_tasks

from ..base import BaseOptimumCLICommand, CommandInfo


@@ -46,7 +47,7 @@ def parse_args_executorch(parser):
default="text-generation",
help=(
"The task to export the model for. Available tasks depend on the model, but are among:"
f" {str(TasksManager.get_all_tasks())}."
f" {str(get_supported_tasks())}."
),
)
required_group.add_argument(
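`get_supported_tasks()` comes from Transformers' pipeline registry, so the help text now tracks whatever tasks that registry exposes; a small illustrative check (output varies by Transformers version):

```python
from transformers.pipelines import get_supported_tasks

# Prints the task names the installed Transformers version supports,
# e.g. "text-generation", "image-classification", ...
print(get_supported_tasks())
```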
4 changes: 2 additions & 2 deletions optimum/commands/register/register_export.py
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from ..export import ExportCommand
from ..export.executorch import ExecuTorchExportCommand
from optimum.commands.export.base import ExportCommand
from optimum.commands.export.executorch import ExecuTorchExportCommand


REGISTER_COMMANDS = [(ExecuTorchExportCommand, ExportCommand)]
4 changes: 2 additions & 2 deletions optimum/executorch/attentions/custom_kv_cache.py
@@ -45,8 +45,8 @@ def __init__(
device=device,
dtype=dtype,
)
num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
num_heads = getattr(config, "num_key_value_heads", None) or config.num_attention_heads
self.early_initialization(
batch_size=max_batch_size, num_heads=num_heads, head_dim=head_dim, dtype=dtype, device=device
)
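The swap to `getattr(..., None) or fallback` matters because some configs carry these attributes explicitly set to `None`, which the plain `getattr` default never catches. A small illustration with hypothetical config values:

```python
# Illustrative only; Cfg stands in for a model config with head_dim present but unset.
class Cfg:
    hidden_size = 2048
    num_attention_heads = 16
    head_dim = None
    num_key_value_heads = None

cfg = Cfg()

# Old lookup: the default is ignored because the attribute exists, so None leaks through.
print(getattr(cfg, "head_dim", cfg.hidden_size // cfg.num_attention_heads))  # None

# New lookup: `or` falls back whenever the attribute is missing *or* None.
print(getattr(cfg, "head_dim", None) or cfg.hidden_size // cfg.num_attention_heads)  # 128
print(getattr(cfg, "num_key_value_heads", None) or cfg.num_attention_heads)          # 16
```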
58 changes: 54 additions & 4 deletions optimum/executorch/attentions/custom_sdpa.py
@@ -18,12 +18,59 @@
from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa # noqa


def sdpa_mask_passthrough(
batch_size: int,
cache_position: torch.Tensor,
kv_length: int,
kv_offset: int = 0,
mask_function: Optional[Callable] = None,
attention_mask: Optional[torch.Tensor] = None,
local_size: Optional[int] = None,
allow_is_causal_skip: bool = True,
allow_torch_fix: bool = True,
**kwargs,
) -> Optional[torch.Tensor]:
"""
Pass-through for attention mask creation, since the mask is never used:
- For regular attention, the custom sdpa op in causal mode builds its own attention mask
- For sliding window attention, the mask produced by the attention-mask API is discarded and re-created inside the attention call, which needs access to cache internals

Additionally, sliding window attention mask creation in Transformers has had vmap-related export issues.

Args:
batch_size (`int`):
The batch size of the input sequence.
cache_position (`torch.Tensor`):
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
kv_length (`int`):
The size that the key and value states will have during the attention computation.
kv_offset (`int`, optional):
An optional offset indicating the first position the key and value states refer to.
mask_function (`Callable`):
The mask factory function describing the mask pattern.
attention_mask (`torch.Tensor`, optional):
The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
local_size (`int`, optional):
The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
to try to skip mask creation if possible.
allow_is_causal_skip (`bool`, optional):
Whether to allow returning `None` for the mask when the `is_causal` argument of
`torch.sdpa` can be used instead. Defaults to `True`.
allow_torch_fix (`bool`, optional):
Whether to update the mask when a query attends to no tokens, working around a bug in older torch
versions. An argument is needed to skip this when using eager attention. Defaults to `True`.

"""
return None
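A minimal usage sketch (the import path follows this file's location; the key point is simply that no mask tensor is ever materialized):

```python
import torch

from optimum.executorch.attentions.custom_sdpa import sdpa_mask_passthrough

# Whatever the mask API passes in, the passthrough yields no mask; the custom sdpa op
# handles causality itself, and sliding-window masks are rebuilt inside the attention call.
mask = sdpa_mask_passthrough(batch_size=1, cache_position=torch.arange(4), kv_length=16)
assert mask is None
```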


def custom_sdpa_with_start_pos_forward(
module: torch.nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"], # noqa
position_ids: Optional[torch.Tensor] = None,
scaling: Optional[float] = None,
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
@@ -56,10 +103,10 @@ def custom_sdpa_with_start_pos_forward(
# Calculate the input pos from attention mask.
# Branch out for float vs bool mask
# assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
first_row_mask = attention_mask[0, :]
# [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
start_pos = torch.argmin(first_row_mask.to(torch.long)).item() - 1
assert (
position_ids is not None
), "position_ids must be provided to find start position for causal attention"
start_pos = position_ids[0][0].item()
else:
start_pos = 0

@@ -95,6 +142,7 @@ def _custom_sdpa_for_ring_kv_cache(
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Union[torch.Tensor, "BlockMask"], # noqa
position_ids: Optional[torch.Tensor] = None,
scaling: Optional[float] = None,
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
@@ -122,6 +170,7 @@ def _custom_sdpa_for_ring_kv_cache(
key,
value,
attention_mask,
position_ids,
scaling,
softcap,
head_mask,
@@ -134,6 +183,7 @@ def _custom_sdpa_for_ring_kv_cache(
key,
value,
attention_mask,
position_ids,
scaling,
softcap,
head_mask,
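The other change in this file threads `position_ids` through so the start position is read directly instead of being recovered from the first attention-mask row; an illustrative comparison with hypothetical decode-time values:

```python
import torch

# Removed approach: infer start_pos from the mask row, e.g. [0, 0, 0, 0, -inf, ...] -> 3.
# New approach: read it straight from position_ids, which callers must now provide.
position_ids = torch.tensor([[3, 4, 5]])  # hypothetical positions for a 3-token chunk
start_pos = position_ids[0][0].item()
print(start_pos)  # 3
```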
21 changes: 8 additions & 13 deletions optimum/executorch/modeling.py
@@ -34,9 +34,9 @@
AutoModelForSeq2SeqLM,
AutoModelForSpeechSeq2Seq,
PreTrainedTokenizer,
add_start_docstrings,
)
from transformers.configuration_utils import PretrainedConfig
from transformers.pipelines import get_task
from transformers.processing_utils import ProcessorMixin
from transformers.utils import is_offline_mode

@@ -46,13 +46,11 @@
)
from executorch.kernels import quantized # noqa

from ..exporters import TasksManager
from ..exporters.executorch import main_export
from ..exporters.executorch.utils import (
process_conversation_inputs,
verify_eos_tokens_in_pretrained_tokenizer,
)
from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
from ..utils.file_utils import find_files_matching_pattern
from .stats import Stats

@@ -63,7 +61,7 @@
logger = logging.getLogger(__name__)


class ExecuTorchModelBase(OptimizedModel, ABC):
class ExecuTorchModelBase(ABC):
"""
ExecuTorch model for inference using the ExecuTorch Runtime.

@@ -99,8 +97,6 @@ def __init__(
models: Dict[str, "ExecuTorchModule"],
config: "PretrainedConfig",
):
super().__init__(model=None, config=config)

if self.__class__.auto_model_class is None:
raise ValueError(
f"Class {self.__class__.__name__} must set auto_model_class. "
@@ -268,6 +264,7 @@ def _export(
cls,
model_id: str,
recipe: str,
task: Optional[str] = None,
config: Optional[PretrainedConfig] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
@@ -278,9 +275,8 @@
local_files_only: bool = False,
**kwargs,
) -> Dict[str, "ExecuTorchModule"]:
task = kwargs.pop("task", None)
inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) if not task else task
logging.info(f"Inferred task from model class: {inferred_task}")
inferred_task = get_task(model_id) if not task else task
logging.info(f"Using task: {inferred_task}")

save_dir = TemporaryDirectory(prefix="executorch_export_")
save_dir_path = Path(save_dir.name)
@@ -316,7 +312,6 @@ def _save_pretrained(self, save_directory):
raise NotImplementedError

@classmethod
@add_start_docstrings(FROM_PRETRAINED_START_DOCSTRING)
def from_pretrained(
cls,
model_id: Union[str, Path],
@@ -1322,9 +1317,9 @@ def generate(

def text_generation(
self,
processor: ProcessorMixin,
tokenizer: "PreTrainedTokenizer",
input_conversation: List[Dict],
processor: Optional[ProcessorMixin] = None,
tokenizer: Optional["PreTrainedTokenizer"] = None,
echo: bool = True,
max_seq_len: Optional[int] = None,
):
@@ -1362,9 +1357,9 @@ def text_generation(
self.stats.on_inference_start()

inputs = process_conversation_inputs(
input_conversation,
processor,
tokenizer,
input_conversation,
)

self.stats.on_token_encode_end()
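With the reordered `text_generation` signature, `processor` and `tokenizer` become optional keyword arguments after `input_conversation`. A hypothetical call site (the `model` and `tokenizer` names are placeholders, not defined in this diff):

```python
# Hypothetical usage sketch; `model` is an exported ExecuTorch causal-LM wrapper and
# `tokenizer` a matching PreTrainedTokenizer. Pass processor/tokenizer by keyword.
output = model.text_generation(
    input_conversation=[{"role": "user", "content": "Hello!"}],
    tokenizer=tokenizer,
    max_seq_len=128,
)
print(output)
```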
5 changes: 2 additions & 3 deletions optimum/exporters/executorch/convert.py
@@ -19,8 +19,7 @@
from pathlib import Path
from typing import Union

from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface

from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
@@ -29,7 +28,7 @@


AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)
AttentionMaskInterface.register("custom_sdpa", ALL_MASK_ATTENTION_FUNCTIONS["sdpa"])


def export_to_executorch(
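These two `register` calls make the implementation selectable by name. In Transformers, a registered attention interface can then be requested when loading a model, roughly like the hedged sketch below (the model id is a placeholder, and the registrations above must already have run, e.g. by importing this module):

```python
# Hedged sketch: after the registrations above, "custom_sdpa" becomes a valid
# attn_implementation name for models loaded through Transformers.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-LlamaForCausalLM",  # placeholder model id
    attn_implementation="custom_sdpa",
)
```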