diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b3..7fd8866cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,15 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() + # Set version for mtmd (required by upstream CMakeLists.txt) + # NOTE: This is a workaround for mtmd build requirements. + # Version is set to 0.0.0 for local builds. If upstream adds version + # compatibility checks, this may need to match llama.cpp version. + if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER 0) + endif() + set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..0d56603e2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.4.0" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..18d8bc66d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -91,9 +91,9 @@ def __init__( logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, - flash_attn: bool = False, op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, + flash_attn: Optional[bool] = None, # Sampling Params no_perf: bool = False, last_n_tokens_size: int = 64, @@ -173,7 +173,7 @@ def __init__( logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. - flash_attn: Use flash attention. + flash_attn: Use flash attention. None = auto, True = enabled, False = disabled. op_offload: offload host tensor operations to device swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) no_perf: Measure performance timings. 
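Note: the `flash_attn` constructor argument is now tri-state rather than a plain bool, and the hunk below maps it onto the new `llama_flash_attn_type` enum. A minimal usage sketch (the model path is a placeholder):

    from llama_cpp import Llama

    # flash_attn=None lets llama.cpp decide automatically; True/False force it on/off.
    llm = Llama(model_path="./models/model.gguf", flash_attn=None)   # auto (new default)
    # llm = Llama(model_path="./models/model.gguf", flash_attn=True)   # force enabled
    # llm = Llama(model_path="./models/model.gguf", flash_attn=False)  # force disabled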
@@ -341,7 +341,16 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + if flash_attn is None: + self.context_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO + elif flash_attn: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ) + else: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -934,7 +943,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[:sample_idx], + self._scores[sample_idx - self.n_tokens, :], ): return tokens_or_none = yield token @@ -1041,7 +1051,9 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self._ctx.decode(self._batch) self._batch.reset() @@ -1112,7 +1124,9 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self.reset() if return_count: @@ -1157,9 +1171,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = self._model.token_prefix() + middle_token_id: int = self._model.token_middle() + suffix_token_id: int = self._model.token_suffix() add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1315,7 +1329,7 @@ def logit_bias_processor( if seed is not None: self.set_seed(seed) else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) + self.set_seed(random.Random(self._seed).randint(0, 2**32)) finish_reason = "length" multibyte_fix = 0 @@ -2056,7 +2070,10 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + return ( + ChatCompletionChunk(**chunk) + for chunk in self.create_chat_completion(*args, **kwargs) + ) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2096,7 +2113,7 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=self.context_params.flash_attn_type, op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params @@ -2316,19 +2333,23 @@ def from_pretrained( ) if additional_files: - for additonal_file_name in additional_files: + for additional_file_name in additional_files: # find the additional shard file: - 
matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file + for file in file_list + if fnmatch.fnmatch(file, additional_file_name) + ] if len(matching_additional_files) == 0: raise ValueError( - f"No file found in {repo_id} that match {additonal_file_name}\n\n" + f"No file found in {repo_id} that match {additional_file_name}\n\n" f"Available Files:\n{json.dumps(file_list)}" ) if len(matching_additional_files) > 1: raise ValueError( - f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n" + f"Multiple files found in {repo_id} matching {additional_file_name}\n\n" f"Available Files:\n{json.dumps(files)}" ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..4e8719e07 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -117,6 +117,14 @@ # typedef bool (*ggml_abort_callback)(void * data); ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) +# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +ggml_log_callback = ctypes.CFUNCTYPE( + None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p +) + +# typedef struct ggml_threadpool * ggml_threadpool_t; +ggml_threadpool_t = ctypes.c_void_p + # llama.h bindings _lib.llama_max_devices.argtypes = [] @@ -177,6 +185,13 @@ # typedef int32_t llama_seq_id; llama_seq_id = ctypes.c_int32 +# typedef uint32_t llama_state_seq_flags; +llama_state_seq_flags = ctypes.c_uint32 + +# State sequence flags +LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 +LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 2 + # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab @@ -294,6 +309,7 @@ LLAMA_ROPE_TYPE_NORM = 0 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 +LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 @@ -462,6 +478,14 @@ LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +# enum llama_flash_attn_type { +# LLAMA_FLASH_ATTN_TYPE_AUTO = -1, +# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0, +# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1, +# }; +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU @@ -472,6 +496,14 @@ LLAMA_SPLIT_MODE_LAYER = 1 LLAMA_SPLIT_MODE_ROW = 2 +# enum llama_params_fit_status { +# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, +# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, +# }; +LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 +LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 +LLAMA_PARAMS_FIT_STATUS_ERROR = 2 # typedef struct llama_token_data { # llama_token id; // token id @@ -613,6 +645,22 @@ class llama_batch(ctypes.Structure): LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 LLAMA_KV_OVERRIDE_TYPE_STR = 3 +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# ... 
+# }; +LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 +LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 +LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { # enum llama_model_kv_override_type tag; @@ -745,6 +793,8 @@ class llama_model_params(ctypes.Structure): ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] @@ -875,6 +925,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -1146,11 +1197,26 @@ def llama_numa_init(numa: int, /): # struct llama_context * ctx, # ggml_threadpool_t threadpool, # ggml_threadpool_t threadpool_batch); -# TODO: Add llama_attach_threadpool +@ctypes_function( + "llama_attach_threadpool", + [llama_context_p_ctypes, ggml_threadpool_t, ggml_threadpool_t], + None, +) +def llama_attach_threadpool( + ctx: llama_context_p, + threadpool: ctypes.c_void_p, + threadpool_batch: ctypes.c_void_p, + /, +): + """Attach threadpools to context""" + ... # LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); -# TODO: Add llama_detach_threadpool +@ctypes_function("llama_detach_threadpool", [llama_context_p_ctypes], None) +def llama_detach_threadpool(ctx: llama_context_p, /): + """Detach threadpool from context""" + ... # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1331,12 +1397,75 @@ def llama_supports_rpc() -> bool: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + """Get maximum number of tensor buffer type overrides""" + ... 
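Note: callers of the low-level ctypes API now configure flash attention through the `flash_attn_type` field added to `llama_context_params` above, using the enum constants introduced in this diff. A minimal sketch:

    import llama_cpp

    cparams = llama_cpp.llama_context_default_params()
    # Set the tri-state enum instead of the former flash_attn bool.
    cparams.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO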
+ + +# LLAMA_API enum llama_params_fit_status llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, +# struct llama_model_tensor_buft_override * tensor_buft_overrides, +# size_t margin, +# uint32_t n_ctx_min, +# enum ggml_log_level log_level); +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + ctypes.POINTER(llama_model_params), + ctypes.POINTER(llama_context_params), + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, # tensor_buft_overrides - not fully bound + ctypes.c_size_t, # margin + ctypes.c_uint32, # n_ctx_min + ctypes.c_int, # ggml_log_level (enum) + ], + ctypes.c_int, +) +def llama_params_fit( + path_model: bytes, + mparams: CtypesPointerOrRef[llama_model_params], + cparams: CtypesPointerOrRef[llama_context_params], + tensor_split: CtypesArray[ctypes.c_float], + tensor_buft_overrides: Optional[ctypes.c_void_p], + margin: Union[ctypes.c_size_t, int], + n_ctx_min: Union[ctypes.c_uint32, int], + log_level: int, + /, +) -> int: + """Check if model parameters will fit in memory + + Args: + margin: Memory margin to leave per device in bytes + n_ctx_min: Minimum context size when trying to reduce memory + log_level: Minimum log level (ggml_log_level enum) + + Returns: + LLAMA_PARAMS_FIT_STATUS_SUCCESS (0) - found allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_FAILURE (1) - could not find allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_ERROR (2) - a hard error occurred + """ + ... + + # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_ctx(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_ctx_seq(const struct llama_context * ctx); +@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_ctx_seq(ctx: llama_context_p, /) -> int: + """Get the context sequence size""" + ... + + # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_batch(ctx: llama_context_p, /) -> int: @@ -1405,16 +1534,6 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... - - # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: @@ -1439,6 +1558,13 @@ def llama_model_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model * model); +@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_inp(model: llama_model_p, /) -> int: + """Get the input embedding dimension""" + ... 
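Note: a hedged sketch of calling the new `llama_params_fit` binding to pre-check whether a model/context configuration is projected to fit in memory. The model path, margin, minimum context, and log level are placeholder values, not recommendations:

    import ctypes
    import llama_cpp

    mparams = llama_cpp.llama_model_default_params()
    cparams = llama_cpp.llama_context_default_params()
    tensor_split = (ctypes.c_float * llama_cpp.llama_max_devices())()

    status = llama_cpp.llama_params_fit(
        b"./models/model.gguf",      # placeholder path
        ctypes.byref(mparams),
        ctypes.byref(cparams),
        tensor_split,
        None,                        # tensor_buft_overrides (not fully bound)
        512 * 1024 * 1024,           # margin: leave ~512 MiB free per device
        4096,                        # n_ctx_min
        2,                           # ggml_log_level value (assumed: info)
    )
    if status == llama_cpp.LLAMA_PARAMS_FIT_STATUS_SUCCESS:
        print("projected to fit")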
+ + # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_layer(model: llama_model_p, /) -> int: @@ -1663,6 +1789,13 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool: ... +# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model); +@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_hybrid(model: llama_model_p, /) -> bool: + """Returns true if model is hybrid (Jamba, Granite, etc.)""" + ... + + # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool) @@ -1726,6 +1859,80 @@ def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... +# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str", + [llama_adapter_lora_p_ctypes, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str( + adapter: llama_adapter_lora_p, key: bytes, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value as string""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32 +) +def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int: + """Get number of adapter metadata pairs""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_key_by_index(...); +@ctypes_function( + "llama_adapter_meta_key_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_key_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata key by index""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(...); +@ctypes_function( + "llama_adapter_meta_val_str_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value by index""" + ... + + +# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_n_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.c_uint64, +) +def llama_adapter_get_alora_n_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> int: + """Get alora invocation token count""" + ... + + +# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.POINTER(llama_token), +) +def llama_adapter_get_alora_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> ctypes.Array: + """Get alora invocation tokens""" + ... + + # // The following functions operate on a llama_context, hence the naming: llama_verb_... 
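Note: the adapter metadata bindings added above can be used to enumerate key/value pairs on a loaded LoRA adapter. A minimal sketch, assuming `adapter` is an already-initialized `llama_adapter_lora` pointer and that 256-byte buffers are large enough; the helper name is hypothetical:

    import ctypes
    import llama_cpp

    def dump_adapter_meta(adapter):
        n = llama_cpp.llama_adapter_meta_count(adapter)
        for i in range(n):
            key = ctypes.create_string_buffer(256)
            val = ctypes.create_string_buffer(256)
            llama_cpp.llama_adapter_meta_key_by_index(adapter, i, key, 256)
            llama_cpp.llama_adapter_meta_val_str_by_index(adapter, i, val, 256)
            print(key.value.decode(), "=", val.value.decode())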
@@ -2038,256 +2245,6 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // # // KV cache for self-attention (TODO: deprecate in favor of llama_memory) -# // - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# DEPRECATED(LLAMA_API void llama_kv_self_clear( -# struct llama_context * ctx), -# "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // seq_id < 0 : match any sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... - - -# // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... 
- - -# // Removes all tokens that do not belong to the specified sequence -# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... - - -# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta), -# "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d), -# "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Returns the smallest position present in the KV cache for the specified sequence -# // This is typically non-zero only for SWA caches -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... 
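Note: the deprecation messages on the removed `llama_kv_self_*` bindings all point at the `llama_memory_*` replacements, which this diff already uses in `Llama.embed()`. A short migration sketch, assuming `ctx` is an existing `llama_context` pointer and the helper name is hypothetical:

    import llama_cpp

    def clear_memory(ctx):
        mem = llama_cpp.llama_get_memory(ctx)
        if mem is not None:
            # was: llama_kv_self_clear(ctx)
            llama_cpp.llama_memory_clear(mem, True)
            # was: llama_kv_self_seq_rm(ctx, seq_id, p0, p1)
            llama_cpp.llama_memory_seq_rm(mem, 0, 0, -1)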
- - -# // Returns the largest position present in the KV cache for the specified sequence -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), -# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... - - -# // Check if the context supports KV cache shifting -# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), -# "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... - - -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), -# "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... - - # // # // State / sessions # // @@ -2634,6 +2591,83 @@ def llama_state_seq_load_file( ... +# LLAMA_API size_t llama_state_seq_get_size_ext( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_size_ext", + [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags], + ctypes.c_size_t, +) +def llama_state_seq_get_size_ext( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Get size needed to copy sequence state with flags""" + ... + + +# LLAMA_API size_t llama_state_seq_get_data_ext( +# struct llama_context * ctx, +# uint8_t * dst, +# size_t size, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_get_data_ext( + ctx: llama_context_p, + dst: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Copy sequence state to buffer with flags""" + ... 
+ + +# LLAMA_API size_t llama_state_seq_set_data_ext( +# struct llama_context * ctx, +# const uint8_t * src, +# size_t size, +# llama_seq_id dest_seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_set_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_set_data_ext( + ctx: llama_context_p, + src: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + dest_seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Restore sequence state from buffer with flags""" + ... + + # // # // Decoding # // @@ -3806,9 +3840,6 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -4174,12 +4205,31 @@ def llama_print_system_info() -> bytes: ... -# // Set callback for all future logging events. -# // If this is not called, or NULL is supplied, everything is output on stderr. +# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type type); +@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p) +def llama_flash_attn_type_name(type: int, /) -> bytes: + """Get name of flash attention type""" + ... + + +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> bytes: + """Get string representation of model meta key""" + ... + + +# LLAMA_API ggml_log_callback llama_log_get(void); +@ctypes_function("llama_log_get", [], ggml_log_callback) +def llama_log_get() -> ggml_log_callback: + """Get current log callback""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", - [ctypes.c_void_p, ctypes.c_void_p], + [ggml_log_callback, ctypes.c_void_p], None, ) def llama_log_set( @@ -4193,7 +4243,17 @@ def llama_log_set( ... -# // +# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); +@ctypes_function( + "llama_memory_breakdown_print", + [llama_context_p_ctypes], + None, +) +def llama_memory_breakdown_print(ctx: llama_context_p, /): + """Print memory breakdown for context""" + ... 
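Note: taken together, the three `*_ext` state functions allow saving and restoring a single sequence with flags. A minimal round-trip sketch, assuming `ctx` is a valid `llama_context` pointer; the helper name is hypothetical:

    import ctypes
    import llama_cpp

    def copy_seq_state(ctx, src_seq=0, dst_seq=1, flags=0):
        # flags=0 copies the full state; LLAMA_STATE_SEQ_FLAGS_SWA_ONLY restricts it to the SWA cache.
        size = llama_cpp.llama_state_seq_get_size_ext(ctx, src_seq, flags)
        buf = (ctypes.c_uint8 * size)()
        written = llama_cpp.llama_state_seq_get_data_ext(ctx, buf, size, src_seq, flags)
        return llama_cpp.llama_state_seq_set_data_ext(ctx, buf, written, dst_seq, flags)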
+ + # // Performance utils # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f406..e00eb3a0b 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -39,7 +39,11 @@ # Specify the base name of the shared library to load _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") -_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() +_libmtmd_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _libmtmd_override_path is None + else pathlib.Path(_libmtmd_override_path) +) # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) @@ -71,17 +75,22 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + # Structures class mtmd_context_params(Structure): _fields_ = [ ("use_gpu", c_bool), ("print_timings", c_bool), ("n_threads", c_int), - ("verbosity", c_int), # ggml_log_level ("image_marker", c_char_p), ("media_marker", c_char_p), + ("flash_attn_type", c_int), # enum llama_flash_attn_type + ("warmup", c_bool), + ("image_min_tokens", c_int), + ("image_max_tokens", c_int), ] + class mtmd_input_text(Structure): _fields_ = [ ("text", c_char_p), @@ -89,19 +98,21 @@ class mtmd_input_text(Structure): ("parse_special", c_bool), ] + ################################################ # mtmd.h functions ################################################ + # MTMD_API const char * mtmd_default_marker(void); @ctypes_function("mtmd_default_marker", [], c_char_p) -def mtmd_default_marker() -> bytes: - ... +def mtmd_default_marker() -> bytes: ... + # MTMD_API struct mtmd_context_params mtmd_context_params_default(void); @ctypes_function("mtmd_context_params_default", [], mtmd_context_params) -def mtmd_context_params_default() -> mtmd_context_params: - ... +def mtmd_context_params_default() -> mtmd_context_params: ... + # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, # const struct llama_model * text_model, @@ -109,70 +120,68 @@ def mtmd_context_params_default() -> mtmd_context_params: @ctypes_function( "mtmd_init_from_file", [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], - mtmd_context_p_ctypes + mtmd_context_p_ctypes, ) def mtmd_init_from_file( mmproj_fname: bytes, text_model: llama_cpp.llama_model_p, ctx_params: mtmd_context_params, /, -) -> Optional[mtmd_context_p]: - ... +) -> Optional[mtmd_context_p]: ... + # MTMD_API void mtmd_free(mtmd_context * ctx); @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) -def mtmd_free(ctx: mtmd_context_p, /): - ... +def mtmd_free(ctx: mtmd_context_p, /): ... + # MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) -def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: - ... +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ... + # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( - "mtmd_bitmap_init", - [c_uint32, c_uint32, POINTER(c_uint8)], - mtmd_bitmap_p_ctypes + "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes ) def mtmd_bitmap_init( nx: Union[c_uint32, int], ny: Union[c_uint32, int], data: CtypesArray[c_uint8], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
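Note: `mtmd_context_params` gains `flash_attn_type`, `warmup`, and image token limits. A hedged sketch of filling it in, assuming `text_model` is an already-loaded `llama_model` pointer; the mmproj path, token cap, and helper name are placeholders:

    from llama_cpp import llama_cpp, mtmd_cpp

    def make_mtmd_context(text_model):
        params = mtmd_cpp.mtmd_context_params_default()
        params.use_gpu = True
        params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO  # new field
        params.image_max_tokens = 1024  # placeholder cap on image tokens
        return mtmd_cpp.mtmd_init_from_file(b"./models/mmproj.gguf", text_model, params)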
+ # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) -def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): - ... +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... + # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) -def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: - ... +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... + # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) -def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): - ... +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ... + # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); @ctypes_function( "mtmd_input_chunks_get", [mtmd_input_chunks_p_ctypes, c_size_t], - mtmd_input_chunk_p_ctypes + mtmd_input_chunk_p_ctypes, ) def mtmd_input_chunks_get( chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / -) -> Optional[mtmd_input_chunk_p]: - ... +) -> Optional[mtmd_input_chunk_p]: ... + # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, # mtmd_input_chunks * output, @@ -197,52 +206,53 @@ def mtmd_tokenize( bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], n_bitmaps: Union[c_size_t, int], /, -) -> int: - ... +) -> int: ... + # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) -def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); @ctypes_function( "mtmd_input_chunk_get_tokens_text", [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], - POINTER(llama_cpp.llama_token) + POINTER(llama_cpp.llama_token), ) def mtmd_input_chunk_get_tokens_text( chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / -) -> Optional["_Pointer[llama_cpp.llama_token]"]: - ... +) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... + ################################################ # mtmd-helper.h functions ################################################ + # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], - mtmd_bitmap_p_ctypes + mtmd_bitmap_p_ctypes, ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
+ # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, @@ -276,5 +286,4 @@ def mtmd_helper_eval_chunk_single( logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, -) -> int: - ... +) -> int: ... diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 13c951241..bad0d4ee7 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -103,8 +103,9 @@ class ModelSettings(BaseSettings): offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) - flash_attn: bool = Field( - default=False, description="Whether to use flash attention." + flash_attn: Optional[bool] = Field( + default=None, + description="Use flash attention. None=auto, True=enabled, False=disabled.", ) # Sampling Params last_n_tokens_size: int = Field( diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..be47fb928 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit be47fb9285779e900915bd8246eb9664110d4ba5
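Note: at the server level the same tri-state surfaces through `ModelSettings`. A minimal sketch (the model path is a placeholder):

    from llama_cpp.server.settings import ModelSettings

    # Leaving flash_attn unset (None) now means "auto"; True/False force it on/off.
    settings = ModelSettings(model="./models/model.gguf")
    assert settings.flash_attn is None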