From 103f671b53c3b44cfd4c940c305492bb114fabec Mon Sep 17 00:00:00 2001 From: Ralf Waldukat Date: Mon, 5 Jan 2026 13:27:51 +0100 Subject: [PATCH 1/4] Update llama.cpp to 2026-01-01 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update vendor/llama.cpp submodule to be47fb92 (2026-01-01) - Bump version from 0.3.16 to 0.4.0 Breaking changes: - Migrate flash_attn bool to flash_attn_type enum (backward compatible via None=AUTO) - Replace llama_kv_self_* API with llama_memory_* API New features: - Add LLAMA_FLASH_ATTN_TYPE_* enum (AUTO/DISABLED/ENABLED) - Add llama_model_params fields: no_host, no_alloc - Add mtmd_context_params fields: flash_attn_type, warmup, image_min/max_tokens - Add LLAMA_ROPE_TYPE_IMROPE, LLAMA_PARAMS_FIT_STATUS_* enums - Add 15+ new functions: llama_max_tensor_buft_overrides, llama_n_ctx_seq, llama_model_n_embd_inp, llama_model_is_hybrid, llama_log_*, llama_memory_*, llama_attach/detach_threadpool, llama_adapter_meta_* (4 functions) Fixes: - Server settings: flash_attn default None (AUTO) instead of False (DISABLED) - Enable FIM token functions: token_prefix/middle/suffix - Fix typos: additonal→additional, unnused→unused - Remove deprecated verbosity field from mtmd_context_params - Add CMake version workaround documentation Code quality: - Consistent stub style (... not pass) - Struct alignment verified against llama.h and mtmd.h - Minimal whitespace noise (0.4% of diff) --- CMakeLists.txt | 9 + llama_cpp/__init__.py | 2 +- llama_cpp/llama.py | 53 +++- llama_cpp/llama_cpp.py | 598 +++++++++++++++++++---------------- llama_cpp/mtmd_cpp.py | 99 +++--- llama_cpp/server/settings.py | 5 +- vendor/llama.cpp | 2 +- 7 files changed, 434 insertions(+), 334 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b3..7fd8866cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,15 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() + # Set version for mtmd (required by upstream CMakeLists.txt) + # NOTE: This is a workaround for mtmd build requirements. + # Version is set to 0.0.0 for local builds. If upstream adds version + # compatibility checks, this may need to match llama.cpp version. + if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER 0) + endif() + set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..0d56603e2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.4.0" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..18d8bc66d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -91,9 +91,9 @@ def __init__( logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, - flash_attn: bool = False, op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, + flash_attn: Optional[bool] = None, # Sampling Params no_perf: bool = False, last_n_tokens_size: int = 64, @@ -173,7 +173,7 @@ def __init__( logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. - flash_attn: Use flash attention. + flash_attn: Use flash attention. None = auto, True = enabled, False = disabled. 
op_offload: offload host tensor operations to device swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) no_perf: Measure performance timings. @@ -341,7 +341,16 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + if flash_attn is None: + self.context_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO + elif flash_attn: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ) + else: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -934,7 +943,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[:sample_idx], + self._scores[sample_idx - self.n_tokens, :], ): return tokens_or_none = yield token @@ -1041,7 +1051,9 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self._ctx.decode(self._batch) self._batch.reset() @@ -1112,7 +1124,9 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self.reset() if return_count: @@ -1157,9 +1171,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = self._model.token_prefix() + middle_token_id: int = self._model.token_middle() + suffix_token_id: int = self._model.token_suffix() add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1315,7 +1329,7 @@ def logit_bias_processor( if seed is not None: self.set_seed(seed) else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) + self.set_seed(random.Random(self._seed).randint(0, 2**32)) finish_reason = "length" multibyte_fix = 0 @@ -2056,7 +2070,10 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + return ( + ChatCompletionChunk(**chunk) + for chunk in self.create_chat_completion(*args, **kwargs) + ) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2096,7 +2113,7 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=self.context_params.flash_attn_type, op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params @@ -2316,19 
+2333,23 @@ def from_pretrained( ) if additional_files: - for additonal_file_name in additional_files: + for additional_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file + for file in file_list + if fnmatch.fnmatch(file, additional_file_name) + ] if len(matching_additional_files) == 0: raise ValueError( - f"No file found in {repo_id} that match {additonal_file_name}\n\n" + f"No file found in {repo_id} that match {additional_file_name}\n\n" f"Available Files:\n{json.dumps(file_list)}" ) if len(matching_additional_files) > 1: raise ValueError( - f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n" + f"Multiple files found in {repo_id} matching {additional_file_name}\n\n" f"Available Files:\n{json.dumps(files)}" ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..4e8719e07 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -117,6 +117,14 @@ # typedef bool (*ggml_abort_callback)(void * data); ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) +# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +ggml_log_callback = ctypes.CFUNCTYPE( + None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p +) + +# typedef struct ggml_threadpool * ggml_threadpool_t; +ggml_threadpool_t = ctypes.c_void_p + # llama.h bindings _lib.llama_max_devices.argtypes = [] @@ -177,6 +185,13 @@ # typedef int32_t llama_seq_id; llama_seq_id = ctypes.c_int32 +# typedef uint32_t llama_state_seq_flags; +llama_state_seq_flags = ctypes.c_uint32 + +# State sequence flags +LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 +LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 2 + # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab @@ -294,6 +309,7 @@ LLAMA_ROPE_TYPE_NORM = 0 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 +LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 @@ -462,6 +478,14 @@ LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +# enum llama_flash_attn_type { +# LLAMA_FLASH_ATTN_TYPE_AUTO = -1, +# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0, +# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1, +# }; +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU @@ -472,6 +496,14 @@ LLAMA_SPLIT_MODE_LAYER = 1 LLAMA_SPLIT_MODE_ROW = 2 +# enum llama_params_fit_status { +# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, +# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, +# }; +LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 +LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 +LLAMA_PARAMS_FIT_STATUS_ERROR = 2 # typedef struct llama_token_data { # llama_token id; // token id @@ -613,6 +645,22 @@ class llama_batch(ctypes.Structure): LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 LLAMA_KV_OVERRIDE_TYPE_STR = 3 +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# ... 
+# }; +LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 +LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 +LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { # enum llama_model_kv_override_type tag; @@ -745,6 +793,8 @@ class llama_model_params(ctypes.Structure): ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] @@ -875,6 +925,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -1146,11 +1197,26 @@ def llama_numa_init(numa: int, /): # struct llama_context * ctx, # ggml_threadpool_t threadpool, # ggml_threadpool_t threadpool_batch); -# TODO: Add llama_attach_threadpool +@ctypes_function( + "llama_attach_threadpool", + [llama_context_p_ctypes, ggml_threadpool_t, ggml_threadpool_t], + None, +) +def llama_attach_threadpool( + ctx: llama_context_p, + threadpool: ctypes.c_void_p, + threadpool_batch: ctypes.c_void_p, + /, +): + """Attach threadpools to context""" + ... # LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); -# TODO: Add llama_detach_threadpool +@ctypes_function("llama_detach_threadpool", [llama_context_p_ctypes], None) +def llama_detach_threadpool(ctx: llama_context_p, /): + """Detach threadpool from context""" + ... # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1331,12 +1397,75 @@ def llama_supports_rpc() -> bool: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + """Get maximum number of tensor buffer type overrides""" + ... 
+ + +# LLAMA_API enum llama_params_fit_status llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, +# struct llama_model_tensor_buft_override * tensor_buft_overrides, +# size_t margin, +# uint32_t n_ctx_min, +# enum ggml_log_level log_level); +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + ctypes.POINTER(llama_model_params), + ctypes.POINTER(llama_context_params), + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, # tensor_buft_overrides - not fully bound + ctypes.c_size_t, # margin + ctypes.c_uint32, # n_ctx_min + ctypes.c_int, # ggml_log_level (enum) + ], + ctypes.c_int, +) +def llama_params_fit( + path_model: bytes, + mparams: CtypesPointerOrRef[llama_model_params], + cparams: CtypesPointerOrRef[llama_context_params], + tensor_split: CtypesArray[ctypes.c_float], + tensor_buft_overrides: Optional[ctypes.c_void_p], + margin: Union[ctypes.c_size_t, int], + n_ctx_min: Union[ctypes.c_uint32, int], + log_level: int, + /, +) -> int: + """Check if model parameters will fit in memory + + Args: + margin: Memory margin to leave per device in bytes + n_ctx_min: Minimum context size when trying to reduce memory + log_level: Minimum log level (ggml_log_level enum) + + Returns: + LLAMA_PARAMS_FIT_STATUS_SUCCESS (0) - found allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_FAILURE (1) - could not find allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_ERROR (2) - a hard error occurred + """ + ... + + # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_ctx(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_ctx_seq(const struct llama_context * ctx); +@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_ctx_seq(ctx: llama_context_p, /) -> int: + """Get the context sequence size""" + ... + + # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_batch(ctx: llama_context_p, /) -> int: @@ -1405,16 +1534,6 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... - - # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: @@ -1439,6 +1558,13 @@ def llama_model_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model * model); +@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_inp(model: llama_model_p, /) -> int: + """Get the input embedding dimension""" + ... 
+ + # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_layer(model: llama_model_p, /) -> int: @@ -1663,6 +1789,13 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool: ... +# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model); +@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_hybrid(model: llama_model_p, /) -> bool: + """Returns true if model is hybrid (Jamba, Granite, etc.)""" + ... + + # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool) @@ -1726,6 +1859,80 @@ def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... +# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str", + [llama_adapter_lora_p_ctypes, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str( + adapter: llama_adapter_lora_p, key: bytes, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value as string""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32 +) +def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int: + """Get number of adapter metadata pairs""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_key_by_index(...); +@ctypes_function( + "llama_adapter_meta_key_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_key_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata key by index""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(...); +@ctypes_function( + "llama_adapter_meta_val_str_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value by index""" + ... + + +# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_n_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.c_uint64, +) +def llama_adapter_get_alora_n_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> int: + """Get alora invocation token count""" + ... + + +# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.POINTER(llama_token), +) +def llama_adapter_get_alora_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> ctypes.Array: + """Get alora invocation tokens""" + ... + + # // The following functions operate on a llama_context, hence the naming: llama_verb_... 
@@ -2038,256 +2245,6 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // # // KV cache for self-attention (TODO: deprecate in favor of llama_memory) -# // - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# DEPRECATED(LLAMA_API void llama_kv_self_clear( -# struct llama_context * ctx), -# "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // seq_id < 0 : match any sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... - - -# // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... 
- - -# // Removes all tokens that do not belong to the specified sequence -# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... - - -# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta), -# "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d), -# "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Returns the smallest position present in the KV cache for the specified sequence -# // This is typically non-zero only for SWA caches -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... 
- - -# // Returns the largest position present in the KV cache for the specified sequence -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), -# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... - - -# // Check if the context supports KV cache shifting -# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), -# "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... - - -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), -# "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... - - # // # // State / sessions # // @@ -2634,6 +2591,83 @@ def llama_state_seq_load_file( ... +# LLAMA_API size_t llama_state_seq_get_size_ext( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_size_ext", + [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags], + ctypes.c_size_t, +) +def llama_state_seq_get_size_ext( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Get size needed to copy sequence state with flags""" + ... + + +# LLAMA_API size_t llama_state_seq_get_data_ext( +# struct llama_context * ctx, +# uint8_t * dst, +# size_t size, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_get_data_ext( + ctx: llama_context_p, + dst: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Copy sequence state to buffer with flags""" + ... 
+ + +# LLAMA_API size_t llama_state_seq_set_data_ext( +# struct llama_context * ctx, +# const uint8_t * src, +# size_t size, +# llama_seq_id dest_seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_set_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_set_data_ext( + ctx: llama_context_p, + src: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + dest_seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Restore sequence state from buffer with flags""" + ... + + # // # // Decoding # // @@ -3806,9 +3840,6 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -4174,12 +4205,31 @@ def llama_print_system_info() -> bytes: ... -# // Set callback for all future logging events. -# // If this is not called, or NULL is supplied, everything is output on stderr. +# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type type); +@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p) +def llama_flash_attn_type_name(type: int, /) -> bytes: + """Get name of flash attention type""" + ... + + +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> bytes: + """Get string representation of model meta key""" + ... + + +# LLAMA_API ggml_log_callback llama_log_get(void); +@ctypes_function("llama_log_get", [], ggml_log_callback) +def llama_log_get() -> ggml_log_callback: + """Get current log callback""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", - [ctypes.c_void_p, ctypes.c_void_p], + [ggml_log_callback, ctypes.c_void_p], None, ) def llama_log_set( @@ -4193,7 +4243,17 @@ def llama_log_set( ... -# // +# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); +@ctypes_function( + "llama_memory_breakdown_print", + [llama_context_p_ctypes], + None, +) +def llama_memory_breakdown_print(ctx: llama_context_p, /): + """Print memory breakdown for context""" + ... 
+ + # // Performance utils # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f406..e00eb3a0b 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -39,7 +39,11 @@ # Specify the base name of the shared library to load _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") -_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() +_libmtmd_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _libmtmd_override_path is None + else pathlib.Path(_libmtmd_override_path) +) # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) @@ -71,17 +75,22 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + # Structures class mtmd_context_params(Structure): _fields_ = [ ("use_gpu", c_bool), ("print_timings", c_bool), ("n_threads", c_int), - ("verbosity", c_int), # ggml_log_level ("image_marker", c_char_p), ("media_marker", c_char_p), + ("flash_attn_type", c_int), # enum llama_flash_attn_type + ("warmup", c_bool), + ("image_min_tokens", c_int), + ("image_max_tokens", c_int), ] + class mtmd_input_text(Structure): _fields_ = [ ("text", c_char_p), @@ -89,19 +98,21 @@ class mtmd_input_text(Structure): ("parse_special", c_bool), ] + ################################################ # mtmd.h functions ################################################ + # MTMD_API const char * mtmd_default_marker(void); @ctypes_function("mtmd_default_marker", [], c_char_p) -def mtmd_default_marker() -> bytes: - ... +def mtmd_default_marker() -> bytes: ... + # MTMD_API struct mtmd_context_params mtmd_context_params_default(void); @ctypes_function("mtmd_context_params_default", [], mtmd_context_params) -def mtmd_context_params_default() -> mtmd_context_params: - ... +def mtmd_context_params_default() -> mtmd_context_params: ... + # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, # const struct llama_model * text_model, @@ -109,70 +120,68 @@ def mtmd_context_params_default() -> mtmd_context_params: @ctypes_function( "mtmd_init_from_file", [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], - mtmd_context_p_ctypes + mtmd_context_p_ctypes, ) def mtmd_init_from_file( mmproj_fname: bytes, text_model: llama_cpp.llama_model_p, ctx_params: mtmd_context_params, /, -) -> Optional[mtmd_context_p]: - ... +) -> Optional[mtmd_context_p]: ... + # MTMD_API void mtmd_free(mtmd_context * ctx); @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) -def mtmd_free(ctx: mtmd_context_p, /): - ... +def mtmd_free(ctx: mtmd_context_p, /): ... + # MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) -def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: - ... +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ... + # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( - "mtmd_bitmap_init", - [c_uint32, c_uint32, POINTER(c_uint8)], - mtmd_bitmap_p_ctypes + "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes ) def mtmd_bitmap_init( nx: Union[c_uint32, int], ny: Union[c_uint32, int], data: CtypesArray[c_uint8], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
+ # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) -def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): - ... +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... + # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) -def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: - ... +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... + # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) -def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): - ... +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ... + # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); @ctypes_function( "mtmd_input_chunks_get", [mtmd_input_chunks_p_ctypes, c_size_t], - mtmd_input_chunk_p_ctypes + mtmd_input_chunk_p_ctypes, ) def mtmd_input_chunks_get( chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / -) -> Optional[mtmd_input_chunk_p]: - ... +) -> Optional[mtmd_input_chunk_p]: ... + # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, # mtmd_input_chunks * output, @@ -197,52 +206,53 @@ def mtmd_tokenize( bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], n_bitmaps: Union[c_size_t, int], /, -) -> int: - ... +) -> int: ... + # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) -def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); @ctypes_function( "mtmd_input_chunk_get_tokens_text", [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], - POINTER(llama_cpp.llama_token) + POINTER(llama_cpp.llama_token), ) def mtmd_input_chunk_get_tokens_text( chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / -) -> Optional["_Pointer[llama_cpp.llama_token]"]: - ... +) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... + ################################################ # mtmd-helper.h functions ################################################ + # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], - mtmd_bitmap_p_ctypes + mtmd_bitmap_p_ctypes, ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
+ # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, @@ -276,5 +286,4 @@ def mtmd_helper_eval_chunk_single( logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, -) -> int: - ... +) -> int: ... diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 13c951241..bad0d4ee7 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -103,8 +103,9 @@ class ModelSettings(BaseSettings): offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) - flash_attn: bool = Field( - default=False, description="Whether to use flash attention." + flash_attn: Optional[bool] = Field( + default=None, + description="Use flash attention. None=auto, True=enabled, False=disabled.", ) # Sampling Params last_n_tokens_size: int = Field( diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..be47fb928 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit be47fb9285779e900915bd8246eb9664110d4ba5 From a1d99cb482bf7b8b532a28e18db89ac6364b8c31 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 00:30:58 -0500 Subject: [PATCH 2/4] feat: support Granite-Docling model --- CHANGELOG.md | 3 +++ examples/granite_docling/main.py | 34 ++++++++++++++++++++++++++++++++ llama_cpp/llama_chat_format.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 examples/granite_docling/main.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 16954eb88..3e479ac95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Add chat completion handler for Granite-Docling (and SmolVLM) +- feat: Add `special` argument to keep special tokens in chat completion output + ## [0.3.16] - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317 diff --git a/examples/granite_docling/main.py b/examples/granite_docling/main.py new file mode 100644 index 000000000..e2417b8f3 --- /dev/null +++ b/examples/granite_docling/main.py @@ -0,0 +1,34 @@ +from llama_cpp import Llama +from llama_cpp.llama_chat_format import GraniteDoclingChatHandler + + +chat_handler = GraniteDoclingChatHandler.from_pretrained( + repo_id="ggml-org/granite-docling-258M-GGUF", + filename="mmproj*Q8_0*", +) +llama = Llama.from_pretrained( + repo_id="ggml-org/granite-docling-258M-GGUF", + filename="granite*Q8_0*", + chat_handler=chat_handler, + n_ctx=8192, +) +response = llama.create_chat_completion( + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": "https://huggingface.co/spaces/ibm-granite/granite-docling-258m-demo/resolve/main/data/images/new_arxiv.png"}, + {"type": "text", "text": "Convert this page to docling."}, + ], + } + ], + stream=True, +) + +for chunk in response: + delta = chunk["choices"][0]["delta"] + if "content" not in delta: + continue + print(delta["content"], end="", flush=True) + +print() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 
f738ab9bb..f127df2f2 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3519,6 +3519,37 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class GraniteDoclingChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = """{%- for message in messages -%} +{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}} +{%- if message['content'] is string -%} +{{- message['content'] -}} +{%- else -%} +{%- for part in message['content'] -%} +{%- if part['type'] == 'text' -%} +{{- part['text'] -}} +{%- elif part['type'] == 'image_url' -%} +{%- if part['image_url'] is string %} +{{- part['image_url'] }} +{%- elif part['image_url'] is mapping -%} +{{- part['image_url']['url'] -}} +{%- endif -%} +{%- endif -%} +{%- endfor -%} +{%- endif -%} +{{- '<|end_of_text|> +' -}} +{%- endfor -%} +{%- if add_generation_prompt -%} +{{- '<|start_of_role|>assistant' -}} +{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%} +{{- '<|end_of_role|>' -}} +{%- endif -%} +""" + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, From 2e3dd38129f0e916aebc8189a7c1c82de3559a53 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 12:56:28 -0500 Subject: [PATCH 3/4] feat: add `special` argument needed to make Granite-Docling useful --- examples/granite_docling/main.py | 1 + llama_cpp/llama.py | 65 ++++++++++++++++++++++++-------- llama_cpp/llama_chat_format.py | 3 ++ 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/examples/granite_docling/main.py b/examples/granite_docling/main.py index e2417b8f3..c13318c42 100644 --- a/examples/granite_docling/main.py +++ b/examples/granite_docling/main.py @@ -23,6 +23,7 @@ } ], stream=True, + special=True, ) for chunk in response: diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 18d8bc66d..ecc0e49f6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1161,6 +1161,7 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -1352,13 +1353,17 @@ def logit_bias_processor( grammar=grammar, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + all_text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -1385,6 +1390,7 @@ def logit_bias_processor( remaining_text = self.detokenize( remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) remaining_length = len(remaining_text) @@ -1412,6 +1418,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) # Check if stop sequence is in the token @@ -1423,12 +1430,14 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") text_offset = len(prompt) + len( self.detokenize( 
completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") ) token_offset = len(prompt_tokens) + returned_tokens @@ -1441,7 +1450,7 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode( + self.detokenize([i], special=special).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -1453,6 +1462,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], @@ -1471,6 +1481,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, @@ -1487,6 +1498,7 @@ def logit_bias_processor( remaining_tokens[:i], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ts = bs.decode("utf-8") decode_success = True @@ -1522,14 +1534,18 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "stop" if self.verbose: @@ -1540,6 +1556,7 @@ def logit_bias_processor( remaining_text = self.detokenize( remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) any_stop = [s for s in stop_sequences if s in remaining_text] if len(any_stop) > 0: @@ -1553,6 +1570,7 @@ def logit_bias_processor( self.detokenize( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) @@ -1561,13 +1579,16 @@ def logit_bias_processor( if token == bos_token_id: continue token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore" + "utf-8", + errors="ignore", + special=special, ) text_offset = len(prompt) + len( self.detokenize( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) token_offset = len(prompt_tokens) + returned_tokens - 1 @@ -1580,13 +1601,18 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode( + "utf-8", errors="ignore", special=special + ): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + self.detokenize( + [token], + special=special, + ).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1594,7 +1620,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = self.detokenize([token]) + last_text = self.detokenize([token], special=special) if token_end_position == end - 1: break returned_tokens += 1 @@ -1623,7 +1649,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": self.detokenize([token], special=special).decode( "utf-8", 
errors="ignore" ), "index": 0, @@ -1687,7 +1713,7 @@ def logit_bias_processor( all_token_strs = [ self.detokenize([token], prev_tokens=all_tokens[:i]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", special=special ) for i, token in enumerate(all_tokens) ] @@ -1702,7 +1728,7 @@ def logit_bias_processor( text_offset + len( self.detokenize(all_tokens[:idx]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", special=special ) ) ) @@ -1714,9 +1740,9 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( - "utf-8", errors="ignore" - ): logprob + self.detokenize( + [i], prev_tokens=all_tokens[:idx], special=special + ).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ -1781,6 +1807,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1810,6 +1837,7 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + special: Include special tokens in output. Raises: ValueError: If the requested tokens exceed the context window. @@ -1844,6 +1872,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1878,6 +1907,7 @@ def __call__( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1907,6 +1937,7 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + special: Include special tokens in output. Raises: ValueError: If the requested tokens exceed the context window. @@ -1941,6 +1972,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) def create_chat_completion( @@ -1973,6 +2005,7 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2005,6 +2038,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + special: Include special tokens in output. Returns: Generated chat completion or a stream of chat completion chunks. 
@@ -2044,6 +2078,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f127df2f2..97748bdeb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -102,6 +102,7 @@ def __call__( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -2798,6 +2799,7 @@ def __call__( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3018,6 +3020,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) if tool is not None: From 8790ce6c01edb2d3fc0f4d3680a2524b08978e0f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 13:57:44 -0500 Subject: [PATCH 4/4] feat: add special to all formatters/completers --- llama_cpp/llama_chat_format.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 97748bdeb..23d5b9427 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -589,6 +589,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -691,6 +692,7 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + special=special, ) if tool is not None: tool_name = tool["function"]["name"] @@ -1426,6 +1428,7 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" @@ -1632,6 +1635,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore @@ -1712,6 +1716,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): mirostat_eta=mirostat_eta, model=model, logits_processor=logits_processor, + special=special, ) # type: ignore assert "usage" in completion @@ -1785,6 +1790,7 @@ def functionary_v1_v2_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -2001,6 +2007,7 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) if stream is False: completion_or_completion_chunks["choices"][0]["text"] = ( @@ -2064,6 +2071,7 @@ def create_completion(prompt, stop, grammar): model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ), ) @@ -3582,6 +3590,7 @@ def chatml_function_calling( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3712,6 +3721,7 @@ def chatml_function_calling( logits_processor=logits_processor, grammar=grammar, logprobs=top_logprobs if logprobs else None, + special=special, ), stream=stream, ) @@ -3764,6 +3774,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) return _convert_completion_to_chat_function( tool_name, completion_or_chunks, stream @@ -3810,6 +3821,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( initial_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ) completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore text = completion["choices"][0]["text"] @@ -3838,6 +3850,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( follow_up_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ), stream=stream, ) @@ -3883,6 +3896,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) completion_or_chunks = cast( llama_types.CreateCompletionResponse, completion_or_chunks @@ -3914,6 +3928,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( follow_up_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ) response = cast(llama_types.CreateCompletionResponse, response)
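For reference, a minimal usage sketch of the two changes from PATCH 1/4 that callers are most likely to notice: the flash_attn parameter becoming Optional[bool] (mapped onto the new LLAMA_FLASH_ATTN_TYPE_* enum) and the replacement of the llama_kv_self_* bindings with the llama_memory_* API. This is an illustration, not part of the patches: the model path is a placeholder, and reaching the context through the private _ctx.ctx attribute simply mirrors what llama.py itself does in Llama.embed().

import llama_cpp
from llama_cpp import Llama

# flash_attn is now Optional[bool]:
#   None  -> LLAMA_FLASH_ATTN_TYPE_AUTO (new default)
#   True  -> LLAMA_FLASH_ATTN_TYPE_ENABLED
#   False -> LLAMA_FLASH_ATTN_TYPE_DISABLED
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    flash_attn=None,
)

# The deprecated llama_kv_self_* bindings are removed; clearing the
# per-context memory now goes through llama_get_memory()/llama_memory_clear(),
# the same calls Llama.embed() uses after this update.
mem = llama_cpp.llama_get_memory(llm._ctx.ctx)
if mem is not None:
    llama_cpp.llama_memory_clear(mem, True)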
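Likewise, a minimal sketch of the special argument added in PATCH 3/4 and threaded through the chat handlers in PATCH 4/4. The model path is again a placeholder; per the commit message the flag is what makes Granite-Docling's output useful (presumably because its markup is emitted as special tokens), but it is accepted for any model by create_completion(), create_chat_completion(), and the registered chat handlers.

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# Default behaviour: special tokens are not rendered in the returned text.
plain = llm.create_completion("Hello", max_tokens=16)

# With special=True the detokenized output keeps special tokens; the same
# keyword is forwarded by create_chat_completion() and the chat handlers.
raw = llm.create_completion("Hello", max_tokens=16, special=True)

print(plain["choices"][0]["text"])
print(raw["choices"][0]["text"])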