From 103f671b53c3b44cfd4c940c305492bb114fabec Mon Sep 17 00:00:00 2001 From: Ralf Waldukat Date: Mon, 5 Jan 2026 13:27:51 +0100 Subject: [PATCH 1/4] Update llama.cpp to 2026-01-01 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update vendor/llama.cpp submodule to be47fb92 (2026-01-01) - Bump version from 0.3.16 to 0.4.0 Breaking changes: - Migrate flash_attn bool to flash_attn_type enum (backward compatible via None=AUTO) - Replace llama_kv_self_* API with llama_memory_* API New features: - Add LLAMA_FLASH_ATTN_TYPE_* enum (AUTO/DISABLED/ENABLED) - Add llama_model_params fields: no_host, no_alloc - Add mtmd_context_params fields: flash_attn_type, warmup, image_min/max_tokens - Add LLAMA_ROPE_TYPE_IMROPE, LLAMA_PARAMS_FIT_STATUS_* enums - Add 15+ new functions: llama_max_tensor_buft_overrides, llama_n_ctx_seq, llama_model_n_embd_inp, llama_model_is_hybrid, llama_log_*, llama_memory_*, llama_attach/detach_threadpool, llama_adapter_meta_* (4 functions) Fixes: - Server settings: flash_attn default None (AUTO) instead of False (DISABLED) - Enable FIM token functions: token_prefix/middle/suffix - Fix typos: additonal→additional, unnused→unused - Remove deprecated verbosity field from mtmd_context_params - Add CMake version workaround documentation Code quality: - Consistent stub style (... not pass) - Struct alignment verified against llama.h and mtmd.h - Minimal whitespace noise (0.4% of diff) --- CMakeLists.txt | 9 + llama_cpp/__init__.py | 2 +- llama_cpp/llama.py | 53 +++- llama_cpp/llama_cpp.py | 598 +++++++++++++++++++---------------- llama_cpp/mtmd_cpp.py | 99 +++--- llama_cpp/server/settings.py | 5 +- vendor/llama.cpp | 2 +- 7 files changed, 434 insertions(+), 334 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b3..7fd8866cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,15 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() + # Set version for mtmd (required by upstream CMakeLists.txt) + # NOTE: This is a workaround for mtmd build requirements. + # Version is set to 0.0.0 for local builds. If upstream adds version + # compatibility checks, this may need to match llama.cpp version. + if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER 0) + endif() + set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..0d56603e2 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.4.0" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..18d8bc66d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -91,9 +91,9 @@ def __init__( logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, - flash_attn: bool = False, op_offload: Optional[bool] = None, swa_full: Optional[bool] = None, + flash_attn: Optional[bool] = None, # Sampling Params no_perf: bool = False, last_n_tokens_size: int = 64, @@ -173,7 +173,7 @@ def __init__( logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. embedding: Embedding mode only. offload_kqv: Offload K, Q, V to GPU. - flash_attn: Use flash attention. + flash_attn: Use flash attention. None = auto, True = enabled, False = disabled. 
op_offload: offload host tensor operations to device swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) no_perf: Measure performance timings. @@ -341,7 +341,16 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + if flash_attn is None: + self.context_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO + elif flash_attn: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ) + else: + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -934,7 +943,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[:sample_idx], + self._scores[sample_idx - self.n_tokens, :], ): return tokens_or_none = yield token @@ -1041,7 +1051,9 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self._ctx.decode(self._batch) self._batch.reset() @@ -1112,7 +1124,9 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) self.reset() if return_count: @@ -1157,9 +1171,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = self._model.token_prefix() + middle_token_id: int = self._model.token_middle() + suffix_token_id: int = self._model.token_suffix() add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1315,7 +1329,7 @@ def logit_bias_processor( if seed is not None: self.set_seed(seed) else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) + self.set_seed(random.Random(self._seed).randint(0, 2**32)) finish_reason = "length" multibyte_fix = 0 @@ -2056,7 +2070,10 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + return ( + ChatCompletionChunk(**chunk) + for chunk in self.create_chat_completion(*args, **kwargs) + ) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2096,7 +2113,7 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=self.context_params.flash_attn_type, op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params @@ -2316,19 
+2333,23 @@ def from_pretrained( ) if additional_files: - for additonal_file_name in additional_files: + for additional_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file + for file in file_list + if fnmatch.fnmatch(file, additional_file_name) + ] if len(matching_additional_files) == 0: raise ValueError( - f"No file found in {repo_id} that match {additonal_file_name}\n\n" + f"No file found in {repo_id} that match {additional_file_name}\n\n" f"Available Files:\n{json.dumps(file_list)}" ) if len(matching_additional_files) > 1: raise ValueError( - f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n" + f"Multiple files found in {repo_id} matching {additional_file_name}\n\n" f"Available Files:\n{json.dumps(files)}" ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..4e8719e07 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -117,6 +117,14 @@ # typedef bool (*ggml_abort_callback)(void * data); ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) +# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); +ggml_log_callback = ctypes.CFUNCTYPE( + None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p +) + +# typedef struct ggml_threadpool * ggml_threadpool_t; +ggml_threadpool_t = ctypes.c_void_p + # llama.h bindings _lib.llama_max_devices.argtypes = [] @@ -177,6 +185,13 @@ # typedef int32_t llama_seq_id; llama_seq_id = ctypes.c_int32 +# typedef uint32_t llama_state_seq_flags; +llama_state_seq_flags = ctypes.c_uint32 + +# State sequence flags +LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1 +LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY = 2 + # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab @@ -294,6 +309,7 @@ LLAMA_ROPE_TYPE_NORM = 0 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 +LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 @@ -462,6 +478,14 @@ LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +# enum llama_flash_attn_type { +# LLAMA_FLASH_ATTN_TYPE_AUTO = -1, +# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0, +# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1, +# }; +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU @@ -472,6 +496,14 @@ LLAMA_SPLIT_MODE_LAYER = 1 LLAMA_SPLIT_MODE_ROW = 2 +# enum llama_params_fit_status { +# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, +# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, +# }; +LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 +LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 +LLAMA_PARAMS_FIT_STATUS_ERROR = 2 # typedef struct llama_token_data { # llama_token id; // token id @@ -613,6 +645,22 @@ class llama_batch(ctypes.Structure): LLAMA_KV_OVERRIDE_TYPE_BOOL = 2 LLAMA_KV_OVERRIDE_TYPE_STR = 3 +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# ... 
+# }; +LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 +LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 +LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 # struct llama_model_kv_override { # enum llama_model_kv_override_type tag; @@ -745,6 +793,8 @@ class llama_model_params(ctypes.Structure): ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] @@ -875,6 +925,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -1146,11 +1197,26 @@ def llama_numa_init(numa: int, /): # struct llama_context * ctx, # ggml_threadpool_t threadpool, # ggml_threadpool_t threadpool_batch); -# TODO: Add llama_attach_threadpool +@ctypes_function( + "llama_attach_threadpool", + [llama_context_p_ctypes, ggml_threadpool_t, ggml_threadpool_t], + None, +) +def llama_attach_threadpool( + ctx: llama_context_p, + threadpool: ctypes.c_void_p, + threadpool_batch: ctypes.c_void_p, + /, +): + """Attach threadpools to context""" + ... # LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); -# TODO: Add llama_detach_threadpool +@ctypes_function("llama_detach_threadpool", [llama_context_p_ctypes], None) +def llama_detach_threadpool(ctx: llama_context_p, /): + """Detach threadpool from context""" + ... # DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1331,12 +1397,75 @@ def llama_supports_rpc() -> bool: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + """Get maximum number of tensor buffer type overrides""" + ... 
+ + +# LLAMA_API enum llama_params_fit_status llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, +# struct llama_model_tensor_buft_override * tensor_buft_overrides, +# size_t margin, +# uint32_t n_ctx_min, +# enum ggml_log_level log_level); +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + ctypes.POINTER(llama_model_params), + ctypes.POINTER(llama_context_params), + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, # tensor_buft_overrides - not fully bound + ctypes.c_size_t, # margin + ctypes.c_uint32, # n_ctx_min + ctypes.c_int, # ggml_log_level (enum) + ], + ctypes.c_int, +) +def llama_params_fit( + path_model: bytes, + mparams: CtypesPointerOrRef[llama_model_params], + cparams: CtypesPointerOrRef[llama_context_params], + tensor_split: CtypesArray[ctypes.c_float], + tensor_buft_overrides: Optional[ctypes.c_void_p], + margin: Union[ctypes.c_size_t, int], + n_ctx_min: Union[ctypes.c_uint32, int], + log_level: int, + /, +) -> int: + """Check if model parameters will fit in memory + + Args: + margin: Memory margin to leave per device in bytes + n_ctx_min: Minimum context size when trying to reduce memory + log_level: Minimum log level (ggml_log_level enum) + + Returns: + LLAMA_PARAMS_FIT_STATUS_SUCCESS (0) - found allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_FAILURE (1) - could not find allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_ERROR (2) - a hard error occurred + """ + ... + + # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_ctx(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_ctx_seq(const struct llama_context * ctx); +@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_ctx_seq(ctx: llama_context_p, /) -> int: + """Get the context sequence size""" + ... + + # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_batch(ctx: llama_context_p, /) -> int: @@ -1405,16 +1534,6 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... - - # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: @@ -1439,6 +1558,13 @@ def llama_model_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model * model); +@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_inp(model: llama_model_p, /) -> int: + """Get the input embedding dimension""" + ... 
+ + # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_layer(model: llama_model_p, /) -> int: @@ -1663,6 +1789,13 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool: ... +# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model); +@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_hybrid(model: llama_model_p, /) -> bool: + """Returns true if model is hybrid (Jamba, Granite, etc.)""" + ... + + # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool) @@ -1726,6 +1859,80 @@ def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... +# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str", + [llama_adapter_lora_p_ctypes, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str( + adapter: llama_adapter_lora_p, key: bytes, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value as string""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32 +) +def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int: + """Get number of adapter metadata pairs""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_key_by_index(...); +@ctypes_function( + "llama_adapter_meta_key_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_key_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata key by index""" + ... + + +# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(...); +@ctypes_function( + "llama_adapter_meta_val_str_by_index", + [llama_adapter_lora_p_ctypes, ctypes.c_int32, ctypes.c_char_p, ctypes.c_size_t], + ctypes.c_int32, +) +def llama_adapter_meta_val_str_by_index( + adapter: llama_adapter_lora_p, i: int, buf: bytes, buf_size: int, / +) -> int: + """Get adapter metadata value by index""" + ... + + +# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_n_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.c_uint64, +) +def llama_adapter_get_alora_n_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> int: + """Get alora invocation token count""" + ... + + +# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens(...); +@ctypes_function( + "llama_adapter_get_alora_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.POINTER(llama_token), +) +def llama_adapter_get_alora_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> ctypes.Array: + """Get alora invocation tokens""" + ... + + # // The following functions operate on a llama_context, hence the naming: llama_verb_... 
@@ -2038,256 +2245,6 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // # // KV cache for self-attention (TODO: deprecate in favor of llama_memory) -# // - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# DEPRECATED(LLAMA_API void llama_kv_self_clear( -# struct llama_context * ctx), -# "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails -# // seq_id < 0 : match any sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... - - -# // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... 
- - -# // Removes all tokens that do not belong to the specified sequence -# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... - - -# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta), -# "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d), -# "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Returns the smallest position present in the KV cache for the specified sequence -# // This is typically non-zero only for SWA caches -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... 
- - -# // Returns the largest position present in the KV cache for the specified sequence -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), -# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... - - -# // Check if the context supports KV cache shifting -# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), -# "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... - - -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), -# "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... - - # // # // State / sessions # // @@ -2634,6 +2591,83 @@ def llama_state_seq_load_file( ... +# LLAMA_API size_t llama_state_seq_get_size_ext( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_size_ext", + [llama_context_p_ctypes, llama_seq_id, llama_state_seq_flags], + ctypes.c_size_t, +) +def llama_state_seq_get_size_ext( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Get size needed to copy sequence state with flags""" + ... + + +# LLAMA_API size_t llama_state_seq_get_data_ext( +# struct llama_context * ctx, +# uint8_t * dst, +# size_t size, +# llama_seq_id seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_get_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_get_data_ext( + ctx: llama_context_p, + dst: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Copy sequence state to buffer with flags""" + ... 
+ + +# LLAMA_API size_t llama_state_seq_set_data_ext( +# struct llama_context * ctx, +# const uint8_t * src, +# size_t size, +# llama_seq_id dest_seq_id, +# llama_state_seq_flags flags); +@ctypes_function( + "llama_state_seq_set_data_ext", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_uint8), + ctypes.c_size_t, + llama_seq_id, + llama_state_seq_flags, + ], + ctypes.c_size_t, +) +def llama_state_seq_set_data_ext( + ctx: llama_context_p, + src: CtypesArray[ctypes.c_uint8], + size: Union[ctypes.c_size_t, int], + dest_seq_id: Union[llama_seq_id, int], + flags: Union[llama_state_seq_flags, int], + /, +) -> int: + """Restore sequence state from buffer with flags""" + ... + + # // # // Decoding # // @@ -3806,9 +3840,6 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -4174,12 +4205,31 @@ def llama_print_system_info() -> bytes: ... -# // Set callback for all future logging events. -# // If this is not called, or NULL is supplied, everything is output on stderr. +# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type type); +@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p) +def llama_flash_attn_type_name(type: int, /) -> bytes: + """Get name of flash attention type""" + ... + + +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> bytes: + """Get string representation of model meta key""" + ... + + +# LLAMA_API ggml_log_callback llama_log_get(void); +@ctypes_function("llama_log_get", [], ggml_log_callback) +def llama_log_get() -> ggml_log_callback: + """Get current log callback""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", - [ctypes.c_void_p, ctypes.c_void_p], + [ggml_log_callback, ctypes.c_void_p], None, ) def llama_log_set( @@ -4193,7 +4243,17 @@ def llama_log_set( ... -# // +# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); +@ctypes_function( + "llama_memory_breakdown_print", + [llama_context_p_ctypes], + None, +) +def llama_memory_breakdown_print(ctx: llama_context_p, /): + """Print memory breakdown for context""" + ... 
+ + # // Performance utils # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f406..e00eb3a0b 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -39,7 +39,11 @@ # Specify the base name of the shared library to load _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") -_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() +_libmtmd_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _libmtmd_override_path is None + else pathlib.Path(_libmtmd_override_path) +) # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) @@ -71,17 +75,22 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + # Structures class mtmd_context_params(Structure): _fields_ = [ ("use_gpu", c_bool), ("print_timings", c_bool), ("n_threads", c_int), - ("verbosity", c_int), # ggml_log_level ("image_marker", c_char_p), ("media_marker", c_char_p), + ("flash_attn_type", c_int), # enum llama_flash_attn_type + ("warmup", c_bool), + ("image_min_tokens", c_int), + ("image_max_tokens", c_int), ] + class mtmd_input_text(Structure): _fields_ = [ ("text", c_char_p), @@ -89,19 +98,21 @@ class mtmd_input_text(Structure): ("parse_special", c_bool), ] + ################################################ # mtmd.h functions ################################################ + # MTMD_API const char * mtmd_default_marker(void); @ctypes_function("mtmd_default_marker", [], c_char_p) -def mtmd_default_marker() -> bytes: - ... +def mtmd_default_marker() -> bytes: ... + # MTMD_API struct mtmd_context_params mtmd_context_params_default(void); @ctypes_function("mtmd_context_params_default", [], mtmd_context_params) -def mtmd_context_params_default() -> mtmd_context_params: - ... +def mtmd_context_params_default() -> mtmd_context_params: ... + # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, # const struct llama_model * text_model, @@ -109,70 +120,68 @@ def mtmd_context_params_default() -> mtmd_context_params: @ctypes_function( "mtmd_init_from_file", [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], - mtmd_context_p_ctypes + mtmd_context_p_ctypes, ) def mtmd_init_from_file( mmproj_fname: bytes, text_model: llama_cpp.llama_model_p, ctx_params: mtmd_context_params, /, -) -> Optional[mtmd_context_p]: - ... +) -> Optional[mtmd_context_p]: ... + # MTMD_API void mtmd_free(mtmd_context * ctx); @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) -def mtmd_free(ctx: mtmd_context_p, /): - ... +def mtmd_free(ctx: mtmd_context_p, /): ... + # MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) -def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: - ... +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ... + # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( - "mtmd_bitmap_init", - [c_uint32, c_uint32, POINTER(c_uint8)], - mtmd_bitmap_p_ctypes + "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes ) def mtmd_bitmap_init( nx: Union[c_uint32, int], ny: Union[c_uint32, int], data: CtypesArray[c_uint8], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
+ # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) -def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): - ... +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... + # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) -def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: - ... +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... + # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) -def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): - ... +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ... + # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); @ctypes_function( "mtmd_input_chunks_get", [mtmd_input_chunks_p_ctypes, c_size_t], - mtmd_input_chunk_p_ctypes + mtmd_input_chunk_p_ctypes, ) def mtmd_input_chunks_get( chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / -) -> Optional[mtmd_input_chunk_p]: - ... +) -> Optional[mtmd_input_chunk_p]: ... + # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, # mtmd_input_chunks * output, @@ -197,52 +206,53 @@ def mtmd_tokenize( bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], n_bitmaps: Union[c_size_t, int], /, -) -> int: - ... +) -> int: ... + # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) -def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); @ctypes_function( "mtmd_input_chunk_get_tokens_text", [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], - POINTER(llama_cpp.llama_token) + POINTER(llama_cpp.llama_token), ) def mtmd_input_chunk_get_tokens_text( chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / -) -> Optional["_Pointer[llama_cpp.llama_token]"]: - ... +) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... + ################################################ # mtmd-helper.h functions ################################################ + # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], - mtmd_bitmap_p_ctypes + mtmd_bitmap_p_ctypes, ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... 
+ # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, @@ -276,5 +286,4 @@ def mtmd_helper_eval_chunk_single( logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, -) -> int: - ... +) -> int: ... diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 13c951241..bad0d4ee7 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -103,8 +103,9 @@ class ModelSettings(BaseSettings): offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) - flash_attn: bool = Field( - default=False, description="Whether to use flash attention." + flash_attn: Optional[bool] = Field( + default=None, + description="Use flash attention. None=auto, True=enabled, False=disabled.", ) # Sampling Params last_n_tokens_size: int = Field( diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..be47fb928 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit be47fb9285779e900915bd8246eb9664110d4ba5 From a1d99cb482bf7b8b532a28e18db89ac6364b8c31 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 00:30:58 -0500 Subject: [PATCH 2/4] feat: support Granite-Docling model --- CHANGELOG.md | 3 +++ examples/granite_docling/main.py | 34 ++++++++++++++++++++++++++++++++ llama_cpp/llama_chat_format.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 examples/granite_docling/main.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 16954eb88..3e479ac95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Add chat completion handler for Granite-Docling (and SmolVLM) +- feat: Add `special` argument to keep special tokens in chat completion output + ## [0.3.16] - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317 diff --git a/examples/granite_docling/main.py b/examples/granite_docling/main.py new file mode 100644 index 000000000..e2417b8f3 --- /dev/null +++ b/examples/granite_docling/main.py @@ -0,0 +1,34 @@ +from llama_cpp import Llama +from llama_cpp.llama_chat_format import GraniteDoclingChatHandler + + +chat_handler = GraniteDoclingChatHandler.from_pretrained( + repo_id="ggml-org/granite-docling-258M-GGUF", + filename="mmproj*Q8_0*", +) +llama = Llama.from_pretrained( + repo_id="ggml-org/granite-docling-258M-GGUF", + filename="granite*Q8_0*", + chat_handler=chat_handler, + n_ctx=8192, +) +response = llama.create_chat_completion( + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": "https://huggingface.co/spaces/ibm-granite/granite-docling-258m-demo/resolve/main/data/images/new_arxiv.png"}, + {"type": "text", "text": "Convert this page to docling."}, + ], + } + ], + stream=True, +) + +for chunk in response: + delta = chunk["choices"][0]["delta"] + if "content" not in delta: + continue + print(delta["content"], end="", flush=True) + +print() diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 
f738ab9bb..f127df2f2 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3519,6 +3519,37 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class GraniteDoclingChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = """{%- for message in messages -%} +{{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' -}} +{%- if message['content'] is string -%} +{{- message['content'] -}} +{%- else -%} +{%- for part in message['content'] -%} +{%- if part['type'] == 'text' -%} +{{- part['text'] -}} +{%- elif part['type'] == 'image_url' -%} +{%- if part['image_url'] is string %} +{{- part['image_url'] }} +{%- elif part['image_url'] is mapping -%} +{{- part['image_url']['url'] -}} +{%- endif -%} +{%- endif -%} +{%- endfor -%} +{%- endif -%} +{{- '<|end_of_text|> +' -}} +{%- endfor -%} +{%- if add_generation_prompt -%} +{{- '<|start_of_role|>assistant' -}} +{%- if controls -%}{{- ' ' + controls | tojson() -}}{%- endif -%} +{{- '<|end_of_role|>' -}} +{%- endif -%} +""" + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, From 2e3dd38129f0e916aebc8189a7c1c82de3559a53 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 12:56:28 -0500 Subject: [PATCH 3/4] feat: add `special` argument needed to make Granite-Docling useful --- examples/granite_docling/main.py | 1 + llama_cpp/llama.py | 65 ++++++++++++++++++++++++-------- llama_cpp/llama_chat_format.py | 3 ++ 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/examples/granite_docling/main.py b/examples/granite_docling/main.py index e2417b8f3..c13318c42 100644 --- a/examples/granite_docling/main.py +++ b/examples/granite_docling/main.py @@ -23,6 +23,7 @@ } ], stream=True, + special=True, ) for chunk in response: diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 18d8bc66d..ecc0e49f6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1161,6 +1161,7 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: @@ -1352,13 +1353,17 @@ def logit_bias_processor( grammar=grammar, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + all_text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -1385,6 +1390,7 @@ def logit_bias_processor( remaining_text = self.detokenize( remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) remaining_length = len(remaining_text) @@ -1412,6 +1418,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) # Check if stop sequence is in the token @@ -1423,12 +1430,14 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") text_offset = len(prompt) + len( self.detokenize( 
completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") ) token_offset = len(prompt_tokens) + returned_tokens @@ -1441,7 +1450,7 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode( + self.detokenize([i], special=special).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -1453,6 +1462,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], @@ -1471,6 +1481,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ).decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, @@ -1487,6 +1498,7 @@ def logit_bias_processor( remaining_tokens[:i], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ts = bs.decode("utf-8") decode_success = True @@ -1522,14 +1534,18 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens, special=special + ) finish_reason = "stop" if self.verbose: @@ -1540,6 +1556,7 @@ def logit_bias_processor( remaining_text = self.detokenize( remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) any_stop = [s for s in stop_sequences if s in remaining_text] if len(any_stop) > 0: @@ -1553,6 +1570,7 @@ def logit_bias_processor( self.detokenize( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) @@ -1561,13 +1579,16 @@ def logit_bias_processor( if token == bos_token_id: continue token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore" + "utf-8", + errors="ignore", + special=special, ) text_offset = len(prompt) + len( self.detokenize( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + special=special, ) ) token_offset = len(prompt_tokens) + returned_tokens - 1 @@ -1580,13 +1601,18 @@ def logit_bias_processor( ) ) top_logprob = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + self.detokenize([i]).decode( + "utf-8", errors="ignore", special=special + ): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + self.detokenize( + [token], + special=special, + ).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1594,7 +1620,7 @@ def logit_bias_processor( } if token_end_position >= end: - last_text = self.detokenize([token]) + last_text = self.detokenize([token], special=special) if token_end_position == end - 1: break returned_tokens += 1 @@ -1623,7 +1649,7 @@ def logit_bias_processor( "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( + "text": self.detokenize([token], special=special).decode( "utf-8", 
errors="ignore" ), "index": 0, @@ -1687,7 +1713,7 @@ def logit_bias_processor( all_token_strs = [ self.detokenize([token], prev_tokens=all_tokens[:i]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", special=special ) for i, token in enumerate(all_tokens) ] @@ -1702,7 +1728,7 @@ def logit_bias_processor( text_offset + len( self.detokenize(all_tokens[:idx]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", special=special ) ) ) @@ -1714,9 +1740,9 @@ def logit_bias_processor( ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { - self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( - "utf-8", errors="ignore" - ): logprob + self.detokenize( + [i], prev_tokens=all_tokens[:idx], special=special + ).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ -1781,6 +1807,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1810,6 +1837,7 @@ def create_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + special: Include special tokens in output. Raises: ValueError: If the requested tokens exceed the context window. @@ -1844,6 +1872,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1878,6 +1907,7 @@ def __call__( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[int, float]] = None, + special: bool = False, ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1907,6 +1937,7 @@ def __call__( logits_processor: A list of logits processors to use. grammar: A grammar to use for constrained sampling. logit_bias: A logit bias to use. + special: Include special tokens in output. Raises: ValueError: If the requested tokens exceed the context window. @@ -1941,6 +1972,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) def create_chat_completion( @@ -1973,6 +2005,7 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2005,6 +2038,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + special: Include special tokens in output. Returns: Generated chat completion or a stream of chat completion chunks. 
@@ -2044,6 +2078,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f127df2f2..97748bdeb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -102,6 +102,7 @@ def __call__( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -2798,6 +2799,7 @@ def __call__( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3018,6 +3020,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + special=special, ) if tool is not None: From 8790ce6c01edb2d3fc0f4d3680a2524b08978e0f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 4 Jan 2026 13:57:44 -0500 Subject: [PATCH 4/4] feat: add special to all formatters/completers --- llama_cpp/llama_chat_format.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 97748bdeb..23d5b9427 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -589,6 +589,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -691,6 +692,7 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + special=special, ) if tool is not None: tool_name = tool["function"]["name"] @@ -1426,6 +1428,7 @@ def functionary_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" @@ -1632,6 +1635,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore @@ -1712,6 +1716,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): mirostat_eta=mirostat_eta, model=model, logits_processor=logits_processor, + special=special, ) # type: ignore assert "usage" in completion @@ -1785,6 +1790,7 @@ def functionary_v1_v2_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" @@ -2001,6 +2007,7 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) if stream is False: completion_or_completion_chunks["choices"][0]["text"] = ( @@ -2064,6 +2071,7 @@ def create_completion(prompt, stop, grammar): model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ), ) @@ -3582,6 +3590,7 @@ def chatml_function_calling( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + special: bool = False, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -3712,6 +3721,7 @@ def chatml_function_calling( logits_processor=logits_processor, grammar=grammar, logprobs=top_logprobs if logprobs else None, + special=special, ), stream=stream, ) @@ -3764,6 +3774,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) return _convert_completion_to_chat_function( tool_name, completion_or_chunks, stream @@ -3810,6 +3821,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( initial_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ) completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore text = completion["choices"][0]["text"] @@ -3838,6 +3850,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( follow_up_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ), stream=stream, ) @@ -3883,6 +3896,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=grammar, + special=special, ) completion_or_chunks = cast( llama_types.CreateCompletionResponse, completion_or_chunks @@ -3914,6 +3928,7 @@ def chatml_function_calling( grammar=llama_grammar.LlamaGrammar.from_string( follow_up_gbnf_tool_grammar, verbose=llama.verbose ), + special=special, ) response = cast(llama_types.CreateCompletionResponse, response)
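For reference, a minimal usage sketch of the two changes from PATCH 1/4 that callers are most likely to notice: the flash_attn parameter becoming Optional[bool] (mapped onto the new LLAMA_FLASH_ATTN_TYPE_* enum) and the replacement of the llama_kv_self_* bindings with the llama_memory_* API. This is an illustration, not part of the patches: the model path is a placeholder, and reaching the context through the private _ctx.ctx attribute simply mirrors what llama.py itself does in Llama.embed().

import llama_cpp
from llama_cpp import Llama

# flash_attn is now Optional[bool]:
#   None  -> LLAMA_FLASH_ATTN_TYPE_AUTO (new default)
#   True  -> LLAMA_FLASH_ATTN_TYPE_ENABLED
#   False -> LLAMA_FLASH_ATTN_TYPE_DISABLED
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    flash_attn=None,
)

# The deprecated llama_kv_self_* bindings are removed; clearing the
# per-context memory now goes through llama_get_memory()/llama_memory_clear(),
# the same calls Llama.embed() uses after this update.
mem = llama_cpp.llama_get_memory(llm._ctx.ctx)
if mem is not None:
    llama_cpp.llama_memory_clear(mem, True)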
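Likewise, a minimal sketch of the special argument added in PATCH 3/4 and threaded through the chat handlers in PATCH 4/4. The model path is again a placeholder; per the commit message the flag is what makes Granite-Docling's output useful (presumably because its markup is emitted as special tokens), but it is accepted for any model by create_completion(), create_chat_completion(), and the registered chat handlers.

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# Default behaviour: special tokens are not rendered in the returned text.
plain = llm.create_completion("Hello", max_tokens=16)

# With special=True the detokenized output keeps special tokens; the same
# keyword is forwarded by create_chat_completion() and the chat handlers.
raw = llm.create_completion("Hello", max_tokens=16, special=True)

print(plain["choices"][0]["text"])
print(raw["choices"][0]["text"])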