3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+- feat: Add chat completion handler for Granite-Docling (and SmolVLM)
+- feat: Add `special` argument to keep special tokens in chat completion output
+
 ## [0.3.16]

 - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
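The new `special` flag controls whether special tokens survive detokenization in chat completion output. A minimal sketch of how the argument is used (the model path and prompt below are placeholders for illustration, not part of this PR):

```python
from llama_cpp import Llama

# Hypothetical local model; any chat-capable GGUF works for this sketch.
llama = Llama(model_path="./model.gguf", n_ctx=4096)

response = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    special=True,  # keep special tokens (e.g. end-of-text markers) in the text
)
print(response["choices"][0]["message"]["content"])
```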
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -153,6 +153,15 @@ if (LLAMA_BUILD)
         add_compile_definitions(GGML_USE_METAL)
     endif()

+    # Set version for mtmd (required by upstream CMakeLists.txt)
+    # NOTE: This is a workaround for mtmd build requirements.
+    # Version is set to 0.0.0 for local builds. If upstream adds version
+    # compatibility checks, this may need to match llama.cpp version.
+    if (NOT DEFINED LLAMA_BUILD_NUMBER)
+        set(LLAMA_BUILD_NUMBER 0)
+    endif()
+    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
     # Building llava
     add_subdirectory(vendor/llama.cpp/tools/mtmd)
35 changes: 35 additions & 0 deletions examples/granite_docling/main.py
@@ -0,0 +1,35 @@
from llama_cpp import Llama
from llama_cpp.llama_chat_format import GraniteDoclingChatHandler


chat_handler = GraniteDoclingChatHandler.from_pretrained(
    repo_id="ggml-org/granite-docling-258M-GGUF",
    filename="mmproj*Q8_0*",
)
llama = Llama.from_pretrained(
    repo_id="ggml-org/granite-docling-258M-GGUF",
    filename="granite*Q8_0*",
    chat_handler=chat_handler,
    n_ctx=8192,
)
response = llama.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": "https://huggingface.co/spaces/ibm-granite/granite-docling-258m-demo/resolve/main/data/images/new_arxiv.png"},
                {"type": "text", "text": "Convert this page to docling."},
            ],
        }
    ],
    stream=True,
    special=True,
)

for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" not in delta:
        continue
    print(delta["content"], end="", flush=True)

print()
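For reference, the same request without streaming returns the complete Docling output in a single response object. A sketch reusing the `llama` handle and arguments from the example above:

```python
# Non-streaming variant: one response object instead of a chunk iterator.
result = llama.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": "https://huggingface.co/spaces/ibm-granite/granite-docling-258m-demo/resolve/main/data/images/new_arxiv.png"},
                {"type": "text", "text": "Convert this page to docling."},
            ],
        }
    ],
    special=True,
)
print(result["choices"][0]["message"]["content"])
```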
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.3.16"
+__version__ = "0.4.0"