From c2e00dbbe421c8903e3c5204e18571cff19b35e0 Mon Sep 17 00:00:00 2001 From: Varun Khare Date: Wed, 16 Jul 2025 00:48:28 +0530 Subject: [PATCH 1/7] initiate qwen 1.7 Agent scripts Signed-off-by: Varun Khare --- models/Qwen3-1.7B/demo_qwen.py | 282 +++++++++++++++++++++++++++++++++ models/Qwen3-1.7B/tools.py | 202 +++++++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100755 models/Qwen3-1.7B/demo_qwen.py create mode 100644 models/Qwen3-1.7B/tools.py diff --git a/models/Qwen3-1.7B/demo_qwen.py b/models/Qwen3-1.7B/demo_qwen.py new file mode 100755 index 00000000..a8e8df40 --- /dev/null +++ b/models/Qwen3-1.7B/demo_qwen.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +#-*- coding: utf-8 -*- + +import json +import datetime +import torch +import re +from typing import Tuple, List, Optional +from transformers import AutoModelForCausalLM, AutoTokenizer +from tools import tools, tool_schema + +# Load Qwen3 1.7B 4-bit model and tokenizer +model_id = "Qwen/Qwen3-1.7B" + +TOOL_CALL_START_TOKEN = "" +TOOL_CALL_END_TOKEN = "" +TOOL_RESPONSE_START_TOKEN = "" +TOOL_RESPONSE_END_TOKEN = "" +INITIAL_PROMPT = f"""You are a helpful assistant. When you need to use tools, format your response with the tool call between {TOOL_CALL_START_TOKEN} and {TOOL_CALL_END_TOKEN} tokens. +Use this format: {TOOL_CALL_START_TOKEN}[function_name(param="value")]{TOOL_CALL_END_TOKEN}. Call only one tool at a time and sequentially execute them.""" + +initial_message_block = [ + { + "role": "system", + "content": INITIAL_PROMPT + } +] + +from mlx_lm import load, generate + +model, tokenizer = load("mlx-community/Qwen3-1.7B-4bit") +# from transformers import BitsAndBytesConfig + +# Configure 4-bit quantization +# quantization_config = BitsAndBytesConfig( +# load_in_4bit=True, +# bnb_4bit_compute_dtype=torch.bfloat16, +# bnb_4bit_use_double_quant=True, +# bnb_4bit_quant_type="nf4" +# ) + +# model = AutoModelForCausalLM.from_pretrained( +# model_id, +# device_map="auto", +# torch_dtype=torch.bfloat16, +# # quantization_config=quantization_config, +# trust_remote_code=True, +# ) +# tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + +# Ensure tokenizer has necessary tokens +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +print(f"✓ {model_id} model loaded successfully!") + + +def execute_function_call(function_name: str, arguments: dict) -> dict: + """Execute a function call and return the result""" + if function_name not in tools: + return {"error": f"Function {function_name} not found"} + + try: + function = tools[function_name] # Direct access to function object + result = function(**arguments) + return result + except Exception as e: + return {"error": f"Error executing {function_name}: {str(e)}"} + +def format_tool_response(result: dict) -> str: + """Format tool execution result using token-based format""" + result_json = json.dumps(result) + return f"<|tool_response_start|>{result_json}<|tool_response_end|>" + +def execute_tool_call_with_response(function_name: str, arguments: dict) -> tuple: + """Execute a function call and return both result and formatted response""" + result = execute_function_call(function_name, arguments) + formatted_response = format_tool_response(result) + return result, formatted_response + +def parse_tool_calls_from_response(response_text: str) -> list: + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + + # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} + 
json_tool_pattern = r'\s*({.*?})\s*' + json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) + + for json_str in json_matches: + try: + tool_data = json.loads(json_str) + func_name = tool_data.get("name") + arguments = tool_data.get("arguments", {}) + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + print(f"✓ Parsed JSON tool call: {func_name}({arguments})") + except json.JSONDecodeError: + print(f"⚠ Failed to parse JSON tool call: {json_str}") + + return tool_calls + +def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: + """Generate text using the loaded model with multi-turn conversation support""" + # Use chat template with tools for multi-turn conversations + print("---"*10) + print("Conversation Messages:") + print(json.dumps(conversation_messages, indent=4)) + print("---"*10) + prompt = tokenizer.apply_chat_template( + conversation_messages, + tools=tool_schema, + add_generation_prompt=True, + tokenize=False + ) + + response = generate(model, tokenizer, prompt) + # Tokenize the prompt + # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device) + + # # Generate response with parameters optimized for tool calling + # with torch.no_grad(): + # output = model.generate( + # input_ids, + # do_sample=True, + # temperature=0.3, # Good balance for Qwen3 + # top_p=0.8, # Nucleus sampling for focused responses + # max_new_tokens=max_new_tokens, + # pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id, + # eos_token_id=tokenizer.eos_token_id, + # repetition_penalty=1.1, # Prevent repetition + # ) + + # response = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True) + return response.strip() + +def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: int) -> list: + """Handle requests that may require multiple tool calls and back and forth""" + step_results = [] + conversation_messages = None # Will hold the full conversation chain + tool_context = {} # Store results from previous tool calls + + for step in range(max_steps): + print(f"\n--- Step {step + 1} ---") + if step == 0: + conversation_messages = initial_message_block.copy() + conversation_messages.append({ + "role": "user", + "content": user_prompt + }) + else: + conversation_messages.append({ + "role": "system", + "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." 
+ }) + # Generate response + try: + response = generate_with_model(conversation_messages, max_new_tokens=max_new_tokens) + print(f"Model Response: {response}") + + # Parse and execute tool calls + tool_calls = parse_tool_calls_from_response(response) + tool_results = [] + + if tool_calls: + print(f"Executing {len(tool_calls)} tool call(s):") + for call in tool_calls: + func_name = call["function_name"] + arguments = call["arguments"] + + print(f" • {func_name}({arguments})") + result, formatted_response = execute_tool_call_with_response(func_name, arguments) + + # Store important results for future reference + if func_name == "get_current_location" and "location" in result: + tool_context["location"] = result["location"] + + tool_results.append({ + "function": func_name, + "arguments": arguments, + "result": result + }) + print(f" Result: {json.dumps(result, indent=4)}") + + # Add assistant response to conversation + conversation_messages.append({ + "role": "assistant", + "content": response + }) + + # Add tool results to conversation as function messages + for tool_result in tool_results: + if not tool_result["result"].get("error"): + conversation_messages.append({ + "role": "system", + "content": f"The result of the tool {tool_result['function']} is: {TOOL_RESPONSE_START_TOKEN}{json.dumps(tool_result['result'])}{TOOL_RESPONSE_END_TOKEN}" + }) + + # Store step result + step_result = { + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "response": response, + "tool_calls": tool_calls, + "tool_results": tool_results, + "has_errors": any("error" in result.get("result", {}) for result in tool_results), + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() + } + step_results.append(step_result) + + # Check if all tool calls were successful + if step_result["has_errors"]: + print(f"⚠ Stopping due to tool execution errors") + break + + # Simple continuation logic: if no tools were called, we're done + if not tool_calls: + print(f"✓ Completed after {step + 1} step(s) - no tool calls needed") + break + + # If we've reached max steps, stop + if step >= max_steps - 1: + print(f"✓ Reached maximum steps ({max_steps})") + break + + # If tools were executed, continue to next step to see if model wants to do more + print(f"✓ Step {step + 1} completed with {len(tool_calls)} tool call(s) - continuing...") + + except Exception as e: + print(f"Error in step {step + 1}: {e}") + step_results.append({ + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "error": str(e), + "response": None, + "tool_calls": [], + "tool_results": [], + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() if conversation_messages else [] + }) + break + + return step_results + +def run_tool_calling_demo(): + """Run tool calling demonstration""" + print("=== Qwen3 1.7B Tool Calling Demo ===\n") + print(f"Model: {model_id}") + print(f"Available tools: {list(tools.keys())}") + + demo_prompts = [ + "What's the weather here today?", + "Calculate 15 * 23", + "What time is it in JST timezone?", + "Where am I located?", + "Get my location and check the weather there" + ] + + for i, user_prompt in enumerate(demo_prompts, 1): + print(f"\nDemo {i}: {user_prompt}") + print("-" * 60) + step_results = handle_multi_step_request(user_prompt, max_steps=4, max_new_tokens=400) + # Show final summary + print(f"\n📋 Multi-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = 
step_result.get("tool_calls", []) + if tool_calls: + print(f" Step {step_num}: {len(tool_calls)} tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(f" ✓ {func_name}") + print("\n" + "="*60) + + +if __name__ == "__main__": + # Run the regular demo first + run_tool_calling_demo() \ No newline at end of file diff --git a/models/Qwen3-1.7B/tools.py b/models/Qwen3-1.7B/tools.py new file mode 100644 index 00000000..acf1ca04 --- /dev/null +++ b/models/Qwen3-1.7B/tools.py @@ -0,0 +1,202 @@ + +import datetime +import inspect +from typing import get_origin, get_args, Union + +# Initialize empty tool schema and tools mapping +tool_schema = [] +tools = {} + +def tool(func_or_description=None, **param_descriptions): + """ + Decorator to automatically generate tool schema from function signature and add to registry. + + Can be used both with and without parentheses: + @tool + def my_function(): ... + + @tool() + def my_function(): ... + + @tool("Custom description") + def my_function(): ... + + Args: + func_or_description: Either a function (when used as @tool) or description string (when used as @tool()) + **param_descriptions: Optional parameter descriptions as keyword arguments. + """ + def create_tool_definition(func, description=None): + """Helper function to create tool definition from function""" + # Get function name + func_name = func.__name__ + + # Get description from parameter or docstring + func_description = description or (func.__doc__ or f"Execute {func_name}").strip() + + # Get function signature + sig = inspect.signature(func) + + # Build parameters schema + properties = {} + required = [] + + for param_name, param in sig.parameters.items(): + # Skip *args and **kwargs + if param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD): + continue + + # Determine parameter type + param_type = "string" # default + + if param.annotation != param.empty: + annotation = param.annotation + + # Handle Union types (like Optional[str]) + if get_origin(annotation) is Union: + args = get_args(annotation) + # Remove NoneType for Optional types + non_none_args = [arg for arg in args if arg is not type(None)] + if non_none_args: + annotation = non_none_args[0] + + # Map Python types to JSON schema types + if annotation in (str, type(str)): + param_type = "string" + elif annotation in (int, type(int)): + param_type = "integer" + elif annotation in (float, type(float)): + param_type = "number" + elif annotation in (bool, type(bool)): + param_type = "boolean" + elif annotation in (list, type(list)): + param_type = "array" + elif annotation in (dict, type(dict)): + param_type = "object" + + # Build parameter schema + param_schema = { + "type": param_type, + "description": param_descriptions.get(param_name, f"The {param_name} parameter") + } + + # Check if parameter has default value + if param.default != param.empty: + param_schema["default"] = param.default + else: + required.append(param_name) + properties[param_name] = param_schema + + # Build complete tool definition + tool_definition = { + "type": "function", + "function": { + "name": func_name, + "description": func_description, + "parameters": { + "type": "object", + "properties": properties, + "required": required + } + } + } + + # Add to registry + tool_schema.append(tool_definition) + tools[func_name] = func + return func + + # Case 1: Used as @tool (without parentheses) + # The function is passed as the first argument + if callable(func_or_description) and hasattr(func_or_description, '__name__'): + return 
create_tool_definition(func_or_description) + + # Case 2: Used as @tool() or @tool("description") (with parentheses) + # Return a decorator function + else: + description = func_or_description if isinstance(func_or_description, str) else None + + def decorator(func): + return create_tool_definition(func, description) + + return decorator + +# Define example tools/functions +@tool( + description="Get weather information for a specific location", + location="The location to get weather for", + unit="Temperature unit (celsius or fahrenheit)" +) +def get_weather(location: str, unit: str = "celsius") -> dict: + """Get current weather for a location""" + + weather_data = { + "New York": {"temp": 22, "condition": "sunny", "humidity": 65}, + "London": {"temp": 15, "condition": "cloudy", "humidity": 78}, + "Tokyo": {"temp": 28, "condition": "rainy", "humidity": 85}, + "Paris": {"temp": 18, "condition": "partly cloudy", "humidity": 70} + } + + location_key = next((key for key in weather_data.keys() if key.lower() in location.lower()), "Unknown") + + if location_key == "Unknown": + return {"error": f"Weather data not available for {location}"} + + data = weather_data[location_key].copy() + if unit == "fahrenheit": + data["temp"] = round(data["temp"] * 9/5 + 32, 1) + data["unit"] = "°F" + else: + data["unit"] = "°C" + + return { + "location": location_key, + "temperature": data["temp"], + "condition": data["condition"], + "humidity": data["humidity"], + "unit": data["unit"] + } + +@tool( + expression="Mathematical expression to calculate (e.g., '2+2', '15*23')" +) +def calculate_math(expression: str) -> dict: + """Calculate a mathematical expression safely""" + try: + allowed_chars = set('0123456789+-*/.() ') + if not all(c in allowed_chars for c in expression): + return {"error": "Expression contains invalid characters"} + + result = eval(expression) + return {"expression": expression, "result": result} + except Exception as e: + return {"error": f"Calculation error: {str(e)}"} + +@tool( + timezone="Timezone (UTC, EST, PST, JST, CET)" +) +def get_current_time(timezone: str = "UTC") -> dict: + """Get current time in specified timezone""" + current_time = datetime.datetime.now() + timezone_offsets = {"UTC": 0, "EST": -5, "PST": -8, "JST": 9, "CET": 1} + + offset = timezone_offsets.get(timezone.upper(), 0) + adjusted_time = current_time + datetime.timedelta(hours=offset) + + return { + "timezone": timezone.upper(), + "time": adjusted_time.strftime("%Y-%m-%d %H:%M:%S"), + "day_of_week": adjusted_time.strftime("%A") + } + +@tool +def get_current_location() -> dict: + """ + Get the real location and timezone of the user. The user has given permission to share his location via this tool. + Use this function when the user didn't provide an explicit location. 
Default to his location + """ + return { + "location": "Tokyo", + "country": "Japan", + "coordinates": {"latitude": 35.6762, "longitude": 139.6503}, + "timezone": "JST" + } \ No newline at end of file From c7c425c6150012b0e9a7485e7e06b05f5a8d5c16 Mon Sep 17 00:00:00 2001 From: Varun Khare Date: Sun, 20 Jul 2025 23:58:13 +0000 Subject: [PATCH 2/7] add onnx tests and lfm models add tokenizer-cpp add jinja template for qwen and dict support for tokenizer:from_json Signed-off-by: Varun Khare --- .gitignore | 3 +- .gitmodules | 3 + coreruntime/CMakeLists.txt | 13 +- coreruntime/build.py | 10 +- coreruntime/delitepy/library_stubs/setup.py | 7 +- .../src_template/delitepy/__init__.py | 3 +- .../delitepy/tokenizers/__init__.py | 152 ++++++ .../scripts/render_jinja2_templates.py | 74 --- coreruntime/nimblenet/CMakeLists.txt | 1 + .../asset_manager/src/asset_manager.cpp | 4 + .../nimblenet/core_sdk/src/core_sdk.cpp | 2 + .../core_sdk/src/nimble_exec_info.cpp | 6 +- .../include/nimble_net_util.hpp | 13 +- .../include/data_variable_enums.hpp | 9 + .../include/tokenizers_data_variable.hpp | 51 ++ .../data_variable/src/data_variable.cpp | 14 + .../src/tokenizers_data_variable.cpp | 314 ++++++++++++ .../executors/onnx/src/task_onnx_model.cpp | 6 +- .../job_scheduler/include/internet_job.hpp | 2 + .../src/resource_downloader.cpp | 4 + .../resource_loader/src/resource_loader.cpp | 2 + .../task_manager/task/include/statements.hpp | 1 + .../task_manager/task/src/statements.cpp | 3 + .../time_manager/include/time_manager.hpp | 2 +- models/LFM2/demo_lfm.py | 297 +++++++++++ models/Qwen3-1.7B/demo_qwen.py | 266 +++++++--- models/{Qwen3-1.7B => }/tools.py | 4 +- .../qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md | 317 ++++++++++++ .../qwen_demo/qwen_modules.zip | Bin 0 -> 8017 bytes .../qwen_demo/qwen_modules/main.py | 473 ++++++++++++++++++ .../qwen_demo/qwen_modules/tools.py | 211 ++++++++ .../simulation_assets/qwen_demo/run_demo.py | 104 ++++ .../simulation_assets/tokenizer_example.py | 298 +++++++++++ .../simulation_tests/test_simulator_script.py | 83 ++- third_party/README.md | 23 + third_party/tokenizers-cpp | 1 + 36 files changed, 2575 insertions(+), 201 deletions(-) create mode 100644 .gitmodules create mode 100644 coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py delete mode 100755 coreruntime/delitepy/scripts/render_jinja2_templates.py create mode 100644 coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp create mode 100644 coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp create mode 100755 models/LFM2/demo_lfm.py rename models/{Qwen3-1.7B => }/tools.py (97%) create mode 100644 nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md create mode 100644 nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip create mode 100644 nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py create mode 100644 nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py create mode 100644 nimblenet_py/simulation_assets/qwen_demo/run_demo.py create mode 100644 nimblenet_py/simulation_assets/tokenizer_example.py create mode 160000 third_party/tokenizers-cpp diff --git a/.gitignore b/.gitignore index d6b88489..52cea8c3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ third_party/runtime/ !third_party/runtime/CMakeLists.txt __pycache__/ -.pytest_cache/ \ No newline at end of file +.pytest_cache/ +nimblenet_py/simulation_tests/NimbleSDK diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..3df230aa --- 
/dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/tokenizers-cpp"] + path = third_party/tokenizers-cpp + url = https://github.com/mlc-ai/tokenizers-cpp.git diff --git a/coreruntime/CMakeLists.txt b/coreruntime/CMakeLists.txt index 4fa47fe2..d340d750 100644 --- a/coreruntime/CMakeLists.txt +++ b/coreruntime/CMakeLists.txt @@ -38,7 +38,7 @@ endif() # set(DEBUGFLAGS " -Werror -Wno-write-strings -Weffc++ -Wall -Wuninitialized -Wnon-virtual-dtor -Wshadow -Werror=format-security -Wunused-member-function -Wunused-function ") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=switch -Werror=return-type -Werror=implicit-fallthrough \ - -Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-member-function -Werror=unused-function -Werror=writable-strings") + -Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-function -Werror=write-strings") # string(CONCAT RELEASEFLAGS ${DEBUGFLAGS} " -fstack-protector-strong -ffunction-sections -fdata-sections ") # # @@ -84,6 +84,7 @@ add_subdirectory(nimblenet) add_subdirectory(delitepy) add_subdirectory("../third_party/json" "${CMAKE_BINARY_DIR}/third_party/json") add_subdirectory("../third_party/SPSCQueue" "${CMAKE_BINARY_DIR}/third_party/SPSCQueue") +add_subdirectory("../third_party/tokenizers-cpp" "${CMAKE_BINARY_DIR}/third_party/tokenizers-cpp") if (GENAI) add_subdirectory("../third_party/miniz" "${CMAKE_BINARY_DIR}/third_party/miniz") endif() @@ -182,7 +183,13 @@ else() target_compile_definitions(nimblenet PUBLIC -DIOS_PLATFORM="mac") add_subdirectory(platform/unix) # produces ${CLIENT_INCLUDES} add_subdirectory("../third_party/runtime" "${CMAKE_BINARY_DIR}/third_party/runtime") # -> produces ${BACKEND_LIBS} ${BACKEND_DIR} ${BACKED_INCLUDES} - target_link_libraries(nimblenet ${VISIBILITY} curl) + # Add conda environment library directory to search path + if(DEFINED ENV{CONDA_PREFIX}) + link_directories($ENV{CONDA_PREFIX}/lib) + target_link_libraries(nimblenet ${VISIBILITY} $ENV{CONDA_PREFIX}/lib/libcurl.so) + else() + target_link_libraries(nimblenet ${VISIBILITY} curl) + endif() # target_link_libraries(nimblenet ${VISIBILITY} clientlib) endif() @@ -194,7 +201,7 @@ if(NOT ANDROID_ABI) list(APPEND ADDITIONAL_LIBS ZLIB::ZLIB) endif() -target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS}) +target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} tokenizers_cpp ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS}) if (GENAI) target_link_libraries(nimblenet PRIVATE miniz) endif() diff --git a/coreruntime/build.py b/coreruntime/build.py index 68e4ac7a..56ea6717 100755 --- a/coreruntime/build.py +++ b/coreruntime/build.py @@ -51,7 +51,7 @@ def main(): if "-DCMAKE_BUILD_TYPE=Release" in cmake_args: STRIP = 1 - CMAKE_CXX_FLAGS = "" + CMAKE_CXX_FLAGS = "-Wno-unused-member-function -Wno-implicit-fallthrough " if args.testing: cmake_args += " -DTESTING=1 " @@ -61,16 +61,20 @@ def main(): COMMON_FLAGS = ( f"-B{os.getcwd()}/build/ " f"{cmake_args} " + "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 " + "-DCMAKE_CXX_FLAGS_RELEASE='-Wno-unused-function -Wno-implicit-fallthrough -DNDEBUG -O3' " + "-DCMAKE_CXX_FLAGS_DEBUG='-Wno-unused-function -Wno-implicit-fallthrough -g' " ) # Determine compiler settings based on architecture if arch == "arm": cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DCMAKE_CXX_COMPILER=g++ -DMACOS=1 
-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" elif arch == "x86_64": - CMAKE_CXX_FLAGS += " -stdlib=libstdc++ " + # Replace clang-specific flags with g++ compatible ones + CMAKE_CXX_FLAGS = CMAKE_CXX_FLAGS.replace("-Wno-unused-member-function", "-Wno-unused-function") cmake_command = ( f"cmake CMakeLists.txt {COMMON_FLAGS} " - f"-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" + f"-DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" ) else: cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DMACOS=1" diff --git a/coreruntime/delitepy/library_stubs/setup.py b/coreruntime/delitepy/library_stubs/setup.py index 25dc63b5..47cfc2ec 100644 --- a/coreruntime/delitepy/library_stubs/setup.py +++ b/coreruntime/delitepy/library_stubs/setup.py @@ -26,12 +26,7 @@ def render_src_template() -> None: check=True, ) subprocess.run( - [ - f"{delitepy_dir}/scripts/render_jinja2_templates.py", - f"{library_stubs_dir}/src_template", - f"{library_stubs_dir}/src_gen", - coreruntime_dir, - ], + ["cp", "-r", f"{library_stubs_dir}/src_template", f"{library_stubs_dir}/src_gen"], check=True, ) diff --git a/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py b/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py index d04a2c91..1b47e828 100644 --- a/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py +++ b/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -"""Package delitepy containing modules nimblenet and ne_re.""" +"""Package delitepy containing modules nimblenet, ne_re, and tokenizers.""" from delitepy.nimblenet import * from delitepy.ne_re import * +from delitepy.tokenizers import * diff --git a/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py b/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py new file mode 100644 index 00000000..d0dd7b6f --- /dev/null +++ b/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors +# +# SPDX-License-Identifier: Apache-2.0 + +"""Package delitepy.tokenizers for tokenizer functionality.""" + +from typing import List, Union +from delitepy.nimblenet.tensor import Tensor + +def from_pretrained(model_name_or_path: str) -> str: + """Load a pre-trained tokenizer from HuggingFace Hub or local file. + + Args: + model_name_or_path: Path to tokenizer.json file or HuggingFace model name + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> tokenizer = tokenizers.from_pretrained("/path/to/tokenizer.json") + """ + pass + +def from_file(file_path: str) -> str: + """Load a tokenizer from a file path. + + Args: + file_path: Path to tokenizer.json or .model file + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_file("tokenizer.json") + >>> tokenizer = tokenizers.from_file("model.spm") + """ + pass + +def from_json(json_str: str) -> str: + """Create a tokenizer from a JSON string. 
+ + Args: + json_str: JSON string containing tokenizer configuration + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> json_config = '{"model": {...}, "normalizer": {...}}' + >>> tokenizer = tokenizers.from_json(json_config) + """ + pass + +def from_sentencepiece(model_path: str) -> str: + """Load a SentencePiece tokenizer from a .model file. + + Args: + model_path: Path to SentencePiece .model file + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_sentencepiece("tokenizer.model") + """ + pass + +def encode(tokenizer: str, text: str) -> Tensor: + """Encode text into token IDs. + + Args: + tokenizer: Tokenizer handle from from_pretrained/from_file/etc. + text: Text to encode + + Returns: + Tensor containing token IDs (INT32) + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_ids = tokenizers.encode(tokenizer, "Hello world!") + >>> print(token_ids.shape) # [num_tokens] + """ + pass + +def decode(tokenizer: str, token_ids: Tensor) -> str: + """Decode token IDs back to text. + + Args: + tokenizer: Tokenizer handle + token_ids: Tensor containing token IDs (INT32) + + Returns: + Decoded text string + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_ids = tokenizers.encode(tokenizer, "Hello world!") + >>> text = tokenizers.decode(tokenizer, token_ids) + >>> print(text) # "Hello world!" + """ + pass + +def get_vocab_size(tokenizer: str) -> int: + """Get the vocabulary size of the tokenizer. + + Args: + tokenizer: Tokenizer handle + + Returns: + Size of the vocabulary + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> vocab_size = tokenizers.get_vocab_size(tokenizer) + >>> print(vocab_size) # 30522 + """ + pass + +def token_to_id(tokenizer: str, token: str) -> int: + """Convert a token string to its ID. + + Args: + tokenizer: Tokenizer handle + token: Token string + + Returns: + Token ID, or -1 if token not found + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_id = tokenizers.token_to_id(tokenizer, "[CLS]") + >>> print(token_id) # 101 + """ + pass + +def id_to_token(tokenizer: str, token_id: int) -> str: + """Convert a token ID to its string representation. 
+ + Args: + tokenizer: Tokenizer handle + token_id: Token ID + + Returns: + Token string, or empty string if ID not found + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token = tokenizers.id_to_token(tokenizer, 101) + >>> print(token) # "[CLS]" + """ + pass \ No newline at end of file diff --git a/coreruntime/delitepy/scripts/render_jinja2_templates.py b/coreruntime/delitepy/scripts/render_jinja2_templates.py deleted file mode 100755 index 06bcb53d..00000000 --- a/coreruntime/delitepy/scripts/render_jinja2_templates.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import sys -from pathlib import Path - -from jinja2 import Template - - -def extract_delitepy_doc_blocks(infile_path: str): - block_begin_marker = "DELITEPY_DOC_BLOCK_BEGIN" - block_end_marker = "DELITEPY_DOC_BLOCK_END" - - inside_doc_block = False - - with open(infile_path, "r") as infile: - for line in infile: - stripped_line = line.strip() - - if stripped_line.startswith(block_begin_marker): - inside_doc_block = True - continue - - if stripped_line.startswith(block_end_marker): - inside_doc_block = False - continue - - if inside_doc_block: - yield line - - -def render_jinja2_templates(source_dir: str, target_dir: str, base_dir: str) -> None: - for root, _, file_names in os.walk(source_dir): - root_rel_path = os.path.relpath(root, source_dir) - root_out_dir = os.path.join(target_dir, root_rel_path) - os.makedirs(root_out_dir, exist_ok=True) - - for file_name in file_names: - template_path = os.path.join(root, file_name) - with open(template_path, "r") as file: - template_content = file.read() - - template = Template(template_content, keep_trailing_newline=True) - render_context = { - "extract_delitepy_doc_blocks": lambda infile_path: "".join( - extract_delitepy_doc_blocks( - str(Path(base_dir).joinpath(infile_path).resolve()), - ), - ), - } - rendered_template_content = template.render(render_context) - - rendered_template_path = os.path.join(root_out_dir, file_name) - with open(rendered_template_path, "w") as file: - file.write(rendered_template_content) - - -def main(args: list[str]) -> None: - assert len(args) == 4, "Incorrect usage." 
- - source_dir = str(Path(args[1]).resolve()) - target_dir = str(Path(args[2]).resolve()) - base_dir = str(Path(args[3]).resolve()) - - print(f"Rendering Jinja2 templates: '{source_dir}' => '{target_dir}'") - render_jinja2_templates(source_dir, target_dir, base_dir) - print(f"[done] Rendering Jinja2 templates: '{source_dir}' => '{target_dir}'") - - -if __name__ == "__main__": - main(sys.argv) diff --git a/coreruntime/nimblenet/CMakeLists.txt b/coreruntime/nimblenet/CMakeLists.txt index 004b2013..6c87561a 100644 --- a/coreruntime/nimblenet/CMakeLists.txt +++ b/coreruntime/nimblenet/CMakeLists.txt @@ -36,6 +36,7 @@ set(BASE data_variable/src/pre_processor_nimble_net_variable.cpp data_variable/src/raw_event_store_data_variable.cpp data_variable/src/regex_data_variable.cpp + data_variable/src/tokenizers_data_variable.cpp job_scheduler/src/base_job.cpp job_scheduler/src/job_scheduler.cpp job_scheduler/src/asset_download_job.cpp diff --git a/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp b/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp index a4e7ff7b..886d536e 100644 --- a/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp +++ b/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp @@ -31,6 +31,8 @@ std::string Asset::get_file_name_on_device() const { case AssetType::LLM: return name + version + rmconstants::LLMFolderName; #endif // GENAI + default: + return name + version; } } @@ -96,6 +98,8 @@ std::string get_string_from_asset_type(const AssetType& assetType) { case AssetType::LLM: return "llm"; #endif // GENAI + default: + return "unknown"; } } diff --git a/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp b/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp index 8d7d80c9..9c9af279 100644 --- a/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp +++ b/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp @@ -392,6 +392,8 @@ std::pair CoreSDK::get_cloud_config_and_update_ } case CloudConfigState::Unmodified: return {cloudConfig, deployment}; + default: + return {cloudConfig, deployment}; } } diff --git a/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp b/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp index 486d8ff4..c47d5f78 100644 --- a/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp +++ b/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp @@ -40,7 +40,7 @@ namespace detail { * than "size". If the return value is equal to "size" then the number of * addresses may have been truncated. */ -int backtrace(void* _Nonnull* _Nonnull buffer, int size); +int backtrace(void** buffer, int size); /** * [backtrace_symbols(3)](https://man7.org/linux/man-pages/man3/backtrace_symbols.3.html) @@ -50,7 +50,7 @@ int backtrace(void* _Nonnull* _Nonnull buffer, int size); * Returns a pointer to allocated memory, on error NULL is returned. It is * the responsibility of the caller to free the returned memory. */ -char* _Nullable* _Nullable backtrace_symbols(void* _Nonnull const* _Nonnull buffer, int size); +char** backtrace_symbols(void* const* buffer, int size); /** * [backtrace_symbols_fd(3)](https://man7.org/linux/man-pages/man3/backtrace_symbols_fd.3.html) @@ -58,7 +58,7 @@ char* _Nullable* _Nullable backtrace_symbols(void* _Nonnull const* _Nonnull buff * of strings that represent the backtrace and write to the file represented * by "fd". The file is written such that one line equals one void* address. 
*/ -void backtrace_symbols_fd(void* _Nonnull const* _Nonnull buffer, int size, int fd); +void backtrace_symbols_fd(void* const* buffer, int size, int fd); } // namespace detail diff --git a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp index 00f7f6e9..35b7a2ce 100644 --- a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp +++ b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp @@ -62,12 +62,13 @@ enum DATATYPE { DATAFRAME = 676, NIMBLENET_REGEX = 677, NIMBLENET_REGEX_MATCHOBJECT = 678, - CHAR_STREAM = 679, - JSON_STREAM = 680, - JSON_ARRAY = 681, - FUNCTION = 682, - CONCURRENT_EXECUTOR = 683, - EXCEPTION = 684, + NIMBLENET_TOKENIZERS = 679, + CHAR_STREAM = 680, + JSON_STREAM = 681, + JSON_ARRAY = 682, + FUNCTION = 683, + CONCURRENT_EXECUTOR = 684, + EXCEPTION = 685, UNKNOWN = 0, FLOAT = 1, BOOLEAN = 9, diff --git a/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp b/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp index 1763dc8c..73dfc6e0 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp +++ b/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp @@ -130,5 +130,14 @@ enum MemberFuncType { CLEAR_CONTEXT, ADD_CONTEXT, LIST_COMPATIBLE_LLMS, + TOKENIZERS_FROM_PRETRAINED, + TOKENIZERS_FROM_FILE, + TOKENIZERS_FROM_JSON, + TOKENIZERS_FROM_SENTENCEPIECE, + TOKENIZERS_ENCODE, + TOKENIZERS_DECODE, + TOKENIZERS_GET_VOCAB_SIZE, + TOKENIZERS_TOKEN_TO_ID, + TOKENIZERS_ID_TO_TOKEN, LASTTYPE, // should be last }; diff --git a/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp b/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp new file mode 100644 index 00000000..bc3cb243 --- /dev/null +++ b/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "data_variable.hpp" +#include "map_data_variable.hpp" +#include "tokenizers_cpp.h" +#include +#include +#include + +class TokenizersDataVariable : public DataVariable { + public: + TokenizersDataVariable(); + TokenizersDataVariable(std::unique_ptr tokenizer); + ~TokenizersDataVariable() override = default; + + int get_containerType() const override { return CONTAINERTYPE::SINGLE; } + bool get_bool() override { return true; } + int get_dataType_enum() const override { return DATATYPE::NIMBLENET_TOKENIZERS; } + nlohmann::json to_json() const override { return "[Tokenizers]"; } + std::string print() override { return "[Tokenizers]"; } + + OpReturnType call_function(int memberFuncIndex, const std::vector& arguments, CallStack& stack) override; + + // Static factory methods for creating tokenizer instances + static OpReturnType from_pretrained(const std::vector& arguments, CallStack& stack); + static OpReturnType from_file(const std::vector& arguments, CallStack& stack); + static OpReturnType from_json(const std::vector& arguments, CallStack& stack); + static OpReturnType from_sentencepiece(const std::vector& arguments, CallStack& stack); + + // Instance member methods that operate on the tokenizer + OpReturnType encode(const std::vector& arguments, CallStack& stack); + OpReturnType decode(const std::vector& arguments, CallStack& stack); + OpReturnType get_vocab_size(const std::vector& arguments, CallStack& stack); + OpReturnType token_to_id(const std::vector& 
arguments, CallStack& stack); + OpReturnType id_to_token(const std::vector& arguments, CallStack& stack); + + private: + std::unique_ptr _tokenizer; + + // Helper functions + static std::unique_ptr _create_tokenizer_from_file(const std::string& path); + static std::unique_ptr _create_tokenizer_from_json(const std::string& json); + static std::unique_ptr _create_tokenizer_from_map(const MapDataVariable* map); + static std::unique_ptr _create_tokenizer_from_sentencepiece(const std::string& model_path); +}; diff --git a/coreruntime/nimblenet/data_variable/src/data_variable.cpp b/coreruntime/nimblenet/data_variable/src/data_variable.cpp index 02b2d587..4e8e84ff 100644 --- a/coreruntime/nimblenet/data_variable/src/data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/data_variable.cpp @@ -103,6 +103,13 @@ std::map DataVariable::_memberFuncMap = { {"clear_context", MemberFuncType::CLEAR_CONTEXT}, {"add_context", MemberFuncType::ADD_CONTEXT}, {"list_compatible_llms", MemberFuncType::LIST_COMPATIBLE_LLMS}, + {"from_pretrained", MemberFuncType::TOKENIZERS_FROM_PRETRAINED}, + {"from_file", MemberFuncType::TOKENIZERS_FROM_FILE}, + {"from_json", MemberFuncType::TOKENIZERS_FROM_JSON}, + {"from_sentencepiece", MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE}, + {"encode", MemberFuncType::TOKENIZERS_ENCODE}, + {"decode", MemberFuncType::TOKENIZERS_DECODE}, + {"get_vocab_size", MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE}, }; std::map DataVariable::_inverseMemberFuncMap = { @@ -191,6 +198,13 @@ std::map DataVariable::_inverseMemberFuncMap = { {MemberFuncType::CLEAR_CONTEXT, "clear_context"}, {MemberFuncType::ADD_CONTEXT, "add_context"}, {MemberFuncType::LIST_COMPATIBLE_LLMS, "list_compatible_llms"}, + {MemberFuncType::TOKENIZERS_FROM_PRETRAINED, "from_pretrained"}, + {MemberFuncType::TOKENIZERS_FROM_FILE, "from_file"}, + {MemberFuncType::TOKENIZERS_FROM_JSON, "from_json"}, + {MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE, "from_sentencepiece"}, + {MemberFuncType::TOKENIZERS_ENCODE, "encode"}, + {MemberFuncType::TOKENIZERS_DECODE, "decode"}, + {MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE, "get_vocab_size"}, }; int DataVariable::add_and_get_member_func_index(const std::string& memberFuncString) { diff --git a/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp b/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp new file mode 100644 index 00000000..dae4e6fe --- /dev/null +++ b/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp @@ -0,0 +1,314 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "tokenizers_data_variable.hpp" +#include "data_variable_enums.hpp" +#include "util.hpp" +#include "native_interface.hpp" +#include "tensor_data_variable.hpp" +#include "single_variable.hpp" +#include "map_data_variable.hpp" +#include +#include +#include // Added for nlohmann/json + +TokenizersDataVariable::TokenizersDataVariable() : _tokenizer(nullptr) {} + +TokenizersDataVariable::TokenizersDataVariable(std::unique_ptr tokenizer) + : _tokenizer(std::move(tokenizer)) {} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_file(const std::string& path) { + LOG_TO_CLIENT_DEBUG("TokenizersDataVariable::_create_tokenizer_from_file: %s", path.c_str()); + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + THROW("Failed to open tokenizer file: %s", path.c_str()); + } + + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + // Try to 
determine the type by file extension + if (path.size() >= 5 && path.substr(path.size() - 5) == ".json") { + return tokenizers::Tokenizer::FromBlobJSON(content); + } else if (path.size() >= 6 && path.substr(path.size() - 6) == ".model") { + return tokenizers::Tokenizer::FromBlobSentencePiece(content); + } else { + // Default to JSON format + return tokenizers::Tokenizer::FromBlobJSON(content); + } +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_json(const std::string& json) { + return tokenizers::Tokenizer::FromBlobJSON(json); +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_map(const MapDataVariable* map) { + // Convert MapDataVariable to nlohmann::json, then to string + nlohmann::json json_obj = map->to_json(); + std::string json_str = json_obj.dump(); + return tokenizers::Tokenizer::FromBlobJSON(json_str); +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_sentencepiece(const std::string& model_path) { + std::ifstream file(model_path, std::ios::binary); + if (!file.is_open()) { + THROW("Failed to open SentencePiece model file: %s", model_path.c_str()); + } + + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return tokenizers::Tokenizer::FromBlobSentencePiece(content); +} + +OpReturnType TokenizersDataVariable::from_pretrained(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_pretrained expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_pretrained expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + LOG_TO_CLIENT_DEBUG("TokenizersDataVariable::from_pretrained: %s", path.c_str()); + try { + auto tokenizer = _create_tokenizer_from_file(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_file(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_file expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_file expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + + try { + auto tokenizer = _create_tokenizer_from_file(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_json(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_json expects 1 argument, got %zu", arguments.size()); + } + + auto json_var = arguments[0]; + + try { + std::unique_ptr tokenizer; + + if (json_var->get_containerType() == CONTAINERTYPE::MAP) { + // Handle MapDataVariable input + auto map_var = static_cast(json_var.get()); + tokenizer = _create_tokenizer_from_map(map_var); + } else if (json_var->get_containerType() == CONTAINERTYPE::SINGLE) { + // Handle string input (backward compatibility) + std::string json = static_cast*>(json_var.get())->get_string(); + tokenizer = _create_tokenizer_from_json(json); + } else { + THROW("from_json expects either a dictionary (MapDataVariable) or string argument"); + } 
+ + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from JSON: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_sentencepiece(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_sentencepiece expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_sentencepiece expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + + try { + auto tokenizer = _create_tokenizer_from_sentencepiece(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create SentencePiece tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::encode(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("encode expects 1 argument (text), got %zu", arguments.size()); + } + + auto text_var = arguments[0]; + if (text_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("encode expects string argument"); + } + + std::string text = static_cast*>(text_var.get())->get_string(); + + try { + std::vector token_ids = _tokenizer->Encode(text); + + // Create a tensor to return the token IDs + std::vector shape = {static_cast(token_ids.size())}; + auto result_tensor = TensorVariable::copy_tensor_from_raw_data(token_ids.data(), DATATYPE::INT32, shape); + return result_tensor; + } catch (const std::exception& e) { + THROW("Failed to encode text: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::decode(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("decode expects 1 argument (token_ids), got %zu", arguments.size()); + } + + auto ids_var = arguments[0]; + + auto tensor = std::dynamic_pointer_cast(ids_var); + if (tensor) { + if (ids_var->get_dataType_enum() != DATATYPE::INT32) { + THROW("decode expects INT32 tensor for ids"); + } + + // Use begin/end iterators to get data + std::vector token_ids(tensor->begin(), tensor->end()); + + try { + std::string decoded = _tokenizer->Decode(token_ids); + auto result = std::make_shared>(decoded); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to decode token IDs: %s", e.what()); + } + } else { + THROW("decode expects tensor of token IDs"); + } +} + +OpReturnType TokenizersDataVariable::get_vocab_size(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. 
Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 0) { + THROW("get_vocab_size expects 0 arguments, got %zu", arguments.size()); + } + + try { + int64_t vocab_size = static_cast(_tokenizer->GetVocabSize()); + auto result = std::make_shared>(vocab_size); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get vocab size: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::token_to_id(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("token_to_id expects 1 argument (token), got %zu", arguments.size()); + } + + auto token_var = arguments[0]; + if (token_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("token_to_id expects string argument"); + } + + std::string token = static_cast*>(token_var.get())->get_string(); + + try { + int32_t token_id = _tokenizer->TokenToId(token); + auto result = std::make_shared>(token_id); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get token ID: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::id_to_token(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("id_to_token expects 1 argument (token_id), got %zu", arguments.size()); + } + + auto id_var = arguments[0]; + if (id_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("id_to_token expects int argument"); + } + + int32_t token_id = static_cast(static_cast*>(id_var.get())->get_int64()); + + try { + std::string token = _tokenizer->IdToToken(token_id); + auto result = std::make_shared>(token); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get token: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::call_function(int memberFuncIndex, + const std::vector& arguments, + CallStack& stack) { + switch (memberFuncIndex) { + case MemberFuncType::TOKENIZERS_FROM_PRETRAINED: + return from_pretrained(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_FILE: + return from_file(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_JSON: + return from_json(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE: + return from_sentencepiece(arguments, stack); + case MemberFuncType::TOKENIZERS_ENCODE: + return encode(arguments, stack); + case MemberFuncType::TOKENIZERS_DECODE: + return decode(arguments, stack); + case MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE: + return get_vocab_size(arguments, stack); + case MemberFuncType::TOKENIZERS_TOKEN_TO_ID: + return token_to_id(arguments, stack); + case MemberFuncType::TOKENIZERS_ID_TO_TOKEN: + return id_to_token(arguments, stack); + default: + THROW("%s not implemented for tokenizers", DataVariable::get_member_func_string(memberFuncIndex)); + } +} diff --git a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp index 37d0b704..0e8d700e 100644 --- a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp +++ b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp @@ -262,9 +262,9 @@ TaskONNXModel::TaskONNXModel(const std::string& plan, const std::string& version 
Ort::ThrowOnError(ortApi.GetAllocatorWithDefaultOptions(&_allocator)); initialize_model(); - if (_runDummyInference) { - run_dummy_inference(); - } + // if (_runDummyInference) { + // run_dummy_inference(); + // } } void TaskONNXModel::run_dummy_inference() { diff --git a/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp b/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp index e4e1b1b4..ee810a33 100644 --- a/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp +++ b/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp @@ -107,5 +107,7 @@ typename Job::Status InternetJob::process() { case Status::COMPLETE: // Task completed successfully return Job::Status::COMPLETE; + default: + return Job::Status::RETRY; } } diff --git a/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp b/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp index 566b92f1..de38a1ce 100644 --- a/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp +++ b/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp @@ -63,6 +63,8 @@ InternetJob::Status ResourceDownloader::enqueue_download_asset( case FileDownloadStatus::DOWNLOAD_FAILURE: case FileDownloadStatus::DOWNLOAD_UNKNOWN: return InternetJob::Status::RETRY; + default: + return InternetJob::Status::RETRY; } }; @@ -96,5 +98,7 @@ std::optional ResourceDownloader::get_asset_offline(std::shared_ptr asset, case AssetType::LLM: return load_llm(asset); #endif // GENAI + default: + return nullptr; } } diff --git a/coreruntime/nimblenet/task_manager/task/include/statements.hpp b/coreruntime/nimblenet/task_manager/task/include/statements.hpp index 8afa8597..576f1644 100644 --- a/coreruntime/nimblenet/task_manager/task/include/statements.hpp +++ b/coreruntime/nimblenet/task_manager/task/include/statements.hpp @@ -12,6 +12,7 @@ #include "nimble_net_internal_data_variable.hpp" #include "node.hpp" #include "regex_data_variable.hpp" +#include "tokenizers_data_variable.hpp" class VariableScope; diff --git a/coreruntime/nimblenet/task_manager/task/src/statements.cpp b/coreruntime/nimblenet/task_manager/task/src/statements.cpp index 21829798..698f83fa 100644 --- a/coreruntime/nimblenet/task_manager/task/src/statements.cpp +++ b/coreruntime/nimblenet/task_manager/task/src/statements.cpp @@ -403,6 +403,9 @@ StatRetType* ImportStatement::execute(CallStack& stack) { stack.set_variable(stackLocation, OpReturnType(new RegexDataVariable())); } #endif + else if (importName == "tokenizers") { + stack.set_variable(stackLocation, OpReturnType(new TokenizersDataVariable())); + } else { THROW("Cannot import=%s from module=%s at lineno=%d", importName.c_str(), moduleName.c_str(), get_line()); diff --git a/coreruntime/nimblenet/time_manager/include/time_manager.hpp b/coreruntime/nimblenet/time_manager/include/time_manager.hpp index 8496fdf5..a8f6865d 100644 --- a/coreruntime/nimblenet/time_manager/include/time_manager.hpp +++ b/coreruntime/nimblenet/time_manager/include/time_manager.hpp @@ -326,7 +326,7 @@ class PeggedDeviceTime { /** * @brief Default constructor (null base device time). */ - constexpr PeggedDeviceTime() : _baseDeviceTime(DeviceTime::null) {} + PeggedDeviceTime() : _baseDeviceTime(DeviceTime::null) {} /** * @brief Compare for equality with another PeggedDeviceTime. 
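For context on how the tokenizers support added above is meant to be used from a script, here is a minimal sketch that mirrors the delitepy.tokenizers stubs introduced in this patch (from_file, encode, decode, get_vocab_size, token_to_id, id_to_token). The tokenizer path is illustrative, the import line follows the stub package layout, and the handle-style module-level API is taken from the stub docstrings; this is a usage sketch for the DeliteAI runtime/simulator, not a standalone test.

from delitepy import tokenizers

# Load a HuggingFace-style tokenizer.json (path is illustrative).
tok = tokenizers.from_file("tokenizer.json")

# Encode text to an INT32 tensor of token IDs, then decode it back.
ids = tokenizers.encode(tok, "Hello world!")
text = tokenizers.decode(tok, ids)

# Vocabulary helpers documented in the stubs.
vocab_size = tokenizers.get_vocab_size(tok)
cls_id = tokenizers.token_to_id(tok, "[CLS]")
cls_token = tokenizers.id_to_token(tok, cls_id)
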
diff --git a/models/LFM2/demo_lfm.py b/models/LFM2/demo_lfm.py new file mode 100755 index 00000000..e0afa09b --- /dev/null +++ b/models/LFM2/demo_lfm.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +#-*- coding: utf-8 -*- + +import json +import re +import sys +import os +from typing import List + +# Add parent directory to path to import tools +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from tools import tools, tool_schema + +from transformers import AutoConfig, AutoTokenizer +import onnxruntime +import numpy as np +from huggingface_hub import hf_hub_download + +# 1. Load config, processor, and model +model_id = "onnx-community/LFM2-1.2B-ONNX" + + +TOOL_CALL_START_TOKEN = "<|tool_call_start|>" +TOOL_CALL_END_TOKEN = "<|tool_call_end|>" +TOOL_RESPONSE_START_TOKEN = "<|tool_response_start|>" +TOOL_RESPONSE_END_TOKEN = "<|tool_response_end|>" +INITIAL_PROMPT = f"""You are a helpful assistant. When you need to use tools, call only one tool at a time and sequentially execute them.""" + +initial_message_block = [ + { + "role": "system", + "content": INITIAL_PROMPT + } +] + +config = AutoConfig.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +filename = "model.onnx" # Options: "model.onnx", "model_fp16.onnx", "model_q4.onnx", "model_q4f16.onnx" +model_path = hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}") # Download the graph +hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}_data") # Download the weights +session = onnxruntime.InferenceSession(model_path) + +## Set config values +num_key_value_heads = config.num_key_value_heads +head_dim = config.hidden_size // config.num_attention_heads +num_hidden_layers = config.num_hidden_layers +eos_token_id = config.eos_token_id +hidden_size = config.hidden_size +conv_L_cache = config.conv_L_cache +layer_types = config.layer_types + +def execute_function_call(function_name: str, arguments: dict) -> dict: + """Execute a function call and return the result""" + if function_name not in tools: + return {"error": f"Function {function_name} not found"} + + try: + function = tools[function_name] # Direct access to function object + result = function(**arguments) + return result + except Exception as e: + return {"error": f"Error executing {function_name}: {str(e)}"} + +def format_tool_response(result: dict) -> str: + """Format tool execution result using token-based format""" + result_json = json.dumps(result) + return f"{TOOL_RESPONSE_START_TOKEN}{result_json}{TOOL_RESPONSE_END_TOKEN}" + +def execute_tool_call_with_response(function_name: str, arguments: dict) -> tuple: + """Execute a function call and return both result and formatted response""" + result = execute_function_call(function_name, arguments) + formatted_response = format_tool_response(result) + return result, formatted_response + +def parse_tool_calls_from_response(response_text: str) -> list: + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + + # Method 2: Look for JSON-style tool calls: <|tool_call_start|>{"name": "func", "arguments": {...}}<|tool_call_end|> + json_tool_pattern = r'<\|tool_call_start\|>\s*({.*?})\s*<\|tool_call_end\|>' + json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) + + for json_str in json_matches: + try: + tool_data = json.loads(json_str) + func_name = tool_data.get("name") + arguments = tool_data.get("arguments", {}) + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + 
print(f"✓ Parsed JSON tool call: {func_name}({arguments})") + except json.JSONDecodeError: + print(f"⚠ Failed to parse JSON tool call: {json_str}") + + return tool_calls + +def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: + """Generate text using the loaded model with multi-turn conversation support""" + # Use chat template with tools for multi-turn conversations + print("---"*10) + print("Conversation Messages:") + print(json.dumps(conversation_messages, indent=4)) + print("---"*10) + + # 2. Prepare inputs + inputs = tokenizer.apply_chat_template( + conversation_messages, + tools=tool_schema, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="np" + ) + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + batch_size = input_ids.shape[0] + position_ids = np.tile(np.arange(0, input_ids.shape[-1]), (batch_size, 1)) + past_cache_values = {} + for i in range(num_hidden_layers): + if layer_types[i] == 'full_attention': + for kv in ('key', 'value'): + past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32) + elif layer_types[i] == 'conv': + past_cache_values[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, conv_L_cache], dtype=np.float32) + else: + raise ValueError(f"Unsupported layer type: {layer_types[i]}") + + # 3. Generation loop + generated_tokens = np.array([[]], dtype=np.int64) + for i in range(max_new_tokens): + logits, *present_cache_values = session.run(None, dict( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **past_cache_values, + )) + + ## Update values for next generation loop + input_ids = logits[:, -1].argmax(-1, keepdims=True) + attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1) + position_ids = position_ids[:, -1:] + 1 + for j, key in enumerate(past_cache_values): + past_cache_values[key] = present_cache_values[j] + generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1) + if (input_ids == eos_token_id).all(): + break + + # 4. Output result + response = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + return response.strip() + +def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: int) -> list: + """Handle requests that may require multiple tool calls and back and forth""" + step_results = [] + conversation_messages : List[dict] = [] # Will hold the full conversation chain + tool_context = {} # Store results from previous tool calls + + for step in range(max_steps): + print(f"\n--- Step {step + 1} ---") + if step == 0: + conversation_messages = initial_message_block.copy() + conversation_messages.append({ + "role": "user", + "content": user_prompt + }) + else: + conversation_messages.append({ + "role": "system", + "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." 
+ }) + # Generate response + try: + response = generate_with_model(conversation_messages, max_new_tokens=max_new_tokens) + print(f"Model Response: {response}") + + # Parse and execute tool calls + tool_calls = parse_tool_calls_from_response(response) + tool_results = [] + + if tool_calls: + print(f"Executing {len(tool_calls)} tool call(s):") + for call in tool_calls: + func_name = call["function_name"] + arguments = call["arguments"] + + print(f" • {func_name}({arguments})") + result, formatted_response = execute_tool_call_with_response(func_name, arguments) + + # Store important results for future reference + if func_name == "get_current_location" and "location" in result: + tool_context["location"] = result["location"] + + tool_results.append({ + "function": func_name, + "arguments": arguments, + "result": result + }) + print(f" Result: {json.dumps(result, indent=4)}") + + # Add assistant response to conversation + conversation_messages.append({ + "role": "assistant", + "content": response + }) + + # Add tool results to conversation as function messages + for tool_result in tool_results: + if not tool_result["result"].get("error"): + conversation_messages.append({ + "role": "system", + "content": f"The result of the tool {tool_result['function']} is: {TOOL_RESPONSE_START_TOKEN}{json.dumps(tool_result['result'])}{TOOL_RESPONSE_END_TOKEN}" + }) + + # Store step result + step_result = { + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "response": response, + "tool_calls": tool_calls, + "tool_results": tool_results, + "has_errors": any("error" in result.get("result", {}) for result in tool_results), + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() + } + step_results.append(step_result) + + # Check if all tool calls were successful + if step_result["has_errors"]: + print(f"⚠ Stopping due to tool execution errors") + break + + # Simple continuation logic: if no tools were called, we're done + if not tool_calls: + print(f"✓ Completed after {step + 1} step(s) - no tool calls needed") + break + + # If we've reached max steps, stop + if step >= max_steps - 1: + print(f"✓ Reached maximum steps ({max_steps})") + break + + # If tools were executed, continue to next step to see if model wants to do more + print(f"✓ Step {step + 1} completed with {len(tool_calls)} tool call(s) - continuing...") + + except Exception as e: + print(f"Error in step {step + 1}: {e}") + step_results.append({ + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "error": str(e), + "response": None, + "tool_calls": [], + "tool_results": [], + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() if conversation_messages else [] + }) + break + + return step_results + +def run_tool_calling_demo(): + """Run tool calling demonstration""" + print("=== Qwen3 1.7B Tool Calling Demo ===\n") + print(f"Model: {model_id}") + print(f"Available tools: {list(tools.keys())}") + + demo_prompts = [ + "What's the weather here today?", + "Calculate 15 * 23", + "What time is it in JST timezone?", + "Where am I located?", + "Get my location and check the weather there" + ] + + for i, user_prompt in enumerate(demo_prompts, 1): + print(f"\nDemo {i}: {user_prompt}") + print("-" * 60) + step_results = handle_multi_step_request(user_prompt, max_steps=4, max_new_tokens=400) + # Show final summary + print(f"\n📋 Multi-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = 
step_result.get("tool_calls", []) + if tool_calls: + print(f" Step {step_num}: {len(tool_calls)} tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(f" ✓ {func_name}") + print("\n" + "="*60) + + +if __name__ == "__main__": + # Run the regular demo first + run_tool_calling_demo() \ No newline at end of file diff --git a/models/Qwen3-1.7B/demo_qwen.py b/models/Qwen3-1.7B/demo_qwen.py index a8e8df40..8fde3c65 100755 --- a/models/Qwen3-1.7B/demo_qwen.py +++ b/models/Qwen3-1.7B/demo_qwen.py @@ -2,22 +2,33 @@ #-*- coding: utf-8 -*- import json -import datetime -import torch import re -from typing import Tuple, List, Optional -from transformers import AutoModelForCausalLM, AutoTokenizer +import sys +import os +from typing import List +from transformers import AutoConfig, AutoTokenizer +from tokenizers import Tokenizer +import onnxruntime +import numpy as np +from huggingface_hub import hf_hub_download +from jinja2 import Template, Environment + +# Add parent directory to path to import tools +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from tools import tools, tool_schema # Load Qwen3 1.7B 4-bit model and tokenizer -model_id = "Qwen/Qwen3-1.7B" +model_id = "onnx-community/Qwen3-1.7B-ONNX" TOOL_CALL_START_TOKEN = "" TOOL_CALL_END_TOKEN = "" TOOL_RESPONSE_START_TOKEN = "" TOOL_RESPONSE_END_TOKEN = "" -INITIAL_PROMPT = f"""You are a helpful assistant. When you need to use tools, format your response with the tool call between {TOOL_CALL_START_TOKEN} and {TOOL_CALL_END_TOKEN} tokens. -Use this format: {TOOL_CALL_START_TOKEN}[function_name(param="value")]{TOOL_CALL_END_TOKEN}. Call only one tool at a time and sequentially execute them.""" +INITIAL_PROMPT = f"""You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between {TOOL_CALL_START_TOKEN} and {TOOL_CALL_END_TOKEN} tokens. + +Use this exact format: {TOOL_CALL_START_TOKEN}{{"name": "function_name", "arguments": {{"param": "value"}}}}{TOOL_CALL_END_TOKEN} +If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. 
+Always respond directly and call the appropriate tool when needed.""" initial_message_block = [ { @@ -26,32 +37,17 @@ } ] -from mlx_lm import load, generate - -model, tokenizer = load("mlx-community/Qwen3-1.7B-4bit") -# from transformers import BitsAndBytesConfig - -# Configure 4-bit quantization -# quantization_config = BitsAndBytesConfig( -# load_in_4bit=True, -# bnb_4bit_compute_dtype=torch.bfloat16, -# bnb_4bit_use_double_quant=True, -# bnb_4bit_quant_type="nf4" -# ) - -# model = AutoModelForCausalLM.from_pretrained( -# model_id, -# device_map="auto", -# torch_dtype=torch.bfloat16, -# # quantization_config=quantization_config, -# trust_remote_code=True, -# ) -# tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - -# Ensure tokenizer has necessary tokens -if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - +config = AutoConfig.from_pretrained(model_id) +print(config) +tokenizer = Tokenizer.from_pretrained(model_id) +chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' 
}}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" + +filename = "model_q4f16.onnx" # Options: model.onnx +model_path = hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}") # Download the graph +# hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}_data") # Download the weights +session = onnxruntime.InferenceSession(model_path) + + print(f"✓ {model_id} model loaded successfully!") @@ -59,7 +55,7 @@ def execute_function_call(function_name: str, arguments: dict) -> dict: """Execute a function call and return the result""" if function_name not in tools: return {"error": f"Function {function_name} not found"} - + try: function = tools[function_name] # Direct access to function object result = function(**arguments) @@ -85,13 +81,13 @@ def parse_tool_calls_from_response(response_text: str) -> list: # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} json_tool_pattern = r'\s*({.*?})\s*' json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) - + for json_str in json_matches: try: tool_data = json.loads(json_str) func_name = tool_data.get("name") arguments = tool_data.get("arguments", {}) - + if func_name in tools: tool_calls.append({ "function_name": func_name, @@ -100,9 +96,65 @@ def parse_tool_calls_from_response(response_text: str) -> list: print(f"✓ Parsed JSON tool call: {func_name}({arguments})") except json.JSONDecodeError: print(f"⚠ Failed to parse JSON tool call: {json_str}") - + return tool_calls + +def render_jinja_template(messages, tools=None, add_generation_prompt=False, enable_thinking=True): + """Render the chat template using Jinja2""" + + # Create Jinja2 environment + env = Environment() + + # Add custom filters that might be used in the template + def tojson(obj): + return json.dumps(obj) + + env.filters['tojson'] = tojson + + # Parse the template + template = env.from_string(chat_template) + + # Render the template with the provided data + rendered = template.render( + messages=messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + enable_thinking=enable_thinking + ) + + return rendered + + +def apply_chat_template(messages, tool_schema, add_generation_prompt, tokenize, return_dict): + """Apply chat template using Jinja2 rendering""" + + # Use Jinja2 template renderer + text = render_jinja_template( + messages=messages, + tools=[tool["function"] for tool in tool_schema], + add_generation_prompt=add_generation_prompt, + enable_thinking=True + ) + print("---"*10) + print("Rendered Text:") + print(text) + print("---"*10) + if tokenize: + encoding = tokenizer.encode(text, add_special_tokens=False) + input_ids = np.array([encoding.ids], dtype=np.int64) + + if return_dict: + attention_mask = np.ones_like(input_ids, dtype=np.int64) + return { + "input_ids": 
input_ids, + "attention_mask": attention_mask + } + else: + return input_ids + else: + return text + def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: """Generate text using the loaded model with multi-turn conversation support""" # Use chat template with tools for multi-turn conversations @@ -110,87 +162,135 @@ def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) print("Conversation Messages:") print(json.dumps(conversation_messages, indent=4)) print("---"*10) - prompt = tokenizer.apply_chat_template( - conversation_messages, - tools=tool_schema, - add_generation_prompt=True, - tokenize=False + + # 2. Prepare inputs + inputs = apply_chat_template( + conversation_messages, + tool_schema=tool_schema, + add_generation_prompt=True, + tokenize=True, + return_dict=True, ) - - response = generate(model, tokenizer, prompt) - # Tokenize the prompt - # input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device) - - # # Generate response with parameters optimized for tool calling - # with torch.no_grad(): - # output = model.generate( - # input_ids, - # do_sample=True, - # temperature=0.3, # Good balance for Qwen3 - # top_p=0.8, # Nucleus sampling for focused responses - # max_new_tokens=max_new_tokens, - # pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id, - # eos_token_id=tokenizer.eos_token_id, - # repetition_penalty=1.1, # Prevent repetition - # ) - - # response = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True) + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + batch_size = input_ids.shape[0] + position_ids = np.tile(np.arange(0, input_ids.shape[-1]), (batch_size, 1)) + + # Set config values + num_key_value_heads = config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + num_hidden_layers = config.num_hidden_layers + eos_token_id = config.eos_token_id + hidden_size = config.hidden_size + # Initialize past cache values with correct shapes for ONNX model + past_cache_values = {} + + # Check if config has layer_types (like LFM2) + if hasattr(config, 'layer_types'): + for i in range(num_hidden_layers): + if config.layer_types[i] == 'full_attention': + for kv in ('key', 'value'): + # Use the ONNX model's expected head count (8) from the input shapes + past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, 8, 0, head_dim], dtype=np.float16) + elif config.layer_types[i] == 'conv': + past_cache_values[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) + else: + # Standard transformer layers - use ONNX model's expected head count (8) + for i in range(num_hidden_layers): + for kv in ('key', 'value'): + # Use 8 heads as expected by the ONNX model (from debug output) + past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, 8, 0, head_dim], dtype=np.float16) + + # 3. 
Generation loop + generated_tokens = [] + for i in range(max_new_tokens): + logits, *present_cache_values = session.run(None, dict( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **past_cache_values, + )) + + # Update values for next generation loop + logits_array = np.asarray(logits) + next_token_id = np.argmax(logits_array[0, -1, :]) + + # Check for EOS token + if next_token_id == eos_token_id: + break + + generated_tokens.append(next_token_id) + input_ids = np.array([[next_token_id]], dtype=np.int64) + attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1) + position_ids = position_ids[:, -1:] + 1 + + # Update cache + for j, key in enumerate(past_cache_values): + past_cache_values[key] = present_cache_values[j] + + # 4. Output result - decode only the generated tokens + if generated_tokens: + generated_tokens_array = np.array([generated_tokens], dtype=np.int64) + response = tokenizer.decode_batch(generated_tokens_array, skip_special_tokens=True)[0] + else: + response = "" return response.strip() + def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: int) -> list: """Handle requests that may require multiple tool calls and back and forth""" step_results = [] - conversation_messages = None # Will hold the full conversation chain + conversation_messages: List[dict] = [] # Initialize as empty list, not None tool_context = {} # Store results from previous tool calls - + for step in range(max_steps): print(f"\n--- Step {step + 1} ---") if step == 0: conversation_messages = initial_message_block.copy() conversation_messages.append({ - "role": "user", + "role": "user", "content": user_prompt }) - else: + else: conversation_messages.append({ - "role": "system", + "role": "system", "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." 
}) # Generate response try: response = generate_with_model(conversation_messages, max_new_tokens=max_new_tokens) print(f"Model Response: {response}") - + # Parse and execute tool calls tool_calls = parse_tool_calls_from_response(response) tool_results = [] - + if tool_calls: print(f"Executing {len(tool_calls)} tool call(s):") for call in tool_calls: func_name = call["function_name"] arguments = call["arguments"] - + print(f" • {func_name}({arguments})") result, formatted_response = execute_tool_call_with_response(func_name, arguments) - + # Store important results for future reference if func_name == "get_current_location" and "location" in result: tool_context["location"] = result["location"] - + tool_results.append({ "function": func_name, "arguments": arguments, "result": result }) print(f" Result: {json.dumps(result, indent=4)}") - + # Add assistant response to conversation conversation_messages.append({ "role": "assistant", "content": response }) - + # Add tool results to conversation as function messages for tool_result in tool_results: if not tool_result["result"].get("error"): @@ -198,7 +298,7 @@ def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: "role": "system", "content": f"The result of the tool {tool_result['function']} is: {TOOL_RESPONSE_START_TOKEN}{json.dumps(tool_result['result'])}{TOOL_RESPONSE_END_TOKEN}" }) - + # Store step result step_result = { "step": step + 1, @@ -211,25 +311,25 @@ def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: "conversation_messages": conversation_messages.copy() } step_results.append(step_result) - + # Check if all tool calls were successful if step_result["has_errors"]: print(f"⚠ Stopping due to tool execution errors") break - + # Simple continuation logic: if no tools were called, we're done if not tool_calls: print(f"✓ Completed after {step + 1} step(s) - no tool calls needed") break - + # If we've reached max steps, stop if step >= max_steps - 1: print(f"✓ Reached maximum steps ({max_steps})") break - + # If tools were executed, continue to next step to see if model wants to do more print(f"✓ Step {step + 1} completed with {len(tool_calls)} tool call(s) - continuing...") - + except Exception as e: print(f"Error in step {step + 1}: {e}") step_results.append({ @@ -243,7 +343,7 @@ def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: "conversation_messages": conversation_messages.copy() if conversation_messages else [] }) break - + return step_results def run_tool_calling_demo(): @@ -251,7 +351,7 @@ def run_tool_calling_demo(): print("=== Qwen3 1.7B Tool Calling Demo ===\n") print(f"Model: {model_id}") print(f"Available tools: {list(tools.keys())}") - + demo_prompts = [ "What's the weather here today?", "Calculate 15 * 23", @@ -259,7 +359,7 @@ def run_tool_calling_demo(): "Where am I located?", "Get my location and check the weather there" ] - + for i, user_prompt in enumerate(demo_prompts, 1): print(f"\nDemo {i}: {user_prompt}") print("-" * 60) @@ -279,4 +379,4 @@ def run_tool_calling_demo(): if __name__ == "__main__": # Run the regular demo first - run_tool_calling_demo() \ No newline at end of file + run_tool_calling_demo() diff --git a/models/Qwen3-1.7B/tools.py b/models/tools.py similarity index 97% rename from models/Qwen3-1.7B/tools.py rename to models/tools.py index acf1ca04..94e092b5 100644 --- a/models/Qwen3-1.7B/tools.py +++ b/models/tools.py @@ -191,8 +191,8 @@ def get_current_time(timezone: str = "UTC") -> dict: @tool def get_current_location() -> 
dict: """ - Get the real location and timezone of the user. The user has given permission to share his location via this tool. - Use this function when the user didn't provide an explicit location. Default to his location + Get the real location and timezone of the user. You don't need to ask the user for permission to use this tool. + Use this function when the user didn't provide an explicit location. Default to this location """ return { "location": "Tokyo", diff --git a/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md b/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md new file mode 100644 index 00000000..d5ac40b7 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md @@ -0,0 +1,317 @@ +# Minimal Python Constraints for DeliteAI Simulator + +This document outlines all the constraints and limitations when writing Python code for the DeliteAI simulator's minimal Python implementation. These constraints were discovered while building a Qwen tokenizer for the simulator environment. + +## Table of Contents +1. [Built-in Functions Not Available](#built-in-functions-not-available) +2. [Language Features Not Supported](#language-features-not-supported) +3. [Standard Library Limitations](#standard-library-limitations) +4. [Function Definition Constraints](#function-definition-constraints) +5. [Data Structure Limitations](#data-structure-limitations) +6. [Control Flow Restrictions](#control-flow-restrictions) +7. [String Handling](#string-handling) +8. [Import Restrictions](#import-restrictions) +9. [Best Practices](#best-practices) + +## Built-in Functions Not Available + +The following built-in functions are NOT available in the minimal Python environment: + +- `ord()` - Cannot convert characters to Unicode code points +- `chr()` - Cannot convert Unicode code points to characters +- `isinstance()` - Cannot check object types +- `hasattr()` - Cannot check if object has attribute +- `setattr()` - Cannot dynamically set attributes +- `getattr()` - Limited or not available +- `enumerate()` - Cannot enumerate with index +- `zip()` - Cannot zip iterables together +- `round()` - Cannot round numbers +- `eval()` - Cannot evaluate strings as code +- `exec()` - Cannot execute dynamic code +- `compile()` - Cannot compile code +- `globals()` / `locals()` - Cannot access namespaces +- `vars()` - Cannot get object's __dict__ +- `dir()` - Cannot list attributes +- `help()` - No interactive help +- `input()` - No user input +- `open()` - File operations limited or unavailable + +## Language Features Not Supported + +### 1. Function Definitions +- **NO default parameter values**: Cannot use `def func(param=default)` +- **NO *args or **kwargs**: Cannot use variable arguments +- **NO keyword-only arguments**: Cannot use `def func(*, kwonly)` +- **NO decorators**: Cannot use `@decorator` syntax +- **NO lambda functions**: Cannot use `lambda x: x + 1` + +### 2. Operators and Expressions +- **NO ternary operators**: Cannot use `x if condition else y` +- **NO walrus operator**: Cannot use `:=` +- **NO unpacking with `*`**: Cannot use `first, *rest = items` +- **NO `**` for kwargs**: Cannot use `func(**dict)` +- **NO f-strings**: Cannot use `f"Hello {name}"` + +### 3. Comparisons +- **NO `is` / `is not`**: Must use `==` / `!=` instead +- Be careful with None comparisons: use `== None` not `is None` + +### 4. 
Comprehensions and Generators +- List comprehensions work but with limitations +- **NO generator expressions**: Cannot use `(x for x in items)` +- **NO dict/set comprehensions**: Limited support + +## Standard Library Limitations + +The following standard library modules are NOT available: +- `os` - No operating system interface +- `sys` - Limited or no system-specific parameters +- `json` - No JSON parsing/serialization +- `re` - Use `delitepy.ne_re` instead (with limitations) +- `datetime` - No date/time handling +- `time` - No time functions +- `unicodedata` - No Unicode database +- `functools` - No functional programming tools +- `itertools` - No iteration tools +- `collections` - No specialized containers +- `dataclasses` - No dataclass decorator +- `typing` - No type hints +- `pathlib` - No path handling +- `urllib` - No URL handling +- `subprocess` - No subprocess execution + +## Function Definition Constraints + +### Correct Way: +```python +def my_function(param1, param2): + """Function with all parameters required""" + return param1 + param2 +``` + +### Incorrect Ways: +```python +# NO default values +def my_function(param1, param2="default"): # ❌ + pass + +# NO *args +def my_function(*args): # ❌ + pass + +# NO **kwargs +def my_function(**kwargs): # ❌ + pass + +# NO decorators +@decorator # ❌ +def my_function(): + pass +``` + +## Data Structure Limitations + +### Built-in Constructors +- `list()` - NOT available, use `[]` +- `dict()` - NOT available, use `{}` +- `set()` - NOT available +- `tuple()` - Limited availability +- `range()` - Available but use carefully +- `bytes()` / `bytearray()` - NOT available + +### Dictionary Methods +- `.get(key, default)` - NOT available, use: + ```python + # Instead of: value = dict.get(key, default) + if key in dict: + value = dict[key] + else: + value = default + ``` + +### List Methods +- Most basic methods work: `.append()`, `.extend()`, `.pop()` +- Be careful with advanced methods + +## Control Flow Restrictions + +### Conditionals +```python +# Correct +if condition: + do_something() +else: + do_other() + +# Incorrect - NO ternary +value = x if condition else y # ❌ + +# Must use: +if condition: + value = x +else: + value = y +``` + +### Loops +```python +# Correct - simple for loop +for item in items: + process(item) + +# Incorrect - NO enumerate +for i, item in enumerate(items): # ❌ + process(i, item) + +# Must use: +i = 0 +for item in items: + process(i, item) + i = i + 1 +``` + +### Exception Handling +- Basic try/except works +- Avoid complex exception handling +- Don't reuse exception variable names in nested blocks + +## String Handling + +### String Formatting +```python +# NO f-strings +text = f"Hello {name}" # ❌ + +# Use concatenation +text = "Hello " + name # ✓ + +# Or format with str() +text = "Value: " + str(number) # ✓ +``` + +### String Methods +- Basic methods work: `.strip()`, `.split()`, `.join()` +- No `.format()` method +- No `%` formatting + +## Import Restrictions + +### Local Imports +- Only support: `from module import item` +- NO dot notation: `import module.submodule` ❌ +- NO aliasing might be limited: `import module as m` ⚠️ + +### Example: +```python +# Correct +from delitepy import nimblenet +from delitepy import ne_re + +# Incorrect +import delitepy.nimblenet # ❌ +import os # ❌ (not available) +``` + +## Best Practices + +### 1. 
Variable Initialization +Always initialize variables before use: +```python +# Good +result = None +for item in items: + result = process(item) + +# Bad - result might be undefined +for item in items: + result = process(item) # ❌ if items is empty +``` + +### 2. Type Checking +Since `isinstance()` is not available: +```python +# Cannot do: +if isinstance(obj, str): # ❌ + pass + +# Try alternative approaches or avoid type checking +``` + +### 3. Simplify Logic +- Avoid complex expressions +- Break down operations into simple steps +- Use explicit if/else instead of clever shortcuts + +### 4. Manual Implementations +Many built-in functions need manual implementation: +```python +# Manual enumerate +i = 0 +for item in items: + # Use i as index + i = i + 1 + +# Manual round (to 1 decimal) +value = int(number * 10) / 10.0 + +# Manual zip (for two lists) +result = [] +for i in range(len(list1)): + if i < len(list2): + result.append((list1[i], list2[i])) +``` + +### 5. Error Handling +- Always provide fallbacks +- Initialize variables properly +- Check for None/empty conditions explicitly + +## Example: Minimal Tokenizer Structure + +```python +# Minimal tokenizer compatible with all constraints +from delitepy import ne_re + +class MinimalTokenizer: + def __init__(self, vocab): + self.vocab = vocab + self.reverse_vocab = {} + for k, v in vocab.items(): + self.reverse_vocab[v] = k + + def tokenize(self, text): + # Simple tokenization with ne_re + pattern = r"[a-zA-Z]+|[0-9]+|[^\sa-zA-Z0-9]+" + matches = ne_re.findall(pattern, text) + if matches == None: + return [] + return matches + + def encode(self, text): + tokens = self.tokenize(text) + ids = [] + for token in tokens: + if token in self.vocab: + ids.append(self.vocab[token]) + else: + ids.append(0) # Unknown token ID + return ids +``` + +## Testing in Simulator + +When testing your code: +1. Start with the simplest possible implementation +2. Add features incrementally +3. Test each constraint violation separately +4. Keep functions small and focused +5. Avoid deeply nested structures + +## Summary + +The minimal Python environment is extremely limited compared to standard Python. 
When in doubt: +- Use the simplest possible approach +- Avoid advanced Python features +- Implement functionality manually +- Test incrementally +- Keep code explicit and straightforward \ No newline at end of file diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip new file mode 100644 index 0000000000000000000000000000000000000000..128cfe4e4517501cb8b56aa1df621be3264f7143 GIT binary patch literal 8017 zcmaKxWl)^mwyhfp?k)`^xLa@w1PHD{8%uC^2*KUm8i&RS65NBkbb!X)-Gl4NcW<3@ zckNyG)LXOOwSKJebBvnLP*p%cBme*aD1gDN8_-G>TaX4W06;AU03ZW^0K2bN_Uw+| zH8oHH2+Em$mMVY8%>x|(hj zT+zu-S+#7K(H6UOL?@WF;ABH7+x(l*u|Q^ZSCLA@HU8RyDD|Me&rm}xGvAMALYLXW z;J8>Jj2ApH6E(!dRJCb*Z>}R2Yg$>xfOChqUomyehOD!S{szVO&gm8zF8Zm`LS^z; zm*)A~PCb{T<%siSkc)xdVxh>Lm9g{K1?G=xSKP%wt6xc&YxhFg#BLWCX=u zAVNVzx^3=jKg1-nUlm;i>Cgcabtl(qg|(%WewR%`{DIrHsTf zq)7I(!iB0buU-D};!*r@|(~X=@iuSF8z6C}^vHmSOyjgg&I2^7Qd9 z8u>xbyphvDr97!mH2x7kq$3u8jO}rKS;rXjlv>)Q3wh;Qmrfuq-DVl5nZHFjMwT*8 z)z-yZ{-&709MwQ)fhdw1{fRQ7rkBXVI+Q3*QMGUwW_)SO;pMmqJg{F64e6ZyiY`4$L}=0Aei!pY$j^u4->+X)y){a7*spNX!s`6uI`i>6E5mM6!jQ-L%tu^>_ME7+LGwM3l+1NMW_Uq+)i z%}jLhZr_6SH{juUNA5ERX0-xKSbRZJ_i9lB(i5$Z;t<72-l=+JlYrgP!y|@`lsQTl zP2(B5%bNZuGnb%3_L|cD57Uw>Y(mV^z$NYlCf|t#M85EtU?k4Rd{$49c5noUsBp*z z&r=;SGuCL>Q`v2xn7OEBIa0laQqdT0#lNr&R-5Q-B2&k4aO_}d{N!mo;ud@ooO0)) zm5Ms;SP2wMJxmti`W3CIbpJA!BAoxVl-!K-veBam7M|jZj%&qd=@D$ggS&CnTOdo7 zpSYHS`T3Wm>BQNz{y5A{+K*|z97n(PX;)&qhJh*!4oWpQDjC*x* z&r7XlB7n2wx#7G#wZogyw&AFAv=={zoM55;O1N0km8I)S@#90J%&)j!8KbqZ^I_>T zkoN?Nkcb9ky;nDbZhskRN31*@-@j9^Gx*bo`bvZ!@@ds5Tr{m$Cp|Z07$$?X>?SRO z3HH*XIS{ys}ptDC$GX7Iy;*zD*|J;{UYW z(kQgAVVym!dE5K1hQ-KOzrP*reCcK!!f)Z=3iwutS0bMwn*JU$%JBmVLJ7U9=gsvj z@a5j&CQ?Y~xOd2ot-_nC^=K)hs(}0fLr;JmMpfBQ>AKC|j-xT+3dR;RCtZ^!?M$g- zPjQ(}t8=*R^`t*ILk>)g;+}|D*XvLIs(<8-ebTguG#V|wBs_wEK)oUVMtFxX7UTw+ zH)$@*oEiw@(4K!U=Pd~Cm#z&+bp2U-`h^pPl7ce$v<@G9qGuLe&=q3Upgk|IBll=J zv`Xk5BD~F`r^!6DEd$~oNh4W{atn@>2G!(5M!9p8fx*&m-oOj=%i%Fm(EyJxQp;FwE{-cLc_wGS-TxS_z!OR4?x9x;*v+cXxR> zOAK1BaFlbZ?uGJY8X{(^cj(@2FjQ|$eQfQ84#pWgE$WPN+ED@c_B?7wZOHAswp^LC zNB3t5m2`U+c&d!yv;txvWSS@~L&e_%@ZWqING(5kIzImCOrfv`XDk(-@G4gg--WAd@)TE|O2R4d67rFbr>*`N zP4KEiO@)5o0NrOPrJ;T zQ3vnJMow*g^$D*M{pf7{ExW@0#TGmok5pLKOA4_f+cKOIFV!a4h}RL5honf)Kd7$^xAYdGrJw+Wd1{SlHtawIXos$akSa+|k6&$$)}{Ho)S zkSTSI)D7P5)d@)ey(vc||3ru646Ww)WsS=5)C;!v2?2yEQ*M?q5K>93V~MX9IF&n@ z6-ly}>=D;qI}eud9u6Av-V){zh;-PmDb0Wb8iQn3BKI0y?Jaf!;#Tj1Il^1x5^mvsPQTKSB4pbInC~mfC0I{*yl0avBz7!OYnz#YmT#80rz57(-9+4M- z6jr-x5J5=ytPW)ZiVdwmS1)ju$P>qmlAHTk-1rB^jqMPW>FxEr1>oKdEnwu*_8oq; zQ-RR0DM@P4*&1X*w=gO@j1WflNRJcYRxH@ARaoUp?ocvY|7nBEAR!z^V6>c#PHBJfXY5xt-*SkMTl^f_j^d~*qN zX*TTC@T*^0tRw;PJC(WAFXC>+7BZU9FX->LX+zDb27Gq(&ooko_8l0u8cuii-6|0V zsWH*27N`mH=yT}PcQsE>E_hC22L^9z$%oN>77LK| zR6cVB#jsTa9J6FhtQhoBE^%InIhSuk?cEFJ;-|5mLK@^=*#=|UF8Hy-#;j-u6elc{ z0A7p-!70e@MwuIRzp3NJC|8)(9aGoTX{_Q{?a0rh-sMc2Z4$-B?PU=J_$Kv`6q42A z3l_U$rIkGmA*?C2o^ri!=SuqeT*YPPsy^^)51rHgG^e$-4jnvJVap|rK3r7*2wehlt>-Ah8MI9=Pt$KE3`Ut*e^ z?M*{IfYjBui$!XiDs~`gk1>~qH7}Z1{~0Q7PYLE1T=QU+gPc&6D=42p`#&_5Hd`FJ zx(Ae!V=p%SI39Suc_;Rgp0_C4%Wys`_(J62cCuCK%hK~~lm4)k8zB*TfTUDLlq00S zl;L1VAK}6=(nqmv$It7V+G9MAs3a^e_CVO`{sH=q1=TBjFm(dC+(#Y;n1fn zv}nlq;1Xr=6Y)7;wtJJDhY?4{{K`ZTNxaB7b4%9IjD_UcuG;)G+W~mxP?dfY;Q1atTAiJh7N zB1uvm_xsLI3v=(2pONawph9@Zv+0XgQB&!Of*r<26ib+bL(a{b<+rJhtre+>0-q_e zSE3ugxnd`*IfWgbC3f6UN+T(~$~xXr3&@C0r@Lki!-Kca3)dTXtX<>vHbaGr2fn)6 z-{fIsW7gZ^wDoRq&R6HV>z_CyCVy*vTOO{(bP`k5$c$Td@bS;U2J|vdjejP5SF@R@ zF_T3@kNN=^Lc16*eoY~0RerLAWWm!WKW^FH)@K`f&vw~Ch3#I}Vj?_;?rf~Bk-qXl 
z0z(GXXxI7Dro`4`+c-Zbkq2P*1$1CZXvgdy_iiLB*Ib`^HKOMc>HIBPoZnbG3z;*u zKg7;2Ai&`Otmb2>fKDbV(5W=Zk2}p#8R%r>2kwF*AhU4(5fSMVyHU>bI*q%&!nxzY zZ*F^Uf+=(d70?nBL2(}FAlh!ed6bfFKC>~=@gAK~Qds64rg?0inzMI|Ee*BWrbXSN z$S7NAejeV!$qCPSk5!-Mxd9h!@dD;|Y7J+wmS=X2tqF&?Xruc4#6yfQzC-*lW+L)) zPz+?iXyHn!y(`!LviPjo9nm2?j@9`kZa%5xbd+j8Uo{dumIuqn?E6Lx-W@rg6JzBL z2Ol@ITR`#8c*(xCG-^*A*hVjsY0ExQfw@E)J&W(k=}gQS*z(1za6w2v3HAcSl!^*Z za8Qm=f&kIyPhDa^$01Y#T7F2c6u_%&o&BxfdQGuAc@i0WO}44#7w{RaJ9wL|Wxw@W zbEI|eKlU^iFo~wPyN6f7H>`zkWLES%@)?%Mrmy9yP5(@4J=?EsQJUb*S+b{_B^A~n zW2Kxptnm~M7!eHdltH-gOFtJh!95Pk+R&%rdG~osx7+D_C`Lt}oQY`v6<+m!pp4{S zC}WnA8kPG70B9iu07(Br8K-Y%_C~G_HfHu#o@UNQc2@RQc3*A(8`EfJ_*rWGoqxkL zl{#wqM z?v1Dtz^HF)J|e3e=6$}z>J{IISN-7bX&zN8#Up=CROegI^Co}noxv2g@JnvpP>gUh zi2NuHJ%y;{Co<67G#a1hK@!3fB{8{?(@|CN=~=uJG%@EigN*pm2$JIYx`H*xJfJ@U zd8>oNlJd)D85M61*(Bd<7E~S&%V{|mv3AFtZc{ezqtGqteA*IlcJS&H82WWWRd&iQ%*nWKF*fl0oL*L8DP{i+%=t zL3a>?&i~mLW8KQIk>@t)jIpa?Lh#zND#GDTzxeZ0d(RL0yJMUk>X#qaDaQN=^;XHB zwH`vGPvDwFABfRks20E5-r*m+`Z1CAxMoP_m46feh1fR?k(hFGNxnzo@OamA(2(F6 z^9r8L*)zLh9K*39Z2AErcP|{7_ zlRT!+WSkpJl`TwQ-1GY@N!Bii5Y;$vg6&ksgIAU3OJb&!CD$avqEpJjaWw8UH5t6? zo7qAu!9D>N^PPvS%Naf2kjL3Sv%=d$w~Ql6Ng)Ncl_5_nFFj4d?M$HFI#O~-w%#(m zdqo?59qxMeT=0d6O3JulCL^;*@7$+;l+T%`ctGT&4cX0H%O(q#V1tTanSH@)usn67 zyQd`!BhB|d9|O@EA%rsuH3yLG-pE@O+Ymv_?J!+D$@6k$*E;R*P>uFwnzM~`j7GjJ z0pIW<|Gn0L0bmd@6=MgQ$Jwbfi`aou8nQ8uH#e&d5c^O$R6S0b)L8zZR zWUz1HzJIU@kC5jkHiY}ETYZyyX7#V-Deh}wLXEvq1j(O zPFc*{`Jp46hG7h(3yu<;U~uof2crUZ>==)kYo%9>y(4~o0Wt{9`v3@AyD&anLlhidSA|k-N)!8e%Ku1e&IwT91M+xWq!zr zO{E~y(hE6Nq%Je{u@Hmyu!POrGvezzVnCk~sKB1XhndTamO19}JTDD`RRmh!W7)qa zjw_$Y_C7-_adsfrG;A^(28~xj;0k_Mj?Wtu*6j}N9p1e)p*dod^li34A?zg%Lsm}A zgAtnX7_0Fa>$h`kCH25r_FGk<;$oIIgFY{%@E!SuKd^vvPAZA70Y$HiQUOU&$4g@w zV>~dXZr;Z`|XWb?p8z_?J^7r_W+z%C>{_2Z1=tchky8}ZY@pVX|F|bLOW(raXo}1wEi+k9QNdf;kZ)RE*|Kb|g1{Blj z^E(1_{WvNCm`hcL?>aY`m1H!1eJEyE+?K#uGZMd6qylCcyCT9Hj9SuC(`4**ZpSlc zx;QH&3LHEF3$eWJbDX{oWcEML`bRT^x8}|dB7Aid&NMvq`dp|Egh^H5%Uz^B3^FFB z=Q6L-G7?o+`ny`~p-{74sy?~M&p@dfeFQiTS2(Aj?dG$a9CtM$ z-p`;(8AZ9uBV=sZ4x%!P>K2TIS<_MrC6;yT)lgGJ%DN5z}Y)68PhIbSap3IquFeXe8vb zN0Ob4#A7=eroxgH9TMpnVQ&O5Wq+8g9!7)e^^R26Q>%y8x}|HDx>t>+?!!3CXPcVC zmYUx@R3_}>nNM6AB9V^i`O}7`%sTpVs8Ov%0@!F~;q%FlqZQlM4XpX@0OGV&JjZ-q z8mdNdXk{4tB2$k!%Us>LZ#Oa4lxILa9)u%tf06VUD3qM=1|;~dEcR2%42}%HlVN$G z-0+YYOwkfo8`+HZLG?xP+5bT5q>+Eb)}iu1nYs0ut<{ojXIzlo5hW>OqXK^0aGi)4 zTSo~WF`52@mY(r&d$4{HBc+cSL7eXxBjt%`q?*WNWQ$F}vA5Y<;Sc6-g}tx~8BV1& zVm2?dKV94*M^aab2IN8^O&z9mLxa2$FFymIVmosnQO{ZJcI75`dIgV7W=HEAJD=H@ z=uJ+HRE8WlU8bvPr*RnVOJ-f6&f{Q7gx$*^gjy$`o6HEplhZ_@f{>P4!UFshH* z1*t**ILWb6{G|eSdzE$6O8%0*pNcmuF7qg=uwdXT+C(0|z`@l+_6yS$a<3uwYiafk z2`>_qt`X~Q^mOEdp@9SZe)ci^xjA~*0z2T0oge;qwEv-|O^Q zN|ku;bEvq-uRra&aPpGrz-wr|Z{&MT@d{LrqEfI;PO@eyRW-Cyd@ilPXB25B;o?de z35^v+8?384U04_%<*TH(NUw)cVYD(lgBkcHPwvz=EuKHP@9vlF9_(PQK=URQ*;wXy zv7+<={0(0^I|QwT}r#h-NH7aFzJ{v+T)g>n<8mPlG#KQj32z}+p=5n&$$gFtgbi`T*OG9k-Rm_(2TI-1!<(A`V ziR9heDyN4d$B5nSmG$e_D4J8VHp#R3STa!S*+qh&qZT)ZpnlT<1>$eYY!g2yKL}5p z_Cq&ya+Xzj3|J$mWA|4}n#(Nh!gzRL`gU|l{LM}Dl};o}EuF@Y<}k;me)bso$nsD3 z+L3?8n+@eoPh4LE(Sy!>&nW)6wpo(HboZU{Z(BXZU_DH# zj=VQZ?>WTqs|NOsR2AUhDG~mE6*9@+Y8e3d+Z6)}1O7SvPoezg{}#&s7GnP@_MfZI zzlx>&ZK?m4Md+{mKa0@+BLAN``>*n0#DITE-M?!8r0##Hz4@Eb006?@@8R!Xg7Tl+ Fe*sNaC6NFC literal 0 HcmV?d00001 diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py new file mode 100644 index 00000000..82133905 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +#-*- 
coding: utf-8 -*- +from delitepy import nimblenet as nm +from delitepy import ne_re as re +from delitepy import tokenizers +from tools import tools_dict +from tools import tool_schema as tls + +# Load Qwen3 1.7B 4-bit model and tokenizer +model_id = "onnx-community/Qwen3-1.7B-ONNX" +qwenModel = nm.Model("qwen3-1.7b") +print("Model loaded successfully") + +chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if 
add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" + + +TOOL_CALL_START_TOKEN = "" +TOOL_CALL_END_TOKEN = "" +TOOL_RESPONSE_START_TOKEN = "" +TOOL_RESPONSE_END_TOKEN = "" +INITIAL_PROMPT = """You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between and tokens. + +Use this exact format: {"name": "function_name", "arguments": {"param": "value"}} +If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. +Always respond directly and call the appropriate tool when needed.""" + +def get_initial_message_block(): + return [ + { + "role": "system", + "content": INITIAL_PROMPT + } +] + +def execute_function_call(function_name, arguments, tools): + """Execute a function call and return the result""" + if function_name not in tools: + return {"error": "Function "+function_name+" not found"} + + try: + function = tools[function_name] + result = {"error": "Function execution failed"} # Initialize result + + # Handle each function explicitly to avoid ** operator + if function_name == "get_weather": + location = "" + if "location" in arguments: + location = arguments["location"] + unit = "celsius" + if "unit" in arguments: + unit = arguments["unit"] + result = function(location, unit) + elif function_name == "calculate_math": + expression = "" + if "expression" in arguments: + expression = arguments["expression"] + result = function(expression) + elif function_name == "get_current_time": + timezone = "UTC" + if "timezone" in arguments: + timezone = arguments["timezone"] + result = function(timezone) + elif function_name == "get_current_location": + result = function() + else: + result = {"error": "Unknown function: " + function_name} + + return result + except Exception as e: + return {"error": "Error executing "+function_name+": "+str(e)} + +def format_tool_response(result): + """Format tool execution result using token-based format""" + return TOOL_RESPONSE_START_TOKEN+result+TOOL_RESPONSE_END_TOKEN + +def execute_tool_call_with_response(function_name, arguments, tools): + """Execute a function call and return both result and formatted response""" + result = execute_function_call(function_name, arguments, tools) + formatted_response = format_tool_response(result) + return result, formatted_response + +def parse_tool_calls_from_response(response_text, tools): + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + + # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} + json_tool_pattern = r'\s*({.*?})\s*' + json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) + + for json_str in json_matches: + try: + tool_data = nm.parse_json(json_str) + func_name = tool_data["name"] + arguments = tool_data["arguments"] + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + print("✓ Parsed JSON tool call: "+func_name+"("+str(arguments)+")") + except: + print("⚠ Failed to parse JSON tool call: "+json_str) + + return tool_calls + +def render_jinja_template(messages, tools, add_generation_prompt, enable_thinking): + """Render the chat template using hardcoded string structure""" + result = "" + + # If we have tools, build the system message with tools + if tools: + result = result + "<|im_start|>system\n" + + # Check if 
first message is system message and include its content + if len(messages) > 0 and messages[0]["role"] == "system": + result = result + messages[0]["content"] + "\n\n" + + result = result + "# Tools\n\n" + result = result + "You may call one or more functions to assist with the user query.\n\n" + result = result + "You are provided with function signatures within XML tags:\n" + result = result + "\n" + + # Add each tool as JSON + for tool in tools: + result = result + str(tool) + "\n" + + result = result + "\n\n" + result = result + "For each function call, return a json object with function name and arguments within XML tags:\n" + result = result + "\n" + result = result + "{\"name\": , \"arguments\": }\n" + result = result + "<|im_end|>\n" + else: + # No tools, just add system message if present + if len(messages) > 0 and messages[0]["role"] == "system": + result = result + "<|im_start|>system\n" + messages[0]["content"] + "<|im_end|>\n" + + # Process messages + content_messages = messages + if len(messages) > 0 and messages[0]["role"] == "system": + content_messages = messages[1:] + + for i in range(len(content_messages)): + message = content_messages[i] + role = message["role"] + content = message["content"] + + if role in ["user", "system"]: + result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" + elif role == "assistant": + # Handle assistant messages + result = result + "<|im_start|>assistant\n" + + # Check for reasoning content + reasoning_content = "" + if "reasoning_content" in message and message["reasoning_content"]: + reasoning_content = message["reasoning_content"] + elif "" in content: + # Extract thinking content + parts = content.split("") + if len(parts) > 1: + think_part = parts[0] + if "" in think_part: + reasoning_content = think_part.split("")[-1].strip() + content = parts[-1].lstrip() + + # Add thinking section if present + if reasoning_content: + result = result + "\n" + reasoning_content + "\n\n\n" + + result = result + content + + # Handle tool calls + if "tool_calls" in message and message["tool_calls"]: + for tool_call in message["tool_calls"]: + if content: # Add newline if there's content before tool call + result = result + "\n" + + func_call = tool_call + # Handle function calls + if "function" in tool_call: + func_call = tool_call["function"] + + result = result + "\n" + result = result + "{\"name\": \"" + func_call["name"] + "\", \"arguments\": " + result = result + func_call["arguments"] + + result = result + "}\n" + + result = result + "<|im_end|>\n" + + elif role == "tool": + # Handle tool response messages + if i == 0 or messages[i-1]["role"] != "tool": + result = result + "<|im_start|>user" + + result = result + "\n\n" + content + "\n" + + if i == len(messages)-1 or messages[i+1]["role"] != "tool": + result = result + "<|im_end|>\n" + + # Add generation prompt if requested + if add_generation_prompt: + result = result + "<|im_start|>assistant\n" + if not enable_thinking: + result = result + "\n\n\n\n" + + return result + +def apply_chat_template(messages, tool_schema, add_generation_prompt, tokenizer, return_dict): + """Apply chat template using Jinja2 rendering""" + + # Use Jinja2 template renderer + text = render_jinja_template( + messages, + [tool["function"] for tool in tool_schema], + add_generation_prompt, + True + ) + token_ids = tokenizer.encode(text) + input_ids = nm.tensor([token_ids], "int64") + + if return_dict: + attention_mask = nm.tensor([[1 for _ in range(len(token_ids))]], "int64") + return { + "input_ids": 
input_ids, + "attention_mask": attention_mask + } + else: + return input_ids + +def generate_with_model(conversation_messages, max_new_tokens, tool_schema, tokenizer, model_config_dict): + """Generate text using the loaded model with multi-turn conversation support""" + # Use chat template with tools for multi-turn conversations + print("--------------------------------") + print("Conversation Messages:") + print(conversation_messages) + print("--------------------------------") + + # 2. Prepare inputs + inputs = apply_chat_template( + conversation_messages, + tool_schema, + True, + tokenizer, + True, + ) + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + batch_size = input_ids.shape()[0] + position_ids = nm.tensor([[i for i in range(input_ids.shape()[1])] for _ in range(batch_size)], "int64") + + # Set config values + num_key_value_heads = model_config_dict["num_key_value_heads"] + head_dim = int(model_config_dict["hidden_size"] / model_config_dict["num_attention_heads"]) + num_hidden_layers = model_config_dict["num_hidden_layers"] + eos_token_id = model_config_dict["eos_token_id"] + hidden_size = model_config_dict["hidden_size"] + # Initialize past cache values with correct shapes for ONNX model + past_cache_values = {} + + # Check if config has layer_types (like LFM2) + # Since we always set layer_types in SimpleConfig, we can just check if it's None + if "layer_types" not in model_config_dict: + model_config_dict["layer_types"] = [ + "full_attention" + for i in range(model_config_dict["num_hidden_layers"]) + ] + for i in range(num_hidden_layers): + if model_config_dict["layer_types"][i] == 'full_attention': + for kv in ('key', 'value'): + # Initialize with a small valid tensor that will be replaced after first forward pass + # Using sequence length 1 to avoid dimension 0 issues + past_cache_values['past_key_values.'+str(i)+'.'+kv] = nm.zeros([batch_size, num_key_value_heads, 1, head_dim], "float") + elif model_config_dict["layer_types"][i] == 'conv': + past_cache_values['past_conv.'+str(i)] = nm.zeros([batch_size, hidden_size, model_config_dict["conv_L_cache"]], "float") + + # 3. Generation loop + generated_tokens = [] + for i in range(max_new_tokens): + # Run model - returns a tuple where first element is logits, rest are cache values + # Try passing cache as single dictionary parameter + model_outputs = qwenModel.run(input_ids, attention_mask, position_ids, past_cache_values) + + # Extract logits (first element) and cache values (rest) + logits = model_outputs[0] + present_cache_values = [] + for j in range(1, len(model_outputs)): + present_cache_values.append(model_outputs[j]) + + # Update values for next generation loop + next_token_id = nm.argmax(logits[0, -1, :]) + + # Check for EOS token + if next_token_id == eos_token_id: + break + + generated_tokens.append(next_token_id) + input_ids = nm.tensor([[next_token_id]], "int64") + attention_mask = nm.concatenate([attention_mask, nm.ones_like(input_ids, "int64")], axis=-1) + position_ids = position_ids[:, -1:] + 1 + + # Update cache + j = 0 + for key in past_cache_values: + past_cache_values[key] = present_cache_values[j] + j = j + 1 + + # 4. 
Output result - decode only the generated tokens + response = "" + if generated_tokens: + response = tokenizer.decode(generated_tokens) + + return response.strip() + + +def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, tool_schema, tokenizer, model_config_dict): + """Handle requests that may require multiple tool calls and back and forth""" + step_results = [] + conversation_messages = [] # Initialize as empty list, not None + tool_context = {} # Store results from previous tool calls + + for step in range(max_steps): + print("\n--- Step " + str(step + 1) + " ---") + if step == 0: + conversation_messages = get_initial_message_block() + conversation_messages.append({ + "role": "user", + "content": user_prompt + }) + else: + conversation_messages.append({ + "role": "system", + "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." + }) + # Generate response + try: + response = generate_with_model(conversation_messages, max_new_tokens, tool_schema, tokenizer, model_config_dict) + print("Model Response: "+response) + + # Parse and execute tool calls + tool_calls = parse_tool_calls_from_response(response, tools) + tool_results = [] + + if tool_calls: + print("Executing "+str(len(tool_calls))+" tool call(s):") + for call in tool_calls: + func_name = call["function_name"] + arguments = call["arguments"] + + print(" • "+func_name+"("+str(arguments)+")") + result, formatted_response = execute_tool_call_with_response(func_name, arguments, tools) + + # Store important results for future reference + if func_name == "get_current_location" and "location" in result: + tool_context["location"] = result["location"] + + tool_results.append({ + "function": func_name, + "arguments": arguments, + "result": result + }) + print(" Result: "+str(result)) + + # Add assistant response to conversation + conversation_messages.append({ + "role": "assistant", + "content": response + }) + + # Add tool results to conversation as function messages + for tool_result in tool_results: + if "error" not in tool_result["result"]: + conversation_messages.append({ + "role": "system", + "content": "The result of the tool " +tool_result['function']+" is: "+TOOL_RESPONSE_START_TOKEN+tool_result['result']+TOOL_RESPONSE_END_TOKEN + }) + prompt = "continuation" + if step == 0: + prompt = user_prompt + # Store step result + step_result = { + "step": step + 1, + "prompt": prompt, + "response": response, + "tool_calls": tool_calls, + "tool_results": tool_results, + "has_errors": len([True for tr in tool_results if "error" in tr["result"]]) > 0, + "tool_context": tool_context, + "conversation_messages": conversation_messages + } + step_results.append(step_result) + + # Check if all tool calls were successful + if step_result["has_errors"]: + print("⚠ Stopping due to tool execution errors") + break + + # Simple continuation logic: if no tools were called, we're done + if not tool_calls: + print("✓ Completed after "+str(step + 1)+" step(s) - no tool calls needed") + break + + # If we've reached max steps, stop + if step >= max_steps - 1: + print("✓ Reached maximum steps ("+str(max_steps)+")") + break + + # If tools were executed, continue to next step to see if model wants to do more + print("✓ Step "+str(step + 1)+" completed with "+str(len(tool_calls))+" tool call(s) - continuing...") + + except Exception as e: + print("Error in step "+str(step + 1)+": "+str(e)) + prompt_text = "" + if step == 0: + prompt_text = user_prompt + else: + prompt_text = 
"continuation" + step_results.append({ + "step": step + 1, + "prompt": prompt_text, + "error": str(e), + "response": None, + "tool_calls": [], + "tool_results": [], + "tool_context": tool_context, + "conversation_messages": conversation_messages + }) + break + + return step_results + +def run_tool_calling_demo(input): + """Run tool calling demonstration""" + print("=== Qwen3 1.7B Tool Calling Demo ===\n") + print("Model: "+model_id) + + # Ensure tokenizer has necessary tokens + tokenizer = tokenizers.from_json(input["tokenizer_config_dict"]) + + # Get tool names without using list() + tool_names = [] + for key in tools_dict.keys(): + tool_names.append(key) + print("Available tools: "+str(tool_names)) + + demo_prompts = [ + "What's the weather here today?", + "Calculate 15 * 23", + "What time is it in JST timezone?", + "Where am I located?", + "Get my location and check the weather there" + ] + + i = 1 + for user_prompt in demo_prompts: + print("\nDemo "+str(i)+": "+user_prompt) + print("--------------------------------") + step_results = handle_multi_step_request(user_prompt, 4, 400, tools_dict, tls, tokenizer, input["config_dict"]) + # Show final summary + print("\nMulti-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = [] + if "tool_calls" in step_result: + tool_calls = step_result["tool_calls"] + if tool_calls: + print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(" ✓ "+func_name) + print("\n" + "--------------------------------") + i = i + 1 diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py new file mode 100644 index 00000000..df884e78 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py @@ -0,0 +1,211 @@ +# Initialize empty tool schema and tools mapping +tool_schema = [] +tools_dict = {} + +# Define tool functions +def get_weather(location, unit): + """Get current weather for a location""" + # Mock weather data - in real app would call weather API + weather_data = { + "San Francisco": {"temp": 18, "condition": "foggy", "humidity": 75}, + "New York": {"temp": 22, "condition": "partly cloudy", "humidity": 60}, + "London": {"temp": 15, "condition": "rainy", "humidity": 85}, + "Tokyo": {"temp": 26, "condition": "sunny", "humidity": 50}, + "Sydney": {"temp": 20, "condition": "clear", "humidity": 65} + } + + # Simple location matching + location_key = "Unknown" + for key in weather_data.keys(): + if key.lower() in location.lower() or location.lower() in key.lower(): + location_key = key + break + + if location_key == "Unknown": + return {"error": "Weather data not available for " + location} + + data = weather_data[location_key] + temp = data["temp"] + unit_str = "°C" # Initialize with default value + + if unit == "fahrenheit": + temp = temp * 9.0 / 5.0 + 32 + temp = int(temp * 10) / 10.0 # Manual rounding to 1 decimal place + unit_str = "°F" + else: + unit_str = "°C" + + return { + "location": location_key, + "temperature": temp, + "condition": data["condition"], + "humidity": data["humidity"], + "unit": unit_str + } + +def calculate_math(expression): + """Calculate a mathematical expression safely""" + try: + # Clean the expression + expression = expression.strip() + + # Handle multiplication + if "*" in expression: + parts = expression.split("*") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + 
result = a * b + return {"expression": expression, "result": result} + + # Handle addition + if "+" in expression: + parts = expression.split("+") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a + b + return {"expression": expression, "result": result} + + # Handle subtraction + if "-" in expression: + parts = expression.split("-") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a - b + return {"expression": expression, "result": result} + + # Handle division + if "/" in expression: + parts = expression.split("/") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + if b != 0: + result = a / b + return {"expression": expression, "result": result} + else: + return {"error": "Division by zero"} + + return {"error": "Unsupported expression"} + except Exception as e: + return {"error": "Calculation error"} + +def get_current_time(timezone): + """Get current time in specified timezone""" + # Mock time data - in real app would use proper timezone handling + # Using fixed time values for simulation since we don't have datetime + time_data = { + "UTC": {"time": "2024-01-15 12:00:00", "day": "Monday"}, + "PST": {"time": "2024-01-15 04:00:00", "day": "Monday"}, + "EST": {"time": "2024-01-15 07:00:00", "day": "Monday"}, + "GMT": {"time": "2024-01-15 12:00:00", "day": "Monday"}, + "JST": {"time": "2024-01-15 21:00:00", "day": "Monday"}, + "AEST": {"time": "2024-01-15 22:00:00", "day": "Monday"} + } + + tz = timezone.upper() + if tz in time_data: + data = time_data[tz] + return { + "timezone": tz, + "time": data["time"], + "day_of_week": data["day"] + } + else: + # Default to UTC if timezone not found + data = time_data["UTC"] + return { + "timezone": "UTC", + "time": data["time"], + "day_of_week": data["day"] + } + +def get_current_location(): + """Get the real location and timezone of the user""" + return { + "location": "San Francisco", + "country": "United States", + "coordinates": {"latitude": 37.7749, "longitude": -122.4194}, + "timezone": "PST" + } + +# Create tools dictionary +tools_dict = { + "get_weather": get_weather, + "calculate_math": calculate_math, + "get_current_time": get_current_time, + "get_current_location": get_current_location +} + +# Define tool schema +tool_schema = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information for a specific location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for" + }, + "unit": { + "type": "string", + "description": "Temperature unit (celsius or fahrenheit)", + "default": "celsius" + } + }, + "required": ["location"] + } + } + }, + { + "type": "function", + "function": { + "name": "calculate_math", + "description": "Calculate a mathematical expression safely", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Mathematical expression to calculate (supports +, -, *, /)" + } + }, + "required": ["expression"] + } + } + }, + { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in specified timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone (UTC, EST, PST, JST, CET)", + "default": "UTC" + } + } + } + } + }, + { + "type": "function", + "function": { + "name": "get_current_location", + 
"description": "Get the real location and timezone of the user", + "parameters": { + "type": "object", + "properties": {} + } + } + } +] \ No newline at end of file diff --git a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py new file mode 100644 index 00000000..0b3743a2 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +""" +Main driver script for running Qwen demo with tool calling +""" + +import sys +sys.path.append('../../../') + +from deliteai import simulator +import json + +def main(): + """Run the Qwen demo""" + print("=== Running Qwen Demo ===") + print("This demo shows Qwen model and tool calling capabilities\n") + + model_name = "qwen3-1.7b" + vocab_file = "./qwen/vocab.json" + merges_file = "./qwen/merges.txt" + config_file = "./qwen/config.json" + tokenizer_config_file = "./qwen/tokenizer.json" + + # Module configuration for simulator + modules = [ + { + "name": "qwen_modules", + "version": "1.0.0", + "type": "script", + "location": { + "path": "./qwen_modules.zip" + } + } + ] + + # Add model if requested + + modules.append({ + "name": model_name, + "version": "1.0.0", + "type": "model", + "location": { + "path": "./qwen/onnx/model_q4f16.onnx" + } + }) + print(f"Added model: {model_name}") + + + with open(vocab_file, encoding="utf-8") as vocab_handle: + vocab = json.load(vocab_handle) + + bpe_merges = [] + with open(merges_file, encoding="utf-8") as merges_handle: + i = 0 + for line in merges_handle: + line = line.strip() + if (i == 0 and line.startswith("#version:")) or not line: + i = i + 1 + continue + bpe_merges.append(tuple(line.split())) + i = i + 1 + + with open(config_file, encoding="utf-8") as config_handle: + config_dict = json.load(config_handle) + + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + tokenizer_config_dict = json.load(tokenizer_config_handle) + + # Initialize simulator + print("\nInitializing simulator...") + config = {"debug": True, "online": False} + + # Initialize with modules + if not simulator.initialize(json.dumps(config), modules): + print("Failed to initialize simulator") + return + + print("Simulator initialized successfully") + + # Run the main function + print("\nRunning Qwen workflow...\n") + result = simulator.run_method( + "run_tool_calling_demo", + { + "vocab": vocab, + "merges": bpe_merges, + "config_dict": config_dict, + "tokenizer_config_dict": tokenizer_config_dict, + "model_name": model_name + } + ) + + print("\n=== Demo Complete ===") + if result.get("success"): + print("✅ Demo completed successfully!") + if result.get("model_loaded"): + print(" Model was loaded and inference attempted") + else: + print(" Tool demonstrations completed") + else: + print(f"❌ Demo failed: {result.get('error', 'Unknown error')}") + + +if __name__ == "__main__": + main() diff --git a/nimblenet_py/simulation_assets/tokenizer_example.py b/nimblenet_py/simulation_assets/tokenizer_example.py new file mode 100644 index 00000000..846e4b64 --- /dev/null +++ b/nimblenet_py/simulation_assets/tokenizer_example.py @@ -0,0 +1,298 @@ +# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Tokenizers Integration Example for DeliteAI + +This module demonstrates how to use tokenizers in DeliteAI's delitepy runtime. +DeliteAI includes support for tokenizers through the `delitepy.tokenizers` module, +which provides a Python interface to the mlc-ai/tokenizers-cpp library. 
+ +Supported Tokenizer Types: + - HuggingFace Tokenizers: JSON format tokenizers from HuggingFace Hub + - SentencePiece: Google's SentencePiece tokenizers (.model files) + - RWKV World: RWKV tokenizers + - Custom JSON: Manually created tokenizer configurations + +Basic Usage: + from delitepy import tokenizers + + # Load tokenizer + tokenizer = tokenizers.from_json(json_config) + + # Encode text + token_ids = tokenizer.encode("Hello world!") + + # Decode back to text + decoded = tokenizer.decode(token_ids) + +Integration Details: + The tokenizers module is implemented as: + 1. C++ Wrapper: TokenizersDataVariable class wraps mlc-ai/tokenizers-cpp + 2. DelitePy Integration: Functions exposed through delitepy import system + 3. Memory Management: Tokenizer instances managed automatically + 4. Error Handling: Proper exception handling for all operations + +Platform Support: + - Linux (x86_64, ARM64) + - macOS (Intel, Apple Silicon) + - iOS (device and simulator) + - Android (ARM64, ARMv7, x86_64) + - Windows (x86_64) + +Performance Notes: + - Tokenizer creation is expensive; reuse instances when possible + - Token encoding/decoding is fast and suitable for real-time use + - Cross-platform deployment supported on all major platforms + +Dependencies: + - Rust toolchain (for building underlying tokenizers library) + - CMake 3.18+ (for build system) + - C++17 support (for wrapper implementation) + +For cross-compilation, install appropriate Rust targets: + # For iOS + rustup target add aarch64-apple-ios aarch64-apple-ios-sim + + # For Android + rustup target add aarch64-linux-android armv7-linux-androideabi + +Examples: + This module contains comprehensive test functions demonstrating: + - Basic tokenizer creation and usage + - Advanced tokenizer with special tokens + - Error handling and validation + - Combined test scenarios +""" + +from delitepy import tokenizers + +def test_tokenizers(params): + """ + Test basic tokenizer functionality with a simple BPE tokenizer. + + This function demonstrates the core tokenizer operations: + - Creating a tokenizer from JSON configuration + - Encoding text to token IDs + - Decoding token IDs back to text + - Vocabulary size queries + - Token/ID conversions + + The test uses a minimal BPE tokenizer with a small vocabulary containing + basic words like "hello", "world", and punctuation. + + Returns: + dict: Test results containing: + - status (str): "success" or "error" + - vocab_size (int): Size of the tokenizer vocabulary + - encoded_length (int): Number of tokens produced + - decoded_text (str): Text after encode/decode round-trip + - hello_token_id (int): Token ID for "hello" + - token_0 (str): Token corresponding to ID 0 + - message (str): Error message if status is "error" + + Example: + >>> results = test_tokenizers() + >>> assert results["status"] == "success" + >>> assert results["decoded_text"] == "hello world!" + """ + + # Define a simple tokenizer configuration + json_config = '''{ + "version": "1.0", + "added_tokens": [], + "model": { + "type": "BPE", + "vocab": {"h": 0, "e": 1, "l": 2, "o": 3, " ": 4, "w": 5, "r": 6, "d": 7, "!": 8, "hello": 9, "world": 10}, + "merges": [] + } + }''' + # Create tokenizer from JSON + tokenizer = tokenizers.from_json(json_config) + + # Test encoding + text = "hello world!" 
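+    # With an empty `merges` list this BPE config should stay at the character level
+    # (it cannot combine characters into the multi-character "hello"/"world" vocab
+    # entries), so the encode/decode round trip is expected to return space-separated
+    # characters rather than the original string; the decoded_text assertion in the
+    # accompanying simulation test checks exactly that.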
+ token_ids = tokenizer.encode(text) + + # Test decoding + decoded_text = tokenizer.decode(token_ids) + + # Test vocabulary operations + vocab_size = tokenizer.get_vocab_size() + + # Test token/ID conversion + token_id = tokenizer.token_to_id("hello") # Look up "hello" token + token = tokenizer.id_to_token(0) + + return { + "status": "success", + "vocab_size": vocab_size, + "encoded_length": len(token_ids), + "decoded_text": decoded_text, + "hello_token_id": token_id, + "token_0": token + } + +def test_sentencepiece_style(params): + """ + Test advanced tokenizer functionality with special tokens and BPE merges. + + This function demonstrates more sophisticated tokenizer features: + - Special tokens ([UNK], [CLS], [SEP]) for sequence classification + - Comprehensive vocabulary with alphabet and common words + - BPE merge rules for subword tokenization + - Longer text processing capabilities + + The tokenizer created includes: + - Full alphabet (a-z) + - Common punctuation and space + - Frequent English words (the, and, of, etc.) + - Special classification tokens used in BERT-style models + - BPE merge rules for common character combinations + + Returns: + dict: Test results containing: + - status (str): "success" or "error" + - vocab_size (int): Size of the tokenizer vocabulary (100+ tokens) + - text (str): Input text used for testing + - encoded_length (int): Number of tokens after encoding + - decoded_text (str): Reconstructed text after decode + - cls_id (int): Token ID for [CLS] special token + - sep_id (int): Token ID for [SEP] special token + - unk_id (int): Token ID for [UNK] unknown token + - cls_token (str): Token string for CLS ID lookup + - message (str): Error message if status is "error" + + Example: + >>> results = test_sentencepiece_style() + >>> assert results["status"] == "success" + >>> assert results["vocab_size"] > 100 + >>> assert results["cls_id"] == 101 + """ + + # Create a more comprehensive tokenizer with special tokens + json_config = '''{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + {"id": 100, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}, + {"id": 101, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}, + {"id": 102, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true} + ], + "normalizer": null, + "pre_tokenizer": null, + "post_processor": null, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "[UNK]", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "vocab": { + " ": 0, "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, + "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, + "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, + ".": 27, ",": 28, "!": 29, "?": 30, + "the": 31, "and": 32, "of": 33, "to": 34, "a": 35, "in": 36, "for": 37, "is": 38, "on": 39, "that": 40, + "by": 41, "this": 42, "with": 43, "i": 44, "you": 45, "it": 46, "not": 47, "or": 48, "be": 49, "are": 50, + "from": 51, "at": 52, "as": 53, "your": 54, "all": 55, "any": 56, "can": 57, "had": 58, "her": 59, "was": 60, + "one": 61, "our": 62, "out": 63, "day": 64, "get": 65, "has": 66, "him": 67, "his": 68, "how": 69, "man": 70, + "new": 71, "now": 72, "old": 73, "see": 74, "two": 75, "way": 76, "who": 77, "boy": 78, "did": 79, 
"its": 80, + "let": 81, "put": 82, "say": 83, "she": 84, "too": 85, "use": 86, + "qu": 87, "th": 88, "er": 89, "an": 90, "re": 91, "ed": 92, "nd": 93, "on": 94, "en": 95, "at": 96, "es": 97, "or": 98, "ti": 99, + "[UNK]": 100, "[CLS]": 101, "[SEP]": 102 + }, + "merges": [ + "q u", "t h", "e r", "a n", "r e", "e d", "n d", "o n", "e n", "a t", "e s", "o r", "t i" + ] + } + }''' + + # Create tokenizer from JSON + tokenizer = tokenizers.from_json(json_config) + + # Test with longer text + text = "the quick brown fox jumps" + token_ids = tokenizer.encode(text) + + # Test decoding + decoded_text = tokenizer.decode(token_ids) + + # Test vocabulary operations + vocab_size = tokenizer.get_vocab_size() + + # Test special token lookups + cls_id = tokenizer.token_to_id("[CLS]") + sep_id = tokenizer.token_to_id("[SEP]") + unk_id = tokenizer.token_to_id("[UNK]") + cls_token = tokenizer.id_to_token(101) + + # Create result dictionary + result = { + "status": "success", + "vocab_size": vocab_size, + "text": text, + "encoded_length": len(token_ids), + "decoded_text": decoded_text, + "cls_id": cls_id, + "sep_id": sep_id, + "unk_id": unk_id, + "cls_token": cls_token + } + + # Add first token ID separately to avoid ternary operator + if len(token_ids) > 0: + result["first_token_id"] = token_ids[0] + else: + result["first_token_id"] = -1 + + return result + + +def run_all_tests(params): + """ + Run all tokenizer tests and return combined results. + + This function executes both the basic and advanced tokenizer tests, + collecting results from each test case and providing an overall + status summary. + + Returns: + dict: Combined test results containing: + - overall_status (str): "success" if all tests pass, "error" otherwise + - basic_test (dict): Results from test_tokenizers() + - comprehensive_test (dict): Results from test_sentencepiece_style() + - message (str): Summary message or error details + + Example: + >>> results = run_all_tests() + >>> assert results["overall_status"] == "success" + >>> assert results["basic_test"]["status"] == "success" + >>> assert results["comprehensive_test"]["status"] == "success" + """ + + # Initialize overall status + overall_status = "success" + + # Run basic tokenizer test + basic_results = test_tokenizers({}) + + # Run comprehensive tokenizer test + comprehensive_results = test_sentencepiece_style({}) + + # Check if any test failed + if basic_results["status"] != "success": + overall_status = "error" + if comprehensive_results["status"] != "success": + overall_status = "error" + + return { + "overall_status": overall_status, + "basic_test": basic_results, + "comprehensive_test": comprehensive_results, + "message": "All tests completed successfully with status: " + overall_status + } diff --git a/nimblenet_py/simulation_tests/test_simulator_script.py b/nimblenet_py/simulation_tests/test_simulator_script.py index 453eb0ea..8c005f06 100644 --- a/nimblenet_py/simulation_tests/test_simulator_script.py +++ b/nimblenet_py/simulation_tests/test_simulator_script.py @@ -30,7 +30,7 @@ def test_simulator(): } ] - # initialize nimblenet + # initialize nimblenet assert simulator.initialize('''{"debug": true, "online": false}''', modules) input = {"singleString": "singleString", "singleFloat": 10.10, "boolTensor": np.full((3), True, dtype=bool)} @@ -71,7 +71,7 @@ def test_nested_json(): ] assert simulator.initialize('''{"debug": true, "online": false}''', modules) - + nestedJson = {"key1": 1, "key2": [1, 2, 3, "fsd"], "key3": "data1", "key4": {"fsd": "fdsd", "uio": 1.89}, "key5": 
[{"x": 1}], "bigValue": 12345678910} nestedArray = [{"key1": 1, "key2": [1, 2, 3, "fsd"], "key3": "data1", "key4": {"fsd": "fdsd", "uio": 1.89}, "key5": [{"x": 1}]}, "dfs"] input = { @@ -79,7 +79,7 @@ def test_nested_json(): "nestedJson": nestedJson, "nestedArray": nestedArray} output = simulator.run_method("add_initial_data", input, int(28931)) - + assert len(output) == 4 assert output["nestedJson"] == expectedOutput["nestedJson"] assert np.all(np.array(output["nestedArray"]) == np.array(expectedOutput["nestedArray"])) @@ -274,7 +274,7 @@ def get_items(): print(f"Found item: {item['ProductName']} {item}") yield item item = simulator.run_method("get_next_item", {})["item"] - + items = list(get_items()) @@ -324,7 +324,7 @@ def test_class_support(): ] assert simulator.initialize('''{"online": false}''', modules) - + def assert_callback(output): print("asserting callback", output) assert output["workflow_output"] == output["actual_output"] @@ -336,7 +336,7 @@ def test_script(val): test_script(1) test_script(2) test_script(3) - + def test_invalid_dataType_model(): modules = [ @@ -357,7 +357,7 @@ def test_invalid_dataType_model(): } } ] - + assert simulator.initialize('''{"online": false}''', modules) output = simulator.run_method("invalid_model_function", {}) @@ -375,14 +375,14 @@ def test_multi_threading(): } } ] - + import psutil process = psutil.Process(os.getpid()) taskThreadIndex = process.num_threads() # While loading the script, number of threads should increase assert simulator.initialize('''{"online": false}''', modules) - + if {"GENAI"}.issubset(build_flags): assert process.num_threads() == taskThreadIndex + 6 else: @@ -398,7 +398,7 @@ def test_multi_threading(): def test(n): output = simulator.run_method("test_parallel", {"n": n}) - assert output["incorrectTotal"] < n + assert output["incorrectTotal"] < n assert output["correctTotal"] == n indexTensor = np.array([x for x in range(n)], np.int64) squareTensor = np.array([x**2 for x in range(n)], np.int64) @@ -433,7 +433,7 @@ def test_multi_threading_with_limited_threads(): ] assert simulator.initialize('''{"online": false}''', modules) - + def test(n): # Test with limited number of threads output = simulator.run_method("test_parallel_inside_parallel", {"n": n}) @@ -442,7 +442,7 @@ def test(n): for k in range(n): assert str(k) in output["map"] time.sleep(0.050) - + test(10) @@ -582,10 +582,10 @@ def test_list_operations(): # Test multiple conditions in list comprehensions simulator.run_method("test_multiple_conditions", {}) - + # Test modulo operations - assertions are in the test functions simulator.run_method("test_mod_operations", {}) - + # Test concatenation edge cases - assertions are in the test functions simulator.run_method("test_concatenation_edge_cases", {}) @@ -619,7 +619,60 @@ def test_python_modules(): assert "module1_run not defined in task" in repr(err) print("All python modules test passed!") - + + +def test_tokenizers(): + """Test tokenizer functionality using the delitepy.tokenizers module.""" + modules = [ + { + "name": "workflow_script", + "version": "1.0.0", + "type": "script", + "location": { + "path": "../simulation_assets/tokenizer_example.py" + } + } + ] + + assert simulator.initialize("""{"debug": true, "online": false}""", modules) + + # Test basic tokenizer functionality + basic_results = simulator.run_method("test_tokenizers", {}) + print(f"Basic tokenizer test results: {basic_results}") + + # Assert basic test succeeded + assert basic_results["status"] == "success" + assert basic_results["vocab_size"] == 
11 # h, e, l, o, space, w, r, d, !, hello, world + assert basic_results["encoded_length"] > 0 + assert basic_results["decoded_text"] == "h e l l o w o r l d !" + assert basic_results["hello_token_id"] == 9 # token ID for 'hello' + assert basic_results["token_0"] == "h" + + # Test more comprehensive tokenizer + comprehensive_results = simulator.run_method("test_sentencepiece_style", {}) + print(f"Comprehensive tokenizer test results: {comprehensive_results}") + + # Assert comprehensive test succeeded + assert comprehensive_results["status"] == "success" + assert comprehensive_results["vocab_size"] >= 98 # Should include all vocab + special tokens + assert comprehensive_results["text"] == "the quick brown fox jumps" + assert comprehensive_results["encoded_length"] > 0 + assert comprehensive_results["cls_id"] == 101 + assert comprehensive_results["sep_id"] == 102 + assert comprehensive_results["unk_id"] == 100 + assert comprehensive_results["cls_token"] == "[CLS]" + + # Test combined results + all_results = simulator.run_method("run_all_tests", {}) + print(f"All tokenizer tests results: {all_results}") + + assert all_results["overall_status"] == "success" + assert all_results["basic_test"]["status"] == "success" + assert all_results["comprehensive_test"]["status"] == "success" + + print("All tokenizer tests passed!") + if __name__ == "__main__": test_simulator() test_python_modules() + test_tokenizers() diff --git a/third_party/README.md b/third_party/README.md index 65763de1..1df7ec03 100644 --- a/third_party/README.md +++ b/third_party/README.md @@ -1,6 +1,29 @@ ## Build Dependencies Executors used in the SDK are downloaded from S3(Bucket: **deliteai**). Following are the steps that were used to create them. +## Tokenizers-cpp + +The project uses [mlc-ai/tokenizers-cpp](https://github.com/mlc-ai/tokenizers-cpp) as a git submodule for cross-platform tokenizer support. + +### Prerequisites +- Rust toolchain (install from [rustup.rs](https://rustup.rs/)) +- Cargo (comes with Rust) +- For cross-compilation, install appropriate Rust targets: + - iOS: `rustup target add aarch64-apple-ios aarch64-apple-ios-sim` + - Android: `rustup target add aarch64-linux-android armv7-linux-androideabi` + +### Integration +The tokenizers-cpp library is automatically built as part of the main CMake build process. It provides: +- **libtokenizers_c.a**: C bindings to tokenizers Rust library +- **libsentencepiece.a**: SentencePiece static library +- **libtokenizers_cpp.a**: C++ binding implementation + +The library supports: +- HuggingFace tokenizers (JSON format) +- SentencePiece tokenizers (.model format) +- RWKV World tokenizers +- Cross-platform deployment (iOS, Android, Windows, Linux, macOS) + ## Onnxruntime ### Android diff --git a/third_party/tokenizers-cpp b/third_party/tokenizers-cpp new file mode 160000 index 00000000..f7771096 --- /dev/null +++ b/third_party/tokenizers-cpp @@ -0,0 +1 @@ +Subproject commit f77710965a3bcae85b7a00bdddbfc1adadef0e32 From 92bae73708e2bdbbe0bb3dbea0ff7f01554275d1 Mon Sep 17 00:00:00 2001 From: Varun Khare Date: Wed, 23 Jul 2025 14:18:44 +0000 Subject: [PATCH 3/7] # This is a combination of 5 commits. 
# This is the 1st commit message: add support for dictionary indexing in onnx executor Signed-off-by: Varun Khare # This is the commit message #2: add dictionary input support to model.run() for kv_cache Signed-off-by: Varun Khare # This is the commit message #3: add fp16 support in delitepy Signed-off-by: Varun Khare # This is the commit message #4: Qwen with tool calling functional in delitePy Signed-off-by: Varun Khare # This is the commit message #5: Implemented enumerate and next in DelitePy (#162) * Implemented enumerate and next in DelitePy Signed-off-by: Atul Jain * Cosmetics Signed-off-by: Puneet Jindal --------- Signed-off-by: Puneet Jindal Co-authored-by: Atul Jain Co-authored-by: Puneet Jindal --- .gitignore | 3 +- coreruntime/nimblenet/CMakeLists.txt | 5 +- .../include/nimble_net_util.hpp | 1 + .../data_variable/include/data_variable.hpp | 14 + .../include/data_variable_templates.ipp | 26 +- .../include/enumerate_data_variable.hpp | 45 ++ .../include/iterable_data_variable.hpp | 28 +- .../include/model_nimble_net_variable.hpp | 17 +- .../data_variable/include/single_variable.hpp | 2 + .../data_variable/src/data_variable.cpp | 1 + .../src/enumerate_data_variable.cpp | 67 +++ .../src/model_nimble_net_variable.cpp | 35 ++ .../src/tensor_data_variable.cpp | 4 + .../onnx/include/task_onnx_model.hpp | 108 ++-- .../executors/onnx/src/task_onnx_model.cpp | 193 ++++++- .../operators/include/binary_operators.hpp | 10 +- .../operators/include/custom_functions.hpp | 30 ++ .../operators/include/operator_types.hpp | 6 +- .../operators/src/custom_functions.cpp | 30 ++ coreruntime/nimblenet/util/include/util.hpp | 5 + coreruntime/nimblenet/util/src/util.cpp | 3 + models/LFM2/demo_lfm.py | 297 ---------- models/Qwen3-1.7B/demo_qwen.py | 269 ++++++--- models/Qwen3-1.7B/export.py | 509 ++++++++++++++++++ .../simulation_assets/dict_model_test.py | 144 +++++ .../qwen_demo/qwen_modules.zip | Bin 8017 -> 0 bytes .../qwen_demo/qwen_modules/main.py | 250 ++++----- .../simulation_assets/qwen_demo/run_demo.py | 11 +- .../simulation_tests/test_simulator_script.py | 157 ++++++ 29 files changed, 1707 insertions(+), 563 deletions(-) create mode 100644 coreruntime/nimblenet/data_variable/include/enumerate_data_variable.hpp create mode 100644 coreruntime/nimblenet/data_variable/src/enumerate_data_variable.cpp delete mode 100755 models/LFM2/demo_lfm.py create mode 100755 models/Qwen3-1.7B/export.py create mode 100644 nimblenet_py/simulation_assets/dict_model_test.py delete mode 100644 nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip diff --git a/.gitignore b/.gitignore index 10f95c82..a04eada2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ third_party/runtime/ !third_party/runtime/CMakeLists.txt __pycache__/ .pytest_cache/ -nimblenet_py/simulation_tests/NimbleSDK +**/NimbleSDK +models/**/data diff --git a/coreruntime/nimblenet/CMakeLists.txt b/coreruntime/nimblenet/CMakeLists.txt index 6c87561a..a4ff333d 100644 --- a/coreruntime/nimblenet/CMakeLists.txt +++ b/coreruntime/nimblenet/CMakeLists.txt @@ -23,8 +23,7 @@ set(BASE data_variable/src/custom_func_data_variable.cpp data_variable/src/data_variable.cpp data_variable/src/dataframe_variable.cpp - data_variable/src/single_variable.cpp - data_variable/src/tensor_data_variable.cpp + data_variable/src/enumerate_data_variable.cpp data_variable/src/filtered_dataframe_variable.cpp data_variable/src/future_data_variable.cpp data_variable/src/list_data_variable.cpp @@ -37,6 +36,8 @@ set(BASE data_variable/src/raw_event_store_data_variable.cpp 
data_variable/src/regex_data_variable.cpp data_variable/src/tokenizers_data_variable.cpp + data_variable/src/single_variable.cpp + data_variable/src/tensor_data_variable.cpp job_scheduler/src/base_job.cpp job_scheduler/src/job_scheduler.cpp job_scheduler/src/asset_download_job.cpp diff --git a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp index 35b7a2ce..2f8ecd5c 100644 --- a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp +++ b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp @@ -71,6 +71,7 @@ enum DATATYPE { EXCEPTION = 685, UNKNOWN = 0, FLOAT = 1, + FLOAT16 = 2, BOOLEAN = 9, INT32 = 6, INT64 = 7, diff --git a/coreruntime/nimblenet/data_variable/include/data_variable.hpp b/coreruntime/nimblenet/data_variable/include/data_variable.hpp index eba4c30c..8e7e09d5 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/data_variable.hpp @@ -99,6 +99,14 @@ constexpr inline bool is_integer(); } \ } while (0) +#define THROW_OPTIONAL_ARGUMENTS_NOT_MATCH_FUNCTION_NAME(argsSize, expectedSize1, expectedSize2, functionName) \ + do { \ + if ((argsSize) != (expectedSize1) && (argsSize) != (expectedSize2)) { \ + THROW("%s expects %d or %d argument(s), %d provided", \ + functionName, expectedSize1, expectedSize2, argsSize); \ + } \ + } while (0) + #define THROW_ARGUMENT_DATATYPE_NOT_MATCH(dataType, expectedDataType, argIndex, funcIndex) \ do { \ if (dataType != expectedDataType) { \ @@ -178,6 +186,8 @@ class DataVariable : public std::enable_shared_from_this { virtual bool is_integer() { return false; } virtual bool is_none() { return false; } + + virtual bool is_iterable() const { return false; } const char* get_containerType_string() const; @@ -258,6 +268,8 @@ class DataVariable : public std::enable_shared_from_this { virtual uint8_t cast_uint8() { return get_uint8(); } + virtual uint16_t cast_uint16() { return get_uint16(); } + virtual int8_t cast_int8() { return get_int8(); } virtual int32_t get_int32() { THROW_UNSUPPORTED("get_int32"); } @@ -272,6 +284,8 @@ class DataVariable : public std::enable_shared_from_this { virtual uint8_t get_uint8() { THROW_UNSUPPORTED("get_uint8"); } + virtual uint16_t get_uint16() { THROW_UNSUPPORTED("get_uint16"); } + virtual std::string get_string() const { THROW_UNSUPPORTED("get_string"); } virtual bool get_bool() = 0; diff --git a/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp b/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp index 28e20d8d..f2ec55b6 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp +++ b/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp @@ -34,6 +34,12 @@ constexpr inline bool is_numeric() { return true; } +template <> +constexpr inline bool is_numeric() { + // fp16 is numeric + return true; +} + template <> constexpr inline bool is_numeric() { return false; @@ -74,6 +80,12 @@ constexpr inline bool is_integer() { return true; } +template <> +constexpr inline bool is_integer() { + // fp16 is not an integer type + return false; +} + template <> constexpr inline bool is_integer() { return false; @@ -104,6 +116,11 @@ constexpr inline int get_dataType_enum() { return DATATYPE::INT64; } +template <> +constexpr inline int get_dataType_enum() { + return DATATYPE::FLOAT16; +} + template <> constexpr inline int get_dataType_enum() { return DATATYPE::DOUBLE; @@ -134,6 
+151,13 @@ inline float DataVariable::get() { return get_float(); } +template <> +inline uint16_t DataVariable::get() { + // For fp16, we return the raw uint16_t representation + // The caller can convert this to actual fp16 if needed + return get_uint16(); +} + template <> inline int64_t DataVariable::get() { return get_int64(); @@ -159,4 +183,4 @@ inline bool DataVariable::get() { template <> inline nlohmann::json DataVariable::get() { return get_json_data(); -} \ No newline at end of file +} diff --git a/coreruntime/nimblenet/data_variable/include/enumerate_data_variable.hpp b/coreruntime/nimblenet/data_variable/include/enumerate_data_variable.hpp new file mode 100644 index 00000000..a99372ac --- /dev/null +++ b/coreruntime/nimblenet/data_variable/include/enumerate_data_variable.hpp @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "data_variable.hpp" +#include "iterable_data_variable.hpp" + +/** + * @brief DataVariable that yields index-value pairs for iterables + * + * EnumerateDataVariable wraps an existing iterable and produces a sequence of + * (index, value) pairs. Each call to `next()` returns a tuple containing the current + * index and the corresponding element from the underlying iterable. + */ +class EnumerateDataVariable final : public IterableDataVariable { + private: + OpReturnType _iterable; /**< The underlying iterable to enumerate over */ + int _startIndex = 0; /**< Starting index for enumeration */ + int _size = -1; /**< Cached size of the underlying iterable */ + + public: + EnumerateDataVariable(OpReturnType iterable, int startIndex = 0); + + int get_dataType_enum() const override { return _iterable->get_dataType_enum(); } + + int get_containerType() const override { return _iterable->get_containerType(); } + + std::string print() override { return fallback_print(); } + + nlohmann::json to_json() const override { return "[Enumerate]"; } + + int get_size() override { return _size; } + + OpReturnType get_int_subscript(int index) override; + + void reset_iterator() override; + + OpReturnType next(CallStack& stack) override; + + bool get_bool() override; +}; diff --git a/coreruntime/nimblenet/data_variable/include/iterable_data_variable.hpp b/coreruntime/nimblenet/data_variable/include/iterable_data_variable.hpp index c5be67f5..7f9e2da0 100644 --- a/coreruntime/nimblenet/data_variable/include/iterable_data_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/iterable_data_variable.hpp @@ -5,6 +5,7 @@ */ #pragma once + #include "data_variable.hpp" /** @@ -21,13 +22,28 @@ class IterableDataVariable : public DataVariable { bool _iterExhausted = false; /**< Flag indicating if iteration has been exhausted */ public: - // Reset the iterator to start from the beginning + /** + * @brief Indicates whether the variable is iterable + * + * @return Always returns true + */ + bool is_iterable() const override { return true; } + + /** + * @brief Resets the iterator to start from the beginning + */ virtual void reset_iterator() { _iterPosition = 0; _iterExhausted = false; } - // Get the next value in the iteration, or throw StopIteration + /** + * @brief Retrieves the next value in the iteration + * + * @param stack The current execution call stack + * @return The next element in the iteration + * @throws StopIteration if the iterator is exhausted + */ virtual OpReturnType next(CallStack& stack) override { if (_iterExhausted || _iterPosition >= get_size()) { _iterExhausted = 
true; @@ -36,7 +52,11 @@ class IterableDataVariable : public DataVariable { return get_int_subscript(_iterPosition++); } - // Check if the iterator is exhausted + /** + * @brief Checks if the iterator is exhausted + * + * @return true if iteration is complete, false otherwise + */ bool is_exhausted() const { return _iterExhausted; } }; @@ -111,4 +131,4 @@ class IterableOverScriptable : public IterableDataVariable { nlohmann::json to_json() const override { return _data->to_json(); } bool get_bool() override { return _data->get_bool(); } -}; \ No newline at end of file +}; diff --git a/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp b/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp index 97e5359b..fad061fb 100644 --- a/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp @@ -54,13 +54,24 @@ class ModelNimbleNetVariable final : public DataVariable { Parameters ---------- - args : *Tensor + args : *Tensor or dict Input tensors to the model in the order they are expected in the model. + Alternatively, can accept a single dictionary mapping input names to tensors. Returns ---------- - modelOutput : tuple[Tensor, ...] - Returns the output tensors of model as a tuple. The order of tensors is the same as defined during model construction. + modelOutput : tuple[Tensor, ...] or dict + Returns the output tensors of model as a tuple when using tensor arguments. + Returns a dictionary mapping output names to tensors when using dictionary input. + + Examples + -------- + # Traditional tensor arguments + >>> output = model.run(input1, input2) + + # Dictionary input (new feature) + >>> input_dict = {"input1": tensor1, "input2": tensor2} + >>> output_dict = model.run(input_dict) """ pass DELITEPY_DOC_BLOCK_END diff --git a/coreruntime/nimblenet/data_variable/include/single_variable.hpp b/coreruntime/nimblenet/data_variable/include/single_variable.hpp index 9ea24b3e..36ffc7f1 100644 --- a/coreruntime/nimblenet/data_variable/include/single_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/single_variable.hpp @@ -54,6 +54,8 @@ class SingleVariable final : public BaseSingleVariable { uint8_t get_uint8() override { return uint8_t(val); } + uint16_t get_uint16() override { return uint16_t(val); } + int8_t get_int8() override { return int8_t(val); } bool get_bool() override { return val; } diff --git a/coreruntime/nimblenet/data_variable/src/data_variable.cpp b/coreruntime/nimblenet/data_variable/src/data_variable.cpp index 4e8e84ff..9e0bf78e 100644 --- a/coreruntime/nimblenet/data_variable/src/data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/data_variable.cpp @@ -518,6 +518,7 @@ OpReturnType DataVariable::create_tensor(int dType, const std::vector& switch (dType) { case DATATYPE::FLOAT: + case DATATYPE::FLOAT16: case DATATYPE::DOUBLE: case DATATYPE::INT32: case DATATYPE::INT64: diff --git a/coreruntime/nimblenet/data_variable/src/enumerate_data_variable.cpp b/coreruntime/nimblenet/data_variable/src/enumerate_data_variable.cpp new file mode 100644 index 00000000..b8802843 --- /dev/null +++ b/coreruntime/nimblenet/data_variable/src/enumerate_data_variable.cpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "enumerate_data_variable.hpp" + +#include +#include + +#include "single_variable.hpp" +#include "tuple_data_variable.hpp" + 
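+// Illustrative delitepy usage (variable names below are placeholders; each
+// iteration produces an (index, value) tuple, with the index offset by the
+// optional startIndex):
+//
+//     for pair in enumerate(items):
+//         idx = pair[0]
+//         value = pair[1]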
+EnumerateDataVariable::EnumerateDataVariable(OpReturnType iterable, int startIndex) + : _iterable(iterable), _startIndex(startIndex) { + if (!_iterable->is_iterable() && _iterable->get_containerType() != CONTAINERTYPE::LIST && + _iterable->get_containerType() != CONTAINERTYPE::TUPLE) { + THROW("enumerate expects an iterable argument, provided %s", + _iterable->get_containerType_string()); + } + + _size = _iterable->get_size(); +} + +OpReturnType EnumerateDataVariable::get_int_subscript(int index) { + if (index >= _size || index < 0) { + THROW("accessing %d of Enumerate with size=%d", index, _size); + } + + std::vector tupleMembers(2); + tupleMembers[0] = OpReturnType(new SingleVariable(_startIndex + index)); + tupleMembers[1] = _iterable->get_int_subscript(index); + + return OpReturnType(new TupleDataVariable(tupleMembers)); +} + +void EnumerateDataVariable::reset_iterator() { + IterableDataVariable::reset_iterator(); + if (_iterable->is_iterable()) { + static_cast(_iterable.get())->reset_iterator(); + } +} + +OpReturnType EnumerateDataVariable::next(CallStack& stack) { + if (_iterExhausted) { + THROW("StopIteration"); + } + + if (_iterPosition >= _size) { + _iterExhausted = true; + THROW("StopIteration"); + } + + OpReturnType value = _iterable->get_int_subscript(_iterPosition); + + std::vector tupleMembers(2); + tupleMembers[0] = OpReturnType(new SingleVariable(_startIndex + _iterPosition)); + tupleMembers[1] = value; + + _iterPosition++; + return OpReturnType(new TupleDataVariable(tupleMembers)); +} + +bool EnumerateDataVariable::get_bool() { + return _size > 0; +} diff --git a/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp b/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp index 8c3e6005..a1ff0384 100644 --- a/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp @@ -7,6 +7,8 @@ #include "model_nimble_net_variable.hpp" #include "asset_load_job.hpp" +#include "map_data_variable.hpp" +#include "task_onnx_model.hpp" std::shared_ptr ModelNimbleNetVariable::load_async( const std::string& modelName, CommandCenter* commandCenter) { @@ -21,6 +23,39 @@ std::shared_ptr ModelNimbleNetVariable::load_async( } OpReturnType ModelNimbleNetVariable::run_model(const std::vector& arguments) { + // Check if we have a single dictionary argument (new interface) + if (arguments.size() == 1) { + auto mapVar = std::dynamic_pointer_cast(arguments[0]); + if (mapVar) { + // Use dictionary-based inference if available + auto onnxModel = std::dynamic_pointer_cast(_model); + if (onnxModel) { + OpReturnType output; + try { + auto start = std::chrono::high_resolution_clock::now(); + + // Use the new dictionary interface (corrected signature) + int infStatus = onnxModel->invoke_inference_dict(output, arguments[0]); + + auto stop = std::chrono::high_resolution_clock::now(); + long long duration = + std::chrono::duration_cast(stop - start).count(); + _commandCenter->write_inference_metric(_modelName, duration); + + if (infStatus != SUCCESS || !output) { + // inference failed return None + return OpReturnType(new NoneVariable()); + } + + return output; + } catch (...) { + THROW("%s", "Error occurred while trying to get inference using dictionary interface."); + } + } + } + } + + // Fall back to traditional vector interface std::vector inputNames = _model->get_input_names(); if (inputNames.size() != arguments.size()) { THROW("Model takes %d inputs, %d inputs provided. 
Cannot run model.", inputNames.size(), diff --git a/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp b/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp index 56ccd6fe..4d882c3a 100644 --- a/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp @@ -33,6 +33,8 @@ int BaseTypedTensorVariable::get_elem_size(DATATYPE dataType) { return sizeof(int64_t); case FLOAT: return sizeof(float); + case FLOAT16: + return sizeof(uint16_t); // 16-bit float stored as uint16_t case DOUBLE: return sizeof(double); case BOOLEAN: @@ -63,6 +65,8 @@ std::string BaseTypedTensorVariable::print() { switch (get_dataType_enum()) { case DATATYPE::FLOAT: return util::recursive_string(shape, 0, (float*)get_raw_ptr(), 0, numElements); + case DATATYPE::FLOAT16: + return util::recursive_string(shape, 0, (uint16_t*)get_raw_ptr(), 0, numElements); case DATATYPE::DOUBLE: return util::recursive_string(shape, 0, (double*)get_raw_ptr(), 0, numElements); case DATATYPE::INT64: diff --git a/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp b/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp index b705c6ef..a66cd75e 100644 --- a/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp +++ b/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp @@ -7,49 +7,39 @@ #pragma once #include "data_variable.hpp" +#include "map_data_variable.hpp" #include "nimble_net_util.hpp" #include "task_base_model.hpp" #include "tensor_data_variable.hpp" -/** - * @brief TaskONNXModel is a specialized implementation of TaskBaseModel - * that supports running ONNX models using ONNX Runtime when invoked from delitepy script. - */ -class TaskONNXModel : public TaskBaseModel { - private: - OrtAllocator* _allocator = nullptr; /**< Allocator used by ONNX Runtime */ - Ort::SessionOptions _sessionOptions; /**< Options to configure ONNX session */ - Ort::MemoryInfo _memoryInfo; /**< Memory info for tensor allocations */ - static Ort::Env _myEnv; /**< Static environment shared by all sessions */ - static Ort::ThreadingOptions tp; /**< Threading configuration */ - Ort::Session* _session = nullptr; /**< ONNX session handle */ - std::vector _inputNames; /**< Cached input names */ - std::vector _outputNames; /**< Cached output names */ +// Forward declarations for ONNX runtime +namespace Ort { +class Env; +class Session; +class SessionOptions; +class Value; +class AllocatorWithDefaultOptions; +class MemoryInfo; +} // namespace Ort - /** - * @brief Loads model metadata such as input/output names. - */ - void load_model_meta_data(); +class TaskONNXModel : public TaskBaseModel { + static Ort::Env _myEnv; /**< Global ONNX Runtime environment */ + Ort::Session* _session = nullptr; /**< ONNX Runtime session instance */ + Ort::SessionOptions _sessionOptions{}; /**< Session configuration options */ + std::vector _inputNames; /**< Model input tensor names */ + std::vector _outputNames; /**< Model output tensor names */ + OrtAllocator* _allocator = nullptr; /**< ONNX Runtime memory allocator */ + Ort::MemoryInfo _memoryInfo; /**< Memory information for tensor creation */ /** - * @brief Loads the model from the internal buffer. + * @brief Loads the model from the buffer into ONNX Runtime session. */ void load_model_from_buffer() override final; /** - * @brief Invokes inference using a vector of ONNX input tensors. - * - * @param ret Output structure to populate. - * @param inputTensors Prepared input tensors. 
- * @return status + * @brief Loads model metadata including input/output names. */ - int invoke_inference(OpReturnType& ret, - const std::vector& inputTensors) override final; - - int invoke_inference(InferenceReturn* ret) override final { - throw std::runtime_error( - "Invoke inference with InferenceReturn struct in model run from task is not implemented."); - } + void load_model_meta_data(); /** * @brief Creates an ONNX input tensor and sets the data pointer. @@ -108,6 +98,38 @@ class TaskONNXModel : public TaskBaseModel { const nlohmann::json& epConfig, const int epConfigVersion, CommandCenter* commandCenter, bool runDummyInference); + /** + * @brief Invokes inference using a vector of ONNX input tensors. + * + * @param ret Output structure to populate. + * @param inputTensors Prepared input tensors. + * @return status + */ + int invoke_inference(OpReturnType& ret, + const std::vector& inputTensors) override final; + + /** + * @brief Invokes inference using dictionary-based input/output (MapDataVariable interface). + * + * @param output_dict Dictionary to populate with named outputs. + * @param input_dict Dictionary containing named inputs. + * @return status + */ + int invoke_inference_dict(OpReturnType& output_dict, const OpReturnType& input_dict); + + /** + * @brief Converts tuple result to MapDataVariable format for named outputs. + * + * @param tuple_result Tuple result from standard inference. + * @return OpReturnType containing MapDataVariable with named outputs + */ + OpReturnType convert_tuple_to_dict(const OpReturnType& tuple_result); + + int invoke_inference(InferenceReturn* ret) override final { + throw std::runtime_error( + "Invoke inference with InferenceReturn struct in model run from task is not implemented."); + } + /** * @brief Returns input tensor names from the ONNX model. */ @@ -118,6 +140,30 @@ class TaskONNXModel : public TaskBaseModel { */ std::vector get_output_names() override { return _outputNames; } + /** + * @brief Returns input tensor names as string vector for dictionary usage. + */ + std::vector get_input_names_string() { + std::vector names; + names.reserve(_inputNames.size()); + for (const char* name : _inputNames) { + names.emplace_back(name); + } + return names; + } + + /** + * @brief Returns output tensor names as string vector for dictionary usage. + */ + std::vector get_output_names_string() { + std::vector names; + names.reserve(_outputNames.size()); + for (const char* name : _outputNames) { + names.emplace_back(name); + } + return names; + } + /** * @brief Destructor for TaskONNXModel. Cleans up session. 
*/ diff --git a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp index 0e8d700e..b0605183 100644 --- a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp +++ b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp @@ -4,12 +4,55 @@ * SPDX-License-Identifier: Apache-2.0 */ +/* + * Dictionary-based interface usage examples using MapDataVariable: + * + * // Example 1: Using MapDataVariable interface for inference + * OpReturnType inputs = OpReturnType(new MapDataVariable()); + * OpReturnType outputs; + * auto input_map = std::dynamic_pointer_cast(inputs); + * + * // Prepare inputs + * input_map->set_value_in_map("input_ids", input_ids_tensor); + * input_map->set_value_in_map("attention_mask", attention_mask_tensor); + * input_map->set_value_in_map("position_ids", position_ids_tensor); + * + * // Add cache inputs + * for (int i = 0; i < num_layers; i++) { + * input_map->set_value_in_map("past_key_values." + std::to_string(i) + ".key", past_key_tensor); + * input_map->set_value_in_map("past_key_values." + std::to_string(i) + ".value", past_value_tensor); + * } + * + * // Run inference + * int result = model->invoke_inference_dict(outputs, inputs); + * auto output_map = std::dynamic_pointer_cast(outputs); + * + * // Access outputs by name + * auto logits = output_map->get_string_subscript("logits"); + * auto next_token = output_map->get_string_subscript("next_token_id"); + * auto is_eos = output_map->get_string_subscript("is_eos"); + * auto updated_attention = output_map->get_string_subscript("updated_attention_mask"); + * + * // Example 2: Converting from tuple result to MapDataVariable + * OpReturnType tuple_result; + * model->invoke_inference(tuple_result, input_tensors); + * + * OpReturnType output_dict = model->convert_tuple_to_dict(tuple_result); + * auto output_map = std::dynamic_pointer_cast(output_dict); + * + * // Now access outputs by name instead of position + * auto logits = output_map->get_string_subscript("logits"); + */ + #include "task_onnx_model.hpp" +#include #include "data_variable.hpp" +#include "map_data_variable.hpp" #include "nimble_net_util.hpp" #include "onnx_operators.hpp" #include "tensor_data_variable.hpp" +#include "tuple_data_variable.hpp" Ort::Env TaskONNXModel::_myEnv = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_FATAL, "ONNX Inference Environment"); @@ -34,10 +77,37 @@ int TaskONNXModel::create_input_tensor_and_set_data_ptr(const OpReturnType req, delete[] strings; } else { int fieldSize = util::get_field_size_from_data_type(req->get_dataType_enum()); + + // Map DeliteAI DATATYPE to ONNX tensor element data type + ONNXTensorElementDataType onnxDataType; + switch (req->get_dataType_enum()) { + case DATATYPE::FLOAT: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + break; + case DATATYPE::FLOAT16: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + break; + case DATATYPE::DOUBLE: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; + break; + case DATATYPE::INT32: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; + break; + case DATATYPE::INT64: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + break; + case DATATYPE::BOOLEAN: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL; + break; + default: + LOG_TO_CLIENT_ERROR("Unsupported data type %d for ONNX tensor creation", req->get_dataType_enum()); + return TERMINAL_ERROR; + } + inputTensor = Ort::Value::CreateTensor(_memoryInfo, req->get_raw_ptr(), fieldSize * req->get_numElements(), 
req->get_shape().data(), req->get_shape().size(), - (ONNXTensorElementDataType)req->get_dataType_enum()); + onnxDataType); } returnedInputTensor = std::move(inputTensor); return SUCCESS; @@ -84,16 +154,116 @@ int TaskONNXModel::invoke_inference(OpReturnType& ret, return SUCCESS; } +int TaskONNXModel::invoke_inference_dict(OpReturnType& output_dict, const OpReturnType& input_dict) { + try { + // Convert input MapDataVariable to vector format for existing inference + auto input_map = std::dynamic_pointer_cast(input_dict); + if (!input_map) { + LOG_TO_CLIENT_ERROR("Input is not a MapDataVariable for modelId=%s", _modelId.c_str()); + return TERMINAL_ERROR; + } + + std::vector inputTensors; + inputTensors.reserve(_inputNames.size()); + + for (size_t i = 0; i < _inputNames.size(); i++) { + std::string inputName(_inputNames[i]); + + try { + OpReturnType input_tensor = input_map->get_string_subscript(inputName); + Ort::Value inputTensor = Ort::Value{nullptr}; + int result = create_input_tensor_and_set_data_ptr(input_tensor, i, std::move(inputTensor)); + if (result != SUCCESS) { + return result; + } + inputTensors.push_back(std::move(inputTensor)); + } catch (...) { + LOG_TO_CLIENT_ERROR("Missing input tensor '%s' for modelId=%s", inputName.c_str(), _modelId.c_str()); + return TERMINAL_ERROR; + } + } + + // Run inference using existing method + std::vector output_onnx_tensors = + _session->Run(Ort::RunOptions{nullptr}, _inputNames.data(), inputTensors.data(), + _inputNames.size(), _outputNames.data(), _outputNames.size()); + + // Create output MapDataVariable + output_dict = OpReturnType(new MapDataVariable()); + auto output_map = std::dynamic_pointer_cast(output_dict); + + for (size_t i = 0; i < output_onnx_tensors.size(); i++) { + std::string outputName(_outputNames[i]); + OpReturnType tensor_var = get_tensor_variable_from_onnx_tensor(std::move(output_onnx_tensors[i])); + output_map->set_value_in_map(outputName, tensor_var); + } + + return SUCCESS; + } + catch (Ort::Exception& e) { + LOG_TO_CLIENT_ERROR("Exception in invoke_inference_dict:%s with errorCode:%d, for modelId=%s", + e.what(), e.GetOrtErrorCode(), _modelId.c_str()); + return TERMINAL_ERROR; + } + catch (...) { + LOG_TO_CLIENT_ERROR("Exception in invoke_inference_dict ONNXSessionRun for modelId=%s", + _modelId.c_str()); + return TERMINAL_ERROR; + } +} + +OpReturnType TaskONNXModel::convert_tuple_to_dict(const OpReturnType& tuple_result) { + try { + // Check if result is a TupleDataVariable + auto tuple_var = std::dynamic_pointer_cast(tuple_result); + if (!tuple_var) { + LOG_TO_CLIENT_ERROR("Result is not a TupleDataVariable for modelId=%s", _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } + + // Convert tuple elements to MapDataVariable using output names + auto tuple_elements = tuple_var->get_members(); + if (tuple_elements.size() != _outputNames.size()) { + LOG_TO_CLIENT_ERROR("Mismatch between output count (%zu) and expected names (%zu) for modelId=%s", + tuple_elements.size(), _outputNames.size(), _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } + + OpReturnType output_dict = OpReturnType(new MapDataVariable()); + auto output_map = std::dynamic_pointer_cast(output_dict); + + for (size_t i = 0; i < tuple_elements.size(); i++) { + std::string outputName(_outputNames[i]); + output_map->set_value_in_map(outputName, tuple_elements[i]); + } + + return output_dict; + } + catch (...) 
{ + LOG_TO_CLIENT_ERROR("Exception in convert_tuple_to_dict for modelId=%s", _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } +} + OpReturnType TaskONNXModel::get_tensor_variable_from_onnx_tensor(Ort::Value onnx_tensor) { Ort::TensorTypeAndShapeInfo tensor_info = onnx_tensor.GetTensorTypeAndShapeInfo(); - auto dataType = (DATATYPE)tensor_info.GetElementType(); - switch (dataType) { - case DATATYPE::FLOAT: - case DATATYPE::DOUBLE: - case DATATYPE::INT32: - case DATATYPE::INT64: - return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), dataType)); - case DATATYPE::STRING: { + ONNXTensorElementDataType onnxType = tensor_info.GetElementType(); + + // Handle ONNX data type to DATATYPE mapping + switch (onnxType) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::FLOAT)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::DOUBLE)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::INT32)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::INT64)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::BOOLEAN)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::FLOAT16)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING: { std::vector strings; for (int i = 0; i < tensor_info.GetElementCount(); i++) { strings.push_back(onnx_tensor.GetStringTensorElement(i)); @@ -105,7 +275,7 @@ OpReturnType TaskONNXModel::get_tensor_variable_from_onnx_tensor(Ort::Value onnx default: LOG_TO_ERROR( "Requested data type = %d not supported when converting ONNX tensor to DataVariable.", - tensor_info.GetElementType()); + onnxType); THROW("%s", "Unsupported dataType returned from model."); } THROW("%s", "Unsupported dataType returned from model."); @@ -289,7 +459,8 @@ void TaskONNXModel::run_dummy_inference() { case DATATYPE::FLOAT: case DATATYPE::DOUBLE: case DATATYPE::INT32: - case DATATYPE::INT64: { + case DATATYPE::INT64: + case DATATYPE::FLOAT16: { OpReturnType req = OpReturnType(new TensorVariable(shape, static_cast(data_type))); create_input_tensor_and_set_data_ptr(req, i, std::move(inputTensor)); diff --git a/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp b/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp index e88ba13e..c8d6386e 100644 --- a/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp +++ b/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp @@ -165,7 +165,7 @@ class BaseBinOp { * Ensures the result has the same sign as the divisor when possible. */ template >> + typename = std::enable_if_t>> struct ModOperator { /** * @brief Computes modulo operation with proper sign handling @@ -187,10 +187,10 @@ struct ModOperator { * @brief Template class for numeric binary operations * * Provides implementations of all binary operations (add, sub, mult, div, pow, mod) - * for numeric types (float, int32_t, double, int64_t). + * for numeric types (float, int32_t, double, int64_t, uint16_t). 
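A note on the FLOAT16 plumbing introduced above: the runtime treats fp16 buffers as 2-byte elements and, in templates such as NumericBinOp, falls back to uint16_t as the carrier type for the raw half-precision bits. The following NumPy sketch is an illustration only (not DeliteAI runtime code) of how an fp16 value relates to that 16-bit storage, and of why arithmetic on the raw bits is not the same as fp16 arithmetic:

# Illustration only (NumPy), not DeliteAI runtime code: how fp16 values relate to
# the uint16_t raw storage used as the carrier type for DATATYPE::FLOAT16.
import numpy as np

x = np.array([1.5, -2.0, 0.25], dtype=np.float16)

# Each fp16 element occupies 2 bytes, matching get_field_size_from_data_type() == 2.
assert x.itemsize == 2

# Reinterpret the same buffer as raw 16-bit words (a view, not a conversion).
bits = x.view(np.uint16)
print([hex(int(b)) for b in bits])      # ['0x3e00', '0xc000', '0x3400']

# Round-tripping the bit patterns recovers the original fp16 values exactly.
assert np.array_equal(bits.view(np.float16), x)

# Caveat: integer arithmetic on the raw bits is not fp16 arithmetic.
print(x[0] + x[1])                      # -0.5, computed as float16
print(int(bits[0]) + int(bits[1]))      # 65024, just a sum of bit patterns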
*/ template >> + typename = std::enable_if_t>> class NumericBinOp : public BaseBinOp { public: /** @brief Adds two numeric values */ @@ -303,6 +303,10 @@ class BinaryOperators { NumericBinOp n; return n.perform_operation(v1, v2, opType); } + case DATATYPE::FLOAT16: { + NumericBinOp n; + return n.perform_operation(v1, v2, opType); + } case DATATYPE::INT32: { NumericBinOp n; return n.perform_operation(v1, v2, opType); diff --git a/coreruntime/nimblenet/task_manager/operators/include/custom_functions.hpp b/coreruntime/nimblenet/task_manager/operators/include/custom_functions.hpp index ffae5bb7..b934fdd3 100644 --- a/coreruntime/nimblenet/task_manager/operators/include/custom_functions.hpp +++ b/coreruntime/nimblenet/task_manager/operators/include/custom_functions.hpp @@ -250,4 +250,34 @@ DELITEPY_DOC_BLOCK_END */ static OpReturnType pre_add_event_hook(const std::vector& typesDataVariable, CallStack& stack); + + /* + DELITEPY_DOC_BLOCK_BEGIN +- `next()` function + DELITEPY_DOC_BLOCK_END + */ + /** + * @brief Returns the next item from an iterable + * + * @param args Vector containing one iterable object + * @param stack Current call stack + * @return Next element in the iterable + */ + static OpReturnType next(const std::vector& args, CallStack& stack); + + /* + DELITEPY_DOC_BLOCK_BEGIN +- `enumerate()` function + DELITEPY_DOC_BLOCK_END + */ + /** + * @brief Returns an enumerate object over an iterable + * + * Wraps the given iterable with an index counter, yielding (index, value) pairs. + * + * @param args Vector containing one iterable object + * @param stack Current call stack + * @return Iterable object yielding (index, value) pairs + */ + static OpReturnType enumerate(const std::vector& args, CallStack& stack); }; diff --git a/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp b/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp index ed605060..2ec69c88 100644 --- a/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp +++ b/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp @@ -15,7 +15,7 @@ * * Compares two data types and returns the one with higher precedence * for automatic type promotion in operations. 
The precedence order is: - * BOOLEAN (0) < INT32 (3) < INT64 (4) < FLOAT (5) < DOUBLE (6) + * BOOLEAN (0) < INT32 (30) < INT64 (40) < FLOAT16 (45) < FLOAT (50) < DOUBLE (60) * * @param dataType1 First data type to compare * @param dataType2 Second data type to compare @@ -23,8 +23,8 @@ */ inline int get_max_dataType(int dataType1, int dataType2) { std::map _typeScore = { - {DATATYPE::BOOLEAN, 0}, {DATATYPE::INT32, 3}, {DATATYPE::INT64, 4}, - {DATATYPE::FLOAT, 5}, {DATATYPE::DOUBLE, 6}, + {DATATYPE::BOOLEAN, 0}, {DATATYPE::INT32, 30}, {DATATYPE::INT64, 40}, + {DATATYPE::FLOAT16, 45}, {DATATYPE::FLOAT, 50}, {DATATYPE::DOUBLE, 60}, }; if (_typeScore[dataType1] < _typeScore[dataType2]) { return dataType2; diff --git a/coreruntime/nimblenet/task_manager/operators/src/custom_functions.cpp b/coreruntime/nimblenet/task_manager/operators/src/custom_functions.cpp index 785c69db..a5156261 100644 --- a/coreruntime/nimblenet/task_manager/operators/src/custom_functions.cpp +++ b/coreruntime/nimblenet/task_manager/operators/src/custom_functions.cpp @@ -6,6 +6,7 @@ #include "custom_functions.hpp" +#include "enumerate_data_variable.hpp" #include "exception_data_variable.hpp" #include "statements.hpp" @@ -22,6 +23,8 @@ std::map CustomFunctions::_customFuncMap = { {"add_event", CustomFunctions::add_event}, {"pre_add_event", CustomFunctions::pre_add_event_hook}, {"Exception", CustomFunctions::create_exception}, + {"next", CustomFunctions::next}, + {"enumerate", CustomFunctions::enumerate}, }; OpReturnType CustomFunctions::concurrent(const std::vector& arguments, @@ -92,3 +95,30 @@ OpReturnType CustomFunctions::add_event(const std::vector& rawStor }; return OpReturnType(new CustomFuncDataVariable(std::move(myLambda))); } + +OpReturnType CustomFunctions::next(const std::vector& args, CallStack& stack) { + THROW_ARGUMENTS_MISMATCH_FUNCTION_NAME(args.size(), 1, "next"); + + if (args[0]->is_iterable()) { + try { + return args[0]->next(stack); + } catch (const std::runtime_error& e) { + if (std::string(e.what()) == "StopIteration") return nullptr; + throw; + } + } + THROW("next expects an iterable argument, provided %s", args[0]->get_containerType_string()); +} + +OpReturnType CustomFunctions::enumerate(const std::vector& args, CallStack& stack) { + THROW_OPTIONAL_ARGUMENTS_NOT_MATCH_FUNCTION_NAME(args.size(), 1, 2, "enumerate"); + + OpReturnType iterable = args[0]; + + int startIndex = 0; + if (args.size() == 2) { + startIndex = args[1]->get_int32(); + } + + return OpReturnType(new EnumerateDataVariable(iterable, startIndex)); +} diff --git a/coreruntime/nimblenet/util/include/util.hpp b/coreruntime/nimblenet/util/include/util.hpp index 7c6a4272..a886d35d 100644 --- a/coreruntime/nimblenet/util/include/util.hpp +++ b/coreruntime/nimblenet/util/include/util.hpp @@ -310,6 +310,8 @@ static inline int get_field_size_from_data_type(int dataType) { switch (dataType) { case DATATYPE::STRING: return 1; + case DATATYPE::FLOAT16: + return 2; case DATATYPE::FLOAT: case DATATYPE::INT32: return 4; @@ -423,6 +425,9 @@ auto call_function_for_dataType(Func func, DATATYPE dataType, Ts&&...
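The new next and enumerate entries in _customFuncMap expose Python-style built-ins to DelitePy workflow scripts. Below is a minimal usage sketch; it assumes the DelitePy surface mirrors CPython's semantics, except that, per the implementation above, next() returns None once an iterable is exhausted instead of raising StopIteration:

# Hypothetical DelitePy usage sketch for the new built-ins (also runs in CPython).
steps = ["tokenize", "run_model", "decode"]

# enumerate(iterable, start) wraps the iterable and yields (index, value) pairs.
for idx, step in enumerate(steps, 1):
    print(idx, step)                    # 1 tokenize ... 3 decode

# next() pulls items one at a time from an iterable such as an enumerate object.
pairs = enumerate(steps)
print(next(pairs))                      # (0, 'tokenize')
print(next(pairs))                      # (1, 'run_model')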
ts) { return func(double{}, std::forward(ts)...); case DATATYPE::FLOAT: return func(float{}, std::forward(ts)...); + case DATATYPE::FLOAT16: + // Use uint16_t as the underlying representation for fp16 + return func(uint16_t{}, std::forward(ts)...); case DATATYPE::INT64: return func(int64_t{}, std::forward(ts)...); case DATATYPE::BOOLEAN: diff --git a/coreruntime/nimblenet/util/src/util.cpp b/coreruntime/nimblenet/util/src/util.cpp index e108a4fd..e71ca9de 100644 --- a/coreruntime/nimblenet/util/src/util.cpp +++ b/coreruntime/nimblenet/util/src/util.cpp @@ -31,6 +31,8 @@ const char* get_string_from_enum(int dataType) { return "None"; case DATATYPE::FLOAT: return "float"; + case DATATYPE::FLOAT16: + return "float16"; case DATATYPE::BOOLEAN: return "bool"; case DATATYPE::INT32: @@ -80,6 +82,7 @@ const char* get_string_from_enum(int dataType) { int get_enum_from_string(const char* type) { static std::map typeMap = {{"float", DATATYPE::FLOAT}, + {"float16", DATATYPE::FLOAT16}, {"double", DATATYPE::DOUBLE}, {"bool", DATATYPE::BOOLEAN}, {"int32", DATATYPE::INT32}, diff --git a/models/LFM2/demo_lfm.py b/models/LFM2/demo_lfm.py deleted file mode 100755 index e0afa09b..00000000 --- a/models/LFM2/demo_lfm.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/usr/bin/env python3 -#-*- coding: utf-8 -*- - -import json -import re -import sys -import os -from typing import List - -# Add parent directory to path to import tools -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from tools import tools, tool_schema - -from transformers import AutoConfig, AutoTokenizer -import onnxruntime -import numpy as np -from huggingface_hub import hf_hub_download - -# 1. Load config, processor, and model -model_id = "onnx-community/LFM2-1.2B-ONNX" - - -TOOL_CALL_START_TOKEN = "<|tool_call_start|>" -TOOL_CALL_END_TOKEN = "<|tool_call_end|>" -TOOL_RESPONSE_START_TOKEN = "<|tool_response_start|>" -TOOL_RESPONSE_END_TOKEN = "<|tool_response_end|>" -INITIAL_PROMPT = f"""You are a helpful assistant. 
When you need to use tools, call only one tool at a time and sequentially execute them.""" - -initial_message_block = [ - { - "role": "system", - "content": INITIAL_PROMPT - } -] - -config = AutoConfig.from_pretrained(model_id) -tokenizer = AutoTokenizer.from_pretrained(model_id) -filename = "model.onnx" # Options: "model.onnx", "model_fp16.onnx", "model_q4.onnx", "model_q4f16.onnx" -model_path = hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}") # Download the graph -hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}_data") # Download the weights -session = onnxruntime.InferenceSession(model_path) - -## Set config values -num_key_value_heads = config.num_key_value_heads -head_dim = config.hidden_size // config.num_attention_heads -num_hidden_layers = config.num_hidden_layers -eos_token_id = config.eos_token_id -hidden_size = config.hidden_size -conv_L_cache = config.conv_L_cache -layer_types = config.layer_types - -def execute_function_call(function_name: str, arguments: dict) -> dict: - """Execute a function call and return the result""" - if function_name not in tools: - return {"error": f"Function {function_name} not found"} - - try: - function = tools[function_name] # Direct access to function object - result = function(**arguments) - return result - except Exception as e: - return {"error": f"Error executing {function_name}: {str(e)}"} - -def format_tool_response(result: dict) -> str: - """Format tool execution result using token-based format""" - result_json = json.dumps(result) - return f"{TOOL_RESPONSE_START_TOKEN}{result_json}{TOOL_RESPONSE_END_TOKEN}" - -def execute_tool_call_with_response(function_name: str, arguments: dict) -> tuple: - """Execute a function call and return both result and formatted response""" - result = execute_function_call(function_name, arguments) - formatted_response = format_tool_response(result) - return result, formatted_response - -def parse_tool_calls_from_response(response_text: str) -> list: - """Parse tool calls from model response using multiple formats""" - tool_calls = [] - - # Method 2: Look for JSON-style tool calls: <|tool_call_start|>{"name": "func", "arguments": {...}}<|tool_call_end|> - json_tool_pattern = r'<\|tool_call_start\|>\s*({.*?})\s*<\|tool_call_end\|>' - json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) - - for json_str in json_matches: - try: - tool_data = json.loads(json_str) - func_name = tool_data.get("name") - arguments = tool_data.get("arguments", {}) - - if func_name in tools: - tool_calls.append({ - "function_name": func_name, - "arguments": arguments - }) - print(f"✓ Parsed JSON tool call: {func_name}({arguments})") - except json.JSONDecodeError: - print(f"⚠ Failed to parse JSON tool call: {json_str}") - - return tool_calls - -def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: - """Generate text using the loaded model with multi-turn conversation support""" - # Use chat template with tools for multi-turn conversations - print("---"*10) - print("Conversation Messages:") - print(json.dumps(conversation_messages, indent=4)) - print("---"*10) - - # 2. 
Prepare inputs - inputs = tokenizer.apply_chat_template( - conversation_messages, - tools=tool_schema, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="np" - ) - input_ids = inputs['input_ids'] - attention_mask = inputs['attention_mask'] - batch_size = input_ids.shape[0] - position_ids = np.tile(np.arange(0, input_ids.shape[-1]), (batch_size, 1)) - past_cache_values = {} - for i in range(num_hidden_layers): - if layer_types[i] == 'full_attention': - for kv in ('key', 'value'): - past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32) - elif layer_types[i] == 'conv': - past_cache_values[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, conv_L_cache], dtype=np.float32) - else: - raise ValueError(f"Unsupported layer type: {layer_types[i]}") - - # 3. Generation loop - generated_tokens = np.array([[]], dtype=np.int64) - for i in range(max_new_tokens): - logits, *present_cache_values = session.run(None, dict( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - **past_cache_values, - )) - - ## Update values for next generation loop - input_ids = logits[:, -1].argmax(-1, keepdims=True) - attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1) - position_ids = position_ids[:, -1:] + 1 - for j, key in enumerate(past_cache_values): - past_cache_values[key] = present_cache_values[j] - generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1) - if (input_ids == eos_token_id).all(): - break - - # 4. Output result - response = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] - return response.strip() - -def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: int) -> list: - """Handle requests that may require multiple tool calls and back and forth""" - step_results = [] - conversation_messages : List[dict] = [] # Will hold the full conversation chain - tool_context = {} # Store results from previous tool calls - - for step in range(max_steps): - print(f"\n--- Step {step + 1} ---") - if step == 0: - conversation_messages = initial_message_block.copy() - conversation_messages.append({ - "role": "user", - "content": user_prompt - }) - else: - conversation_messages.append({ - "role": "system", - "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." 
- }) - # Generate response - try: - response = generate_with_model(conversation_messages, max_new_tokens=max_new_tokens) - print(f"Model Response: {response}") - - # Parse and execute tool calls - tool_calls = parse_tool_calls_from_response(response) - tool_results = [] - - if tool_calls: - print(f"Executing {len(tool_calls)} tool call(s):") - for call in tool_calls: - func_name = call["function_name"] - arguments = call["arguments"] - - print(f" • {func_name}({arguments})") - result, formatted_response = execute_tool_call_with_response(func_name, arguments) - - # Store important results for future reference - if func_name == "get_current_location" and "location" in result: - tool_context["location"] = result["location"] - - tool_results.append({ - "function": func_name, - "arguments": arguments, - "result": result - }) - print(f" Result: {json.dumps(result, indent=4)}") - - # Add assistant response to conversation - conversation_messages.append({ - "role": "assistant", - "content": response - }) - - # Add tool results to conversation as function messages - for tool_result in tool_results: - if not tool_result["result"].get("error"): - conversation_messages.append({ - "role": "system", - "content": f"The result of the tool {tool_result['function']} is: {TOOL_RESPONSE_START_TOKEN}{json.dumps(tool_result['result'])}{TOOL_RESPONSE_END_TOKEN}" - }) - - # Store step result - step_result = { - "step": step + 1, - "prompt": user_prompt if step == 0 else "continuation", - "response": response, - "tool_calls": tool_calls, - "tool_results": tool_results, - "has_errors": any("error" in result.get("result", {}) for result in tool_results), - "tool_context": tool_context.copy(), - "conversation_messages": conversation_messages.copy() - } - step_results.append(step_result) - - # Check if all tool calls were successful - if step_result["has_errors"]: - print(f"⚠ Stopping due to tool execution errors") - break - - # Simple continuation logic: if no tools were called, we're done - if not tool_calls: - print(f"✓ Completed after {step + 1} step(s) - no tool calls needed") - break - - # If we've reached max steps, stop - if step >= max_steps - 1: - print(f"✓ Reached maximum steps ({max_steps})") - break - - # If tools were executed, continue to next step to see if model wants to do more - print(f"✓ Step {step + 1} completed with {len(tool_calls)} tool call(s) - continuing...") - - except Exception as e: - print(f"Error in step {step + 1}: {e}") - step_results.append({ - "step": step + 1, - "prompt": user_prompt if step == 0 else "continuation", - "error": str(e), - "response": None, - "tool_calls": [], - "tool_results": [], - "tool_context": tool_context.copy(), - "conversation_messages": conversation_messages.copy() if conversation_messages else [] - }) - break - - return step_results - -def run_tool_calling_demo(): - """Run tool calling demonstration""" - print("=== Qwen3 1.7B Tool Calling Demo ===\n") - print(f"Model: {model_id}") - print(f"Available tools: {list(tools.keys())}") - - demo_prompts = [ - "What's the weather here today?", - "Calculate 15 * 23", - "What time is it in JST timezone?", - "Where am I located?", - "Get my location and check the weather there" - ] - - for i, user_prompt in enumerate(demo_prompts, 1): - print(f"\nDemo {i}: {user_prompt}") - print("-" * 60) - step_results = handle_multi_step_request(user_prompt, max_steps=4, max_new_tokens=400) - # Show final summary - print(f"\n📋 Multi-step Summary:") - for step_result in step_results: - step_num = step_result["step"] - tool_calls = 
step_result.get("tool_calls", []) - if tool_calls: - print(f" Step {step_num}: {len(tool_calls)} tool call(s)") - for call in tool_calls: - func_name = call["function_name"] - print(f" ✓ {func_name}") - print("\n" + "="*60) - - -if __name__ == "__main__": - # Run the regular demo first - run_tool_calling_demo() \ No newline at end of file diff --git a/models/Qwen3-1.7B/demo_qwen.py b/models/Qwen3-1.7B/demo_qwen.py index 8fde3c65..e6a7e6e7 100755 --- a/models/Qwen3-1.7B/demo_qwen.py +++ b/models/Qwen3-1.7B/demo_qwen.py @@ -1,6 +1,19 @@ #!/usr/bin/env python3 #-*- coding: utf-8 -*- +""" +Enhanced Qwen3-1.7B ONNX Demo with Tool Calling + +This demo uses a custom enhanced ONNX model with: +- Integrated ArgMax for token generation +- Built-in EOS detection +- Temperature scaling for language confusion mitigation +- Automatic cache management +- English-only output filtering + +The enhanced model is created by export.py and saved as model_enhanced.onnx +""" + import json import re import sys @@ -11,7 +24,8 @@ import onnxruntime import numpy as np from huggingface_hub import hf_hub_download -from jinja2 import Template, Environment +from jinja2 import Environment +import re # Add parent directory to path to import tools sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -25,10 +39,10 @@ TOOL_RESPONSE_START_TOKEN = "" TOOL_RESPONSE_END_TOKEN = "" INITIAL_PROMPT = f"""You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between {TOOL_CALL_START_TOKEN} and {TOOL_CALL_END_TOKEN} tokens. - Use this exact format: {TOOL_CALL_START_TOKEN}{{"name": "function_name", "arguments": {{"param": "value"}}}}{TOOL_CALL_END_TOKEN} If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. -Always respond directly and call the appropriate tool when needed.""" +Always respond directly and call the appropriate tool when needed. 
+""" initial_message_block = [ { @@ -38,17 +52,48 @@ ] config = AutoConfig.from_pretrained(model_id) -print(config) tokenizer = Tokenizer.from_pretrained(model_id) chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" -filename 
= "model_q4f16.onnx" # Options: model.onnx -model_path = hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}") # Download the graph -# hf_hub_download(repo_id=model_id, filename=f"onnx/{filename}_data") # Download the weights -session = onnxruntime.InferenceSession(model_path) +# Use the enhanced ONNX model created by export.py +model_path = "./data/onnx/model_enhanced.onnx" +if not os.path.exists(model_path): + print(f"❌ Enhanced model not found at {model_path}") + print("📝 Please run export.py first to create the enhanced model") + print("💡 Run: python export.py") + sys.exit(1) + +# Load the enhanced ONNX model with integrated generation capabilities +print(f"🚀 Loading ONNX model from {model_path}...") +session = onnxruntime.InferenceSession(model_path) -print(f"✓ {model_id} model loaded successfully!") +print(f"✅ {model_id} model loaded successfully!") +print(f"✅ Model has {len(session.get_inputs())} inputs and {len(session.get_outputs())} outputs") +print(f"🚀 Features: Integrated ArgMax, EOS detection, temperature scaling, automatic cache updates") + +# Global variables for conversation state +conversation_state = { + "kv_cache": None, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] +} + +# Print model input/output info for debugging +print(f"\n📋 Model Inputs (first 5):") +for inp in session.get_inputs()[:5]: # Show first 5 to avoid spam + print(f" • {inp.name}: {inp.shape}") +if len(session.get_inputs()) > 5: + print(f" ... and {len(session.get_inputs()) - 5} more inputs") + +print(f"\n📋 Enhanced Model Outputs:") +for out in session.get_outputs()[:5]: + if not out.name.startswith('updated_past_key_values'): # Skip cache outputs to reduce spam + print(f" • {out.name}: {out.shape}") +if len(session.get_outputs()) > 5: + print(f" • ... 
and {len(session.get_outputs()) - 5} more outputs") def execute_function_call(function_name: str, arguments: dict) -> dict: @@ -74,13 +119,66 @@ def execute_tool_call_with_response(function_name: str, arguments: dict) -> tupl formatted_response = format_tool_response(result) return result, formatted_response +def initialize_conversation_state(): + """Initialize KV cache and conversation state once""" + global conversation_state + + # Set config values + num_key_value_heads = config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + num_hidden_layers = config.num_hidden_layers + hidden_size = config.hidden_size + batch_size = 1 # Single batch for conversation + + # Initialize KV cache + kv_cache = {} + + # Check if config has layer_types + if not hasattr(config, 'layer_types'): + config.layer_types = [ + "full_attention" + for _ in range(config.num_hidden_layers) + ] + + for i in range(num_hidden_layers): + if config.layer_types[i] == 'full_attention': + for kv in ('key', 'value'): + # Initialize with small valid tensor for first generation step + kv_cache[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 1, head_dim], dtype=np.float16) + elif config.layer_types[i] == 'conv': + kv_cache[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) + + # Initialize conversation state + conversation_state.update({ + "kv_cache": kv_cache, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] + }) + + print("✅ Conversation state and KV cache initialized") + +def reset_conversation_state(): + """Reset conversation state for a new conversation""" + global conversation_state + conversation_state.update({ + "kv_cache": None, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] + }) + print("🔄 Conversation state reset") + def parse_tool_calls_from_response(response_text: str) -> list: """Parse tool calls from model response using multiple formats""" tool_calls = [] # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} - json_tool_pattern = r'\s*({.*?})\s*' - json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) + # Using [\s\S] instead of re.DOTALL to match any character including newlines + json_tool_pattern = r'\s*({[\s\S]*?})\s*' + json_matches = re.findall(json_tool_pattern, response_text) for json_str in json_matches: try: @@ -156,84 +254,108 @@ def apply_chat_template(messages, tool_schema, add_generation_prompt, tokenize, return text def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: - """Generate text using the loaded model with multi-turn conversation support""" - # Use chat template with tools for multi-turn conversations + """Generate text using full conversation processing (simplified approach)""" print("---"*10) print("Conversation Messages:") print(json.dumps(conversation_messages, indent=4)) print("---"*10) - # 2. 
Prepare inputs + # Always process the full conversation - simpler and more reliable inputs = apply_chat_template( - conversation_messages, - tool_schema=tool_schema, - add_generation_prompt=True, - tokenize=True, - return_dict=True, + conversation_messages, + tool_schema=tool_schema, + add_generation_prompt=True, + tokenize=True, + return_dict=True, ) + input_ids = inputs['input_ids'] attention_mask = inputs['attention_mask'] batch_size = input_ids.shape[0] - position_ids = np.tile(np.arange(0, input_ids.shape[-1]), (batch_size, 1)) + seq_len = input_ids.shape[1] + + # Create position IDs + position_ids = np.tile(np.arange(0, seq_len), (batch_size, 1)) # Set config values num_key_value_heads = config.num_key_value_heads head_dim = config.hidden_size // config.num_attention_heads num_hidden_layers = config.num_hidden_layers - eos_token_id = config.eos_token_id hidden_size = config.hidden_size - # Initialize past cache values with correct shapes for ONNX model - past_cache_values = {} - - # Check if config has layer_types (like LFM2) - if hasattr(config, 'layer_types'): - for i in range(num_hidden_layers): - if config.layer_types[i] == 'full_attention': - for kv in ('key', 'value'): - # Use the ONNX model's expected head count (8) from the input shapes - past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, 8, 0, head_dim], dtype=np.float16) - elif config.layer_types[i] == 'conv': - past_cache_values[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) - else: - # Standard transformer layers - use ONNX model's expected head count (8) - for i in range(num_hidden_layers): + + # Initialize fresh KV cache for each generation + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids + } + + # Check if config has layer_types + if not hasattr(config, 'layer_types'): + config.layer_types = [ + "full_attention" + for _ in range(config.num_hidden_layers) + ] + + # Initialize KV cache + for i in range(num_hidden_layers): + if config.layer_types[i] == 'full_attention': for kv in ('key', 'value'): - # Use 8 heads as expected by the ONNX model (from debug output) - past_cache_values[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, 8, 0, head_dim], dtype=np.float16) + # Initialize with small valid tensor for first generation step + model_inputs[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 1, head_dim], dtype=np.float16) + elif config.layer_types[i] == 'conv': + model_inputs[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) - # 3. 
Generation loop + # Enhanced generation loop generated_tokens = [] + for i in range(max_new_tokens): - logits, *present_cache_values = session.run(None, dict( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - **past_cache_values, - )) - - # Update values for next generation loop - logits_array = np.asarray(logits) - next_token_id = np.argmax(logits_array[0, -1, :]) - - # Check for EOS token - if next_token_id == eos_token_id: - break + # Run the enhanced model + model_outputs = session.run(None, model_inputs) - generated_tokens.append(next_token_id) - input_ids = np.array([[next_token_id]], dtype=np.int64) - attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1) - position_ids = position_ids[:, -1:] + 1 + # Parse outputs + output_names = [output.name for output in session.get_outputs()] + outputs_dict = dict(zip(output_names, model_outputs)) - # Update cache - for j, key in enumerate(past_cache_values): - past_cache_values[key] = present_cache_values[j] + # Check for EOS + if bool(outputs_dict['is_eos'][0, 0]): + break - # 4. Output result - decode only the generated tokens + generated_tokens.append(int(outputs_dict['next_token_id'][0, 0])) + + # Update inputs for next iteration + model_inputs["input_ids"] = outputs_dict['next_token_id'] + model_inputs["attention_mask"] = outputs_dict['updated_attention_mask'] + + # For subsequent calls, we need only the last position + next_position_full = outputs_dict['next_position'] + last_position = next_position_full[:, -1:] + model_inputs["position_ids"] = last_position + + # Update cache using present outputs + for cache_key in list(model_inputs.keys()): + if cache_key.startswith('past_key_values.'): + parts = cache_key.split('.') + if len(parts) == 3: + layer_num = parts[1] + kv_type = parts[2] + present_key = f"present.{layer_num}.{kv_type}" + + if present_key in outputs_dict: + model_inputs[cache_key] = outputs_dict[present_key] + else: + print(f"⚠️ Warning: Expected cache output '{present_key}' not found") + elif cache_key.startswith('past_conv.'): + present_key = cache_key.replace("past_conv", "present_conv") + if present_key in outputs_dict: + model_inputs[cache_key] = outputs_dict[present_key] + + # Decode generated tokens + response = "" if generated_tokens: generated_tokens_array = np.array([generated_tokens], dtype=np.int64) response = tokenizer.decode_batch(generated_tokens_array, skip_special_tokens=True)[0] - else: - response = "" + return response.strip() @@ -347,10 +469,12 @@ def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: return step_results def run_tool_calling_demo(): - """Run tool calling demonstration""" - print("=== Qwen3 1.7B Tool Calling Demo ===\n") - print(f"Model: {model_id}") - print(f"Available tools: {list(tools.keys())}") + """Run tool calling demonstration using the enhanced ONNX model""" + print("=== Qwen3 1.7B Enhanced ONNX Tool Calling Demo ===\n") + print(f"📦 Model: {model_id} (Enhanced)") + print(f"🚀 Enhanced Model Path: {model_path}") + print(f"✨ Features: ArgMax, EOS detection, temperature scaling, automatic cache updates") + print(f"🔧 Available tools: {list(tools.keys())}") demo_prompts = [ "What's the weather here today?", @@ -361,7 +485,7 @@ def run_tool_calling_demo(): ] for i, user_prompt in enumerate(demo_prompts, 1): - print(f"\nDemo {i}: {user_prompt}") + print(f"\n🎮 Demo {i}: {user_prompt}") print("-" * 60) step_results = handle_multi_step_request(user_prompt, max_steps=4, 
max_new_tokens=400) # Show final summary @@ -378,5 +502,12 @@ def run_tool_calling_demo(): if __name__ == "__main__": - # Run the regular demo first + print("🔧 Enhanced Qwen3-1.7B ONNX Model Demo") + print("📝 Uses enhanced ONNX model with integrated generation enhancements") + print("🎯 Features: ArgMax, EOS detection, temperature scaling, automatic cache management") + print("🚀 Export: Custom enhanced model with language confusion mitigation") + print("📁 Model location: ./data/onnx/model_enhanced.onnx") + print("=" * 80) + + # Run the enhanced demo run_tool_calling_demo() diff --git a/models/Qwen3-1.7B/export.py b/models/Qwen3-1.7B/export.py new file mode 100755 index 00000000..f882b058 --- /dev/null +++ b/models/Qwen3-1.7B/export.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 +""" +Export script for Qwen3-1.7B Enhanced ONNX model with integrated generation logic. + +This script: +1. Downloads the base Qwen3-1.7B ONNX model +2. Enhances it with integrated generation logic (ArgMax, EOS detection, temperature scaling) +3. Saves the enhanced model as model_enhanced.onnx +""" + +import os +import onnx +import onnxruntime as ort +import numpy as np +from onnx import helper, TensorProto, ValueInfoProto +from pathlib import Path + +def download_base_model(): + """Download the base Qwen3-1.7B ONNX model.""" + model_id = "onnx-community/Qwen3-1.7B-ONNX" + output_dir = "./data/onnx" + + print(f"📥 Downloading base model: {model_id}") + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Download base model + from huggingface_hub import hf_hub_download + base_model_path = hf_hub_download(repo_id=model_id, filename="onnx/model_q4f16.onnx") + + # Copy to our directory structure + import shutil + local_model_path = os.path.join(output_dir, "model_base.onnx") + shutil.copy2(base_model_path, local_model_path) + + print(f"✅ Base model downloaded to {local_model_path}") + return local_model_path + +def load_and_analyze_model(model_path): + """Load the ONNX model and analyze its structure.""" + print(f"📋 Loading base ONNX model from {model_path}") + model = onnx.load(model_path) + + print(f"✅ Model loaded successfully!") + print(f"📊 Model has {len(model.graph.input)} inputs and {len(model.graph.output)} outputs") + + # Print opset information + print(f"\n🔧 Model opset information:") + for opset_import in model.opset_import: + domain = opset_import.domain or "ai.onnx" + print(f" • {domain}: opset {opset_import.version}") + + return model + +def add_argmax_node(model, temperature=0.3): + """Add argmax node to logits output for token generation with temperature scaling.""" + # Find logits output (usually the first output) + logits_output = model.graph.output[0] + + print(f"🎯 Adding ArgMax node with temperature {temperature} for output: {logits_output.name}") + + # Create argmax node that selects the token with highest probability from the last position + # First, slice the logits to get only the last position: [batch, seq, vocab] -> [batch, 1, vocab] + slice_starts = helper.make_node( + 'Constant', + inputs=[], + outputs=['last_pos_starts'], + value=helper.make_tensor( + name='last_pos_starts_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[-1] # Last position + ) + ) + + slice_ends = helper.make_node( + 'Constant', + inputs=[], + outputs=['last_pos_ends'], + value=helper.make_tensor( + name='last_pos_ends_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[2147483647] # Max int (until end) + ) + ) + + slice_axes = helper.make_node( + 'Constant', + inputs=[], + 
outputs=['last_pos_axes'], + value=helper.make_tensor( + name='last_pos_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Sequence dimension + ) + ) + + # Slice to get last position logits: [batch, seq, vocab] -> [batch, 1, vocab] + slice_last_logits = helper.make_node( + 'Slice', + inputs=[logits_output.name, 'last_pos_starts', 'last_pos_ends', 'last_pos_axes'], + outputs=['last_position_logits'] + ) + + # Squeeze to remove the sequence dimension: [batch, 1, vocab] -> [batch, vocab] + squeeze_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['squeeze_axes'], + value=helper.make_tensor( + name='squeeze_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Remove sequence dimension + ) + ) + + squeeze_logits = helper.make_node( + 'Squeeze', + inputs=['last_position_logits', 'squeeze_axes'], + outputs=['squeezed_logits'] + ) + + # Apply temperature scaling to reduce language confusion + temperature_constant = helper.make_node( + 'Constant', + inputs=[], + outputs=['temperature_constant'], + value=helper.make_tensor( + name='temperature_value', + data_type=TensorProto.FLOAT, + dims=[], + vals=[temperature] + ) + ) + + # Cast logits to float for temperature scaling + cast_to_float = helper.make_node( + 'Cast', + inputs=['squeezed_logits'], + outputs=['logits_float'], + to=TensorProto.FLOAT + ) + + # Apply temperature scaling: logits = logits / temperature + scaled_logits = helper.make_node( + 'Div', + inputs=['logits_float', 'temperature_constant'], + outputs=['temperature_scaled_logits'] + ) + + # Apply ArgMax to get the token ID: [batch, vocab] -> [batch] + argmax_node = helper.make_node( + 'ArgMax', + inputs=['temperature_scaled_logits'], + outputs=['token_id_batch_float'], + axis=1, # Along vocabulary dimension + keepdims=0 + ) + + # Cast back to int64 + cast_to_int = helper.make_node( + 'Cast', + inputs=['token_id_batch_float'], + outputs=['token_id_batch'], + to=TensorProto.INT64 + ) + + # Unsqueeze to make it [batch, 1] for consistency + unsqueeze_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['unsqueeze_axes'], + value=helper.make_tensor( + name='unsqueeze_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Add dimension at position 1 + ) + ) + + unsqueeze_token = helper.make_node( + 'Unsqueeze', + inputs=['token_id_batch', 'unsqueeze_axes'], + outputs=['next_token_id'] + ) + + # Create output info for next_token_id with dynamic batch size + next_token_output = helper.make_tensor_value_info( + 'next_token_id', + TensorProto.INT64, + [None, 1] # [dynamic_batch_size, 1] + ) + + # Add all nodes to graph + model.graph.node.extend([ + slice_starts, + slice_ends, + slice_axes, + slice_last_logits, + squeeze_axes, + squeeze_logits, + temperature_constant, + cast_to_float, + scaled_logits, + argmax_node, + cast_to_int, + unsqueeze_axes, + unsqueeze_token + ]) + + model.graph.output.append(next_token_output) + + print(f"✅ ArgMax node with temperature scaling ({temperature}) and correct output shape [1,1] added successfully") + return model + +def add_generation_logic(model, eos_token_id=151645): + """Add generation loop logic to the model.""" + print(f"🔄 Adding generation logic with EOS token ID: {eos_token_id}") + + # Create constant for EOS token as scalar - will broadcast to match next_token_id + eos_constant = helper.make_node( + 'Constant', + inputs=[], + outputs=['eos_token_constant'], + value=helper.make_tensor( + name='eos_token_value', + data_type=TensorProto.INT64, + dims=[], # Scalar - will broadcast to 
match next_token_id shape + vals=[eos_token_id] + ) + ) + + # Create equal node to check for EOS (comparing [1,1] tensors) + eos_check = helper.make_node( + 'Equal', + inputs=['next_token_id', 'eos_token_constant'], + outputs=['is_eos'] + ) + + # Create nodes for updating attention mask with dynamic batch size + # Get the batch size from attention_mask shape + batch_shape = helper.make_node( + 'Shape', + inputs=['attention_mask'], + outputs=['attention_mask_shape'] + ) + + # Create zero index constant for Gather + zero_index = helper.make_node( + 'Constant', + inputs=[], + outputs=['zero_index'], + value=helper.make_tensor( + name='zero_index_value', + data_type=TensorProto.INT64, + dims=[], + vals=[0] + ) + ) + + # Extract batch size (first dimension) + batch_size_scalar = helper.make_node( + 'Gather', + inputs=['attention_mask_shape', 'zero_index'], + outputs=['batch_size_scalar'] + ) + + # Convert batch size to 1D tensor for concatenation + batch_size_unsqueeze = helper.make_node( + 'Unsqueeze', + inputs=['batch_size_scalar', 'zero_axis'], + outputs=['batch_size'] + ) + + # Create zero axis constant for Unsqueeze + zero_axis = helper.make_node( + 'Constant', + inputs=[], + outputs=['zero_axis'], + value=helper.make_tensor( + name='zero_axis_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[0] + ) + ) + + # Create shape [batch_size, 1] for ones tensor + ones_shape = helper.make_node( + 'Concat', + inputs=['batch_size', 'one_constant'], + outputs=['ones_shape_tensor'], + axis=0 + ) + + # Create constant for value 1 + one_constant = helper.make_node( + 'Constant', + inputs=[], + outputs=['one_constant'], + value=helper.make_tensor( + name='one_constant_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + # Create ones tensor with dynamic batch size + ones_tensor = helper.make_node( + 'ConstantOfShape', + inputs=['ones_shape_tensor'], + outputs=['ones_tensor'], + value=helper.make_tensor( + name='ones_fill_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + # Concatenate attention mask with ones + concat_attention = helper.make_node( + 'Concat', + inputs=['attention_mask', 'ones_tensor'], + outputs=['updated_attention_mask'], + axis=-1 + ) + + # Create nodes for updating position_ids + # Create constants for slice parameters to get the last position + slice_starts = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_starts'], + value=helper.make_tensor( + name='pos_starts_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[-1] + ) + ) + + slice_ends = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_ends'], + value=helper.make_tensor( + name='pos_ends_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[2147483647] # Max int + ) + ) + + slice_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_axes'], + value=helper.make_tensor( + name='pos_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + # Slice position_ids to get last position + slice_position = helper.make_node( + 'Slice', + inputs=['position_ids', 'pos_slice_starts', 'pos_slice_ends', 'pos_slice_axes'], + outputs=['last_position'] + ) + + # Add one to last position to get the next position value + add_one = helper.make_node( + 'Add', + inputs=['last_position', 'one_constant'], + outputs=['next_position_value'] + ) + + # For generation, we only need the next position ID, not the full concatenated sequence + # The next_position should be [batch_size, 1] containing just the next position + 
# This is what the model expects for the next iteration + identity_position = helper.make_node( + 'Identity', + inputs=['next_position_value'], + outputs=['next_position'] + ) + + # Add all nodes to graph + model.graph.node.extend([ + eos_constant, + eos_check, + zero_index, + zero_axis, + batch_shape, + batch_size_scalar, + batch_size_unsqueeze, + one_constant, + ones_shape, + ones_tensor, + concat_attention, + slice_starts, + slice_ends, + slice_axes, + slice_position, + add_one, + identity_position + ]) + + # Add output tensors with dynamic batch sizes + outputs_to_add = [ + helper.make_tensor_value_info('is_eos', TensorProto.BOOL, [None, 1]), # Dynamic batch size, 1 sequence element + helper.make_tensor_value_info('updated_attention_mask', TensorProto.INT64, [None, None]), # Dynamic batch and sequence + helper.make_tensor_value_info('next_position', TensorProto.INT64, [None, 1]) # Dynamic batch size, 1 position element + ] + + model.graph.output.extend(outputs_to_add) + + print("✅ Generation logic with dynamic batch sizes added successfully") + return model + +def save_enhanced_model(model, output_path="./data/onnx/model_enhanced.onnx"): + """Save the enhanced ONNX model.""" + print(f"💾 Saving enhanced model to {output_path}") + + # Create output directory + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Validate model + try: + onnx.checker.check_model(model) + print("✅ Model validation passed") + except Exception as e: + print(f"⚠️ Model validation warning: {e}") + print("🔄 Proceeding with save anyway...") + + # Save model + onnx.save(model, output_path) + print(f"✅ Enhanced model saved successfully!") + + # Test with ONNX Runtime + try: + session = ort.InferenceSession(output_path) + print(f"✅ ONNX Runtime validation passed!") + print(f"📊 Enhanced model: {len(session.get_inputs())} inputs, {len(session.get_outputs())} outputs") + + # Print enhanced outputs + print(f"\n🚀 Enhanced model outputs:") + for output in session.get_outputs(): + if not output.name.startswith('updated_past_key_values'): # Skip cache outputs to reduce spam + print(f" • {output.name}: {output.shape}") + cache_outputs = [out for out in session.get_outputs() if out.name.startswith('updated_past_key_values')] + if cache_outputs: + print(f" • ... 
and {len(cache_outputs)} cache outputs") + + except Exception as e: + print(f"⚠️ ONNX Runtime warning: {e}") + print("🔄 Model saved but may need specific execution providers") + +def main(): + """Main export function - creates only the enhanced model.""" + print("=" * 70) + print("🚀 Qwen3-1.7B Enhanced ONNX Model Export") + print("=" * 70) + + try: + # Step 1: Download base model + base_model_path = download_base_model() + + # Step 2: Load and analyze the model + model = load_and_analyze_model(base_model_path) + + # Step 3: Add argmax node with temperature scaling + model = add_argmax_node(model, temperature=0.3) + + # Step 4: Add generation logic + model = add_generation_logic(model) + + # Step 5: Save enhanced model + save_enhanced_model(model) + + print("\n" + "=" * 70) + print("🎉 Enhanced model export completed successfully!") + print("=" * 70) + print("\n📋 Enhanced model features:") + print("✅ Integrated ArgMax with temperature scaling (0.3)") + print("✅ Built-in EOS detection") + print("✅ Automatic attention mask updates") + print("✅ Automatic position ID updates") + print("✅ Proper cache management") + print("✅ Dynamic batch size support") + + print(f"\n📁 Enhanced model saved to: ./data/onnx/model_enhanced.onnx") + print("🚀 Ready to use with improved generation capabilities!") + + except Exception as e: + print(f"❌ Export failed: {e}") + import traceback + traceback.print_exc() + raise + +if __name__ == "__main__": + main() diff --git a/nimblenet_py/simulation_assets/dict_model_test.py b/nimblenet_py/simulation_assets/dict_model_test.py new file mode 100644 index 00000000..b7cf4f41 --- /dev/null +++ b/nimblenet_py/simulation_assets/dict_model_test.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Dictionary interface test with actual ONNX model inference. + +This script tests both traditional tensor interface and new dictionary interface +with a proper add/subtract ONNX model that uses supported float32 data types. 
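Stepping back to the graph surgery in export.py above: the spliced-in nodes amount to a small piece of per-step post-processing around the base model. The NumPy sketch below illustrates what that subgraph computes as I read the node definitions (it is not the exported graph itself); the tensor names, temperature=0.3, and EOS id 151645 mirror the values used above. Note that dividing by a positive temperature does not change the result of a pure ArgMax.

# Rough NumPy equivalent of the post-processing export.py adds per decode step.
import numpy as np

def enhanced_step(logits, attention_mask, position_ids, temperature=0.3, eos_token_id=151645):
    # Slice the last position: [batch, seq, vocab] -> [batch, vocab], cast to float.
    last_logits = logits[:, -1, :].astype(np.float32)

    # Temperature scaling (a positive divisor leaves the argmax unchanged).
    scaled = last_logits / temperature

    # Greedy token choice, shaped [batch, 1] like the graph's next_token_id output.
    next_token_id = scaled.argmax(axis=-1)[:, None].astype(np.int64)

    # EOS flag, [batch, 1] boolean like is_eos.
    is_eos = next_token_id == eos_token_id

    # Append a 1 for the new token, like updated_attention_mask.
    ones = np.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype)
    updated_attention_mask = np.concatenate([attention_mask, ones], axis=-1)

    # Last position id + 1, [batch, 1] like next_position.
    next_position = position_ids[:, -1:] + 1

    return next_token_id, is_eos, updated_attention_mask, next_position

# Smoke test with toy shapes: batch=1, seq=4, vocab=8.
logits = np.random.rand(1, 4, 8).astype(np.float32)
outs = enhanced_step(logits, np.ones((1, 4), dtype=np.int64), np.arange(4)[None, :])
print([o.shape for o in outs])          # [(1, 1), (1, 1), (1, 5), (1, 1)]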
+""" + +from delitepy import nimblenet as nm + +# Load model at global scope as required by DeliteAI simulator +model = nm.Model("test_model") + +def test_tensor_interface(input_data): + """Test the traditional tensor-based model interface with actual inference.""" + try: + # Check model status + status = model.status() + + # Test with actual model.run() call + # Create test inputs: X=3.0, Y=2.0 (expected: sum=5.0, diff=1.0) + X_tensor = nm.tensor([[3.0]], "float") + Y_tensor = nm.tensor([[2.0]], "float") + + # Run model with traditional tensor interface + output = model.run(X_tensor, Y_tensor) + + # Extract results + sum_result = None + diff_result = None + inference_successful = False + + # Check if output exists and has elements (avoiding != None comparison) + if output: + if len(output) >= 2: + sum_result = output[0] + diff_result = output[1] + inference_successful = True + + return { + "status": "success", + "model_loaded": status, + "inference_successful": inference_successful, + "interface_type": "tensor", + "sum_output": sum_result, + "diff_output": diff_result, + "message": "Traditional tensor interface with actual inference" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +def test_dictionary_interface(input_data): + """Test the new dictionary-based model interface with actual inference.""" + try: + # Check model status + status = model.status() + + # Test with actual model.run() call using dictionary + # Create test inputs: X=5.0, Y=3.0 (expected: sum=8.0, diff=2.0) + X_tensor = nm.tensor([[5.0]], "float") + Y_tensor = nm.tensor([[3.0]], "float") + + # Create input dictionary for new interface + input_dict = {"X": X_tensor, "Y": Y_tensor} + + # Run model with dictionary interface + output_dict = model.run(input_dict) + + # Extract results by name + sum_result = None + diff_result = None + inference_successful = False + + # Check if output exists (avoiding != None comparison) + if output_dict: + try: + # Try to access outputs by name (this is the key test!) 
+ sum_result = output_dict["sum"] + diff_result = output_dict["difference"] + inference_successful = True + except Exception as access_error: + # If named access fails, try positional access as fallback + try: + if len(output_dict) >= 2: + sum_result = output_dict[0] + diff_result = output_dict[1] + inference_successful = True + except Exception as pos_error: + # Positional access also failed + inference_successful = False + + return { + "status": "success", + "model_loaded": status, + "inference_successful": inference_successful, + "interface_type": "dictionary", + "sum_output": sum_result, + "diff_output": diff_result, + "message": "Dictionary interface with actual inference" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +def test_interface_equivalence(input_data): + """Test that both interfaces produce equivalent results.""" + try: + # Check model status + status = model.status() + + # Test both interfaces with same inputs: X=4.0, Y=1.0 (expected: sum=5.0, diff=3.0) + X_tensor = nm.tensor([[4.0]], "float") + Y_tensor = nm.tensor([[1.0]], "float") + + # Test traditional interface + tensor_output = model.run(X_tensor, Y_tensor) + print("Tensor output: ",tensor_output[0][0][0]) + + input_dict = {"X": X_tensor, "Y": Y_tensor} + dict_output = model.run(input_dict) + print("Dict output: ",dict_output["sum"][0][0]) + + first_output_match = tensor_output[0][0][0] == dict_output["sum"][0][0] + second_output_match = tensor_output[1][0][0] == dict_output["difference"][0][0] + + return { + "status": "success", + "model_loaded": status, + "both_interfaces_equivalent": first_output_match and second_output_match, + "message": "Interface equivalence with actual inference tested" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules.zip deleted file mode 100644 index 128cfe4e4517501cb8b56aa1df621be3264f7143..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8017 zcmaKxWl)^mwyhfp?k)`^xLa@w1PHD{8%uC^2*KUm8i&RS65NBkbb!X)-Gl4NcW<3@ zckNyG)LXOOwSKJebBvnLP*p%cBme*aD1gDN8_-G>TaX4W06;AU03ZW^0K2bN_Uw+| zH8oHH2+Em$mMVY8%>x|(hj zT+zu-S+#7K(H6UOL?@WF;ABH7+x(l*u|Q^ZSCLA@HU8RyDD|Me&rm}xGvAMALYLXW z;J8>Jj2ApH6E(!dRJCb*Z>}R2Yg$>xfOChqUomyehOD!S{szVO&gm8zF8Zm`LS^z; zm*)A~PCb{T<%siSkc)xdVxh>Lm9g{K1?G=xSKP%wt6xc&YxhFg#BLWCX=u zAVNVzx^3=jKg1-nUlm;i>Cgcabtl(qg|(%WewR%`{DIrHsTf zq)7I(!iB0buU-D};!*r@|(~X=@iuSF8z6C}^vHmSOyjgg&I2^7Qd9 z8u>xbyphvDr97!mH2x7kq$3u8jO}rKS;rXjlv>)Q3wh;Qmrfuq-DVl5nZHFjMwT*8 z)z-yZ{-&709MwQ)fhdw1{fRQ7rkBXVI+Q3*QMGUwW_)SO;pMmqJg{F64e6ZyiY`4$L}=0Aei!pY$j^u4->+X)y){a7*spNX!s`6uI`i>6E5mM6!jQ-L%tu^>_ME7+LGwM3l+1NMW_Uq+)i z%}jLhZr_6SH{juUNA5ERX0-xKSbRZJ_i9lB(i5$Z;t<72-l=+JlYrgP!y|@`lsQTl zP2(B5%bNZuGnb%3_L|cD57Uw>Y(mV^z$NYlCf|t#M85EtU?k4Rd{$49c5noUsBp*z z&r=;SGuCL>Q`v2xn7OEBIa0laQqdT0#lNr&R-5Q-B2&k4aO_}d{N!mo;ud@ooO0)) zm5Ms;SP2wMJxmti`W3CIbpJA!BAoxVl-!K-veBam7M|jZj%&qd=@D$ggS&CnTOdo7 zpSYHS`T3Wm>BQNz{y5A{+K*|z97n(PX;)&qhJh*!4oWpQDjC*x* z&r7XlB7n2wx#7G#wZogyw&AFAv=={zoM55;O1N0km8I)S@#90J%&)j!8KbqZ^I_>T zkoN?Nkcb9ky;nDbZhskRN31*@-@j9^Gx*bo`bvZ!@@ds5Tr{m$Cp|Z07$$?X>?SRO z3HH*XIS{ys}ptDC$GX7Iy;*zD*|J;{UYW z(kQgAVVym!dE5K1hQ-KOzrP*reCcK!!f)Z=3iwutS0bMwn*JU$%JBmVLJ7U9=gsvj z@a5j&CQ?Y~xOd2ot-_nC^=K)hs(}0fLr;JmMpfBQ>AKC|j-xT+3dR;RCtZ^!?M$g- zPjQ(}t8=*R^`t*ILk>)g;+}|D*XvLIs(<8-ebTguG#V|wBs_wEK)oUVMtFxX7UTw+ zH)$@*oEiw@(4K!U=Pd~Cm#z&+bp2U-`h^pPl7ce$v<@G9qGuLe&=q3Upgk|IBll=J 
zv`Xk5BD~F`r^!6DEd$~oNh4W{atn@>2G!(5M!9p8fx*&m-oOj=%i%Fm(EyJxQp;FwE{-cLc_wGS-TxS_z!OR4?x9x;*v+cXxR> zOAK1BaFlbZ?uGJY8X{(^cj(@2FjQ|$eQfQ84#pWgE$WPN+ED@c_B?7wZOHAswp^LC zNB3t5m2`U+c&d!yv;txvWSS@~L&e_%@ZWqING(5kIzImCOrfv`XDk(-@G4gg--WAd@)TE|O2R4d67rFbr>*`N zP4KEiO@)5o0NrOPrJ;T zQ3vnJMow*g^$D*M{pf7{ExW@0#TGmok5pLKOA4_f+cKOIFV!a4h}RL5honf)Kd7$^xAYdGrJw+Wd1{SlHtawIXos$akSa+|k6&$$)}{Ho)S zkSTSI)D7P5)d@)ey(vc||3ru646Ww)WsS=5)C;!v2?2yEQ*M?q5K>93V~MX9IF&n@ z6-ly}>=D;qI}eud9u6Av-V){zh;-PmDb0Wb8iQn3BKI0y?Jaf!;#Tj1Il^1x5^mvsPQTKSB4pbInC~mfC0I{*yl0avBz7!OYnz#YmT#80rz57(-9+4M- z6jr-x5J5=ytPW)ZiVdwmS1)ju$P>qmlAHTk-1rB^jqMPW>FxEr1>oKdEnwu*_8oq; zQ-RR0DM@P4*&1X*w=gO@j1WflNRJcYRxH@ARaoUp?ocvY|7nBEAR!z^V6>c#PHBJfXY5xt-*SkMTl^f_j^d~*qN zX*TTC@T*^0tRw;PJC(WAFXC>+7BZU9FX->LX+zDb27Gq(&ooko_8l0u8cuii-6|0V zsWH*27N`mH=yT}PcQsE>E_hC22L^9z$%oN>77LK| zR6cVB#jsTa9J6FhtQhoBE^%InIhSuk?cEFJ;-|5mLK@^=*#=|UF8Hy-#;j-u6elc{ z0A7p-!70e@MwuIRzp3NJC|8)(9aGoTX{_Q{?a0rh-sMc2Z4$-B?PU=J_$Kv`6q42A z3l_U$rIkGmA*?C2o^ri!=SuqeT*YPPsy^^)51rHgG^e$-4jnvJVap|rK3r7*2wehlt>-Ah8MI9=Pt$KE3`Ut*e^ z?M*{IfYjBui$!XiDs~`gk1>~qH7}Z1{~0Q7PYLE1T=QU+gPc&6D=42p`#&_5Hd`FJ zx(Ae!V=p%SI39Suc_;Rgp0_C4%Wys`_(J62cCuCK%hK~~lm4)k8zB*TfTUDLlq00S zl;L1VAK}6=(nqmv$It7V+G9MAs3a^e_CVO`{sH=q1=TBjFm(dC+(#Y;n1fn zv}nlq;1Xr=6Y)7;wtJJDhY?4{{K`ZTNxaB7b4%9IjD_UcuG;)G+W~mxP?dfY;Q1atTAiJh7N zB1uvm_xsLI3v=(2pONawph9@Zv+0XgQB&!Of*r<26ib+bL(a{b<+rJhtre+>0-q_e zSE3ugxnd`*IfWgbC3f6UN+T(~$~xXr3&@C0r@Lki!-Kca3)dTXtX<>vHbaGr2fn)6 z-{fIsW7gZ^wDoRq&R6HV>z_CyCVy*vTOO{(bP`k5$c$Td@bS;U2J|vdjejP5SF@R@ zF_T3@kNN=^Lc16*eoY~0RerLAWWm!WKW^FH)@K`f&vw~Ch3#I}Vj?_;?rf~Bk-qXl z0z(GXXxI7Dro`4`+c-Zbkq2P*1$1CZXvgdy_iiLB*Ib`^HKOMc>HIBPoZnbG3z;*u zKg7;2Ai&`Otmb2>fKDbV(5W=Zk2}p#8R%r>2kwF*AhU4(5fSMVyHU>bI*q%&!nxzY zZ*F^Uf+=(d70?nBL2(}FAlh!ed6bfFKC>~=@gAK~Qds64rg?0inzMI|Ee*BWrbXSN z$S7NAejeV!$qCPSk5!-Mxd9h!@dD;|Y7J+wmS=X2tqF&?Xruc4#6yfQzC-*lW+L)) zPz+?iXyHn!y(`!LviPjo9nm2?j@9`kZa%5xbd+j8Uo{dumIuqn?E6Lx-W@rg6JzBL z2Ol@ITR`#8c*(xCG-^*A*hVjsY0ExQfw@E)J&W(k=}gQS*z(1za6w2v3HAcSl!^*Z za8Qm=f&kIyPhDa^$01Y#T7F2c6u_%&o&BxfdQGuAc@i0WO}44#7w{RaJ9wL|Wxw@W zbEI|eKlU^iFo~wPyN6f7H>`zkWLES%@)?%Mrmy9yP5(@4J=?EsQJUb*S+b{_B^A~n zW2Kxptnm~M7!eHdltH-gOFtJh!95Pk+R&%rdG~osx7+D_C`Lt}oQY`v6<+m!pp4{S zC}WnA8kPG70B9iu07(Br8K-Y%_C~G_HfHu#o@UNQc2@RQc3*A(8`EfJ_*rWGoqxkL zl{#wqM z?v1Dtz^HF)J|e3e=6$}z>J{IISN-7bX&zN8#Up=CROegI^Co}noxv2g@JnvpP>gUh zi2NuHJ%y;{Co<67G#a1hK@!3fB{8{?(@|CN=~=uJG%@EigN*pm2$JIYx`H*xJfJ@U zd8>oNlJd)D85M61*(Bd<7E~S&%V{|mv3AFtZc{ezqtGqteA*IlcJS&H82WWWRd&iQ%*nWKF*fl0oL*L8DP{i+%=t zL3a>?&i~mLW8KQIk>@t)jIpa?Lh#zND#GDTzxeZ0d(RL0yJMUk>X#qaDaQN=^;XHB zwH`vGPvDwFABfRks20E5-r*m+`Z1CAxMoP_m46feh1fR?k(hFGNxnzo@OamA(2(F6 z^9r8L*)zLh9K*39Z2AErcP|{7_ zlRT!+WSkpJl`TwQ-1GY@N!Bii5Y;$vg6&ksgIAU3OJb&!CD$avqEpJjaWw8UH5t6? 
zo7qAu!9D>N^PPvS%Naf2kjL3Sv%=d$w~Ql6Ng)Ncl_5_nFFj4d?M$HFI#O~-w%#(m zdqo?59qxMeT=0d6O3JulCL^;*@7$+;l+T%`ctGT&4cX0H%O(q#V1tTanSH@)usn67 zyQd`!BhB|d9|O@EA%rsuH3yLG-pE@O+Ymv_?J!+D$@6k$*E;R*P>uFwnzM~`j7GjJ z0pIW<|Gn0L0bmd@6=MgQ$Jwbfi`aou8nQ8uH#e&d5c^O$R6S0b)L8zZR zWUz1HzJIU@kC5jkHiY}ETYZyyX7#V-Deh}wLXEvq1j(O zPFc*{`Jp46hG7h(3yu<;U~uof2crUZ>==)kYo%9>y(4~o0Wt{9`v3@AyD&anLlhidSA|k-N)!8e%Ku1e&IwT91M+xWq!zr zO{E~y(hE6Nq%Je{u@Hmyu!POrGvezzVnCk~sKB1XhndTamO19}JTDD`RRmh!W7)qa zjw_$Y_C7-_adsfrG;A^(28~xj;0k_Mj?Wtu*6j}N9p1e)p*dod^li34A?zg%Lsm}A zgAtnX7_0Fa>$h`kCH25r_FGk<;$oIIgFY{%@E!SuKd^vvPAZA70Y$HiQUOU&$4g@w zV>~dXZr;Z`|XWb?p8z_?J^7r_W+z%C>{_2Z1=tchky8}ZY@pVX|F|bLOW(raXo}1wEi+k9QNdf;kZ)RE*|Kb|g1{Blj z^E(1_{WvNCm`hcL?>aY`m1H!1eJEyE+?K#uGZMd6qylCcyCT9Hj9SuC(`4**ZpSlc zx;QH&3LHEF3$eWJbDX{oWcEML`bRT^x8}|dB7Aid&NMvq`dp|Egh^H5%Uz^B3^FFB z=Q6L-G7?o+`ny`~p-{74sy?~M&p@dfeFQiTS2(Aj?dG$a9CtM$ z-p`;(8AZ9uBV=sZ4x%!P>K2TIS<_MrC6;yT)lgGJ%DN5z}Y)68PhIbSap3IquFeXe8vb zN0Ob4#A7=eroxgH9TMpnVQ&O5Wq+8g9!7)e^^R26Q>%y8x}|HDx>t>+?!!3CXPcVC zmYUx@R3_}>nNM6AB9V^i`O}7`%sTpVs8Ov%0@!F~;q%FlqZQlM4XpX@0OGV&JjZ-q z8mdNdXk{4tB2$k!%Us>LZ#Oa4lxILa9)u%tf06VUD3qM=1|;~dEcR2%42}%HlVN$G z-0+YYOwkfo8`+HZLG?xP+5bT5q>+Eb)}iu1nYs0ut<{ojXIzlo5hW>OqXK^0aGi)4 zTSo~WF`52@mY(r&d$4{HBc+cSL7eXxBjt%`q?*WNWQ$F}vA5Y<;Sc6-g}tx~8BV1& zVm2?dKV94*M^aab2IN8^O&z9mLxa2$FFymIVmosnQO{ZJcI75`dIgV7W=HEAJD=H@ z=uJ+HRE8WlU8bvPr*RnVOJ-f6&f{Q7gx$*^gjy$`o6HEplhZ_@f{>P4!UFshH* z1*t**ILWb6{G|eSdzE$6O8%0*pNcmuF7qg=uwdXT+C(0|z`@l+_6yS$a<3uwYiafk z2`>_qt`X~Q^mOEdp@9SZe)ci^xjA~*0z2T0oge;qwEv-|O^Q zN|ku;bEvq-uRra&aPpGrz-wr|Z{&MT@d{LrqEfI;PO@eyRW-Cyd@ilPXB25B;o?de z35^v+8?384U04_%<*TH(NUw)cVYD(lgBkcHPwvz=EuKHP@9vlF9_(PQK=URQ*;wXy zv7+<={0(0^I|QwT}r#h-NH7aFzJ{v+T)g>n<8mPlG#KQj32z}+p=5n&$$gFtgbi`T*OG9k-Rm_(2TI-1!<(A`V ziR9heDyN4d$B5nSmG$e_D4J8VHp#R3STa!S*+qh&qZT)ZpnlT<1>$eYY!g2yKL}5p z_Cq&ya+Xzj3|J$mWA|4}n#(Nh!gzRL`gU|l{LM}Dl};o}EuF@Y<}k;me)bso$nsD3 z+L3?8n+@eoPh4LE(Sy!>&nW)6wpo(HboZU{Z(BXZU_DH# zj=VQZ?>WTqs|NOsR2AUhDG~mE6*9@+Y8e3d+Z6)}1O7SvPoezg{}#&s7GnP@_MfZI zzlx>&ZK?m4Md+{mKa0@+BLAN``>*n0#DITE-M?!8r0##Hz4@Eb006?@@8R!Xg7Tl+ Fe*sNaC6NFC diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py index 82133905..8faee540 100644 --- a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py @@ -8,8 +8,12 @@ # Load Qwen3 1.7B 4-bit model and tokenizer model_id = "onnx-community/Qwen3-1.7B-ONNX" -qwenModel = nm.Model("qwen3-1.7b") -print("Model loaded successfully") +try: + qwenModel = nm.Model("qwen3-1.7b") + print("Model loaded successfully") +except Exception as e: + print("Error loading model: " + str(e)) + qwenModel = None chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) 
%}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" @@ -71,7 +75,7 @@ def execute_function_call(function_name, arguments, tools): def format_tool_response(result): """Format tool execution result using token-based format""" - return TOOL_RESPONSE_START_TOKEN+result+TOOL_RESPONSE_END_TOKEN + return TOOL_RESPONSE_START_TOKEN+str(result)+TOOL_RESPONSE_END_TOKEN def execute_tool_call_with_response(function_name, arguments, tools): """Execute a function call and return both result and formatted response""" @@ -84,11 +88,13 @@ def parse_tool_calls_from_response(response_text, tools): tool_calls = [] # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} - json_tool_pattern = r'\s*({.*?})\s*' - json_matches = re.findall(json_tool_pattern, response_text, re.DOTALL) - - for json_str in json_matches: + # Simplified regex without backtracking - match everything between tags + json_tool_pattern = r'([^<]*)' + for match in re.finditer(json_tool_pattern, response_text): try: + json_str = match.group(1) # Extract the 
first capture group (the JSON content) + print("DEBUG: Captured JSON string: '"+json_str+"'") # Debug what we captured + print("DEBUG: JSON string length: "+str(len(json_str))) # Check length tool_data = nm.parse_json(json_str) func_name = tool_data["name"] arguments = tool_data["arguments"] @@ -144,60 +150,7 @@ def render_jinja_template(messages, tools, add_generation_prompt, enable_thinkin message = content_messages[i] role = message["role"] content = message["content"] - - if role in ["user", "system"]: - result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" - elif role == "assistant": - # Handle assistant messages - result = result + "<|im_start|>assistant\n" - - # Check for reasoning content - reasoning_content = "" - if "reasoning_content" in message and message["reasoning_content"]: - reasoning_content = message["reasoning_content"] - elif "" in content: - # Extract thinking content - parts = content.split("") - if len(parts) > 1: - think_part = parts[0] - if "" in think_part: - reasoning_content = think_part.split("")[-1].strip() - content = parts[-1].lstrip() - - # Add thinking section if present - if reasoning_content: - result = result + "\n" + reasoning_content + "\n\n\n" - - result = result + content - - # Handle tool calls - if "tool_calls" in message and message["tool_calls"]: - for tool_call in message["tool_calls"]: - if content: # Add newline if there's content before tool call - result = result + "\n" - - func_call = tool_call - # Handle function calls - if "function" in tool_call: - func_call = tool_call["function"] - - result = result + "\n" - result = result + "{\"name\": \"" + func_call["name"] + "\", \"arguments\": " - result = result + func_call["arguments"] - - result = result + "}\n" - - result = result + "<|im_end|>\n" - - elif role == "tool": - # Handle tool response messages - if i == 0 or messages[i-1]["role"] != "tool": - result = result + "<|im_start|>user" - - result = result + "\n\n" + content + "\n" - - if i == len(messages)-1 or messages[i+1]["role"] != "tool": - result = result + "<|im_end|>\n" + result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" # Add generation prompt if requested if add_generation_prompt: @@ -256,11 +209,13 @@ def generate_with_model(conversation_messages, max_new_tokens, tool_schema, toke num_hidden_layers = model_config_dict["num_hidden_layers"] eos_token_id = model_config_dict["eos_token_id"] hidden_size = model_config_dict["hidden_size"] - # Initialize past cache values with correct shapes for ONNX model - past_cache_values = {} - # Check if config has layer_types (like LFM2) - # Since we always set layer_types in SimpleConfig, we can just check if it's None + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids + } + # Initialize past cache values with correct shapes for ONNX model if "layer_types" not in model_config_dict: model_config_dict["layer_types"] = [ "full_attention" @@ -271,46 +226,50 @@ def generate_with_model(conversation_messages, max_new_tokens, tool_schema, toke for kv in ('key', 'value'): # Initialize with a small valid tensor that will be replaced after first forward pass # Using sequence length 1 to avoid dimension 0 issues - past_cache_values['past_key_values.'+str(i)+'.'+kv] = nm.zeros([batch_size, num_key_value_heads, 1, head_dim], "float") + model_inputs['past_key_values.'+str(i)+'.'+kv] = nm.zeros([batch_size, num_key_value_heads, 1, head_dim], "float16") elif model_config_dict["layer_types"][i] == 'conv': 
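+                # Conv-type layers (e.g. LFM2-style blocks) cache a single tensor of shape
+                # [batch_size, hidden_size, conv_L_cache] instead of per-head key/value pairs;
+                # like the KV placeholders above, it starts as a dummy value and is expected to
+                # be replaced by the model's "present_conv" outputs after the first forward pass.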
- past_cache_values['past_conv.'+str(i)] = nm.zeros([batch_size, hidden_size, model_config_dict["conv_L_cache"]], "float") + model_inputs['past_conv.'+str(i)] = nm.zeros([batch_size, hidden_size, model_config_dict["conv_L_cache"]], "float16") # 3. Generation loop generated_tokens = [] for i in range(max_new_tokens): - # Run model - returns a tuple where first element is logits, rest are cache values - # Try passing cache as single dictionary parameter - model_outputs = qwenModel.run(input_ids, attention_mask, position_ids, past_cache_values) - - # Extract logits (first element) and cache values (rest) - logits = model_outputs[0] - present_cache_values = [] - for j in range(1, len(model_outputs)): - present_cache_values.append(model_outputs[j]) + try: + model_outputs = qwenModel.run(model_inputs) + # Check for EOS token + if model_outputs["is_eos"][0][0]: + break - # Update values for next generation loop - next_token_id = nm.argmax(logits[0, -1, :]) + # Update values for next generation loop + model_inputs["input_ids"] = model_outputs["next_token_id"] + generated_tokens.append(model_inputs["input_ids"][0][0]) - # Check for EOS token - if next_token_id == eos_token_id: + if "updated_attention_mask" in model_outputs: + model_inputs["attention_mask"] = model_outputs["updated_attention_mask"] + if "next_position" in model_outputs: + model_inputs["position_ids"] = model_outputs["next_position"] + except Exception as gen_error: + print("Generation error: " + str(gen_error)) break - generated_tokens.append(next_token_id) - input_ids = nm.tensor([[next_token_id]], "int64") - attention_mask = nm.concatenate([attention_mask, nm.ones_like(input_ids, "int64")], axis=-1) - position_ids = position_ids[:, -1:] + 1 - - # Update cache - j = 0 - for key in past_cache_values: - past_cache_values[key] = present_cache_values[j] - j = j + 1 - + # Update cache using present outputs (present.X.key/value → past_key_values.X.key/value) + for cache_key in model_inputs.keys(): + if ('past_key_values.' in cache_key): + # Convert past_key_values.X.key/value to present.X.key/value + splits = re.split(r'\.', cache_key) + present_key = "present."+splits[1]+"."+splits[2] + if present_key in model_outputs.keys(): + model_inputs[cache_key] = model_outputs[present_key] + else: + print("⚠️ Warning: Expected cache output "+present_key+" not found") + elif 'past_conv' in cache_key: + # Handle conv cache if present + present_key = cache_key.replace("past_conv", "present_conv") + if present_key in model_outputs.keys(): + model_inputs[cache_key] = model_outputs[present_key] # 4. 
Output result - decode only the generated tokens response = "" if generated_tokens: - response = tokenizer.decode(generated_tokens) - + response = tokenizer.decode(nm.tensor(generated_tokens, "int32")) return response.strip() @@ -370,10 +329,10 @@ def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, too # Add tool results to conversation as function messages for tool_result in tool_results: - if "error" not in tool_result["result"]: + if "error" not in tool_result["result"].keys(): conversation_messages.append({ "role": "system", - "content": "The result of the tool " +tool_result['function']+" is: "+TOOL_RESPONSE_START_TOKEN+tool_result['result']+TOOL_RESPONSE_END_TOKEN + "content": "The result of the tool " + str(tool_result['function'])+" is: "+TOOL_RESPONSE_START_TOKEN+str(tool_result['result'])+TOOL_RESPONSE_END_TOKEN }) prompt = "continuation" if step == 0: @@ -385,7 +344,7 @@ def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, too "response": response, "tool_calls": tool_calls, "tool_results": tool_results, - "has_errors": len([True for tr in tool_results if "error" in tr["result"]]) > 0, + "has_errors": False, "tool_context": tool_context, "conversation_messages": conversation_messages } @@ -397,7 +356,7 @@ def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, too break # Simple continuation logic: if no tools were called, we're done - if not tool_calls: + if len(tool_calls) == 0: print("✓ Completed after "+str(step + 1)+" step(s) - no tool calls needed") break @@ -435,39 +394,64 @@ def run_tool_calling_demo(input): print("=== Qwen3 1.7B Tool Calling Demo ===\n") print("Model: "+model_id) - # Ensure tokenizer has necessary tokens - tokenizer = tokenizers.from_json(input["tokenizer_config_dict"]) - - # Get tool names without using list() - tool_names = [] - for key in tools_dict.keys(): - tool_names.append(key) - print("Available tools: "+str(tool_names)) - - demo_prompts = [ - "What's the weather here today?", - "Calculate 15 * 23", - "What time is it in JST timezone?", - "Where am I located?", - "Get my location and check the weather there" - ] - - i = 1 - for user_prompt in demo_prompts: - print("\nDemo "+str(i)+": "+user_prompt) - print("--------------------------------") - step_results = handle_multi_step_request(user_prompt, 4, 400, tools_dict, tls, tokenizer, input["config_dict"]) - # Show final summary - print("\nMulti-step Summary:") - for step_result in step_results: - step_num = step_result["step"] - tool_calls = [] - if "tool_calls" in step_result: - tool_calls = step_result["tool_calls"] - if tool_calls: - print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") - for call in tool_calls: - func_name = call["function_name"] - print(" ✓ "+func_name) - print("\n" + "--------------------------------") - i = i + 1 + try: + # Ensure tokenizer has necessary tokens + tokenizer = tokenizers.from_json(input["tokenizer_config_dict"]) + + # Get tool names without using list() + tool_names = [] + for key in tools_dict.keys(): + tool_names.append(key) + print("Available tools: "+str(tool_names)) + + demo_prompts = [ + "What's the weather here today?", + # "Calculate 15 * 23", + # "What time is it in JST timezone?", + # "Where am I located?", + # "Get my location and check the weather there" + ] + + all_results = [] + i = 1 + for user_prompt in demo_prompts: + print("\nDemo "+str(i)+": "+user_prompt) + print("--------------------------------") + step_results = handle_multi_step_request(user_prompt, 
4, 400, tools_dict, tls, tokenizer, input["config_dict"]) + # Show final summary + print("\nMulti-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = [] + if "tool_calls" in step_result: + tool_calls = step_result["tool_calls"] + if tool_calls: + print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(" ✓ "+func_name) + print("\n" + "--------------------------------") + all_results.append({ + "demo_" + str(i): { + "prompt": user_prompt, + "steps": len(step_results), + "successful": len(step_results) > 0 + } + }) + i = i + 1 + + # Return a proper map with results + return { + "success": True, + "model_loaded": True, + "total_demos": len(demo_prompts), + "results": all_results + } + + except Exception as e: + print("Error in demo: " + str(e)) + return { + "success": False, + "error": str(e), + "model_loaded": False + } diff --git a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py index 0b3743a2..168d3cf7 100644 --- a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py +++ b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py @@ -14,11 +14,12 @@ def main(): print("=== Running Qwen Demo ===") print("This demo shows Qwen model and tool calling capabilities\n") + base_dir = "../../../models/Qwen3-1.7B/data" model_name = "qwen3-1.7b" - vocab_file = "./qwen/vocab.json" - merges_file = "./qwen/merges.txt" - config_file = "./qwen/config.json" - tokenizer_config_file = "./qwen/tokenizer.json" + vocab_file = base_dir+"/vocab.json" + merges_file = base_dir+"/merges.txt" + config_file = base_dir+"/config.json" + tokenizer_config_file = base_dir+"/tokenizer.json" # Module configuration for simulator modules = [ @@ -39,7 +40,7 @@ def main(): "version": "1.0.0", "type": "model", "location": { - "path": "./qwen/onnx/model_q4f16.onnx" + "path": base_dir+"/onnx/model_enhanced.onnx" } }) print(f"Added model: {model_name}") diff --git a/nimblenet_py/simulation_tests/test_simulator_script.py b/nimblenet_py/simulation_tests/test_simulator_script.py index 8c005f06..5233e1b5 100644 --- a/nimblenet_py/simulation_tests/test_simulator_script.py +++ b/nimblenet_py/simulation_tests/test_simulator_script.py @@ -672,7 +672,164 @@ def test_tokenizers(): print("All tokenizer tests passed!") +def test_model_dictionary_interface(): + """Test the new dictionary-based model interface alongside traditional tensor interface.""" + + # First, create a proper test ONNX model with supported data types + import onnx + from onnx import helper, TensorProto + import os + + def create_add_sub_model(): + """Create an ONNX model that takes X, Y and returns sum, difference.""" + # Define inputs + X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1]) + Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1]) + + # Define outputs + sum_output = helper.make_tensor_value_info('sum', TensorProto.FLOAT, [1, 1]) + diff_output = helper.make_tensor_value_info('difference', TensorProto.FLOAT, [1, 1]) + + # Create addition node: sum = X + Y + add_node = helper.make_node( + 'Add', + inputs=['X', 'Y'], + outputs=['sum'], + name='add_node' + ) + + # Create subtraction node: difference = X - Y + sub_node = helper.make_node( + 'Sub', + inputs=['X', 'Y'], + outputs=['difference'], + name='sub_node' + ) + + # Create the graph + graph = helper.make_graph( + nodes=[add_node, sub_node], + name='AddSubGraph', + inputs=[X, Y], + outputs=[sum_output, 
diff_output] + ) + + # Create the model + model = helper.make_model(graph) + model.opset_import[0].version = 9 # Use opset 9 for IR version 10 compatibility + model.ir_version = 6 # Explicitly set IR version to 6 for compatibility + + # Check and save the model + onnx.checker.check_model(model) + + # Determine correct path based on current working directory + current_dir = os.getcwd() + if "simulation_tests" in current_dir: + # Running from simulation_tests directory + model_path = "../simulation_assets/test_add_sub_model.onnx" + module_path = "../simulation_assets/test_add_sub_model.onnx" + else: + # Running from nimblenet_py directory (pytest) + model_path = "simulation_assets/test_add_sub_model.onnx" + module_path = "simulation_assets/test_add_sub_model.onnx" + + onnx.save(model, model_path) + + print(f"✅ Created test model: {model_path}") + print("📋 Model details:") + print(" Inputs: X (float32 [1,1]), Y (float32 [1,1])") + print(" Outputs: sum (X+Y), difference (X-Y)") + + return module_path + + # Create the test model + module_path = create_add_sub_model() + + script_path = "../simulation_assets/dict_model_test.py" + + modules = [ + { + "name": "test_dict_model", + "version": "1.0.0", + "type": "script", + "location": { + "path": script_path + } + }, + { + "name": "test_model", + "version": "1.0.0", + "type": "model", + "location": { + "path": module_path + } + } + ] + + assert simulator.initialize('''{"debug": true, "online": false}''', modules) + + # Test traditional tensor interface + tensor_results = simulator.run_method("test_tensor_interface", {}) + print(f"Tensor interface test results: {tensor_results}") + + assert tensor_results["status"] == "success" + assert "model_loaded" in tensor_results + assert tensor_results["model_loaded"] is not None + + # Check if actual inference was performed + if "inference_successful" in tensor_results and tensor_results["inference_successful"]: + assert "sum_output" in tensor_results + assert "diff_output" in tensor_results + print(f" ✅ Tensor interface inference successful!") + print(f" Sum result: {tensor_results.get('sum_output')}") + print(f" Diff result: {tensor_results.get('diff_output')}") + else: + assert "tensor_created" in tensor_results + assert tensor_results["tensor_created"] == True + + # Test dictionary interface + dict_results = simulator.run_method("test_dictionary_interface", {}) + print(f"Dictionary interface test results: {dict_results}") + + assert dict_results["status"] == "success" + assert "model_loaded" in dict_results + assert dict_results["model_loaded"] is not None + + # Check if actual inference was performed + if "inference_successful" in dict_results and dict_results["inference_successful"]: + assert "sum_output" in dict_results + assert "diff_output" in dict_results + print(f" ✅ Dictionary interface inference successful!") + print(f" Sum result: {dict_results.get('sum_output')}") + print(f" Diff result: {dict_results.get('diff_output')}") + else: + assert "dict_created" in dict_results + assert dict_results["dict_created"] == True + + # Test interface equivalence + equivalence_results = simulator.run_method("test_interface_equivalence", {}) + print(f"Interface equivalence test results: {equivalence_results}") + + assert equivalence_results["status"] == "success" + + # Check if actual inference comparison was performed + if "both_interfaces_equivalent" in equivalence_results: + assert equivalence_results["both_interfaces_equivalent"] == True + print(f" ✅ Both interfaces successfully performed inference and 
produced equivalent results!") + else: + assert False + + print("All model dictionary interface tests passed!") + + # Clean up the created model file + try: + os.remove(module_path) + print(f"🧹 Cleaned up test model: {module_path}") + except Exception as cleanup_error: + print(f"Could not clean up model file: {cleanup_error}") + if __name__ == "__main__": test_simulator() test_python_modules() test_tokenizers() + test_model_dictionary_interface() From c274f0cc3ea4506511b21ed2c5b42b154e7c9107 Mon Sep 17 00:00:00 2001 From: Puneet Jindal Date: Thu, 24 Jul 2025 15:55:35 +0530 Subject: [PATCH 4/7] Redo deliteai.dev website (#163) Signed-off-by: Puneet Jindal --- .github/workflows/publish-docs.yml | 25 +--- .vscode/extensions.json | 5 +- CODE_OF_CONDUCT.md | 6 + CONTRIBUTING.md | 6 + README.md | 72 ++++++++- agents/examples/android/README.md | 6 + .../android/README.md | 2 +- coreruntime/delitepy/README.md | 6 + coreruntime/delitepy/docs_template/index.md | 4 +- docs/deliteai.dev/conf.py | 138 +++++++++++++++++- docs/deliteai.dev/index.md | 126 ---------------- docs/scripts/build_website.sh | 30 ++-- mockserver/README.md | 6 + 13 files changed, 261 insertions(+), 171 deletions(-) delete mode 100644 docs/deliteai.dev/index.md diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 18609241..bf3c397a 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -36,41 +36,32 @@ jobs: - name: Generate dummy local.properties working-directory: sdks/android + shell: bash run: | cat < local.properties sdk.dir=/usr/local/lib/android/sdk - storeFile=/path/to/keystore.jks - storePassword=your_store_password - keyPassword=your_key_password - keyAlias=your_key_alias - ANDROID_DEV_AWS_ACCESS_KEY_ID=your_aws_key - ANDROID_DEV_AWS_SECRET_ACCESS_KEY=your_aws_secret - ANDROID_DEV_AWS_S3_URL=your_s3_url - OSS_USER=your_maven_user - OSS_PASSWORD=your_maven_password - ANDROID_TEST_CLIENT_ID=test_client_id - ANDROID_TEST_CLIENT_SECRET=test_client_secret - ANDROID_TEST_HOST=https://test-api-endpoint.com - REMOTE_LOGGER_KEY=your_logger_key - REMOTE_LOGGER_URL=https://your-logging-endpoint.com EOF - name: Generate Dokka docs working-directory: sdks/android + shell: bash run: ./gradlew dokkaGfm - name: Build site working-directory: docs + shell: bash run: | pip install -r requirements.txt ./scripts/run build_website - name: Sync to S3 + shell: bash run: | - aws s3 sync ./docs/build/deliteai.dev/html s3://$S3_BUCKET --delete + aws s3 sync ./docs/build/deliteai.dev/html "s3://${S3_BUCKET}" --delete - name: Invalidate CloudFront Cache + shell: bash run: | aws cloudfront create-invalidation \ - --distribution-id "$CLOUDFRONT_DIST_ID" \ - --paths "/*" \ No newline at end of file + --distribution-id "${CLOUDFRONT_DIST_ID}" \ + --paths "/*" diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 8e0d7b56..bb797e87 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -1,6 +1,9 @@ { "recommendations": [ "editorconfig.editorconfig", - "davidanson.vscode-markdownlint" + "davidanson.vscode-markdownlint", + "charliermarsh.ruff", + "executablebookproject.myst-highlight", + "github.vscode-github-actions" ] } diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 112c0e80..18576089 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,9 @@ + + # Contributor Covenant Code of Conduct ## Our Pledge diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9582f82e..40a26cca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ 
-1,3 +1,9 @@ + + # Contributing to DeliteAI Thank you for your interest in contributing to DeliteAI! This document provides guidelines and diff --git a/README.md b/README.md index 52c4bc1c..c5cfd895 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,9 @@ + +
   DeliteAI
- + + DeliteAI is a powerful on-device AI platform for building agentic workflows that enables developers to deliver secure, privacy-aware, and high-performance AI native experiences and applications @@ -59,7 +66,13 @@ To get started, you can: with its Python bindings - Build and try out the [Android](sdks/android/README.md) and [iOS](sdks/ios/README.md) SDK and sample app + - Explore the available agents in the [agents](agents) directory. + + Visit the [assistant](https://github.com/NimbleEdge/assistant) repo to see it all in action. @@ -86,3 +99,60 @@ Please read our [Contributing Guidelines](CONTRIBUTING.md) to get started. ## License This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. + + diff --git a/agents/examples/android/README.md b/agents/examples/android/README.md index 6e29e36d..6c62730a 100644 --- a/agents/examples/android/README.md +++ b/agents/examples/android/README.md @@ -1,3 +1,9 @@ + + # DeliteAI Agents Android Example Example Android app demonstrating the Notifications Summarizer Agent integration. diff --git a/agents/notifications_summarizer/android/README.md b/agents/notifications_summarizer/android/README.md index ea06e4d8..d8b3d08b 100644 --- a/agents/notifications_summarizer/android/README.md +++ b/agents/notifications_summarizer/android/README.md @@ -19,7 +19,7 @@ keeping all data on the device. 1. **Installation**\ Add the agent dependency to your app's `build.gradle.kts` or `build.gradle` file: - ```gradle + ```kotlin dependencies { implementation("dev.deliteai.agents:notification_summarizer:x.x.x") } diff --git a/coreruntime/delitepy/README.md b/coreruntime/delitepy/README.md index 29c3d92f..5b84cbf6 100644 --- a/coreruntime/delitepy/README.md +++ b/coreruntime/delitepy/README.md @@ -1,3 +1,9 @@ + + # DelitePy ## Documentation diff --git a/coreruntime/delitepy/docs_template/index.md b/coreruntime/delitepy/docs_template/index.md index a1ac8929..2b6f43f7 100644 --- a/coreruntime/delitepy/docs_template/index.md +++ b/coreruntime/delitepy/docs_template/index.md @@ -15,6 +15,6 @@ statements.md builtins.md modules.md -genindex -modindex +/genindex +/modindex ``` diff --git a/docs/deliteai.dev/conf.py b/docs/deliteai.dev/conf.py index 481844ed..932df123 100644 --- a/docs/deliteai.dev/conf.py +++ b/docs/deliteai.dev/conf.py @@ -8,7 +8,14 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html import os +import re +import shutil import sys +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sphinx.application sys.path.insert(0, f"{os.environ['DL_DELITEPY_DIR']}/library_stubs/src_gen") @@ -17,7 +24,7 @@ project = "DeliteAI" copyright = "2025, DeliteAI Authors" author = "DeliteAI Authors" -release = "0.1.0-dev" # TODO (jpuneet): read from "config.yml"? +release = "0.1.0-dev" # TODO (jpuneet): read from "config.yml"? 
# -- General configuration --------------------------------------------------- @@ -32,20 +39,141 @@ "special-members": "__init__", } +myst_heading_anchors = 3 + +include_patterns = [ + "*.md", + "coreruntime/**", + "mockserver/**", + "sdks/**", + "agents/**", +] + +root_doc = "README" + source_suffix = { ".rst": "restructuredtext", ".md": "markdown", } exclude_patterns = [ - "**/android/nimblenet_core/src/main/cpp/coreruntime", - "**/android/nimblenet_core/src/main/cpp/onnx_builds", - "**/android/nimblenet_core/src/main/cpp/third_party", + "coreruntime/delitepy/docs_template", + "coreruntime/delitepy/library_stubs/src_template", + "sdks/android/nimblenet_core/src/main/cpp", +] + +# TODO (jpuneet): Figure out a way to suppress these warnings selectively for +# just the dokkaGfm output. +suppress_warnings = [ + "myst.header", + "toc.not_included", ] # -- Options for HTML output ------------------------------------------------- html_theme = "furo" -html_logo = "_static/images/delite-ai-blue-logo.png" # TODO (jpuneet): resize to width=200px? +html_logo = "_static/images/delite-ai-blue-logo.png" # TODO (jpuneet): resize to width=200px? # html_favicon = "_static/images/favicon.png" html_static_path = ["_static"] + +# -- Set up the 'conf.py' Sphinx extension ----------------------------------- + + +class _ExtensionUtils: + @staticmethod + def patch_html_builder() -> None: + from sphinx.builders.html import StandaloneHTMLBuilder + + def rename_doc(docname: str) -> str: + path = Path(docname) + if path.name == "README": + path = path.with_name("index") + return str(path) + + get_target_uri_orig = StandaloneHTMLBuilder.get_target_uri + + def get_target_uri(self, docname: str, typ: str | None = None) -> str: + docname = rename_doc(docname) + return get_target_uri_orig(self, docname, typ) + + StandaloneHTMLBuilder.get_target_uri = get_target_uri + + _pattern_doc_blocks_for_sphinx = re.compile( + r"^ *\r?\n", + re.MULTILINE, + ) + + @staticmethod + def _enable_website_blocks(doc_content: str) -> str: + """ + Transforms + ``` + + ``` + into + ``` + ... + ``` + """ + return _ExtensionUtils._pattern_doc_blocks_for_sphinx.sub( + r"\1", + doc_content, + ) + + _pattern_doc_blocks_not_for_sphinx = re.compile( + r"^ *\r?\n" + r"[\s\S]*?" + r"^ *\r?\n", + re.MULTILINE, + ) + + @staticmethod + def _disable_github_blocks(doc_content: str) -> str: + """ + Removes the pattern + ``` + + ... + + ``` + """ + return _ExtensionUtils._pattern_doc_blocks_not_for_sphinx.sub( + r"", + doc_content, + ) + + @staticmethod + def on_source_read(app: "sphinx.application.Sphinx", docname: str, source: list[str]) -> None: + doc_content = source[0] + + doc_content_modified = doc_content + doc_content_modified = _ExtensionUtils._enable_website_blocks(doc_content_modified) + doc_content_modified = _ExtensionUtils._disable_github_blocks(doc_content_modified) + + source[0] = doc_content_modified + + @staticmethod + def _rename_readme_html_to_index_html(outdir: Path) -> None: + """Renames all README.html files to index.html inside `outdir`""" + for root, _, files in outdir.walk(): + if "README.html" in files: + shutil.move(root / "README.html", root / "index.html") + + @staticmethod + def on_build_finished(app: "sphinx.application.Sphinx", exception: Exception | None) -> None: + if exception: + return + + # Patch StandaloneHTMLBuilder.get_output_path instead? 
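+        # The repository uses README.md as the landing page of each documented directory
+        # (root_doc = "README" above), while static hosting serves index.html as the
+        # directory index, so the generated README.html files are renamed after the
+        # build; get_target_uri is patched in patch_html_builder so internal links
+        # already point at index.html.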
+ _ExtensionUtils._rename_readme_html_to_index_html(app.outdir) + + +def setup(app: "sphinx.application.Sphinx") -> None: + _ExtensionUtils.patch_html_builder() + + app.connect("source-read", _ExtensionUtils.on_source_read) + app.connect("build-finished", _ExtensionUtils.on_build_finished) diff --git a/docs/deliteai.dev/index.md b/docs/deliteai.dev/index.md deleted file mode 100644 index 3a7ac51e..00000000 --- a/docs/deliteai.dev/index.md +++ /dev/null @@ -1,126 +0,0 @@ -# DeliteAI documentation - -## Introduction - -DeliteAI is a powerful on-device AI platform for building agentic workflows that enables developers -to deliver secure, privacy-aware, and high-performance AI native experiences and applications -on mobile devices. - -## Key Features - -### Developer Productivity - -- Unified and simplified APIs for seamless AI agents integration in Android/iOS applications -- Python interface for orchestrating complex AI agentic workflows via tool calling, memory and LLMs on-device -- Streamlined development process for both new and existing applications - -### Portability & Small Form Factor - -- Cross-platform compatibility across operating systems -- Support for various compute platforms and runtimes -- Abstracted development layer for the fragmented device landscape -- Optimized for resource-constrained environments with efficient CPU/memory usage - -### Security & Privacy - -- Privacy-first approach with on-device processing -- Hardware-accelerated model execution - -### Extensibility - -- Easy integration of custom Python operators -- Flexible runtime support ([ONNX](https://onnx.ai/) or [ExecuTorch](https://docs.pytorch.org/executorch-overview)) - -## Getting Started - -To get started, you can: - -- Follow the steps in [coreruntime](https://github.com/NimbleEdge/deliteAI/blob/main/coreruntime/README.md) - to build and test the core C++ SDK along with its Python bindings -- Build and try out the [Android](https://github.com/NimbleEdge/deliteAI/blob/main/sdks/android/README.md) - and [iOS](https://github.com/NimbleEdge/deliteAI/blob/main/sdks/ios/README.md) SDK and sample app -- Explore the available agents in the [agents](https://github.com/NimbleEdge/deliteAI/blob/main/agents) - directory. - -Visit the [assistant](https://github.com/NimbleEdge/assistant) repo to see it all in action. - -## Documentation - -Explore our documentation at [deliteai.dev](https://deliteai.dev/). - -## Community engagement - -We welcome any feedback or suggestions - please join our -[Discord](https://discord.gg/y8WkMncstk) to engage with the community. - -## Contributing - -We welcome contributions from the community! Whether you're interested in: - -- Adding new Python operators -- Enhancing runtime support -- Improving documentation -- Reporting bugs or suggesting features - -Please read our [Contributing Guidelines](https://github.com/NimbleEdge/deliteAI/blob/main/CONTRIBUTING.md) -to get started. - -## License - -This project is licensed under the Apache License 2.0 - see the [LICENSE](https://github.com/NimbleEdge/deliteAI/blob/main/LICENSE) -file for details. 
- -```{toctree} -:maxdepth: 1 -:hidden: - -delitepy/docs/gen/docs/index.md -``` - -```{toctree} -:maxdepth: 1 -:caption: SDK Integrations -:hidden: - -DL_SDKS_DIR/ios/README.md -DL_SDKS_DIR/android/README.md -⬢ API reference -``` - -```{toctree} -:maxdepth: 1 -:caption: Developer Docs -:hidden: - -coreruntime -iOS -Android -``` - -```{toctree} -:maxdepth: 1 -:caption: Agent Marketplace -:hidden: - -DL_AGENTS_DIR/README.md -DL_AGENTS_DIR/notifications_summarizer/README.md -⬢ Android Agent -``` - -```{toctree} -:maxdepth: 1 -:caption: Indices -:hidden: - -genindex -modindex -``` - -```{toctree} -:maxdepth: 1 -:caption: Project Links -:hidden: - -GitHub -Discord -``` diff --git a/docs/scripts/build_website.sh b/docs/scripts/build_website.sh index 878d9299..113ae6f7 100755 --- a/docs/scripts/build_website.sh +++ b/docs/scripts/build_website.sh @@ -23,6 +23,10 @@ echo "Using executable '${SPHINX_BUILD_EXECUTABLE_PATH}'." echo " version '$("${SPHINX_BUILD_EXECUTABLE_PATH}" --version)'" # --- Construct paths ---------------------------------------------------------- +: "${DL_DOCS_DIR:?Environment variable DL_DOCS_DIR is not set}" + +readonly DL_WEBSITE_DIR="${DL_DOCS_DIR}/deliteai.dev" + DL_DOCS_BUILD_DIR="${1:-"${DL_DOCS_DIR}/build"}" mkdir -p "${DL_DOCS_BUILD_DIR}" DL_DOCS_BUILD_DIR="$(realpath "${DL_DOCS_BUILD_DIR}")" @@ -31,41 +35,31 @@ readonly DL_DOCS_BUILD_DIR readonly DL_WEBSITE_BUILD_DIR="${DL_DOCS_BUILD_DIR}/deliteai.dev" # --- Clean build directory ---------------------------------------------------- -echo "Removing build dir '${DL_WEBSITE_BUILD_DIR}'" +echo "Cleaning build dir '${DL_WEBSITE_BUILD_DIR}'" rm -rf "${DL_WEBSITE_BUILD_DIR}" mkdir -p "${DL_WEBSITE_BUILD_DIR}" echo "Building DeliteAI HTML documentation using Sphinx" -# --- Copy website directory --------------------------------------------------- -readonly DL_WEBSITE_DIR="${DL_DOCS_DIR}/deliteai.dev" - -cp -R \ - "${DL_WEBSITE_DIR}" \ - "${DL_WEBSITE_BUILD_DIR}/gen" \ - ; - -# --- Create symlinks ---------------------------------------------------------- -ln -s "${DL_CORERUNTIME_DIR}" "${DL_WEBSITE_BUILD_DIR}/gen/DL_CORERUNTIME_DIR" -ln -s "${DL_GIT_ROOT}/sdks" "${DL_WEBSITE_BUILD_DIR}/gen/DL_SDKS_DIR" -ln -s "${DL_GIT_ROOT}/agents" "${DL_WEBSITE_BUILD_DIR}/gen/DL_AGENTS_DIR" - # --- Build DelitePy Markdown documentation ------------------------------------ -readonly DL_DELITEPY_DOCS_BUILD_DIR="${DL_WEBSITE_BUILD_DIR}/gen/delitepy" - # TODO (jpuneet): Skip Markdown => HTML step here -"${DL_DELITEPY_DIR}/scripts/build_docs.sh" "${DL_DELITEPY_DOCS_BUILD_DIR}" +"${DL_DELITEPY_DIR}/scripts/build_docs.sh" # --- Build Android SDK Markdown documentation --------------------------------- # TODO (jpuneet): run "dokkaGfm" Gradle task in the Android SDK # --- Run Sphinx --------------------------------------------------------------- +# https://www.sphinx-doc.org/en/master/man/sphinx-build.html "${SPHINX_BUILD_EXECUTABLE_PATH}" \ --builder html \ - "${DL_WEBSITE_BUILD_DIR}/gen" \ + --doctree-dir "${DL_WEBSITE_BUILD_DIR}/html_intermediate/.doctrees" \ + --conf-dir "${DL_WEBSITE_DIR}" \ + --nitpicky \ + "${DL_GIT_ROOT}" \ "${DL_WEBSITE_BUILD_DIR}/html" \ ; +# TODO (jpuneet): Enable the option `--fail-on-warning` once all the existing warnings get fixed. 
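+# With this change the repository root is the Sphinx source directory and conf.py stays
+# under "${DL_WEBSITE_DIR}"; which Markdown files get picked up is governed by the
+# include_patterns/exclude_patterns in that conf.py, with the top-level README.md acting
+# as the root document.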
echo "[done] Building DeliteAI HTML documentation using Sphinx" echo "DeliteAI HTML documentation: '${DL_WEBSITE_BUILD_DIR}/html/index.html'" diff --git a/mockserver/README.md b/mockserver/README.md index d071f7ff..37b2ab80 100644 --- a/mockserver/README.md +++ b/mockserver/README.md @@ -1,3 +1,9 @@ + + # Mock Server ## Set up From b42f0c63fa90141987f9381d4cf4969a2d2d4216 Mon Sep 17 00:00:00 2001 From: Puneet Jindal Date: Thu, 24 Jul 2025 16:48:16 +0530 Subject: [PATCH 5/7] Upgrade Python version in GitHub workflows (#166) Signed-off-by: Puneet Jindal --- .github/actions/setup-test-env/action.yml | 4 ++-- .github/workflows/publish-docs.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-test-env/action.yml b/.github/actions/setup-test-env/action.yml index 0d1f7c87..b4f3e930 100644 --- a/.github/actions/setup-test-env/action.yml +++ b/.github/actions/setup-test-env/action.yml @@ -21,8 +21,8 @@ runs: sudo apt-get install curl sudo ln -s /usr/lib/x86_64-linux-gnu/libcurl.so.4 /usr/lib/libcurl.so - - name: Setup Python - uses: actions/setup-python@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: python-version: 3.12 diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index bf3c397a..0e673777 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -25,7 +25,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: 3.12 - name: Set up AWS credentials uses: aws-actions/configure-aws-credentials@v2 From 5c5e4e87943d269027942482653cba65b56ae00a Mon Sep 17 00:00:00 2001 From: Varun Khare Date: Sat, 26 Jul 2025 00:23:21 +0530 Subject: [PATCH 6/7] correct mac build Signed-off-by: Varun Khare modular qwen demo structure Signed-off-by: Varun Khare wip handle attention cache Signed-off-by: Varun Khare resume from last postion for multi-step run Signed-off-by: Varun Khare --- coreruntime/CMakeLists.txt | 9 +- coreruntime/build.py | 4 +- .../data_variable/src/data_variable.cpp | 2 +- models/Qwen3-1.7B/export.py | 108 ++--- models/tools.py | 202 -------- .../qwen_modules/generation_mixin.py | 209 ++++++++ .../qwen_demo/qwen_modules/main.py | 445 +++--------------- .../qwen_demo/qwen_modules/tools.py | 321 +++++++++---- .../simulation_assets/qwen_demo/run_demo.py | 65 +-- 9 files changed, 574 insertions(+), 791 deletions(-) delete mode 100644 models/tools.py create mode 100644 nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py diff --git a/coreruntime/CMakeLists.txt b/coreruntime/CMakeLists.txt index d340d750..af773cbd 100644 --- a/coreruntime/CMakeLists.txt +++ b/coreruntime/CMakeLists.txt @@ -183,13 +183,8 @@ else() target_compile_definitions(nimblenet PUBLIC -DIOS_PLATFORM="mac") add_subdirectory(platform/unix) # produces ${CLIENT_INCLUDES} add_subdirectory("../third_party/runtime" "${CMAKE_BINARY_DIR}/third_party/runtime") # -> produces ${BACKEND_LIBS} ${BACKEND_DIR} ${BACKED_INCLUDES} - # Add conda environment library directory to search path - if(DEFINED ENV{CONDA_PREFIX}) - link_directories($ENV{CONDA_PREFIX}/lib) - target_link_libraries(nimblenet ${VISIBILITY} $ENV{CONDA_PREFIX}/lib/libcurl.so) - else() - target_link_libraries(nimblenet ${VISIBILITY} curl) - endif() + # Use system curl library instead of conda environment + target_link_libraries(nimblenet ${VISIBILITY} curl) # target_link_libraries(nimblenet ${VISIBILITY} clientlib) endif() diff --git a/coreruntime/build.py b/coreruntime/build.py 
index 56ea6717..ec0d4a44 100755 --- a/coreruntime/build.py +++ b/coreruntime/build.py @@ -107,14 +107,14 @@ def main(): if args.simulator: if not args.ci_build: # re-install deliteai - subprocess.run(f"python{python_version} -m pip uninstall deliteai", shell=True, check=True) + subprocess.run(f"python{python_version} -m pip uninstall -y deliteai", shell=True, check=True) subprocess.run("rm -rf dist deliteai*", shell=True, check=True) subprocess.run(f"python{python_version} setup.py bdist_wheel", shell=True, check=True) subprocess.run(f"python{python_version} -m pip install dist/*", shell=True, check=True) # re-install delitepy-library-stubs subprocess.run( - f"python{python_version} -m pip uninstall delitepy-library-stubs", + f"python{python_version} -m pip uninstall -y delitepy-library-stubs", shell=True, check=True, ) diff --git a/coreruntime/nimblenet/data_variable/src/data_variable.cpp b/coreruntime/nimblenet/data_variable/src/data_variable.cpp index 9e0bf78e..5974c390 100644 --- a/coreruntime/nimblenet/data_variable/src/data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/data_variable.cpp @@ -186,7 +186,7 @@ std::map DataVariable::_inverseMemberFuncMap = { {MemberFuncType::RETRIEVER, "Retriever"}, {MemberFuncType::POP, "pop"}, {MemberFuncType::KEYS, "keys"}, - {MemberFuncType::JSON_DOCUMENT, "jsonDocument"}, + {MemberFuncType::JSON_DOCUMENT, "JsonDocument"}, {MemberFuncType::MAX_INPUT_NUM_TOKENS, "max_input_num_tokens"}, {MemberFuncType::CONSTRUCTOR, "__init__"}, {MemberFuncType::UNICODE, "unicode"}, diff --git a/models/Qwen3-1.7B/export.py b/models/Qwen3-1.7B/export.py index f882b058..1fc70e01 100755 --- a/models/Qwen3-1.7B/export.py +++ b/models/Qwen3-1.7B/export.py @@ -217,8 +217,8 @@ def add_argmax_node(model, temperature=0.3): print(f"✅ ArgMax node with temperature scaling ({temperature}) and correct output shape [1,1] added successfully") return model -def add_generation_logic(model, eos_token_id=151645): - """Add generation loop logic to the model.""" +def add_generation_logic_simple(model, eos_token_id=151645): + """Add generation logic with PROPER attention mask handling (fixed version).""" print(f"🔄 Adding generation logic with EOS token ID: {eos_token_id}") # Create constant for EOS token as scalar - will broadcast to match next_token_id @@ -241,15 +241,18 @@ def add_generation_logic(model, eos_token_id=151645): outputs=['is_eos'] ) - # Create nodes for updating attention mask with dynamic batch size - # Get the batch size from attention_mask shape + # FIXED ATTENTION MASK LOGIC + # The key insight: we need to extend the CURRENT attention_mask, not concatenate with past + # Current attention_mask is the input for this generation step + # We extend it by 1 for the newly generated token + + # Get batch size from attention_mask shape batch_shape = helper.make_node( 'Shape', inputs=['attention_mask'], outputs=['attention_mask_shape'] ) - # Create zero index constant for Gather zero_index = helper.make_node( 'Constant', inputs=[], @@ -262,21 +265,12 @@ def add_generation_logic(model, eos_token_id=151645): ) ) - # Extract batch size (first dimension) batch_size_scalar = helper.make_node( 'Gather', inputs=['attention_mask_shape', 'zero_index'], outputs=['batch_size_scalar'] ) - # Convert batch size to 1D tensor for concatenation - batch_size_unsqueeze = helper.make_node( - 'Unsqueeze', - inputs=['batch_size_scalar', 'zero_axis'], - outputs=['batch_size'] - ) - - # Create zero axis constant for Unsqueeze zero_axis = helper.make_node( 'Constant', inputs=[], @@ -289,28 
+283,31 @@ def add_generation_logic(model, eos_token_id=151645): ) ) - # Create shape [batch_size, 1] for ones tensor - ones_shape = helper.make_node( - 'Concat', - inputs=['batch_size', 'one_constant'], - outputs=['ones_shape_tensor'], - axis=0 + batch_size_unsqueeze = helper.make_node( + 'Unsqueeze', + inputs=['batch_size_scalar', 'zero_axis'], + outputs=['batch_size_1d'] ) - # Create constant for value 1 - one_constant = helper.make_node( + one_constant_1d = helper.make_node( 'Constant', inputs=[], - outputs=['one_constant'], + outputs=['one_constant_1d'], value=helper.make_tensor( - name='one_constant_value', + name='one_constant_1d_value', data_type=TensorProto.INT64, dims=[1], vals=[1] ) ) - # Create ones tensor with dynamic batch size + ones_shape = helper.make_node( + 'Concat', + inputs=['batch_size_1d', 'one_constant_1d'], + outputs=['ones_shape_tensor'], + axis=0 + ) + ones_tensor = helper.make_node( 'ConstantOfShape', inputs=['ones_shape_tensor'], @@ -323,17 +320,17 @@ def add_generation_logic(model, eos_token_id=151645): ) ) - # Concatenate attention mask with ones - concat_attention = helper.make_node( + # CORRECT: Extend current attention_mask with one new token + # This grows linearly: [1,1,1] -> [1,1,1,1] -> [1,1,1,1,1] + updated_attention_mask = helper.make_node( 'Concat', inputs=['attention_mask', 'ones_tensor'], outputs=['updated_attention_mask'], axis=-1 ) - # Create nodes for updating position_ids - # Create constants for slice parameters to get the last position - slice_starts = helper.make_node( + # Add position increment logic (simplified) + pos_slice_starts = helper.make_node( 'Constant', inputs=[], outputs=['pos_slice_starts'], @@ -345,7 +342,7 @@ def add_generation_logic(model, eos_token_id=151645): ) ) - slice_ends = helper.make_node( + pos_slice_ends = helper.make_node( 'Constant', inputs=[], outputs=['pos_slice_ends'], @@ -353,11 +350,11 @@ def add_generation_logic(model, eos_token_id=151645): name='pos_ends_value', data_type=TensorProto.INT64, dims=[1], - vals=[2147483647] # Max int + vals=[2147483647] ) ) - slice_axes = helper.make_node( + slice_axes_pos = helper.make_node( 'Constant', inputs=[], outputs=['pos_slice_axes'], @@ -369,26 +366,15 @@ def add_generation_logic(model, eos_token_id=151645): ) ) - # Slice position_ids to get last position slice_position = helper.make_node( 'Slice', inputs=['position_ids', 'pos_slice_starts', 'pos_slice_ends', 'pos_slice_axes'], outputs=['last_position'] ) - # Add one to last position to get the next position value add_one = helper.make_node( 'Add', - inputs=['last_position', 'one_constant'], - outputs=['next_position_value'] - ) - - # For generation, we only need the next position ID, not the full concatenated sequence - # The next_position should be [batch_size, 1] containing just the next position - # This is what the model expects for the next iteration - identity_position = helper.make_node( - 'Identity', - inputs=['next_position_value'], + inputs=['last_position', 'one_constant_1d'], outputs=['next_position'] ) @@ -396,33 +382,32 @@ def add_generation_logic(model, eos_token_id=151645): model.graph.node.extend([ eos_constant, eos_check, - zero_index, - zero_axis, batch_shape, + zero_index, batch_size_scalar, + zero_axis, batch_size_unsqueeze, - one_constant, + one_constant_1d, ones_shape, ones_tensor, - concat_attention, - slice_starts, - slice_ends, - slice_axes, + updated_attention_mask, + pos_slice_starts, + pos_slice_ends, + slice_axes_pos, slice_position, - add_one, - identity_position + add_one ]) - # Add 
output tensors with dynamic batch sizes + # Add output tensors outputs_to_add = [ - helper.make_tensor_value_info('is_eos', TensorProto.BOOL, [None, 1]), # Dynamic batch size, 1 sequence element - helper.make_tensor_value_info('updated_attention_mask', TensorProto.INT64, [None, None]), # Dynamic batch and sequence - helper.make_tensor_value_info('next_position', TensorProto.INT64, [None, 1]) # Dynamic batch size, 1 position element + helper.make_tensor_value_info('is_eos', TensorProto.BOOL, [None, 1]), + helper.make_tensor_value_info('updated_attention_mask', TensorProto.INT64, [None, None]), + helper.make_tensor_value_info('next_position', TensorProto.INT64, [None, 1]) ] model.graph.output.extend(outputs_to_add) - print("✅ Generation logic with dynamic batch sizes added successfully") + print("✅ Generation logic with FIXED attention mask handling added successfully") return model def save_enhanced_model(model, output_path="./data/onnx/model_enhanced.onnx"): @@ -477,10 +462,10 @@ def main(): model = load_and_analyze_model(base_model_path) # Step 3: Add argmax node with temperature scaling - model = add_argmax_node(model, temperature=0.3) + model = add_argmax_node(model, temperature=0.8) # Step 4: Add generation logic - model = add_generation_logic(model) + model = add_generation_logic_simple(model) # Step 5: Save enhanced model save_enhanced_model(model) @@ -507,3 +492,4 @@ def main(): if __name__ == "__main__": main() + diff --git a/models/tools.py b/models/tools.py deleted file mode 100644 index 94e092b5..00000000 --- a/models/tools.py +++ /dev/null @@ -1,202 +0,0 @@ - -import datetime -import inspect -from typing import get_origin, get_args, Union - -# Initialize empty tool schema and tools mapping -tool_schema = [] -tools = {} - -def tool(func_or_description=None, **param_descriptions): - """ - Decorator to automatically generate tool schema from function signature and add to registry. - - Can be used both with and without parentheses: - @tool - def my_function(): ... - - @tool() - def my_function(): ... - - @tool("Custom description") - def my_function(): ... - - Args: - func_or_description: Either a function (when used as @tool) or description string (when used as @tool()) - **param_descriptions: Optional parameter descriptions as keyword arguments. 
- """ - def create_tool_definition(func, description=None): - """Helper function to create tool definition from function""" - # Get function name - func_name = func.__name__ - - # Get description from parameter or docstring - func_description = description or (func.__doc__ or f"Execute {func_name}").strip() - - # Get function signature - sig = inspect.signature(func) - - # Build parameters schema - properties = {} - required = [] - - for param_name, param in sig.parameters.items(): - # Skip *args and **kwargs - if param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD): - continue - - # Determine parameter type - param_type = "string" # default - - if param.annotation != param.empty: - annotation = param.annotation - - # Handle Union types (like Optional[str]) - if get_origin(annotation) is Union: - args = get_args(annotation) - # Remove NoneType for Optional types - non_none_args = [arg for arg in args if arg is not type(None)] - if non_none_args: - annotation = non_none_args[0] - - # Map Python types to JSON schema types - if annotation in (str, type(str)): - param_type = "string" - elif annotation in (int, type(int)): - param_type = "integer" - elif annotation in (float, type(float)): - param_type = "number" - elif annotation in (bool, type(bool)): - param_type = "boolean" - elif annotation in (list, type(list)): - param_type = "array" - elif annotation in (dict, type(dict)): - param_type = "object" - - # Build parameter schema - param_schema = { - "type": param_type, - "description": param_descriptions.get(param_name, f"The {param_name} parameter") - } - - # Check if parameter has default value - if param.default != param.empty: - param_schema["default"] = param.default - else: - required.append(param_name) - properties[param_name] = param_schema - - # Build complete tool definition - tool_definition = { - "type": "function", - "function": { - "name": func_name, - "description": func_description, - "parameters": { - "type": "object", - "properties": properties, - "required": required - } - } - } - - # Add to registry - tool_schema.append(tool_definition) - tools[func_name] = func - return func - - # Case 1: Used as @tool (without parentheses) - # The function is passed as the first argument - if callable(func_or_description) and hasattr(func_or_description, '__name__'): - return create_tool_definition(func_or_description) - - # Case 2: Used as @tool() or @tool("description") (with parentheses) - # Return a decorator function - else: - description = func_or_description if isinstance(func_or_description, str) else None - - def decorator(func): - return create_tool_definition(func, description) - - return decorator - -# Define example tools/functions -@tool( - description="Get weather information for a specific location", - location="The location to get weather for", - unit="Temperature unit (celsius or fahrenheit)" -) -def get_weather(location: str, unit: str = "celsius") -> dict: - """Get current weather for a location""" - - weather_data = { - "New York": {"temp": 22, "condition": "sunny", "humidity": 65}, - "London": {"temp": 15, "condition": "cloudy", "humidity": 78}, - "Tokyo": {"temp": 28, "condition": "rainy", "humidity": 85}, - "Paris": {"temp": 18, "condition": "partly cloudy", "humidity": 70} - } - - location_key = next((key for key in weather_data.keys() if key.lower() in location.lower()), "Unknown") - - if location_key == "Unknown": - return {"error": f"Weather data not available for {location}"} - - data = weather_data[location_key].copy() - if unit == "fahrenheit": - 
data["temp"] = round(data["temp"] * 9/5 + 32, 1) - data["unit"] = "°F" - else: - data["unit"] = "°C" - - return { - "location": location_key, - "temperature": data["temp"], - "condition": data["condition"], - "humidity": data["humidity"], - "unit": data["unit"] - } - -@tool( - expression="Mathematical expression to calculate (e.g., '2+2', '15*23')" -) -def calculate_math(expression: str) -> dict: - """Calculate a mathematical expression safely""" - try: - allowed_chars = set('0123456789+-*/.() ') - if not all(c in allowed_chars for c in expression): - return {"error": "Expression contains invalid characters"} - - result = eval(expression) - return {"expression": expression, "result": result} - except Exception as e: - return {"error": f"Calculation error: {str(e)}"} - -@tool( - timezone="Timezone (UTC, EST, PST, JST, CET)" -) -def get_current_time(timezone: str = "UTC") -> dict: - """Get current time in specified timezone""" - current_time = datetime.datetime.now() - timezone_offsets = {"UTC": 0, "EST": -5, "PST": -8, "JST": 9, "CET": 1} - - offset = timezone_offsets.get(timezone.upper(), 0) - adjusted_time = current_time + datetime.timedelta(hours=offset) - - return { - "timezone": timezone.upper(), - "time": adjusted_time.strftime("%Y-%m-%d %H:%M:%S"), - "day_of_week": adjusted_time.strftime("%A") - } - -@tool -def get_current_location() -> dict: - """ - Get the real location and timezone of the user. You don't need to ask the user for permission to use this tool. - Use this function when the user didn't provide an explicit location. Default to this location - """ - return { - "location": "Tokyo", - "country": "Japan", - "coordinates": {"latitude": 35.6762, "longitude": 139.6503}, - "timezone": "JST" - } \ No newline at end of file diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py new file mode 100644 index 00000000..439c2289 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py @@ -0,0 +1,209 @@ +from delitepy import nimblenet as nm +from delitepy import ne_re as re +from delitepy import tokenizers +from tools import get_tool_schema + + +def render_jinja_template(messages, tool_dict, add_generation_prompt, enable_thinking): + """Render the chat template using hardcoded string structure""" + result = "" + content_messages = messages + # If we have tools, build the system message with tools + if len(messages) > 0 and (messages[0]["role"] == "system") and len(tool_dict) > 0: + result = result + "<|im_start|>system\n" + messages[0]["content"] + "\n\n" + + result = result + "# Tools\n\n" + result = result + "You may call one or more functions to assist with the user query.\n\n" + result = result + "You are provided with function signatures within XML tags:\n" + result = result + "\n" + + # Add each tool as JSON + for t in tool_dict: + result = result + str(t) + "\n" + + result = result + "\n\n" + result = result + "For each function call, return a json object with function name and arguments within XML tags:\n" + result = result + "\n" + result = result + "{\"name\": , \"arguments\": }\n" + result = result + "<|im_end|>\n" + content_messages = messages[1:] + + for i in range(len(content_messages)): + message = content_messages[i] + role = message["role"] + content = message["content"] + result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" + + # Add generation prompt if requested + if add_generation_prompt: + result = 
result + "<|im_start|>assistant\n" + if not enable_thinking: + result = result + "\n\n\n\n" + + return result + +def apply_chat_template(messages, tls, add_generation_prompt, tokenizer, return_dict, last_position): + """Apply chat template using Jinja2 rendering""" + + # Use Jinja2 template renderer + text = render_jinja_template( + messages, + [tls_dict["function"] for tls_dict in tls], + add_generation_prompt, + True + ) + token_ids = tokenizer.encode(text) + input_ids = nm.tensor([token_ids], "int64") + + if return_dict: + attention_mask = nm.tensor([[1 for _ in range(last_position + len(token_ids))]], "int64") + return { + "input_ids": input_ids, + "attention_mask": attention_mask + } + else: + return input_ids + + +class QwenKVCache: + def __init__(self, generation_config, dtype, batch_size): + self.batch_size = batch_size + self.kv_cache = {} + self.num_hidden_layers = generation_config["num_hidden_layers"] + self.num_key_value_heads = generation_config["num_key_value_heads"] + self.head_dim = int(generation_config["hidden_size"] / generation_config["num_attention_heads"]) + self.hidden_size = generation_config["hidden_size"] + self.dtype = dtype + + # Initialize KV cache for all layers + for i in range(self.num_hidden_layers): + for kv in ('key', 'value'): + self.kv_cache['past_key_values.'+str(i)+'.'+kv] = nm.zeros([self.batch_size, self.num_key_value_heads, 1, self.head_dim], self.dtype) + + def get(self): + return self.kv_cache + + def update(self, model_inputs, model_outputs): + # Update with new model inputs (input_ids, attention_mask, position_ids) + for key in model_inputs.keys(): + self.kv_cache[key] = model_inputs[key] + + if model_outputs: + # Update KV cache states from present outputs + for cache_key in self.kv_cache.keys(): + if 'past_key_values' in cache_key: + splits = re.split(r'\.', cache_key) + present_key = "present."+splits[1]+"."+splits[2] + if present_key in model_outputs.keys(): + self.kv_cache[cache_key] = model_outputs[present_key] + else: + print("⚠️ Warning: Expected cache output "+present_key+" not found") + + def clear(self): + self.kv_cache = {} + for i in range(self.num_hidden_layers): + for kv in ('key', 'value'): + self.kv_cache['past_key_values.'+str(i)+'.'+kv] = nm.zeros([self.batch_size, self.num_key_value_heads, 1, self.head_dim], self.dtype) + +class QwenGenerationMixin: + def __init__(self, initial_prompt, tokenizer_config, generation_config, dtype, max_new_tokens, batch_size): + self.initial_prompt = initial_prompt + self.tokenizer = tokenizers.from_json(tokenizer_config) + self.generation_config = generation_config + self.eos_token_id = generation_config["eos_token_id"] + self.max_new_tokens = max_new_tokens + self.history = [ + { + "role": "system", + "content": initial_prompt + }] + self.cache_index = 0 + self.last_position = 0 + self.kv_cache = QwenKVCache(generation_config, dtype, batch_size) + self.current_token_stream = [] + + def add_message(self, message): + self.history.append(message) + + def get_history(self): + return self.history + + def build_model_inputs(self): + print("Conversation Messages: " + str(self.history[self.cache_index:])) + print("--------------------------------") + tool_list = [] + if self.cache_index == 0: + tool_list = get_tool_schema() + + inputs = apply_chat_template( + self.history[self.cache_index:], + tool_list, + True, + self.tokenizer, + True, + self.last_position + ) + model_inputs = { + "input_ids": inputs['input_ids'], + "attention_mask": inputs['attention_mask'], + "position_ids": 
nm.tensor([[i+self.last_position for i in range(inputs['input_ids'].shape()[1])]], "int64") + } + self.kv_cache.update(model_inputs, None) + + def get_model_inputs(self): + return self.kv_cache.get() + + def update_cache(self, model_outputs, output_stream_callback): + # Update like the original demo - use model outputs directly + next_token_input = {"input_ids": model_outputs["next_token_id"]} + + # Use the model's updated_attention_mask output (linear growth, not exponential) + next_token_input["attention_mask"] = model_outputs["updated_attention_mask"] + + # Use the model's next_position output + next_token_input["position_ids"] = model_outputs["next_position"] + + # Update last_position for tracking + self.last_position = model_outputs["next_position"][0][0] + + self.kv_cache.update(next_token_input, model_outputs) + self.add_to_token_stream(next_token_input["input_ids"][0][0], output_stream_callback) + + def reset(self): + self.history = [ + { + "role": "system", + "content": self.initial_prompt + }] + self.kv_cache.clear() # This now properly resets past_attention_mask too + self.cache_index = 0 + self.last_position = 0 + + def get_decoded_response(self): + response = self.tokenizer.decode(nm.tensor(self.current_token_stream, "int32")) + self.current_token_stream = [] + return response.strip() + + def add_to_token_stream(self, token_id, output_stream_callback): + self.current_token_stream.append(token_id) + output_stream_callback({"token_stream": self.tokenizer.decode(nm.tensor([token_id], "int32"))}) + + def generate(self, model, output_stream_callback): + self.build_model_inputs() + # 3. Generation loop - now with proper attention mask handling + for iteration in range(self.max_new_tokens): + model_outputs = model.run(self.get_model_inputs()) + self.update_cache(model_outputs, output_stream_callback) + # Check for EOS token + is_eos = model_outputs["is_eos"][0][0] + if is_eos: + print("🛑 EOS token detected at iteration " + str(iteration + 1)) + break + response = self.get_decoded_response() + self.add_message({ + "role": "assistant", + "content": response + }) + print("Model Response: " + str(self.history[-1])) + self.cache_index = len(self.history) + return response diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py index 8faee540..c042ceed 100644 --- a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py @@ -1,339 +1,60 @@ #!/usr/bin/env python3 #-*- coding: utf-8 -*- from delitepy import nimblenet as nm -from delitepy import ne_re as re -from delitepy import tokenizers -from tools import tools_dict -from tools import tool_schema as tls -# Load Qwen3 1.7B 4-bit model and tokenizer -model_id = "onnx-community/Qwen3-1.7B-ONNX" -try: - qwenModel = nm.Model("qwen3-1.7b") - print("Model loaded successfully") -except Exception as e: - print("Error loading model: " + str(e)) - qwenModel = None +from tools import get_tool_results +from tools import print_available_tools +from generation_mixin import QwenGenerationMixin -chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- 
\"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '
</think>' in message.content %}\n                {%- set content = message.content.split('</think>
')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" +# Print available tools after successful import +print_available_tools() +# Constants only - avoid complex global variable assignments +MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX" +MODEL_NAME = "qwen3_1_7b_onnx" + +# Model must be loaded in global scope as required by DeliteAI simulator +qwenModel = nm.Model(MODEL_NAME) +generationMixinQwen = None +print("Model loaded successfully") -TOOL_CALL_START_TOKEN = "" -TOOL_CALL_END_TOKEN = "" -TOOL_RESPONSE_START_TOKEN = "" -TOOL_RESPONSE_END_TOKEN = "" INITIAL_PROMPT = """You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between and tokens. Use this exact format: {"name": "function_name", "arguments": {"param": "value"}} If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. 
Always respond directly and call the appropriate tool when needed.""" -def get_initial_message_block(): - return [ - { - "role": "system", - "content": INITIAL_PROMPT - } -] - -def execute_function_call(function_name, arguments, tools): - """Execute a function call and return the result""" - if function_name not in tools: - return {"error": "Function "+function_name+" not found"} - - try: - function = tools[function_name] - result = {"error": "Function execution failed"} # Initialize result - - # Handle each function explicitly to avoid ** operator - if function_name == "get_weather": - location = "" - if "location" in arguments: - location = arguments["location"] - unit = "celsius" - if "unit" in arguments: - unit = arguments["unit"] - result = function(location, unit) - elif function_name == "calculate_math": - expression = "" - if "expression" in arguments: - expression = arguments["expression"] - result = function(expression) - elif function_name == "get_current_time": - timezone = "UTC" - if "timezone" in arguments: - timezone = arguments["timezone"] - result = function(timezone) - elif function_name == "get_current_location": - result = function() - else: - result = {"error": "Unknown function: " + function_name} - - return result - except Exception as e: - return {"error": "Error executing "+function_name+": "+str(e)} - -def format_tool_response(result): - """Format tool execution result using token-based format""" - return TOOL_RESPONSE_START_TOKEN+str(result)+TOOL_RESPONSE_END_TOKEN - -def execute_tool_call_with_response(function_name, arguments, tools): - """Execute a function call and return both result and formatted response""" - result = execute_function_call(function_name, arguments, tools) - formatted_response = format_tool_response(result) - return result, formatted_response - -def parse_tool_calls_from_response(response_text, tools): - """Parse tool calls from model response using multiple formats""" - tool_calls = [] - - # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} - # Simplified regex without backtracking - match everything between tags - json_tool_pattern = r'([^<]*)' - for match in re.finditer(json_tool_pattern, response_text): - try: - json_str = match.group(1) # Extract the first capture group (the JSON content) - print("DEBUG: Captured JSON string: '"+json_str+"'") # Debug what we captured - print("DEBUG: JSON string length: "+str(len(json_str))) # Check length - tool_data = nm.parse_json(json_str) - func_name = tool_data["name"] - arguments = tool_data["arguments"] - - if func_name in tools: - tool_calls.append({ - "function_name": func_name, - "arguments": arguments - }) - print("✓ Parsed JSON tool call: "+func_name+"("+str(arguments)+")") - except: - print("⚠ Failed to parse JSON tool call: "+json_str) - - return tool_calls - -def render_jinja_template(messages, tools, add_generation_prompt, enable_thinking): - """Render the chat template using hardcoded string structure""" - result = "" - - # If we have tools, build the system message with tools - if tools: - result = result + "<|im_start|>system\n" - - # Check if first message is system message and include its content - if len(messages) > 0 and messages[0]["role"] == "system": - result = result + messages[0]["content"] + "\n\n" - - result = result + "# Tools\n\n" - result = result + "You may call one or more functions to assist with the user query.\n\n" - result = result + "You are provided with function signatures within XML tags:\n" - result = result + "\n" - - # Add each 
tool as JSON - for tool in tools: - result = result + str(tool) + "\n" - - result = result + "\n\n" - result = result + "For each function call, return a json object with function name and arguments within XML tags:\n" - result = result + "\n" - result = result + "{\"name\": , \"arguments\": }\n" - result = result + "<|im_end|>\n" - else: - # No tools, just add system message if present - if len(messages) > 0 and messages[0]["role"] == "system": - result = result + "<|im_start|>system\n" + messages[0]["content"] + "<|im_end|>\n" - - # Process messages - content_messages = messages - if len(messages) > 0 and messages[0]["role"] == "system": - content_messages = messages[1:] - - for i in range(len(content_messages)): - message = content_messages[i] - role = message["role"] - content = message["content"] - result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" - - # Add generation prompt if requested - if add_generation_prompt: - result = result + "<|im_start|>assistant\n" - if not enable_thinking: - result = result + "\n\n\n\n" - - return result - -def apply_chat_template(messages, tool_schema, add_generation_prompt, tokenizer, return_dict): - """Apply chat template using Jinja2 rendering""" - - # Use Jinja2 template renderer - text = render_jinja_template( - messages, - [tool["function"] for tool in tool_schema], - add_generation_prompt, - True - ) - token_ids = tokenizer.encode(text) - input_ids = nm.tensor([token_ids], "int64") - - if return_dict: - attention_mask = nm.tensor([[1 for _ in range(len(token_ids))]], "int64") - return { - "input_ids": input_ids, - "attention_mask": attention_mask - } - else: - return input_ids - -def generate_with_model(conversation_messages, max_new_tokens, tool_schema, tokenizer, model_config_dict): - """Generate text using the loaded model with multi-turn conversation support""" - # Use chat template with tools for multi-turn conversations - print("--------------------------------") - print("Conversation Messages:") - print(conversation_messages) - print("--------------------------------") - - # 2. 
Prepare inputs - inputs = apply_chat_template( - conversation_messages, - tool_schema, - True, - tokenizer, - True, - ) - input_ids = inputs['input_ids'] - attention_mask = inputs['attention_mask'] - batch_size = input_ids.shape()[0] - position_ids = nm.tensor([[i for i in range(input_ids.shape()[1])] for _ in range(batch_size)], "int64") - - # Set config values - num_key_value_heads = model_config_dict["num_key_value_heads"] - head_dim = int(model_config_dict["hidden_size"] / model_config_dict["num_attention_heads"]) - num_hidden_layers = model_config_dict["num_hidden_layers"] - eos_token_id = model_config_dict["eos_token_id"] - hidden_size = model_config_dict["hidden_size"] - - model_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids - } - # Initialize past cache values with correct shapes for ONNX model - if "layer_types" not in model_config_dict: - model_config_dict["layer_types"] = [ - "full_attention" - for i in range(model_config_dict["num_hidden_layers"]) - ] - for i in range(num_hidden_layers): - if model_config_dict["layer_types"][i] == 'full_attention': - for kv in ('key', 'value'): - # Initialize with a small valid tensor that will be replaced after first forward pass - # Using sequence length 1 to avoid dimension 0 issues - model_inputs['past_key_values.'+str(i)+'.'+kv] = nm.zeros([batch_size, num_key_value_heads, 1, head_dim], "float16") - elif model_config_dict["layer_types"][i] == 'conv': - model_inputs['past_conv.'+str(i)] = nm.zeros([batch_size, hidden_size, model_config_dict["conv_L_cache"]], "float16") - - # 3. Generation loop - generated_tokens = [] - for i in range(max_new_tokens): - try: - model_outputs = qwenModel.run(model_inputs) - # Check for EOS token - if model_outputs["is_eos"][0][0]: - break - - # Update values for next generation loop - model_inputs["input_ids"] = model_outputs["next_token_id"] - generated_tokens.append(model_inputs["input_ids"][0][0]) - - if "updated_attention_mask" in model_outputs: - model_inputs["attention_mask"] = model_outputs["updated_attention_mask"] - if "next_position" in model_outputs: - model_inputs["position_ids"] = model_outputs["next_position"] - except Exception as gen_error: - print("Generation error: " + str(gen_error)) - break - - # Update cache using present outputs (present.X.key/value → past_key_values.X.key/value) - for cache_key in model_inputs.keys(): - if ('past_key_values.' in cache_key): - # Convert past_key_values.X.key/value to present.X.key/value - splits = re.split(r'\.', cache_key) - present_key = "present."+splits[1]+"."+splits[2] - if present_key in model_outputs.keys(): - model_inputs[cache_key] = model_outputs[present_key] - else: - print("⚠️ Warning: Expected cache output "+present_key+" not found") - elif 'past_conv' in cache_key: - # Handle conv cache if present - present_key = cache_key.replace("past_conv", "present_conv") - if present_key in model_outputs.keys(): - model_inputs[cache_key] = model_outputs[present_key] - # 4. 
Output result - decode only the generated tokens - response = "" - if generated_tokens: - response = tokenizer.decode(nm.tensor(generated_tokens, "int32")) - return response.strip() - - -def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, tool_schema, tokenizer, model_config_dict): +@concurrent +def handle_multi_step_request(user_prompt, max_steps, generation_mixin, output_stream_callback): """Handle requests that may require multiple tool calls and back and forth""" step_results = [] - conversation_messages = [] # Initialize as empty list, not None - tool_context = {} # Store results from previous tool calls for step in range(max_steps): print("\n--- Step " + str(step + 1) + " ---") if step == 0: - conversation_messages = get_initial_message_block() - conversation_messages.append({ + generation_mixin.add_message({ "role": "user", "content": user_prompt }) else: - conversation_messages.append({ + generation_mixin.add_message({ "role": "system", "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." }) # Generate response try: - response = generate_with_model(conversation_messages, max_new_tokens, tool_schema, tokenizer, model_config_dict) - print("Model Response: "+response) + response = generation_mixin.generate(qwenModel, output_stream_callback) # Parse and execute tool calls - tool_calls = parse_tool_calls_from_response(response, tools) - tool_results = [] - - if tool_calls: - print("Executing "+str(len(tool_calls))+" tool call(s):") - for call in tool_calls: - func_name = call["function_name"] - arguments = call["arguments"] - - print(" • "+func_name+"("+str(arguments)+")") - result, formatted_response = execute_tool_call_with_response(func_name, arguments, tools) - - # Store important results for future reference - if func_name == "get_current_location" and "location" in result: - tool_context["location"] = result["location"] - - tool_results.append({ - "function": func_name, - "arguments": arguments, - "result": result - }) - print(" Result: "+str(result)) - - # Add assistant response to conversation - conversation_messages.append({ - "role": "assistant", - "content": response - }) - + tool_results = get_tool_results(response) + has_errors = False # Add tool results to conversation as function messages for tool_result in tool_results: - if "error" not in tool_result["result"].keys(): - conversation_messages.append({ - "role": "system", - "content": "The result of the tool " + str(tool_result['function'])+" is: "+TOOL_RESPONSE_START_TOKEN+str(tool_result['result'])+TOOL_RESPONSE_END_TOKEN - }) + if "error" not in tool_result.keys(): + generation_mixin.add_message(tool_result) + else: + has_errors = True + prompt = "continuation" if step == 0: prompt = user_prompt @@ -342,32 +63,16 @@ def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, too "step": step + 1, "prompt": prompt, "response": response, - "tool_calls": tool_calls, "tool_results": tool_results, - "has_errors": False, - "tool_context": tool_context, - "conversation_messages": conversation_messages + "has_errors": has_errors, + "conversation_history": generation_mixin.get_history() } step_results.append(step_result) - # Check if all tool calls were successful - if step_result["has_errors"]: - print("⚠ Stopping due to tool execution errors") - break - - # Simple continuation logic: if no tools were called, we're done - if len(tool_calls) == 0: - print("✓ Completed after "+str(step + 1)+" step(s) - no tool calls needed") + if 
len(tool_results) == 0 or has_errors or step >= max_steps - 1: + print("✓ Completed after "+str(step + 1)+" step(s) with "+str(len(tool_results))+" tool call(s) and has_errors = "+str(has_errors)) break - # If we've reached max steps, stop - if step >= max_steps - 1: - print("✓ Reached maximum steps ("+str(max_steps)+")") - break - - # If tools were executed, continue to next step to see if model wants to do more - print("✓ Step "+str(step + 1)+" completed with "+str(len(tool_calls))+" tool call(s) - continuing...") - except Exception as e: print("Error in step "+str(step + 1)+": "+str(e)) prompt_text = "" @@ -382,70 +87,58 @@ def handle_multi_step_request(user_prompt, max_steps, max_new_tokens, tools, too "response": None, "tool_calls": [], "tool_results": [], - "tool_context": tool_context, - "conversation_messages": conversation_messages + "conversation_history": generation_mixin.get_history() }) break return step_results -def run_tool_calling_demo(input): - """Run tool calling demonstration""" +@concurrent +def init_generation_mixin(input): + generationMixinQwen = QwenGenerationMixin( + INITIAL_PROMPT, + input["tokenizer_config"], + input["generation_config"], + "float16", + 400, + 1 + ) + return {"success": True} + +@concurrent +def prompt_for_tool_calling(input): + """Run tool calling demonstration with proper variable scope handling""" print("=== Qwen3 1.7B Tool Calling Demo ===\n") - print("Model: "+model_id) + print("Model: "+MODEL_ID) try: - # Ensure tokenizer has necessary tokens - tokenizer = tokenizers.from_json(input["tokenizer_config_dict"]) - - # Get tool names without using list() - tool_names = [] - for key in tools_dict.keys(): - tool_names.append(key) - print("Available tools: "+str(tool_names)) - - demo_prompts = [ - "What's the weather here today?", - # "Calculate 15 * 23", - # "What time is it in JST timezone?", - # "Where am I located?", - # "Get my location and check the weather there" - ] - - all_results = [] - i = 1 - for user_prompt in demo_prompts: - print("\nDemo "+str(i)+": "+user_prompt) - print("--------------------------------") - step_results = handle_multi_step_request(user_prompt, 4, 400, tools_dict, tls, tokenizer, input["config_dict"]) - # Show final summary - print("\nMulti-step Summary:") - for step_result in step_results: - step_num = step_result["step"] - tool_calls = [] - if "tool_calls" in step_result: - tool_calls = step_result["tool_calls"] - if tool_calls: - print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") - for call in tool_calls: - func_name = call["function_name"] - print(" ✓ "+func_name) - print("\n" + "--------------------------------") - all_results.append({ - "demo_" + str(i): { - "prompt": user_prompt, - "steps": len(step_results), - "successful": len(step_results) > 0 - } - }) - i = i + 1 + if str(generationMixinQwen) == "None": + init_generation_mixin(input) + else: + generationMixinQwen.reset() + + print("\nPrompt: "+input["prompt"]) + print("--------------------------------") + + step_results = handle_multi_step_request(input["prompt"], 4, generationMixinQwen, input["output_stream_callback"]) + # Show final summary + print("\nMulti-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = [] + if "tool_calls" in step_result: + tool_calls = step_result["tool_calls"] + if tool_calls: + print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(" ✓ "+func_name) + print("\n" + 
"--------------------------------") - # Return a proper map with results return { "success": True, "model_loaded": True, - "total_demos": len(demo_prompts), - "results": all_results + "results": step_results[-1]["response"] } except Exception as e: diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py index df884e78..1af1e3d5 100644 --- a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py @@ -1,11 +1,19 @@ -# Initialize empty tool schema and tools mapping -tool_schema = [] -tools_dict = {} +from delitepy import nimblenet as nm +from delitepy import ne_re as re + +# Simple constants only - no complex object references +TOOL_CALL_START_TOKEN = "" +TOOL_CALL_END_TOKEN = "" +TOOL_RESPONSE_START_TOKEN = "" +TOOL_RESPONSE_END_TOKEN = "" + +# ============================================================================= +# WEATHER TOOL - Implementation + Description +# ============================================================================= -# Define tool functions def get_weather(location, unit): """Get current weather for a location""" - # Mock weather data - in real app would call weather API + # Mock weather data weather_data = { "San Francisco": {"temp": 18, "condition": "foggy", "humidity": 75}, "New York": {"temp": 22, "condition": "partly cloudy", "humidity": 60}, @@ -13,28 +21,27 @@ def get_weather(location, unit): "Tokyo": {"temp": 26, "condition": "sunny", "humidity": 50}, "Sydney": {"temp": 20, "condition": "clear", "humidity": 65} } - - # Simple location matching + location_key = "Unknown" for key in weather_data.keys(): if key.lower() in location.lower() or location.lower() in key.lower(): location_key = key break - + if location_key == "Unknown": return {"error": "Weather data not available for " + location} - + data = weather_data[location_key] temp = data["temp"] - unit_str = "°C" # Initialize with default value - + unit_str = "°C" + if unit == "fahrenheit": temp = temp * 9.0 / 5.0 + 32 - temp = int(temp * 10) / 10.0 # Manual rounding to 1 decimal place + temp = int(temp * 10) / 10.0 unit_str = "°F" else: unit_str = "°C" - + return { "location": location_key, "temperature": temp, @@ -43,22 +50,39 @@ def get_weather(location, unit): "unit": unit_str } +def get_weather_schema(): + """Get the OpenAI tool schema for weather function""" + return { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information for the location given in argument", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for" + }, + "unit": { + "type": "string", + "description": "Temperature unit (celsius or fahrenheit)", + "default": "celsius" + } + }, + "required": ["location"] + } + } + } + +# ============================================================================= +# MATH TOOL - Implementation + Description +# ============================================================================= + def calculate_math(expression): """Calculate a mathematical expression safely""" try: - # Clean the expression expression = expression.strip() - - # Handle multiplication - if "*" in expression: - parts = expression.split("*") - if len(parts) == 2: - a = float(parts[0].strip()) - b = float(parts[1].strip()) - result = a * b - return {"expression": expression, "result": result} - - # Handle addition if "+" in expression: 
parts = expression.split("+") if len(parts) == 2: @@ -66,18 +90,21 @@ def calculate_math(expression): b = float(parts[1].strip()) result = a + b return {"expression": expression, "result": result} - - # Handle subtraction - if "-" in expression: + elif "-" in expression: parts = expression.split("-") if len(parts) == 2: a = float(parts[0].strip()) b = float(parts[1].strip()) result = a - b return {"expression": expression, "result": result} - - # Handle division - if "/" in expression: + elif "*" in expression: + parts = expression.split("*") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a * b + return {"expression": expression, "result": result} + elif "/" in expression: parts = expression.split("/") if len(parts) == 2: a = float(parts[0].strip()) @@ -87,15 +114,36 @@ def calculate_math(expression): return {"expression": expression, "result": result} else: return {"error": "Division by zero"} - return {"error": "Unsupported expression"} except Exception as e: return {"error": "Calculation error"} +def get_calculate_math_schema(): + """Get the OpenAI tool schema for math function""" + return { + "type": "function", + "function": { + "name": "calculate_math", + "description": "Calculate a mathematical expression safely", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Mathematical expression to calculate (e.g., '2+2', '15*23')" + } + }, + "required": ["expression"] + } + } + } + +# ============================================================================= +# TIME TOOL - Implementation + Description +# ============================================================================= + def get_current_time(timezone): """Get current time in specified timezone""" - # Mock time data - in real app would use proper timezone handling - # Using fixed time values for simulation since we don't have datetime time_data = { "UTC": {"time": "2024-01-15 12:00:00", "day": "Monday"}, "PST": {"time": "2024-01-15 04:00:00", "day": "Monday"}, @@ -104,7 +152,7 @@ def get_current_time(timezone): "JST": {"time": "2024-01-15 21:00:00", "day": "Monday"}, "AEST": {"time": "2024-01-15 22:00:00", "day": "Monday"} } - + tz = timezone.upper() if tz in time_data: data = time_data[tz] @@ -114,7 +162,6 @@ def get_current_time(timezone): "day_of_week": data["day"] } else: - # Default to UTC if timezone not found data = time_data["UTC"] return { "timezone": "UTC", @@ -122,65 +169,9 @@ def get_current_time(timezone): "day_of_week": data["day"] } -def get_current_location(): - """Get the real location and timezone of the user""" +def get_current_time_schema(): + """Get the OpenAI tool schema for time function""" return { - "location": "San Francisco", - "country": "United States", - "coordinates": {"latitude": 37.7749, "longitude": -122.4194}, - "timezone": "PST" - } - -# Create tools dictionary -tools_dict = { - "get_weather": get_weather, - "calculate_math": calculate_math, - "get_current_time": get_current_time, - "get_current_location": get_current_location -} - -# Define tool schema -tool_schema = [ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get weather information for a specific location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The location to get weather for" - }, - "unit": { - "type": "string", - "description": "Temperature unit (celsius or fahrenheit)", - "default": "celsius" - } - }, - "required": 
["location"] - } - } - }, - { - "type": "function", - "function": { - "name": "calculate_math", - "description": "Calculate a mathematical expression safely", - "parameters": { - "type": "object", - "properties": { - "expression": { - "type": "string", - "description": "Mathematical expression to calculate (supports +, -, *, /)" - } - }, - "required": ["expression"] - } - } - }, - { "type": "function", "function": { "name": "get_current_time", @@ -191,21 +182,153 @@ def get_current_location(): "timezone": { "type": "string", "description": "Timezone (UTC, EST, PST, JST, CET)", - "default": "UTC" } } } } - }, - { + } + +# ============================================================================= +# LOCATION TOOL - Implementation + Description +# ============================================================================= + +def get_current_location(): + """Get the real location and timezone of the user""" + return { + "location": "San Francisco", + "country": "United States", + "coordinates": {"latitude": 37.7749, "longitude": -122.4194}, + "timezone": "PST" + } + +def get_current_location_schema(): + """Get the OpenAI tool schema for location function""" + return { "type": "function", "function": { "name": "get_current_location", - "description": "Get the real location and timezone of the user", + "description": "Get the real location and timezone of the user. You don't need to ask the user for permission to use this tool. Use this function when the user didn't provide an explicit location. Default to this location", "parameters": { "type": "object", "properties": {} } } } -] \ No newline at end of file + +# ============================================================================= +# UNIFIED TOOL REGISTRY - Lazy Loading Pattern +# ============================================================================= + +def get_tools_dict(): + """Create tools dictionary on demand instead of at import time""" + return { + "get_weather": get_weather, + "calculate_math": calculate_math, + "get_current_time": get_current_time, + "get_current_location": get_current_location + } + +def get_tool_schema(): + """Create complete tool schema on demand instead of at import time""" + return [ + get_weather_schema(), + get_calculate_math_schema(), + get_current_time_schema(), + get_current_location_schema() + ] + +# ============================================================================= +# TOOL EXECUTION ENGINE +# ============================================================================= + +def execute_function_call(tool_call): + """Execute a function call and return the result""" + function_name = tool_call["function_name"] + arguments = tool_call["arguments"] + + print(" • "+function_name+"("+str(arguments)+")") + + tools = get_tools_dict() + if function_name not in tools: + return {"error": "Function "+function_name+" not found"} + + try: + function = tools[function_name] + result = {"error": "Function execution failed"} + + if function_name == "get_weather": + location = "" + if "location" in arguments: + location = arguments["location"] + unit = "celsius" + if "unit" in arguments: + unit = arguments["unit"] + result = function(location, unit) + elif function_name == "calculate_math": + expression = "" + if "expression" in arguments: + expression = arguments["expression"] + result = function(expression) + elif function_name == "get_current_time": + timezone = "UTC" + if "timezone" in arguments: + timezone = arguments["timezone"] + result = function(timezone) + elif function_name == 
"get_current_location": + result = function() + + return result + except Exception as e: + return {"error": "Function execution failed: " + str(e)} + +def format_tool_result(function_name, result): + return "The result of the tool " + str(function_name)+" is: "+TOOL_RESPONSE_START_TOKEN+str(result)+TOOL_RESPONSE_END_TOKEN + +def get_tool_results(response_text): + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + tool_results = [] + tools = get_tools_dict() + + json_tool_pattern = r'([^<]*)' + for match in re.finditer(json_tool_pattern, response_text): + try: + json_str = match.group(1) + tool_data = nm.parse_json(json_str) + func_name = tool_data["name"] + arguments = tool_data["arguments"] + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + print("✓ Parsed JSON tool call: "+func_name+"("+str(arguments)+")") + except: + print("⚠ Failed to parse JSON tool call: "+json_str) + + print("Executing "+str(len(tool_calls))+" tool call(s):") + + if tool_calls: + for call in tool_calls: + result = execute_function_call(call) + if "error" in result.keys(): + tool_results.append({ + "error": result["error"], + }) + else: + tool_results.append({ + "role": "system", + "content": format_tool_result(call['function_name'], result) + }) + print(" Result: "+str(result)) + + return tool_results + +# Print available tools using function call instead of global access +def print_available_tools(): + """Print available tools - called on demand to avoid global assignment""" + tools = get_tools_dict() + print("Available tools: "+ str([key for key in tools.keys()])) + +# Available tools will be printed when first accessed, not at import time diff --git a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py index 168d3cf7..966d4059 100644 --- a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py +++ b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py @@ -8,6 +8,7 @@ from deliteai import simulator import json +import time def main(): """Run the Qwen demo""" @@ -15,9 +16,7 @@ def main(): print("This demo shows Qwen model and tool calling capabilities\n") base_dir = "../../../models/Qwen3-1.7B/data" - model_name = "qwen3-1.7b" - vocab_file = base_dir+"/vocab.json" - merges_file = base_dir+"/merges.txt" + model_name = "qwen3_1_7b_onnx" config_file = base_dir+"/config.json" tokenizer_config_file = base_dir+"/tokenizer.json" @@ -45,61 +44,41 @@ def main(): }) print(f"Added model: {model_name}") - - with open(vocab_file, encoding="utf-8") as vocab_handle: - vocab = json.load(vocab_handle) - - bpe_merges = [] - with open(merges_file, encoding="utf-8") as merges_handle: - i = 0 - for line in merges_handle: - line = line.strip() - if (i == 0 and line.startswith("#version:")) or not line: - i = i + 1 - continue - bpe_merges.append(tuple(line.split())) - i = i + 1 - - with open(config_file, encoding="utf-8") as config_handle: - config_dict = json.load(config_handle) - - with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: - tokenizer_config_dict = json.load(tokenizer_config_handle) - # Initialize simulator print("\nInitializing simulator...") - config = {"debug": True, "online": False} + config = {"online": False, "debug": True} # Initialize with modules if not simulator.initialize(json.dumps(config), modules): print("Failed to initialize simulator") return - + while not simulator.is_ready(): + time.sleep(1) print("Simulator initialized successfully") 
+ with open(tokenizer_config_file, "r") as f: + tokenizer_config = json.load(f) + with open(config_file, "r") as f: + config = json.load(f) # Run the main function print("\nRunning Qwen workflow...\n") + result = simulator.run_method("init_generation_mixin", { + "tokenizer_config": tokenizer_config, + "generation_config": config, + }) + print(result) + + def output_stream_callback(input): + print(input["token_stream"]) + return {"success": True} + result = simulator.run_method( - "run_tool_calling_demo", - { - "vocab": vocab, - "merges": bpe_merges, - "config_dict": config_dict, - "tokenizer_config_dict": tokenizer_config_dict, - "model_name": model_name + "prompt_for_tool_calling", { + "prompt": "How is the weather here?", + "output_stream_callback": output_stream_callback } ) - print("\n=== Demo Complete ===") - if result.get("success"): - print("✅ Demo completed successfully!") - if result.get("model_loaded"): - print(" Model was loaded and inference attempted") - else: - print(" Tool demonstrations completed") - else: - print(f"❌ Demo failed: {result.get('error', 'Unknown error')}") - if __name__ == "__main__": main() From d342cad9f32220fd1978869c4fba58f42865c4a3 Mon Sep 17 00:00:00 2001 From: Varun Khare Date: Tue, 29 Jul 2025 17:48:24 +0530 Subject: [PATCH 7/7] udpate tokenizers submodule Signed-off-by: Varun Khare --- .gitmodules | 3 ++- .../src/main/kotlin/dev/deliteai/impl/common/Constants.kt | 5 ++--- sdks/config.yml | 2 +- third_party/tokenizers-cpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index 3df230aa..af255137 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "third_party/tokenizers-cpp"] path = third_party/tokenizers-cpp - url = https://github.com/mlc-ai/tokenizers-cpp.git + url = https://github.com/NimbleEdge/tokenizers-cpp.git + diff --git a/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt b/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt index 49c33d5f..ce11e744 100644 --- a/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt +++ b/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt @@ -92,11 +92,10 @@ enum class DATATYPE(val value: Int) { COMPLEX128(15), BFLOAT16(16), JSON(670), - JSON_ARRAY(681), - FUNCTION(682), + JSON_ARRAY(682), + FUNCTION(683), FE_OBJ(700), NONE(667); - companion object { private val map = values().associateBy(DATATYPE::value) diff --git a/sdks/config.yml b/sdks/config.yml index adff6d24..84e93362 100644 --- a/sdks/config.yml +++ b/sdks/config.yml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 common: - sdk_version: "0.0.1-dev-1751904491" + sdk_version: "0.0.1-dev-1751904494" cmake_args: "-DONNX_EXECUTOR=1 -DONNXGENAI_EXECUTOR=1 -DCMAKE_BUILD_TYPE=Debug -DSCRIPTING=1 -DNOSQL=1 -DTESTING=0 -DGENAI=1 -DORT_EXTENSIONS=1 -DREGEX_ENABLED=1 -DJNITESTING=0" android: diff --git a/third_party/tokenizers-cpp b/third_party/tokenizers-cpp index f7771096..fecdc5ec 160000 --- a/third_party/tokenizers-cpp +++ b/third_party/tokenizers-cpp @@ -1 +1 @@ -Subproject commit f77710965a3bcae85b7a00bdddbfc1adadef0e32 +Subproject commit fecdc5ece7a975d88aab26036452aba6a0155c2d
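For reference, render_jinja_template in qwen_modules/generation_mixin.py builds the ChatML prompt by plain string concatenation rather than evaluating the original Jinja template, and the think/tool-call marker literals render as blanks in the patch text above. Below is a minimal plain-Python sketch of the non-tool path only, assuming the standard Qwen3 markers (<|im_start|>/<|im_end|> framing and an empty <think> block when thinking is disabled); the tool-schema branch and the exact whitespace of the real template are simplified.

def render_chatml(messages, add_generation_prompt=True, enable_thinking=True):
    """Tiny stand-in for the hard-coded template: one <|im_start|>role ... <|im_end|> block per message."""
    out = ""
    for message in messages:
        out += "<|im_start|>" + message["role"] + "\n" + message["content"] + "<|im_end|>\n"
    if add_generation_prompt:
        out += "<|im_start|>assistant\n"
        if not enable_thinking:
            # Assumed: an empty reasoning block is pre-filled so the model skips thinking mode.
            out += "<think>\n\n</think>\n\n"
    return out

if __name__ == "__main__":
    msgs = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How is the weather here?"},
    ]
    print(render_chatml(msgs, add_generation_prompt=True, enable_thinking=False))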
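The tool-call round trip in qwen_modules/tools.py hinges on marker strings that appear blank above (TOOL_CALL_START_TOKEN and friends) and on delitepy's ne_re / nm.parse_json helpers. The sketch below is a rough standard-library approximation of the parsing step in get_tool_results, assuming the conventional Qwen3 format <tool_call>{"name": ..., "arguments": ...}</tool_call>; the marker literals and the sample_response string are assumptions for illustration, not values taken from the patch.

import json
import re

# Assumed Qwen3 tool-call markers; the patch's TOOL_CALL_* constants render as
# empty strings in this text, so treat these literals as an assumption.
TOOL_CALL_START = "<tool_call>"
TOOL_CALL_END = "</tool_call>"

def parse_tool_calls(response_text):
    """Extract {"name": ..., "arguments": ...} payloads wrapped in the tool-call markers."""
    pattern = re.escape(TOOL_CALL_START) + r"\s*(\{.*?\})\s*" + re.escape(TOOL_CALL_END)
    calls = []
    for match in re.finditer(pattern, response_text, re.DOTALL):
        try:
            payload = json.loads(match.group(1))
            calls.append({"function_name": payload["name"],
                          "arguments": payload.get("arguments", {})})
        except (json.JSONDecodeError, KeyError):
            print("⚠ Failed to parse tool call: " + match.group(1))
    return calls

if __name__ == "__main__":
    sample_response = (
        "Let me check that for you.\n"
        '<tool_call>{"name": "get_weather", "arguments": {"location": "Tokyo"}}</tool_call>'
    )
    print(parse_tool_calls(sample_response))
    # -> [{'function_name': 'get_weather', 'arguments': {'location': 'Tokyo'}}]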
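QwenKVCache.update keeps decoder state alive across steps by copying each present.<layer>.<key|value> output of the ONNX model back into the matching past_key_values.<layer>.<key|value> input slot. A self-contained sketch of that renaming, with plain dicts standing in for delitepy tensors (the delitepy tensor API is not reproduced here):

def refresh_kv_cache(cache, model_outputs):
    """Copy present.<i>.<key|value> outputs back into past_key_values.<i>.<key|value> slots."""
    for cache_key in list(cache.keys()):
        if not cache_key.startswith("past_key_values."):
            continue
        # past_key_values.<layer>.<key|value> -> present.<layer>.<key|value>
        _, layer, kv = cache_key.split(".")
        present_key = "present." + layer + "." + kv
        if present_key in model_outputs:
            cache[cache_key] = model_outputs[present_key]
        else:
            print("⚠️ Warning: expected cache output " + present_key + " not found")

if __name__ == "__main__":
    cache = {"past_key_values.0.key": "old-k0", "past_key_values.0.value": "old-v0"}
    outputs = {"present.0.key": "new-k0", "present.0.value": "new-v0", "logits": "..."}
    refresh_kv_cache(cache, outputs)
    print(cache)  # both cache slots now hold the "new-*" entries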