diff --git a/.gitignore b/.gitignore index 2457fbff..a04eada2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ third_party/runtime/ !third_party/runtime/CMakeLists.txt __pycache__/ .pytest_cache/ +**/NimbleSDK +models/**/data diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..af255137 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "third_party/tokenizers-cpp"] + path = third_party/tokenizers-cpp + url = https://github.com/NimbleEdge/tokenizers-cpp.git + diff --git a/coreruntime/CMakeLists.txt b/coreruntime/CMakeLists.txt index 4fa47fe2..af773cbd 100644 --- a/coreruntime/CMakeLists.txt +++ b/coreruntime/CMakeLists.txt @@ -38,7 +38,7 @@ endif() # set(DEBUGFLAGS " -Werror -Wno-write-strings -Weffc++ -Wall -Wuninitialized -Wnon-virtual-dtor -Wshadow -Werror=format-security -Wunused-member-function -Wunused-function ") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=switch -Werror=return-type -Werror=implicit-fallthrough \ - -Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-member-function -Werror=unused-function -Werror=writable-strings") + -Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-function -Werror=write-strings") # string(CONCAT RELEASEFLAGS ${DEBUGFLAGS} " -fstack-protector-strong -ffunction-sections -fdata-sections ") # # @@ -84,6 +84,7 @@ add_subdirectory(nimblenet) add_subdirectory(delitepy) add_subdirectory("../third_party/json" "${CMAKE_BINARY_DIR}/third_party/json") add_subdirectory("../third_party/SPSCQueue" "${CMAKE_BINARY_DIR}/third_party/SPSCQueue") +add_subdirectory("../third_party/tokenizers-cpp" "${CMAKE_BINARY_DIR}/third_party/tokenizers-cpp") if (GENAI) add_subdirectory("../third_party/miniz" "${CMAKE_BINARY_DIR}/third_party/miniz") endif() @@ -182,6 +183,7 @@ else() target_compile_definitions(nimblenet PUBLIC -DIOS_PLATFORM="mac") add_subdirectory(platform/unix) # produces ${CLIENT_INCLUDES} add_subdirectory("../third_party/runtime" "${CMAKE_BINARY_DIR}/third_party/runtime") # -> produces ${BACKEND_LIBS} ${BACKEND_DIR} ${BACKED_INCLUDES} + # Use system curl library instead of conda environment target_link_libraries(nimblenet ${VISIBILITY} curl) # target_link_libraries(nimblenet ${VISIBILITY} clientlib) @@ -194,7 +196,7 @@ if(NOT ANDROID_ABI) list(APPEND ADDITIONAL_LIBS ZLIB::ZLIB) endif() -target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS}) +target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} tokenizers_cpp ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS}) if (GENAI) target_link_libraries(nimblenet PRIVATE miniz) endif() diff --git a/coreruntime/build.py b/coreruntime/build.py index 68e4ac7a..ec0d4a44 100755 --- a/coreruntime/build.py +++ b/coreruntime/build.py @@ -51,7 +51,7 @@ def main(): if "-DCMAKE_BUILD_TYPE=Release" in cmake_args: STRIP = 1 - CMAKE_CXX_FLAGS = "" + CMAKE_CXX_FLAGS = "-Wno-unused-member-function -Wno-implicit-fallthrough " if args.testing: cmake_args += " -DTESTING=1 " @@ -61,16 +61,20 @@ def main(): COMMON_FLAGS = ( f"-B{os.getcwd()}/build/ " f"{cmake_args} " + "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 " + "-DCMAKE_CXX_FLAGS_RELEASE='-Wno-unused-function -Wno-implicit-fallthrough -DNDEBUG -O3' " + "-DCMAKE_CXX_FLAGS_DEBUG='-Wno-unused-function -Wno-implicit-fallthrough -g' " ) # Determine compiler settings based on architecture if arch == "arm": cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DCMAKE_CXX_COMPILER=g++ -DMACOS=1 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" elif arch == "x86_64": - CMAKE_CXX_FLAGS += " -stdlib=libstdc++ " + # Replace clang-specific flags with g++ compatible ones + CMAKE_CXX_FLAGS = CMAKE_CXX_FLAGS.replace("-Wno-unused-member-function", "-Wno-unused-function") cmake_command = ( f"cmake CMakeLists.txt {COMMON_FLAGS} " - f"-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" + f"-DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'" ) else: cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DMACOS=1" @@ -103,14 +107,14 @@ def main(): if args.simulator: if not args.ci_build: # re-install deliteai - subprocess.run(f"python{python_version} -m pip uninstall deliteai", shell=True, check=True) + subprocess.run(f"python{python_version} -m pip uninstall -y deliteai", shell=True, check=True) subprocess.run("rm -rf dist deliteai*", shell=True, check=True) subprocess.run(f"python{python_version} setup.py bdist_wheel", shell=True, check=True) subprocess.run(f"python{python_version} -m pip install dist/*", shell=True, check=True) # re-install delitepy-library-stubs subprocess.run( - f"python{python_version} -m pip uninstall delitepy-library-stubs", + f"python{python_version} -m pip uninstall -y delitepy-library-stubs", shell=True, check=True, ) diff --git a/coreruntime/delitepy/library_stubs/setup.py b/coreruntime/delitepy/library_stubs/setup.py index 25dc63b5..47cfc2ec 100644 --- a/coreruntime/delitepy/library_stubs/setup.py +++ b/coreruntime/delitepy/library_stubs/setup.py @@ -26,12 +26,7 @@ def render_src_template() -> None: check=True, ) subprocess.run( - [ - f"{delitepy_dir}/scripts/render_jinja2_templates.py", - f"{library_stubs_dir}/src_template", - f"{library_stubs_dir}/src_gen", - coreruntime_dir, - ], + ["cp", "-r", f"{library_stubs_dir}/src_template", f"{library_stubs_dir}/src_gen"], check=True, ) diff --git a/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py b/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py index d04a2c91..1b47e828 100644 --- a/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py +++ b/coreruntime/delitepy/library_stubs/src_template/delitepy/__init__.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -"""Package delitepy containing modules nimblenet and ne_re.""" +"""Package delitepy containing modules nimblenet, ne_re, and tokenizers.""" from delitepy.nimblenet import * from delitepy.ne_re import * +from delitepy.tokenizers import * diff --git a/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py b/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py new file mode 100644 index 00000000..d0dd7b6f --- /dev/null +++ b/coreruntime/delitepy/library_stubs/src_template/delitepy/tokenizers/__init__.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors +# +# SPDX-License-Identifier: Apache-2.0 + +"""Package delitepy.tokenizers for tokenizer functionality.""" + +from typing import List, Union +from delitepy.nimblenet.tensor import Tensor + +def from_pretrained(model_name_or_path: str) -> str: + """Load a pre-trained tokenizer from HuggingFace Hub or local file. + + Args: + model_name_or_path: Path to tokenizer.json file or HuggingFace model name + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> tokenizer = tokenizers.from_pretrained("/path/to/tokenizer.json") + """ + pass + +def from_file(file_path: str) -> str: + """Load a tokenizer from a file path. + + Args: + file_path: Path to tokenizer.json or .model file + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_file("tokenizer.json") + >>> tokenizer = tokenizers.from_file("model.spm") + """ + pass + +def from_json(json_str: str) -> str: + """Create a tokenizer from a JSON string. + + Args: + json_str: JSON string containing tokenizer configuration + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> json_config = '{"model": {...}, "normalizer": {...}}' + >>> tokenizer = tokenizers.from_json(json_config) + """ + pass + +def from_sentencepiece(model_path: str) -> str: + """Load a SentencePiece tokenizer from a .model file. + + Args: + model_path: Path to SentencePiece .model file + + Returns: + Tokenizer handle (opaque string identifier) + + Example: + >>> tokenizer = tokenizers.from_sentencepiece("tokenizer.model") + """ + pass + +def encode(tokenizer: str, text: str) -> Tensor: + """Encode text into token IDs. + + Args: + tokenizer: Tokenizer handle from from_pretrained/from_file/etc. + text: Text to encode + + Returns: + Tensor containing token IDs (INT32) + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_ids = tokenizers.encode(tokenizer, "Hello world!") + >>> print(token_ids.shape) # [num_tokens] + """ + pass + +def decode(tokenizer: str, token_ids: Tensor) -> str: + """Decode token IDs back to text. + + Args: + tokenizer: Tokenizer handle + token_ids: Tensor containing token IDs (INT32) + + Returns: + Decoded text string + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_ids = tokenizers.encode(tokenizer, "Hello world!") + >>> text = tokenizers.decode(tokenizer, token_ids) + >>> print(text) # "Hello world!" + """ + pass + +def get_vocab_size(tokenizer: str) -> int: + """Get the vocabulary size of the tokenizer. + + Args: + tokenizer: Tokenizer handle + + Returns: + Size of the vocabulary + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> vocab_size = tokenizers.get_vocab_size(tokenizer) + >>> print(vocab_size) # 30522 + """ + pass + +def token_to_id(tokenizer: str, token: str) -> int: + """Convert a token string to its ID. + + Args: + tokenizer: Tokenizer handle + token: Token string + + Returns: + Token ID, or -1 if token not found + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token_id = tokenizers.token_to_id(tokenizer, "[CLS]") + >>> print(token_id) # 101 + """ + pass + +def id_to_token(tokenizer: str, token_id: int) -> str: + """Convert a token ID to its string representation. + + Args: + tokenizer: Tokenizer handle + token_id: Token ID + + Returns: + Token string, or empty string if ID not found + + Example: + >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased") + >>> token = tokenizers.id_to_token(tokenizer, 101) + >>> print(token) # "[CLS]" + """ + pass \ No newline at end of file diff --git a/coreruntime/delitepy/scripts/render_jinja2_templates.py b/coreruntime/delitepy/scripts/render_jinja2_templates.py deleted file mode 100755 index 06bcb53d..00000000 --- a/coreruntime/delitepy/scripts/render_jinja2_templates.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import sys -from pathlib import Path - -from jinja2 import Template - - -def extract_delitepy_doc_blocks(infile_path: str): - block_begin_marker = "DELITEPY_DOC_BLOCK_BEGIN" - block_end_marker = "DELITEPY_DOC_BLOCK_END" - - inside_doc_block = False - - with open(infile_path, "r") as infile: - for line in infile: - stripped_line = line.strip() - - if stripped_line.startswith(block_begin_marker): - inside_doc_block = True - continue - - if stripped_line.startswith(block_end_marker): - inside_doc_block = False - continue - - if inside_doc_block: - yield line - - -def render_jinja2_templates(source_dir: str, target_dir: str, base_dir: str) -> None: - for root, _, file_names in os.walk(source_dir): - root_rel_path = os.path.relpath(root, source_dir) - root_out_dir = os.path.join(target_dir, root_rel_path) - os.makedirs(root_out_dir, exist_ok=True) - - for file_name in file_names: - template_path = os.path.join(root, file_name) - with open(template_path, "r") as file: - template_content = file.read() - - template = Template(template_content, keep_trailing_newline=True) - render_context = { - "extract_delitepy_doc_blocks": lambda infile_path: "".join( - extract_delitepy_doc_blocks( - str(Path(base_dir).joinpath(infile_path).resolve()), - ), - ), - } - rendered_template_content = template.render(render_context) - - rendered_template_path = os.path.join(root_out_dir, file_name) - with open(rendered_template_path, "w") as file: - file.write(rendered_template_content) - - -def main(args: list[str]) -> None: - assert len(args) == 4, "Incorrect usage." - - source_dir = str(Path(args[1]).resolve()) - target_dir = str(Path(args[2]).resolve()) - base_dir = str(Path(args[3]).resolve()) - - print(f"Rendering Jinja2 templates: '{source_dir}' => '{target_dir}'") - render_jinja2_templates(source_dir, target_dir, base_dir) - print(f"[done] Rendering Jinja2 templates: '{source_dir}' => '{target_dir}'") - - -if __name__ == "__main__": - main(sys.argv) diff --git a/coreruntime/nimblenet/CMakeLists.txt b/coreruntime/nimblenet/CMakeLists.txt index c8845fc4..a4ff333d 100644 --- a/coreruntime/nimblenet/CMakeLists.txt +++ b/coreruntime/nimblenet/CMakeLists.txt @@ -35,6 +35,7 @@ set(BASE data_variable/src/pre_processor_nimble_net_variable.cpp data_variable/src/raw_event_store_data_variable.cpp data_variable/src/regex_data_variable.cpp + data_variable/src/tokenizers_data_variable.cpp data_variable/src/single_variable.cpp data_variable/src/tensor_data_variable.cpp job_scheduler/src/base_job.cpp diff --git a/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp b/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp index a4e7ff7b..886d536e 100644 --- a/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp +++ b/coreruntime/nimblenet/asset_manager/src/asset_manager.cpp @@ -31,6 +31,8 @@ std::string Asset::get_file_name_on_device() const { case AssetType::LLM: return name + version + rmconstants::LLMFolderName; #endif // GENAI + default: + return name + version; } } @@ -96,6 +98,8 @@ std::string get_string_from_asset_type(const AssetType& assetType) { case AssetType::LLM: return "llm"; #endif // GENAI + default: + return "unknown"; } } diff --git a/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp b/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp index 8d7d80c9..9c9af279 100644 --- a/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp +++ b/coreruntime/nimblenet/core_sdk/src/core_sdk.cpp @@ -392,6 +392,8 @@ std::pair CoreSDK::get_cloud_config_and_update_ } case CloudConfigState::Unmodified: return {cloudConfig, deployment}; + default: + return {cloudConfig, deployment}; } } diff --git a/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp b/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp index 486d8ff4..c47d5f78 100644 --- a/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp +++ b/coreruntime/nimblenet/core_sdk/src/nimble_exec_info.cpp @@ -40,7 +40,7 @@ namespace detail { * than "size". If the return value is equal to "size" then the number of * addresses may have been truncated. */ -int backtrace(void* _Nonnull* _Nonnull buffer, int size); +int backtrace(void** buffer, int size); /** * [backtrace_symbols(3)](https://man7.org/linux/man-pages/man3/backtrace_symbols.3.html) @@ -50,7 +50,7 @@ int backtrace(void* _Nonnull* _Nonnull buffer, int size); * Returns a pointer to allocated memory, on error NULL is returned. It is * the responsibility of the caller to free the returned memory. */ -char* _Nullable* _Nullable backtrace_symbols(void* _Nonnull const* _Nonnull buffer, int size); +char** backtrace_symbols(void* const* buffer, int size); /** * [backtrace_symbols_fd(3)](https://man7.org/linux/man-pages/man3/backtrace_symbols_fd.3.html) @@ -58,7 +58,7 @@ char* _Nullable* _Nullable backtrace_symbols(void* _Nonnull const* _Nonnull buff * of strings that represent the backtrace and write to the file represented * by "fd". The file is written such that one line equals one void* address. */ -void backtrace_symbols_fd(void* _Nonnull const* _Nonnull buffer, int size, int fd); +void backtrace_symbols_fd(void* const* buffer, int size, int fd); } // namespace detail diff --git a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp index 00f7f6e9..2f8ecd5c 100644 --- a/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp +++ b/coreruntime/nimblenet/cross_platform/include/nimble_net_util.hpp @@ -62,14 +62,16 @@ enum DATATYPE { DATAFRAME = 676, NIMBLENET_REGEX = 677, NIMBLENET_REGEX_MATCHOBJECT = 678, - CHAR_STREAM = 679, - JSON_STREAM = 680, - JSON_ARRAY = 681, - FUNCTION = 682, - CONCURRENT_EXECUTOR = 683, - EXCEPTION = 684, + NIMBLENET_TOKENIZERS = 679, + CHAR_STREAM = 680, + JSON_STREAM = 681, + JSON_ARRAY = 682, + FUNCTION = 683, + CONCURRENT_EXECUTOR = 684, + EXCEPTION = 685, UNKNOWN = 0, FLOAT = 1, + FLOAT16 = 2, BOOLEAN = 9, INT32 = 6, INT64 = 7, diff --git a/coreruntime/nimblenet/data_variable/include/data_variable.hpp b/coreruntime/nimblenet/data_variable/include/data_variable.hpp index ba2e1188..b3713e3a 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/data_variable.hpp @@ -269,6 +269,8 @@ class DataVariable : public std::enable_shared_from_this { virtual uint8_t cast_uint8() { return get_uint8(); } + virtual uint16_t cast_uint16() { return get_uint16(); } + virtual int8_t cast_int8() { return get_int8(); } virtual int32_t get_int32() { THROW_UNSUPPORTED("get_int32"); } @@ -283,6 +285,8 @@ class DataVariable : public std::enable_shared_from_this { virtual uint8_t get_uint8() { THROW_UNSUPPORTED("get_uint8"); } + virtual uint16_t get_uint16() { THROW_UNSUPPORTED("get_uint16"); } + virtual std::string get_string() const { THROW_UNSUPPORTED("get_string"); } virtual bool get_bool() = 0; diff --git a/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp b/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp index 5a65db4d..009ea6a3 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp +++ b/coreruntime/nimblenet/data_variable/include/data_variable_enums.hpp @@ -133,6 +133,15 @@ enum MemberFuncType { CLEAR_CONTEXT, ADD_CONTEXT, LIST_COMPATIBLE_LLMS, + TOKENIZERS_FROM_PRETRAINED, + TOKENIZERS_FROM_FILE, + TOKENIZERS_FROM_JSON, + TOKENIZERS_FROM_SENTENCEPIECE, + TOKENIZERS_ENCODE, + TOKENIZERS_DECODE, + TOKENIZERS_GET_VOCAB_SIZE, + TOKENIZERS_TOKEN_TO_ID, + TOKENIZERS_ID_TO_TOKEN, GET_HARDWARE_INFO, SET_XNNPACK_NUM_THREADS, #if DELITEAI_TARGET_OS_ANDROID || DELITEAI_TARGET_OS_IOS diff --git a/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp b/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp index 28e20d8d..f2ec55b6 100644 --- a/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp +++ b/coreruntime/nimblenet/data_variable/include/data_variable_templates.ipp @@ -34,6 +34,12 @@ constexpr inline bool is_numeric() { return true; } +template <> +constexpr inline bool is_numeric() { + // fp16 is numeric + return true; +} + template <> constexpr inline bool is_numeric() { return false; @@ -74,6 +80,12 @@ constexpr inline bool is_integer() { return true; } +template <> +constexpr inline bool is_integer() { + // fp16 is not an integer type + return false; +} + template <> constexpr inline bool is_integer() { return false; @@ -104,6 +116,11 @@ constexpr inline int get_dataType_enum() { return DATATYPE::INT64; } +template <> +constexpr inline int get_dataType_enum() { + return DATATYPE::FLOAT16; +} + template <> constexpr inline int get_dataType_enum() { return DATATYPE::DOUBLE; @@ -134,6 +151,13 @@ inline float DataVariable::get() { return get_float(); } +template <> +inline uint16_t DataVariable::get() { + // For fp16, we return the raw uint16_t representation + // The caller can convert this to actual fp16 if needed + return get_uint16(); +} + template <> inline int64_t DataVariable::get() { return get_int64(); @@ -159,4 +183,4 @@ inline bool DataVariable::get() { template <> inline nlohmann::json DataVariable::get() { return get_json_data(); -} \ No newline at end of file +} diff --git a/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp b/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp index 674b888b..82effc55 100644 --- a/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/model_nimble_net_variable.hpp @@ -54,13 +54,24 @@ class ModelNimbleNetVariable final : public DataVariable { Parameters ---------- - args : *Tensor + args : *Tensor or dict Input tensors to the model in the order they are expected in the model. + Alternatively, can accept a single dictionary mapping input names to tensors. Returns ---------- - modelOutput : tuple[Tensor, ...] - Returns the output tensors of model as a tuple. The order of tensors is the same as defined during model construction. + modelOutput : tuple[Tensor, ...] or dict + Returns the output tensors of model as a tuple when using tensor arguments. + Returns a dictionary mapping output names to tensors when using dictionary input. + + Examples + -------- + # Traditional tensor arguments + >>> output = model.run(input1, input2) + + # Dictionary input (new feature) + >>> input_dict = {"input1": tensor1, "input2": tensor2} + >>> output_dict = model.run(input_dict) """ pass DELITEPY_DOC_BLOCK_END diff --git a/coreruntime/nimblenet/data_variable/include/single_variable.hpp b/coreruntime/nimblenet/data_variable/include/single_variable.hpp index 9ea24b3e..36ffc7f1 100644 --- a/coreruntime/nimblenet/data_variable/include/single_variable.hpp +++ b/coreruntime/nimblenet/data_variable/include/single_variable.hpp @@ -54,6 +54,8 @@ class SingleVariable final : public BaseSingleVariable { uint8_t get_uint8() override { return uint8_t(val); } + uint16_t get_uint16() override { return uint16_t(val); } + int8_t get_int8() override { return int8_t(val); } bool get_bool() override { return val; } diff --git a/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp b/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp new file mode 100644 index 00000000..bc3cb243 --- /dev/null +++ b/coreruntime/nimblenet/data_variable/include/tokenizers_data_variable.hpp @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "data_variable.hpp" +#include "map_data_variable.hpp" +#include "tokenizers_cpp.h" +#include +#include +#include + +class TokenizersDataVariable : public DataVariable { + public: + TokenizersDataVariable(); + TokenizersDataVariable(std::unique_ptr tokenizer); + ~TokenizersDataVariable() override = default; + + int get_containerType() const override { return CONTAINERTYPE::SINGLE; } + bool get_bool() override { return true; } + int get_dataType_enum() const override { return DATATYPE::NIMBLENET_TOKENIZERS; } + nlohmann::json to_json() const override { return "[Tokenizers]"; } + std::string print() override { return "[Tokenizers]"; } + + OpReturnType call_function(int memberFuncIndex, const std::vector& arguments, CallStack& stack) override; + + // Static factory methods for creating tokenizer instances + static OpReturnType from_pretrained(const std::vector& arguments, CallStack& stack); + static OpReturnType from_file(const std::vector& arguments, CallStack& stack); + static OpReturnType from_json(const std::vector& arguments, CallStack& stack); + static OpReturnType from_sentencepiece(const std::vector& arguments, CallStack& stack); + + // Instance member methods that operate on the tokenizer + OpReturnType encode(const std::vector& arguments, CallStack& stack); + OpReturnType decode(const std::vector& arguments, CallStack& stack); + OpReturnType get_vocab_size(const std::vector& arguments, CallStack& stack); + OpReturnType token_to_id(const std::vector& arguments, CallStack& stack); + OpReturnType id_to_token(const std::vector& arguments, CallStack& stack); + + private: + std::unique_ptr _tokenizer; + + // Helper functions + static std::unique_ptr _create_tokenizer_from_file(const std::string& path); + static std::unique_ptr _create_tokenizer_from_json(const std::string& json); + static std::unique_ptr _create_tokenizer_from_map(const MapDataVariable* map); + static std::unique_ptr _create_tokenizer_from_sentencepiece(const std::string& model_path); +}; diff --git a/coreruntime/nimblenet/data_variable/src/data_variable.cpp b/coreruntime/nimblenet/data_variable/src/data_variable.cpp index 2ec045d4..b9f1c5d5 100644 --- a/coreruntime/nimblenet/data_variable/src/data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/data_variable.cpp @@ -105,6 +105,13 @@ std::map DataVariable::_memberFuncMap = { {"clear_context", MemberFuncType::CLEAR_CONTEXT}, {"add_context", MemberFuncType::ADD_CONTEXT}, {"list_compatible_llms", MemberFuncType::LIST_COMPATIBLE_LLMS}, + {"from_pretrained", MemberFuncType::TOKENIZERS_FROM_PRETRAINED}, + {"from_file", MemberFuncType::TOKENIZERS_FROM_FILE}, + {"from_json", MemberFuncType::TOKENIZERS_FROM_JSON}, + {"from_sentencepiece", MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE}, + {"encode", MemberFuncType::TOKENIZERS_ENCODE}, + {"decode", MemberFuncType::TOKENIZERS_DECODE}, + {"get_vocab_size", MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE}, {"get_hardware_info", MemberFuncType::GET_HARDWARE_INFO}, {"set_xnnpack_num_threads", MemberFuncType::SET_XNNPACK_NUM_THREADS}, #if DELITEAI_TARGET_OS_ANDROID || DELITEAI_TARGET_OS_IOS @@ -187,7 +194,7 @@ std::map DataVariable::_inverseMemberFuncMap = { {MemberFuncType::RETRIEVER, "Retriever"}, {MemberFuncType::POP, "pop"}, {MemberFuncType::KEYS, "keys"}, - {MemberFuncType::JSON_DOCUMENT, "jsonDocument"}, + {MemberFuncType::JSON_DOCUMENT, "JsonDocument"}, {MemberFuncType::MAX_INPUT_NUM_TOKENS, "max_input_num_tokens"}, {MemberFuncType::CONSTRUCTOR, "__init__"}, {MemberFuncType::UNICODE, "unicode"}, @@ -199,6 +206,13 @@ std::map DataVariable::_inverseMemberFuncMap = { {MemberFuncType::CLEAR_CONTEXT, "clear_context"}, {MemberFuncType::ADD_CONTEXT, "add_context"}, {MemberFuncType::LIST_COMPATIBLE_LLMS, "list_compatible_llms"}, + {MemberFuncType::TOKENIZERS_FROM_PRETRAINED, "from_pretrained"}, + {MemberFuncType::TOKENIZERS_FROM_FILE, "from_file"}, + {MemberFuncType::TOKENIZERS_FROM_JSON, "from_json"}, + {MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE, "from_sentencepiece"}, + {MemberFuncType::TOKENIZERS_ENCODE, "encode"}, + {MemberFuncType::TOKENIZERS_DECODE, "decode"}, + {MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE, "get_vocab_size"}, {MemberFuncType::GET_HARDWARE_INFO, "get_hardware_info"}, {MemberFuncType::SET_XNNPACK_NUM_THREADS, "set_xnnpack_num_threads"}, #if DELITEAI_TARGET_OS_ANDROID || DELITEAI_TARGET_OS_IOS @@ -517,6 +531,7 @@ OpReturnType DataVariable::create_tensor(int dType, const std::vector& switch (dType) { case DATATYPE::FLOAT: + case DATATYPE::FLOAT16: case DATATYPE::DOUBLE: case DATATYPE::INT32: case DATATYPE::INT64: diff --git a/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp b/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp index 54da14a2..92317f48 100644 --- a/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/model_nimble_net_variable.cpp @@ -7,6 +7,8 @@ #include "model_nimble_net_variable.hpp" #include "asset_load_job.hpp" +#include "map_data_variable.hpp" +#include "task_onnx_model.hpp" #include "task_onnx_model.hpp" void ModelNimbleNetVariable::set_xnnpack_intra_op_num_threads(int num_threads) { @@ -30,6 +32,39 @@ std::shared_ptr ModelNimbleNetVariable::load_async( } OpReturnType ModelNimbleNetVariable::run_model(const std::vector& arguments) { + // Check if we have a single dictionary argument (new interface) + if (arguments.size() == 1) { + auto mapVar = std::dynamic_pointer_cast(arguments[0]); + if (mapVar) { + // Use dictionary-based inference if available + auto onnxModel = std::dynamic_pointer_cast(_model); + if (onnxModel) { + OpReturnType output; + try { + auto start = std::chrono::high_resolution_clock::now(); + + // Use the new dictionary interface (corrected signature) + int infStatus = onnxModel->invoke_inference_dict(output, arguments[0]); + + auto stop = std::chrono::high_resolution_clock::now(); + long long duration = + std::chrono::duration_cast(stop - start).count(); + _commandCenter->write_inference_metric(_modelName, duration); + + if (infStatus != SUCCESS || !output) { + // inference failed return None + return OpReturnType(new NoneVariable()); + } + + return output; + } catch (...) { + THROW("%s", "Error occurred while trying to get inference using dictionary interface."); + } + } + } + } + + // Fall back to traditional vector interface std::vector inputNames = _model->get_input_names(); if (inputNames.size() != arguments.size()) { THROW("Model takes %d inputs, %d inputs provided. Cannot run model.", inputNames.size(), diff --git a/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp b/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp index 56ccd6fe..4d882c3a 100644 --- a/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp +++ b/coreruntime/nimblenet/data_variable/src/tensor_data_variable.cpp @@ -33,6 +33,8 @@ int BaseTypedTensorVariable::get_elem_size(DATATYPE dataType) { return sizeof(int64_t); case FLOAT: return sizeof(float); + case FLOAT16: + return sizeof(uint16_t); // 16-bit float stored as uint16_t case DOUBLE: return sizeof(double); case BOOLEAN: @@ -63,6 +65,8 @@ std::string BaseTypedTensorVariable::print() { switch (get_dataType_enum()) { case DATATYPE::FLOAT: return util::recursive_string(shape, 0, (float*)get_raw_ptr(), 0, numElements); + case DATATYPE::FLOAT16: + return util::recursive_string(shape, 0, (uint16_t*)get_raw_ptr(), 0, numElements); case DATATYPE::DOUBLE: return util::recursive_string(shape, 0, (double*)get_raw_ptr(), 0, numElements); case DATATYPE::INT64: diff --git a/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp b/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp new file mode 100644 index 00000000..dae4e6fe --- /dev/null +++ b/coreruntime/nimblenet/data_variable/src/tokenizers_data_variable.cpp @@ -0,0 +1,314 @@ +/* + * SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "tokenizers_data_variable.hpp" +#include "data_variable_enums.hpp" +#include "util.hpp" +#include "native_interface.hpp" +#include "tensor_data_variable.hpp" +#include "single_variable.hpp" +#include "map_data_variable.hpp" +#include +#include +#include // Added for nlohmann/json + +TokenizersDataVariable::TokenizersDataVariable() : _tokenizer(nullptr) {} + +TokenizersDataVariable::TokenizersDataVariable(std::unique_ptr tokenizer) + : _tokenizer(std::move(tokenizer)) {} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_file(const std::string& path) { + LOG_TO_CLIENT_DEBUG("TokenizersDataVariable::_create_tokenizer_from_file: %s", path.c_str()); + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + THROW("Failed to open tokenizer file: %s", path.c_str()); + } + + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + // Try to determine the type by file extension + if (path.size() >= 5 && path.substr(path.size() - 5) == ".json") { + return tokenizers::Tokenizer::FromBlobJSON(content); + } else if (path.size() >= 6 && path.substr(path.size() - 6) == ".model") { + return tokenizers::Tokenizer::FromBlobSentencePiece(content); + } else { + // Default to JSON format + return tokenizers::Tokenizer::FromBlobJSON(content); + } +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_json(const std::string& json) { + return tokenizers::Tokenizer::FromBlobJSON(json); +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_map(const MapDataVariable* map) { + // Convert MapDataVariable to nlohmann::json, then to string + nlohmann::json json_obj = map->to_json(); + std::string json_str = json_obj.dump(); + return tokenizers::Tokenizer::FromBlobJSON(json_str); +} + +std::unique_ptr TokenizersDataVariable::_create_tokenizer_from_sentencepiece(const std::string& model_path) { + std::ifstream file(model_path, std::ios::binary); + if (!file.is_open()) { + THROW("Failed to open SentencePiece model file: %s", model_path.c_str()); + } + + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return tokenizers::Tokenizer::FromBlobSentencePiece(content); +} + +OpReturnType TokenizersDataVariable::from_pretrained(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_pretrained expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_pretrained expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + LOG_TO_CLIENT_DEBUG("TokenizersDataVariable::from_pretrained: %s", path.c_str()); + try { + auto tokenizer = _create_tokenizer_from_file(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_file(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_file expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_file expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + + try { + auto tokenizer = _create_tokenizer_from_file(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_json(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_json expects 1 argument, got %zu", arguments.size()); + } + + auto json_var = arguments[0]; + + try { + std::unique_ptr tokenizer; + + if (json_var->get_containerType() == CONTAINERTYPE::MAP) { + // Handle MapDataVariable input + auto map_var = static_cast(json_var.get()); + tokenizer = _create_tokenizer_from_map(map_var); + } else if (json_var->get_containerType() == CONTAINERTYPE::SINGLE) { + // Handle string input (backward compatibility) + std::string json = static_cast*>(json_var.get())->get_string(); + tokenizer = _create_tokenizer_from_json(json); + } else { + THROW("from_json expects either a dictionary (MapDataVariable) or string argument"); + } + + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create tokenizer from JSON: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::from_sentencepiece(const std::vector& arguments, + CallStack& stack) { + if (arguments.size() != 1) { + THROW("from_sentencepiece expects 1 argument, got %zu", arguments.size()); + } + + auto path_var = arguments[0]; + if (path_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("from_sentencepiece expects string argument"); + } + + std::string path = static_cast*>(path_var.get())->get_string(); + + try { + auto tokenizer = _create_tokenizer_from_sentencepiece(path); + return OpReturnType(new TokenizersDataVariable(std::move(tokenizer))); + } catch (const std::exception& e) { + THROW("Failed to create SentencePiece tokenizer from %s: %s", path.c_str(), e.what()); + } +} + +OpReturnType TokenizersDataVariable::encode(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("encode expects 1 argument (text), got %zu", arguments.size()); + } + + auto text_var = arguments[0]; + if (text_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("encode expects string argument"); + } + + std::string text = static_cast*>(text_var.get())->get_string(); + + try { + std::vector token_ids = _tokenizer->Encode(text); + + // Create a tensor to return the token IDs + std::vector shape = {static_cast(token_ids.size())}; + auto result_tensor = TensorVariable::copy_tensor_from_raw_data(token_ids.data(), DATATYPE::INT32, shape); + return result_tensor; + } catch (const std::exception& e) { + THROW("Failed to encode text: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::decode(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("decode expects 1 argument (token_ids), got %zu", arguments.size()); + } + + auto ids_var = arguments[0]; + + auto tensor = std::dynamic_pointer_cast(ids_var); + if (tensor) { + if (ids_var->get_dataType_enum() != DATATYPE::INT32) { + THROW("decode expects INT32 tensor for ids"); + } + + // Use begin/end iterators to get data + std::vector token_ids(tensor->begin(), tensor->end()); + + try { + std::string decoded = _tokenizer->Decode(token_ids); + auto result = std::make_shared>(decoded); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to decode token IDs: %s", e.what()); + } + } else { + THROW("decode expects tensor of token IDs"); + } +} + +OpReturnType TokenizersDataVariable::get_vocab_size(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 0) { + THROW("get_vocab_size expects 0 arguments, got %zu", arguments.size()); + } + + try { + int64_t vocab_size = static_cast(_tokenizer->GetVocabSize()); + auto result = std::make_shared>(vocab_size); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get vocab size: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::token_to_id(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("token_to_id expects 1 argument (token), got %zu", arguments.size()); + } + + auto token_var = arguments[0]; + if (token_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("token_to_id expects string argument"); + } + + std::string token = static_cast*>(token_var.get())->get_string(); + + try { + int32_t token_id = _tokenizer->TokenToId(token); + auto result = std::make_shared>(token_id); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get token ID: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::id_to_token(const std::vector& arguments, + CallStack& stack) { + if (!_tokenizer) { + THROW("No tokenizer loaded. Use from_pretrained, from_file, from_json, or from_sentencepiece first."); + } + + if (arguments.size() != 1) { + THROW("id_to_token expects 1 argument (token_id), got %zu", arguments.size()); + } + + auto id_var = arguments[0]; + if (id_var->get_containerType() != CONTAINERTYPE::SINGLE) { + THROW("id_to_token expects int argument"); + } + + int32_t token_id = static_cast(static_cast*>(id_var.get())->get_int64()); + + try { + std::string token = _tokenizer->IdToToken(token_id); + auto result = std::make_shared>(token); + return OpReturnType(result); + } catch (const std::exception& e) { + THROW("Failed to get token: %s", e.what()); + } +} + +OpReturnType TokenizersDataVariable::call_function(int memberFuncIndex, + const std::vector& arguments, + CallStack& stack) { + switch (memberFuncIndex) { + case MemberFuncType::TOKENIZERS_FROM_PRETRAINED: + return from_pretrained(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_FILE: + return from_file(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_JSON: + return from_json(arguments, stack); + case MemberFuncType::TOKENIZERS_FROM_SENTENCEPIECE: + return from_sentencepiece(arguments, stack); + case MemberFuncType::TOKENIZERS_ENCODE: + return encode(arguments, stack); + case MemberFuncType::TOKENIZERS_DECODE: + return decode(arguments, stack); + case MemberFuncType::TOKENIZERS_GET_VOCAB_SIZE: + return get_vocab_size(arguments, stack); + case MemberFuncType::TOKENIZERS_TOKEN_TO_ID: + return token_to_id(arguments, stack); + case MemberFuncType::TOKENIZERS_ID_TO_TOKEN: + return id_to_token(arguments, stack); + default: + THROW("%s not implemented for tokenizers", DataVariable::get_member_func_string(memberFuncIndex)); + } +} diff --git a/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp b/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp index ab193db7..250b52f2 100644 --- a/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp +++ b/coreruntime/nimblenet/executors/onnx/include/task_onnx_model.hpp @@ -7,49 +7,39 @@ #pragma once #include "data_variable.hpp" +#include "map_data_variable.hpp" #include "nimble_net_util.hpp" #include "task_base_model.hpp" #include "tensor_data_variable.hpp" -/** - * @brief TaskONNXModel is a specialized implementation of TaskBaseModel - * that supports running ONNX models using ONNX Runtime when invoked from delitepy script. - */ -class TaskONNXModel : public TaskBaseModel { - private: - OrtAllocator* _allocator = nullptr; /**< Allocator used by ONNX Runtime */ - Ort::SessionOptions _sessionOptions; /**< Options to configure ONNX session */ - Ort::MemoryInfo _memoryInfo; /**< Memory info for tensor allocations */ - static Ort::Env _myEnv; /**< Static environment shared by all sessions */ - static Ort::ThreadingOptions tp; /**< Threading configuration */ - Ort::Session* _session = nullptr; /**< ONNX session handle */ - std::vector _inputNames; /**< Cached input names */ - std::vector _outputNames; /**< Cached output names */ +// Forward declarations for ONNX runtime +namespace Ort { +class Env; +class Session; +class SessionOptions; +class Value; +class AllocatorWithDefaultOptions; +class MemoryInfo; +} // namespace Ort - /** - * @brief Loads model metadata such as input/output names. - */ - void load_model_meta_data(); +class TaskONNXModel : public TaskBaseModel { + static Ort::Env _myEnv; /**< Global ONNX Runtime environment */ + Ort::Session* _session = nullptr; /**< ONNX Runtime session instance */ + Ort::SessionOptions _sessionOptions{}; /**< Session configuration options */ + std::vector _inputNames; /**< Model input tensor names */ + std::vector _outputNames; /**< Model output tensor names */ + OrtAllocator* _allocator = nullptr; /**< ONNX Runtime memory allocator */ + Ort::MemoryInfo _memoryInfo; /**< Memory information for tensor creation */ /** - * @brief Loads the model from the internal buffer. + * @brief Loads the model from the buffer into ONNX Runtime session. */ void load_model_from_buffer() override final; /** - * @brief Invokes inference using a vector of ONNX input tensors. - * - * @param ret Output structure to populate. - * @param inputTensors Prepared input tensors. - * @return status + * @brief Loads model metadata including input/output names. */ - int invoke_inference(OpReturnType& ret, - const std::vector& inputTensors) override final; - - int invoke_inference(InferenceReturn* ret) override final { - throw std::runtime_error( - "Invoke inference with InferenceReturn struct in model run from task is not implemented."); - } + void load_model_meta_data(); /** * @brief Creates an ONNX input tensor and sets the data pointer. @@ -115,6 +105,38 @@ class TaskONNXModel : public TaskBaseModel { const nlohmann::json& epConfig, const int epConfigVersion, CommandCenter* commandCenter, bool runDummyInference); + /** + * @brief Invokes inference using a vector of ONNX input tensors. + * + * @param ret Output structure to populate. + * @param inputTensors Prepared input tensors. + * @return status + */ + int invoke_inference(OpReturnType& ret, + const std::vector& inputTensors) override final; + + /** + * @brief Invokes inference using dictionary-based input/output (MapDataVariable interface). + * + * @param output_dict Dictionary to populate with named outputs. + * @param input_dict Dictionary containing named inputs. + * @return status + */ + int invoke_inference_dict(OpReturnType& output_dict, const OpReturnType& input_dict); + + /** + * @brief Converts tuple result to MapDataVariable format for named outputs. + * + * @param tuple_result Tuple result from standard inference. + * @return OpReturnType containing MapDataVariable with named outputs + */ + OpReturnType convert_tuple_to_dict(const OpReturnType& tuple_result); + + int invoke_inference(InferenceReturn* ret) override final { + throw std::runtime_error( + "Invoke inference with InferenceReturn struct in model run from task is not implemented."); + } + /** * @brief Returns input tensor names from the ONNX model. */ @@ -125,6 +147,30 @@ class TaskONNXModel : public TaskBaseModel { */ std::vector get_output_names() override { return _outputNames; } + /** + * @brief Returns input tensor names as string vector for dictionary usage. + */ + std::vector get_input_names_string() { + std::vector names; + names.reserve(_inputNames.size()); + for (const char* name : _inputNames) { + names.emplace_back(name); + } + return names; + } + + /** + * @brief Returns output tensor names as string vector for dictionary usage. + */ + std::vector get_output_names_string() { + std::vector names; + names.reserve(_outputNames.size()); + for (const char* name : _outputNames) { + names.emplace_back(name); + } + return names; + } + /** * @brief Destructor for TaskONNXModel. Cleans up session. */ diff --git a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp index dddc8d55..8c85d079 100644 --- a/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp +++ b/coreruntime/nimblenet/executors/onnx/src/task_onnx_model.cpp @@ -4,13 +4,56 @@ * SPDX-License-Identifier: Apache-2.0 */ +/* + * Dictionary-based interface usage examples using MapDataVariable: + * + * // Example 1: Using MapDataVariable interface for inference + * OpReturnType inputs = OpReturnType(new MapDataVariable()); + * OpReturnType outputs; + * auto input_map = std::dynamic_pointer_cast(inputs); + * + * // Prepare inputs + * input_map->set_value_in_map("input_ids", input_ids_tensor); + * input_map->set_value_in_map("attention_mask", attention_mask_tensor); + * input_map->set_value_in_map("position_ids", position_ids_tensor); + * + * // Add cache inputs + * for (int i = 0; i < num_layers; i++) { + * input_map->set_value_in_map("past_key_values." + std::to_string(i) + ".key", past_key_tensor); + * input_map->set_value_in_map("past_key_values." + std::to_string(i) + ".value", past_value_tensor); + * } + * + * // Run inference + * int result = model->invoke_inference_dict(outputs, inputs); + * auto output_map = std::dynamic_pointer_cast(outputs); + * + * // Access outputs by name + * auto logits = output_map->get_string_subscript("logits"); + * auto next_token = output_map->get_string_subscript("next_token_id"); + * auto is_eos = output_map->get_string_subscript("is_eos"); + * auto updated_attention = output_map->get_string_subscript("updated_attention_mask"); + * + * // Example 2: Converting from tuple result to MapDataVariable + * OpReturnType tuple_result; + * model->invoke_inference(tuple_result, input_tensors); + * + * OpReturnType output_dict = model->convert_tuple_to_dict(tuple_result); + * auto output_map = std::dynamic_pointer_cast(output_dict); + * + * // Now access outputs by name instead of position + * auto logits = output_map->get_string_subscript("logits"); + */ + #include "task_onnx_model.hpp" +#include #include "data_variable.hpp" +#include "map_data_variable.hpp" #include "nimble_net_util.hpp" #include "nimble_net/config.h" #include "onnx_operators.hpp" #include "tensor_data_variable.hpp" +#include "tuple_data_variable.hpp" #ifdef ORT_EXTENSIONS DELITEAI_EXTERN_C_BEGIN @@ -73,10 +116,37 @@ int TaskONNXModel::create_input_tensor_and_set_data_ptr(const OpReturnType req, delete[] strings; } else { int fieldSize = util::get_field_size_from_data_type(req->get_dataType_enum()); + + // Map DeliteAI DATATYPE to ONNX tensor element data type + ONNXTensorElementDataType onnxDataType; + switch (req->get_dataType_enum()) { + case DATATYPE::FLOAT: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + break; + case DATATYPE::FLOAT16: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + break; + case DATATYPE::DOUBLE: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; + break; + case DATATYPE::INT32: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; + break; + case DATATYPE::INT64: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + break; + case DATATYPE::BOOLEAN: + onnxDataType = ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL; + break; + default: + LOG_TO_CLIENT_ERROR("Unsupported data type %d for ONNX tensor creation", req->get_dataType_enum()); + return TERMINAL_ERROR; + } + inputTensor = Ort::Value::CreateTensor(_memoryInfo, req->get_raw_ptr(), fieldSize * req->get_numElements(), req->get_shape().data(), req->get_shape().size(), - (ONNXTensorElementDataType)req->get_dataType_enum()); + onnxDataType); } returnedInputTensor = std::move(inputTensor); return SUCCESS; @@ -123,16 +193,116 @@ int TaskONNXModel::invoke_inference(OpReturnType& ret, return SUCCESS; } +int TaskONNXModel::invoke_inference_dict(OpReturnType& output_dict, const OpReturnType& input_dict) { + try { + // Convert input MapDataVariable to vector format for existing inference + auto input_map = std::dynamic_pointer_cast(input_dict); + if (!input_map) { + LOG_TO_CLIENT_ERROR("Input is not a MapDataVariable for modelId=%s", _modelId.c_str()); + return TERMINAL_ERROR; + } + + std::vector inputTensors; + inputTensors.reserve(_inputNames.size()); + + for (size_t i = 0; i < _inputNames.size(); i++) { + std::string inputName(_inputNames[i]); + + try { + OpReturnType input_tensor = input_map->get_string_subscript(inputName); + Ort::Value inputTensor = Ort::Value{nullptr}; + int result = create_input_tensor_and_set_data_ptr(input_tensor, i, std::move(inputTensor)); + if (result != SUCCESS) { + return result; + } + inputTensors.push_back(std::move(inputTensor)); + } catch (...) { + LOG_TO_CLIENT_ERROR("Missing input tensor '%s' for modelId=%s", inputName.c_str(), _modelId.c_str()); + return TERMINAL_ERROR; + } + } + + // Run inference using existing method + std::vector output_onnx_tensors = + _session->Run(Ort::RunOptions{nullptr}, _inputNames.data(), inputTensors.data(), + _inputNames.size(), _outputNames.data(), _outputNames.size()); + + // Create output MapDataVariable + output_dict = OpReturnType(new MapDataVariable()); + auto output_map = std::dynamic_pointer_cast(output_dict); + + for (size_t i = 0; i < output_onnx_tensors.size(); i++) { + std::string outputName(_outputNames[i]); + OpReturnType tensor_var = get_tensor_variable_from_onnx_tensor(std::move(output_onnx_tensors[i])); + output_map->set_value_in_map(outputName, tensor_var); + } + + return SUCCESS; + } + catch (Ort::Exception& e) { + LOG_TO_CLIENT_ERROR("Exception in invoke_inference_dict:%s with errorCode:%d, for modelId=%s", + e.what(), e.GetOrtErrorCode(), _modelId.c_str()); + return TERMINAL_ERROR; + } + catch (...) { + LOG_TO_CLIENT_ERROR("Exception in invoke_inference_dict ONNXSessionRun for modelId=%s", + _modelId.c_str()); + return TERMINAL_ERROR; + } +} + +OpReturnType TaskONNXModel::convert_tuple_to_dict(const OpReturnType& tuple_result) { + try { + // Check if result is a TupleDataVariable + auto tuple_var = std::dynamic_pointer_cast(tuple_result); + if (!tuple_var) { + LOG_TO_CLIENT_ERROR("Result is not a TupleDataVariable for modelId=%s", _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } + + // Convert tuple elements to MapDataVariable using output names + auto tuple_elements = tuple_var->get_members(); + if (tuple_elements.size() != _outputNames.size()) { + LOG_TO_CLIENT_ERROR("Mismatch between output count (%zu) and expected names (%zu) for modelId=%s", + tuple_elements.size(), _outputNames.size(), _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } + + OpReturnType output_dict = OpReturnType(new MapDataVariable()); + auto output_map = std::dynamic_pointer_cast(output_dict); + + for (size_t i = 0; i < tuple_elements.size(); i++) { + std::string outputName(_outputNames[i]); + output_map->set_value_in_map(outputName, tuple_elements[i]); + } + + return output_dict; + } + catch (...) { + LOG_TO_CLIENT_ERROR("Exception in convert_tuple_to_dict for modelId=%s", _modelId.c_str()); + return OpReturnType(new NoneVariable()); + } +} + OpReturnType TaskONNXModel::get_tensor_variable_from_onnx_tensor(Ort::Value onnx_tensor) { Ort::TensorTypeAndShapeInfo tensor_info = onnx_tensor.GetTensorTypeAndShapeInfo(); - auto dataType = (DATATYPE)tensor_info.GetElementType(); - switch (dataType) { - case DATATYPE::FLOAT: - case DATATYPE::DOUBLE: - case DATATYPE::INT32: - case DATATYPE::INT64: - return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), dataType)); - case DATATYPE::STRING: { + ONNXTensorElementDataType onnxType = tensor_info.GetElementType(); + + // Handle ONNX data type to DATATYPE mapping + switch (onnxType) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::FLOAT)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::DOUBLE)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::INT32)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::INT64)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::BOOLEAN)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return OpReturnType(new OrtTensorVariable(std::move(onnx_tensor), DATATYPE::FLOAT16)); + case ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING: { std::vector strings; for (int i = 0; i < tensor_info.GetElementCount(); i++) { strings.push_back(onnx_tensor.GetStringTensorElement(i)); @@ -144,7 +314,7 @@ OpReturnType TaskONNXModel::get_tensor_variable_from_onnx_tensor(Ort::Value onnx default: LOG_TO_ERROR( "Requested data type = %d not supported when converting ONNX tensor to DataVariable.", - tensor_info.GetElementType()); + onnxType); THROW("%s", "Unsupported dataType returned from model."); } THROW("%s", "Unsupported dataType returned from model."); @@ -284,9 +454,9 @@ TaskONNXModel::TaskONNXModel(const std::string& plan, const std::string& version Ort::ThrowOnError(ortApi.GetAllocatorWithDefaultOptions(&_allocator)); initialize_model(); - if (_runDummyInference) { - run_dummy_inference(); - } + // if (_runDummyInference) { + // run_dummy_inference(); + // } } void TaskONNXModel::run_dummy_inference() { @@ -311,7 +481,8 @@ void TaskONNXModel::run_dummy_inference() { case DATATYPE::FLOAT: case DATATYPE::DOUBLE: case DATATYPE::INT32: - case DATATYPE::INT64: { + case DATATYPE::INT64: + case DATATYPE::FLOAT16: { OpReturnType req = OpReturnType(new TensorVariable(shape, static_cast(data_type))); create_input_tensor_and_set_data_ptr(req, i, std::move(inputTensor)); diff --git a/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp b/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp index e4e1b1b4..ee810a33 100644 --- a/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp +++ b/coreruntime/nimblenet/job_scheduler/include/internet_job.hpp @@ -107,5 +107,7 @@ typename Job::Status InternetJob::process() { case Status::COMPLETE: // Task completed successfully return Job::Status::COMPLETE; + default: + return Job::Status::RETRY; } } diff --git a/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp b/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp index 566b92f1..de38a1ce 100644 --- a/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp +++ b/coreruntime/nimblenet/resource_loader/src/resource_downloader.cpp @@ -63,6 +63,8 @@ InternetJob::Status ResourceDownloader::enqueue_download_asset( case FileDownloadStatus::DOWNLOAD_FAILURE: case FileDownloadStatus::DOWNLOAD_UNKNOWN: return InternetJob::Status::RETRY; + default: + return InternetJob::Status::RETRY; } }; @@ -96,5 +98,7 @@ std::optional ResourceDownloader::get_asset_offline(std::shared_ptr asset, case AssetType::LLM: return load_llm(asset); #endif // GENAI + default: + return nullptr; } } diff --git a/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp b/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp index e88ba13e..c8d6386e 100644 --- a/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp +++ b/coreruntime/nimblenet/task_manager/operators/include/binary_operators.hpp @@ -165,7 +165,7 @@ class BaseBinOp { * Ensures the result has the same sign as the divisor when possible. */ template >> + typename = std::enable_if_t>> struct ModOperator { /** * @brief Computes modulo operation with proper sign handling @@ -187,10 +187,10 @@ struct ModOperator { * @brief Template class for numeric binary operations * * Provides implementations of all binary operations (add, sub, mult, div, pow, mod) - * for numeric types (float, int32_t, double, int64_t). + * for numeric types (float, int32_t, double, int64_t, uint16_t). */ template >> + typename = std::enable_if_t>> class NumericBinOp : public BaseBinOp { public: /** @brief Adds two numeric values */ @@ -303,6 +303,10 @@ class BinaryOperators { NumericBinOp n; return n.perform_operation(v1, v2, opType); } + case DATATYPE::FLOAT16: { + NumericBinOp n; + return n.perform_operation(v1, v2, opType); + } case DATATYPE::INT32: { NumericBinOp n; return n.perform_operation(v1, v2, opType); diff --git a/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp b/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp index ed605060..2ec69c88 100644 --- a/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp +++ b/coreruntime/nimblenet/task_manager/operators/include/operator_types.hpp @@ -15,7 +15,7 @@ * * Compares two data types and returns the one with higher precedence * for automatic type promotion in operations. The precedence order is: - * BOOLEAN (0) < INT32 (3) < INT64 (4) < FLOAT (5) < DOUBLE (6) + * BOOLEAN (0) < INT32 (3) < INT64 (4) < FLOAT16 (4.5) < FLOAT (5) < DOUBLE (6) * * @param dataType1 First data type to compare * @param dataType2 Second data type to compare @@ -23,8 +23,8 @@ */ inline int get_max_dataType(int dataType1, int dataType2) { std::map _typeScore = { - {DATATYPE::BOOLEAN, 0}, {DATATYPE::INT32, 3}, {DATATYPE::INT64, 4}, - {DATATYPE::FLOAT, 5}, {DATATYPE::DOUBLE, 6}, + {DATATYPE::BOOLEAN, 0}, {DATATYPE::INT32, 3}, {DATATYPE::INT64, 4}, + {DATATYPE::FLOAT16, 45}, {DATATYPE::FLOAT, 5}, {DATATYPE::DOUBLE, 6}, }; if (_typeScore[dataType1] < _typeScore[dataType2]) { return dataType2; diff --git a/coreruntime/nimblenet/task_manager/task/include/statements.hpp b/coreruntime/nimblenet/task_manager/task/include/statements.hpp index 8afa8597..576f1644 100644 --- a/coreruntime/nimblenet/task_manager/task/include/statements.hpp +++ b/coreruntime/nimblenet/task_manager/task/include/statements.hpp @@ -12,6 +12,7 @@ #include "nimble_net_internal_data_variable.hpp" #include "node.hpp" #include "regex_data_variable.hpp" +#include "tokenizers_data_variable.hpp" class VariableScope; diff --git a/coreruntime/nimblenet/task_manager/task/src/statements.cpp b/coreruntime/nimblenet/task_manager/task/src/statements.cpp index 21829798..698f83fa 100644 --- a/coreruntime/nimblenet/task_manager/task/src/statements.cpp +++ b/coreruntime/nimblenet/task_manager/task/src/statements.cpp @@ -403,6 +403,9 @@ StatRetType* ImportStatement::execute(CallStack& stack) { stack.set_variable(stackLocation, OpReturnType(new RegexDataVariable())); } #endif + else if (importName == "tokenizers") { + stack.set_variable(stackLocation, OpReturnType(new TokenizersDataVariable())); + } else { THROW("Cannot import=%s from module=%s at lineno=%d", importName.c_str(), moduleName.c_str(), get_line()); diff --git a/coreruntime/nimblenet/time_manager/include/time_manager.hpp b/coreruntime/nimblenet/time_manager/include/time_manager.hpp index 8496fdf5..a8f6865d 100644 --- a/coreruntime/nimblenet/time_manager/include/time_manager.hpp +++ b/coreruntime/nimblenet/time_manager/include/time_manager.hpp @@ -326,7 +326,7 @@ class PeggedDeviceTime { /** * @brief Default constructor (null base device time). */ - constexpr PeggedDeviceTime() : _baseDeviceTime(DeviceTime::null) {} + PeggedDeviceTime() : _baseDeviceTime(DeviceTime::null) {} /** * @brief Compare for equality with another PeggedDeviceTime. diff --git a/coreruntime/nimblenet/util/include/util.hpp b/coreruntime/nimblenet/util/include/util.hpp index 7c6a4272..a886d35d 100644 --- a/coreruntime/nimblenet/util/include/util.hpp +++ b/coreruntime/nimblenet/util/include/util.hpp @@ -310,6 +310,8 @@ static inline int get_field_size_from_data_type(int dataType) { switch (dataType) { case DATATYPE::STRING: return 1; + case DATATYPE::FLOAT16: + return 2; case DATATYPE::FLOAT: case DATATYPE::INT32: return 4; @@ -423,6 +425,9 @@ auto call_function_for_dataType(Func func, DATATYPE dataType, Ts&&... ts) { return func(double{}, std::forward(ts)...); case DATATYPE::FLOAT: return func(float{}, std::forward(ts)...); + case DATATYPE::FLOAT16: + // Use uint16_t as the underlying representation for fp16 + return func(uint16_t{}, std::forward(ts)...); case DATATYPE::INT64: return func(int64_t{}, std::forward(ts)...); case DATATYPE::BOOLEAN: diff --git a/coreruntime/nimblenet/util/src/util.cpp b/coreruntime/nimblenet/util/src/util.cpp index e108a4fd..e71ca9de 100644 --- a/coreruntime/nimblenet/util/src/util.cpp +++ b/coreruntime/nimblenet/util/src/util.cpp @@ -31,6 +31,8 @@ const char* get_string_from_enum(int dataType) { return "None"; case DATATYPE::FLOAT: return "float"; + case DATATYPE::FLOAT16: + return "float16"; case DATATYPE::BOOLEAN: return "bool"; case DATATYPE::INT32: @@ -80,6 +82,7 @@ const char* get_string_from_enum(int dataType) { int get_enum_from_string(const char* type) { static std::map typeMap = {{"float", DATATYPE::FLOAT}, + {"float16", DATATYPE::FLOAT16}, {"double", DATATYPE::DOUBLE}, {"bool", DATATYPE::BOOLEAN}, {"int32", DATATYPE::INT32}, diff --git a/models/Qwen3-1.7B/demo_qwen.py b/models/Qwen3-1.7B/demo_qwen.py new file mode 100755 index 00000000..e6a7e6e7 --- /dev/null +++ b/models/Qwen3-1.7B/demo_qwen.py @@ -0,0 +1,513 @@ +#!/usr/bin/env python3 +#-*- coding: utf-8 -*- + +""" +Enhanced Qwen3-1.7B ONNX Demo with Tool Calling + +This demo uses a custom enhanced ONNX model with: +- Integrated ArgMax for token generation +- Built-in EOS detection +- Temperature scaling for language confusion mitigation +- Automatic cache management +- English-only output filtering + +The enhanced model is created by export.py and saved as model_enhanced.onnx +""" + +import json +import re +import sys +import os +from typing import List +from transformers import AutoConfig, AutoTokenizer +from tokenizers import Tokenizer +import onnxruntime +import numpy as np +from huggingface_hub import hf_hub_download +from jinja2 import Environment +import re + +# Add parent directory to path to import tools +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from tools import tools, tool_schema + +# Load Qwen3 1.7B 4-bit model and tokenizer +model_id = "onnx-community/Qwen3-1.7B-ONNX" + +TOOL_CALL_START_TOKEN = "" +TOOL_CALL_END_TOKEN = "" +TOOL_RESPONSE_START_TOKEN = "" +TOOL_RESPONSE_END_TOKEN = "" +INITIAL_PROMPT = f"""You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between {TOOL_CALL_START_TOKEN} and {TOOL_CALL_END_TOKEN} tokens. +Use this exact format: {TOOL_CALL_START_TOKEN}{{"name": "function_name", "arguments": {{"param": "value"}}}}{TOOL_CALL_END_TOKEN} +If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. +Always respond directly and call the appropriate tool when needed. +""" + +initial_message_block = [ + { + "role": "system", + "content": INITIAL_PROMPT + } +] + +config = AutoConfig.from_pretrained(model_id) +tokenizer = Tokenizer.from_pretrained(model_id) +chat_template = "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '' in message.content %}\n {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '\\n\\n\\n\\n' }}\n {%- endif %}\n{%- endif %}" + +# Use the enhanced ONNX model created by export.py +model_path = "./data/onnx/model_enhanced.onnx" + +if not os.path.exists(model_path): + print(f"❌ Enhanced model not found at {model_path}") + print("📝 Please run export.py first to create the enhanced model") + print("💡 Run: python export.py") + sys.exit(1) + +# Load the enhanced ONNX model with integrated generation capabilities +print(f"🚀 Loading ONNX model from {model_path}...") +session = onnxruntime.InferenceSession(model_path) + +print(f"✅ {model_id} model loaded successfully!") +print(f"✅ Model has {len(session.get_inputs())} inputs and {len(session.get_outputs())} outputs") +print(f"🚀 Features: Integrated ArgMax, EOS detection, temperature scaling, automatic cache updates") + +# Global variables for conversation state +conversation_state = { + "kv_cache": None, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] +} + +# Print model input/output info for debugging +print(f"\n📋 Model Inputs (first 5):") +for inp in session.get_inputs()[:5]: # Show first 5 to avoid spam + print(f" • {inp.name}: {inp.shape}") +if len(session.get_inputs()) > 5: + print(f" ... and {len(session.get_inputs()) - 5} more inputs") + +print(f"\n📋 Enhanced Model Outputs:") +for out in session.get_outputs()[:5]: + if not out.name.startswith('updated_past_key_values'): # Skip cache outputs to reduce spam + print(f" • {out.name}: {out.shape}") +if len(session.get_outputs()) > 5: + print(f" • ... and {len(session.get_outputs()) - 5} more outputs") + + +def execute_function_call(function_name: str, arguments: dict) -> dict: + """Execute a function call and return the result""" + if function_name not in tools: + return {"error": f"Function {function_name} not found"} + + try: + function = tools[function_name] # Direct access to function object + result = function(**arguments) + return result + except Exception as e: + return {"error": f"Error executing {function_name}: {str(e)}"} + +def format_tool_response(result: dict) -> str: + """Format tool execution result using token-based format""" + result_json = json.dumps(result) + return f"<|tool_response_start|>{result_json}<|tool_response_end|>" + +def execute_tool_call_with_response(function_name: str, arguments: dict) -> tuple: + """Execute a function call and return both result and formatted response""" + result = execute_function_call(function_name, arguments) + formatted_response = format_tool_response(result) + return result, formatted_response + +def initialize_conversation_state(): + """Initialize KV cache and conversation state once""" + global conversation_state + + # Set config values + num_key_value_heads = config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + num_hidden_layers = config.num_hidden_layers + hidden_size = config.hidden_size + batch_size = 1 # Single batch for conversation + + # Initialize KV cache + kv_cache = {} + + # Check if config has layer_types + if not hasattr(config, 'layer_types'): + config.layer_types = [ + "full_attention" + for _ in range(config.num_hidden_layers) + ] + + for i in range(num_hidden_layers): + if config.layer_types[i] == 'full_attention': + for kv in ('key', 'value'): + # Initialize with small valid tensor for first generation step + kv_cache[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 1, head_dim], dtype=np.float16) + elif config.layer_types[i] == 'conv': + kv_cache[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) + + # Initialize conversation state + conversation_state.update({ + "kv_cache": kv_cache, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] + }) + + print("✅ Conversation state and KV cache initialized") + +def reset_conversation_state(): + """Reset conversation state for a new conversation""" + global conversation_state + conversation_state.update({ + "kv_cache": None, + "attention_mask": None, + "position_ids": None, + "sequence_length": 0, + "conversation_history": [] + }) + print("🔄 Conversation state reset") + +def parse_tool_calls_from_response(response_text: str) -> list: + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + + # Method 2: Look for JSON-style tool calls: {"name": "func", "arguments": {...}} + # Using [\s\S] instead of re.DOTALL to match any character including newlines + json_tool_pattern = r'\s*({[\s\S]*?})\s*' + json_matches = re.findall(json_tool_pattern, response_text) + + for json_str in json_matches: + try: + tool_data = json.loads(json_str) + func_name = tool_data.get("name") + arguments = tool_data.get("arguments", {}) + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + print(f"✓ Parsed JSON tool call: {func_name}({arguments})") + except json.JSONDecodeError: + print(f"⚠ Failed to parse JSON tool call: {json_str}") + + return tool_calls + + +def render_jinja_template(messages, tools=None, add_generation_prompt=False, enable_thinking=True): + """Render the chat template using Jinja2""" + + # Create Jinja2 environment + env = Environment() + + # Add custom filters that might be used in the template + def tojson(obj): + return json.dumps(obj) + + env.filters['tojson'] = tojson + + # Parse the template + template = env.from_string(chat_template) + + # Render the template with the provided data + rendered = template.render( + messages=messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + enable_thinking=enable_thinking + ) + + return rendered + + +def apply_chat_template(messages, tool_schema, add_generation_prompt, tokenize, return_dict): + """Apply chat template using Jinja2 rendering""" + + # Use Jinja2 template renderer + text = render_jinja_template( + messages=messages, + tools=[tool["function"] for tool in tool_schema], + add_generation_prompt=add_generation_prompt, + enable_thinking=True + ) + print("---"*10) + print("Rendered Text:") + print(text) + print("---"*10) + if tokenize: + encoding = tokenizer.encode(text, add_special_tokens=False) + input_ids = np.array([encoding.ids], dtype=np.int64) + + if return_dict: + attention_mask = np.ones_like(input_ids, dtype=np.int64) + return { + "input_ids": input_ids, + "attention_mask": attention_mask + } + else: + return input_ids + else: + return text + +def generate_with_model(conversation_messages: List, max_new_tokens: int = 150) -> str: + """Generate text using full conversation processing (simplified approach)""" + print("---"*10) + print("Conversation Messages:") + print(json.dumps(conversation_messages, indent=4)) + print("---"*10) + + # Always process the full conversation - simpler and more reliable + inputs = apply_chat_template( + conversation_messages, + tool_schema=tool_schema, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + ) + + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + batch_size = input_ids.shape[0] + seq_len = input_ids.shape[1] + + # Create position IDs + position_ids = np.tile(np.arange(0, seq_len), (batch_size, 1)) + + # Set config values + num_key_value_heads = config.num_key_value_heads + head_dim = config.hidden_size // config.num_attention_heads + num_hidden_layers = config.num_hidden_layers + hidden_size = config.hidden_size + + # Initialize fresh KV cache for each generation + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids + } + + # Check if config has layer_types + if not hasattr(config, 'layer_types'): + config.layer_types = [ + "full_attention" + for _ in range(config.num_hidden_layers) + ] + + # Initialize KV cache + for i in range(num_hidden_layers): + if config.layer_types[i] == 'full_attention': + for kv in ('key', 'value'): + # Initialize with small valid tensor for first generation step + model_inputs[f'past_key_values.{i}.{kv}'] = np.zeros([batch_size, num_key_value_heads, 1, head_dim], dtype=np.float16) + elif config.layer_types[i] == 'conv': + model_inputs[f'past_conv.{i}'] = np.zeros([batch_size, hidden_size, config.conv_L_cache], dtype=np.float16) + + # Enhanced generation loop + generated_tokens = [] + + for i in range(max_new_tokens): + # Run the enhanced model + model_outputs = session.run(None, model_inputs) + + # Parse outputs + output_names = [output.name for output in session.get_outputs()] + outputs_dict = dict(zip(output_names, model_outputs)) + + # Check for EOS + if bool(outputs_dict['is_eos'][0, 0]): + break + + generated_tokens.append(int(outputs_dict['next_token_id'][0, 0])) + + # Update inputs for next iteration + model_inputs["input_ids"] = outputs_dict['next_token_id'] + model_inputs["attention_mask"] = outputs_dict['updated_attention_mask'] + + # For subsequent calls, we need only the last position + next_position_full = outputs_dict['next_position'] + last_position = next_position_full[:, -1:] + model_inputs["position_ids"] = last_position + + # Update cache using present outputs + for cache_key in list(model_inputs.keys()): + if cache_key.startswith('past_key_values.'): + parts = cache_key.split('.') + if len(parts) == 3: + layer_num = parts[1] + kv_type = parts[2] + present_key = f"present.{layer_num}.{kv_type}" + + if present_key in outputs_dict: + model_inputs[cache_key] = outputs_dict[present_key] + else: + print(f"⚠️ Warning: Expected cache output '{present_key}' not found") + elif cache_key.startswith('past_conv.'): + present_key = cache_key.replace("past_conv", "present_conv") + if present_key in outputs_dict: + model_inputs[cache_key] = outputs_dict[present_key] + + # Decode generated tokens + response = "" + if generated_tokens: + generated_tokens_array = np.array([generated_tokens], dtype=np.int64) + response = tokenizer.decode_batch(generated_tokens_array, skip_special_tokens=True)[0] + + return response.strip() + + +def handle_multi_step_request(user_prompt: str, max_steps: int, max_new_tokens: int) -> list: + """Handle requests that may require multiple tool calls and back and forth""" + step_results = [] + conversation_messages: List[dict] = [] # Initialize as empty list, not None + tool_context = {} # Store results from previous tool calls + + for step in range(max_steps): + print(f"\n--- Step {step + 1} ---") + if step == 0: + conversation_messages = initial_message_block.copy() + conversation_messages.append({ + "role": "user", + "content": user_prompt + }) + else: + conversation_messages.append({ + "role": "system", + "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." + }) + # Generate response + try: + response = generate_with_model(conversation_messages, max_new_tokens=max_new_tokens) + print(f"Model Response: {response}") + + # Parse and execute tool calls + tool_calls = parse_tool_calls_from_response(response) + tool_results = [] + + if tool_calls: + print(f"Executing {len(tool_calls)} tool call(s):") + for call in tool_calls: + func_name = call["function_name"] + arguments = call["arguments"] + + print(f" • {func_name}({arguments})") + result, formatted_response = execute_tool_call_with_response(func_name, arguments) + + # Store important results for future reference + if func_name == "get_current_location" and "location" in result: + tool_context["location"] = result["location"] + + tool_results.append({ + "function": func_name, + "arguments": arguments, + "result": result + }) + print(f" Result: {json.dumps(result, indent=4)}") + + # Add assistant response to conversation + conversation_messages.append({ + "role": "assistant", + "content": response + }) + + # Add tool results to conversation as function messages + for tool_result in tool_results: + if not tool_result["result"].get("error"): + conversation_messages.append({ + "role": "system", + "content": f"The result of the tool {tool_result['function']} is: {TOOL_RESPONSE_START_TOKEN}{json.dumps(tool_result['result'])}{TOOL_RESPONSE_END_TOKEN}" + }) + + # Store step result + step_result = { + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "response": response, + "tool_calls": tool_calls, + "tool_results": tool_results, + "has_errors": any("error" in result.get("result", {}) for result in tool_results), + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() + } + step_results.append(step_result) + + # Check if all tool calls were successful + if step_result["has_errors"]: + print(f"⚠ Stopping due to tool execution errors") + break + + # Simple continuation logic: if no tools were called, we're done + if not tool_calls: + print(f"✓ Completed after {step + 1} step(s) - no tool calls needed") + break + + # If we've reached max steps, stop + if step >= max_steps - 1: + print(f"✓ Reached maximum steps ({max_steps})") + break + + # If tools were executed, continue to next step to see if model wants to do more + print(f"✓ Step {step + 1} completed with {len(tool_calls)} tool call(s) - continuing...") + + except Exception as e: + print(f"Error in step {step + 1}: {e}") + step_results.append({ + "step": step + 1, + "prompt": user_prompt if step == 0 else "continuation", + "error": str(e), + "response": None, + "tool_calls": [], + "tool_results": [], + "tool_context": tool_context.copy(), + "conversation_messages": conversation_messages.copy() if conversation_messages else [] + }) + break + + return step_results + +def run_tool_calling_demo(): + """Run tool calling demonstration using the enhanced ONNX model""" + print("=== Qwen3 1.7B Enhanced ONNX Tool Calling Demo ===\n") + print(f"📦 Model: {model_id} (Enhanced)") + print(f"🚀 Enhanced Model Path: {model_path}") + print(f"✨ Features: ArgMax, EOS detection, temperature scaling, automatic cache updates") + print(f"🔧 Available tools: {list(tools.keys())}") + + demo_prompts = [ + "What's the weather here today?", + "Calculate 15 * 23", + "What time is it in JST timezone?", + "Where am I located?", + "Get my location and check the weather there" + ] + + for i, user_prompt in enumerate(demo_prompts, 1): + print(f"\n🎮 Demo {i}: {user_prompt}") + print("-" * 60) + step_results = handle_multi_step_request(user_prompt, max_steps=4, max_new_tokens=400) + # Show final summary + print(f"\n📋 Multi-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = step_result.get("tool_calls", []) + if tool_calls: + print(f" Step {step_num}: {len(tool_calls)} tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(f" ✓ {func_name}") + print("\n" + "="*60) + + +if __name__ == "__main__": + print("🔧 Enhanced Qwen3-1.7B ONNX Model Demo") + print("📝 Uses enhanced ONNX model with integrated generation enhancements") + print("🎯 Features: ArgMax, EOS detection, temperature scaling, automatic cache management") + print("🚀 Export: Custom enhanced model with language confusion mitigation") + print("📁 Model location: ./data/onnx/model_enhanced.onnx") + print("=" * 80) + + # Run the enhanced demo + run_tool_calling_demo() diff --git a/models/Qwen3-1.7B/export.py b/models/Qwen3-1.7B/export.py new file mode 100755 index 00000000..1fc70e01 --- /dev/null +++ b/models/Qwen3-1.7B/export.py @@ -0,0 +1,495 @@ +#!/usr/bin/env python3 +""" +Export script for Qwen3-1.7B Enhanced ONNX model with integrated generation logic. + +This script: +1. Downloads the base Qwen3-1.7B ONNX model +2. Enhances it with integrated generation logic (ArgMax, EOS detection, temperature scaling) +3. Saves the enhanced model as model_enhanced.onnx +""" + +import os +import onnx +import onnxruntime as ort +import numpy as np +from onnx import helper, TensorProto, ValueInfoProto +from pathlib import Path + +def download_base_model(): + """Download the base Qwen3-1.7B ONNX model.""" + model_id = "onnx-community/Qwen3-1.7B-ONNX" + output_dir = "./data/onnx" + + print(f"📥 Downloading base model: {model_id}") + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Download base model + from huggingface_hub import hf_hub_download + base_model_path = hf_hub_download(repo_id=model_id, filename="onnx/model_q4f16.onnx") + + # Copy to our directory structure + import shutil + local_model_path = os.path.join(output_dir, "model_base.onnx") + shutil.copy2(base_model_path, local_model_path) + + print(f"✅ Base model downloaded to {local_model_path}") + return local_model_path + +def load_and_analyze_model(model_path): + """Load the ONNX model and analyze its structure.""" + print(f"📋 Loading base ONNX model from {model_path}") + model = onnx.load(model_path) + + print(f"✅ Model loaded successfully!") + print(f"📊 Model has {len(model.graph.input)} inputs and {len(model.graph.output)} outputs") + + # Print opset information + print(f"\n🔧 Model opset information:") + for opset_import in model.opset_import: + domain = opset_import.domain or "ai.onnx" + print(f" • {domain}: opset {opset_import.version}") + + return model + +def add_argmax_node(model, temperature=0.3): + """Add argmax node to logits output for token generation with temperature scaling.""" + # Find logits output (usually the first output) + logits_output = model.graph.output[0] + + print(f"🎯 Adding ArgMax node with temperature {temperature} for output: {logits_output.name}") + + # Create argmax node that selects the token with highest probability from the last position + # First, slice the logits to get only the last position: [batch, seq, vocab] -> [batch, 1, vocab] + slice_starts = helper.make_node( + 'Constant', + inputs=[], + outputs=['last_pos_starts'], + value=helper.make_tensor( + name='last_pos_starts_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[-1] # Last position + ) + ) + + slice_ends = helper.make_node( + 'Constant', + inputs=[], + outputs=['last_pos_ends'], + value=helper.make_tensor( + name='last_pos_ends_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[2147483647] # Max int (until end) + ) + ) + + slice_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['last_pos_axes'], + value=helper.make_tensor( + name='last_pos_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Sequence dimension + ) + ) + + # Slice to get last position logits: [batch, seq, vocab] -> [batch, 1, vocab] + slice_last_logits = helper.make_node( + 'Slice', + inputs=[logits_output.name, 'last_pos_starts', 'last_pos_ends', 'last_pos_axes'], + outputs=['last_position_logits'] + ) + + # Squeeze to remove the sequence dimension: [batch, 1, vocab] -> [batch, vocab] + squeeze_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['squeeze_axes'], + value=helper.make_tensor( + name='squeeze_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Remove sequence dimension + ) + ) + + squeeze_logits = helper.make_node( + 'Squeeze', + inputs=['last_position_logits', 'squeeze_axes'], + outputs=['squeezed_logits'] + ) + + # Apply temperature scaling to reduce language confusion + temperature_constant = helper.make_node( + 'Constant', + inputs=[], + outputs=['temperature_constant'], + value=helper.make_tensor( + name='temperature_value', + data_type=TensorProto.FLOAT, + dims=[], + vals=[temperature] + ) + ) + + # Cast logits to float for temperature scaling + cast_to_float = helper.make_node( + 'Cast', + inputs=['squeezed_logits'], + outputs=['logits_float'], + to=TensorProto.FLOAT + ) + + # Apply temperature scaling: logits = logits / temperature + scaled_logits = helper.make_node( + 'Div', + inputs=['logits_float', 'temperature_constant'], + outputs=['temperature_scaled_logits'] + ) + + # Apply ArgMax to get the token ID: [batch, vocab] -> [batch] + argmax_node = helper.make_node( + 'ArgMax', + inputs=['temperature_scaled_logits'], + outputs=['token_id_batch_float'], + axis=1, # Along vocabulary dimension + keepdims=0 + ) + + # Cast back to int64 + cast_to_int = helper.make_node( + 'Cast', + inputs=['token_id_batch_float'], + outputs=['token_id_batch'], + to=TensorProto.INT64 + ) + + # Unsqueeze to make it [batch, 1] for consistency + unsqueeze_axes = helper.make_node( + 'Constant', + inputs=[], + outputs=['unsqueeze_axes'], + value=helper.make_tensor( + name='unsqueeze_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] # Add dimension at position 1 + ) + ) + + unsqueeze_token = helper.make_node( + 'Unsqueeze', + inputs=['token_id_batch', 'unsqueeze_axes'], + outputs=['next_token_id'] + ) + + # Create output info for next_token_id with dynamic batch size + next_token_output = helper.make_tensor_value_info( + 'next_token_id', + TensorProto.INT64, + [None, 1] # [dynamic_batch_size, 1] + ) + + # Add all nodes to graph + model.graph.node.extend([ + slice_starts, + slice_ends, + slice_axes, + slice_last_logits, + squeeze_axes, + squeeze_logits, + temperature_constant, + cast_to_float, + scaled_logits, + argmax_node, + cast_to_int, + unsqueeze_axes, + unsqueeze_token + ]) + + model.graph.output.append(next_token_output) + + print(f"✅ ArgMax node with temperature scaling ({temperature}) and correct output shape [1,1] added successfully") + return model + +def add_generation_logic_simple(model, eos_token_id=151645): + """Add generation logic with PROPER attention mask handling (fixed version).""" + print(f"🔄 Adding generation logic with EOS token ID: {eos_token_id}") + + # Create constant for EOS token as scalar - will broadcast to match next_token_id + eos_constant = helper.make_node( + 'Constant', + inputs=[], + outputs=['eos_token_constant'], + value=helper.make_tensor( + name='eos_token_value', + data_type=TensorProto.INT64, + dims=[], # Scalar - will broadcast to match next_token_id shape + vals=[eos_token_id] + ) + ) + + # Create equal node to check for EOS (comparing [1,1] tensors) + eos_check = helper.make_node( + 'Equal', + inputs=['next_token_id', 'eos_token_constant'], + outputs=['is_eos'] + ) + + # FIXED ATTENTION MASK LOGIC + # The key insight: we need to extend the CURRENT attention_mask, not concatenate with past + # Current attention_mask is the input for this generation step + # We extend it by 1 for the newly generated token + + # Get batch size from attention_mask shape + batch_shape = helper.make_node( + 'Shape', + inputs=['attention_mask'], + outputs=['attention_mask_shape'] + ) + + zero_index = helper.make_node( + 'Constant', + inputs=[], + outputs=['zero_index'], + value=helper.make_tensor( + name='zero_index_value', + data_type=TensorProto.INT64, + dims=[], + vals=[0] + ) + ) + + batch_size_scalar = helper.make_node( + 'Gather', + inputs=['attention_mask_shape', 'zero_index'], + outputs=['batch_size_scalar'] + ) + + zero_axis = helper.make_node( + 'Constant', + inputs=[], + outputs=['zero_axis'], + value=helper.make_tensor( + name='zero_axis_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[0] + ) + ) + + batch_size_unsqueeze = helper.make_node( + 'Unsqueeze', + inputs=['batch_size_scalar', 'zero_axis'], + outputs=['batch_size_1d'] + ) + + one_constant_1d = helper.make_node( + 'Constant', + inputs=[], + outputs=['one_constant_1d'], + value=helper.make_tensor( + name='one_constant_1d_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + ones_shape = helper.make_node( + 'Concat', + inputs=['batch_size_1d', 'one_constant_1d'], + outputs=['ones_shape_tensor'], + axis=0 + ) + + ones_tensor = helper.make_node( + 'ConstantOfShape', + inputs=['ones_shape_tensor'], + outputs=['ones_tensor'], + value=helper.make_tensor( + name='ones_fill_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + # CORRECT: Extend current attention_mask with one new token + # This grows linearly: [1,1,1] -> [1,1,1,1] -> [1,1,1,1,1] + updated_attention_mask = helper.make_node( + 'Concat', + inputs=['attention_mask', 'ones_tensor'], + outputs=['updated_attention_mask'], + axis=-1 + ) + + # Add position increment logic (simplified) + pos_slice_starts = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_starts'], + value=helper.make_tensor( + name='pos_starts_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[-1] + ) + ) + + pos_slice_ends = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_ends'], + value=helper.make_tensor( + name='pos_ends_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[2147483647] + ) + ) + + slice_axes_pos = helper.make_node( + 'Constant', + inputs=[], + outputs=['pos_slice_axes'], + value=helper.make_tensor( + name='pos_axes_value', + data_type=TensorProto.INT64, + dims=[1], + vals=[1] + ) + ) + + slice_position = helper.make_node( + 'Slice', + inputs=['position_ids', 'pos_slice_starts', 'pos_slice_ends', 'pos_slice_axes'], + outputs=['last_position'] + ) + + add_one = helper.make_node( + 'Add', + inputs=['last_position', 'one_constant_1d'], + outputs=['next_position'] + ) + + # Add all nodes to graph + model.graph.node.extend([ + eos_constant, + eos_check, + batch_shape, + zero_index, + batch_size_scalar, + zero_axis, + batch_size_unsqueeze, + one_constant_1d, + ones_shape, + ones_tensor, + updated_attention_mask, + pos_slice_starts, + pos_slice_ends, + slice_axes_pos, + slice_position, + add_one + ]) + + # Add output tensors + outputs_to_add = [ + helper.make_tensor_value_info('is_eos', TensorProto.BOOL, [None, 1]), + helper.make_tensor_value_info('updated_attention_mask', TensorProto.INT64, [None, None]), + helper.make_tensor_value_info('next_position', TensorProto.INT64, [None, 1]) + ] + + model.graph.output.extend(outputs_to_add) + + print("✅ Generation logic with FIXED attention mask handling added successfully") + return model + +def save_enhanced_model(model, output_path="./data/onnx/model_enhanced.onnx"): + """Save the enhanced ONNX model.""" + print(f"💾 Saving enhanced model to {output_path}") + + # Create output directory + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Validate model + try: + onnx.checker.check_model(model) + print("✅ Model validation passed") + except Exception as e: + print(f"⚠️ Model validation warning: {e}") + print("🔄 Proceeding with save anyway...") + + # Save model + onnx.save(model, output_path) + print(f"✅ Enhanced model saved successfully!") + + # Test with ONNX Runtime + try: + session = ort.InferenceSession(output_path) + print(f"✅ ONNX Runtime validation passed!") + print(f"📊 Enhanced model: {len(session.get_inputs())} inputs, {len(session.get_outputs())} outputs") + + # Print enhanced outputs + print(f"\n🚀 Enhanced model outputs:") + for output in session.get_outputs(): + if not output.name.startswith('updated_past_key_values'): # Skip cache outputs to reduce spam + print(f" • {output.name}: {output.shape}") + cache_outputs = [out for out in session.get_outputs() if out.name.startswith('updated_past_key_values')] + if cache_outputs: + print(f" • ... and {len(cache_outputs)} cache outputs") + + except Exception as e: + print(f"⚠️ ONNX Runtime warning: {e}") + print("🔄 Model saved but may need specific execution providers") + +def main(): + """Main export function - creates only the enhanced model.""" + print("=" * 70) + print("🚀 Qwen3-1.7B Enhanced ONNX Model Export") + print("=" * 70) + + try: + # Step 1: Download base model + base_model_path = download_base_model() + + # Step 2: Load and analyze the model + model = load_and_analyze_model(base_model_path) + + # Step 3: Add argmax node with temperature scaling + model = add_argmax_node(model, temperature=0.8) + + # Step 4: Add generation logic + model = add_generation_logic_simple(model) + + # Step 5: Save enhanced model + save_enhanced_model(model) + + print("\n" + "=" * 70) + print("🎉 Enhanced model export completed successfully!") + print("=" * 70) + print("\n📋 Enhanced model features:") + print("✅ Integrated ArgMax with temperature scaling (0.3)") + print("✅ Built-in EOS detection") + print("✅ Automatic attention mask updates") + print("✅ Automatic position ID updates") + print("✅ Proper cache management") + print("✅ Dynamic batch size support") + + print(f"\n📁 Enhanced model saved to: ./data/onnx/model_enhanced.onnx") + print("🚀 Ready to use with improved generation capabilities!") + + except Exception as e: + print(f"❌ Export failed: {e}") + import traceback + traceback.print_exc() + raise + +if __name__ == "__main__": + main() + diff --git a/nimblenet_py/simulation_assets/dict_model_test.py b/nimblenet_py/simulation_assets/dict_model_test.py new file mode 100644 index 00000000..b7cf4f41 --- /dev/null +++ b/nimblenet_py/simulation_assets/dict_model_test.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Dictionary interface test with actual ONNX model inference. + +This script tests both traditional tensor interface and new dictionary interface +with a proper add/subtract ONNX model that uses supported float32 data types. +""" + +from delitepy import nimblenet as nm + +# Load model at global scope as required by DeliteAI simulator +model = nm.Model("test_model") + +def test_tensor_interface(input_data): + """Test the traditional tensor-based model interface with actual inference.""" + try: + # Check model status + status = model.status() + + # Test with actual model.run() call + # Create test inputs: X=3.0, Y=2.0 (expected: sum=5.0, diff=1.0) + X_tensor = nm.tensor([[3.0]], "float") + Y_tensor = nm.tensor([[2.0]], "float") + + # Run model with traditional tensor interface + output = model.run(X_tensor, Y_tensor) + + # Extract results + sum_result = None + diff_result = None + inference_successful = False + + # Check if output exists and has elements (avoiding != None comparison) + if output: + if len(output) >= 2: + sum_result = output[0] + diff_result = output[1] + inference_successful = True + + return { + "status": "success", + "model_loaded": status, + "inference_successful": inference_successful, + "interface_type": "tensor", + "sum_output": sum_result, + "diff_output": diff_result, + "message": "Traditional tensor interface with actual inference" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +def test_dictionary_interface(input_data): + """Test the new dictionary-based model interface with actual inference.""" + try: + # Check model status + status = model.status() + + # Test with actual model.run() call using dictionary + # Create test inputs: X=5.0, Y=3.0 (expected: sum=8.0, diff=2.0) + X_tensor = nm.tensor([[5.0]], "float") + Y_tensor = nm.tensor([[3.0]], "float") + + # Create input dictionary for new interface + input_dict = {"X": X_tensor, "Y": Y_tensor} + + # Run model with dictionary interface + output_dict = model.run(input_dict) + + # Extract results by name + sum_result = None + diff_result = None + inference_successful = False + + # Check if output exists (avoiding != None comparison) + if output_dict: + try: + # Try to access outputs by name (this is the key test!) + sum_result = output_dict["sum"] + diff_result = output_dict["difference"] + inference_successful = True + except Exception as access_error: + # If named access fails, try positional access as fallback + try: + if len(output_dict) >= 2: + sum_result = output_dict[0] + diff_result = output_dict[1] + inference_successful = True + except Exception as pos_error: + # Positional access also failed + inference_successful = False + + return { + "status": "success", + "model_loaded": status, + "inference_successful": inference_successful, + "interface_type": "dictionary", + "sum_output": sum_result, + "diff_output": diff_result, + "message": "Dictionary interface with actual inference" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +def test_interface_equivalence(input_data): + """Test that both interfaces produce equivalent results.""" + try: + # Check model status + status = model.status() + + # Test both interfaces with same inputs: X=4.0, Y=1.0 (expected: sum=5.0, diff=3.0) + X_tensor = nm.tensor([[4.0]], "float") + Y_tensor = nm.tensor([[1.0]], "float") + + # Test traditional interface + tensor_output = model.run(X_tensor, Y_tensor) + print("Tensor output: ",tensor_output[0][0][0]) + + input_dict = {"X": X_tensor, "Y": Y_tensor} + dict_output = model.run(input_dict) + print("Dict output: ",dict_output["sum"][0][0]) + + first_output_match = tensor_output[0][0][0] == dict_output["sum"][0][0] + second_output_match = tensor_output[1][0][0] == dict_output["difference"][0][0] + + return { + "status": "success", + "model_loaded": status, + "both_interfaces_equivalent": first_output_match and second_output_match, + "message": "Interface equivalence with actual inference tested" + } + + except Exception as e: + return { + "status": "error", + "error": str(e) + } diff --git a/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md b/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md new file mode 100644 index 00000000..d5ac40b7 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/MINIMAL_PYTHON_CONSTRAINTS.md @@ -0,0 +1,317 @@ +# Minimal Python Constraints for DeliteAI Simulator + +This document outlines all the constraints and limitations when writing Python code for the DeliteAI simulator's minimal Python implementation. These constraints were discovered while building a Qwen tokenizer for the simulator environment. + +## Table of Contents +1. [Built-in Functions Not Available](#built-in-functions-not-available) +2. [Language Features Not Supported](#language-features-not-supported) +3. [Standard Library Limitations](#standard-library-limitations) +4. [Function Definition Constraints](#function-definition-constraints) +5. [Data Structure Limitations](#data-structure-limitations) +6. [Control Flow Restrictions](#control-flow-restrictions) +7. [String Handling](#string-handling) +8. [Import Restrictions](#import-restrictions) +9. [Best Practices](#best-practices) + +## Built-in Functions Not Available + +The following built-in functions are NOT available in the minimal Python environment: + +- `ord()` - Cannot convert characters to Unicode code points +- `chr()` - Cannot convert Unicode code points to characters +- `isinstance()` - Cannot check object types +- `hasattr()` - Cannot check if object has attribute +- `setattr()` - Cannot dynamically set attributes +- `getattr()` - Limited or not available +- `enumerate()` - Cannot enumerate with index +- `zip()` - Cannot zip iterables together +- `round()` - Cannot round numbers +- `eval()` - Cannot evaluate strings as code +- `exec()` - Cannot execute dynamic code +- `compile()` - Cannot compile code +- `globals()` / `locals()` - Cannot access namespaces +- `vars()` - Cannot get object's __dict__ +- `dir()` - Cannot list attributes +- `help()` - No interactive help +- `input()` - No user input +- `open()` - File operations limited or unavailable + +## Language Features Not Supported + +### 1. Function Definitions +- **NO default parameter values**: Cannot use `def func(param=default)` +- **NO *args or **kwargs**: Cannot use variable arguments +- **NO keyword-only arguments**: Cannot use `def func(*, kwonly)` +- **NO decorators**: Cannot use `@decorator` syntax +- **NO lambda functions**: Cannot use `lambda x: x + 1` + +### 2. Operators and Expressions +- **NO ternary operators**: Cannot use `x if condition else y` +- **NO walrus operator**: Cannot use `:=` +- **NO unpacking with `*`**: Cannot use `first, *rest = items` +- **NO `**` for kwargs**: Cannot use `func(**dict)` +- **NO f-strings**: Cannot use `f"Hello {name}"` + +### 3. Comparisons +- **NO `is` / `is not`**: Must use `==` / `!=` instead +- Be careful with None comparisons: use `== None` not `is None` + +### 4. Comprehensions and Generators +- List comprehensions work but with limitations +- **NO generator expressions**: Cannot use `(x for x in items)` +- **NO dict/set comprehensions**: Limited support + +## Standard Library Limitations + +The following standard library modules are NOT available: +- `os` - No operating system interface +- `sys` - Limited or no system-specific parameters +- `json` - No JSON parsing/serialization +- `re` - Use `delitepy.ne_re` instead (with limitations) +- `datetime` - No date/time handling +- `time` - No time functions +- `unicodedata` - No Unicode database +- `functools` - No functional programming tools +- `itertools` - No iteration tools +- `collections` - No specialized containers +- `dataclasses` - No dataclass decorator +- `typing` - No type hints +- `pathlib` - No path handling +- `urllib` - No URL handling +- `subprocess` - No subprocess execution + +## Function Definition Constraints + +### Correct Way: +```python +def my_function(param1, param2): + """Function with all parameters required""" + return param1 + param2 +``` + +### Incorrect Ways: +```python +# NO default values +def my_function(param1, param2="default"): # ❌ + pass + +# NO *args +def my_function(*args): # ❌ + pass + +# NO **kwargs +def my_function(**kwargs): # ❌ + pass + +# NO decorators +@decorator # ❌ +def my_function(): + pass +``` + +## Data Structure Limitations + +### Built-in Constructors +- `list()` - NOT available, use `[]` +- `dict()` - NOT available, use `{}` +- `set()` - NOT available +- `tuple()` - Limited availability +- `range()` - Available but use carefully +- `bytes()` / `bytearray()` - NOT available + +### Dictionary Methods +- `.get(key, default)` - NOT available, use: + ```python + # Instead of: value = dict.get(key, default) + if key in dict: + value = dict[key] + else: + value = default + ``` + +### List Methods +- Most basic methods work: `.append()`, `.extend()`, `.pop()` +- Be careful with advanced methods + +## Control Flow Restrictions + +### Conditionals +```python +# Correct +if condition: + do_something() +else: + do_other() + +# Incorrect - NO ternary +value = x if condition else y # ❌ + +# Must use: +if condition: + value = x +else: + value = y +``` + +### Loops +```python +# Correct - simple for loop +for item in items: + process(item) + +# Incorrect - NO enumerate +for i, item in enumerate(items): # ❌ + process(i, item) + +# Must use: +i = 0 +for item in items: + process(i, item) + i = i + 1 +``` + +### Exception Handling +- Basic try/except works +- Avoid complex exception handling +- Don't reuse exception variable names in nested blocks + +## String Handling + +### String Formatting +```python +# NO f-strings +text = f"Hello {name}" # ❌ + +# Use concatenation +text = "Hello " + name # ✓ + +# Or format with str() +text = "Value: " + str(number) # ✓ +``` + +### String Methods +- Basic methods work: `.strip()`, `.split()`, `.join()` +- No `.format()` method +- No `%` formatting + +## Import Restrictions + +### Local Imports +- Only support: `from module import item` +- NO dot notation: `import module.submodule` ❌ +- NO aliasing might be limited: `import module as m` ⚠️ + +### Example: +```python +# Correct +from delitepy import nimblenet +from delitepy import ne_re + +# Incorrect +import delitepy.nimblenet # ❌ +import os # ❌ (not available) +``` + +## Best Practices + +### 1. Variable Initialization +Always initialize variables before use: +```python +# Good +result = None +for item in items: + result = process(item) + +# Bad - result might be undefined +for item in items: + result = process(item) # ❌ if items is empty +``` + +### 2. Type Checking +Since `isinstance()` is not available: +```python +# Cannot do: +if isinstance(obj, str): # ❌ + pass + +# Try alternative approaches or avoid type checking +``` + +### 3. Simplify Logic +- Avoid complex expressions +- Break down operations into simple steps +- Use explicit if/else instead of clever shortcuts + +### 4. Manual Implementations +Many built-in functions need manual implementation: +```python +# Manual enumerate +i = 0 +for item in items: + # Use i as index + i = i + 1 + +# Manual round (to 1 decimal) +value = int(number * 10) / 10.0 + +# Manual zip (for two lists) +result = [] +for i in range(len(list1)): + if i < len(list2): + result.append((list1[i], list2[i])) +``` + +### 5. Error Handling +- Always provide fallbacks +- Initialize variables properly +- Check for None/empty conditions explicitly + +## Example: Minimal Tokenizer Structure + +```python +# Minimal tokenizer compatible with all constraints +from delitepy import ne_re + +class MinimalTokenizer: + def __init__(self, vocab): + self.vocab = vocab + self.reverse_vocab = {} + for k, v in vocab.items(): + self.reverse_vocab[v] = k + + def tokenize(self, text): + # Simple tokenization with ne_re + pattern = r"[a-zA-Z]+|[0-9]+|[^\sa-zA-Z0-9]+" + matches = ne_re.findall(pattern, text) + if matches == None: + return [] + return matches + + def encode(self, text): + tokens = self.tokenize(text) + ids = [] + for token in tokens: + if token in self.vocab: + ids.append(self.vocab[token]) + else: + ids.append(0) # Unknown token ID + return ids +``` + +## Testing in Simulator + +When testing your code: +1. Start with the simplest possible implementation +2. Add features incrementally +3. Test each constraint violation separately +4. Keep functions small and focused +5. Avoid deeply nested structures + +## Summary + +The minimal Python environment is extremely limited compared to standard Python. When in doubt: +- Use the simplest possible approach +- Avoid advanced Python features +- Implement functionality manually +- Test incrementally +- Keep code explicit and straightforward \ No newline at end of file diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py new file mode 100644 index 00000000..439c2289 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/generation_mixin.py @@ -0,0 +1,209 @@ +from delitepy import nimblenet as nm +from delitepy import ne_re as re +from delitepy import tokenizers +from tools import get_tool_schema + + +def render_jinja_template(messages, tool_dict, add_generation_prompt, enable_thinking): + """Render the chat template using hardcoded string structure""" + result = "" + content_messages = messages + # If we have tools, build the system message with tools + if len(messages) > 0 and (messages[0]["role"] == "system") and len(tool_dict) > 0: + result = result + "<|im_start|>system\n" + messages[0]["content"] + "\n\n" + + result = result + "# Tools\n\n" + result = result + "You may call one or more functions to assist with the user query.\n\n" + result = result + "You are provided with function signatures within XML tags:\n" + result = result + "\n" + + # Add each tool as JSON + for t in tool_dict: + result = result + str(t) + "\n" + + result = result + "\n\n" + result = result + "For each function call, return a json object with function name and arguments within XML tags:\n" + result = result + "\n" + result = result + "{\"name\": , \"arguments\": }\n" + result = result + "<|im_end|>\n" + content_messages = messages[1:] + + for i in range(len(content_messages)): + message = content_messages[i] + role = message["role"] + content = message["content"] + result = result + "<|im_start|>" + role + "\n" + content + "<|im_end|>\n" + + # Add generation prompt if requested + if add_generation_prompt: + result = result + "<|im_start|>assistant\n" + if not enable_thinking: + result = result + "\n\n\n\n" + + return result + +def apply_chat_template(messages, tls, add_generation_prompt, tokenizer, return_dict, last_position): + """Apply chat template using Jinja2 rendering""" + + # Use Jinja2 template renderer + text = render_jinja_template( + messages, + [tls_dict["function"] for tls_dict in tls], + add_generation_prompt, + True + ) + token_ids = tokenizer.encode(text) + input_ids = nm.tensor([token_ids], "int64") + + if return_dict: + attention_mask = nm.tensor([[1 for _ in range(last_position + len(token_ids))]], "int64") + return { + "input_ids": input_ids, + "attention_mask": attention_mask + } + else: + return input_ids + + +class QwenKVCache: + def __init__(self, generation_config, dtype, batch_size): + self.batch_size = batch_size + self.kv_cache = {} + self.num_hidden_layers = generation_config["num_hidden_layers"] + self.num_key_value_heads = generation_config["num_key_value_heads"] + self.head_dim = int(generation_config["hidden_size"] / generation_config["num_attention_heads"]) + self.hidden_size = generation_config["hidden_size"] + self.dtype = dtype + + # Initialize KV cache for all layers + for i in range(self.num_hidden_layers): + for kv in ('key', 'value'): + self.kv_cache['past_key_values.'+str(i)+'.'+kv] = nm.zeros([self.batch_size, self.num_key_value_heads, 1, self.head_dim], self.dtype) + + def get(self): + return self.kv_cache + + def update(self, model_inputs, model_outputs): + # Update with new model inputs (input_ids, attention_mask, position_ids) + for key in model_inputs.keys(): + self.kv_cache[key] = model_inputs[key] + + if model_outputs: + # Update KV cache states from present outputs + for cache_key in self.kv_cache.keys(): + if 'past_key_values' in cache_key: + splits = re.split(r'\.', cache_key) + present_key = "present."+splits[1]+"."+splits[2] + if present_key in model_outputs.keys(): + self.kv_cache[cache_key] = model_outputs[present_key] + else: + print("⚠️ Warning: Expected cache output "+present_key+" not found") + + def clear(self): + self.kv_cache = {} + for i in range(self.num_hidden_layers): + for kv in ('key', 'value'): + self.kv_cache['past_key_values.'+str(i)+'.'+kv] = nm.zeros([self.batch_size, self.num_key_value_heads, 1, self.head_dim], self.dtype) + +class QwenGenerationMixin: + def __init__(self, initial_prompt, tokenizer_config, generation_config, dtype, max_new_tokens, batch_size): + self.initial_prompt = initial_prompt + self.tokenizer = tokenizers.from_json(tokenizer_config) + self.generation_config = generation_config + self.eos_token_id = generation_config["eos_token_id"] + self.max_new_tokens = max_new_tokens + self.history = [ + { + "role": "system", + "content": initial_prompt + }] + self.cache_index = 0 + self.last_position = 0 + self.kv_cache = QwenKVCache(generation_config, dtype, batch_size) + self.current_token_stream = [] + + def add_message(self, message): + self.history.append(message) + + def get_history(self): + return self.history + + def build_model_inputs(self): + print("Conversation Messages: " + str(self.history[self.cache_index:])) + print("--------------------------------") + tool_list = [] + if self.cache_index == 0: + tool_list = get_tool_schema() + + inputs = apply_chat_template( + self.history[self.cache_index:], + tool_list, + True, + self.tokenizer, + True, + self.last_position + ) + model_inputs = { + "input_ids": inputs['input_ids'], + "attention_mask": inputs['attention_mask'], + "position_ids": nm.tensor([[i+self.last_position for i in range(inputs['input_ids'].shape()[1])]], "int64") + } + self.kv_cache.update(model_inputs, None) + + def get_model_inputs(self): + return self.kv_cache.get() + + def update_cache(self, model_outputs, output_stream_callback): + # Update like the original demo - use model outputs directly + next_token_input = {"input_ids": model_outputs["next_token_id"]} + + # Use the model's updated_attention_mask output (linear growth, not exponential) + next_token_input["attention_mask"] = model_outputs["updated_attention_mask"] + + # Use the model's next_position output + next_token_input["position_ids"] = model_outputs["next_position"] + + # Update last_position for tracking + self.last_position = model_outputs["next_position"][0][0] + + self.kv_cache.update(next_token_input, model_outputs) + self.add_to_token_stream(next_token_input["input_ids"][0][0], output_stream_callback) + + def reset(self): + self.history = [ + { + "role": "system", + "content": self.initial_prompt + }] + self.kv_cache.clear() # This now properly resets past_attention_mask too + self.cache_index = 0 + self.last_position = 0 + + def get_decoded_response(self): + response = self.tokenizer.decode(nm.tensor(self.current_token_stream, "int32")) + self.current_token_stream = [] + return response.strip() + + def add_to_token_stream(self, token_id, output_stream_callback): + self.current_token_stream.append(token_id) + output_stream_callback({"token_stream": self.tokenizer.decode(nm.tensor([token_id], "int32"))}) + + def generate(self, model, output_stream_callback): + self.build_model_inputs() + # 3. Generation loop - now with proper attention mask handling + for iteration in range(self.max_new_tokens): + model_outputs = model.run(self.get_model_inputs()) + self.update_cache(model_outputs, output_stream_callback) + # Check for EOS token + is_eos = model_outputs["is_eos"][0][0] + if is_eos: + print("🛑 EOS token detected at iteration " + str(iteration + 1)) + break + response = self.get_decoded_response() + self.add_message({ + "role": "assistant", + "content": response + }) + print("Model Response: " + str(self.history[-1])) + self.cache_index = len(self.history) + return response diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py new file mode 100644 index 00000000..c042ceed --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/main.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +#-*- coding: utf-8 -*- +from delitepy import nimblenet as nm + +from tools import get_tool_results +from tools import print_available_tools +from generation_mixin import QwenGenerationMixin + +# Print available tools after successful import +print_available_tools() + +# Constants only - avoid complex global variable assignments +MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX" +MODEL_NAME = "qwen3_1_7b_onnx" + +# Model must be loaded in global scope as required by DeliteAI simulator +qwenModel = nm.Model(MODEL_NAME) +generationMixinQwen = None +print("Model loaded successfully") + +INITIAL_PROMPT = """You are a helpful assistant with access to tools. When you need to use a tool, format your response with JSON between and tokens. + +Use this exact format: {"name": "function_name", "arguments": {"param": "value"}} +If a tool requires a argument you don't know the value of check if another tool can give you that information and call that tool first. +Always respond directly and call the appropriate tool when needed.""" + +@concurrent +def handle_multi_step_request(user_prompt, max_steps, generation_mixin, output_stream_callback): + """Handle requests that may require multiple tool calls and back and forth""" + step_results = [] + + for step in range(max_steps): + print("\n--- Step " + str(step + 1) + " ---") + if step == 0: + generation_mixin.add_message({ + "role": "user", + "content": user_prompt + }) + else: + generation_mixin.add_message({ + "role": "system", + "content": "Now use the result from the tool calls to answer the user's question. Call another tool if needed." + }) + # Generate response + try: + response = generation_mixin.generate(qwenModel, output_stream_callback) + + # Parse and execute tool calls + tool_results = get_tool_results(response) + has_errors = False + # Add tool results to conversation as function messages + for tool_result in tool_results: + if "error" not in tool_result.keys(): + generation_mixin.add_message(tool_result) + else: + has_errors = True + + prompt = "continuation" + if step == 0: + prompt = user_prompt + # Store step result + step_result = { + "step": step + 1, + "prompt": prompt, + "response": response, + "tool_results": tool_results, + "has_errors": has_errors, + "conversation_history": generation_mixin.get_history() + } + step_results.append(step_result) + + if len(tool_results) == 0 or has_errors or step >= max_steps - 1: + print("✓ Completed after "+str(step + 1)+" step(s) with "+str(len(tool_results))+" tool call(s) and has_errors = "+str(has_errors)) + break + + except Exception as e: + print("Error in step "+str(step + 1)+": "+str(e)) + prompt_text = "" + if step == 0: + prompt_text = user_prompt + else: + prompt_text = "continuation" + step_results.append({ + "step": step + 1, + "prompt": prompt_text, + "error": str(e), + "response": None, + "tool_calls": [], + "tool_results": [], + "conversation_history": generation_mixin.get_history() + }) + break + + return step_results + +@concurrent +def init_generation_mixin(input): + generationMixinQwen = QwenGenerationMixin( + INITIAL_PROMPT, + input["tokenizer_config"], + input["generation_config"], + "float16", + 400, + 1 + ) + return {"success": True} + +@concurrent +def prompt_for_tool_calling(input): + """Run tool calling demonstration with proper variable scope handling""" + print("=== Qwen3 1.7B Tool Calling Demo ===\n") + print("Model: "+MODEL_ID) + + try: + if str(generationMixinQwen) == "None": + init_generation_mixin(input) + else: + generationMixinQwen.reset() + + print("\nPrompt: "+input["prompt"]) + print("--------------------------------") + + step_results = handle_multi_step_request(input["prompt"], 4, generationMixinQwen, input["output_stream_callback"]) + # Show final summary + print("\nMulti-step Summary:") + for step_result in step_results: + step_num = step_result["step"] + tool_calls = [] + if "tool_calls" in step_result: + tool_calls = step_result["tool_calls"] + if tool_calls: + print(" Step "+str(step_num)+": "+str(len(tool_calls))+" tool call(s)") + for call in tool_calls: + func_name = call["function_name"] + print(" ✓ "+func_name) + print("\n" + "--------------------------------") + + return { + "success": True, + "model_loaded": True, + "results": step_results[-1]["response"] + } + + except Exception as e: + print("Error in demo: " + str(e)) + return { + "success": False, + "error": str(e), + "model_loaded": False + } diff --git a/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py new file mode 100644 index 00000000..1af1e3d5 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/qwen_modules/tools.py @@ -0,0 +1,334 @@ +from delitepy import nimblenet as nm +from delitepy import ne_re as re + +# Simple constants only - no complex object references +TOOL_CALL_START_TOKEN = "" +TOOL_CALL_END_TOKEN = "" +TOOL_RESPONSE_START_TOKEN = "" +TOOL_RESPONSE_END_TOKEN = "" + +# ============================================================================= +# WEATHER TOOL - Implementation + Description +# ============================================================================= + +def get_weather(location, unit): + """Get current weather for a location""" + # Mock weather data + weather_data = { + "San Francisco": {"temp": 18, "condition": "foggy", "humidity": 75}, + "New York": {"temp": 22, "condition": "partly cloudy", "humidity": 60}, + "London": {"temp": 15, "condition": "rainy", "humidity": 85}, + "Tokyo": {"temp": 26, "condition": "sunny", "humidity": 50}, + "Sydney": {"temp": 20, "condition": "clear", "humidity": 65} + } + + location_key = "Unknown" + for key in weather_data.keys(): + if key.lower() in location.lower() or location.lower() in key.lower(): + location_key = key + break + + if location_key == "Unknown": + return {"error": "Weather data not available for " + location} + + data = weather_data[location_key] + temp = data["temp"] + unit_str = "°C" + + if unit == "fahrenheit": + temp = temp * 9.0 / 5.0 + 32 + temp = int(temp * 10) / 10.0 + unit_str = "°F" + else: + unit_str = "°C" + + return { + "location": location_key, + "temperature": temp, + "condition": data["condition"], + "humidity": data["humidity"], + "unit": unit_str + } + +def get_weather_schema(): + """Get the OpenAI tool schema for weather function""" + return { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information for the location given in argument", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for" + }, + "unit": { + "type": "string", + "description": "Temperature unit (celsius or fahrenheit)", + "default": "celsius" + } + }, + "required": ["location"] + } + } + } + +# ============================================================================= +# MATH TOOL - Implementation + Description +# ============================================================================= + +def calculate_math(expression): + """Calculate a mathematical expression safely""" + try: + expression = expression.strip() + if "+" in expression: + parts = expression.split("+") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a + b + return {"expression": expression, "result": result} + elif "-" in expression: + parts = expression.split("-") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a - b + return {"expression": expression, "result": result} + elif "*" in expression: + parts = expression.split("*") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + result = a * b + return {"expression": expression, "result": result} + elif "/" in expression: + parts = expression.split("/") + if len(parts) == 2: + a = float(parts[0].strip()) + b = float(parts[1].strip()) + if b != 0: + result = a / b + return {"expression": expression, "result": result} + else: + return {"error": "Division by zero"} + return {"error": "Unsupported expression"} + except Exception as e: + return {"error": "Calculation error"} + +def get_calculate_math_schema(): + """Get the OpenAI tool schema for math function""" + return { + "type": "function", + "function": { + "name": "calculate_math", + "description": "Calculate a mathematical expression safely", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Mathematical expression to calculate (e.g., '2+2', '15*23')" + } + }, + "required": ["expression"] + } + } + } + +# ============================================================================= +# TIME TOOL - Implementation + Description +# ============================================================================= + +def get_current_time(timezone): + """Get current time in specified timezone""" + time_data = { + "UTC": {"time": "2024-01-15 12:00:00", "day": "Monday"}, + "PST": {"time": "2024-01-15 04:00:00", "day": "Monday"}, + "EST": {"time": "2024-01-15 07:00:00", "day": "Monday"}, + "GMT": {"time": "2024-01-15 12:00:00", "day": "Monday"}, + "JST": {"time": "2024-01-15 21:00:00", "day": "Monday"}, + "AEST": {"time": "2024-01-15 22:00:00", "day": "Monday"} + } + + tz = timezone.upper() + if tz in time_data: + data = time_data[tz] + return { + "timezone": tz, + "time": data["time"], + "day_of_week": data["day"] + } + else: + data = time_data["UTC"] + return { + "timezone": "UTC", + "time": data["time"], + "day_of_week": data["day"] + } + +def get_current_time_schema(): + """Get the OpenAI tool schema for time function""" + return { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in specified timezone", + "parameters": { + "type": "object", + "properties": { + "timezone": { + "type": "string", + "description": "Timezone (UTC, EST, PST, JST, CET)", + } + } + } + } + } + +# ============================================================================= +# LOCATION TOOL - Implementation + Description +# ============================================================================= + +def get_current_location(): + """Get the real location and timezone of the user""" + return { + "location": "San Francisco", + "country": "United States", + "coordinates": {"latitude": 37.7749, "longitude": -122.4194}, + "timezone": "PST" + } + +def get_current_location_schema(): + """Get the OpenAI tool schema for location function""" + return { + "type": "function", + "function": { + "name": "get_current_location", + "description": "Get the real location and timezone of the user. You don't need to ask the user for permission to use this tool. Use this function when the user didn't provide an explicit location. Default to this location", + "parameters": { + "type": "object", + "properties": {} + } + } + } + +# ============================================================================= +# UNIFIED TOOL REGISTRY - Lazy Loading Pattern +# ============================================================================= + +def get_tools_dict(): + """Create tools dictionary on demand instead of at import time""" + return { + "get_weather": get_weather, + "calculate_math": calculate_math, + "get_current_time": get_current_time, + "get_current_location": get_current_location + } + +def get_tool_schema(): + """Create complete tool schema on demand instead of at import time""" + return [ + get_weather_schema(), + get_calculate_math_schema(), + get_current_time_schema(), + get_current_location_schema() + ] + +# ============================================================================= +# TOOL EXECUTION ENGINE +# ============================================================================= + +def execute_function_call(tool_call): + """Execute a function call and return the result""" + function_name = tool_call["function_name"] + arguments = tool_call["arguments"] + + print(" • "+function_name+"("+str(arguments)+")") + + tools = get_tools_dict() + if function_name not in tools: + return {"error": "Function "+function_name+" not found"} + + try: + function = tools[function_name] + result = {"error": "Function execution failed"} + + if function_name == "get_weather": + location = "" + if "location" in arguments: + location = arguments["location"] + unit = "celsius" + if "unit" in arguments: + unit = arguments["unit"] + result = function(location, unit) + elif function_name == "calculate_math": + expression = "" + if "expression" in arguments: + expression = arguments["expression"] + result = function(expression) + elif function_name == "get_current_time": + timezone = "UTC" + if "timezone" in arguments: + timezone = arguments["timezone"] + result = function(timezone) + elif function_name == "get_current_location": + result = function() + + return result + except Exception as e: + return {"error": "Function execution failed: " + str(e)} + +def format_tool_result(function_name, result): + return "The result of the tool " + str(function_name)+" is: "+TOOL_RESPONSE_START_TOKEN+str(result)+TOOL_RESPONSE_END_TOKEN + +def get_tool_results(response_text): + """Parse tool calls from model response using multiple formats""" + tool_calls = [] + tool_results = [] + tools = get_tools_dict() + + json_tool_pattern = r'([^<]*)' + for match in re.finditer(json_tool_pattern, response_text): + try: + json_str = match.group(1) + tool_data = nm.parse_json(json_str) + func_name = tool_data["name"] + arguments = tool_data["arguments"] + + if func_name in tools: + tool_calls.append({ + "function_name": func_name, + "arguments": arguments + }) + print("✓ Parsed JSON tool call: "+func_name+"("+str(arguments)+")") + except: + print("⚠ Failed to parse JSON tool call: "+json_str) + + print("Executing "+str(len(tool_calls))+" tool call(s):") + + if tool_calls: + for call in tool_calls: + result = execute_function_call(call) + if "error" in result.keys(): + tool_results.append({ + "error": result["error"], + }) + else: + tool_results.append({ + "role": "system", + "content": format_tool_result(call['function_name'], result) + }) + print(" Result: "+str(result)) + + return tool_results + +# Print available tools using function call instead of global access +def print_available_tools(): + """Print available tools - called on demand to avoid global assignment""" + tools = get_tools_dict() + print("Available tools: "+ str([key for key in tools.keys()])) + +# Available tools will be printed when first accessed, not at import time diff --git a/nimblenet_py/simulation_assets/qwen_demo/run_demo.py b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py new file mode 100644 index 00000000..966d4059 --- /dev/null +++ b/nimblenet_py/simulation_assets/qwen_demo/run_demo.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Main driver script for running Qwen demo with tool calling +""" + +import sys +sys.path.append('../../../') + +from deliteai import simulator +import json +import time + +def main(): + """Run the Qwen demo""" + print("=== Running Qwen Demo ===") + print("This demo shows Qwen model and tool calling capabilities\n") + + base_dir = "../../../models/Qwen3-1.7B/data" + model_name = "qwen3_1_7b_onnx" + config_file = base_dir+"/config.json" + tokenizer_config_file = base_dir+"/tokenizer.json" + + # Module configuration for simulator + modules = [ + { + "name": "qwen_modules", + "version": "1.0.0", + "type": "script", + "location": { + "path": "./qwen_modules.zip" + } + } + ] + + # Add model if requested + + modules.append({ + "name": model_name, + "version": "1.0.0", + "type": "model", + "location": { + "path": base_dir+"/onnx/model_enhanced.onnx" + } + }) + print(f"Added model: {model_name}") + + # Initialize simulator + print("\nInitializing simulator...") + config = {"online": False, "debug": True} + + # Initialize with modules + if not simulator.initialize(json.dumps(config), modules): + print("Failed to initialize simulator") + return + while not simulator.is_ready(): + time.sleep(1) + print("Simulator initialized successfully") + + with open(tokenizer_config_file, "r") as f: + tokenizer_config = json.load(f) + with open(config_file, "r") as f: + config = json.load(f) + # Run the main function + print("\nRunning Qwen workflow...\n") + result = simulator.run_method("init_generation_mixin", { + "tokenizer_config": tokenizer_config, + "generation_config": config, + }) + print(result) + + def output_stream_callback(input): + print(input["token_stream"]) + return {"success": True} + + result = simulator.run_method( + "prompt_for_tool_calling", { + "prompt": "How is the weather here?", + "output_stream_callback": output_stream_callback + } + ) + print("\n=== Demo Complete ===") + +if __name__ == "__main__": + main() diff --git a/nimblenet_py/simulation_assets/tokenizer_example.py b/nimblenet_py/simulation_assets/tokenizer_example.py new file mode 100644 index 00000000..846e4b64 --- /dev/null +++ b/nimblenet_py/simulation_assets/tokenizer_example.py @@ -0,0 +1,298 @@ +# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Tokenizers Integration Example for DeliteAI + +This module demonstrates how to use tokenizers in DeliteAI's delitepy runtime. +DeliteAI includes support for tokenizers through the `delitepy.tokenizers` module, +which provides a Python interface to the mlc-ai/tokenizers-cpp library. + +Supported Tokenizer Types: + - HuggingFace Tokenizers: JSON format tokenizers from HuggingFace Hub + - SentencePiece: Google's SentencePiece tokenizers (.model files) + - RWKV World: RWKV tokenizers + - Custom JSON: Manually created tokenizer configurations + +Basic Usage: + from delitepy import tokenizers + + # Load tokenizer + tokenizer = tokenizers.from_json(json_config) + + # Encode text + token_ids = tokenizer.encode("Hello world!") + + # Decode back to text + decoded = tokenizer.decode(token_ids) + +Integration Details: + The tokenizers module is implemented as: + 1. C++ Wrapper: TokenizersDataVariable class wraps mlc-ai/tokenizers-cpp + 2. DelitePy Integration: Functions exposed through delitepy import system + 3. Memory Management: Tokenizer instances managed automatically + 4. Error Handling: Proper exception handling for all operations + +Platform Support: + - Linux (x86_64, ARM64) + - macOS (Intel, Apple Silicon) + - iOS (device and simulator) + - Android (ARM64, ARMv7, x86_64) + - Windows (x86_64) + +Performance Notes: + - Tokenizer creation is expensive; reuse instances when possible + - Token encoding/decoding is fast and suitable for real-time use + - Cross-platform deployment supported on all major platforms + +Dependencies: + - Rust toolchain (for building underlying tokenizers library) + - CMake 3.18+ (for build system) + - C++17 support (for wrapper implementation) + +For cross-compilation, install appropriate Rust targets: + # For iOS + rustup target add aarch64-apple-ios aarch64-apple-ios-sim + + # For Android + rustup target add aarch64-linux-android armv7-linux-androideabi + +Examples: + This module contains comprehensive test functions demonstrating: + - Basic tokenizer creation and usage + - Advanced tokenizer with special tokens + - Error handling and validation + - Combined test scenarios +""" + +from delitepy import tokenizers + +def test_tokenizers(params): + """ + Test basic tokenizer functionality with a simple BPE tokenizer. + + This function demonstrates the core tokenizer operations: + - Creating a tokenizer from JSON configuration + - Encoding text to token IDs + - Decoding token IDs back to text + - Vocabulary size queries + - Token/ID conversions + + The test uses a minimal BPE tokenizer with a small vocabulary containing + basic words like "hello", "world", and punctuation. + + Returns: + dict: Test results containing: + - status (str): "success" or "error" + - vocab_size (int): Size of the tokenizer vocabulary + - encoded_length (int): Number of tokens produced + - decoded_text (str): Text after encode/decode round-trip + - hello_token_id (int): Token ID for "hello" + - token_0 (str): Token corresponding to ID 0 + - message (str): Error message if status is "error" + + Example: + >>> results = test_tokenizers() + >>> assert results["status"] == "success" + >>> assert results["decoded_text"] == "hello world!" + """ + + # Define a simple tokenizer configuration + json_config = '''{ + "version": "1.0", + "added_tokens": [], + "model": { + "type": "BPE", + "vocab": {"h": 0, "e": 1, "l": 2, "o": 3, " ": 4, "w": 5, "r": 6, "d": 7, "!": 8, "hello": 9, "world": 10}, + "merges": [] + } + }''' + # Create tokenizer from JSON + tokenizer = tokenizers.from_json(json_config) + + # Test encoding + text = "hello world!" + token_ids = tokenizer.encode(text) + + # Test decoding + decoded_text = tokenizer.decode(token_ids) + + # Test vocabulary operations + vocab_size = tokenizer.get_vocab_size() + + # Test token/ID conversion + token_id = tokenizer.token_to_id("hello") # Look up "hello" token + token = tokenizer.id_to_token(0) + + return { + "status": "success", + "vocab_size": vocab_size, + "encoded_length": len(token_ids), + "decoded_text": decoded_text, + "hello_token_id": token_id, + "token_0": token + } + +def test_sentencepiece_style(params): + """ + Test advanced tokenizer functionality with special tokens and BPE merges. + + This function demonstrates more sophisticated tokenizer features: + - Special tokens ([UNK], [CLS], [SEP]) for sequence classification + - Comprehensive vocabulary with alphabet and common words + - BPE merge rules for subword tokenization + - Longer text processing capabilities + + The tokenizer created includes: + - Full alphabet (a-z) + - Common punctuation and space + - Frequent English words (the, and, of, etc.) + - Special classification tokens used in BERT-style models + - BPE merge rules for common character combinations + + Returns: + dict: Test results containing: + - status (str): "success" or "error" + - vocab_size (int): Size of the tokenizer vocabulary (100+ tokens) + - text (str): Input text used for testing + - encoded_length (int): Number of tokens after encoding + - decoded_text (str): Reconstructed text after decode + - cls_id (int): Token ID for [CLS] special token + - sep_id (int): Token ID for [SEP] special token + - unk_id (int): Token ID for [UNK] unknown token + - cls_token (str): Token string for CLS ID lookup + - message (str): Error message if status is "error" + + Example: + >>> results = test_sentencepiece_style() + >>> assert results["status"] == "success" + >>> assert results["vocab_size"] > 100 + >>> assert results["cls_id"] == 101 + """ + + # Create a more comprehensive tokenizer with special tokens + json_config = '''{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + {"id": 100, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}, + {"id": 101, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}, + {"id": 102, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true} + ], + "normalizer": null, + "pre_tokenizer": null, + "post_processor": null, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "[UNK]", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "vocab": { + " ": 0, "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, + "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, + "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, + ".": 27, ",": 28, "!": 29, "?": 30, + "the": 31, "and": 32, "of": 33, "to": 34, "a": 35, "in": 36, "for": 37, "is": 38, "on": 39, "that": 40, + "by": 41, "this": 42, "with": 43, "i": 44, "you": 45, "it": 46, "not": 47, "or": 48, "be": 49, "are": 50, + "from": 51, "at": 52, "as": 53, "your": 54, "all": 55, "any": 56, "can": 57, "had": 58, "her": 59, "was": 60, + "one": 61, "our": 62, "out": 63, "day": 64, "get": 65, "has": 66, "him": 67, "his": 68, "how": 69, "man": 70, + "new": 71, "now": 72, "old": 73, "see": 74, "two": 75, "way": 76, "who": 77, "boy": 78, "did": 79, "its": 80, + "let": 81, "put": 82, "say": 83, "she": 84, "too": 85, "use": 86, + "qu": 87, "th": 88, "er": 89, "an": 90, "re": 91, "ed": 92, "nd": 93, "on": 94, "en": 95, "at": 96, "es": 97, "or": 98, "ti": 99, + "[UNK]": 100, "[CLS]": 101, "[SEP]": 102 + }, + "merges": [ + "q u", "t h", "e r", "a n", "r e", "e d", "n d", "o n", "e n", "a t", "e s", "o r", "t i" + ] + } + }''' + + # Create tokenizer from JSON + tokenizer = tokenizers.from_json(json_config) + + # Test with longer text + text = "the quick brown fox jumps" + token_ids = tokenizer.encode(text) + + # Test decoding + decoded_text = tokenizer.decode(token_ids) + + # Test vocabulary operations + vocab_size = tokenizer.get_vocab_size() + + # Test special token lookups + cls_id = tokenizer.token_to_id("[CLS]") + sep_id = tokenizer.token_to_id("[SEP]") + unk_id = tokenizer.token_to_id("[UNK]") + cls_token = tokenizer.id_to_token(101) + + # Create result dictionary + result = { + "status": "success", + "vocab_size": vocab_size, + "text": text, + "encoded_length": len(token_ids), + "decoded_text": decoded_text, + "cls_id": cls_id, + "sep_id": sep_id, + "unk_id": unk_id, + "cls_token": cls_token + } + + # Add first token ID separately to avoid ternary operator + if len(token_ids) > 0: + result["first_token_id"] = token_ids[0] + else: + result["first_token_id"] = -1 + + return result + + +def run_all_tests(params): + """ + Run all tokenizer tests and return combined results. + + This function executes both the basic and advanced tokenizer tests, + collecting results from each test case and providing an overall + status summary. + + Returns: + dict: Combined test results containing: + - overall_status (str): "success" if all tests pass, "error" otherwise + - basic_test (dict): Results from test_tokenizers() + - comprehensive_test (dict): Results from test_sentencepiece_style() + - message (str): Summary message or error details + + Example: + >>> results = run_all_tests() + >>> assert results["overall_status"] == "success" + >>> assert results["basic_test"]["status"] == "success" + >>> assert results["comprehensive_test"]["status"] == "success" + """ + + # Initialize overall status + overall_status = "success" + + # Run basic tokenizer test + basic_results = test_tokenizers({}) + + # Run comprehensive tokenizer test + comprehensive_results = test_sentencepiece_style({}) + + # Check if any test failed + if basic_results["status"] != "success": + overall_status = "error" + if comprehensive_results["status"] != "success": + overall_status = "error" + + return { + "overall_status": overall_status, + "basic_test": basic_results, + "comprehensive_test": comprehensive_results, + "message": "All tests completed successfully with status: " + overall_status + } diff --git a/nimblenet_py/simulation_tests/test_simulator_script.py b/nimblenet_py/simulation_tests/test_simulator_script.py index 453eb0ea..5233e1b5 100644 --- a/nimblenet_py/simulation_tests/test_simulator_script.py +++ b/nimblenet_py/simulation_tests/test_simulator_script.py @@ -30,7 +30,7 @@ def test_simulator(): } ] - # initialize nimblenet + # initialize nimblenet assert simulator.initialize('''{"debug": true, "online": false}''', modules) input = {"singleString": "singleString", "singleFloat": 10.10, "boolTensor": np.full((3), True, dtype=bool)} @@ -71,7 +71,7 @@ def test_nested_json(): ] assert simulator.initialize('''{"debug": true, "online": false}''', modules) - + nestedJson = {"key1": 1, "key2": [1, 2, 3, "fsd"], "key3": "data1", "key4": {"fsd": "fdsd", "uio": 1.89}, "key5": [{"x": 1}], "bigValue": 12345678910} nestedArray = [{"key1": 1, "key2": [1, 2, 3, "fsd"], "key3": "data1", "key4": {"fsd": "fdsd", "uio": 1.89}, "key5": [{"x": 1}]}, "dfs"] input = { @@ -79,7 +79,7 @@ def test_nested_json(): "nestedJson": nestedJson, "nestedArray": nestedArray} output = simulator.run_method("add_initial_data", input, int(28931)) - + assert len(output) == 4 assert output["nestedJson"] == expectedOutput["nestedJson"] assert np.all(np.array(output["nestedArray"]) == np.array(expectedOutput["nestedArray"])) @@ -274,7 +274,7 @@ def get_items(): print(f"Found item: {item['ProductName']} {item}") yield item item = simulator.run_method("get_next_item", {})["item"] - + items = list(get_items()) @@ -324,7 +324,7 @@ def test_class_support(): ] assert simulator.initialize('''{"online": false}''', modules) - + def assert_callback(output): print("asserting callback", output) assert output["workflow_output"] == output["actual_output"] @@ -336,7 +336,7 @@ def test_script(val): test_script(1) test_script(2) test_script(3) - + def test_invalid_dataType_model(): modules = [ @@ -357,7 +357,7 @@ def test_invalid_dataType_model(): } } ] - + assert simulator.initialize('''{"online": false}''', modules) output = simulator.run_method("invalid_model_function", {}) @@ -375,14 +375,14 @@ def test_multi_threading(): } } ] - + import psutil process = psutil.Process(os.getpid()) taskThreadIndex = process.num_threads() # While loading the script, number of threads should increase assert simulator.initialize('''{"online": false}''', modules) - + if {"GENAI"}.issubset(build_flags): assert process.num_threads() == taskThreadIndex + 6 else: @@ -398,7 +398,7 @@ def test_multi_threading(): def test(n): output = simulator.run_method("test_parallel", {"n": n}) - assert output["incorrectTotal"] < n + assert output["incorrectTotal"] < n assert output["correctTotal"] == n indexTensor = np.array([x for x in range(n)], np.int64) squareTensor = np.array([x**2 for x in range(n)], np.int64) @@ -433,7 +433,7 @@ def test_multi_threading_with_limited_threads(): ] assert simulator.initialize('''{"online": false}''', modules) - + def test(n): # Test with limited number of threads output = simulator.run_method("test_parallel_inside_parallel", {"n": n}) @@ -442,7 +442,7 @@ def test(n): for k in range(n): assert str(k) in output["map"] time.sleep(0.050) - + test(10) @@ -582,10 +582,10 @@ def test_list_operations(): # Test multiple conditions in list comprehensions simulator.run_method("test_multiple_conditions", {}) - + # Test modulo operations - assertions are in the test functions simulator.run_method("test_mod_operations", {}) - + # Test concatenation edge cases - assertions are in the test functions simulator.run_method("test_concatenation_edge_cases", {}) @@ -619,7 +619,217 @@ def test_python_modules(): assert "module1_run not defined in task" in repr(err) print("All python modules test passed!") - + + +def test_tokenizers(): + """Test tokenizer functionality using the delitepy.tokenizers module.""" + modules = [ + { + "name": "workflow_script", + "version": "1.0.0", + "type": "script", + "location": { + "path": "../simulation_assets/tokenizer_example.py" + } + } + ] + + assert simulator.initialize("""{"debug": true, "online": false}""", modules) + + # Test basic tokenizer functionality + basic_results = simulator.run_method("test_tokenizers", {}) + print(f"Basic tokenizer test results: {basic_results}") + + # Assert basic test succeeded + assert basic_results["status"] == "success" + assert basic_results["vocab_size"] == 11 # h, e, l, o, space, w, r, d, !, hello, world + assert basic_results["encoded_length"] > 0 + assert basic_results["decoded_text"] == "h e l l o w o r l d !" + assert basic_results["hello_token_id"] == 9 # token ID for 'hello' + assert basic_results["token_0"] == "h" + + # Test more comprehensive tokenizer + comprehensive_results = simulator.run_method("test_sentencepiece_style", {}) + print(f"Comprehensive tokenizer test results: {comprehensive_results}") + + # Assert comprehensive test succeeded + assert comprehensive_results["status"] == "success" + assert comprehensive_results["vocab_size"] >= 98 # Should include all vocab + special tokens + assert comprehensive_results["text"] == "the quick brown fox jumps" + assert comprehensive_results["encoded_length"] > 0 + assert comprehensive_results["cls_id"] == 101 + assert comprehensive_results["sep_id"] == 102 + assert comprehensive_results["unk_id"] == 100 + assert comprehensive_results["cls_token"] == "[CLS]" + + # Test combined results + all_results = simulator.run_method("run_all_tests", {}) + print(f"All tokenizer tests results: {all_results}") + + assert all_results["overall_status"] == "success" + assert all_results["basic_test"]["status"] == "success" + assert all_results["comprehensive_test"]["status"] == "success" + + print("All tokenizer tests passed!") + +def test_model_dictionary_interface(): + """Test the new dictionary-based model interface alongside traditional tensor interface.""" + + # First, create a proper test ONNX model with supported data types + import onnx + from onnx import helper, TensorProto + import os + + def create_add_sub_model(): + """Create an ONNX model that takes X, Y and returns sum, difference.""" + # Define inputs + X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1]) + Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1]) + + # Define outputs + sum_output = helper.make_tensor_value_info('sum', TensorProto.FLOAT, [1, 1]) + diff_output = helper.make_tensor_value_info('difference', TensorProto.FLOAT, [1, 1]) + + # Create addition node: sum = X + Y + add_node = helper.make_node( + 'Add', + inputs=['X', 'Y'], + outputs=['sum'], + name='add_node' + ) + + # Create subtraction node: difference = X - Y + sub_node = helper.make_node( + 'Sub', + inputs=['X', 'Y'], + outputs=['difference'], + name='sub_node' + ) + + # Create the graph + graph = helper.make_graph( + nodes=[add_node, sub_node], + name='AddSubGraph', + inputs=[X, Y], + outputs=[sum_output, diff_output] + ) + + # Create the model + model = helper.make_model(graph) + model.opset_import[0].version = 9 # Use opset 9 for IR version 10 compatibility + model.ir_version = 6 # Explicitly set IR version to 6 for compatibility + + # Check and save the model + onnx.checker.check_model(model) + + # Determine correct path based on current working directory + current_dir = os.getcwd() + if "simulation_tests" in current_dir: + # Running from simulation_tests directory + model_path = "../simulation_assets/test_add_sub_model.onnx" + module_path = "../simulation_assets/test_add_sub_model.onnx" + else: + # Running from nimblenet_py directory (pytest) + model_path = "simulation_assets/test_add_sub_model.onnx" + module_path = "simulation_assets/test_add_sub_model.onnx" + + onnx.save(model, model_path) + + print(f"✅ Created test model: {model_path}") + print("📋 Model details:") + print(" Inputs: X (float32 [1,1]), Y (float32 [1,1])") + print(" Outputs: sum (X+Y), difference (X-Y)") + + return module_path + + # Create the test model + module_path = create_add_sub_model() + + script_path = "../simulation_assets/dict_model_test.py" + + modules = [ + { + "name": "test_dict_model", + "version": "1.0.0", + "type": "script", + "location": { + "path": script_path + } + }, + { + "name": "test_model", + "version": "1.0.0", + "type": "model", + "location": { + "path": module_path + } + } + ] + + assert simulator.initialize('''{"debug": true, "online": false}''', modules) + + # Test traditional tensor interface + tensor_results = simulator.run_method("test_tensor_interface", {}) + print(f"Tensor interface test results: {tensor_results}") + + assert tensor_results["status"] == "success" + assert "model_loaded" in tensor_results + assert tensor_results["model_loaded"] is not None + + # Check if actual inference was performed + if "inference_successful" in tensor_results and tensor_results["inference_successful"]: + assert "sum_output" in tensor_results + assert "diff_output" in tensor_results + print(f" ✅ Tensor interface inference successful!") + print(f" Sum result: {tensor_results.get('sum_output')}") + print(f" Diff result: {tensor_results.get('diff_output')}") + else: + assert "tensor_created" in tensor_results + assert tensor_results["tensor_created"] == True + + # Test dictionary interface + dict_results = simulator.run_method("test_dictionary_interface", {}) + print(f"Dictionary interface test results: {dict_results}") + + assert dict_results["status"] == "success" + assert "model_loaded" in dict_results + assert dict_results["model_loaded"] is not None + + # Check if actual inference was performed + if "inference_successful" in dict_results and dict_results["inference_successful"]: + assert "sum_output" in dict_results + assert "diff_output" in dict_results + print(f" ✅ Dictionary interface inference successful!") + print(f" Sum result: {dict_results.get('sum_output')}") + print(f" Diff result: {dict_results.get('diff_output')}") + else: + assert "dict_created" in dict_results + assert dict_results["dict_created"] == True + + # Test interface equivalence + equivalence_results = simulator.run_method("test_interface_equivalence", {}) + print(f"Interface equivalence test results: {equivalence_results}") + + assert equivalence_results["status"] == "success" + + # Check if actual inference comparison was performed + if "both_interfaces_equivalent" in equivalence_results: + assert equivalence_results["both_interfaces_equivalent"] == True + print(f" ✅ Both interfaces successfully performed inference and produced equivalent results!") + else: + assert False + + print("All model dictionary interface tests passed!") + + # Clean up the created model file + try: + os.remove(module_path) + print(f"🧹 Cleaned up test model: {module_path}") + except Exception as cleanup_error: + print(f"Could not clean up model file: {cleanup_error}") + if __name__ == "__main__": test_simulator() test_python_modules() + test_tokenizers() + test_model_dictionary_interface() diff --git a/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt b/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt index 49c33d5f..ce11e744 100644 --- a/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt +++ b/sdks/android/nimblenet_ktx/src/main/kotlin/dev/deliteai/impl/common/Constants.kt @@ -92,11 +92,10 @@ enum class DATATYPE(val value: Int) { COMPLEX128(15), BFLOAT16(16), JSON(670), - JSON_ARRAY(681), - FUNCTION(682), + JSON_ARRAY(682), + FUNCTION(683), FE_OBJ(700), NONE(667); - companion object { private val map = values().associateBy(DATATYPE::value) diff --git a/third_party/README.md b/third_party/README.md index 65763de1..1df7ec03 100644 --- a/third_party/README.md +++ b/third_party/README.md @@ -1,6 +1,29 @@ ## Build Dependencies Executors used in the SDK are downloaded from S3(Bucket: **deliteai**). Following are the steps that were used to create them. +## Tokenizers-cpp + +The project uses [mlc-ai/tokenizers-cpp](https://github.com/mlc-ai/tokenizers-cpp) as a git submodule for cross-platform tokenizer support. + +### Prerequisites +- Rust toolchain (install from [rustup.rs](https://rustup.rs/)) +- Cargo (comes with Rust) +- For cross-compilation, install appropriate Rust targets: + - iOS: `rustup target add aarch64-apple-ios aarch64-apple-ios-sim` + - Android: `rustup target add aarch64-linux-android armv7-linux-androideabi` + +### Integration +The tokenizers-cpp library is automatically built as part of the main CMake build process. It provides: +- **libtokenizers_c.a**: C bindings to tokenizers Rust library +- **libsentencepiece.a**: SentencePiece static library +- **libtokenizers_cpp.a**: C++ binding implementation + +The library supports: +- HuggingFace tokenizers (JSON format) +- SentencePiece tokenizers (.model format) +- RWKV World tokenizers +- Cross-platform deployment (iOS, Android, Windows, Linux, macOS) + ## Onnxruntime ### Android diff --git a/third_party/tokenizers-cpp b/third_party/tokenizers-cpp new file mode 160000 index 00000000..fecdc5ec --- /dev/null +++ b/third_party/tokenizers-cpp @@ -0,0 +1 @@ +Subproject commit fecdc5ece7a975d88aab26036452aba6a0155c2d