NimbleEdge · vkkhare · Jul 15, 2025 · Jul 20, 2025 · Jul 23, 2025 · Jul 23, 2025
@@ -6,3 +6,5 @@ third_party/runtime/
 !third_party/runtime/CMakeLists.txt
 __pycache__/
 .pytest_cache/
+**/NimbleSDK
+models/**/data
@@ -0,0 +1,4 @@
+[submodule "third_party/tokenizers-cpp"]
+	path = third_party/tokenizers-cpp
+	url = https://github.com/NimbleEdge/tokenizers-cpp.git
+
@@ -38,7 +38,7 @@ endif()
 
 # set(DEBUGFLAGS " -Werror -Wno-write-strings  -Weffc++ -Wall -Wuninitialized -Wnon-virtual-dtor -Wshadow -Werror=format-security -Wunused-member-function -Wunused-function ")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=switch -Werror=return-type -Werror=implicit-fallthrough \
-	-Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-member-function -Werror=unused-function -Werror=writable-strings")
+	-Werror=non-virtual-dtor -Werror=format -Werror=format-security -Werror=unused-function -Werror=write-strings")
 
 # string(CONCAT RELEASEFLAGS ${DEBUGFLAGS} " -fstack-protector-strong -ffunction-sections -fdata-sections ")
 # #
@@ -84,6 +84,7 @@ add_subdirectory(nimblenet)
 add_subdirectory(delitepy)
 add_subdirectory("../third_party/json" "${CMAKE_BINARY_DIR}/third_party/json")
 add_subdirectory("../third_party/SPSCQueue" "${CMAKE_BINARY_DIR}/third_party/SPSCQueue")
+add_subdirectory("../third_party/tokenizers-cpp" "${CMAKE_BINARY_DIR}/third_party/tokenizers-cpp")
 if (GENAI)
 	add_subdirectory("../third_party/miniz" "${CMAKE_BINARY_DIR}/third_party/miniz")
 endif()
@@ -182,6 +183,7 @@ else()
 	target_compile_definitions(nimblenet PUBLIC -DIOS_PLATFORM="mac")
 	add_subdirectory(platform/unix) # produces ${CLIENT_INCLUDES}
 	add_subdirectory("../third_party/runtime" "${CMAKE_BINARY_DIR}/third_party/runtime") # -> produces ${BACKEND_LIBS} ${BACKEND_DIR} ${BACKED_INCLUDES}
+	# Link curl by bare library name; whether the system or a conda libcurl is used depends on the linker search path
 	target_link_libraries(nimblenet ${VISIBILITY} curl)
 
 	# target_link_libraries(nimblenet ${VISIBILITY} clientlib)
@@ -194,7 +196,7 @@ if(NOT ANDROID_ABI)
 	list(APPEND ADDITIONAL_LIBS ZLIB::ZLIB)
 endif()
 
-target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS})
+target_link_libraries(nimblenet PRIVATE nlohmann_json::nlohmann_json ${VISIBILITY} SPSCQueue ${VISIBILITY} tokenizers_cpp ${VISIBILITY} ${BACKEND_LIBS} ${VISIBILITY} ${ADDITIONAL_LIBS})
 if (GENAI)
 	target_link_libraries(nimblenet PRIVATE miniz)
 endif()

@@ -51,7 +51,7 @@ def main():
         if "-DCMAKE_BUILD_TYPE=Release" in cmake_args:
             STRIP = 1
 
-    CMAKE_CXX_FLAGS = ""
+    CMAKE_CXX_FLAGS = "-Wno-unused-member-function -Wno-implicit-fallthrough "
     if args.testing:
         cmake_args += " -DTESTING=1 "
 
@@ -61,16 +61,20 @@ def main():
     COMMON_FLAGS = (
         f"-B{os.getcwd()}/build/ "
         f"{cmake_args} "
+        "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 "
+        "-DCMAKE_CXX_FLAGS_RELEASE='-Wno-unused-function -Wno-implicit-fallthrough -DNDEBUG -O3' "
+        "-DCMAKE_CXX_FLAGS_DEBUG='-Wno-unused-function -Wno-implicit-fallthrough -g' "
     )
 
     # Determine compiler settings based on architecture
     if arch == "arm":
         cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DCMAKE_CXX_COMPILER=g++ -DMACOS=1 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'"
     elif arch == "x86_64":
-        CMAKE_CXX_FLAGS += " -stdlib=libstdc++ "
+        # Replace clang-specific flags with g++ compatible ones
+        CMAKE_CXX_FLAGS = CMAKE_CXX_FLAGS.replace("-Wno-unused-member-function", "-Wno-unused-function")
         cmake_command = (
             f"cmake CMakeLists.txt {COMMON_FLAGS} "
-            f"-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'"
+            f"-DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS='{CMAKE_CXX_FLAGS}'"
         )
     else:
         cmake_command = f"cmake CMakeLists.txt {COMMON_FLAGS} -DMACOS=1"
@@ -103,14 +107,14 @@ def main():
     if args.simulator:
         if not args.ci_build:
             # re-install deliteai
-            subprocess.run(f"python{python_version} -m pip uninstall deliteai", shell=True, check=True)
+            subprocess.run(f"python{python_version} -m pip uninstall -y deliteai", shell=True, check=True)
             subprocess.run("rm -rf dist deliteai*", shell=True, check=True)
             subprocess.run(f"python{python_version} setup.py bdist_wheel", shell=True, check=True)
             subprocess.run(f"python{python_version} -m pip install dist/*", shell=True, check=True)
 
             # re-install delitepy-library-stubs
             subprocess.run(
-                f"python{python_version} -m pip uninstall delitepy-library-stubs",
+                f"python{python_version} -m pip uninstall -y delitepy-library-stubs",
                 shell=True,
                 check=True,
             )

@@ -26,12 +26,7 @@ def render_src_template() -> None:
         check=True,
     )
     subprocess.run(
-        [
-            f"{delitepy_dir}/scripts/render_jinja2_templates.py",
-            f"{library_stubs_dir}/src_template",
-            f"{library_stubs_dir}/src_gen",
-            coreruntime_dir,
-        ],
+        ["cp", "-r", f"{library_stubs_dir}/src_template", f"{library_stubs_dir}/src_gen"],
         check=True,
     )
 

@@ -2,7 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Package delitepy containing modules nimblenet and ne_re."""
+"""Package delitepy containing modules nimblenet, ne_re, and tokenizers."""
 
 from delitepy.nimblenet import *
 from delitepy.ne_re import *
+from delitepy.tokenizers import *
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: (C) 2025 DeliteAI Authors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Package delitepy.tokenizers for tokenizer functionality."""
+
+from typing import List, Union
+from delitepy.nimblenet.tensor import Tensor
+
+def from_pretrained(model_name_or_path: str) -> str:
+    """Load a pre-trained tokenizer by HuggingFace model name or local file path.
+
+    Args:
+        model_name_or_path: Path to tokenizer.json file or HuggingFace model name
+
+    Returns:
+        Opaque string handle; pass it as ``tokenizer`` to encode, decode, etc.
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> tokenizer = tokenizers.from_pretrained("/path/to/tokenizer.json")
+    """
+    pass
+
+def from_file(file_path: str) -> str:
+    """Load a tokenizer from a file path.
+
+    Args:
+        file_path: Path to tokenizer.json or .model file
+
+    Returns:
+        Opaque string handle; pass it as ``tokenizer`` to encode, decode, etc.
+
+    Example:
+        >>> tokenizer = tokenizers.from_file("tokenizer.json")
+        >>> tokenizer = tokenizers.from_file("tokenizer.model")
+    """
+    pass
+
+def from_json(json_str: str) -> str:
+    """Create a tokenizer from a JSON string.
+
+    Args:
+        json_str: JSON string containing tokenizer configuration
+
+    Returns:
+        Opaque string handle; pass it as ``tokenizer`` to encode, decode, etc.
+
+    Example:
+        >>> json_config = '{"model": {...}, "normalizer": {...}}'
+        >>> tokenizer = tokenizers.from_json(json_config)
+    """
+    pass
+
+def from_sentencepiece(model_path: str) -> str:
+    """Load a SentencePiece tokenizer from a .model file.
+
+    Args:
+        model_path: Path to SentencePiece .model file
+
+    Returns:
+        Opaque string handle; pass it as ``tokenizer`` to encode, decode, etc.
+
+    Example:
+        >>> tokenizer = tokenizers.from_sentencepiece("tokenizer.model")
+    """
+    pass
+
+def encode(tokenizer: str, text: str) -> Tensor:
+    """Encode text into token IDs.
+
+    Args:
+        tokenizer: Handle returned by from_pretrained/from_file/from_json/from_sentencepiece
+        text: Text to encode
+
+    Returns:
+        Tensor containing token IDs (INT32)
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> token_ids = tokenizers.encode(tokenizer, "Hello world!")
+        >>> print(token_ids.shape)  # [num_tokens]
+    """
+    pass
+
+def decode(tokenizer: str, token_ids: Tensor) -> str:
+    """Decode token IDs back to text.
+
+    Args:
+        tokenizer: Tokenizer handle returned by one of the from_* loaders
+        token_ids: Tensor containing token IDs (INT32)
+
+    Returns:
+        Decoded text string
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> token_ids = tokenizers.encode(tokenizer, "Hello world!")
+        >>> text = tokenizers.decode(tokenizer, token_ids)
+        >>> print(text)  # e.g. "hello world!"; may differ from the input (uncased model)
+    """
+    pass
+
+def get_vocab_size(tokenizer: str) -> int:
+    """Get the vocabulary size of the tokenizer.
+
+    Args:
+        tokenizer: Tokenizer handle returned by one of the from_* loaders
+
+    Returns:
+        Size of the vocabulary
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> vocab_size = tokenizers.get_vocab_size(tokenizer)
+        >>> print(vocab_size)  # 30522
+    """
+    pass
+
+def token_to_id(tokenizer: str, token: str) -> int:
+    """Convert a token string to its ID.
+
+    Args:
+        tokenizer: Tokenizer handle returned by one of the from_* loaders
+        token: Token string
+
+    Returns:
+        Token ID, or -1 if token not found
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> token_id = tokenizers.token_to_id(tokenizer, "[CLS]")
+        >>> print(token_id)  # 101
+    """
+    pass
+
+def id_to_token(tokenizer: str, token_id: int) -> str:
+    """Convert a token ID to its string representation.
+
+    Args:
+        tokenizer: Tokenizer handle returned by one of the from_* loaders
+        token_id: Token ID
+
+    Returns:
+        Token string, or empty string if ID not found
+
+    Example:
+        >>> tokenizer = tokenizers.from_pretrained("bert-base-uncased")
+        >>> token = tokenizers.id_to_token(tokenizer, 101)
+        >>> print(token)  # "[CLS]"
+    """
+    pass 
@@ -35,6 +35,7 @@ set(BASE
 	data_variable/src/pre_processor_nimble_net_variable.cpp
 	data_variable/src/raw_event_store_data_variable.cpp
 	data_variable/src/regex_data_variable.cpp
+	data_variable/src/tokenizers_data_variable.cpp
 	data_variable/src/single_variable.cpp
 	data_variable/src/tensor_data_variable.cpp
 	job_scheduler/src/base_job.cpp

@@ -31,6 +31,8 @@ std::string Asset::get_file_name_on_device() const {
     case AssetType::LLM:
       return name + version + rmconstants::LLMFolderName;
 #endif  // GENAI
+    default:
+      return name + version;
   }
 }
 
@@ -96,6 +98,8 @@ std::string get_string_from_asset_type(const AssetType& assetType) {
     case AssetType::LLM:
       return "llm";
 #endif  // GENAI
+    default:
+      return "unknown";
   }
 }
 

@@ -392,6 +392,8 @@ std::pair<CloudConfigResponse, Deployment> CoreSDK::get_cloud_config_and_update_
     }
     case CloudConfigState::Unmodified:
       return {cloudConfig, deployment};
+    default:
+      return {cloudConfig, deployment};
   }
 }