From 8ed9b3643c42d8c56c6b976088a6b85fdb5a6b7c Mon Sep 17 00:00:00 2001 From: PARTH SHAHH Date: Fri, 8 Aug 2025 11:47:05 +0530 Subject: [PATCH] Add examples to build tokenizers_cpp for local ONNX models --- .gitmodules | 3 + .../examples/tokenizers_cpp/CMakeLists.txt | 26 ++++++ samples/examples/tokenizers_cpp/README.md | 93 +++++++++++++++++++ samples/examples/tokenizers_cpp/build.ps1 | 93 +++++++++++++++++++ .../tokenizers_cpp/externals/tokenizers-cpp | 1 + .../tokenizers_cpp/src/tokenizer_exports.cpp | 25 +++++ 6 files changed, 241 insertions(+) create mode 100644 .gitmodules create mode 100644 samples/examples/tokenizers_cpp/CMakeLists.txt create mode 100644 samples/examples/tokenizers_cpp/README.md create mode 100644 samples/examples/tokenizers_cpp/build.ps1 create mode 160000 samples/examples/tokenizers_cpp/externals/tokenizers-cpp create mode 100644 samples/examples/tokenizers_cpp/src/tokenizer_exports.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..64be2b2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "samples/examples/tokenizers_cpp/externals/tokenizers-cpp"] + path = samples/examples/tokenizers_cpp/externals/tokenizers-cpp + url = https://github.com/mlc-ai/tokenizers-cpp diff --git a/samples/examples/tokenizers_cpp/CMakeLists.txt b/samples/examples/tokenizers_cpp/CMakeLists.txt new file mode 100644 index 0000000..9dbef38 --- /dev/null +++ b/samples/examples/tokenizers_cpp/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.20) + +# Set the project name +project(tokenizers_cpp_onnx) + +# Set the C++ standard to C++17 +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Add the tokenizers_cpp external project +add_subdirectory(externals/tokenizers-cpp) + +# Define the shared library to be generated +add_library(tokenizers_cpp_onnx SHARED src/tokenizer_exports.cpp) + +# Set the output name to "tokenizers_cpp" for the final DLL +set_target_properties(tokenizers_cpp_onnx + PROPERTIES 
+ OUTPUT_NAME "tokenizers_cpp" +) + +# Link against tokenizers_cpp +target_link_libraries(tokenizers_cpp_onnx PRIVATE tokenizers_cpp) + +# Specify include directories +target_include_directories(tokenizers_cpp_onnx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/externals/tokenizers-cpp/include) \ No newline at end of file diff --git a/samples/examples/tokenizers_cpp/README.md b/samples/examples/tokenizers_cpp/README.md new file mode 100644 index 0000000..e6d6d69 --- /dev/null +++ b/samples/examples/tokenizers_cpp/README.md @@ -0,0 +1,93 @@ +# SQL Server Language Extensions - Tokenizers + +This project provides a C++ tokenizer library (`tokenizers_cpp.dll`) that can be used with SQL Server Language Extensions. It includes several tokenizer implementations, including HuggingFace and SentencePiece tokenizers. +This is a wrapper on top of the [tokenizers-cpp project](https://github.com/mlc-ai/tokenizers-cpp) and provides a standardized interface for the SQL Server Language Extensions. + +## Overview + +This library serves as a bridge between SQL Server and various tokenization implementations, allowing efficient text processing and tokenization within SQL Server operations. It's particularly useful for: + +- Natural Language Processing (NLP) tasks in SQL Server +- Text preprocessing for machine learning models +- Standardized tokenization across different SQL Server instances + +## Project Structure + +``` +├── build.ps1 # Main build script for Windows +├── CMakeLists.txt # Main CMake configuration +├── src/ +│ └── tokenizer_exports.cpp # Main tokenizer exports +└── externals/ + └── tokenizers-cpp/ # tokenizers-cpp sub-module (mlc-ai) +``` + +## Prerequisites + +- Windows operating system +- PowerShell +- CMake (3.20 or higher) +- Rust toolchain (will be automatically installed by the build script if missing) +- Visual Studio 2022 or later with C++ development tools + +## Building the Project + +1. 
Clone the repository with submodules: + ```powershell + git clone --recursive <repository-url> + ``` + +2. Run the build script: + ```powershell + .\build.ps1 + ``` + + By default, the script builds both Debug and Release configurations. You can select a single configuration instead: + ```powershell + .\build.ps1 -BuildConfig "Release" # For Release only + .\build.ps1 -BuildConfig "Debug" # For Debug only + ``` + +The build process will: +- Check and install Rust if needed +- Initialize and update git submodules +- Create a build directory +- Configure and build the project using CMake +- Generate the tokenizers_cpp.dll library + +## Build Output + +After a successful build, you can find the compiled library at: +- Release: `build/Release/tokenizers_cpp.dll` +- Debug: `build/Debug/tokenizers_cpp.dll` + +## Usage + +The output binary supports the following tokenizer implementations: + +1. HuggingFace Tokenizer (`huggingface_tokenizer.cc`) +2. SentencePiece Tokenizer (`sentencepiece_tokenizer.cc`) + +The output binary must be placed in the same directory as the ONNX Runtime binaries, and that directory must be supplied as LOCAL_RUNTIME_PATH in the external model T-SQL statement. + +Example: +```sql +CREATE EXTERNAL MODEL myLocalOnnxModel +WITH ( + LOCATION = 'C:/Models/sentence-transformers_all-MiniLM-L6-v2', + API_FORMAT = 'ONNX Runtime', + MODEL_TYPE = EMBEDDINGS, + MODEL = 'allMiniLM', + PARAMETERS = '{"ONNX_TOKENIZER_DLL_FILE_NAME":"tokenizers_cpp.dll" }', -- optional param to explicitly specify the DLL name + LOCAL_RUNTIME_PATH = 'C:/Runtimes' +); +``` +where the `C:/Runtimes` directory contains both onnxruntime.dll and tokenizers_cpp.dll. + +## Troubleshooting + +If you encounter build issues: +1. Ensure all prerequisites are installed +2. Check that submodules are properly initialized +3. Verify Visual Studio installation and C++ components +4. 
Check the build logs in the `build` directory
\ No newline at end of file
diff --git a/samples/examples/tokenizers_cpp/build.ps1 b/samples/examples/tokenizers_cpp/build.ps1
new file mode 100644
index 0000000..ca69cc2
--- /dev/null
+++ b/samples/examples/tokenizers_cpp/build.ps1
@@ -0,0 +1,93 @@
+param (
+    [Parameter(Position=0)]
+    [ValidateSet("Release", "Debug", "Debug Release", "Release Debug")]
+    [string]$BuildConfig = "Debug Release" # Default: build both Debug and Release
+)
+
+# Writes a message to the host, prefixed with its severity level.
+function Write-Log {
+    param(
+        [string]$Message,
+        [ValidateSet("INFO", "ERROR", "WARNING")]
+        [string]$Level = "INFO"
+    )
+    Write-Host "[$Level] $Message"
+}
+
+# Split the space-separated configuration list into individual configurations
+$configs = $BuildConfig.Split(" ", [StringSplitOptions]::RemoveEmptyEntries)
+Write-Log "Starting build process for configurations: $($configs -join ', ')"
+
+# Install rustup when it is missing
+Write-Log "Checking Rustup installation..."
+if (-not (Get-Command rustup -ErrorAction SilentlyContinue)) {
+    Write-Log "Rustup is not installed. Installing rustup..."
+    try {
+        # win.rustup.rs serves the Windows installer; sh.rustup.rs is the Unix shell script
+        Invoke-WebRequest -Uri https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
+        Start-Process -FilePath .\rustup-init.exe -ArgumentList '-y' -Wait
+        Remove-Item -Path .\rustup-init.exe -Force
+        # The installer only changes PATH for new processes, so add cargo's bin directory here
+        $cargoHome = if ($env:CARGO_HOME) { $env:CARGO_HOME } else { Join-Path $env:USERPROFILE ".cargo" }
+        $env:PATH = "$(Join-Path $cargoHome 'bin');$env:PATH"
+        if (-not (Get-Command rustup -ErrorAction SilentlyContinue)) {
+            Write-Log "Error: Failed to install rustup." "ERROR"
+            exit 1
+        }
+    }
+    catch {
+        Write-Log "Error installing Rustup: $_" "ERROR"
+        exit 1
+    }
+} else {
+    Write-Log "Rustup is already installed."
+}
+
+# Initialize submodules; git reports failure through its exit code, not an exception
+Write-Log "Setting up git submodules..."
+try {
+    git submodule update --init --recursive
+    if ($LASTEXITCODE -ne 0) { throw "git exited with code $LASTEXITCODE" }
+    Write-Log "Submodules updated successfully."
+}
+catch {
+    Write-Log "Error updating submodules: $_" "ERROR"
+    exit 1
+}
+
+# Create build directory
+Write-Log "Creating build directory..."
+$buildDir = "build"
+if (-not (Test-Path $buildDir)) {
+    New-Item -ItemType Directory -Path $buildDir | Out-Null
+}
+Push-Location $buildDir
+
+# Build each configuration; remember failures so the exit code covers all of them
+$failedConfigs = @()
+foreach ($config in $configs) {
+    Write-Log "Starting build for configuration: $config"
+    try {
+        Write-Log "Running CMake configuration..."
+        cmake .. -DCMAKE_BUILD_TYPE=$config
+        if ($LASTEXITCODE -ne 0) { throw "CMake configuration failed for $config." }
+
+        Write-Log "Building project..."
+        cmake --build . --config $config
+        if ($LASTEXITCODE -ne 0) { throw "CMake build failed for $config." }
+
+        Write-Log "Build completed successfully for $config"
+    }
+    catch {
+        Write-Log "Error: $_" "ERROR"
+        $failedConfigs += $config
+    }
+}
+
+Pop-Location
+# Exit non-zero if any configuration failed, not only the last one that ran
+if ($failedConfigs.Count -gt 0) {
+    Write-Log "Build failed for configurations: $($failedConfigs -join ', ')" "ERROR"
+    exit 1
+}
+exit 0
diff --git a/samples/examples/tokenizers_cpp/externals/tokenizers-cpp b/samples/examples/tokenizers_cpp/externals/tokenizers-cpp
new file mode 160000
index 0000000..b7f7763
--- /dev/null
+++ b/samples/examples/tokenizers_cpp/externals/tokenizers-cpp
@@ -0,0 +1 @@
+Subproject commit b7f7763b904d537dcbaa751b30de3390f1d600c0
diff --git a/samples/examples/tokenizers_cpp/src/tokenizer_exports.cpp b/samples/examples/tokenizers_cpp/src/tokenizer_exports.cpp
new file mode 100644
index 0000000..6683464
--- /dev/null
+++ b/samples/examples/tokenizers_cpp/src/tokenizer_exports.cpp
@@ -0,0 +1,25 @@
+// @File: tokenizer_exports.cpp
+//
+// Purpose:
+// This file contains the implementation of a function that loads a tokenizer from a JSON blob
+//
+//**************************************************************************************************
+
+#include "tokenizers_cpp.h"
+
+//--------------------------------------------------------------------------------------------------
+// Name: LoadBlobJsonAndEncode
+//
+// Description:
+// Exports a function for use in other modules or
applications.
+// Loads a tokenizer from a JSON blob and encodes the input text into token IDs.
+// Parameters:
+// json_blob - The serialized tokenizer JSON.
+// text - The input string to tokenize.
+// token_ids - Output vector; its previous contents are replaced by the encoded token IDs.
+// NOTE(review): no error handling here - confirm how FromBlobJSON reports a malformed blob.
+extern "C" __declspec(dllexport) void LoadBlobJsonAndEncode(const std::string& json_blob, const std::string& text, std::vector<int32_t>& token_ids)
+{
+    auto tokenizer = tokenizers::Tokenizer::FromBlobJSON(json_blob);
+    token_ids = tokenizer->Encode(text);
+}