3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "samples/examples/tokenizers_cpp/externals/tokenizers-cpp"]
path = samples/examples/tokenizers_cpp/externals/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp
26 changes: 26 additions & 0 deletions samples/examples/tokenizers_cpp/CMakeLists.txt
@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.20)

# Set the project name
project(tokenizers_cpp_onnx)

# Set the C++ standard to C++17
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Add the tokenizers-cpp submodule as a subdirectory
add_subdirectory(externals/tokenizers-cpp)

# Define the shared library to be generated
add_library(tokenizers_cpp_onnx SHARED src/tokenizer_exports.cpp)

# Set the output name to "tokenizers_cpp" for the final DLL
set_target_properties(tokenizers_cpp_onnx
    PROPERTIES
        OUTPUT_NAME "tokenizers_cpp"
)

# Link against tokenizers_cpp
target_link_libraries(tokenizers_cpp_onnx PRIVATE tokenizers_cpp)

# Specify include directories
target_include_directories(tokenizers_cpp_onnx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/externals/tokenizers-cpp/include)
93 changes: 93 additions & 0 deletions samples/examples/tokenizers_cpp/README.md
@@ -0,0 +1,93 @@
# SQL Server Language Extensions - Tokenizers

This project provides a C++ tokenizer library (`tokenizers_cpp.dll`) that can be used with SQL Server Language Extensions. It includes implementations of several tokenizers, covering both HuggingFace and SentencePiece tokenization.
It is a wrapper on top of the [tokenizers-cpp project](https://github.com/mlc-ai/tokenizers-cpp) and provides a standardized interface for SQL Server Language Extensions.
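
The standardized interface is a single C-linkage entry point exported from `src/tokenizer_exports.cpp`:

```cpp
// Loads a tokenizer from a serialized JSON blob and encodes the input text into token IDs.
extern "C" __declspec(dllexport) void LoadBlobJsonAndEncode(
    const std::string& json_blob,      // serialized tokenizer JSON (e.g. tokenizer.json contents)
    const std::string& text,           // input string to tokenize
    std::vector<int32_t>& token_ids);  // output vector receiving the encoded token IDs
```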

## Overview

This library serves as a bridge between SQL Server and various tokenization implementations, allowing efficient text processing and tokenization within SQL Server operations. It's particularly useful for:

- Natural Language Processing (NLP) tasks in SQL Server
- Text preprocessing for machine learning models
- Standardized tokenization across different SQL Server instances

## Project Structure

```
├── build.ps1                  # Main build script for Windows
├── CMakeLists.txt             # Main CMake configuration
├── src/
│   └── tokenizer_exports.cpp  # Main tokenizer exports
└── externals/
    └── tokenizers-cpp/        # tokenizers-cpp submodule (mlc-ai)
```

## Prerequisites

- Windows operating system
- PowerShell
- CMake (3.20 or higher)
- Rust toolchain (will be automatically installed by the build script if missing)
- Visual Studio 2022 or higher with C++ development tools

## Building the Project

1. Clone the repository with submodules:
```powershell
git clone --recursive <repository-url>
```

2. Run the build script:
```powershell
.\build.ps1
```

By default, the script builds both Debug and Release configurations. You can also build a single configuration:
```powershell
.\build.ps1 -BuildConfig "Release" # For Release only
.\build.ps1 -BuildConfig "Debug" # For Debug only
```

The build process will:
- Check and install Rust if needed
- Initialize and update git submodules
- Create a build directory
- Configure and build the project using CMake
- Generate the tokenizers_cpp.dll library

## Build Output

After a successful build, you can find the compiled library at:
- Release: `build/Release/tokenizers_cpp.dll`
- Debug: `build/Debug/tokenizers_cpp.dll`

## Usage

The output binary supports the following tokenizer implementations (see the API sketch after this list):

1. HuggingFace Tokenizer (`huggingface_tokenizer.cc`)
2. SentencePiece Tokenizer (`sentencepiece_tokenizer.cc`)
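
Both tokenizer paths come from the underlying tokenizers-cpp library. The sketch below is illustrative only, assuming that library's `Tokenizer::FromBlobJSON` and `Tokenizer::FromBlobSentencePiece` factory functions and its `Encode` method; `tokenizers_cpp.dll` itself currently exports only the JSON path:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "tokenizers_cpp.h"

// Read an entire file (e.g. tokenizer.json or a SentencePiece .model) into a string blob.
static std::string ReadBlob(const std::string& path) {
    std::ifstream in(path, std::ios::binary);
    std::ostringstream ss;
    ss << in.rdbuf();
    return ss.str();
}

int main() {
    // HuggingFace tokenizers: construct from a serialized tokenizer.json blob.
    auto hf = tokenizers::Tokenizer::FromBlobJSON(ReadBlob("tokenizer.json"));
    std::vector<int32_t> hf_ids = hf->Encode("hello world");
    std::cout << "HuggingFace tokens: " << hf_ids.size() << "\n";

    // SentencePiece: construct from a serialized .model blob.
    auto sp = tokenizers::Tokenizer::FromBlobSentencePiece(ReadBlob("tokenizer.model"));
    std::vector<int32_t> sp_ids = sp->Encode("hello world");
    std::cout << "SentencePiece tokens: " << sp_ids.size() << "\n";
    return 0;
}
```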

The output binary needs to be placed in the same directory as the ONNX runtime, and that directory is supplied as `LOCAL_RUNTIME_PATH` in the external model T-SQL query.

Example:
```sql
CREATE EXTERNAL MODEL myLocalOnnxModel
WITH (
LOCATION = 'C:/Models/sentence-transformers_all-MiniLM-L6-v2',
API_FORMAT = 'ONNX Runtime',
MODEL_TYPE = EMBEDDINGS,
MODEL = 'allMiniLM',
PARAMETERS = '{"ONNX_TOKENIZER_DLL_FILE_NAME":"tokenizers_cpp.dll" }', -- optional parameter to explicitly specify the DLL name
LOCAL_RUNTIME_PATH = 'C:/Runtimes'
);
```
where the `C:/Runtimes` directory contains both `onnxruntime.dll` and `tokenizers_cpp.dll`.
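
For example:
```
C:/Runtimes
├── onnxruntime.dll
└── tokenizers_cpp.dll
```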

## Troubleshooting

If you encounter build issues:
1. Ensure all prerequisites are installed
2. Check that submodules are properly initialized
3. Verify Visual Studio installation and C++ components
4. Check the build logs in the `build` directory
93 changes: 93 additions & 0 deletions samples/examples/tokenizers_cpp/build.ps1
@@ -0,0 +1,93 @@
param (
    [Parameter(Position=0)]
    [ValidateSet("Release", "Debug", "Debug Release", "Release Debug")]
    [string]$BuildConfig = "Debug Release" # Default to building both Debug and Release if no argument is provided
)

function Write-Log {
    param(
        [string]$Message,
        [ValidateSet("INFO", "ERROR", "WARNING")]
        [string]$Level = "INFO"
    )

    $logMessage = "[$Level] $Message"
    Write-Host $logMessage
}

# Parse build configurations
$configs = $BuildConfig.Split(" ", [StringSplitOptions]::RemoveEmptyEntries)
Write-Log "Starting build process for configurations: $($configs -join ', ')"

# Check if rustup is installed and install it if not
Write-Log "Checking Rustup installation..."
if (-not (Get-Command rustup -ErrorAction SilentlyContinue)) {
    Write-Log "Rustup is not installed. Installing rustup..."
    try {
        # win.rustup.rs serves the Windows rustup-init.exe installer
        Invoke-WebRequest -Uri https://win.rustup.rs/x86_64 -OutFile rustup-init.exe
        Start-Process -FilePath .\rustup-init.exe -ArgumentList '-y' -Wait
        Remove-Item -Path .\rustup-init.exe -Force
        # rustup installs to %USERPROFILE%\.cargo\bin; add it to this session's PATH so the
        # freshly installed toolchain is visible without restarting the shell
        $env:Path = "$env:USERPROFILE\.cargo\bin;$env:Path"
        if (-not (Get-Command rustup -ErrorAction SilentlyContinue)) {
            Write-Log "Error: Failed to install rustup." "ERROR"
            exit 1
        }
    }
    catch {
        Write-Log "Error installing Rustup: $_" "ERROR"
        exit 1
    }
} else {
    Write-Log "Rustup is already installed."
}

# Initialize and update git submodules
Write-Log "Setting up git submodules..."
git submodule update --init --recursive
if ($LASTEXITCODE -ne 0) {
    # git is a native command, so failures surface through the exit code rather than try/catch
    Write-Log "Error updating submodules." "ERROR"
    exit 1
}
Write-Log "Submodules updated successfully."

# Create build directory
Write-Log "Creating build directory..."
$buildDir = "build"
if (-not (Test-Path $buildDir)) {
    New-Item -ItemType Directory -Path $buildDir | Out-Null
}
Push-Location $buildDir

# Track the overall result across configurations
$exitCode = 0

# Build for each configuration
foreach ($config in $configs) {
    Write-Log "Starting build for configuration: $config"

    try {
        # Use CMake to configure and build the project
        Write-Log "Running CMake configuration..."
        cmake .. -DCMAKE_BUILD_TYPE=$config
        if ($LASTEXITCODE -ne 0) {
            Write-Log "Error: CMake configuration failed for $config." "ERROR"
            $exitCode = 1
            continue
        }

        Write-Log "Building project..."
        cmake --build . --config $config
        if ($LASTEXITCODE -ne 0) {
            Write-Log "Error: CMake build failed for $config." "ERROR"
            $exitCode = 1
            continue
        }

        Write-Log "Build completed successfully for $config"
    }
    catch {
        Write-Log "Unexpected error during $config build: $_" "ERROR"
        $exitCode = 1
        continue
    }
}

Pop-Location

# Exit non-zero if any configuration failed to build
exit $exitCode
1 change: 1 addition & 0 deletions samples/examples/tokenizers_cpp/externals/tokenizers-cpp
Submodule tokenizers-cpp added at b7f776
25 changes: 25 additions & 0 deletions samples/examples/tokenizers_cpp/src/tokenizer_exports.cpp
@@ -0,0 +1,25 @@
// @File: tokenizer_exports.cpp
//
// Purpose:
//  This file implements the exported function that loads a tokenizer from a JSON blob and
//  encodes input text into token IDs.
//
//**************************************************************************************************

#include "tokenizers_cpp.h"

//--------------------------------------------------------------------------------------------------
// Name: LoadBlobJsonAndEncode
//
// Description:
// Exports a function for use in other modules or applications.
// Loads a tokenizer from a JSON blob and encodes the input text into token IDs.
// Parameters:
// json_blob - The serialized tokenizer JSON.
// text - The input string to tokenize.
// token_ids - Output vector to receive the encoded token IDs.
//
extern "C" __declspec(dllexport) void LoadBlobJsonAndEncode(const std::string& json_blob, const std::string& text, std::vector<int32_t>& token_ids)
{
auto tokenizer = tokenizers::Tokenizer::FromBlobJSON(json_blob);
token_ids = tokenizer->Encode(text);
}
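
//--------------------------------------------------------------------------------------------------
// Illustrative host-side usage (a sketch, not part of the sample): how a host process could bind
// to the export above. It assumes the host is built with the same compiler and C++ runtime as the
// DLL, because std::string and std::vector are passed across the DLL boundary.
//
#if 0 // illustration only
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include <windows.h>

// Signature of the exported entry point; must match the definition above.
using LoadBlobJsonAndEncodeFn =
    void (*)(const std::string&, const std::string&, std::vector<int32_t>&);

int main()
{
    HMODULE dll = LoadLibraryA("tokenizers_cpp.dll");
    if (dll == nullptr) return 1;

    auto encode = reinterpret_cast<LoadBlobJsonAndEncodeFn>(
        GetProcAddress(dll, "LoadBlobJsonAndEncode"));
    if (encode == nullptr) return 1;

    std::string json_blob = ""; // contents of tokenizer.json would go here
    std::vector<int32_t> token_ids;
    encode(json_blob, "hello world", token_ids);

    std::cout << "token count: " << token_ids.size() << std::endl;
    FreeLibrary(dll);
    return 0;
}
#endif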