diff --git a/Cargo.lock b/Cargo.lock index c2fd59685..74768f2a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9541,7 +9541,6 @@ dependencies = [ name = "ruvllm-wasm" version = "2.0.0" dependencies = [ - "bytemuck", "console_error_panic_hook", "js-sys", "serde", diff --git a/crates/ruvllm-wasm/Cargo.toml b/crates/ruvllm-wasm/Cargo.toml index 78ff8292a..095cd38c8 100644 --- a/crates/ruvllm-wasm/Cargo.toml +++ b/crates/ruvllm-wasm/Cargo.toml @@ -10,6 +10,9 @@ description = "WASM bindings for RuvLLM - browser-compatible LLM inference runti keywords = ["wasm", "llm", "inference", "browser", "webgpu"] categories = ["wasm", "api-bindings", "web-programming"] +[package.metadata.wasm-pack.profile.release] +wasm-opt = false + [lib] crate-type = ["cdylib", "rlib"] @@ -18,12 +21,12 @@ crate-type = ["cdylib", "rlib"] wasm-bindgen = "0.2" wasm-bindgen-futures = "0.4" js-sys = "0.3" +# Core web-sys features (always needed) web-sys = { version = "0.3", features = [ "console", "Performance", "Window", "Navigator", - # Web Workers support (enabled with parallel feature) "Worker", "WorkerOptions", "WorkerType", @@ -33,39 +36,6 @@ web-sys = { version = "0.3", features = [ "MessageEvent", "ErrorEvent", "DedicatedWorkerGlobalScope", - # WebGPU features (enabled with webgpu feature) - "Gpu", - "GpuAdapter", - "GpuAdapterInfo", - "GpuDevice", - "GpuQueue", - "GpuBuffer", - "GpuBufferDescriptor", - "GpuShaderModule", - "GpuShaderModuleDescriptor", - "GpuBindGroup", - "GpuBindGroupDescriptor", - "GpuBindGroupEntry", - "GpuBindGroupLayout", - "GpuBindGroupLayoutDescriptor", - "GpuBindGroupLayoutEntry", - "GpuBufferBinding", - "GpuBufferBindingLayout", - "GpuBufferBindingType", - "GpuComputePipeline", - "GpuComputePipelineDescriptor", - "GpuPipelineLayout", - "GpuPipelineLayoutDescriptor", - "GpuProgrammableStage", - "GpuCommandEncoder", - "GpuCommandEncoderDescriptor", - "GpuCommandBuffer", - "GpuComputePassEncoder", - "GpuComputePassDescriptor", - "gpu_map_mode", - 
"GpuRequestAdapterOptions", - "GpuDeviceDescriptor", - "GpuSupportedLimits", ] } # Serialization @@ -76,16 +46,27 @@ serde_json = "1.0" # Error handling console_error_panic_hook = { version = "0.1", optional = true } -# Byte casting for GPU buffers -bytemuck = { version = "1.14", features = ["derive"] } [dev-dependencies] wasm-bindgen-test = "0.3" [features] default = ["console_error_panic_hook"] -# WebGPU acceleration -webgpu = [] +# WebGPU acceleration (adds GPU compute pipeline, shader compilation, buffer management) +webgpu = ["web-sys/Gpu", "web-sys/GpuAdapter", "web-sys/GpuAdapterInfo", + "web-sys/GpuDevice", "web-sys/GpuQueue", "web-sys/GpuBuffer", + "web-sys/GpuBufferDescriptor", "web-sys/GpuShaderModule", + "web-sys/GpuShaderModuleDescriptor", "web-sys/GpuBindGroup", + "web-sys/GpuBindGroupDescriptor", "web-sys/GpuBindGroupEntry", + "web-sys/GpuBindGroupLayout", "web-sys/GpuBindGroupLayoutDescriptor", + "web-sys/GpuBindGroupLayoutEntry", "web-sys/GpuBufferBinding", + "web-sys/GpuBufferBindingLayout", "web-sys/GpuBufferBindingType", + "web-sys/GpuComputePipeline", "web-sys/GpuComputePipelineDescriptor", + "web-sys/GpuPipelineLayout", "web-sys/GpuPipelineLayoutDescriptor", + "web-sys/GpuProgrammableStage", "web-sys/GpuCommandEncoder", + "web-sys/GpuCommandEncoderDescriptor", "web-sys/GpuCommandBuffer", + "web-sys/GpuComputePassEncoder", "web-sys/GpuComputePassDescriptor", + "web-sys/GpuRequestAdapterOptions", "web-sys/GpuDeviceDescriptor"] # Enable parallel inference with Web Workers parallel = [] # Enable SIMD optimizations (requires wasm-simd target feature) diff --git a/docs/adr/ADR-084-ruvllm-wasm-publish.md b/docs/adr/ADR-084-ruvllm-wasm-publish.md new file mode 100644 index 000000000..61549c59c --- /dev/null +++ b/docs/adr/ADR-084-ruvllm-wasm-publish.md @@ -0,0 +1,83 @@ +# ADR-084: ruvllm-wasm — First Functional npm Publish + +**Status**: Accepted +**Date**: 2026-03-06 +**Authors**: RuVector Team +**Deciders**: ruv +**Related**: ADR-083 (Brain Training 
Loops), Issue #238 (placeholder deprecation) + +## 1. Context + +The `@ruvector/ruvllm-wasm` npm package (v0.1.0) was a placeholder — published without compiled WASM binaries. It was deprecated in PR #239. Meanwhile, the Rust crate `ruvllm-wasm` (v2.0.0) contains substantial working code: + +| Subsystem | Status | Exports | +|-----------|--------|---------| +| KV Cache (two-tier FP32+u8) | Working | `KvCacheWasm`, `KvCacheConfigWasm` | +| Memory (arena + buffer pool) | Working | `InferenceArenaWasm`, `BufferPoolWasm` | +| Chat Templates (7 formats) | Working | `ChatTemplateWasm`, `ChatMessageWasm` | +| HNSW Semantic Router | Working | `HnswRouterWasm`, `PatternWasm`, `RouteResultWasm` | +| MicroLoRA (rank 1-4) | Working | `MicroLoraWasm`, `AdaptFeedbackWasm` | +| SONA Instant Learning | Working | `SonaInstantWasm`, `SonaConfigWasm` | +| Web Workers | Working | `ParallelInference`, feature detection | +| WebGPU (matmul shader) | Feature-gated | `WebGpuInference`, `WebGpuContext` | +| IntelligentLLM (combined) | Commented out | Pending API compatibility | + +## 2. Decision + +### 2.1 Fix WASM Build + +The Rust 1.91 compiler has a codegen bug where release-profile optimizations produce invalid WASM (type mismatch: `expected i32, found f64` in wasm-bindgen post-processing). Debug builds validate fine. + +**Workaround**: Build with `codegen-units=256` + `lto=off`. This prevents cross-function optimization passes that trigger the bug while still producing optimized output. + +```bash +CARGO_PROFILE_RELEASE_CODEGEN_UNITS=256 \ +CARGO_PROFILE_RELEASE_LTO=off \ +wasm-pack build crates/ruvllm-wasm --target web --scope ruvector --release +``` + +Added `wasm-opt = false` to `[package.metadata.wasm-pack.profile.release]` since wasm-opt's validator also rejects the binary. + +### 2.2 Gate WebGPU Features + +WebGPU `web-sys` features (`gpu_map_mode`, `GpuSupportedLimits`, and 30 other GPU types) were compiled unconditionally, inflating binary size.
Moved the GPU web-sys features that are actually used behind the `webgpu` Cargo feature flag. + +Removed the unused `bytemuck` dependency and the unused `gpu_map_mode` / `GpuSupportedLimits` web-sys features (declared but never referenced in source). + +### 2.3 Publish as v2.0.0 + +Published `@ruvector/ruvllm-wasm@2.0.0` to npm with: +- Compiled WASM binary (~435 KB, ~150 KB gzipped) +- TypeScript definitions (`.d.ts`) +- ES module JS glue code +- Accurate README with working API examples + +### 2.4 README + +Replaced placeholder README with accurate documentation covering all exported types, working code examples, and browser compatibility table. + +## 3. Files Modified + +| File | Changes | +|------|---------| +| `crates/ruvllm-wasm/Cargo.toml` | Gate WebGPU features, remove unused bytemuck/gpu_map_mode/GpuSupportedLimits, add wasm-opt=false | +| `crates/ruvllm-wasm/pkg/README.md` | Complete rewrite with accurate API docs | +| `crates/ruvllm-wasm/pkg/` | Generated: `.wasm`, `.js`, `.d.ts` files | + +## 4. Build Artifact Details + +| File | Size | +|------|------| +| `ruvllm_wasm_bg.wasm` | 435 KB | +| `ruvllm_wasm.js` | 128 KB | +| `ruvllm_wasm.d.ts` | 45 KB | + +## 5. Known Limitations + +| Area | Limitation | Resolution Path | +|------|-----------|-----------------| +| Rust 1.91 codegen bug | Requires the `codegen-units=256` + `lto=off` build workaround and `wasm-opt = false` | Remove the workaround once a Rust compiler release containing the fix is available | +| IntelligentLLMWasm | Commented out, references non-existent `HnswRouterConfigWasm` | Create config struct or pass params directly | +| WebGPU attention | CPU fallback only (matmul has GPU path) | Implement attention WGSL shader pipeline | +| Worker pool | Uses `setTimeout` polling instead of proper task completion signals | Implement message-based completion tracking | +| GGUF model loading | Not yet wired (no `load_model_from_url`) | Requires streaming fetch + parser integration |