diff --git a/Cargo.lock b/Cargo.lock index d2f58d739..c2fd59685 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9296,23 +9296,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "ruvector-sparse-inference-wasm" -version = "2.0.5" -dependencies = [ - "console_error_panic_hook", - "getrandom 0.3.4", - "js-sys", - "ruvector-sparse-inference", - "serde", - "serde-wasm-bindgen", - "serde_json", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-bindgen-test", - "web-sys", -] - [[package]] name = "ruvector-temporal-tensor" version = "2.0.5" diff --git a/Cargo.toml b/Cargo.toml index 59e1b4cb7..7a48ba6e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -exclude = ["crates/micro-hnsw-wasm", "crates/ruvector-hyperbolic-hnsw", "crates/ruvector-hyperbolic-hnsw-wasm", "examples/ruvLLM/esp32", "examples/ruvLLM/esp32-flash", "examples/edge-net", "examples/data", "examples/ruvLLM", "examples/delta-behavior", "crates/rvf", "examples/rvf-desktop", "crates/mcp-brain-server"] +exclude = ["crates/micro-hnsw-wasm", "crates/ruvector-hyperbolic-hnsw", "crates/ruvector-hyperbolic-hnsw-wasm", "examples/ruvLLM/esp32", "examples/ruvLLM/esp32-flash", "examples/edge-net", "examples/data", "examples/ruvLLM", "examples/delta-behavior", "crates/rvf", "crates/rvf/*", "crates/rvf/*/*", "examples/rvf-desktop", "crates/mcp-brain-server"] members = [ "crates/ruvector-core", "crates/ruvector-node", @@ -54,7 +54,6 @@ members = [ "crates/ruvector-fpga-transformer", "crates/ruvector-fpga-transformer-wasm", "crates/ruvector-sparse-inference", - "crates/ruvector-sparse-inference-wasm", "crates/ruvector-math", "crates/ruvector-math-wasm", "examples/benchmarks", diff --git a/crates/ruvector-sparse-inference-wasm/src/lib.rs b/crates/ruvector-sparse-inference-wasm/src/lib.rs index d3e8cea8c..3949c17c7 100644 --- a/crates/ruvector-sparse-inference-wasm/src/lib.rs +++ b/crates/ruvector-sparse-inference-wasm/src/lib.rs @@ -1,7 +1,6 @@ use ruvector_sparse_inference::{ - model::{GenerationConfig, GgufParser, KVCache, ModelMetadata, ModelRunner}, - predictor::LowRankPredictor, - InferenceConfig, SparseModel, SparsityConfig, + model::{GgufParser, ModelMetadata, ModelRunner, SparseModel}, + InferenceConfig, LowRankPredictor, SparsityConfig, }; use wasm_bindgen::prelude::*; @@ -17,7 +16,6 @@ pub fn init() { pub struct SparseInferenceEngine { model: SparseModel, config: InferenceConfig, - predictors: Vec, } #[wasm_bindgen] @@ -31,13 +29,7 @@ impl SparseInferenceEngine { let model = GgufParser::parse(model_bytes) .map_err(|e| JsError::new(&format!("Failed to parse model: {}", e)))?; - let predictors = Self::init_predictors(&model, &config); - - Ok(Self { - model, - config, - predictors, - }) + Ok(Self { model, config }) } /// Load model with streaming (for large models) @@ -46,7 +38,6 @@ impl SparseInferenceEngine { url: &str, config_json: &str, ) -> Result { - // Fetch model in chunks let bytes = fetch_model_bytes(url).await?; Self::new(&bytes, config_json) } @@ -59,21 +50,6 @@ impl SparseInferenceEngine { .map_err(|e| JsError::new(&format!("Inference failed: {}", e))) } - /// Run text generation (for LLM models) - #[wasm_bindgen] - pub fn generate(&mut self, input_ids: &[u32], max_tokens: u32) -> Result, JsError> { - let config = GenerationConfig { - max_new_tokens: max_tokens as usize, - temperature: self.config.temperature, - top_k: self.config.top_k, - ..Default::default() - }; - - self.model - .generate(input_ids, &config) - .map_err(|e| JsError::new(&format!("Generation failed: {}", e))) - } - /// Get model metadata as JSON #[wasm_bindgen] pub fn metadata(&self) -> String { @@ -87,34 +63,14 @@ impl SparseInferenceEngine { serde_json::to_string(&stats).unwrap_or_default() } - /// Update sparsity threshold - #[wasm_bindgen] - pub fn set_sparsity(&mut self, threshold: f32) { - self.config.sparsity.threshold = threshold; - for predictor in &mut self.predictors { - predictor.set_threshold(threshold); - } - } - - /// Calibrate predictors with sample inputs + /// Calibrate with sample inputs #[wasm_bindgen] pub fn calibrate(&mut self, samples: &[f32], sample_dim: usize) -> Result<(), JsError> { let samples: Vec> = samples.chunks(sample_dim).map(|c| c.to_vec()).collect(); - self.model .calibrate(&samples) .map_err(|e| JsError::new(&format!("Calibration failed: {}", e))) } - - /// Initialize predictors for each layer - fn init_predictors(model: &SparseModel, config: &InferenceConfig) -> Vec { - let num_layers = model.metadata().num_layers; - let hidden_size = model.metadata().hidden_size; - - (0..num_layers) - .map(|_| LowRankPredictor::new(hidden_size, config.sparsity.threshold)) - .collect() - } } /// Embedding model wrapper for sentence transformers @@ -147,7 +103,6 @@ impl EmbeddingModel { pub fn encode_batch(&self, input_ids: &[u32], lengths: &[u32]) -> Result, JsError> { let mut results = Vec::new(); let mut offset = 0usize; - for &len in lengths { let len = len as usize; if offset + len > input_ids.len() { @@ -162,7 +117,6 @@ impl EmbeddingModel { results.extend(embedding); offset += len; } - Ok(results) } @@ -173,51 +127,6 @@ impl EmbeddingModel { } } -/// LLM model wrapper for text generation -#[wasm_bindgen] -pub struct LLMModel { - engine: SparseInferenceEngine, - kv_cache: KVCache, -} - -#[wasm_bindgen] -impl LLMModel { - #[wasm_bindgen(constructor)] - pub fn new(model_bytes: &[u8], config_json: &str) -> Result { - let engine = SparseInferenceEngine::new(model_bytes, config_json)?; - let cache_size = engine.model.metadata().max_position_embeddings; - let kv_cache = KVCache::new(cache_size); - Ok(Self { engine, kv_cache }) - } - - /// Generate next token - #[wasm_bindgen] - pub fn next_token(&mut self, input_ids: &[u32]) -> Result { - self.engine - .model - .next_token(input_ids, &mut self.kv_cache) - .map_err(|e| JsError::new(&format!("Generation failed: {}", e))) - } - - /// Generate multiple tokens - #[wasm_bindgen] - pub fn generate(&mut self, input_ids: &[u32], max_tokens: u32) -> Result, JsError> { - self.engine.generate(input_ids, max_tokens) - } - - /// Reset KV cache (for new conversation) - #[wasm_bindgen] - pub fn reset_cache(&mut self) { - self.kv_cache.clear(); - } - - /// Get generation statistics - #[wasm_bindgen] - pub fn stats(&self) -> String { - serde_json::to_string(&self.engine.model.generation_stats()).unwrap_or_default() - } -} - /// Performance measurement utilities #[wasm_bindgen] pub fn measure_inference_time( diff --git a/crates/rvf/rvf-node/src/lib.rs b/crates/rvf/rvf-node/src/lib.rs index c22d4f574..09606d395 100644 --- a/crates/rvf/rvf-node/src/lib.rs +++ b/crates/rvf/rvf-node/src/lib.rs @@ -54,6 +54,8 @@ pub struct RvfOptions { pub metric: Option, /// Hardware profile: 0=Generic, 1=Core, 2=Hot, 3=Full. Defaults to 0. pub profile: Option, + /// Compression profile: "None" | "Scalar" | "Product". Defaults to "None". + pub compression: Option, /// Whether segment signing is enabled. Defaults to false. pub signing: Option, /// HNSW M parameter. Defaults to 16. @@ -166,16 +168,34 @@ fn parse_metric(s: &str) -> Result { } } +fn parse_compression(s: &str) -> Result { + use rvf_runtime::options::CompressionProfile; + match s { + "None" | "none" => Ok(CompressionProfile::None), + "Scalar" | "scalar" => Ok(CompressionProfile::Scalar), + "Product" | "product" => Ok(CompressionProfile::Product), + _ => Err(napi::Error::from_reason(format!( + "Invalid compression '{s}'. Expected 'None', 'Scalar', or 'Product'." + ))), + } +} + fn js_options_to_rust(opts: &RvfOptions) -> Result { let metric = match &opts.metric { Some(m) => parse_metric(m)?, None => DistanceMetric::L2, }; + let compression = match &opts.compression { + Some(c) => parse_compression(c)?, + None => rvf_runtime::options::CompressionProfile::None, + }; + Ok(RustRvfOptions { dimension: opts.dimension as u16, metric, profile: opts.profile.unwrap_or(0) as u8, + compression, signing: opts.signing.unwrap_or(false), m: opts.m.unwrap_or(16) as u16, ef_construction: opts.ef_construction.unwrap_or(200) as u16, diff --git a/examples/google-cloud/src/self_learning.rs b/examples/google-cloud/src/self_learning.rs index fce36a055..ff30c9e55 100644 --- a/examples/google-cloud/src/self_learning.rs +++ b/examples/google-cloud/src/self_learning.rs @@ -268,7 +268,8 @@ pub struct AutonomousModel { impl AutonomousModel { pub fn new(input_dim: usize, hidden_dim: usize, _output_dim: usize) -> Self { - let gnn_layer = RuvectorLayer::new(input_dim, hidden_dim, 8, 0.1); + let gnn_layer = RuvectorLayer::new(input_dim, hidden_dim, 8, 0.1) + .expect("Failed to create GNN layer"); let optimizer = Optimizer::new(OptimizerType::Adam { learning_rate: 0.001, diff --git a/examples/onnx-embeddings/src/model.rs b/examples/onnx-embeddings/src/model.rs index e77c42ce0..c8ff38df7 100644 --- a/examples/onnx-embeddings/src/model.rs +++ b/examples/onnx-embeddings/src/model.rs @@ -185,9 +185,11 @@ impl OnnxModel { let inputs: Vec = session.inputs.iter().map(|i| i.name.clone()).collect(); let outputs: Vec = session.outputs.iter().map(|o| o.name.clone()).collect(); - // Default embedding dimension (will be determined at runtime from actual output) - // Most sentence-transformers models output 384 dimensions - let dimension = 384; + // Try to detect embedding dimension from the ONNX output tensor metadata. + // The last dimension of the first output is typically the hidden/embedding size. + let dimension = Self::detect_dimension_from_outputs(session) + .or_else(|| Self::detect_dimension_from_path(path)) + .unwrap_or(384); let name = path .file_stem() @@ -204,6 +206,51 @@ impl OnnxModel { }) } + /// Try to read the embedding dimension from the ONNX output tensor shape metadata. + /// Returns `None` if the dimension is dynamic or unavailable. + fn detect_dimension_from_outputs(session: &Session) -> Option { + if let Some(output) = session.outputs.first() { + // In ort 2.0, tensor_shape() returns Option<&Shape> where Shape contains i64 dims. + // Dynamic dimensions are -1; we want the last fixed (positive) dimension, + // which is typically the embedding/hidden size. + if let Some(shape) = output.output_type.tensor_shape() { + for &d in shape.iter().rev() { + if d > 0 { + debug!("Detected embedding dimension from ONNX output metadata: {}", d); + return Some(d as usize); + } + } + } + } + None + } + + /// Heuristic: infer dimension from known model names in the file path. + fn detect_dimension_from_path(path: &Path) -> Option { + let path_str = path.to_string_lossy().to_lowercase(); + let mappings: &[(&str, usize)] = &[ + ("mpnet-base", 768), + ("all-mpnet", 768), + ("bge-large", 1024), + ("bge-base", 768), + ("bge-small", 384), + ("e5-large", 1024), + ("e5-base", 768), + ("e5-small", 384), + ("gte-large", 1024), + ("gte-base", 768), + ("gte-small", 384), + ("minilm", 384), + ]; + for &(pattern, dim) in mappings { + if path_str.contains(pattern) { + debug!("Inferred embedding dimension {} from path pattern '{}'", dim, pattern); + return Some(dim); + } + } + None + } + /// Run inference on encoded inputs #[instrument(skip_all, fields(batch_size, seq_length))] pub fn run( @@ -236,19 +283,23 @@ impl OnnxModel { )) .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?; - let token_type_ids_tensor = Tensor::from_array(( - vec![batch_size, seq_length], - token_type_ids.to_vec().into_boxed_slice(), - )) - .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?; - - // Build inputs vector - let inputs = vec![ + // Build inputs vector — only include token_type_ids if the model expects it. + // Models like DistilBERT and many newer transformers do not use token_type_ids + // and will crash if it is provided. + let mut inputs = vec![ ("input_ids", input_ids_tensor.into_dyn()), ("attention_mask", attention_mask_tensor.into_dyn()), - ("token_type_ids", token_type_ids_tensor.into_dyn()), ]; + if self.info.input_names.iter().any(|n| n == "token_type_ids") { + let token_type_ids_tensor = Tensor::from_array(( + vec![batch_size, seq_length], + token_type_ids.to_vec().into_boxed_slice(), + )) + .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?; + inputs.push(("token_type_ids", token_type_ids_tensor.into_dyn())); + } + // Run inference let outputs = self.session.run(inputs) .map_err(EmbeddingError::OnnxRuntime)?; diff --git a/npm/packages/ruvllm/bin/cli.js b/npm/packages/ruvllm/bin/cli.js index b5d120740..7767c4fac 100755 --- a/npm/packages/ruvllm/bin/cli.js +++ b/npm/packages/ruvllm/bin/cli.js @@ -72,6 +72,8 @@ async function runQuery(llm, text, flags) { } async function runGenerate(llm, prompt, flags) { + console.error('Warning: Built-in SIMD inference is experimental. For production use, configure an external LLM provider (Ollama, OpenAI, etc.).'); + const config = {}; if (flags.temperature) config.temperature = parseFloat(flags.temperature); if (flags['max-tokens']) config.maxTokens = parseInt(flags['max-tokens']); @@ -106,19 +108,33 @@ async function runMemorySearch(llm, query, flags) { } async function runStats(llm, flags) { - const stats = llm.stats(); + let stats; + try { + stats = llm.stats(); + } catch (e) { + stats = null; + } + + if (!stats) { + if (flags.json) { + console.log(formatJson({ error: 'No inference data available' })); + } else { + console.log('\nNo inference data available. Run some queries first.'); + } + return; + } if (flags.json) { console.log(formatJson(stats)); } else { console.log('\nRuvLLM Statistics:'); console.log(formatTable({ - 'Total Queries': stats.totalQueries, - 'Memory Nodes': stats.memoryNodes, - 'Patterns Learned': stats.patternsLearned, - 'Avg Latency': `${stats.avgLatencyMs.toFixed(2)}ms`, - 'Cache Hit Rate': `${(stats.cacheHitRate * 100).toFixed(1)}%`, - 'Router Accuracy': `${(stats.routerAccuracy * 100).toFixed(1)}%`, + 'Total Queries': stats.totalQueries ?? 0, + 'Memory Nodes': stats.memoryNodes ?? 0, + 'Patterns Learned': stats.patternsLearned ?? 0, + 'Avg Latency': `${(stats.avgLatencyMs ?? 0).toFixed(2)}ms`, + 'Cache Hit Rate': `${((stats.cacheHitRate ?? 0) * 100).toFixed(1)}%`, + 'Router Accuracy': `${((stats.routerAccuracy ?? 0) * 100).toFixed(1)}%`, })); } }