diff --git a/Cargo.lock b/Cargo.lock
index d2f58d739..c2fd59685 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9296,23 +9296,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "ruvector-sparse-inference-wasm"
-version = "2.0.5"
-dependencies = [
- "console_error_panic_hook",
- "getrandom 0.3.4",
- "js-sys",
- "ruvector-sparse-inference",
- "serde",
- "serde-wasm-bindgen",
- "serde_json",
- "wasm-bindgen",
- "wasm-bindgen-futures",
- "wasm-bindgen-test",
- "web-sys",
-]
-
 [[package]]
 name = "ruvector-temporal-tensor"
 version = "2.0.5"
diff --git a/Cargo.toml b/Cargo.toml
index 59e1b4cb7..7a48ba6e9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-exclude = ["crates/micro-hnsw-wasm", "crates/ruvector-hyperbolic-hnsw", "crates/ruvector-hyperbolic-hnsw-wasm", "examples/ruvLLM/esp32", "examples/ruvLLM/esp32-flash", "examples/edge-net", "examples/data", "examples/ruvLLM", "examples/delta-behavior", "crates/rvf", "examples/rvf-desktop", "crates/mcp-brain-server"]
+exclude = ["crates/micro-hnsw-wasm", "crates/ruvector-hyperbolic-hnsw", "crates/ruvector-hyperbolic-hnsw-wasm", "examples/ruvLLM/esp32", "examples/ruvLLM/esp32-flash", "examples/edge-net", "examples/data", "examples/ruvLLM", "examples/delta-behavior", "crates/rvf", "crates/rvf/*", "crates/rvf/*/*", "examples/rvf-desktop", "crates/mcp-brain-server"]
 members = [
     "crates/ruvector-core",
     "crates/ruvector-node",
@@ -54,7 +54,6 @@ members = [
     "crates/ruvector-fpga-transformer",
     "crates/ruvector-fpga-transformer-wasm",
     "crates/ruvector-sparse-inference",
-    "crates/ruvector-sparse-inference-wasm",
     "crates/ruvector-math",
     "crates/ruvector-math-wasm",
     "examples/benchmarks",
diff --git a/crates/ruvector-sparse-inference-wasm/src/lib.rs b/crates/ruvector-sparse-inference-wasm/src/lib.rs
index d3e8cea8c..3949c17c7 100644
--- a/crates/ruvector-sparse-inference-wasm/src/lib.rs
+++ b/crates/ruvector-sparse-inference-wasm/src/lib.rs
@@ -1,7 +1,6 @@
 use ruvector_sparse_inference::{
-    model::{GenerationConfig, GgufParser, KVCache, ModelMetadata, ModelRunner},
-    predictor::LowRankPredictor,
-    InferenceConfig, SparseModel, SparsityConfig,
+    model::{GgufParser, ModelMetadata, ModelRunner, SparseModel},
+    InferenceConfig, LowRankPredictor, SparsityConfig,
 };
 use wasm_bindgen::prelude::*;
 
@@ -17,7 +16,6 @@ pub fn init() {
 pub struct SparseInferenceEngine {
     model: SparseModel,
     config: InferenceConfig,
-    predictors: Vec<LowRankPredictor>,
 }
 
 #[wasm_bindgen]
@@ -31,13 +29,7 @@ impl SparseInferenceEngine {
         let model = GgufParser::parse(model_bytes)
             .map_err(|e| JsError::new(&format!("Failed to parse model: {}", e)))?;
 
-        let predictors = Self::init_predictors(&model, &config);
-
-        Ok(Self {
-            model,
-            config,
-            predictors,
-        })
+        Ok(Self { model, config })
     }
 
     /// Load model with streaming (for large models)
@@ -46,7 +38,6 @@ impl SparseInferenceEngine {
         url: &str,
         config_json: &str,
     ) -> Result<SparseInferenceEngine, JsError> {
-        // Fetch model in chunks
         let bytes = fetch_model_bytes(url).await?;
         Self::new(&bytes, config_json)
     }
@@ -59,21 +50,6 @@ impl SparseInferenceEngine {
             .map_err(|e| JsError::new(&format!("Inference failed: {}", e)))
     }
 
-    /// Run text generation (for LLM models)
-    #[wasm_bindgen]
-    pub fn generate(&mut self, input_ids: &[u32], max_tokens: u32) -> Result<Vec<u32>, JsError> {
-        let config = GenerationConfig {
-            max_new_tokens: max_tokens as usize,
-            temperature: self.config.temperature,
-            top_k: self.config.top_k,
-            ..Default::default()
-        };
-
-        self.model
-            .generate(input_ids, &config)
-            .map_err(|e| JsError::new(&format!("Generation failed: {}", e)))
-    }
-
     /// Get model metadata as JSON
     #[wasm_bindgen]
     pub fn metadata(&self) -> String {
@@ -87,34 +63,14 @@ impl SparseInferenceEngine {
         serde_json::to_string(&stats).unwrap_or_default()
     }
 
-    /// Update sparsity threshold
-    #[wasm_bindgen]
-    pub fn set_sparsity(&mut self, threshold: f32) {
-        self.config.sparsity.threshold = threshold;
-        for predictor in &mut self.predictors {
-            predictor.set_threshold(threshold);
-        }
-    }
-
-    /// Calibrate predictors with sample inputs
+    /// Calibrate with `samples` split into `sample_dim`-sized rows (`sample_dim` must be non-zero)
     #[wasm_bindgen]
     pub fn calibrate(&mut self, samples: &[f32], sample_dim: usize) -> Result<(), JsError> {
         let samples: Vec<Vec<f32>> = samples.chunks(sample_dim).map(|c| c.to_vec()).collect();
-
         self.model
             .calibrate(&samples)
             .map_err(|e| JsError::new(&format!("Calibration failed: {}", e)))
     }
-
-    /// Initialize predictors for each layer
-    fn init_predictors(model: &SparseModel, config: &InferenceConfig) -> Vec<LowRankPredictor> {
-        let num_layers = model.metadata().num_layers;
-        let hidden_size = model.metadata().hidden_size;
-
-        (0..num_layers)
-            .map(|_| LowRankPredictor::new(hidden_size, config.sparsity.threshold))
-            .collect()
-    }
 }
 
 /// Embedding model wrapper for sentence transformers
@@ -147,7 +103,6 @@ impl EmbeddingModel {
     pub fn encode_batch(&self, input_ids: &[u32], lengths: &[u32]) -> Result<Vec<f32>, JsError> {
         let mut results = Vec::new();
         let mut offset = 0usize;
-
         for &len in lengths {
             let len = len as usize;
             if offset + len > input_ids.len() {
@@ -162,7 +117,6 @@ impl EmbeddingModel {
             results.extend(embedding);
             offset += len;
         }
-
         Ok(results)
     }
 
@@ -173,51 +127,6 @@ impl EmbeddingModel {
     }
 }
 
-/// LLM model wrapper for text generation
-#[wasm_bindgen]
-pub struct LLMModel {
-    engine: SparseInferenceEngine,
-    kv_cache: KVCache,
-}
-
-#[wasm_bindgen]
-impl LLMModel {
-    #[wasm_bindgen(constructor)]
-    pub fn new(model_bytes: &[u8], config_json: &str) -> Result<LLMModel, JsError> {
-        let engine = SparseInferenceEngine::new(model_bytes, config_json)?;
-        let cache_size = engine.model.metadata().max_position_embeddings;
-        let kv_cache = KVCache::new(cache_size);
-        Ok(Self { engine, kv_cache })
-    }
-
-    /// Generate next token
-    #[wasm_bindgen]
-    pub fn next_token(&mut self, input_ids: &[u32]) -> Result<u32, JsError> {
-        self.engine
-            .model
-            .next_token(input_ids, &mut self.kv_cache)
-            .map_err(|e| JsError::new(&format!("Generation failed: {}", e)))
-    }
-
-    /// Generate multiple tokens
-    #[wasm_bindgen]
-    pub fn generate(&mut self, input_ids: &[u32], max_tokens: u32) -> Result<Vec<u32>, JsError> {
-        self.engine.generate(input_ids, max_tokens)
-    }
-
-    /// Reset KV cache (for new conversation)
-    #[wasm_bindgen]
-    pub fn reset_cache(&mut self) {
-        self.kv_cache.clear();
-    }
-
-    /// Get generation statistics
-    #[wasm_bindgen]
-    pub fn stats(&self) -> String {
-        serde_json::to_string(&self.engine.model.generation_stats()).unwrap_or_default()
-    }
-}
-
 /// Performance measurement utilities
 #[wasm_bindgen]
 pub fn measure_inference_time(
diff --git a/crates/rvf/rvf-node/src/lib.rs b/crates/rvf/rvf-node/src/lib.rs
index c22d4f574..09606d395 100644
--- a/crates/rvf/rvf-node/src/lib.rs
+++ b/crates/rvf/rvf-node/src/lib.rs
@@ -54,6 +54,8 @@ pub struct RvfOptions {
     pub metric: Option<String>,
     /// Hardware profile: 0=Generic, 1=Core, 2=Hot, 3=Full. Defaults to 0.
     pub profile: Option<u32>,
+    /// Compression profile: "None" | "Scalar" | "Product". Defaults to "None".
+    pub compression: Option<String>,
     /// Whether segment signing is enabled. Defaults to false.
     pub signing: Option<bool>,
     /// HNSW M parameter. Defaults to 16.
@@ -166,16 +168,34 @@ fn parse_metric(s: &str) -> Result<DistanceMetric> {
     }
 }
 
+/// Translate a JS-facing compression name into a runtime profile; unknown names are errors.
+fn parse_compression(s: &str) -> Result<rvf_runtime::options::CompressionProfile> {
+    Ok(match s {
+        "None" | "none" => rvf_runtime::options::CompressionProfile::None,
+        "Scalar" | "scalar" => rvf_runtime::options::CompressionProfile::Scalar,
+        "Product" | "product" => rvf_runtime::options::CompressionProfile::Product,
+        _ => return Err(napi::Error::from_reason(format!(
+            "Invalid compression '{s}'. Expected 'None', 'Scalar', or 'Product'."
+        ))),
+    })
+}
+
 fn js_options_to_rust(opts: &RvfOptions) -> Result<RustRvfOptions> {
     let metric = match &opts.metric {
         Some(m) => parse_metric(m)?,
         None => DistanceMetric::L2,
     };
 
+    let compression = match &opts.compression {
+        Some(c) => parse_compression(c)?,
+        None => rvf_runtime::options::CompressionProfile::None,
+    };
+
     Ok(RustRvfOptions {
         dimension: opts.dimension as u16,
         metric,
         profile: opts.profile.unwrap_or(0) as u8,
+        compression,
         signing: opts.signing.unwrap_or(false),
         m: opts.m.unwrap_or(16) as u16,
         ef_construction: opts.ef_construction.unwrap_or(200) as u16,
diff --git a/examples/google-cloud/src/self_learning.rs b/examples/google-cloud/src/self_learning.rs
index fce36a055..ff30c9e55 100644
--- a/examples/google-cloud/src/self_learning.rs
+++ b/examples/google-cloud/src/self_learning.rs
@@ -268,7 +268,8 @@ pub struct AutonomousModel {
 
 impl AutonomousModel {
     pub fn new(input_dim: usize, hidden_dim: usize, _output_dim: usize) -> Self {
-        let gnn_layer = RuvectorLayer::new(input_dim, hidden_dim, 8, 0.1);
+        let gnn_layer = RuvectorLayer::new(input_dim, hidden_dim, 8, 0.1)
+            .expect("Failed to create GNN layer");
 
         let optimizer = Optimizer::new(OptimizerType::Adam {
             learning_rate: 0.001,
diff --git a/examples/onnx-embeddings/src/model.rs b/examples/onnx-embeddings/src/model.rs
index e77c42ce0..c8ff38df7 100644
--- a/examples/onnx-embeddings/src/model.rs
+++ b/examples/onnx-embeddings/src/model.rs
@@ -185,9 +185,11 @@ impl OnnxModel {
         let inputs: Vec<String> = session.inputs.iter().map(|i| i.name.clone()).collect();
         let outputs: Vec<String> = session.outputs.iter().map(|o| o.name.clone()).collect();
 
-        // Default embedding dimension (will be determined at runtime from actual output)
-        // Most sentence-transformers models output 384 dimensions
-        let dimension = 384;
+        // Try to detect embedding dimension from the ONNX output tensor metadata.
+        // The last dimension of the first output is typically the hidden/embedding size.
+        let dimension = Self::detect_dimension_from_outputs(session)
+            .or_else(|| Self::detect_dimension_from_path(path))
+            .unwrap_or(384);
 
         let name = path
             .file_stem()
@@ -204,6 +206,51 @@ impl OnnxModel {
         })
     }
 
+    /// Try to read the embedding dimension from the ONNX output tensor shape metadata.
+    ///
+    /// Only the trailing axis of the first output is inspected. Returns `None` when that
+    /// axis is dynamic (not positive) or no tensor shape is available, so an earlier fixed
+    /// axis such as a pinned batch size or sequence length is never mistaken for it.
+    fn detect_dimension_from_outputs(session: &Session) -> Option<usize> {
+        // In ort 2.0, tensor_shape() returns Option<&Shape> where Shape contains i64 dims.
+        let shape = session.outputs.first()?.output_type.tensor_shape()?;
+        // `next_back()` yields the last axis; dynamic axes are reported as -1.
+        let last = shape.iter().next_back().copied()?;
+        if last <= 0 {
+            return None;
+        }
+        // `try_from` refuses values that do not fit in `usize` instead of truncating.
+        let dimension = usize::try_from(last).ok()?;
+        debug!("Detected embedding dimension from ONNX output metadata: {}", dimension);
+        Some(dimension)
+    }
+
+    /// Heuristic fallback: guess the embedding dimension from a known model family name found
+    /// anywhere in the lower-cased path. The first table entry that matches wins.
+    fn detect_dimension_from_path(path: &Path) -> Option<usize> {
+        const KNOWN_MODELS: [(&str, usize); 12] = [
+            ("mpnet-base", 768),
+            ("all-mpnet", 768),
+            ("bge-large", 1024),
+            ("bge-base", 768),
+            ("bge-small", 384),
+            ("e5-large", 1024),
+            ("e5-base", 768),
+            ("e5-small", 384),
+            ("gte-large", 1024),
+            ("gte-base", 768),
+            ("gte-small", 384),
+            ("minilm", 384),
+        ];
+        let lowered = path.to_string_lossy().to_lowercase();
+        let &(pattern, dim) = KNOWN_MODELS
+            .iter()
+            .find(|&&(candidate, _)| lowered.contains(candidate))?;
+        debug!("Inferred embedding dimension {} from path pattern '{}'", dim, pattern);
+        Some(dim)
+    }
+
+
     /// Run inference on encoded inputs
     #[instrument(skip_all, fields(batch_size, seq_length))]
     pub fn run(
@@ -236,19 +283,23 @@ impl OnnxModel {
         ))
         .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?;
 
-        let token_type_ids_tensor = Tensor::from_array((
-            vec![batch_size, seq_length],
-            token_type_ids.to_vec().into_boxed_slice(),
-        ))
-        .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?;
-
-        // Build inputs vector
-        let inputs = vec![
+        // Build inputs vector — only include token_type_ids if the model expects it.
+        // Models like DistilBERT and many newer transformers do not use token_type_ids
+        // and will crash if it is provided.
+        let mut inputs = vec![
             ("input_ids", input_ids_tensor.into_dyn()),
             ("attention_mask", attention_mask_tensor.into_dyn()),
-            ("token_type_ids", token_type_ids_tensor.into_dyn()),
         ];
 
+        if self.info.input_names.iter().any(|n| n == "token_type_ids") {
+            let token_type_ids_tensor = Tensor::from_array((
+                vec![batch_size, seq_length],
+                token_type_ids.to_vec().into_boxed_slice(),
+            ))
+            .map_err(|e| EmbeddingError::invalid_model(e.to_string()))?;
+            inputs.push(("token_type_ids", token_type_ids_tensor.into_dyn()));
+        }
+
         // Run inference
         let outputs = self.session.run(inputs)
             .map_err(EmbeddingError::OnnxRuntime)?;
diff --git a/npm/packages/ruvllm/bin/cli.js b/npm/packages/ruvllm/bin/cli.js
index b5d120740..7767c4fac 100755
--- a/npm/packages/ruvllm/bin/cli.js
+++ b/npm/packages/ruvllm/bin/cli.js
@@ -72,6 +72,8 @@ async function runQuery(llm, text, flags) {
 }
 
 async function runGenerate(llm, prompt, flags) {
+  console.error('Warning: Built-in SIMD inference is experimental. For production use, configure an external LLM provider (Ollama, OpenAI, etc.).');
+
   const config = {};
   if (flags.temperature) config.temperature = parseFloat(flags.temperature);
   if (flags['max-tokens']) config.maxTokens = parseInt(flags['max-tokens']);
@@ -106,19 +108,33 @@ async function runMemorySearch(llm, query, flags) {
 }
 
 async function runStats(llm, flags) {
-  const stats = llm.stats();
+  let stats; // set to null below when llm.stats() throws
+  try {
+    stats = llm.stats();
+  } catch (e) {
+    stats = null; // the failure is not rethrown; it is reported as "no data" below
+  }
+
+  if (!stats) { // reached for a thrown stats() call and for any falsy return value
+    if (flags.json) {
+      console.log(formatJson({ error: 'No inference data available' }));
+    } else {
+      console.log('\nNo inference data available. Run some queries first.');
+    }
+    return; // skip rendering of real statistics
+  }
 
   if (flags.json) {
     console.log(formatJson(stats));
   } else {
     console.log('\nRuvLLM Statistics:');
     console.log(formatTable({
-      'Total Queries': stats.totalQueries,
-      'Memory Nodes': stats.memoryNodes,
-      'Patterns Learned': stats.patternsLearned,
-      'Avg Latency': `${stats.avgLatencyMs.toFixed(2)}ms`,
-      'Cache Hit Rate': `${(stats.cacheHitRate * 100).toFixed(1)}%`,
-      'Router Accuracy': `${(stats.routerAccuracy * 100).toFixed(1)}%`,
+      'Total Queries': stats.totalQueries ?? 0, // null/undefined fields fall back to 0
+      'Memory Nodes': stats.memoryNodes ?? 0,
+      'Patterns Learned': stats.patternsLearned ?? 0,
+      'Avg Latency': `${(stats.avgLatencyMs ?? 0).toFixed(2)}ms`, // default keeps toFixed() off undefined
+      'Cache Hit Rate': `${((stats.cacheHitRate ?? 0) * 100).toFixed(1)}%`,
+      'Router Accuracy': `${((stats.routerAccuracy ?? 0) * 100).toFixed(1)}%`,
     }));
   }
 }