From b681e4717842db00ded7e07b202f55c623308e99 Mon Sep 17 00:00:00 2001 From: Louis Date: Wed, 18 Jun 2025 19:23:26 +0200 Subject: [PATCH 1/3] Fix latency graph, memory eviction bug --- .../provisioning/dashboards/dashboard.json | 20 ++++---- src/cache/cache_impl.rs | 46 ++++++++++++++----- src/main.rs | 4 +- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/monitoring/grafana/provisioning/dashboards/dashboard.json b/monitoring/grafana/provisioning/dashboards/dashboard.json index 6cdc575..cda41db 100644 --- a/monitoring/grafana/provisioning/dashboards/dashboard.json +++ b/monitoring/grafana/provisioning/dashboards/dashboard.json @@ -85,7 +85,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "editorMode": "code", @@ -149,7 +149,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -219,7 +219,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "editorMode": "code", @@ -324,7 +324,7 @@ "sort": "none" } }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "editorMode": "code", @@ -373,7 +373,7 @@ } ] }, - "unit": "ms" + "unit": "s" }, "overrides": [] }, @@ -401,7 +401,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "editorMode": "code", @@ -508,7 +508,7 @@ "sort": "none" } }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -634,7 +634,7 @@ "sort": "none" } }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -733,7 +733,7 @@ "sort": "none" } }, - "pluginVersion": "12.0.1", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -772,5 +772,5 @@ "timezone": "", "title": "Semcache", "uid": "chat-api-dashboard", - "version": 2 + "version": 1 } \ No newline at end of file diff --git a/src/cache/cache_impl.rs b/src/cache/cache_impl.rs index 4e42f3e..699c5cc 100644 --- a/src/cache/cache_impl.rs +++ b/src/cache/cache_impl.rs @@ -53,15 +53,23 @@ where fn is_full(&self) -> bool { match &self.eviction_policy { - EvictionPolicy::EntryLimit(limit) => self.response_store.len() >= *limit, + EvictionPolicy::EntryLimit(limit) => { + debug!( + "Cache size: {}, limit: {}", + self.response_store.len(), + limit + ); + self.response_store.len() >= *limit + } EvictionPolicy::MemoryLimitMb(limit) => { let response_store_memory_used_mb = - self.response_store.memory_usage_bytes() as f64 / 1024.0; + self.response_store.memory_usage_bytes() as f64 / (1024.0 * 1024.0); let semantic_store_memory_used_mb = - self.semantic_store.memory_usage_bytes() as f64 / 1024.0; + self.semantic_store.memory_usage_bytes() as f64 / (1024.0 * 1024.0); let total_memory_used_mb = response_store_memory_used_mb + semantic_store_memory_used_mb; let limit_mb = *limit as f64; + debug!("Cache size: {}, limit: {}", total_memory_used_mb, limit_mb); total_memory_used_mb >= limit_mb } } @@ -304,36 +312,50 @@ mod tests { #[test] fn insert_should_evict_when_memory_limit_reached() { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + let embedding = vec![0.1, 0.2, 0.3]; - let response = "A".repeat(100 * 1024).into_bytes(); + let response = "A".repeat(400 * 1024).into_bytes(); // 400KB + + // Track number of entries in semantic store for realistic memory reporting + let entry_count = Arc::new(AtomicUsize::new(0)); + let entry_count_clone = Arc::clone(&entry_count); + let entry_count_clone2 = Arc::clone(&entry_count); // given let mut mock_store = MockSemanticStore::new(); - mock_store.expect_put().times(3).returning(|_, _| Ok(())); - mock_store.expect_delete().times(2).returning(|_| Ok(())); + mock_store.expect_put().times(3).returning(move |_, _| { + entry_count_clone.fetch_add(1, Ordering::Relaxed); + Ok(()) + }); + mock_store.expect_delete().times(2).returning(move |_| { + entry_count_clone2.fetch_sub(1, Ordering::Relaxed); + Ok(()) + }); mock_store .expect_memory_usage_bytes() - .returning(|| 100 * 1024); + .returning(move || entry_count.load(Ordering::Relaxed) * 400 * 1024); // 400KB per entry let response_store = ResponseStore::new(); + // Set limit to 1MB - each entry uses ~0.8MB (400KB response + 400KB semantic) let cache = CacheImpl::new( Box::new(mock_store), response_store, 0.9, - EvictionPolicy::MemoryLimitMb(300), + EvictionPolicy::MemoryLimitMb(1), ); // when - add first entry cache.insert(embedding.clone(), response.clone()).unwrap(); - assert!(!cache.is_full()); // should have ~200 megabytes (100 string + overhead (32 bytes) + 100 semantic) + assert!(!cache.is_full()); // should have ~0.8MB which is under 1MB limit - // when - add second entry, this triggers eviction because memory exceeds limit of 300 (200 - // string + overhead (2 * 32 bytes) + 100 semantic) + // when - add second entry, this should trigger eviction because 2 entries would be ~1.6MB cache.insert(embedding.clone(), response.clone()).unwrap(); assert_eq!(cache.response_store.len(), 1); // evicted back to 1 - assert!(!cache.is_full()); + assert!(!cache.is_full()); // single entry is under limit // when - add third entry, again triggers eviction cache.insert(embedding.clone(), response.clone()).unwrap(); diff --git a/src/main.rs b/src/main.rs index dd0d329..d80fdcb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,7 +24,7 @@ use providers::ProviderType; use std::sync::Arc; use tokio::signal; use tower_http::services::ServeDir; -use tracing::{error, info}; +use tracing::{debug, error, info}; use tracing_subscriber::EnvFilter; const CONFIG_FILE: &str = "config.yaml"; @@ -51,6 +51,8 @@ async fn main() { panic!("Missing or malformed eviction policy in config") }); + info!("Eviction policy {:?}", eviction_policy); + let shared_state = Arc::new(AppState::new(similarity_threshold, eviction_policy)); // read through cache (proxy) routes From b73308c1d995a9bf8ac43ce3dddd341faba78b6f Mon Sep 17 00:00:00 2001 From: Louis Date: Wed, 18 Jun 2025 19:24:08 +0200 Subject: [PATCH 2/3] Import optimize --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index d80fdcb..708e6ba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,15 +16,15 @@ use crate::endpoints::metrics::handler::prometheus_metrics_handler; use crate::metrics::metrics::{init_metrics, track_metrics}; use crate::providers::OPEN_AI_REST_PATH; use app_state::AppState; -use axum::Router; use axum::http::StatusCode; use axum::routing::{get, post, put}; +use axum::Router; use config::{get_log_level, get_port, get_similarity_threshold}; use providers::ProviderType; use std::sync::Arc; use tokio::signal; use tower_http::services::ServeDir; -use tracing::{debug, error, info}; +use tracing::{error, info}; use tracing_subscriber::EnvFilter; const CONFIG_FILE: &str = "config.yaml"; From eb011ee25dcc155fd61c25833530ab5de9193a95 Mon Sep 17 00:00:00 2001 From: Louis Date: Wed, 18 Jun 2025 19:28:02 +0200 Subject: [PATCH 3/3] Fix cargo fmt --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 708e6ba..950e70d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,9 +16,9 @@ use crate::endpoints::metrics::handler::prometheus_metrics_handler; use crate::metrics::metrics::{init_metrics, track_metrics}; use crate::providers::OPEN_AI_REST_PATH; use app_state::AppState; +use axum::Router; use axum::http::StatusCode; use axum::routing::{get, post, put}; -use axum::Router; use config::{get_log_level, get_port, get_similarity_threshold}; use providers::ProviderType; use std::sync::Arc;