From 28b1076736633f141c4edb360ff1eb948e7e6845 Mon Sep 17 00:00:00 2001 From: Tjemmmic Date: Wed, 18 Jun 2025 17:54:47 -0500 Subject: [PATCH 1/3] feat: quality of service docs --- pages/developers/_meta.ts | 1 + pages/developers/blueprint-qos.mdx | 337 +++++++++++++++++++++++++ pages/operators/_meta.ts | 1 + pages/operators/quality-of-service.mdx | 131 ++++++++++ 4 files changed, 470 insertions(+) create mode 100644 pages/developers/blueprint-qos.mdx create mode 100644 pages/operators/quality-of-service.mdx diff --git a/pages/developers/_meta.ts b/pages/developers/_meta.ts index f505ad1..dd6d98c 100644 --- a/pages/developers/_meta.ts +++ b/pages/developers/_meta.ts @@ -14,6 +14,7 @@ const meta: Meta = { "blueprint-sdk": "Introduction", "blueprint-contexts": "Contexts", "blueprint-runner": "Blueprint Runner", + "blueprint-qos": "Quality of Service Integration", "p2p-networking": "P2P Networking", "tangle-avs": "Build a Tangle Blueprint", "eigenlayer-avs": "Build an Eigenlayer AVS", diff --git a/pages/developers/blueprint-qos.mdx b/pages/developers/blueprint-qos.mdx new file mode 100644 index 0000000..de6d110 --- /dev/null +++ b/pages/developers/blueprint-qos.mdx @@ -0,0 +1,337 @@ +--- +title: Quality of Service (QoS) Integration +--- + +# Quality of Service (QoS) Integration Guide + +This guide explains how to integrate and use the Blueprint SDK's Quality of Service (QoS) system to add comprehensive observability, monitoring, and dashboard capabilities to any Blueprint. QoS provides unified metrics collection, log aggregation, heartbeat monitoring, and visualization through a cohesive interface. 
+ +## Prerequisites + +- Understanding of Blueprint concepts and execution model +- Familiarity with Tangle Network architecture +- Basic knowledge of observability concepts (metrics, logging, monitoring) + +## QoS Overview + +The Blueprint QoS system provides a complete observability stack: + +- **Heartbeat Service**: Sends periodic heartbeats to Tangle to prevent slashing +- **Metrics Collection**: Captures system and application metrics +- **Logging**: Aggregates logs via Loki for centralized querying +- **Dashboards**: Creates Grafana visualizations automatically +- **Server Management**: Optionally runs containerized instances of Prometheus, Loki, and Grafana + +The QoS system is designed to be added to any Blueprint type (Tangle, Eigenlayer, P2P, or Cron) as a background service. + +## Integrating QoS into a Blueprint + +The integration process involves setting up the QoS configuration and implementing the HeartbeatConsumer trait. Here's a step-by-step guide. + +### Main Blueprint Setup + +```rust +#[tokio::main] +async fn main() -> Result<(), blueprint_sdk::Error> { + let env = BlueprintEnvironment::load()?; + + // Create your Blueprint's primary context + let context = MyContext::new(env.clone()).await?; + + // Configure QoS system + let qos_config = blueprint_qos::default_qos_config(); + let heartbeat_consumer = Arc::new(MyHeartbeatConsumer::new()); + + // Standard Blueprint runner setup with QoS + BlueprintRunner::builder(TangleConfig::default(), env) + .router(Router::new() + .route(JOB_ID, handler.layer(TangleLayer)) + .with_context(context)) + .producer(producer) + .consumer(consumer) + .qos_service(qos_config, Some(heartbeat_consumer)) + .run() + .await +} +``` + +### Implementing HeartbeatConsumer + +To enable the heartbeat service, you must implement the `HeartbeatConsumer` trait, which is responsible for sending heartbeat signals to the Tangle Network: + +```rust +#[derive(Clone)] +struct MyHeartbeatConsumer { + // Add any required fields for 
heartbeat submission +} + +impl HeartbeatConsumer for MyHeartbeatConsumer { + fn consume_heartbeat( + &self, + service_id: u64, + blueprint_id: u64, + metrics_data: String, + ) -> Result<(), Box> { + // Implement custom heartbeat logic here, specific to blueprint + Ok(()) + } +} +``` + +## QoS Configuration Options + +### Using Default Configuration + +The simplest way to get started is with the default configuration: + +```rust +let qos_config = blueprint_qos::default_qos_config(); +``` + +This initializes a configuration with: +- Heartbeat service (disabled until configured) +- Metrics collection +- Loki logging +- Grafana integration +- Automatic server management set to `false` + +### Custom Configuration + +Customize the configuration for your specific needs: + +```rust +let qos_config = QoSConfig { + heartbeat: Some(HeartbeatConfig { + service_id: Some(42), + blueprint_id: Some(7), + interval_seconds: 60, + jitter_seconds: 5, + }), + metrics: Some(MetricsConfig::default()), + loki: Some(LokiConfig::default()), + grafana: Some(GrafanaConfig { + endpoint: "http://localhost:3000".into(), + admin_user: Some("admin".into()), + admin_password: Some("admin".into()), + folder: None, + }), + grafana_server: Some(GrafanaServerConfig::default()), + loki_server: Some(LokiServerConfig::default()), + prometheus_server: Some(PrometheusServerConfig::default()), + docker_network: Some("blueprint-network".into()), + manage_servers: true, + service_id: Some(42), + blueprint_id: Some(7), + docker_bind_ip: Some("0.0.0.0".into()), +}; +``` + +### Using the Builder Pattern + +The builder pattern provides a fluent API for configuration: + +```rust +let qos_service = QoSServiceBuilder::new() + .with_heartbeat_config(HeartbeatConfig { + service_id: Some(service_id), + blueprint_id: Some(blueprint_id), + interval_seconds: 60, + jitter_seconds: 5, + }) + .with_heartbeat_consumer(Arc::new(consumer)) + .with_metrics_config(MetricsConfig::default()) + 
.with_loki_config(LokiConfig::default()) + .with_grafana_config(GrafanaConfig::default()) + .with_prometheus_server_config(PrometheusServerConfig { + host: "0.0.0.0".into(), + port: 9090, + ..Default::default() + }) + .manage_servers(true) + .with_ws_rpc_endpoint(ws_endpoint) + .with_keystore_uri(keystore_uri) + .build()?; +``` + +## Recording Blueprint Metrics and Events + +### Job Performance Tracking + +Tracking job execution and performance in your job handlers is essential for monitoring and optimization: + +```rust +pub async fn process_job( + Context(ctx): Context, + TangleArg(data): TangleArg, +) -> Result> { + let start_time = std::time::Instant::now(); + + // Process the job + let result = perform_processing(&data)?; + + // Record job execution metrics + if let Some(qos) = &ctx.qos_service { + qos.record_job_execution( + JOB_ID, + start_time.elapsed().as_secs_f64(), + ctx.service_id, + ctx.blueprint_id + ); + } + + Ok(TangleResult::Success(result)) +} +``` + +### Error Tracking + +Tracking job errors is crucial for monitoring and alerts: + +```rust +match perform_complex_operation() { + Ok(value) => Ok(TangleResult::Success(value)), + Err(e) => { + if let Some(qos) = &ctx.qos_service { + qos.record_job_error(JOB_ID, "complex_operation_failure"); + } + Err(e.into()) + } +} +``` + +## Automatic Dashboard Creation + +QoS can automatically create Grafana dashboards that display your Blueprint's metrics: + +```rust +// Create a custom dashboard for your Blueprint +if let Some(mut qos) = qos_service { + if let Err(e) = qos.create_dashboard("My Blueprint") { + error!("Failed to create dashboard: {}", e); + } else { + info!("Created Grafana dashboard for My Blueprint"); + } +} +``` + +The dashboard includes: +- System resource usage (CPU, memory, disk, network) +- Job execution metrics (frequency, duration, error rates) +- Log visualization panels (when Loki is configured) +- Service status and uptime information + +## Accessing QoS in Context + +Typically, 
you'll want to store the QoS service in your Blueprint context: + +```rust +#[derive(Clone)] +pub struct MyContext { + #[config] + pub env: BlueprintEnvironment, + pub data_dir: PathBuf, + pub qos_service: Option>>, + pub service_id: u64, + pub blueprint_id: u64, +} + +impl MyContext { + pub async fn new(env: BlueprintEnvironment) -> Result { + // Initialize QoS service + let qos_service = initialize_qos(&env)?; + + Ok(Self { + data_dir: env.data_dir.clone().unwrap_or_else(default_data_dir), + qos_service: Some(Arc::new(qos_service)), + service_id: 42, + blueprint_id: 7, + env, + }) + } +} +``` + +You can then access the QoS service in your job handlers: + +```rust +pub async fn my_job( + Context(ctx): Context, + TangleArg(data): TangleArg, +) -> Result> { + // Access QoS metrics provider + if let Some(qos) = &ctx.qos_service { + if let Some(provider) = qos.provider() { + let cpu_usage = provider.get_cpu_usage()?; + info!("Current CPU usage: {}%", cpu_usage); + } + } + + // Job implementation + Ok(TangleResult::Success(())) +} +``` + +## Server Management + +QoS can automatically manage Grafana, Prometheus, and Loki servers: + +```rust +// Configure server management +let qos_config = QoSConfig { + grafana_server: Some(GrafanaServerConfig { + port: 3000, + container_name: "blueprint-grafana".into(), + image: "grafana/grafana:latest".into(), + ..Default::default() + }), + loki_server: Some(LokiServerConfig { + port: 3100, + container_name: "blueprint-loki".into(), + image: "grafana/loki:latest".into(), + ..Default::default() + }), + prometheus_server: Some(PrometheusServerConfig { + port: 9090, + container_name: "blueprint-prometheus".into(), + image: "prom/prometheus:latest".into(), + host: "0.0.0.0".into(), + ..Default::default() + }), + docker_network: Some("blueprint-network".into()), + manage_servers: true, + ..Default::default() +}; +``` + +For proper operation with Docker containers, ensure: +1. 
Your application binds metrics endpoints to `0.0.0.0` (not `127.0.0.1`) +2. Prometheus configuration uses `host.docker.internal` to access host metrics +3. Docker is installed and the user has the necessary permissions +4. A common Docker network is used for all containers + +## Best Practices + +✅ DO: +- Initialize QoS early in your Blueprint's startup sequence +- Add QoS as a background service using `BlueprintRunner::background_service()` +- Record job execution metrics for all important jobs +- Use `#[derive(Clone)]` for your `HeartbeatConsumer` implementation +- Access QoS APIs through your Blueprint's context + +❌ DON'T: +- Don't create separate QoS instances for different components +- Avoid using hardcoded admin credentials in production code +- Don't pass the QoS service directly between jobs; use the context pattern +- Don't forget to bind Prometheus metrics server to `0.0.0.0` for Docker accessibility +- Don't ignore QoS shutdown or creation errors; they may indicate more serious issues + +## QoS Components Reference + +| Component | Primary Struct | Config | Purpose | +|-----------|---------------|--------|----------| +| Unified Service | `QoSService` | `QoSConfig` | Main entry point for QoS integration | +| Heartbeat | `HeartbeatService` | `HeartbeatConfig` | Sends periodic liveness signals to chain | +| Metrics | `MetricsService` | `MetricsConfig` | Collects system and application metrics | +| Logging | N/A | `LokiConfig` | Configures log aggregation to Loki | +| Dashboards | `GrafanaClient` | `GrafanaConfig` | Creates and manages Grafana dashboards | +| Server Management | `ServerManager` | Various server configs | Manages Docker containers for observability stack | diff --git a/pages/operators/_meta.ts b/pages/operators/_meta.ts index abbc4ba..feac451 100644 --- a/pages/operators/_meta.ts +++ b/pages/operators/_meta.ts @@ -16,6 +16,7 @@ const meta: Meta = { operator: "Running an operator", pricing: "Pricing", benchmarking: "Blueprint Benchmarking", 
+ "quality-of-service": "Quality of Service", "-- Eigenlayer AVS Operators": { type: "separator", title: "Eigenlayer AVS Operators", diff --git a/pages/operators/quality-of-service.mdx b/pages/operators/quality-of-service.mdx new file mode 100644 index 0000000..1e79f55 --- /dev/null +++ b/pages/operators/quality-of-service.mdx @@ -0,0 +1,131 @@ +--- +title: Quality of Service Monitoring +--- + +# Quality of Service Monitoring + +The Quality of Service (QoS) system provides you with comprehensive visibility into your running blueprints. This guide explains how to access and interpret the QoS dashboards and metrics provided by the operators running your blueprints. + +## What is the QoS System? + +The Quality of Service (QoS) system in Tangle Network provides a complete observability stack that gives you access to optional insights into your running blueprints: + +- Real-time monitoring of blueprint health and performance +- Centralized logs for troubleshooting and audit trails +- Heartbeat monitoring to verify continuous operation +- Visualization dashboards for all key metrics + +The information exposed by the QoS service is optional and varies from blueprint to blueprint, so check the documentation of a given blueprint for specifics. + +## Accessing QoS Dashboards + +When a blueprint is running for you, the operator provides access to QoS dashboards through Grafana. Here's how to access them: + +1. In your blueprint execution details, locate the operator's QoS endpoint (typically provided after blueprint execution begins) +2. Navigate to the Grafana URL (default: `http://[operator-endpoint]:3000`) - while the port defaults to 3000, it may be different and specified by the operator running it. +3. Log in using the credentials provided by the operator (typically admin/admin for basic setups) - this may also differ from blueprint to blueprint. +4. 
Once logged in, navigate to the "Dashboards" section in the left sidebar +5. Look for a dashboard with a name that corresponds to the ID of your blueprint + +## What You Can Monitor + +The QoS dashboards provide comprehensive visibility into your blueprint's operation: + +### 1. System Performance + +The system metrics panels show how the blueprint is using resources. Example metrics include: + +- **CPU Usage**: Real-time CPU utilization by your blueprint +- **Memory Consumption**: RAM usage over time +- **Disk I/O**: Storage activity for data-intensive operations +- **Network Traffic**: Inbound/outbound network traffic + +These metrics help you understand if your blueprint has adequate resources and is performing efficiently. + +### 2. Blueprint-specific Metrics + +These panels show you how your specific blueprint is performing: + +- **Job Execution Frequency**: How often jobs are being executed +- **Job Duration Statistics**: How long jobs are taking to complete +- **Error Rates**: Percentage of jobs failing or experiencing errors +- **Resource Utilization**: How efficiently resources are being used + +Any given blueprint may also have additional information that is specific to that blueprint and the jobs it runs. + +### 3. Heartbeat Monitoring + +The heartbeat section shows you the operational status of your blueprint: + +- **Last Heartbeat Timestamp**: When the most recent heartbeat was recorded +- **Heartbeat Success Rate**: Percentage of successful heartbeats +- **Chain Confirmation Status**: Verification that heartbeats are being recorded on-chain + +These heartbeats are what allow an operator to be penalized (slashed) when the operator fails to run the blueprint as required. + +### 4. 
Log Visualization with Loki + +Centralized logs provide detailed insights into blueprint operation: + +- **Error Logs**: Any errors or warnings generated by your blueprint +- **Information Logs**: Standard operational logs from your blueprint +- **System Logs**: Underlying system events that may affect your blueprint + +## Interpreting QoS Data + +### Key Performance Indicators + +When monitoring your blueprints, pay attention to these important indicators: + +1. **Job Success Rate**: Should be close to 100% under normal conditions +2. **Response Time**: How quickly jobs are being completed +3. **Resource Efficiency**: Is your blueprint using resources as expected? +4. **Heartbeat Regularity**: Heartbeats should occur at consistent intervals + +### Warning Signs to Watch For + +These patterns may indicate issues with your blueprint: + +- **Increasing Error Rates**: May indicate logic problems or resource constraints +- **Growing Response Times**: Could suggest performance degradation +- **Missing Heartbeats**: May indicate blueprint instability or network issues +- **Unexpected Resource Spikes**: Could indicate inefficient operations or potential attacks + +## Troubleshooting Using QoS Data + +When you encounter issues with your blueprints, the QoS dashboard provides valuable diagnostics: + +### For Failed Jobs +1. Check the logs panel for specific error messages +2. Look at resource usage at the time of failure +3. Examine any pattern in failures (time of day, specific job types) + +### For Performance Issues +1. Monitor CPU and memory usage during slow periods +2. Look for concurrent operations that may cause contention +3. Check network traffic for potential bottlenecks + +### For Stability Problems +1. Review the heartbeat history for gaps or irregularities +2. Examine system logs around times of instability +3. 
Check for correlations between resource exhaustion and failures + +## Frequently Asked Questions + +**Q: How do I access QoS dashboards if the URL wasn't provided?** +A: The endpoint of your operator is available on-chain, and you can access the QoS dashboards by following the instructions in the [Accessing QoS Dashboards](#accessing-qos-dashboards) section. + +**Q: Can I export QoS metrics for my own analysis?** +A: Yes, most Grafana dashboards allow data export in various formats (CSV, JSON). + +**Q: How long is QoS data retained?** +A: QoS data is retained only for the duration of the service, unless otherwise stated by the operator or the blueprint. + +## Related Information + +To learn more about operating with Tangle Network blueprints, you may want to review: + +- [Blueprint Benchmarking](/operators/benchmarking) +- [Pricing Strategies](/operators/pricing) + +Understanding how to interpret QoS metrics helps you gain insights into blueprint performance and troubleshoot issues effectively. 
From bd34fa4770eee2487fbfd32584cfcc0e6c45145c Mon Sep 17 00:00:00 2001 From: Tjemmmic Date: Wed, 18 Jun 2025 17:55:18 -0500 Subject: [PATCH 2/3] chore: prettier --- pages/developers/blueprint-qos.mdx | 37 +++++++++++++++----------- pages/operators/quality-of-service.mdx | 3 +++ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pages/developers/blueprint-qos.mdx b/pages/developers/blueprint-qos.mdx index de6d110..bd0541d 100644 --- a/pages/developers/blueprint-qos.mdx +++ b/pages/developers/blueprint-qos.mdx @@ -34,14 +34,14 @@ The integration process involves setting up the QoS configuration and implementi #[tokio::main] async fn main() -> Result<(), blueprint_sdk::Error> { let env = BlueprintEnvironment::load()?; - + // Create your Blueprint's primary context let context = MyContext::new(env.clone()).await?; - + // Configure QoS system let qos_config = blueprint_qos::default_qos_config(); let heartbeat_consumer = Arc::new(MyHeartbeatConsumer::new()); - + // Standard Blueprint runner setup with QoS BlueprintRunner::builder(TangleConfig::default(), env) .router(Router::new() @@ -89,6 +89,7 @@ let qos_config = blueprint_qos::default_qos_config(); ``` This initializes a configuration with: + - Heartbeat service (disabled until configured) - Metrics collection - Loki logging @@ -165,10 +166,10 @@ pub async fn process_job( TangleArg(data): TangleArg, ) -> Result> { let start_time = std::time::Instant::now(); - + // Process the job let result = perform_processing(&data)?; - + // Record job execution metrics if let Some(qos) = &ctx.qos_service { qos.record_job_execution( @@ -178,7 +179,7 @@ pub async fn process_job( ctx.blueprint_id ); } - + Ok(TangleResult::Success(result)) } ``` @@ -215,6 +216,7 @@ if let Some(mut qos) = qos_service { ``` The dashboard includes: + - System resource usage (CPU, memory, disk, network) - Job execution metrics (frequency, duration, error rates) - Log visualization panels (when Loki is configured) @@ -239,7 +241,7 @@ impl 
MyContext { pub async fn new(env: BlueprintEnvironment) -> Result { // Initialize QoS service let qos_service = initialize_qos(&env)?; - + Ok(Self { data_dir: env.data_dir.clone().unwrap_or_else(default_data_dir), qos_service: Some(Arc::new(qos_service)), @@ -265,7 +267,7 @@ pub async fn my_job( info!("Current CPU usage: {}%", cpu_usage); } } - + // Job implementation Ok(TangleResult::Success(())) } @@ -304,6 +306,7 @@ let qos_config = QoSConfig { ``` For proper operation with Docker containers, ensure: + 1. Your application binds metrics endpoints to `0.0.0.0` (not `127.0.0.1`) 2. Prometheus configuration uses `host.docker.internal` to access host metrics 3. Docker is installed and the user has the necessary permissions @@ -312,6 +315,7 @@ For proper operation with Docker containers, ensure: ## Best Practices ✅ DO: + - Initialize QoS early in your Blueprint's startup sequence - Add QoS as a background service using `BlueprintRunner::background_service()` - Record job execution metrics for all important jobs @@ -319,6 +323,7 @@ For proper operation with Docker containers, ensure: - Access QoS APIs through your Blueprint's context ❌ DON'T: + - Don't create separate QoS instances for different components - Avoid using hardcoded admin credentials in production code - Don't pass the QoS service directly between jobs; use the context pattern @@ -327,11 +332,11 @@ For proper operation with Docker containers, ensure: ## QoS Components Reference -| Component | Primary Struct | Config | Purpose | -|-----------|---------------|--------|----------| -| Unified Service | `QoSService` | `QoSConfig` | Main entry point for QoS integration | -| Heartbeat | `HeartbeatService` | `HeartbeatConfig` | Sends periodic liveness signals to chain | -| Metrics | `MetricsService` | `MetricsConfig` | Collects system and application metrics | -| Logging | N/A | `LokiConfig` | Configures log aggregation to Loki | -| Dashboards | `GrafanaClient` | `GrafanaConfig` | Creates and manages Grafana 
dashboards | -| Server Management | `ServerManager` | Various server configs | Manages Docker containers for observability stack | +| Component | Primary Struct | Config | Purpose | +| ----------------- | ------------------ | ---------------------- | ------------------------------------------------- | +| Unified Service | `QoSService` | `QoSConfig` | Main entry point for QoS integration | +| Heartbeat | `HeartbeatService` | `HeartbeatConfig` | Sends periodic liveness signals to chain | +| Metrics | `MetricsService` | `MetricsConfig` | Collects system and application metrics | +| Logging | N/A | `LokiConfig` | Configures log aggregation to Loki | +| Dashboards | `GrafanaClient` | `GrafanaConfig` | Creates and manages Grafana dashboards | +| Server Management | `ServerManager` | Various server configs | Manages Docker containers for observability stack | diff --git a/pages/operators/quality-of-service.mdx b/pages/operators/quality-of-service.mdx index 1e79f55..1ba3880 100644 --- a/pages/operators/quality-of-service.mdx +++ b/pages/operators/quality-of-service.mdx @@ -96,16 +96,19 @@ These patterns may indicate issues with your blueprint: When you encounter issues with your blueprints, the QoS dashboard provides valuable diagnostics: ### For Failed Jobs + 1. Check the logs panel for specific error messages 2. Look at resource usage at the time of failure 3. Examine any pattern in failures (time of day, specific job types) ### For Performance Issues + 1. Monitor CPU and memory usage during slow periods 2. Look for concurrent operations that may cause contention 3. Check network traffic for potential bottlenecks ### For Stability Problems + 1. Review the heartbeat history for gaps or irregularities 2. Examine system logs around times of instability 3. 
Check for correlations between resource exhaustion and failures From 1cd727f0f3b2651498025a7254b76c142fb3ea54 Mon Sep 17 00:00:00 2001 From: Tjemmmic Date: Thu, 19 Jun 2025 21:48:16 -0500 Subject: [PATCH 3/3] fix: broken link --- pages/network/governance/overview.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/network/governance/overview.mdx b/pages/network/governance/overview.mdx index e1ecf91..c227489 100644 --- a/pages/network/governance/overview.mdx +++ b/pages/network/governance/overview.mdx @@ -22,7 +22,7 @@ The governance system of Tangle Network is divided into two parts, the public re Proposals can be made by any token holder. Others can agree with the proposal by seconding it and providing tokens equivalent to the original bond. The most seconded proposal during every launch period is moved to the public referenda table for active voting. Voters can lock their tokens for a longer duration to amplify their vote. -Detailed information on the governance system can be found [here](https://wiki.polkadot.network/learn/archive/learn-governance). +Detailed information on the governance system can be found [here](https://wiki.polkadot.network/general/governance-apps/). ## Important Parameters for Democracy Module