diff --git a/ldk-server-protos/src/endpoints.rs b/ldk-server-protos/src/endpoints.rs index 52cd4a5..9f8a740 100644 --- a/ldk-server-protos/src/endpoints.rs +++ b/ldk-server-protos/src/endpoints.rs @@ -30,3 +30,4 @@ pub const SPONTANEOUS_SEND_PATH: &str = "SpontaneousSend"; pub const SIGN_MESSAGE_PATH: &str = "SignMessage"; pub const VERIFY_SIGNATURE_PATH: &str = "VerifySignature"; pub const EXPORT_PATHFINDING_SCORES_PATH: &str = "ExportPathfindingScores"; +pub const GET_METRICS_PATH: &str = "metrics"; diff --git a/ldk-server/src/main.rs b/ldk-server/src/main.rs index 9ad2e8d..3327b3b 100644 --- a/ldk-server/src/main.rs +++ b/ldk-server/src/main.rs @@ -50,6 +50,7 @@ use crate::io::persist::{ use crate::service::NodeService; use crate::util::config::{load_config, ArgsConfig, ChainSource}; use crate::util::logger::ServerLogger; +use crate::util::metrics::{Metrics, BUILD_METRICS_INTERVAL}; use crate::util::proto_adapter::{forwarded_payment_to_proto, payment_to_proto}; use crate::util::tls::get_or_generate_tls_config; @@ -256,6 +257,19 @@ fn main() { } }; let event_node = Arc::clone(&node); + + let metrics_node = Arc::clone(&node); + let mut interval = tokio::time::interval(BUILD_METRICS_INTERVAL); + let metrics = Arc::new(Metrics::new()); + let metrics_bg = Arc::clone(&metrics); + + runtime.spawn(async move { + loop { + interval.tick().await; + metrics_bg.update_service_health_score(&metrics_node); + } + }); + let rest_svc_listener = TcpListener::bind(config_file.rest_service_addr) .await .expect("Failed to bind listening port"); @@ -415,7 +429,7 @@ fn main() { res = rest_svc_listener.accept() => { match res { Ok((stream, _)) => { - let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone()); + let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone(), Arc::clone(&metrics)); let acceptor = tls_acceptor.clone(); runtime.spawn(async move { match acceptor.accept(stream).await { diff --git a/ldk-server/src/service.rs b/ldk-server/src/service.rs index 3a18876..b7d4051 100644 --- a/ldk-server/src/service.rs +++ b/ldk-server/src/service.rs @@ -21,10 +21,10 @@ use ldk_node::Node; use ldk_server_protos::endpoints::{ BOLT11_RECEIVE_PATH, BOLT11_SEND_PATH, BOLT12_RECEIVE_PATH, BOLT12_SEND_PATH, CLOSE_CHANNEL_PATH, CONNECT_PEER_PATH, EXPORT_PATHFINDING_SCORES_PATH, - FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH, - LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH, - ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH, SPLICE_OUT_PATH, - SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH, + FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_METRICS_PATH, GET_NODE_INFO_PATH, + GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, + ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH, + SPLICE_OUT_PATH, SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH, }; use prost::Message; @@ -52,6 +52,7 @@ use crate::api::spontaneous_send::handle_spontaneous_send_request; use crate::api::update_channel_config::handle_update_channel_config_request; use crate::api::verify_signature::handle_verify_signature_request; use crate::io::persist::paginated_kv_store::PaginatedKVStore; +use crate::util::metrics::Metrics; use crate::util::proto_adapter::to_error_response; // Maximum request body size: 10 MB @@ -63,13 +64,15 @@ pub struct NodeService { node: Arc, paginated_kv_store: Arc, api_key: String, + metrics: Arc, } impl NodeService { pub(crate) fn new( node: Arc, paginated_kv_store: Arc, api_key: String, + metrics: Arc, ) -> Self { - Self { node, paginated_kv_store, api_key } + Self { node, paginated_kv_store, api_key, metrics } } } @@ -153,6 +156,17 @@ impl Service> for NodeService { type Future = Pin> + Send>>; fn call(&self, req: Request) -> Self::Future { + // Handle metrics endpoint separately to bypass auth and return plain text + if req.uri().path().len() > 1 && &req.uri().path()[1..] == GET_METRICS_PATH { + let metrics = Arc::clone(&self.metrics); + return Box::pin(async move { + Ok(Response::builder() + .header("Content-Type", "text/plain") + .body(Full::new(Bytes::from(metrics.gather_metrics()))) + .unwrap()) + }); + } + // Extract auth params from headers (validation happens after body is read) let auth_params = match extract_auth_params(&req) { Ok(params) => params, diff --git a/ldk-server/src/util/metrics.rs b/ldk-server/src/util/metrics.rs new file mode 100644 index 0000000..1533bc8 --- /dev/null +++ b/ldk-server/src/util/metrics.rs @@ -0,0 +1,172 @@ +// This file is Copyright its original authors, visible in version control +// history. +// +// This file is licensed under the Apache License, Version 2.0 or the MIT license +// , at your option. +// You may not use this file except in accordance with one or both of these +// licenses. + +use std::sync::atomic::{AtomicI64, Ordering}; +use std::time::Duration; + +use ldk_node::Node; + +pub const BUILD_METRICS_INTERVAL: Duration = Duration::from_secs(60); + +/// This represents a [`Metrics`] type that can go up and down in value. +pub struct IntGauge { + inner: AtomicI64, +} + +impl IntGauge { + pub fn new() -> Self { + Self { inner: AtomicI64::new(0) } + } + + pub fn set(&self, value: i64) { + self.inner.store(value, Ordering::Relaxed); + } + + pub fn get(&self) -> i64 { + self.inner.load(Ordering::Relaxed) + } +} + +/// Represents the [`Metrics`] output values and type. +pub struct MetricsOutput { + name: String, + help_text: String, + metric_type: String, + value: String, +} + +impl MetricsOutput { + pub fn new(name: &str, help_text: &str, metric_type: &str, value: &str) -> Self { + Self { + name: name.to_string(), + help_text: help_text.to_string(), + metric_type: metric_type.to_string(), + value: value.to_string(), + } + } +} + +pub struct Metrics { + pub service_health_score: IntGauge, +} + +impl Metrics { + pub fn new() -> Self { + Self { service_health_score: IntGauge::new() } + } + + pub fn update_service_health_score(&self, node: &Node) { + let score = self.calculate_ldk_server_health_score(node); + self.service_health_score.set(score); + } + + /// The health score computation is pretty basic for now and simply + /// calculated based on the impacted events on the components of the + /// `Node`. The events severity and weightage value are as follows: + /// + /// - Critical: 0 (Total failure) + /// - Major: 35% + /// - Minor: 25% + /// + /// Using the assigned score above, the health score of the `Node` is + /// computed as: + /// + /// Health score = Maximum health score - Sum(Event severity score) + /// + /// Where: + /// + /// - Maximum health score = 100 + /// + /// If the `Node` is not running/online, i.e `is_running` is false, + /// the severity is critical with a weightage value of -100%. + /// + /// If the `Node` is running but isn't connected to any peer yet, + /// the severity is major with a weightage value of -35%. + /// + /// If the `Node` is running but the Lightning Wallet hasn't been synced + /// yet, the severity is minor with a weightage value of -25%. + pub fn calculate_ldk_server_health_score(&self, node: &Node) -> i64 { + Self::compute_health_score( + node.status().is_running, + !node.list_peers().is_empty(), + node.status().latest_lightning_wallet_sync_timestamp.is_some(), + ) + } + + pub fn format_metrics_output(&self, buffer: &mut String, options: &MetricsOutput) { + buffer.push_str(&format!("# HELP {} {}\n", options.name, options.help_text)); + buffer.push_str(&format!("# TYPE {} {}\n", options.name, options.metric_type)); + buffer.push_str(&format!("{} {}\n", options.name, options.value)); + } + + pub fn gather_metrics(&self) -> String { + let mut buffer = String::new(); + let options = &MetricsOutput::new( + "ldk_server_health_score", + "Current health score (0-100)", + "gauge", + &self.service_health_score.get().to_string(), + ); + + self.format_metrics_output(&mut buffer, options); + + buffer + } + + fn compute_health_score(is_running: bool, has_peers: bool, is_wallet_synced: bool) -> i64 { + if !is_running { + return 0; + } + + let mut health_score = 100; + + if !has_peers { + health_score -= 35; + } + + if !is_wallet_synced { + health_score -= 25; + } + + health_score + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_compute_health_score() { + // Node is not running + assert_eq!(Metrics::compute_health_score(false, true, true), 0); + assert_eq!(Metrics::compute_health_score(false, false, false), 0); + + // Node is running, connected to a peer and wallet is synced + assert_eq!(Metrics::compute_health_score(true, true, true), 100); + + // Node is running, not connected to a peer but wallet is synced + assert_eq!(Metrics::compute_health_score(true, false, true), 65); + + // Node is running, connected to a peer but wallet is not synced + assert_eq!(Metrics::compute_health_score(true, true, false), 75); + + // Node is running, not connected to a peer and wallet is not synced + assert_eq!(Metrics::compute_health_score(true, false, false), 40); + } + + #[test] + fn test_gather_metrics_format() { + let metrics = Metrics::new(); + + let result = metrics.gather_metrics(); + assert!(result.contains("ldk_server_health_score")); + } +} diff --git a/ldk-server/src/util/mod.rs b/ldk-server/src/util/mod.rs index 3662b12..1d22bb9 100644 --- a/ldk-server/src/util/mod.rs +++ b/ldk-server/src/util/mod.rs @@ -9,5 +9,6 @@ pub(crate) mod config; pub(crate) mod logger; +pub(crate) mod metrics; pub(crate) mod proto_adapter; pub(crate) mod tls;