Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,6 @@ tmp/

# Output directory
out/
data/
logs/
storage/
30 changes: 30 additions & 0 deletions cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package cmd
import (
"context"
"fmt"
"time"

"github.com/keep-network/keep-core/pkg/tbtcpg"

"github.com/keep-network/keep-common/pkg/persistence"
Expand Down Expand Up @@ -87,6 +89,23 @@ func start(cmd *cobra.Command) error {
blockCounter,
)

// Wire performance metrics into network provider if available
var perfMetrics *clientinfo.PerformanceMetrics
if clientInfoRegistry != nil {
perfMetrics = clientinfo.NewPerformanceMetrics(clientInfoRegistry)
// Type assert to libp2p provider to set metrics recorder
// The provider struct is not exported, so we use interface assertion
if setter, ok := netProvider.(interface {
SetMetricsRecorder(recorder interface {
IncrementCounter(name string, value float64)
SetGauge(name string, value float64)
RecordDuration(name string, duration time.Duration)
})
}); ok {
setter.SetMetricsRecorder(perfMetrics)
}
}

// Initialize beacon and tbtc only for non-bootstrap nodes.
// Skip initialization for bootstrap nodes as they are only used for network
// discovery.
Expand All @@ -113,6 +132,16 @@ func start(cmd *cobra.Command) error {

clientInfoRegistry.RegisterBtcChainInfoSource(btcChain)

if clientInfoRegistry != nil {
rpcHealthChecker := clientinfo.NewRPCHealthChecker(
clientInfoRegistry,
blockCounter,
btcChain,
clientConfig.ClientInfo.RPCHealthCheckInterval,
)
rpcHealthChecker.Start(ctx)
}

err = beacon.Initialize(
ctx,
beaconChain,
Expand Down Expand Up @@ -140,6 +169,7 @@ func start(cmd *cobra.Command) error {
proposalGenerator,
clientConfig.Tbtc,
clientInfoRegistry,
perfMetrics, // Pass the existing performance metrics instance to avoid duplicate registrations
)
if err != nil {
return fmt.Errorf("error initializing TBTC: [%v]", err)
Expand Down
238 changes: 238 additions & 0 deletions docs/performance-metrics.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
= Performance Metrics

The Keep Core client exposes performance metrics that can be used to monitor
the health and performance of node operations. These metrics are available
through the `/metrics` endpoint when the client info endpoint is configured.

== Metrics Endpoint

Metrics are exposed via HTTP at the `/metrics` endpoint on the port configured
in the `ClientInfo` section of the configuration file (default: `9601`).

Example:
----
curl http://localhost:9601/metrics
----

== Metric Types

The client uses three types of metrics:

* **Counters**: Cumulative counts that only increase (e.g., total operations)
* **Gauges**: Current values that can go up or down (e.g., queue sizes, active operations)
* **Durations**: Time measurements for operations (exposed as gauges: the average duration in seconds and, where listed, a `_count` of completed operations)

== Available Metrics

=== Distributed Key Generation (DKG) Metrics

==== `performance_dkg_joined_total`
*Type*: Counter +
*Description*: Total number of times the node has joined a DKG process +
*Labels*: None

==== `performance_dkg_failed_total`
*Type*: Counter +
*Description*: Total number of failed DKG attempts +
*Labels*: None

==== `performance_dkg_duration_seconds`
*Type*: Gauge (average) +
*Description*: Average duration of DKG operations in seconds +
*Labels*: None

==== `performance_dkg_duration_seconds_count`
*Type*: Gauge +
*Description*: Total number of DKG operations completed +
*Labels*: None

==== `performance_dkg_validation_total`
*Type*: Counter +
*Description*: Total number of DKG result validations performed +
*Labels*: None

==== `performance_dkg_challenges_submitted_total`
*Type*: Counter +
*Description*: Total number of DKG challenges submitted on-chain +
*Labels*: None

==== `performance_dkg_approvals_submitted_total`
*Type*: Counter +
*Description*: Total number of DKG approvals submitted on-chain +
*Labels*: None

=== Signing Operation Metrics

==== `performance_signing_operations_total`
*Type*: Counter +
*Description*: Total number of signing operations attempted +
*Labels*: None

==== `performance_signing_success_total`
*Type*: Counter +
*Description*: Total number of successful signing operations +
*Labels*: None

==== `performance_signing_failed_total`
*Type*: Counter +
*Description*: Total number of failed signing operations +
*Labels*: None

==== `performance_signing_duration_seconds`
*Type*: Gauge (average) +
*Description*: Average duration of signing operations in seconds +
*Labels*: None

==== `performance_signing_duration_seconds_count`
*Type*: Gauge +
*Description*: Total number of signing operations completed +
*Labels*: None

==== `performance_signing_timeouts_total`
*Type*: Counter +
*Description*: Total number of signing operations that timed out +
*Labels*: None

=== Wallet Action Metrics

==== `performance_wallet_actions_total`
*Type*: Counter +
*Description*: Total number of wallet actions dispatched +
*Labels*: None

==== `performance_wallet_action_success_total`
*Type*: Counter +
*Description*: Total number of successfully completed wallet actions +
*Labels*: None

==== `performance_wallet_action_failed_total`
*Type*: Counter +
*Description*: Total number of failed wallet actions +
*Labels*: None

==== `performance_wallet_action_duration_seconds`
*Type*: Gauge (average) +
*Description*: Average duration of wallet actions in seconds +
*Labels*: None

==== `performance_wallet_action_duration_seconds_count`
*Type*: Gauge +
*Description*: Total number of wallet actions completed +
*Labels*: None

==== `performance_wallet_heartbeat_failures_total`
*Type*: Counter +
*Description*: Total number of heartbeat failures across all wallets +
*Labels*: None

=== Wallet Dispatcher Metrics

==== `performance_wallet_dispatcher_active_actions`
*Type*: Gauge +
*Description*: Current number of wallets with active actions being executed +
*Labels*: None +
*Note*: This metric helps identify when wallets are busy and cannot accept new actions

==== `performance_wallet_dispatcher_rejected_total`
*Type*: Counter +
*Description*: Total number of wallet actions rejected because the wallet was busy +
*Labels*: None +
*Note*: High values indicate that wallets are frequently busy and actions may need retry logic

=== Coordination Metrics

==== `performance_coordination_windows_detected_total`
*Type*: Counter +
*Description*: Total number of coordination windows detected +
*Labels*: None

==== `performance_coordination_procedures_executed_total`
*Type*: Counter +
*Description*: Total number of coordination procedures executed +
*Labels*: None

==== `performance_coordination_failed_total`
*Type*: Counter +
*Description*: Total number of failed coordination procedures +
*Labels*: None

==== `performance_coordination_duration_seconds`
*Type*: Gauge (average) +
*Description*: Average duration of coordination procedures in seconds +
*Labels*: None

=== Network Metrics

==== `performance_incoming_message_queue_size`
*Type*: Gauge +
*Description*: Current size of the incoming message queue +
*Labels*: `channel` (channel name) +
*Note*: Maximum queue size is 4096. Values approaching this limit indicate message processing bottlenecks.

==== `performance_message_handler_queue_size`
*Type*: Gauge +
*Description*: Current size of message handler queues +
*Labels*: `channel` (channel name), `handler` (handler ID) +
*Note*: Maximum queue size per handler is 512.

==== `performance_peer_connections_total`
*Type*: Counter +
*Description*: Total number of peer connections established +
*Labels*: None

==== `performance_peer_disconnections_total`
*Type*: Counter +
*Description*: Total number of peer disconnections +
*Labels*: None

==== `performance_message_broadcast_total`
*Type*: Counter +
*Description*: Total number of messages broadcast to the network +
*Labels*: None

==== `performance_message_received_total`
*Type*: Counter +
*Description*: Total number of messages received from the network +
*Labels*: None

==== `performance_ping_test_total`
*Type*: Counter +
*Description*: Total number of ping tests performed +
*Labels*: None

==== `performance_ping_test_success_total`
*Type*: Counter +
*Description*: Total number of successful ping tests +
*Labels*: None

==== `performance_ping_test_failed_total`
*Type*: Counter +
*Description*: Total number of failed ping tests +
*Labels*: None

=== Relay Entry Metrics (Beacon Node)

==== `performance_relay_entry_generation_total`
*Type*: Counter +
*Description*: Total number of relay entry generation attempts +
*Labels*: None

==== `performance_relay_entry_success_total`
*Type*: Counter +
*Description*: Total number of successful relay entries generated +
*Labels*: None

==== `performance_relay_entry_failed_total`
*Type*: Counter +
*Description*: Total number of failed relay entry generations +
*Labels*: None

==== `performance_relay_entry_duration_seconds`
*Type*: Gauge (average) +
*Description*: Average duration of relay entry generation in seconds +
*Labels*: None

==== `performance_relay_entry_timeout_reported_total`
*Type*: Counter +
*Description*: Total number of relay entry timeouts reported on-chain +
*Labels*: None
9 changes: 5 additions & 4 deletions pkg/clientinfo/clientinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ var logger = log.Logger("keep-clientinfo")

// Config stores configuration for the client info.
type Config struct {
Port int
NetworkMetricsTick time.Duration
EthereumMetricsTick time.Duration
BitcoinMetricsTick time.Duration
Port int
NetworkMetricsTick time.Duration
EthereumMetricsTick time.Duration
BitcoinMetricsTick time.Duration
RPCHealthCheckInterval time.Duration
}

// Registry wraps keep-common clientinfo registry and exposes additional
Expand Down
12 changes: 11 additions & 1 deletion pkg/clientinfo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package clientinfo

import (
"fmt"
"strings"
"time"

"github.com/keep-network/keep-common/pkg/clientinfo"
Expand Down Expand Up @@ -159,7 +160,16 @@ func (r *Registry) observe(
) {
observer, err := r.NewMetricGaugeObserver(name, clientinfo.MetricObserverInput(input))
if err != nil {
logger.Warnf("could not create gauge observer [%v]", name)
// Check if the error is due to metric already existing (expected in some cases)
errStr := err.Error()
if strings.Contains(errStr, "already exists") {
// Metric already registered, this is expected if registerAllMetrics is called multiple times
// or if the same metric is registered in multiple places. Log at debug level.
logger.Debugf("metric [%v] already registered, skipping duplicate registration: %v", name, err)
return
}
// For other errors, log as warning
logger.Warnf("could not create gauge observer [%v]: %v", name, err)
return
}

Expand Down
Loading
Loading