Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 80 additions & 24 deletions crates/audit/src/archiver.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
use crate::metrics::Metrics;
use crate::reader::EventReader;
use crate::storage::EventWriter;
use crate::metrics::{
EventType, decrement_in_flight_archive_tasks, increment_events_processed,
increment_failed_archive_tasks, increment_in_flight_archive_tasks,
record_archive_event_duration, record_event_age, record_kafka_commit_duration,
record_kafka_read_duration,
};
use crate::reader::{EventReader, UserOpEventReader};
use crate::storage::{EventWriter, UserOpEventWriter};
use anyhow::Result;
use std::fmt;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
Expand All @@ -15,7 +20,6 @@ where
{
reader: R,
writer: W,
metrics: Metrics,
}

impl<R, W> fmt::Debug for KafkaAuditArchiver<R, W>
Expand All @@ -35,11 +39,7 @@ where
{
/// Creates a new archiver with the given reader and writer.
pub fn new(reader: R, writer: W) -> Self {
Self {
reader,
writer,
metrics: Metrics::default(),
}
Self { reader, writer }
}

/// Runs the archiver loop, reading events and writing them to storage.
Expand All @@ -50,42 +50,35 @@ where
let read_start = Instant::now();
match self.reader.read_event().await {
Ok(event) => {
self.metrics
.kafka_read_duration
.record(read_start.elapsed().as_secs_f64());
record_kafka_read_duration(read_start.elapsed(), EventType::Bundle);

let now_ms = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as i64;
let event_age_ms = now_ms.saturating_sub(event.timestamp);
self.metrics.event_age.record(event_age_ms as f64);
record_event_age(event_age_ms as f64, EventType::Bundle);

// TODO: the integration test breaks because Minio doesn't support etag
let writer = self.writer.clone();
let metrics = self.metrics.clone();
self.metrics.in_flight_archive_tasks.increment(1.0);
increment_in_flight_archive_tasks(EventType::Bundle);
tokio::spawn(async move {
let archive_start = Instant::now();
if let Err(e) = writer.archive_event(event).await {
error!(error = %e, "Failed to write event");
metrics.failed_archive_tasks.increment(1);
increment_failed_archive_tasks(EventType::Bundle);
} else {
metrics
.archive_event_duration
.record(archive_start.elapsed().as_secs_f64());
metrics.events_processed.increment(1);
record_archive_event_duration(archive_start.elapsed(), EventType::Bundle);
increment_events_processed(EventType::Bundle);
}
metrics.in_flight_archive_tasks.decrement(1.0);
decrement_in_flight_archive_tasks(EventType::Bundle);
});

let commit_start = Instant::now();
if let Err(e) = self.reader.commit().await {
error!(error = %e, "Failed to commit message");
}
self.metrics
.kafka_commit_duration
.record(commit_start.elapsed().as_secs_f64());
record_kafka_commit_duration(commit_start.elapsed(), EventType::Bundle);
}
Err(e) => {
error!(error = %e, "Error reading events");
Expand All @@ -95,3 +88,66 @@ where
}
}
}

/// Archiver that consumes UserOp events from Kafka and persists them to storage.
///
/// Mirrors `KafkaAuditArchiver`, but for the UserOp event stream. Each event is
/// archived on a spawned task (hence the `Clone + Send + 'static` bound on the
/// writer) while offsets are committed inline on the read loop.
pub struct KafkaUserOpAuditArchiver<R, W>
where
    R: UserOpEventReader,
    W: UserOpEventWriter + Clone + Send + 'static,
{
    // Source of UserOp events (presumably a Kafka consumer — see `commit()` usage).
    reader: R,
    // Destination for archived events; cloned into each spawned archive task.
    writer: W,
}

impl<R, W> KafkaUserOpAuditArchiver<R, W>
where
    R: UserOpEventReader,
    W: UserOpEventWriter + Clone + Send + 'static,
{
    /// Creates a new archiver with the given reader and writer.
    pub fn new(reader: R, writer: W) -> Self {
        Self { reader, writer }
    }

    /// Runs the archiver loop, reading UserOp events and writing them to storage.
    ///
    /// Does not return under normal operation: read errors are logged and the
    /// loop retries after a one-second backoff. Each successfully read event is
    /// archived on a detached `tokio::spawn` task so a slow write does not block
    /// the read loop. NOTE(review): the offset is committed right after spawning,
    /// i.e. before the archive write is known to have succeeded, so a failed
    /// archive task is not re-delivered. All metrics are tagged `EventType::UserOp`.
    pub async fn run(&mut self) -> Result<()> {
        info!("Starting Kafka UserOp archiver");

        loop {
            let read_start = Instant::now();
            match self.reader.read_event().await {
                Ok(event) => {
                    record_kafka_read_duration(read_start.elapsed(), EventType::UserOp);

                    // Event age = wall-clock now minus the event's own timestamp
                    // (both in ms). saturating_sub guards against a negative age
                    // when clocks are skewed.
                    let now_ms = SystemTime::now()
                        .duration_since(UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_millis() as i64;
                    let event_age_ms = now_ms.saturating_sub(event.timestamp);
                    record_event_age(event_age_ms as f64, EventType::UserOp);

                    // Archive concurrently; the writer clone moves into the task.
                    // The gauge is incremented here and decremented when the
                    // task finishes, success or failure.
                    let writer = self.writer.clone();
                    increment_in_flight_archive_tasks(EventType::UserOp);
                    tokio::spawn(async move {
                        let archive_start = Instant::now();
                        if let Err(e) = writer.archive_userop_event(event).await {
                            error!(error = %e, "Failed to write UserOp event");
                            increment_failed_archive_tasks(EventType::UserOp);
                        } else {
                            record_archive_event_duration(archive_start.elapsed(), EventType::UserOp);
                            increment_events_processed(EventType::UserOp);
                        }
                        decrement_in_flight_archive_tasks(EventType::UserOp);
                    });

                    // Commit the offset regardless of archive outcome; a failed
                    // commit is logged but does not stop the loop.
                    let commit_start = Instant::now();
                    if let Err(e) = self.reader.commit().await {
                        error!(error = %e, "Failed to commit message");
                    }
                    record_kafka_commit_duration(commit_start.elapsed(), EventType::UserOp);
                }
                Err(e) => {
                    // Brief backoff so a persistent read failure does not spin hot.
                    error!(error = %e, "Error reading UserOp events");
                    sleep(Duration::from_secs(1)).await;
                }
            }
        }
    }
}
83 changes: 55 additions & 28 deletions crates/audit/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,65 @@
use metrics::{Counter, Gauge, Histogram};
use metrics_derive::Metrics;
use std::time::Duration;

/// Metrics for audit operations including Kafka reads, S3 writes, and event processing.
#[derive(Metrics, Clone)]
#[metrics(scope = "tips_audit")]
pub struct Metrics {
/// Duration of archive_event operations.
#[metric(describe = "Duration of archive_event")]
pub archive_event_duration: Histogram,
/// Event type tag for metrics differentiation.
///
/// Used as the `"type"` label value on every audit metric so the Bundle and
/// UserOp pipelines can share metric names while remaining distinguishable.
// Debug/PartialEq/Eq/Hash added: public enums should be debuggable and
// comparable (e.g. in logs, assertions, or as map keys); all derives are
// backward-compatible for existing callers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum EventType {
    Bundle,
    UserOp,
}

impl EventType {
pub fn as_str(&self) -> &'static str {
match self {
EventType::Bundle => "bundle",
EventType::UserOp => "userop",
}
}
}

/// Age of event when processed (now - event timestamp).
#[metric(describe = "Age of event when processed (now - event timestamp)")]
pub event_age: Histogram,
/// Records how long a single archive_event operation took, tagged by event type.
pub fn record_archive_event_duration(duration: Duration, event_type: EventType) {
    let label = event_type.as_str();
    let histogram = metrics::histogram!("tips_audit_archive_event_duration", "type" => label);
    histogram.record(duration.as_secs_f64());
}

/// Records the observed age of an event in milliseconds, tagged by event type.
pub fn record_event_age(age_ms: f64, event_type: EventType) {
    let histogram = metrics::histogram!("tips_audit_event_age", "type" => event_type.as_str());
    histogram.record(age_ms);
}

/// Duration of Kafka read_event operations.
#[metric(describe = "Duration of Kafka read_event")]
pub kafka_read_duration: Histogram,
/// Records how long a single Kafka read_event call took, tagged by event type.
pub fn record_kafka_read_duration(duration: Duration, event_type: EventType) {
    let seconds = duration.as_secs_f64();
    let label = event_type.as_str();
    metrics::histogram!("tips_audit_kafka_read_duration", "type" => label).record(seconds);
}

/// Duration of Kafka commit operations.
#[metric(describe = "Duration of Kafka commit")]
pub kafka_commit_duration: Histogram,
/// Records how long a single Kafka commit took, tagged by event type.
pub fn record_kafka_commit_duration(duration: Duration, event_type: EventType) {
    let seconds = duration.as_secs_f64();
    let histogram =
        metrics::histogram!("tips_audit_kafka_commit_duration", "type" => event_type.as_str());
    histogram.record(seconds);
}

/// Bumps the total-events-processed counter by one for the given event type.
pub fn increment_events_processed(event_type: EventType) {
    let counter = metrics::counter!("tips_audit_events_processed", "type" => event_type.as_str());
    counter.increment(1);
}

/// Raises the gauge tracking currently-running archive tasks by one.
pub fn increment_in_flight_archive_tasks(event_type: EventType) {
    let label = event_type.as_str();
    let gauge = metrics::gauge!("tips_audit_in_flight_archive_tasks", "type" => label);
    gauge.increment(1.0);
}

/// Lowers the gauge tracking currently-running archive tasks by one.
pub fn decrement_in_flight_archive_tasks(event_type: EventType) {
    let label = event_type.as_str();
    let gauge = metrics::gauge!("tips_audit_in_flight_archive_tasks", "type" => label);
    gauge.decrement(1.0);
}

/// Bumps the counter of archive tasks that ended in an error by one.
pub fn increment_failed_archive_tasks(event_type: EventType) {
    let counter =
        metrics::counter!("tips_audit_failed_archive_tasks", "type" => event_type.as_str());
    counter.increment(1);
}

/// Metrics for audit operations including Kafka reads, S3 writes, and event processing.
#[derive(Metrics, Clone)]
#[metrics(scope = "tips_audit")]
pub struct Metrics {
/// Duration of update_bundle_history operations.
#[metric(describe = "Duration of update_bundle_history")]
pub update_bundle_history_duration: Histogram,
Expand All @@ -37,19 +76,7 @@ pub struct Metrics {
#[metric(describe = "Duration of S3 put_object")]
pub s3_put_duration: Histogram,

/// Total events processed.
#[metric(describe = "Total events processed")]
pub events_processed: Counter,

/// Total S3 writes skipped due to deduplication.
#[metric(describe = "Total S3 writes skipped due to dedup")]
pub s3_writes_skipped: Counter,

/// Number of in-flight archive tasks.
#[metric(describe = "Number of in-flight archive tasks")]
pub in_flight_archive_tasks: Gauge,

/// Number of failed archive tasks.
#[metric(describe = "Number of failed archive tasks")]
pub failed_archive_tasks: Counter,
}
Loading
Loading