Skip to content

Commit 27b8023

Browse files
author
Pete Stevenson
authored
perf_profiler.cc|h: Implement better metrics for overrun of the expected number of stack trace samples. (#1403)
Summary: We implement some additional metrics for the case where the profiler detects an overrun relative to the expected number of stack trace samples. Although this overrun may not cause issues (because of over-provisioning), we want to understand the magnitude and frequency of the issue so that we can re-assess our provisioning. Type of change: /kind feature Test Plan: Existing tests. We know the existing metrics are reporting infrequent overruns; we expect to see more granular data after these new metrics roll out. Also, added some [verbose logging](https://phab.corp.pixielabs.ai/P400) to capture both expected values and actual values in the metrics. For this experiment, we lowered the provisioned number of stack traces to ensure that the overflow condition was triggered. --------- Signed-off-by: Pete Stevenson <jps@pixielabs.ai>
1 parent 38bc694 commit 27b8023

File tree

2 files changed

+39
-9
lines changed

2 files changed

+39
-9
lines changed

src/stirling/source_connectors/perf_profiler/perf_profile_connector.cc

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ PerfProfileConnector::PerfProfileConnector(std::string_view source_name)
6161
sampling_period_(
6262
std::chrono::milliseconds{1000 * FLAGS_stirling_profiler_table_update_period_seconds}),
6363
push_period_(sampling_period_ / 2),
64+
profiler_state_overflow_gauge_(
65+
BuildGauge("perf_profiler_overflow_gauge",
66+
"Overflow ratio, i.e. actual:expected number of stack traces.")),
67+
profiler_transfer_data_counter_(
68+
BuildCounter("perf_profiler_transfer_data_counter",
69+
"Count of times perf profiler transfer data is invoked.")),
6470
profiler_state_overflow_counter_(
6571
BuildCounter("perf_profiler_overflow",
6672
"Count of times the perf profiler overran CFG_OVERRUN_THRESHOLD")),
@@ -89,19 +95,19 @@ Status PerfProfileConnector::InitImpl() {
8995
IntRoundUpDivide(sampling_period_.count(), stack_trace_sampling_period_.count());
9096

9197
// Because sampling occurs per-cpu, the total number of expected stack traces is:
92-
const int32_t expected_stack_traces = ncpus * expected_stack_traces_per_cpu;
98+
expected_stack_traces_ = ncpus * expected_stack_traces_per_cpu;
9399

94100
// Include some margin to ensure that hash collisions and data races do not cause data drop:
95101
const double stack_traces_overprovision_factor = FLAGS_stirling_profiler_stack_trace_size_factor;
96102

97103
// Compute the size of the stack traces map.
98104
const int32_t provisioned_stack_traces =
99-
static_cast<int32_t>(stack_traces_overprovision_factor * expected_stack_traces);
105+
static_cast<int32_t>(stack_traces_overprovision_factor * expected_stack_traces_);
100106

101107
// A threshold for checking that we've overrun the maps.
102108
// This should be higher than expected_stack_traces due to timing variations,
103109
// but it should be lower than provisioned_stack_traces.
104-
const int32_t overrun_threshold = (expected_stack_traces + provisioned_stack_traces) / 2;
110+
const int32_t overrun_threshold = (expected_stack_traces_ + provisioned_stack_traces) / 2;
105111

106112
// Compute the size of the perf buffers.
107113
const double perf_buffer_overprovision_factor = FLAGS_stirling_profiler_perf_buffer_size_factor;
@@ -354,23 +360,44 @@ void PerfProfileConnector::ProcessBPFStackTraces(ConnectorContext* ctx, DataTabl
354360
// Read BPF stack traces & histogram, build records, incorporate records to data table.
355361
CreateRecords(stack_traces.get(), ctx, data_table);
356362

363+
uint64_t num_stack_traces_sampled;
364+
profiler_state_->get_value(sample_count_idx, num_stack_traces_sampled);
365+
CheckProfilerState(num_stack_traces_sampled);
366+
357367
// Now that we've consumed the data, reset the sample count in BPF.
358368
profiler_state_->update_value(sample_count_idx, 0);
359369
}
360370

361-
void PerfProfileConnector::CheckProfilerState() {
371+
void PerfProfileConnector::CheckProfilerState(const uint64_t num_stack_traces) {
362372
uint64_t error_code;
363373
profiler_state_->get_value(kErrorStatusIdx, error_code);
364374

365375
DCHECK_EQ(error_code, kPerfProfilerStatusOk);
366376

367377
switch (error_code) {
368-
case kOverflowError:
378+
case kOverflowError: {
379+
// overflow_ratio is actual:expected. That is, the actual number of stack traces sampled
380+
// vs. the expected number of stack traces. We keep its max value in the gauge.
381+
const double overflow_ratio =
382+
static_cast<double>(num_stack_traces) / static_cast<double>(expected_stack_traces_);
383+
if (overflow_ratio > profiler_state_overflow_gauge_.Value()) {
384+
profiler_state_overflow_gauge_.Set(overflow_ratio);
385+
}
386+
387+
// Compute the increment to profiler_transfer_data_counter_ such that the counter value is
388+
// equal to the total number of transfer data invocations.
389+
const double current_transfer_counter = profiler_transfer_data_counter_.Value();
390+
const double transfer_count_increment = transfer_count_ - current_transfer_counter;
391+
392+
// Track the total number of overflows and the total number of transfer data invocations.
393+
profiler_transfer_data_counter_.Increment(transfer_count_increment);
369394
profiler_state_overflow_counter_.Increment();
370395
break;
371-
case kMapReadFailureError:
396+
}
397+
case kMapReadFailureError: {
372398
profiler_state_map_read_error_counter_.Increment();
373399
break;
400+
}
374401
}
375402
// Reset the BPF map to its default value so that each occurrence
376403
// can be detected.
@@ -399,8 +426,6 @@ void PerfProfileConnector::TransferDataImpl(ConnectorContext* ctx) {
399426
if (sampling_freq_mgr_.count() % stats_log_interval_ == 0) {
400427
PrintStats();
401428
}
402-
403-
CheckProfilerState();
404429
}
405430

406431
void PerfProfileConnector::PrintStats() const {

src/stirling/source_connectors/perf_profiler/perf_profile_connector.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,16 +102,21 @@ class PerfProfileConnector : public SourceConnector, public bpf_tools::BCCWrappe
102102
void CleanupSymbolizers(const absl::flat_hash_set<md::UPID>& deleted_upids);
103103

104104
void PrintStats() const;
105-
void CheckProfilerState();
105+
void CheckProfilerState(const uint64_t num_stack_traces);
106106

107107
// data structures shared with BPF:
108108
std::unique_ptr<ebpf::BPFStackTable> stack_traces_a_;
109109
std::unique_ptr<ebpf::BPFStackTable> stack_traces_b_;
110110

111111
std::unique_ptr<ebpf::BPFArrayTable<uint64_t>> profiler_state_;
112+
prometheus::Gauge& profiler_state_overflow_gauge_;
113+
prometheus::Counter& profiler_transfer_data_counter_;
112114
prometheus::Counter& profiler_state_overflow_counter_;
113115
prometheus::Counter& profiler_state_map_read_error_counter_;
114116

117+
// Expected number of stack traces sampled per transfer data invocation.
118+
int32_t expected_stack_traces_;
119+
115120
// Number of iterations, where each iteration drains the information collected in BPF.
116121
uint64_t transfer_count_ = 0;
117122

0 commit comments

Comments
 (0)