From c669f0e590d10443d381ef0b17fcae004c79af46 Mon Sep 17 00:00:00 2001 From: Kevin Woo <3469532+kevinawoo@users.noreply.github.com> Date: Mon, 23 Dec 2024 23:13:33 -0800 Subject: [PATCH 1/4] fix(promql-to-scrape): fixed type output --- .../promql-to-scrape/cmd/genconfig/main.go | 6 +-- .../promql-to-scrape/examples/config.yaml | 40 +++++++++---------- .../promql-to-scrape/internal/client.go | 5 ++- .../promql-to-scrape/internal/metric.go | 18 +++++++++ .../promql-to-scrape/internal/serialize.go | 2 +- .../promql-to-scrape/internal/server.go | 9 ++++- 6 files changed, 52 insertions(+), 28 deletions(-) create mode 100644 cloud/observability/promql-to-scrape/internal/metric.go diff --git a/cloud/observability/promql-to-scrape/cmd/genconfig/main.go b/cloud/observability/promql-to-scrape/cmd/genconfig/main.go index 0ea94e7..cb32b01 100644 --- a/cloud/observability/promql-to-scrape/cmd/genconfig/main.go +++ b/cloud/observability/promql-to-scrape/cmd/genconfig/main.go @@ -45,9 +45,9 @@ func main() { if err != nil { log.Fatalf("Failed to pull metric names: %s", err) } - fmt.Println(counters) - fmt.Println(gauges) - fmt.Println(histograms) + fmt.Println("counters: ", counters, "\n") + fmt.Println("gauges: ", gauges, "\n") + fmt.Println("histograms: ", histograms, "\n") conf := internal.Config{} diff --git a/cloud/observability/promql-to-scrape/examples/config.yaml b/cloud/observability/promql-to-scrape/examples/config.yaml index 184c746..7936532 100644 --- a/cloud/observability/promql-to-scrape/examples/config.yaml +++ b/cloud/observability/promql-to-scrape/examples/config.yaml @@ -1,43 +1,43 @@ metrics: - - metric_name: temporal_cloud_v0_frontend_service_error_count:rate1m + - metric_name: temporal_cloud_v0_frontend_service_error_count query: rate(temporal_cloud_v0_frontend_service_error_count[1m]) - metric_name: temporal_cloud_v0_frontend_service_pending_requests query: temporal_cloud_v0_frontend_service_pending_requests - - metric_name: 
temporal_cloud_v0_frontend_service_request_count:rate1m + - metric_name: temporal_cloud_v0_frontend_service_request_count query: rate(temporal_cloud_v0_frontend_service_request_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_count:rate1m + - metric_name: temporal_cloud_v0_poll_success_count query: rate(temporal_cloud_v0_poll_success_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_sync_count:rate1m + - metric_name: temporal_cloud_v0_poll_success_sync_count query: rate(temporal_cloud_v0_poll_success_sync_count[1m]) - - metric_name: temporal_cloud_v0_poll_timeout_count:rate1m + - metric_name: temporal_cloud_v0_poll_timeout_count query: rate(temporal_cloud_v0_poll_timeout_count[1m]) - - metric_name: temporal_cloud_v0_resource_exhausted_error_count:rate1m + - metric_name: temporal_cloud_v0_resource_exhausted_error_count query: rate(temporal_cloud_v0_resource_exhausted_error_count[1m]) - - metric_name: temporal_cloud_v0_schedule_action_success_count:rate1m + - metric_name: temporal_cloud_v0_schedule_action_success_count query: rate(temporal_cloud_v0_schedule_action_success_count[1m]) - - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count:rate1m + - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count query: rate(temporal_cloud_v0_schedule_buffer_overruns_count[1m]) - - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count:rate1m + - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count query: rate(temporal_cloud_v0_schedule_missed_catchup_window_count[1m]) - - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m + - metric_name: temporal_cloud_v0_service_latency_bucket query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le, operation, temporal_namespace)) - - metric_name: temporal_cloud_v0_service_latency_count:rate1m + - metric_name: temporal_cloud_v0_service_latency_count query: rate(temporal_cloud_v0_service_latency_count[1m]) - - 
metric_name: temporal_cloud_v0_service_latency_sum:rate1m + - metric_name: temporal_cloud_v0_service_latency_sum query: rate(temporal_cloud_v0_service_latency_sum[1m]) - - metric_name: temporal_cloud_v0_state_transition_count:rate1m + - metric_name: temporal_cloud_v0_state_transition_count query: rate(temporal_cloud_v0_state_transition_count[1m]) - - metric_name: temporal_cloud_v0_total_action_count:rate1m + - metric_name: temporal_cloud_v0_total_action_count query: rate(temporal_cloud_v0_total_action_count[1m]) - - metric_name: temporal_cloud_v0_workflow_cancel_count:rate1m + - metric_name: temporal_cloud_v0_workflow_cancel_count query: rate(temporal_cloud_v0_workflow_cancel_count[1m]) - - metric_name: temporal_cloud_v0_workflow_continued_as_new_count:rate1m + - metric_name: temporal_cloud_v0_workflow_continued_as_new_count query: rate(temporal_cloud_v0_workflow_continued_as_new_count[1m]) - - metric_name: temporal_cloud_v0_workflow_failed_count:rate1m + - metric_name: temporal_cloud_v0_workflow_failed_count query: rate(temporal_cloud_v0_workflow_failed_count[1m]) - - metric_name: temporal_cloud_v0_workflow_success_count:rate1m + - metric_name: temporal_cloud_v0_workflow_success_count query: rate(temporal_cloud_v0_workflow_success_count[1m]) - - metric_name: temporal_cloud_v0_workflow_terminate_count:rate1m + - metric_name: temporal_cloud_v0_workflow_terminate_count query: rate(temporal_cloud_v0_workflow_terminate_count[1m]) - - metric_name: temporal_cloud_v0_workflow_timeout_count:rate1m + - metric_name: temporal_cloud_v0_workflow_timeout_count query: rate(temporal_cloud_v0_workflow_timeout_count[1m]) diff --git a/cloud/observability/promql-to-scrape/internal/client.go b/cloud/observability/promql-to-scrape/internal/client.go index 4a7ade5..90bbb90 100644 --- a/cloud/observability/promql-to-scrape/internal/client.go +++ b/cloud/observability/promql-to-scrape/internal/client.go @@ -71,9 +71,10 @@ func (c *APIClient) ListMetrics(metricPrefix string) ([]string, 
[]string, []stri if !strings.HasPrefix(string(v), metricPrefix) { continue } - if strings.HasSuffix(string(v), "_bucket") { + t := getMetricType(string(v)) + if t == metricTypeHistogram { histograms = append(histograms, string(v)) - } else if strings.HasSuffix(string(v), "_count") || strings.HasSuffix(string(v), "_sum") { + } else if t == metricTypeCounter { counts = append(counts, string(v)) } else { gauges = append(gauges, string(v)) diff --git a/cloud/observability/promql-to-scrape/internal/metric.go b/cloud/observability/promql-to-scrape/internal/metric.go new file mode 100644 index 0000000..f86ff17 --- /dev/null +++ b/cloud/observability/promql-to-scrape/internal/metric.go @@ -0,0 +1,18 @@ +package internal + +import "strings" + +const ( + metricTypeHistogram = "histogram" + metricTypeCounter = "count" + metricTypeGauge = "gauge" +) + +func getMetricType(v string) string { + if strings.HasSuffix(v, "_bucket") { + return metricTypeHistogram + } else if strings.HasSuffix(v, "_count") || strings.HasSuffix(v, "_sum") { + return metricTypeCounter + } + return metricTypeGauge +} diff --git a/cloud/observability/promql-to-scrape/internal/serialize.go b/cloud/observability/promql-to-scrape/internal/serialize.go index 7d0225e..3524943 100644 --- a/cloud/observability/promql-to-scrape/internal/serialize.go +++ b/cloud/observability/promql-to-scrape/internal/serialize.go @@ -38,7 +38,7 @@ func SamplesToString(queriedMetrics map[string][]*model.Sample) string { sb.WriteString("# TYPE ") sb.WriteString(nameWithoutSuffix) sb.WriteByte(' ') - sb.WriteString("gauge") + sb.WriteString(getMetricType(metricName)) sb.WriteByte('\n') for _, s := range samples { diff --git a/cloud/observability/promql-to-scrape/internal/server.go b/cloud/observability/promql-to-scrape/internal/server.go index acb267a..afb45d2 100644 --- a/cloud/observability/promql-to-scrape/internal/server.go +++ b/cloud/observability/promql-to-scrape/internal/server.go @@ -46,7 +46,11 @@ func (s 
*PromToScrapeServer) metricsHandler(w http.ResponseWriter, r *http.Reque s.RLock() defer s.RUnlock() if time.Since(s.lastSuccessfulTime) < 5*time.Minute { - fmt.Fprint(w, s.data) + _, err := fmt.Fprint(w, s.data) + if err != nil { + w.WriteHeader(http.StatusInternalServerError) + slog.Error("can't serve metrics", "error", err) + } } else { w.WriteHeader(http.StatusInternalServerError) slog.Error("can't serve metrics", "error", "metrics queried are stale (more than 5 minutes old)") @@ -71,7 +75,7 @@ func (s *PromToScrapeServer) run() string { // // keep the objects returned from the query, or convert them into something a bit more ergonomic // and create ConstMetrics with the prometheus client. I happened to have the code lying around for working -// with model.Sample, but the CosntMetrics route is probably more idiomatic and safe. +// with model.Sample, but the ConstMetrics route is probably more idiomatic and safe. func (s *PromToScrapeServer) queryMetrics() { start := time.Now() queriedMetrics, err := QueryMetrics(s.conf, s.client) @@ -88,5 +92,6 @@ func (s *PromToScrapeServer) queryMetrics() { // Start runs the embedded http.Server. 
func (s *PromToScrapeServer) Start() error { + slog.Info("listening on", "addr", s.server.Addr) return s.server.ListenAndServe() } From fe1632795da23bfe38a9f0615f33abe00a544480 Mon Sep 17 00:00:00 2001 From: Kevin Woo <3469532+kevinawoo@users.noreply.github.com> Date: Fri, 27 Dec 2024 13:59:49 -0800 Subject: [PATCH 2/4] fixed typo --- .../promql-to-scrape/internal/{metric.go => metric-types.go} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename cloud/observability/promql-to-scrape/internal/{metric.go => metric-types.go} (91%) diff --git a/cloud/observability/promql-to-scrape/internal/metric.go b/cloud/observability/promql-to-scrape/internal/metric-types.go similarity index 91% rename from cloud/observability/promql-to-scrape/internal/metric.go rename to cloud/observability/promql-to-scrape/internal/metric-types.go index f86ff17..91ac9ed 100644 --- a/cloud/observability/promql-to-scrape/internal/metric.go +++ b/cloud/observability/promql-to-scrape/internal/metric-types.go @@ -4,7 +4,7 @@ import "strings" const ( metricTypeHistogram = "histogram" - metricTypeCounter = "count" + metricTypeCounter = "counter" metricTypeGauge = "gauge" ) From cf0196ab7e1b3cbdbb1fad3a9599375746673269 Mon Sep 17 00:00:00 2001 From: Kevin Woo <3469532+kevinawoo@users.noreply.github.com> Date: Fri, 27 Dec 2024 14:02:22 -0800 Subject: [PATCH 3/4] revert metric names since it's an example --- .../promql-to-scrape/examples/config.yaml | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cloud/observability/promql-to-scrape/examples/config.yaml b/cloud/observability/promql-to-scrape/examples/config.yaml index 7936532..184c746 100644 --- a/cloud/observability/promql-to-scrape/examples/config.yaml +++ b/cloud/observability/promql-to-scrape/examples/config.yaml @@ -1,43 +1,43 @@ metrics: - - metric_name: temporal_cloud_v0_frontend_service_error_count + - metric_name: temporal_cloud_v0_frontend_service_error_count:rate1m query: 
rate(temporal_cloud_v0_frontend_service_error_count[1m]) - metric_name: temporal_cloud_v0_frontend_service_pending_requests query: temporal_cloud_v0_frontend_service_pending_requests - - metric_name: temporal_cloud_v0_frontend_service_request_count + - metric_name: temporal_cloud_v0_frontend_service_request_count:rate1m query: rate(temporal_cloud_v0_frontend_service_request_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_count + - metric_name: temporal_cloud_v0_poll_success_count:rate1m query: rate(temporal_cloud_v0_poll_success_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_sync_count + - metric_name: temporal_cloud_v0_poll_success_sync_count:rate1m query: rate(temporal_cloud_v0_poll_success_sync_count[1m]) - - metric_name: temporal_cloud_v0_poll_timeout_count + - metric_name: temporal_cloud_v0_poll_timeout_count:rate1m query: rate(temporal_cloud_v0_poll_timeout_count[1m]) - - metric_name: temporal_cloud_v0_resource_exhausted_error_count + - metric_name: temporal_cloud_v0_resource_exhausted_error_count:rate1m query: rate(temporal_cloud_v0_resource_exhausted_error_count[1m]) - - metric_name: temporal_cloud_v0_schedule_action_success_count + - metric_name: temporal_cloud_v0_schedule_action_success_count:rate1m query: rate(temporal_cloud_v0_schedule_action_success_count[1m]) - - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count + - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count:rate1m query: rate(temporal_cloud_v0_schedule_buffer_overruns_count[1m]) - - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count + - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count:rate1m query: rate(temporal_cloud_v0_schedule_missed_catchup_window_count[1m]) - - metric_name: temporal_cloud_v0_service_latency_bucket + - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le, operation, 
temporal_namespace)) - - metric_name: temporal_cloud_v0_service_latency_count + - metric_name: temporal_cloud_v0_service_latency_count:rate1m query: rate(temporal_cloud_v0_service_latency_count[1m]) - - metric_name: temporal_cloud_v0_service_latency_sum + - metric_name: temporal_cloud_v0_service_latency_sum:rate1m query: rate(temporal_cloud_v0_service_latency_sum[1m]) - - metric_name: temporal_cloud_v0_state_transition_count + - metric_name: temporal_cloud_v0_state_transition_count:rate1m query: rate(temporal_cloud_v0_state_transition_count[1m]) - - metric_name: temporal_cloud_v0_total_action_count + - metric_name: temporal_cloud_v0_total_action_count:rate1m query: rate(temporal_cloud_v0_total_action_count[1m]) - - metric_name: temporal_cloud_v0_workflow_cancel_count + - metric_name: temporal_cloud_v0_workflow_cancel_count:rate1m query: rate(temporal_cloud_v0_workflow_cancel_count[1m]) - - metric_name: temporal_cloud_v0_workflow_continued_as_new_count + - metric_name: temporal_cloud_v0_workflow_continued_as_new_count:rate1m query: rate(temporal_cloud_v0_workflow_continued_as_new_count[1m]) - - metric_name: temporal_cloud_v0_workflow_failed_count + - metric_name: temporal_cloud_v0_workflow_failed_count:rate1m query: rate(temporal_cloud_v0_workflow_failed_count[1m]) - - metric_name: temporal_cloud_v0_workflow_success_count + - metric_name: temporal_cloud_v0_workflow_success_count:rate1m query: rate(temporal_cloud_v0_workflow_success_count[1m]) - - metric_name: temporal_cloud_v0_workflow_terminate_count + - metric_name: temporal_cloud_v0_workflow_terminate_count:rate1m query: rate(temporal_cloud_v0_workflow_terminate_count[1m]) - - metric_name: temporal_cloud_v0_workflow_timeout_count + - metric_name: temporal_cloud_v0_workflow_timeout_count:rate1m query: rate(temporal_cloud_v0_workflow_timeout_count[1m]) From f60a7f701966d286634e147a874ba8edbe1f184f Mon Sep 17 00:00:00 2001 From: Kevin Woo <3469532+kevinawoo@users.noreply.github.com> Date: Fri, 27 Dec 2024 14:02:22 
-0800 Subject: [PATCH 4/4] revert exported metric names since it's an example --- .../promql-to-scrape/examples/config.yaml | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cloud/observability/promql-to-scrape/examples/config.yaml b/cloud/observability/promql-to-scrape/examples/config.yaml index 7936532..184c746 100644 --- a/cloud/observability/promql-to-scrape/examples/config.yaml +++ b/cloud/observability/promql-to-scrape/examples/config.yaml @@ -1,43 +1,43 @@ metrics: - - metric_name: temporal_cloud_v0_frontend_service_error_count + - metric_name: temporal_cloud_v0_frontend_service_error_count:rate1m query: rate(temporal_cloud_v0_frontend_service_error_count[1m]) - metric_name: temporal_cloud_v0_frontend_service_pending_requests query: temporal_cloud_v0_frontend_service_pending_requests - - metric_name: temporal_cloud_v0_frontend_service_request_count + - metric_name: temporal_cloud_v0_frontend_service_request_count:rate1m query: rate(temporal_cloud_v0_frontend_service_request_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_count + - metric_name: temporal_cloud_v0_poll_success_count:rate1m query: rate(temporal_cloud_v0_poll_success_count[1m]) - - metric_name: temporal_cloud_v0_poll_success_sync_count + - metric_name: temporal_cloud_v0_poll_success_sync_count:rate1m query: rate(temporal_cloud_v0_poll_success_sync_count[1m]) - - metric_name: temporal_cloud_v0_poll_timeout_count + - metric_name: temporal_cloud_v0_poll_timeout_count:rate1m query: rate(temporal_cloud_v0_poll_timeout_count[1m]) - - metric_name: temporal_cloud_v0_resource_exhausted_error_count + - metric_name: temporal_cloud_v0_resource_exhausted_error_count:rate1m query: rate(temporal_cloud_v0_resource_exhausted_error_count[1m]) - - metric_name: temporal_cloud_v0_schedule_action_success_count + - metric_name: temporal_cloud_v0_schedule_action_success_count:rate1m query: rate(temporal_cloud_v0_schedule_action_success_count[1m]) - - metric_name: 
temporal_cloud_v0_schedule_buffer_overruns_count + - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count:rate1m query: rate(temporal_cloud_v0_schedule_buffer_overruns_count[1m]) - - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count + - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count:rate1m query: rate(temporal_cloud_v0_schedule_missed_catchup_window_count[1m]) - - metric_name: temporal_cloud_v0_service_latency_bucket + - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le, operation, temporal_namespace)) - - metric_name: temporal_cloud_v0_service_latency_count + - metric_name: temporal_cloud_v0_service_latency_count:rate1m query: rate(temporal_cloud_v0_service_latency_count[1m]) - - metric_name: temporal_cloud_v0_service_latency_sum + - metric_name: temporal_cloud_v0_service_latency_sum:rate1m query: rate(temporal_cloud_v0_service_latency_sum[1m]) - - metric_name: temporal_cloud_v0_state_transition_count + - metric_name: temporal_cloud_v0_state_transition_count:rate1m query: rate(temporal_cloud_v0_state_transition_count[1m]) - - metric_name: temporal_cloud_v0_total_action_count + - metric_name: temporal_cloud_v0_total_action_count:rate1m query: rate(temporal_cloud_v0_total_action_count[1m]) - - metric_name: temporal_cloud_v0_workflow_cancel_count + - metric_name: temporal_cloud_v0_workflow_cancel_count:rate1m query: rate(temporal_cloud_v0_workflow_cancel_count[1m]) - - metric_name: temporal_cloud_v0_workflow_continued_as_new_count + - metric_name: temporal_cloud_v0_workflow_continued_as_new_count:rate1m query: rate(temporal_cloud_v0_workflow_continued_as_new_count[1m]) - - metric_name: temporal_cloud_v0_workflow_failed_count + - metric_name: temporal_cloud_v0_workflow_failed_count:rate1m query: rate(temporal_cloud_v0_workflow_failed_count[1m]) - - metric_name: 
temporal_cloud_v0_workflow_success_count + - metric_name: temporal_cloud_v0_workflow_success_count:rate1m query: rate(temporal_cloud_v0_workflow_success_count[1m]) - - metric_name: temporal_cloud_v0_workflow_terminate_count + - metric_name: temporal_cloud_v0_workflow_terminate_count:rate1m query: rate(temporal_cloud_v0_workflow_terminate_count[1m]) - - metric_name: temporal_cloud_v0_workflow_timeout_count + - metric_name: temporal_cloud_v0_workflow_timeout_count:rate1m query: rate(temporal_cloud_v0_workflow_timeout_count[1m])