From eb733d32c2aaad575a1b367b29f96f9ba63a336e Mon Sep 17 00:00:00 2001 From: johha Date: Thu, 18 Dec 2025 09:34:02 +0100 Subject: [PATCH] Add vital and db connection metrics for clock and deployment updater --- ..._webserver.rb => api_metrics_webserver.rb} | 2 +- lib/cloud_controller/clock/scheduler.rb | 7 ++ .../config_schemas/clock_schema.rb | 3 + .../deployment_updater_schema.rb | 3 + lib/cloud_controller/db.rb | 6 +- lib/cloud_controller/dependency_locator.rb | 20 ++++- .../deployment_updater/scheduler.rb | 6 ++ lib/cloud_controller/execution_context.rb | 39 ++++++++++ .../metrics/periodic_updater.rb | 46 ++++++----- .../metrics/prometheus_updater.rb | 70 +++++++++++++---- lib/cloud_controller/runner.rb | 7 +- lib/cloud_controller/runners/puma_runner.rb | 19 ++--- .../standalone_metrics_webserver.rb | 76 +++++++++++++++++++ lib/delayed_job/delayed_worker.rb | 46 ++--------- .../delayed_jobs_metrics.rb | 2 +- lib/sequel/extensions/connection_metrics.rb | 6 +- lib/tasks/clock.rake | 4 +- lib/tasks/deployment_updater.rake | 4 +- lib/tasks/jobs.rake | 5 +- lib/vcap/stats.rb | 8 +- spec/request/internal/metrics_spec.rb | 15 ++-- spec/request/status_spec.rb | 2 +- spec/request/v2/auth_spec.rb | 8 +- ..._spec.rb => api_metrics_webserver_spec.rb} | 2 +- .../cloud_controller/clock/scheduler_spec.rb | 29 +++++++ .../deployment_updater/scheduler_spec.rb | 26 +++++++ .../execution_context_spec.rb | 69 +++++++++++++++++ .../metrics/periodic_updater_spec.rb | 57 +++++++++++++- .../metrics/prometheus_updater_spec.rb | 40 ++++++++++ spec/unit/lib/cloud_controller/runner_spec.rb | 4 +- .../runners/puma_runner_spec.rb | 16 ---- .../standalone_metrics_webserver_spec.rb | 55 ++++++++++++++ .../lib/delayed_job/delayed_worker_spec.rb | 8 +- .../extensions/connection_metrics_spec.rb | 18 ++--- 34 files changed, 571 insertions(+), 157 deletions(-) rename lib/cloud_controller/{metrics_webserver.rb => api_metrics_webserver.rb} (98%) create mode 100644 lib/cloud_controller/execution_context.rb create mode 100644 lib/cloud_controller/standalone_metrics_webserver.rb rename spec/unit/lib/cloud_controller/{metrics_webserver_spec.rb => api_metrics_webserver_spec.rb} (96%) create mode 100644 spec/unit/lib/cloud_controller/execution_context_spec.rb create mode 100644 spec/unit/lib/cloud_controller/standalone_metrics_webserver_spec.rb diff --git a/lib/cloud_controller/metrics_webserver.rb b/lib/cloud_controller/api_metrics_webserver.rb similarity index 98% rename from lib/cloud_controller/metrics_webserver.rb rename to lib/cloud_controller/api_metrics_webserver.rb index 447f5d569ab..72602498f3c 100644 --- a/lib/cloud_controller/metrics_webserver.rb +++ b/lib/cloud_controller/api_metrics_webserver.rb @@ -3,7 +3,7 @@ module VCAP module CloudController - class MetricsWebserver + class ApiMetricsWebserver attr_reader :app def initialize diff --git a/lib/cloud_controller/clock/scheduler.rb b/lib/cloud_controller/clock/scheduler.rb index cc2dbb853b6..e8c7a9ba9fe 100644 --- a/lib/cloud_controller/clock/scheduler.rb +++ b/lib/cloud_controller/clock/scheduler.rb @@ -1,6 +1,7 @@ require 'clockwork' require 'cloud_controller/clock/clock' require 'cloud_controller/clock/job_timeout_calculator' +require 'cloud_controller/standalone_metrics_webserver' module VCAP::CloudController class Scheduler @@ -35,6 +36,12 @@ def initialize(config) end def start + if @config.get(:publish_metrics) || false + StandaloneMetricsWebserver.start_for_bosh_job(@config.get(:prometheus_port) || 9394) + periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater + periodic_updater.setup_updates + end + start_daily_jobs start_frequent_jobs start_inline_jobs diff --git a/lib/cloud_controller/config_schemas/clock_schema.rb b/lib/cloud_controller/config_schemas/clock_schema.rb index 14c9952c0eb..06ade4c2d3f 100644 --- a/lib/cloud_controller/config_schemas/clock_schema.rb +++ b/lib/cloud_controller/config_schemas/clock_schema.rb @@ -165,6 +165,9 @@ class ClockSchema < VCAP::Config optional(:port) => Integer }, + optional(:publish_metrics) => bool, + optional(:prometheus_port) => Integer, + skip_cert_verify: bool, optional(:routing_api) => { diff --git a/lib/cloud_controller/config_schemas/deployment_updater_schema.rb b/lib/cloud_controller/config_schemas/deployment_updater_schema.rb index 267cb8acc8c..9760b4b5b44 100644 --- a/lib/cloud_controller/config_schemas/deployment_updater_schema.rb +++ b/lib/cloud_controller/config_schemas/deployment_updater_schema.rb @@ -148,6 +148,9 @@ class DeploymentUpdaterSchema < VCAP::Config stacks_file: String, + optional(:publish_metrics) => bool, + optional(:prometheus_port) => Integer, + skip_cert_verify: bool, optional(:credhub_api) => { diff --git a/lib/cloud_controller/db.rb b/lib/cloud_controller/db.rb index b1e81d970cf..78648843d50 100644 --- a/lib/cloud_controller/db.rb +++ b/lib/cloud_controller/db.rb @@ -2,6 +2,7 @@ require 'cloud_controller/db_migrator' require 'cloud_controller/db_connection/options_factory' require 'cloud_controller/db_connection/finalizer' +require 'cloud_controller/execution_context' require 'sequel/extensions/query_length_logging' require 'sequel/extensions/request_query_metrics' @@ -73,8 +74,9 @@ def self.add_connection_expiration_extension(db, opts) end def self.add_connection_metrics_extension(db) - # only add the metrics for api and cc-worker processes. Otherwise e.g. rake db:migrate would also initialize metric updaters, which need additional config - return if Object.const_defined?(:RakeConfig) && RakeConfig.context != :worker + # only add the metrics for api, cc-worker, clock & deployment_updater processes. + # Otherwise, e.g. rake db:migrate would also initialize metric updaters, which need additional config + return if ExecutionContext.from_process_type_env.nil? db.extension(:connection_metrics) # so that we gather connection metrics from the beginning diff --git a/lib/cloud_controller/dependency_locator.rb b/lib/cloud_controller/dependency_locator.rb index 6d7aabc4fb8..7b3271d916c 100644 --- a/lib/cloud_controller/dependency_locator.rb +++ b/lib/cloud_controller/dependency_locator.rb @@ -24,6 +24,7 @@ require 'cloud_controller/packager/local_bits_packer' require 'credhub/client' require 'cloud_controller/metrics/prometheus_updater' +require 'cloud_controller/execution_context' module CloudController class DependencyLocator @@ -70,12 +71,21 @@ def periodic_updater )) end - def prometheus_updater - @dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new) + def vitals_periodic_updater + @dependencies[:vitals_periodic_updater] || + register(:vitals_periodic_updater, + VCAP::CloudController::Metrics::PeriodicUpdater.new( + Time.now.utc, + log_counter, + Steno.logger('cc.vitals'), + statsd_updater, + prometheus_updater, + task_list: [VCAP::CloudController::Metrics::PeriodicUpdater::VITALS_TASK], + )) end - def cc_worker_prometheus_updater - @dependencies[:cc_worker_prometheus_updater] || register(:cc_worker_prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new(cc_worker: true)) + def prometheus_updater + @dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new) end def statsd_updater @@ -362,6 +372,8 @@ def statsd_client else register(:statsd_client, NullStatsdClient.new) end + rescue VCAP::CloudController::Config::InvalidConfigPath + register(:statsd_client, NullStatsdClient.new) end private diff --git a/lib/cloud_controller/deployment_updater/scheduler.rb b/lib/cloud_controller/deployment_updater/scheduler.rb index c6b2b5a1a5e..edb5695365d 100644 --- a/lib/cloud_controller/deployment_updater/scheduler.rb +++ b/lib/cloud_controller/deployment_updater/scheduler.rb @@ -1,6 +1,7 @@ require 'cloud_controller/deployment_updater/dispatcher' require 'locket/lock_worker' require 'locket/lock_runner' +require 'cloud_controller/standalone_metrics_webserver' module VCAP::CloudController module DeploymentUpdater @@ -9,6 +10,11 @@ class << self def start with_error_logging('cc.deployment_updater') do config = CloudController::DependencyLocator.instance.config + if config.get(:publish_metrics) || false + VCAP::CloudController::StandaloneMetricsWebserver.start_for_bosh_job(config.get(:prometheus_port) || 9395) + periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater + periodic_updater.setup_updates + end statsd_client = CloudController::DependencyLocator.instance.statsd_client update_step = proc { diff --git a/lib/cloud_controller/execution_context.rb b/lib/cloud_controller/execution_context.rb new file mode 100644 index 00000000000..08f9f863cbb --- /dev/null +++ b/lib/cloud_controller/execution_context.rb @@ -0,0 +1,39 @@ +module VCAP::CloudController + class ExecutionContext + ExecutionInfo = Struct.new(:process_type, :capi_job_name, :rake_context, keyword_init: true) do + def initialize(process_type:, capi_job_name:, rake_context: nil) + super + end + + def set_process_type_env + ENV['PROCESS_TYPE'] = process_type + end + + def set_rake_context + raise 'RakeConfig is not defined or rake_context argument is nil' if rake_context.nil? || !Object.const_defined?(:RakeConfig) + + RakeConfig.context = rake_context + end + end + + API_PUMA_MAIN = ExecutionInfo.new(process_type: 'main', capi_job_name: 'cloud_controller_ng') + API_PUMA_WORKER = ExecutionInfo.new(process_type: 'puma_worker', capi_job_name: 'cloud_controller_ng') + CC_WORKER = ExecutionInfo.new(process_type: 'cc-worker', capi_job_name: 'cloud_controller_worker', rake_context: :worker) + CLOCK = ExecutionInfo.new(process_type: 'clock', capi_job_name: 'cloud_controller_clock', rake_context: :clock) + DEPLOYMENT_UPDATER = ExecutionInfo.new(process_type: 'deployment_updater', capi_job_name: 'cc_deployment_updater', rake_context: :deployment_updater) + + ALL_EXECUTION_CONTEXTS = [API_PUMA_MAIN, API_PUMA_WORKER, CC_WORKER, CLOCK, DEPLOYMENT_UPDATER].freeze + + class << self + def from_process_type_env + process_type = ENV.fetch('PROCESS_TYPE', nil) + exec_ctx = ALL_EXECUTION_CONTEXTS.find { |p| p.process_type == process_type } + + # For test environments where PROCESS_TYPE may not be set, default to API_PUMA_MAIN + exec_ctx = API_PUMA_MAIN if exec_ctx.nil? && ENV.fetch('CC_TEST', nil) == 'true' + + exec_ctx + end + end + end +end diff --git a/lib/cloud_controller/metrics/periodic_updater.rb b/lib/cloud_controller/metrics/periodic_updater.rb index b40ae97176f..51a57ff5548 100644 --- a/lib/cloud_controller/metrics/periodic_updater.rb +++ b/lib/cloud_controller/metrics/periodic_updater.rb @@ -3,29 +3,35 @@ module VCAP::CloudController::Metrics class PeriodicUpdater - def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater) + UPDATE_TASK = Struct.new(:method_name, :interval) + + USER_COUNT_TASK = UPDATE_TASK.new(:update_user_count, 600).freeze + JOB_QUEUE_LENGTH_TASK = UPDATE_TASK.new(:update_job_queue_length, 30).freeze + JOB_QUEUE_LOAD_TASK = UPDATE_TASK.new(:update_job_queue_load, 30).freeze + FAILED_JOB_COUNT_TASK = UPDATE_TASK.new(:update_failed_job_count, 30).freeze + VITALS_TASK = UPDATE_TASK.new(:update_vitals, 30).freeze + LOG_COUNTS_TASK = UPDATE_TASK.new(:update_log_counts, 30).freeze + TASK_STATS_TASK = UPDATE_TASK.new(:update_task_stats, 30).freeze + DEPLOYING_COUNT_TASK = UPDATE_TASK.new(:update_deploying_count, 30).freeze + WEBSERVER_STATS_TASK = UPDATE_TASK.new(:update_webserver_stats, 30).freeze + + ALL_TASKS = [USER_COUNT_TASK, JOB_QUEUE_LENGTH_TASK, JOB_QUEUE_LOAD_TASK, FAILED_JOB_COUNT_TASK, VITALS_TASK, LOG_COUNTS_TASK, TASK_STATS_TASK, DEPLOYING_COUNT_TASK, + WEBSERVER_STATS_TASK].freeze + + def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater, task_list: ALL_TASKS) @start_time = start_time @statsd_updater = statsd_updater @prometheus_updater = prometheus_updater @log_counter = log_counter @logger = logger - @known_job_queues = { - VCAP::CloudController::Jobs::Queues.local(VCAP::CloudController::Config.config).to_sym => 0 - } + @known_job_queues = { VCAP::CloudController::Jobs::Queues.local(VCAP::CloudController::Config.config).to_sym => 0 } + @task_list = task_list end def setup_updates - update! + @task_list.each { |task| update!(task) } @update_tasks = [] - @update_tasks << Concurrent::TimerTask.new(execution_interval: 600) { catch_error { update_user_count } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_job_queue_length } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_job_queue_load } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_failed_job_count } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_vitals } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_log_counts } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_task_stats } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_deploying_count } } - @update_tasks << Concurrent::TimerTask.new(execution_interval: 30) { catch_error { update_webserver_stats } } + @task_list.each { |task| @update_tasks << Concurrent::TimerTask.new(execution_interval: task.interval) { catch_error { update!(task) } } } @update_tasks.each(&:execute) end @@ -35,16 +41,8 @@ def stop_updates @update_tasks.each(&:shutdown) end - def update! - update_user_count - update_job_queue_length - update_job_queue_load - update_failed_job_count - update_vitals - update_log_counts - update_task_stats - update_deploying_count - update_webserver_stats + def update!(task) + send(task.method_name) end def catch_error diff --git a/lib/cloud_controller/metrics/prometheus_updater.rb b/lib/cloud_controller/metrics/prometheus_updater.rb index 966cb9e1baf..7425feafcc8 100644 --- a/lib/cloud_controller/metrics/prometheus_updater.rb +++ b/lib/cloud_controller/metrics/prometheus_updater.rb @@ -1,5 +1,6 @@ require 'prometheus/client' require 'prometheus/client/data_stores/direct_file_store' +require 'cloud_controller/execution_context' module VCAP::CloudController::Metrics class PrometheusUpdater @@ -29,12 +30,6 @@ def self.allow_pid_label { type: :histogram, name: :cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events', buckets: DURATION_BUCKETS }, { type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding', aggregation: :sum }, { type: :counter, name: :cc_requests_completed_total, docstring: 'Requests completed' }, - { type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at', aggregation: :most_recent }, - { type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes', aggregation: :most_recent }, - { type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg', aggregation: :most_recent }, - { type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes', aggregation: :most_recent }, - { type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes', aggregation: :most_recent }, - { type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores', aggregation: :most_recent }, { type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks', aggregation: :most_recent }, { type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks', aggregation: :most_recent }, { type: :gauge, name: :cc_users_total, docstring: 'Number of users', aggregation: :most_recent }, @@ -67,19 +62,68 @@ def self.allow_pid_label { type: :histogram, name: :cc_job_duration_seconds, docstring: 'Job processing time (start to finish)', labels: %i[queue worker], buckets: DELAYED_JOB_METRIC_BUCKETS } ].freeze - def initialize(registry: Prometheus::Client.registry, cc_worker: false) + VITAL_METRICS = [ + { type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores', aggregation: :most_recent } + ].freeze + + def initialize(registry: Prometheus::Client.registry) self.class.allow_pid_label @registry = registry + execution_context = VCAP::CloudController::ExecutionContext.from_process_type_env - # Register all metrics, to initialize them for discoverability - DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) } - DELAYED_JOB_METRICS.each { |metric| register(metric) } + register_metrics_for_process(execution_context) + initialize_cc_db_connection_pool_timeouts_total(execution_context) + end + + private + + # rubocop:disable Metrics/CyclomaticComplexity + def register_metrics_for_process(execution_context) + case execution_context + when VCAP::CloudController::ExecutionContext::CC_WORKER + DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) } + DELAYED_JOB_METRICS.each { |metric| register(metric) } + VITAL_METRICS.each { |metric| register(metric) } + when VCAP::CloudController::ExecutionContext::CLOCK, VCAP::CloudController::ExecutionContext::DEPLOYMENT_UPDATER + DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) } + VITAL_METRICS.each { |metric| register(metric) } + when VCAP::CloudController::ExecutionContext::API_PUMA_MAIN, VCAP::CloudController::ExecutionContext::API_PUMA_WORKER + DB_CONNECTION_POOL_METRICS.each { |metric| register(metric) } + DELAYED_JOB_METRICS.each { |metric| register(metric) } + VITAL_METRICS.each { |metric| register(metric) } + METRICS.each { |metric| register(metric) } + PUMA_METRICS.each { |metric| register(metric) } if is_puma_webserver? + else + raise 'Could not register Prometheus metrics: Unknown execution context' + end + end + # rubocop:enable Metrics/CyclomaticComplexity + + def initialize_cc_db_connection_pool_timeouts_total(execution_context) + return if execution_context.nil? # In unit tests, the execution context might not be set - thus skip initialization + return unless @registry.exist?(:cc_db_connection_pool_timeouts_total) # If the metric is not registered, we don't need to initialize it + + # initialize metric with 0 for discoverability, because it likely won't get updated on healthy systems + update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: execution_context.process_type }) + + return unless execution_context == VCAP::CloudController::ExecutionContext::API_PUMA_MAIN + + # also initialize for puma_worker + update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: VCAP::CloudController::ExecutionContext::API_PUMA_WORKER.process_type }) + end - return if cc_worker + public - METRICS.each { |metric| register(metric) } - PUMA_METRICS.each { |metric| register(metric) } if VCAP::CloudController::Config.config&.get(:webserver) == 'puma' + def is_puma_webserver? + VCAP::CloudController::Config.config&.get(:webserver) == 'puma' + rescue VCAP::CloudController::Config::InvalidConfigPath + false end def update_gauge_metric(metric, value, labels: {}) diff --git a/lib/cloud_controller/runner.rb b/lib/cloud_controller/runner.rb index 001528fb466..bc9ac2e989e 100644 --- a/lib/cloud_controller/runner.rb +++ b/lib/cloud_controller/runner.rb @@ -12,7 +12,8 @@ require 'cloud_controller/telemetry_logger' require 'cloud_controller/secrets_fetcher' require 'cloud_controller/runners/puma_runner' -require 'cloud_controller/metrics_webserver' +require 'cloud_controller/api_metrics_webserver' +require 'cloud_controller/execution_context' require 'prometheus/client/data_stores/direct_file_store' require 'prometheus/middleware/exporter' @@ -33,7 +34,7 @@ def initialize(argv) # DB connection metrics have a label to determine whether the process accessing the connection is the # main or a worker process. We need to set this env variable before `setup_db` otherwise the main process # will show up twice in the metrics as main and worker. - ENV['PROCESS_TYPE'] = 'main' + VCAP::CloudController::ExecutionContext::API_PUMA_MAIN.set_process_type_env setup_cloud_controller @@ -130,7 +131,7 @@ def setup_metrics Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir) Thread.new do - VCAP::CloudController::MetricsWebserver.new.start(@config) + VCAP::CloudController::ApiMetricsWebserver.new.start(@config) end end diff --git a/lib/cloud_controller/runners/puma_runner.rb b/lib/cloud_controller/runners/puma_runner.rb index a9e38ad51cc..d0ef426052a 100644 --- a/lib/cloud_controller/runners/puma_runner.rb +++ b/lib/cloud_controller/runners/puma_runner.rb @@ -2,6 +2,7 @@ require 'puma/configuration' require 'puma/events' require 'cloud_controller/logs/steno_io' +require 'cloud_controller/execution_context' module VCAP::CloudController class PumaRunner @@ -46,25 +47,15 @@ def initialize(config, app, logger, periodic_updater, request_logs) conf.before_fork do Sequel::Model.db.disconnect end - conf.before_worker_boot do - ENV['PROCESS_TYPE'] = 'puma_worker' - prometheus_updater.update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'puma_worker' }) - end - conf.before_worker_shutdown do - request_logs.log_incomplete_requests if request_logs - end + conf.before_worker_boot { ExecutionContext::API_PUMA_WORKER.set_process_type_env } + conf.before_worker_shutdown { request_logs.log_incomplete_requests if request_logs } end log_writer = Puma::LogWriter.new(StenoIO.new(logger, :info), StenoIO.new(logger, :error)) events = Puma::Events.new - events.after_booted do - prometheus_updater.update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'main' }) - @periodic_updater.setup_updates - end - events.after_stopped do - @periodic_updater.stop_updates unless @periodic_updater.nil? - end + events.after_booted { @periodic_updater.setup_updates } + events.after_stopped { @periodic_updater.stop_updates unless @periodic_updater.nil? } @puma_launcher = Puma::Launcher.new(puma_config, log_writer:, events:) end diff --git a/lib/cloud_controller/standalone_metrics_webserver.rb b/lib/cloud_controller/standalone_metrics_webserver.rb new file mode 100644 index 00000000000..ffdccff7880 --- /dev/null +++ b/lib/cloud_controller/standalone_metrics_webserver.rb @@ -0,0 +1,76 @@ +require 'cloud_controller/execution_context' + +module VCAP::CloudController + class StandaloneMetricsWebserver + def self.start_for_bosh_job(port) + new(port).start + end + + def initialize(port) + @port = port + end + + def start + metrics_app = build_app + + Thread.new do + server = Puma::Server.new(metrics_app) + add_listener(server) + server.run + end + end + + private + + def bosh_job_name + VCAP::CloudController::ExecutionContext.from_process_type_env.capi_job_name + end + + def build_app + Rack::Builder.new do + use Prometheus::Middleware::Exporter, path: '/metrics' + + # Return 404 for any other request + map('/') { run ->(_env) { ['404', { 'Content-Type' => 'text/plain' }, ['Not Found']] } } + end + end + + def add_listener(server) + if use_ssl? + server.add_ssl_listener('127.0.0.1', @port, ssl_context) + else + logger.warn('Starting metrics webserver without TLS. This is not recommended for production environments.') + server.add_tcp_listener('127.0.0.1', @port) + end + end + + def use_ssl? + File.exist?(cert_path) && File.exist?(key_path) && File.exist?(ca_path) + end + + def ssl_context + context = Puma::MiniSSL::Context.new + context.cert = cert_path + context.key = key_path + context.ca = ca_path + context.verify_mode = Puma::MiniSSL::VERIFY_PEER | Puma::MiniSSL::VERIFY_FAIL_IF_NO_PEER_CERT + context + end + + def cert_path + "/var/vcap/jobs/#{bosh_job_name}/config/certs/scrape.crt" + end + + def key_path + "/var/vcap/jobs/#{bosh_job_name}/config/certs/scrape.key" + end + + def ca_path + "/var/vcap/jobs/#{bosh_job_name}/config/certs/scrape_ca.crt" + end + + def logger + Steno.logger('companion_metrics_webserver') + end + end +end diff --git a/lib/delayed_job/delayed_worker.rb b/lib/delayed_job/delayed_worker.rb index f54d6966413..02152faa86d 100644 --- a/lib/delayed_job/delayed_worker.rb +++ b/lib/delayed_job/delayed_worker.rb @@ -2,6 +2,7 @@ require 'rack' require 'puma' require 'prometheus/middleware/exporter' +require 'cloud_controller/standalone_metrics_webserver' class CloudController::DelayedWorker DEFAULT_READ_AHEAD_POSTGRES = 0 @@ -25,7 +26,12 @@ def initialize(options) def start_working config = RakeConfig.config - setup_metrics(config) if @publish_metrics + if @publish_metrics + setup_metrics(config) + periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater + periodic_updater.setup_updates + end + BackgroundJobEnvironment.new(config).setup_environment(readiness_port) logger = Steno.logger('cc-worker') @@ -116,42 +122,6 @@ def setup_metrics(config) prometheus_dir = File.join(config.get(:directories, :tmpdir), 'prometheus') Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir) - setup_webserver(config, prometheus_dir) if is_first_generic_worker_on_machine? - - # initialize metric with 0 for discoverability, because it likely won't get updated on healthy systems - CloudController::DependencyLocator.instance.cc_worker_prometheus_updater.update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'cc-worker' }) - end - - def setup_webserver(config, prometheus_dir) - FileUtils.mkdir_p(prometheus_dir) - - # Resetting metrics on startup - Dir["#{prometheus_dir}/*.bin"].each do |file_path| - File.unlink(file_path) - end - - metrics_app = Rack::Builder.new do - use Prometheus::Middleware::Exporter, path: '/metrics' - - map '/' do - run lambda { |_env| - # Return 404 for any other request - ['404', { 'Content-Type' => 'text/plain' }, ['Not Found']] - } - end - end - - Thread.new do - server = Puma::Server.new(metrics_app) - - context = Puma::MiniSSL::Context.new - context.cert = '/var/vcap/jobs/cloud_controller_worker/config/certs/scrape.crt' - context.key = '/var/vcap/jobs/cloud_controller_worker/config/certs/scrape.key' - context.ca = '/var/vcap/jobs/cloud_controller_worker/config/certs/scrape_ca.crt' - context.verify_mode = Puma::MiniSSL::VERIFY_PEER | Puma::MiniSSL::VERIFY_FAIL_IF_NO_PEER_CERT - - server.add_ssl_listener('127.0.0.1', config.get(:prometheus_port) || 9394, context) - server.run - end + VCAP::CloudController::StandaloneMetricsWebserver.start_for_bosh_job(config.get(:prometheus_port) || 9394) if is_first_generic_worker_on_machine? end end diff --git a/lib/delayed_job_plugins/delayed_jobs_metrics.rb b/lib/delayed_job_plugins/delayed_jobs_metrics.rb index 1f7546bf9c3..fa6286f967d 100644 --- a/lib/delayed_job_plugins/delayed_jobs_metrics.rb +++ b/lib/delayed_job_plugins/delayed_jobs_metrics.rb @@ -4,7 +4,7 @@ class << self attr_writer :prometheus def prometheus - @prometheus ||= CloudController::DependencyLocator.instance.cc_worker_prometheus_updater + @prometheus ||= CloudController::DependencyLocator.instance.prometheus_updater end end diff --git a/lib/sequel/extensions/connection_metrics.rb b/lib/sequel/extensions/connection_metrics.rb index 4ee8590a7d4..e8fa75466f7 100644 --- a/lib/sequel/extensions/connection_metrics.rb +++ b/lib/sequel/extensions/connection_metrics.rb @@ -23,11 +23,7 @@ def self.extended(pool) pool.instance_exec do sync do - @prometheus_updater = if process_type == 'cc-worker' - CloudController::DependencyLocator.instance.cc_worker_prometheus_updater - else - CloudController::DependencyLocator.instance.prometheus_updater - end + @prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater @connection_info = {} end end diff --git a/lib/tasks/clock.rake b/lib/tasks/clock.rake index 64302127997..7bec072ca93 100644 --- a/lib/tasks/clock.rake +++ b/lib/tasks/clock.rake @@ -4,8 +4,10 @@ namespace :clock do puts RUBY_DESCRIPTION require 'cloud_controller/clock/scheduler' + require 'cloud_controller/execution_context' - RakeConfig.context = :clock + VCAP::CloudController::ExecutionContext::CLOCK.set_rake_context + VCAP::CloudController::ExecutionContext::CLOCK.set_process_type_env BackgroundJobEnvironment.new(RakeConfig.config).setup_environment(RakeConfig.config.get(:readiness_port, :clock)) scheduler = VCAP::CloudController::Scheduler.new(RakeConfig.config) diff --git a/lib/tasks/deployment_updater.rake b/lib/tasks/deployment_updater.rake index e968da5d290..29616607a5f 100644 --- a/lib/tasks/deployment_updater.rake +++ b/lib/tasks/deployment_updater.rake @@ -3,8 +3,10 @@ namespace :deployment_updater do task start: :environment do puts RUBY_DESCRIPTION require 'cloud_controller/deployment_updater/scheduler' + require 'cloud_controller/execution_context' - RakeConfig.context = :deployment_updater + VCAP::CloudController::ExecutionContext::DEPLOYMENT_UPDATER.set_rake_context + VCAP::CloudController::ExecutionContext::DEPLOYMENT_UPDATER.set_process_type_env BackgroundJobEnvironment.new(RakeConfig.config).setup_environment(RakeConfig.config.get(:readiness_port, :deployment_updater)) VCAP::CloudController::DeploymentUpdater::Scheduler.start diff --git a/lib/tasks/jobs.rake b/lib/tasks/jobs.rake index 9dcb8254c34..f33cae1be9f 100644 --- a/lib/tasks/jobs.rake +++ b/lib/tasks/jobs.rake @@ -43,7 +43,6 @@ namespace :jobs do args.with_defaults(num_threads: nil) args.with_defaults(thread_grace_period_seconds: nil) - RakeConfig.context = :worker queues = [ VCAP::CloudController::Jobs::Queues.generic, 'app_usage_events', @@ -64,8 +63,10 @@ namespace :jobs do 'prune_excess_app_revisions' ] - ENV['PROCESS_TYPE'] = 'cc-worker' require 'cloud_controller/metrics/custom_process_id' + require 'cloud_controller/execution_context' + VCAP::CloudController::ExecutionContext::CC_WORKER.set_rake_context + VCAP::CloudController::ExecutionContext::CC_WORKER.set_process_type_env publish_metrics = RakeConfig.config.get(:publish_metrics) || false diff --git a/lib/vcap/stats.rb b/lib/vcap/stats.rb index 72d1cb77b22..0963b679c73 100644 --- a/lib/vcap/stats.rb +++ b/lib/vcap/stats.rb @@ -9,7 +9,7 @@ def process_memory_bytes_and_cpu pcpu = [] ps_out = ps_pid - ps_out += ps_ppid if VCAP::CloudController::Config.config.get(:webserver) == 'puma' + ps_out += ps_ppid if is_puma_webserver? ps_out.split.each_with_index { |e, i| i.even? ? rss << e : pcpu << e } [rss.map(&:to_i).sum * 1024, pcpu.map(&:to_f).sum.round] @@ -42,6 +42,12 @@ def ps_ppid `ps -o rss=,pcpu= --ppid #{Process.pid}` end end + + def is_puma_webserver? + VCAP::CloudController::Config.config.get(:webserver) == 'puma' + rescue VCAP::CloudController::Config::InvalidConfigPath + false + end end end end diff --git a/spec/request/internal/metrics_spec.rb b/spec/request/internal/metrics_spec.rb index 62284d6b43d..7fa9d4774f3 100644 --- a/spec/request/internal/metrics_spec.rb +++ b/spec/request/internal/metrics_spec.rb @@ -3,7 +3,8 @@ RSpec.describe 'Metrics' do let(:user) { VCAP::CloudController::User.make } let(:user_header) { headers_for(user) } - let(:metrics_webserver) { VCAP::CloudController::MetricsWebserver.new } + let(:metrics_webserver) { VCAP::CloudController::ApiMetricsWebserver.new } + let(:periodic_updater) { CloudController::DependencyLocator.instance.periodic_updater } delegate :app, to: :metrics_webserver @@ -36,7 +37,7 @@ 10.times do VCAP::CloudController::User.make end - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end it 'reports the total number of users' do @@ -50,7 +51,7 @@ context 'cc_vitals' do it 'reports vitals' do - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } get '/internal/v4/metrics', nil expect(last_response.body).to match(/cc_vitals_num_cores [1-9][0-9]*\.\d+/) @@ -67,7 +68,7 @@ Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_api_0', run_at: Time.now + 1.day }) Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_generic', run_at: Time.now + 1.day }) - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end after do @@ -87,7 +88,7 @@ Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_api_0', run_at: Time.now }) Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_generic', run_at: Time.now }) - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end after do @@ -107,7 +108,7 @@ Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_api_0', run_at: Time.now + 1.minute }) Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_generic', run_at: Time.now + 1.minute }) - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end after do @@ -128,7 +129,7 @@ Delayed::Job.enqueue(VCAP::CloudController::Jobs::Runtime::EventsCleanup.new(1), { queue: 'cc_generic', run_at: Time.now + 1.day }) Delayed::Job.dataset.update(failed_at: Time.now.utc) - CloudController::DependencyLocator.instance.periodic_updater.update! + VCAP::CloudController::Metrics::PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end after do diff --git a/spec/request/status_spec.rb b/spec/request/status_spec.rb index 72c89455f97..648994d024f 100644 --- a/spec/request/status_spec.rb +++ b/spec/request/status_spec.rb @@ -4,7 +4,7 @@ module VCAP::CloudController RSpec.describe 'Status Endpoint' do include Rack::Test::Methods - let(:metrics_webserver) { MetricsWebserver.new } + let(:metrics_webserver) { ApiMetricsWebserver.new } delegate :app, to: :metrics_webserver diff --git a/spec/request/v2/auth_spec.rb b/spec/request/v2/auth_spec.rb index d4bd9cd2e4a..f60a9628375 100644 --- a/spec/request/v2/auth_spec.rb +++ b/spec/request/v2/auth_spec.rb @@ -6,12 +6,8 @@ let(:user_header) { headers_for(user) } before do - @test_mode = ENV.fetch('CC_TEST', nil) - ENV['CC_TEST'] = nil - end - - after do - ENV['CC_TEST'] = @test_mode + allow(ENV).to receive(:fetch).with('CC_TEST', nil).and_return(nil) + allow(ENV).to receive(:fetch).and_call_original end context 'when the user has a valid token' do diff --git a/spec/unit/lib/cloud_controller/metrics_webserver_spec.rb b/spec/unit/lib/cloud_controller/api_metrics_webserver_spec.rb similarity index 96% rename from spec/unit/lib/cloud_controller/metrics_webserver_spec.rb rename to spec/unit/lib/cloud_controller/api_metrics_webserver_spec.rb index 47b6c6e7e48..93362117560 100644 --- a/spec/unit/lib/cloud_controller/metrics_webserver_spec.rb +++ b/spec/unit/lib/cloud_controller/api_metrics_webserver_spec.rb @@ -1,7 +1,7 @@ require 'spec_helper' module VCAP::CloudController - RSpec.describe MetricsWebserver do + RSpec.describe ApiMetricsWebserver do let(:metrics_webserver) { described_class.new } let(:config) { double('config', get: nil) } diff --git a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb index 4a44cd3c305..cca6dfe9fcc 100644 --- a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb +++ b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb @@ -192,6 +192,35 @@ module VCAP::CloudController schedule.start end end + + describe 'publish metrics' do + before do + allow(clock).to receive(:schedule_frequent_worker_job) + allow(clock).to receive(:schedule_frequent_inline_job) + allow(clock).to receive(:schedule_daily_job) + allow(VCAP::CloudController::StandaloneMetricsWebserver).to receive(:start_for_bosh_job) + end + + context 'when set to false' do + before do + TestConfig.override(publish_metrics: false) + end + + it 'does not publish metrics' do + schedule.start + expect(VCAP::CloudController::StandaloneMetricsWebserver).not_to have_received(:start_for_bosh_job) + end + end + + context 'when set to true' do + before { TestConfig.override(publish_metrics: true) } + + it 'sets up a webserve' do + schedule.start + expect(VCAP::CloudController::StandaloneMetricsWebserver).to have_received(:start_for_bosh_job) + end + end + end end end end diff --git a/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb b/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb index 8f3dcafebf3..1d07f8e1de6 100644 --- a/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb +++ b/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb @@ -39,6 +39,7 @@ module VCAP::CloudController allow(DeploymentUpdater::Dispatcher).to receive(:dispatch) allow(CloudController::DependencyLocator.instance).to receive_messages(statsd_client:) allow(statsd_client).to receive(:timing) + allow(statsd_client).to receive(:batch) # mock vital metrics updates end it 'correctly configures a LockRunner and uses it to initialize a LockWorker' do @@ -153,6 +154,31 @@ module VCAP::CloudController ) end end + + describe 'publish metrics' do + before do + allow(VCAP::CloudController::StandaloneMetricsWebserver).to receive(:start_for_bosh_job) + end + + context 'when set to false' do + before do + TestConfig.override(publish_metrics: false) + end + + it 'does not publish metrics' do + DeploymentUpdater::Scheduler.start + end + end + + context 'when set to true' do + before { TestConfig.override(publish_metrics: true) } + + it 'sets up a webserve' do + DeploymentUpdater::Scheduler.start + expect(VCAP::CloudController::StandaloneMetricsWebserver).to have_received(:start_for_bosh_job) + end + end + end end end end diff --git a/spec/unit/lib/cloud_controller/execution_context_spec.rb b/spec/unit/lib/cloud_controller/execution_context_spec.rb new file mode 100644 index 00000000000..daa6c507dae --- /dev/null +++ b/spec/unit/lib/cloud_controller/execution_context_spec.rb @@ -0,0 +1,69 @@ +require 'spec_helper' +require 'tasks/rake_config' +require 'cloud_controller/execution_context' + +module VCAP::CloudController + RSpec.describe ExecutionContext do + context 'ExecutionInfo' do + let(:exec_info) { ExecutionContext::CC_WORKER } + + it 'has correct attributes' do + expect(exec_info.process_type).to eq('cc-worker') + expect(exec_info.capi_job_name).to eq('cloud_controller_worker') + expect(exec_info.rake_context).to eq(:worker) + end + + describe '#set_process_type_env' do + it 'sets the PROCESS_TYPE environment variable' do + expect(ENV).to receive(:[]=).with('PROCESS_TYPE', 'cc-worker') + exec_info.set_process_type_env + end + end + + describe '#set_rake_context' do + context 'when RakeConfig is defined' do + before { allow(RakeConfig).to receive(:config).and_return(TestConfig.config_instance) } + + it 'sets the RakeConfig context' do + exec_info.set_rake_context + expect(RakeConfig.context).to eq(:worker) + end + end + + context 'when RakeConfig is not defined' do + it 'raises an error' do + hide_const('RakeConfig') + expect { exec_info.set_rake_context }.to raise_error('RakeConfig is not defined or rake_context argument is nil') + end + end + + context 'when rake_context is nil' do + let(:exec_info) { ExecutionContext::API_PUMA_MAIN } + + it 'raises an error' do + expect { exec_info.set_rake_context }.to raise_error('RakeConfig is not defined or rake_context argument is nil') + end + end + end + end + + context 'from process type env' do + it 'returns the CC_WORKER execution context' do + allow(ENV).to receive(:fetch).with('PROCESS_TYPE', nil).and_return('cc-worker') + expect(ExecutionContext.from_process_type_env).to eq(ExecutionContext::CC_WORKER) + end + + it 'returns nil for unknown process type' do + allow(ENV).to receive(:fetch).with('PROCESS_TYPE', nil).and_return('unknown-process') + allow(ENV).to receive(:fetch).with('CC_TEST', nil).and_return(nil) + expect(ExecutionContext.from_process_type_env).to be_nil + end + + it 'returns API_PUMA_MAIN when PROCESS_TYPE is not set in unit test context' do + allow(ENV).to receive(:fetch).with('PROCESS_TYPE', nil).and_return(nil) + allow(ENV).to receive(:fetch).with('CC_TEST', nil).and_call_original # set by spec_helper + expect(ExecutionContext.from_process_type_env).to eq(ExecutionContext::API_PUMA_MAIN) + end + end + end +end diff --git a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb index da96a56260a..773c701e148 100644 --- a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb @@ -654,7 +654,7 @@ module VCAP::CloudController::Metrics expect(periodic_updater).to receive(:update_deploying_count).once expect(periodic_updater).to receive(:update_webserver_stats).once - periodic_updater.update! + PeriodicUpdater::ALL_TASKS.each { |task| periodic_updater.update!(task) } end end @@ -679,5 +679,60 @@ module VCAP::CloudController::Metrics expect(logger).to have_received(:info).with(exception) end end + + describe 'task list configuration' do + describe 'with vitals task only' do + let(:vitals_updater) do + PeriodicUpdater.new(start_time, log_counter, logger, statsd_updater, prometheus_updater, [PeriodicUpdater::VITALS_TASK]) + end + + before do + allow(statsd_updater).to receive(:update_vitals) + allow(prometheus_updater).to receive(:update_vitals) + end + + it 'only sets up one timer task' do + vitals_updater.setup_updates + + expect(vitals_updater.instance_variable_get(:@update_tasks).length).to eq(1) + end + + it 'executes the vitals task' do + vitals_updater.update!(PeriodicUpdater::VITALS_TASK) + + expect(statsd_updater).to have_received(:update_vitals) + expect(prometheus_updater).to have_received(:update_vitals) + end + end + + describe 'with multiple specific tasks' do + let(:custom_updater) do + PeriodicUpdater.new( + start_time, + log_counter, + logger, + statsd_updater, + prometheus_updater, + [PeriodicUpdater::VITALS_TASK, PeriodicUpdater::LOG_COUNTS_TASK] + ) + end + + it 'sets up timer tasks for specified tasks only' do + allow(statsd_updater).to receive(:update_vitals) + allow(statsd_updater).to receive(:update_log_counts) + allow(prometheus_updater).to receive(:update_vitals) + + custom_updater.setup_updates + + expect(custom_updater.instance_variable_get(:@update_tasks).length).to eq(2) + end + end + + describe 'default ALL_TASKS behavior' do + it 'uses all tasks when no task_list is provided' do + expect(periodic_updater.instance_variable_get(:@task_list)).to eq(PeriodicUpdater::ALL_TASKS) + end + end + end end end diff --git a/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb index eb6b2603af3..361690d83b4 100644 --- a/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb @@ -10,6 +10,46 @@ module VCAP::CloudController::Metrics Prometheus::Client::Registry.new end + describe '#initialize' do + RSpec.shared_examples 'metric registration based on execution context' do |exec_ctx, expected_metrics| + context "when running in #{exec_ctx} mode" do + before { allow(VCAP::CloudController::ExecutionContext).to receive(:from_process_type_env).and_return(exec_ctx) } + + it 'registers all expected metrics' do + expected_metric_names = expected_metrics.flatten(1).pluck(:name) + expect(updater.instance_variable_get('@registry').instance_variable_get('@metrics').keys).to match_array(expected_metric_names) + end + end + end + + it_behaves_like 'metric registration based on execution context', VCAP::CloudController::ExecutionContext::API_PUMA_MAIN, + [PrometheusUpdater::METRICS, PrometheusUpdater::PUMA_METRICS, PrometheusUpdater::DB_CONNECTION_POOL_METRICS, PrometheusUpdater::DELAYED_JOB_METRICS, PrometheusUpdater::VITAL_METRICS] + it_behaves_like 'metric registration based on execution context', VCAP::CloudController::ExecutionContext::API_PUMA_WORKER, + [PrometheusUpdater::METRICS, PrometheusUpdater::PUMA_METRICS, PrometheusUpdater::DB_CONNECTION_POOL_METRICS, PrometheusUpdater::DELAYED_JOB_METRICS, PrometheusUpdater::VITAL_METRICS] + it_behaves_like 'metric registration based on execution context', VCAP::CloudController::ExecutionContext::CC_WORKER, + [PrometheusUpdater::DB_CONNECTION_POOL_METRICS, PrometheusUpdater::DELAYED_JOB_METRICS, PrometheusUpdater::VITAL_METRICS] + it_behaves_like 'metric registration based on execution context', VCAP::CloudController::ExecutionContext::CLOCK, [PrometheusUpdater::DB_CONNECTION_POOL_METRICS, PrometheusUpdater::VITAL_METRICS] + it_behaves_like 'metric registration based on execution context', VCAP::CloudController::ExecutionContext::DEPLOYMENT_UPDATER, + [PrometheusUpdater::DB_CONNECTION_POOL_METRICS, PrometheusUpdater::VITAL_METRICS] + + it 'raises error when execution context is nil' do + allow(VCAP::CloudController::ExecutionContext).to receive(:from_process_type_env).and_return(nil) + expect { PrometheusUpdater.new(registry: prom_client) }.to raise_error('Could not register Prometheus metrics: Unknown execution context') + end + + context 'when execution context is set' do + before { allow(VCAP::CloudController::ExecutionContext).to receive(:from_process_type_env).and_return(VCAP::CloudController::ExecutionContext::API_PUMA_MAIN) } + + it 'initializes cc_db_connection_pool_timeouts_total metric with process_type label' do + PrometheusUpdater.new(registry: prom_client) # somehow we need to re-initialize to have the correct execution context applied + metric = prom_client.metrics.find { |m| m.name == :cc_db_connection_pool_timeouts_total } + expect(metric).to be_present + expect(metric.get(labels: { process_type: 'main' })).to eq(0.0) + expect(metric.get(labels: { process_type: 'puma_worker' })).to eq(0.0) + end + end + end + describe 'Prometheus creation guards work correctly' do # This might look to be a duplicate of 'records the current number of deployments that are DEPLOYING' # below, but it tests that at least one of the metric updating functions can be called multiple times diff --git a/spec/unit/lib/cloud_controller/runner_spec.rb b/spec/unit/lib/cloud_controller/runner_spec.rb index 499a4526e6d..7038071a593 100644 --- a/spec/unit/lib/cloud_controller/runner_spec.rb +++ b/spec/unit/lib/cloud_controller/runner_spec.rb @@ -271,8 +271,8 @@ module VCAP::CloudController let(:test_config_overrides) { super().merge(webserver: 'puma') } it 'starts the MetricsWebserver' do - expect(MetricsWebserver).to receive(:new).and_call_original - expect_any_instance_of(MetricsWebserver).to receive(:start) + expect(ApiMetricsWebserver).to receive(:new).and_call_original + expect_any_instance_of(ApiMetricsWebserver).to receive(:start) subject end diff --git a/spec/unit/lib/cloud_controller/runners/puma_runner_spec.rb b/spec/unit/lib/cloud_controller/runners/puma_runner_spec.rb index 40199689cc8..a38e882178e 100644 --- a/spec/unit/lib/cloud_controller/runners/puma_runner_spec.rb +++ b/spec/unit/lib/cloud_controller/runners/puma_runner_spec.rb @@ -163,13 +163,6 @@ module VCAP::CloudController puma_launcher.config.final_options[:before_worker_shutdown].first[:block].call end - it 'initializes the cc_db_connection_pool_timeouts_total for the worker before worker boot' do - subject - - expect(prometheus_updater).to receive(:update_gauge_metric).with(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'puma_worker' }) - puma_launcher.config.final_options[:before_worker_boot].first[:block].call - end - it 'sets environment variable `PROCESS_TYPE` to `puma_worker`' do subject @@ -194,15 +187,6 @@ module VCAP::CloudController end describe 'Events' do - describe 'after_booted' do - it 'sets up periodic metrics updater with EM and initializes cc_db_connection_pool_timeouts_total for the main process' do - expect(periodic_updater).to receive(:setup_updates) - expect(prometheus_updater).to receive(:update_gauge_metric).with(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: 'main' }) - - puma_launcher.events.fire(:after_booted) - end - end - describe 'after_stopped' do it 'stops the TimerTasks and logs incomplete requests' do expect(periodic_updater).to receive(:stop_updates) diff --git a/spec/unit/lib/cloud_controller/standalone_metrics_webserver_spec.rb b/spec/unit/lib/cloud_controller/standalone_metrics_webserver_spec.rb new file mode 100644 index 00000000000..09536f301ca --- /dev/null +++ b/spec/unit/lib/cloud_controller/standalone_metrics_webserver_spec.rb @@ -0,0 +1,55 @@ +require 'spec_helper' +require 'cloud_controller/standalone_metrics_webserver' +require 'cloud_controller/execution_context' + +module VCAP::CloudController + RSpec.describe StandaloneMetricsWebserver do + let(:port) { 1899 } + let(:server) { StandaloneMetricsWebserver.new(port) } + let(:puma_server) { instance_double(Puma::Server, run: nil) } + + before do + allow(server).to receive(:bosh_job_name).and_return('cloud_controller_ng') + allow(Puma::Server).to receive(:new).and_return(puma_server) + allow(Thread).to receive(:new).and_yield # otherwise the server runs in a separate thread and we can't test it + end + + describe '#initialize' do + it 'sets up the port' do + expect(server.instance_variable_get(:@port)).to eq(port) + end + end + + describe '#start' do + context 'when certificates are NOT configured' do + before do + allow(puma_server).to receive(:add_tcp_listener) + end + + it 'configures and starts a Puma server' do + server.start + + expect(puma_server).to have_received(:run) + expect(Puma::Server).to have_received(:new).with(kind_of(Rack::Builder)) + expect(puma_server).to have_received(:add_tcp_listener).with('127.0.0.1', port) + end + end + + context 'when certificates are configured' do + before do + allow(server).to receive_messages(use_ssl?: true, cert_path: '/some/path/cert.pem', key_path: '/some/path/key.pem', ca_path: '/some/path/ca.pem') + allow_any_instance_of(Puma::MiniSSL::Context).to receive(:check_file).and_return(true) + allow(puma_server).to receive(:add_ssl_listener) + end + + it 'configures and starts a Puma server with SSL' do + server.start + + expect(puma_server).to have_received(:run) + expect(Puma::Server).to have_received(:new).with(kind_of(Rack::Builder)) + expect(puma_server).to have_received(:add_ssl_listener).with('127.0.0.1', port, kind_of(Puma::MiniSSL::Context)) + end + end + end + end +end diff --git a/spec/unit/lib/delayed_job/delayed_worker_spec.rb b/spec/unit/lib/delayed_job/delayed_worker_spec.rb index acc681d39fe..956387f09d7 100644 --- a/spec/unit/lib/delayed_job/delayed_worker_spec.rb +++ b/spec/unit/lib/delayed_job/delayed_worker_spec.rb @@ -228,12 +228,16 @@ before do allow(cc_delayed_worker).to receive(:is_first_generic_worker_on_machine?).and_return(true) allow(cc_delayed_worker).to receive(:readiness_port) - allow(cc_delayed_worker).to receive(:setup_webserver) + allow(cc_delayed_worker).to receive(:setup_metrics).and_call_original + allow(TestConfig.config_instance).to receive(:get).and_call_original + allow(TestConfig.config_instance).to receive(:get).with(:prometheus_port).and_return(9394) + allow(VCAP::CloudController::StandaloneMetricsWebserver).to receive(:start_for_bosh_job) end it 'sets up a webserver' do cc_delayed_worker.start_working - expect(cc_delayed_worker).to have_received(:setup_webserver) + expect(cc_delayed_worker).to have_received(:setup_metrics) + expect(VCAP::CloudController::StandaloneMetricsWebserver).to have_received(:start_for_bosh_job) end end end diff --git a/spec/unit/lib/sequel/extensions/connection_metrics_spec.rb b/spec/unit/lib/sequel/extensions/connection_metrics_spec.rb index d639e698d66..8007123a781 100644 --- a/spec/unit/lib/sequel/extensions/connection_metrics_spec.rb +++ b/spec/unit/lib/sequel/extensions/connection_metrics_spec.rb @@ -1,6 +1,7 @@ require 'spec_helper' +require 'cloud_controller/execution_context' -RSpec.describe Sequel::ConnectionMetrics do +RSpec.describe 'Sequel::ConnectionMetrics' do let(:db_config) { DbConfig.new } # each test will have their own db connection pool. This helps to isolate the test from each other. let(:db) { VCAP::CloudController::DB.connect(db_config.config, db_config.db_logger) } @@ -16,15 +17,13 @@ describe 'initialize' do context 'api process' do before do + VCAP::CloudController::ExecutionContext::API_PUMA_WORKER.set_process_type_env db.loggers << logger db.sql_log_level = :info - db.extension(:connection_metrics) allow(thread).to receive(:alive?).and_return(true) db.pool.instance_variable_set(:@prometheus_updater, prometheus_updater) - - ENV['PROCESS_TYPE'] = 'puma_worker' end it 'initializes the prometheus_updater and connection_info' do @@ -35,33 +34,30 @@ context 'cc-worker process' do before do - ENV['PROCESS_TYPE'] = 'cc-worker' - allow(VCAP::CloudController::Metrics::PrometheusUpdater).to receive(:new).and_return(VCAP::CloudController::Metrics::PrometheusUpdater.new(cc_worker: true)) + VCAP::CloudController::ExecutionContext::CC_WORKER.set_process_type_env + allow(VCAP::CloudController::Metrics::PrometheusUpdater).to receive(:new).and_return(VCAP::CloudController::Metrics::PrometheusUpdater.new) db.loggers << logger db.sql_log_level = :info - db.extension(:connection_metrics) allow(thread).to receive(:alive?).and_return(true) end it 'initializes the prometheus_updater and for cc-worker' do - expect(VCAP::CloudController::Metrics::PrometheusUpdater).to have_received(:new).with({ cc_worker: true }) + expect(VCAP::CloudController::Metrics::PrometheusUpdater).to have_received(:new) end end end describe 'methods' do before do + VCAP::CloudController::ExecutionContext::API_PUMA_WORKER.set_process_type_env db.loggers << logger db.sql_log_level = :info - db.extension(:connection_metrics) allow(thread).to receive(:alive?).and_return(true) db.pool.instance_variable_set(:@prometheus_updater, prometheus_updater) - - ENV['PROCESS_TYPE'] = 'puma_worker' end describe 'acquire' do