From 584ad7dd8405bfaa104ec2e05c996608012d6c70 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Tue, 10 Jun 2025 13:43:07 -0500 Subject: [PATCH 1/3] feat: add uptime monitors with datadog synthetics --- pillar/dev/secrets/datadog.sls | 2 + salt/datadog/synthetics.sls | 76 ++++++++++++++++++++++++++++++++++ salt/top.sls | 2 + 3 files changed, 80 insertions(+) create mode 100644 pillar/dev/secrets/datadog.sls create mode 100644 salt/datadog/synthetics.sls diff --git a/pillar/dev/secrets/datadog.sls b/pillar/dev/secrets/datadog.sls new file mode 100644 index 00000000..193250ce --- /dev/null +++ b/pillar/dev/secrets/datadog.sls @@ -0,0 +1,2 @@ +datadog_api_key: deadbeefdeadbeefdeadbeefdeadbeefdeadbeef +datadog_app_key: deadbeefdeadbeefdeadbeefdeadbeefdeadbeef diff --git a/salt/datadog/synthetics.sls b/salt/datadog/synthetics.sls new file mode 100644 index 00000000..71c962e4 --- /dev/null +++ b/salt/datadog/synthetics.sls @@ -0,0 +1,76 @@ +{% set minion_id = grains['id'] %} +{% set fqdn = grains['fqdn'] %} +{% set api_key = pillar.get('datadog_api_key') %} +{% set app_key = pillar.get('datadog_app_key') %}{# Datadog is queried below only when both keys are set: concatenating a missing (None) key would abort rendering. #} +{% set datadog_locations = salt['http.query']('https://api.datadoghq.com/api/v1/synthetics/locations', header_list=['DD-API-KEY: ' + api_key, 'DD-APPLICATION-KEY: ' + app_key], decode=True) if api_key and app_key else {} %} +{% set existing_monitors = salt['http.query']('https://api.datadoghq.com/api/v1/synthetics/tests', header_list=['DD-API-KEY: ' + api_key, 'DD-APPLICATION-KEY: ' + app_key], decode=True) if api_key and app_key else {} %} +{% set monitor_name = minion_id + ' HTTP Health' %} +{% set monitor_exists = existing_monitors.get('dict', {}).get('tests', []) | selectattr('name', 'equalto', monitor_name) | list | length > 0 %} + +# NOTE: this fails to capture multi-host minions (bugs has bugs.python, bugs.jython, bugs.roundup, and +# codespeed has speed.python and speed.pypy) +{% set web_roles = ['loadbalancer', 'docs', 'downloads', 'hg', 'moin', 'planet', 'bugs', 'buildbot', 'codespeed', 'pythontest'] %} +{% set 
matched_roles = [] %} +{% for role in web_roles %} + {% if salt["match.compound"](pillar["roles"][role]["pattern"]) %} + {% set _ = matched_roles.append(role) %} + {% endif %} +{% endfor %} +{% set is_web_minion = matched_roles|length > 0 %} +{% set is_loadbalancer = 'loadbalancer' in matched_roles %} + +# Hit the haproxy status page for loadbalancers, or the root for other web minions. +# Web minions also have a _haproxy_status endpoint, but I'm not sure whether that needs to be checked. +{% set health_url = "https://" + fqdn + ("/_haproxy_status" if is_loadbalancer else "/") %} + +{% if is_web_minion and api_key and app_key and not monitor_exists %} +create-synthetics-monitor-{{ minion_id }}: + http.query: + - name: https://api.datadoghq.com/api/v1/synthetics/tests + - method: POST + - header_list: + - "DD-API-KEY: {{ api_key }}" + - "DD-APPLICATION-KEY: {{ app_key }}" + - "Content-Type: application/json" + - data: | + { + "name": "{{ monitor_name }}", + "type": "api", + "subtype": "http", + "config": { + "request": { + "url": "{{ health_url }}", + "method": "GET", + "timeout": 30 + }, + "assertions": [ + { + "type": "statusCode", + "operator": "is", + "target": 200 + }, + { + "type": "responseTime", + "operator": "lessThan", + "target": 2000 + } + ] + }, + "locations": {{ datadog_locations.get('dict', {}).get('locations', []) | map(attribute='id') | list | tojson }}, + "options": { + "tick_every": 60, + "min_failure_duration": 180, + "min_location_failed": 5, + "retry": { + "count": 1, + "interval": 300 + } + }, + "message": "{{ minion_id }} is down in 5 or more locations! 
@pagerduty-Datadog", + "tags": [ + "minion_id:{{ minion_id }}", + "auto_created:salt.synthetics.sls" + ] + } + - status: 200 +{% endif %} diff --git a/salt/top.sls b/salt/top.sls index ab5dd716..f80e09fa 100644 --- a/salt/top.sls +++ b/salt/top.sls @@ -17,6 +17,8 @@ base: - tls - rsyslog - datadog + - datadog.synthetics + - secrets.datadog - base.motd - base.swap From 0b53f1e5b5511a48b9982715f1c570999fb6dad6 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Tue, 10 Jun 2025 13:48:31 -0500 Subject: [PATCH 2/3] chore: uv run tox -e lint --- salt/datadog/synthetics.sls | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/datadog/synthetics.sls b/salt/datadog/synthetics.sls index 71c962e4..0d7d5edd 100644 --- a/salt/datadog/synthetics.sls +++ b/salt/datadog/synthetics.sls @@ -51,7 +51,7 @@ create-synthetics-monitor-{{ minion_id }}: }, { "type": "responseTime", - "operator": "lessThan", + "operator": "lessThan", "target": 2000 } ] From acf3a723fde04942cdac4e092f8d71fbfefe3925 Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Tue, 10 Jun 2025 14:10:55 -0500 Subject: [PATCH 3/3] docs: pingdom -> datadog --- docs/guides/migration-recipe.md | 2 +- docs/overview.rst | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/guides/migration-recipe.md b/docs/guides/migration-recipe.md index 7b0aedc5..2abdeab0 100644 --- a/docs/guides/migration-recipe.md +++ b/docs/guides/migration-recipe.md @@ -155,7 +155,7 @@ index 68387c9..7a8ace1 100644 sudo service nginx stop ``` ```{note} - Don't forget to pause service checks for both the old and new hosts in things like Dead Man's Snitch, Pingdom, etc. + Don't forget to pause service checks for both the old and new hosts in things like Dead Man's Snitch, Datadog, etc. ``` 4. 
Ensure that any additional volumes are mounted and in the correct location: - Check what disks are currently mounted and where: `df` diff --git a/docs/overview.rst b/docs/overview.rst index 8e2ff8b2..d87d2d3c 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -57,11 +57,8 @@ Amazon Route 53 It is currently manually managed by Infrastructure Staff. DataDog - `DataDog `_ provides metrics, dashboards, and alerts. - -Pingdom - `Pingdom `_ provides monitoring and complains to us - when services are down. + `DataDog `_ provides metrics, dashboards, alerts, and uptime + monitoring, and complains to us when services are down. PagerDuty `PagerDuty `_ is used for on-call rotation for PSF