From e2c39938498cf68121f6d5d6b1a9257a131822be Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Fri, 24 Oct 2025 16:51:16 +0200 Subject: [PATCH 01/19] Add force-new-cluster helper --- helpers/README.md | 37 +++++ helpers/force-new-cluster.yml | 292 ++++++++++++++++++++++++++++++++++ release-notes.md | 11 ++ 3 files changed, 340 insertions(+) create mode 100644 helpers/force-new-cluster.yml diff --git a/helpers/README.md b/helpers/README.md index a6faaf1..3ccaebd 100644 --- a/helpers/README.md +++ b/helpers/README.md @@ -26,6 +26,43 @@ This directory contains multiple helper tools for various OpenShift cluster oper ## Available Tools +### Force New Cluster + +Automates etcd cluster recovery by configuring CIB (Cluster Information Base) attributes to force a new etcd cluster formation. This is useful when etcd quorum is lost and manual intervention is required to restore cluster functionality. + +**Features:** +- Automated etcd snapshot creation before recovery operations +- CIB attribute management for force-new-cluster operations +- Leader/follower node detection and verification +- Etcd member list management +- Automatic cleanup and resource recovery +- STONITH management during operations + +**Usage:** + +```bash +# From helpers/ directory +ansible-playbook -i ../deploy/openshift-clusters/inventory.ini force-new-cluster.yml +``` + +**Prerequisites:** +- Inventory file with exactly 2 nodes in `cluster_vms` group +- SSH access to cluster VMs with sudo privileges +- Running Pacemaker cluster with etcd resources + +**What it does:** +1. Validates cluster has exactly 2 nodes +2. Disables STONITH temporarily for safety +3. Takes etcd snapshots on both nodes (if etcd is not running) +4. Clears existing CIB attributes (learner_node, standalone_node, force_new_cluster) +5. Sets force_new_cluster attribute on the leader node (first node in cluster_vms) +6. Verifies CIB attributes on both nodes +7. Removes follower from etcd member list +8. Performs pcs resource cleanup on both nodes +9. Re-enables STONITH after completion + +**Attribution:** Original shell script by Carlo Lobrano + ### Log Collection Collects etcd related logs from cluster VMs diff --git a/helpers/force-new-cluster.yml b/helpers/force-new-cluster.yml new file mode 100644 index 0000000..c5ae132 --- /dev/null +++ b/helpers/force-new-cluster.yml @@ -0,0 +1,292 @@ +--- +# Force New Cluster - Configure CIB attributes for etcd cluster recovery +# Original shell script by Carlo Lobrano https://gitlab.cee.redhat.com/clobrano/2no-lab/-/blob/main/bin/force-new-cluster +# Ansible conversion for two-node-toolbox project + +- name: Force New Cluster - Configure CIB attributes for etcd cluster recovery + hosts: cluster_vms + gather_facts: true + become: true + + vars: + # Leader is the first node in cluster_vms group, follower is the second + leader_node: "{{ groups['cluster_vms'][0] }}" + follower_node: "{{ groups['cluster_vms'][1] }}" + snapshot_name: "etcd-snapshot-{{ ansible_date_time.iso8601_basic_short }}.db" + snapshot_dir: "/var/home/core" + snapshot_retention_count: 2 + + pre_tasks: + - name: Validate cluster_vms group has exactly 2 nodes + run_once: true + delegate_to: localhost + ansible.builtin.assert: + that: + - groups['cluster_vms'] | length == 2 + fail_msg: "This playbook requires exactly 2 nodes in the cluster_vms group. Found {{ groups['cluster_vms'] | length }} nodes." 
+ success_msg: "Cluster has required 2 nodes: {{ groups['cluster_vms'] | join(', ') }}" + + - name: Gather hostnames from all nodes + ansible.builtin.command: hostname + register: hostname_result + changed_when: false + + - name: Set hostname facts + ansible.builtin.set_fact: + node_hostname: "{{ hostname_result.stdout }}" + + - name: Register leader hostname + ansible.builtin.set_fact: + leader_hostname: "{{ hostvars[leader_node]['node_hostname'] }}" + run_once: true + delegate_to: "{{ leader_node }}" + + - name: Register follower hostname + ansible.builtin.set_fact: + follower_hostname: "{{ hostvars[follower_node]['node_hostname'] }}" + run_once: true + delegate_to: "{{ follower_node }}" + + tasks: + - name: Disable stonith on leader node + ansible.builtin.command: pcs property set stonith-enabled=false + delegate_to: "{{ leader_node }}" + run_once: true + changed_when: true + + - name: Check if etcd is running on leader node + ansible.builtin.command: podman ps + delegate_to: "{{ leader_node }}" + register: leader_etcd_status + changed_when: false + run_once: true + failed_when: false + + - name: Determine recovery scenario + ansible.builtin.set_fact: + leader_has_etcd: "{{ 'etcd' in leader_etcd_status.stdout }}" + run_once: true + + - name: Handle scenario where no etcd is running on leader + when: not leader_has_etcd + block: + - name: Take etcd snapshot on both nodes + ansible.builtin.copy: + src: "/var/lib/etcd/member/snap/db" + dest: "{{ snapshot_dir }}/{{ snapshot_name }}" + remote_src: true + owner: core + group: core + mode: '0644' + + - name: Clean up old snapshots (keep last {{ snapshot_retention_count }}) + ansible.builtin.shell: | + ls -1t {{ snapshot_dir }}/etcd-snapshot-*.db 2>/dev/null | tail -n +{{ snapshot_retention_count + 1 }} | xargs -r rm -f + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Display snapshot location + ansible.builtin.debug: + msg: "✓ etcd snapshot saved on {{ inventory_hostname }} to: {{ snapshot_dir }}/{{ snapshot_name }}" + + - name: Clear CIB attributes on all nodes + block: + - name: Delete learner_node attribute + ansible.builtin.command: crm_attribute --delete --name "learner_node" + failed_when: false + changed_when: true + + - name: Delete standalone_node attribute + ansible.builtin.command: crm_attribute --delete --name "standalone_node" + failed_when: false + changed_when: true + + - name: Clear force_new_cluster attribute from leader node + ansible.builtin.command: crm_attribute --delete --node "{{ leader_hostname }}" --lifetime reboot --name "force_new_cluster" + delegate_to: "{{ leader_node }}" + run_once: true + failed_when: false + changed_when: true + + - name: Clear force_new_cluster attribute from follower node + ansible.builtin.command: crm_attribute --delete --node "{{ follower_hostname }}" --lifetime reboot --name "force_new_cluster" + delegate_to: "{{ follower_node }}" + run_once: true + failed_when: false + changed_when: true + + - name: Set force_new_cluster attribute on leader node + ansible.builtin.command: crm_attribute --lifetime reboot --node "{{ leader_hostname }}" --name "force_new_cluster" --update "{{ leader_hostname }}" + delegate_to: "{{ leader_node }}" + run_once: true + changed_when: true + + - name: Verify CIB attributes on leader node + delegate_to: "{{ leader_node }}" + run_once: true + block: + - name: Query CIB attributes on leader + ansible.builtin.command: crm_attribute --query --node "{{ leader_hostname }}" + register: leader_cib_attrs + changed_when: false + + - name: 
Check for unexpected standalone or learner attributes on leader + ansible.builtin.assert: + that: + - "'standalone' not in leader_cib_attrs.stdout" + - "'learner' not in leader_cib_attrs.stdout" + fail_msg: | + Unexpected standalone or learner attributes on {{ leader_hostname }} + Output: {{ leader_cib_attrs.stdout }} + + - name: Query reboot-lifetime CIB attributes on leader + ansible.builtin.command: crm_attribute --query --lifetime reboot --node "{{ leader_hostname }}" + register: leader_reboot_attrs + changed_when: false + + - name: Verify force_new_cluster attribute is present on leader + ansible.builtin.assert: + that: + - "'force_new_cluster' in leader_reboot_attrs.stdout" + fail_msg: | + Missing force_new_cluster attribute on {{ leader_hostname }} + Output: {{ leader_reboot_attrs.stdout }} + + - name: Verify CIB attributes on follower node + delegate_to: "{{ follower_node }}" + run_once: true + block: + - name: Query CIB attributes on follower + ansible.builtin.command: crm_attribute --query --node "{{ follower_hostname }}" + register: follower_cib_attrs + changed_when: false + + - name: Check for unexpected standalone or learner attributes on follower + ansible.builtin.assert: + that: + - "'standalone' not in follower_cib_attrs.stdout" + - "'learner' not in follower_cib_attrs.stdout" + fail_msg: | + Unexpected standalone or learner attributes on {{ follower_hostname }} + Output: {{ follower_cib_attrs.stdout }} + + - name: Query reboot-lifetime CIB attributes on follower + ansible.builtin.command: crm_attribute --query --lifetime reboot --node "{{ follower_hostname }}" + register: follower_reboot_attrs + changed_when: false + failed_when: false + + - name: Verify force_new_cluster attribute is NOT present on follower + ansible.builtin.assert: + that: + - "'force_new_cluster' not in follower_reboot_attrs.stdout" + fail_msg: | + Unexpected force_new_cluster attribute on {{ follower_hostname }} + Output: {{ follower_reboot_attrs.stdout }} + + - name: Remove follower from etcd member list + delegate_to: "{{ leader_node }}" + run_once: true + when: leader_has_etcd + block: + - name: Get etcd member list + ansible.builtin.command: podman exec etcd etcdctl member list + register: etcd_member_list + changed_when: false + + - name: Extract follower member ID by hostname + ansible.builtin.set_fact: + follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', follower_hostname) | first | split(','))[0] | default('') }}" + when: follower_hostname in etcd_member_list.stdout + + - name: Extract follower member ID by unstarted state (fallback) + ansible.builtin.set_fact: + follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', 'unstarted') | first | split(','))[0] | default('') }}" + when: + - follower_hostname not in etcd_member_list.stdout + - "'unstarted' in etcd_member_list.stdout" + + - name: Display etcd member list if follower not found + ansible.builtin.debug: + msg: | + Could not find follower {{ follower_hostname }} in etcd member list. Nothing to do. 
+ Member list: + {{ etcd_member_list.stdout }} + when: follower_member_id is not defined or follower_member_id == '' + + - name: Remove follower from etcd cluster + ansible.builtin.command: podman exec etcd etcdctl member remove {{ follower_member_id }} + when: + - follower_member_id is defined + - follower_member_id != '' + changed_when: true + + - name: Display removal confirmation + ansible.builtin.debug: + msg: "Removing follower member ID: {{ follower_member_id }} ({{ follower_hostname }})" + when: + - follower_member_id is defined + - follower_member_id != '' + + - name: Cleanup etcd resource on leader node + ansible.builtin.command: pcs resource cleanup etcd + delegate_to: "{{ leader_node }}" + run_once: true + changed_when: true + + - name: Cleanup etcd resource on follower node + ansible.builtin.command: pcs resource cleanup etcd + delegate_to: "{{ follower_node }}" + run_once: true + changed_when: true + + - name: Wait for etcd to potentially start (no-etcd scenario) + ansible.builtin.pause: + seconds: 10 + when: not leader_has_etcd + run_once: true + + - name: Re-check etcd status after cleanup (no-etcd scenario) + ansible.builtin.command: podman ps + delegate_to: "{{ leader_node }}" + register: leader_etcd_recheck + changed_when: false + run_once: true + when: not leader_has_etcd + + - name: Display etcd recovery status + ansible.builtin.debug: + msg: | + {% if not leader_has_etcd %} + {% if 'etcd' in leader_etcd_recheck.stdout %} + ✓ Leader etcd is now running after cleanup. + {% else %} + ⚠ Leader etcd is still not running after cleanup. Manual intervention may be required. + CIB attributes have been set for force-new-cluster on {{ leader_hostname }} + {% endif %} + {% else %} + ✓ All force-new-cluster operations completed successfully. + {% endif %} + run_once: true + + - name: Re-enable stonith on leader node + ansible.builtin.command: pcs property set stonith-enabled=true + delegate_to: "{{ leader_node }}" + run_once: true + changed_when: true + register: stonith_enable + failed_when: false + + - name: Display stonith re-enable status + ansible.builtin.debug: + msg: "{% if stonith_enable.rc != 0 %}⚠ WARNING: Could not re-enable stonith!{% else %}✓ Stonith re-enabled successfully{% endif %}" + run_once: true + + post_tasks: + - name: Display completion message + ansible.builtin.debug: + msg: "✓ Force new cluster operation completed. All tests passed." 
+ run_once: true + when: leader_has_etcd diff --git a/release-notes.md b/release-notes.md index 9726706..23ee2b6 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,16 @@ # Two-Node Toolbox Release Notes +## Version 0.5.6 - Etcd Cluster Recovery +*Release Date: October 2025* + +### New Features + +#### Force New Cluster Playbook +- Added `force-new-cluster.yml` for automated etcd cluster recovery via CIB attributes +- Ansible conversion of Carlo Lobrano's shell script using `cluster_vms` inventory group + +--- + ## Version 0.5.5 - Cluster VM Inventory and Playbook Standardization *Release Date: October 2025* From c0dbbe659a3b97de93c5aeb1360c4cd1380e0c91 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Mon, 27 Oct 2025 12:51:22 +0100 Subject: [PATCH 02/19] Initial docs for claude etcd slash command --- .../etcd/etcd-ops-guide/clustering.md | 500 ++++ .../etcd/etcd-ops-guide/configuration.md | 345 +++ .../commands/etcd/etcd-ops-guide/container.md | 165 ++ .../etcd/etcd-ops-guide/data_corruption.md | 71 + .../commands/etcd/etcd-ops-guide/failures.md | 48 + .../etcd/etcd-ops-guide/maintenance.md | 176 ++ .../etcd/etcd-ops-guide/monitoring.md | 188 ++ .../commands/etcd/etcd-ops-guide/recovery.md | 148 ++ .../etcd-ops-guide/runtime-configuration.md | 252 ++ .../etcd-ops-guide/runtime-reconf-design.md | 54 + .../administrative.rst | 150 ++ .../Pacemaker_Administration/agents.rst | 1182 ++++++++++ .../Pacemaker_Administration/alerts.rst | 343 +++ .../Pacemaker_Administration/cluster.rst | 21 + .../Pacemaker_Administration/configuring.rst | 263 +++ .../Pacemaker_Administration/index.rst | 28 + .../Pacemaker_Administration/installing.rst | 9 + .../Pacemaker_Administration/intro.rst | 21 + .../Pacemaker_Administration/moving.rst | 303 +++ .../Pacemaker_Administration/options.rst | 232 ++ .../Pacemaker_Administration/pcs-crmsh.rst | 444 ++++ .../Pacemaker_Administration/tools.rst | 576 +++++ .../troubleshooting.rst | 128 + .../Pacemaker_Administration/upgrading.rst | 579 +++++ .../commands/etcd/pacemaker/podman-etcd.sh | 2052 +++++++++++++++++ 25 files changed, 8278 insertions(+) create mode 100644 .claude/commands/etcd/etcd-ops-guide/clustering.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/configuration.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/container.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/data_corruption.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/failures.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/maintenance.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/monitoring.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/recovery.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/runtime-configuration.md create mode 100644 .claude/commands/etcd/etcd-ops-guide/runtime-reconf-design.md create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/administrative.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/agents.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/alerts.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/cluster.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/configuring.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/index.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/installing.rst create mode 100644 
.claude/commands/etcd/pacemaker/Pacemaker_Administration/intro.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/moving.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/options.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/pcs-crmsh.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/tools.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/troubleshooting.rst create mode 100644 .claude/commands/etcd/pacemaker/Pacemaker_Administration/upgrading.rst create mode 100644 .claude/commands/etcd/pacemaker/podman-etcd.sh diff --git a/.claude/commands/etcd/etcd-ops-guide/clustering.md b/.claude/commands/etcd/etcd-ops-guide/clustering.md new file mode 100644 index 0000000..b6c7be3 --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/clustering.md @@ -0,0 +1,500 @@ +--- +title: Clustering Guide +weight: 4150 +description: "Bootstrapping an etcd cluster: Static, etcd Discovery, and DNS Discovery" +--- + +## Overview + +Starting an etcd cluster statically requires that each member knows another in the cluster. In a number of cases, the IPs of the cluster members may be unknown ahead of time. In these cases, the etcd cluster can be bootstrapped with the help of a discovery service. + +Once an etcd cluster is up and running, adding or removing members is done via [runtime reconfiguration][runtime-conf]. To better understand the design behind runtime reconfiguration, we suggest reading [the runtime configuration design document][runtime-reconf-design]. + +This guide will cover the following mechanisms for bootstrapping an etcd cluster: + +* [Static](#static) +* [etcd Discovery](#etcd-discovery) +* [DNS Discovery](#dns-discovery) + +Each of the bootstrapping mechanisms will be used to create a three machine etcd cluster with the following details: + +|Name|Address|Hostname| +|------|---------|------------------| +|infra0|10.0.1.10|infra0.example.com| +|infra1|10.0.1.11|infra1.example.com| +|infra2|10.0.1.12|infra2.example.com| + +## Static + +As we know the cluster members, their addresses and the size of the cluster before starting, we can use an offline bootstrap configuration by setting the `initial-cluster` flag. Each machine will get either the following environment variables or command line: + +``` +ETCD_INITIAL_CLUSTER="infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380" +ETCD_INITIAL_CLUSTER_STATE=new +``` + +``` +--initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \ +--initial-cluster-state new +``` + +Note that the URLs specified in `initial-cluster` are the _advertised peer URLs_, i.e. they should match the value of `initial-advertise-peer-urls` on the respective nodes. + +If spinning up multiple clusters (or creating and destroying a single cluster) with same configuration for testing purpose, it is highly recommended that each cluster is given a unique `initial-cluster-token`. By doing this, etcd can generate unique cluster IDs and member IDs for the clusters even if they otherwise have the exact same configuration. This can protect etcd from cross-cluster-interaction, which might corrupt the clusters. + +etcd listens on [`listen-client-urls`][conf-listen-client] to accept client traffic. etcd member advertises the URLs specified in [`advertise-client-urls`][conf-adv-client] to other members, proxies, clients. 
Please make sure the `advertise-client-urls` are reachable from intended clients. A common mistake is setting `advertise-client-urls` to localhost or leave it as default if the remote clients should reach etcd. + +On each machine, start etcd with these flags: + +``` +$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \ + --listen-peer-urls http://10.0.1.10:2380 \ + --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.10:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \ + --initial-cluster-state new +``` +``` +$ etcd --name infra1 --initial-advertise-peer-urls http://10.0.1.11:2380 \ + --listen-peer-urls http://10.0.1.11:2380 \ + --listen-client-urls http://10.0.1.11:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.11:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \ + --initial-cluster-state new +``` +``` +$ etcd --name infra2 --initial-advertise-peer-urls http://10.0.1.12:2380 \ + --listen-peer-urls http://10.0.1.12:2380 \ + --listen-client-urls http://10.0.1.12:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.12:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \ + --initial-cluster-state new +``` + +The command line parameters starting with `--initial-cluster` will be ignored on subsequent runs of etcd. Feel free to remove the environment variables or command line flags after the initial bootstrap process. If the configuration needs changes later (for example, adding or removing members to/from the cluster), see the [runtime configuration][runtime-conf] guide. + +### TLS + +etcd supports encrypted communication through the TLS protocol. TLS channels can be used for encrypted internal cluster communication between peers as well as encrypted client traffic. This section provides examples for setting up a cluster with peer and client TLS. Additional information detailing etcd's TLS support can be found in the [security guide][security-guide]. + +#### Self-signed certificates + +A cluster using self-signed certificates both encrypts traffic and authenticates its connections. To start a cluster with self-signed certificates, each cluster member should have a unique key pair (`member.crt`, `member.key`) signed by a shared cluster CA certificate (`ca.crt`) for both peer connections and client connections. Certificates may be generated by following the etcd [TLS setup][tls-setup] example. 
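+
+Before starting the members, it can help to sanity-check each generated key pair against the shared CA. This is only a sketch; `ca.crt` and `member.crt` are the placeholder names from the paragraph above, not fixed paths:
+
+```
+# confirm the member certificate chains back to the cluster CA
+$ openssl verify -CAfile ca.crt member.crt
+member.crt: OK
+
+# inspect the subject and validity window
+$ openssl x509 -in member.crt -noout -subject -dates
+```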
+ +On each machine, etcd would be started with these flags: + +``` +$ etcd --name infra0 --initial-advertise-peer-urls https://10.0.1.10:2380 \ + --listen-peer-urls https://10.0.1.10:2380 \ + --listen-client-urls https://10.0.1.10:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.10:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --client-cert-auth --trusted-ca-file=/path/to/ca-client.crt \ + --cert-file=/path/to/infra0-client.crt --key-file=/path/to/infra0-client.key \ + --peer-client-cert-auth --peer-trusted-ca-file=ca-peer.crt \ + --peer-cert-file=/path/to/infra0-peer.crt --peer-key-file=/path/to/infra0-peer.key +``` +``` +$ etcd --name infra1 --initial-advertise-peer-urls https://10.0.1.11:2380 \ + --listen-peer-urls https://10.0.1.11:2380 \ + --listen-client-urls https://10.0.1.11:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.11:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --client-cert-auth --trusted-ca-file=/path/to/ca-client.crt \ + --cert-file=/path/to/infra1-client.crt --key-file=/path/to/infra1-client.key \ + --peer-client-cert-auth --peer-trusted-ca-file=ca-peer.crt \ + --peer-cert-file=/path/to/infra1-peer.crt --peer-key-file=/path/to/infra1-peer.key +``` +``` +$ etcd --name infra2 --initial-advertise-peer-urls https://10.0.1.12:2380 \ + --listen-peer-urls https://10.0.1.12:2380 \ + --listen-client-urls https://10.0.1.12:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.12:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --client-cert-auth --trusted-ca-file=/path/to/ca-client.crt \ + --cert-file=/path/to/infra2-client.crt --key-file=/path/to/infra2-client.key \ + --peer-client-cert-auth --peer-trusted-ca-file=ca-peer.crt \ + --peer-cert-file=/path/to/infra2-peer.crt --peer-key-file=/path/to/infra2-peer.key +``` + +#### Automatic certificates + +If the cluster needs encrypted communication but does not require authenticated connections, etcd can be configured to automatically generate its keys. On initialization, each member creates its own set of keys based on its advertised IP addresses and hosts. 
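+
+Because each member generates its own certificate, there is no shared CA for clients to validate against. As a rough sketch (for testing only), `etcdctl` can reach such a cluster by skipping server certificate verification:
+
+```
+$ etcdctl --endpoints=https://10.0.1.10:2379 --insecure-skip-tls-verify=true endpoint health
+```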
+ +On each machine, etcd would be started with these flags: + +``` +$ etcd --name infra0 --initial-advertise-peer-urls https://10.0.1.10:2380 \ + --listen-peer-urls https://10.0.1.10:2380 \ + --listen-client-urls https://10.0.1.10:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.10:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --auto-tls \ + --peer-auto-tls +``` +``` +$ etcd --name infra1 --initial-advertise-peer-urls https://10.0.1.11:2380 \ + --listen-peer-urls https://10.0.1.11:2380 \ + --listen-client-urls https://10.0.1.11:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.11:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --auto-tls \ + --peer-auto-tls +``` +``` +$ etcd --name infra2 --initial-advertise-peer-urls https://10.0.1.12:2380 \ + --listen-peer-urls https://10.0.1.12:2380 \ + --listen-client-urls https://10.0.1.12:2379,https://127.0.0.1:2379 \ + --advertise-client-urls https://10.0.1.12:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster infra0=https://10.0.1.10:2380,infra1=https://10.0.1.11:2380,infra2=https://10.0.1.12:2380 \ + --initial-cluster-state new \ + --auto-tls \ + --peer-auto-tls +``` + +### Error cases + +In the following example, we have not included our new host in the list of enumerated nodes. If this is a new cluster, the node _must_ be added to the list of initial cluster members. + +``` +$ etcd --name infra1 --initial-advertise-peer-urls http://10.0.1.11:2380 \ + --listen-peer-urls https://10.0.1.11:2380 \ + --listen-client-urls http://10.0.1.11:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.11:2379 \ + --initial-cluster infra0=http://10.0.1.10:2380 \ + --initial-cluster-state new +etcd: infra1 not listed in the initial cluster config +exit 1 +``` + +In this example, we are attempting to map a node (infra0) on a different address (127.0.0.1:2380) than its enumerated address in the cluster list (10.0.1.10:2380). If this node is to listen on multiple addresses, all addresses _must_ be reflected in the "initial-cluster" configuration directive. + +``` +$ etcd --name infra0 --initial-advertise-peer-urls http://127.0.0.1:2380 \ + --listen-peer-urls http://10.0.1.10:2380 \ + --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.10:2379 \ + --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \ + --initial-cluster-state=new +etcd: error setting up initial cluster: infra0 has different advertised URLs in the cluster and advertised peer URLs list +exit 1 +``` + +If a peer is configured with a different set of configuration arguments and attempts to join this cluster, etcd will report a cluster ID mismatch will exit. + +``` +$ etcd --name infra3 --initial-advertise-peer-urls http://10.0.1.13:2380 \ + --listen-peer-urls http://10.0.1.13:2380 \ + --listen-client-urls http://10.0.1.13:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.13:2379 \ + --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra3=http://10.0.1.13:2380 \ + --initial-cluster-state=new +etcd: conflicting cluster ID to the target cluster (c6ab534d07e8fcc4 != bc25ea2a74fb18b0). 
Exiting.
+exit 1
+```
+
+## Discovery
+
+In a number of cases, the IPs of the cluster peers may not be known ahead of time. This is common when utilizing cloud providers or when the network uses DHCP. In these cases, rather than specifying a static configuration, use an existing etcd cluster to bootstrap a new one. This process is called "discovery".
+
+There are two methods that can be used for discovery:
+
+* etcd discovery service
+* DNS SRV records
+
+### etcd discovery
+
+To better understand the design of the discovery service protocol, we suggest reading the discovery service protocol [documentation][discovery-proto].
+
+#### Lifetime of a discovery URL
+
+A discovery URL identifies a unique etcd cluster. Instead of reusing an existing discovery URL, each etcd instance shares a new discovery URL to bootstrap the new cluster.
+
+Moreover, discovery URLs should ONLY be used for the initial bootstrapping of a cluster. To change cluster membership after the cluster is already running, see the [runtime reconfiguration][runtime-conf] guide.
+
+#### Custom etcd discovery service
+
+Discovery uses an existing cluster to bootstrap itself. If using a private etcd cluster, create a URL like so:
+
+```
+$ curl -X PUT https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83/_config/size -d value=3
+```
+
+Setting the size key under this URL creates a discovery URL with an expected cluster size of 3.
+
+The URL to use in this case will be `https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83` and the etcd members will use the `https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83` directory for registration as they start.
+
+**Each member must have a different name flag specified, or discovery will fail due to duplicate names. `Hostname` or `machine-id` can be a good choice.**
+
+Now we start etcd with those relevant flags for each member:
+
+```
+$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \
+  --listen-peer-urls http://10.0.1.10:2380 \
+  --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \
+  --advertise-client-urls http://10.0.1.10:2379 \
+  --discovery https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83
+```
+```
+$ etcd --name infra1 --initial-advertise-peer-urls http://10.0.1.11:2380 \
+  --listen-peer-urls http://10.0.1.11:2380 \
+  --listen-client-urls http://10.0.1.11:2379,http://127.0.0.1:2379 \
+  --advertise-client-urls http://10.0.1.11:2379 \
+  --discovery https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83
+```
+```
+$ etcd --name infra2 --initial-advertise-peer-urls http://10.0.1.12:2380 \
+  --listen-peer-urls http://10.0.1.12:2380 \
+  --listen-client-urls http://10.0.1.12:2379,http://127.0.0.1:2379 \
+  --advertise-client-urls http://10.0.1.12:2379 \
+  --discovery https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83
+```
+
+This will cause each member to register itself with the custom etcd discovery service and begin the cluster once all machines have been registered.
+
+#### Public etcd discovery service
+
+If no existing cluster is available, use the public discovery service hosted at `discovery.etcd.io`. To create a private discovery URL using the "new" endpoint, use the command:
+
+```
+$ curl https://discovery.etcd.io/new?size=3
+https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de
+```
+
+This will create the cluster with an initial size of 3 members.
If no size is specified, a default of 3 is used. + +``` +ETCD_DISCOVERY=https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +``` + +``` +--discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +``` + +**Each member must have a different name flag specified or else discovery will fail due to duplicated names. `Hostname` or `machine-id` can be a good choice.** + +Now we start etcd with those relevant flags for each member: + +``` +$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \ + --listen-peer-urls http://10.0.1.10:2380 \ + --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.10:2379 \ + --discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +``` +``` +$ etcd --name infra1 --initial-advertise-peer-urls http://10.0.1.11:2380 \ + --listen-peer-urls http://10.0.1.11:2380 \ + --listen-client-urls http://10.0.1.11:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.11:2379 \ + --discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +``` +``` +$ etcd --name infra2 --initial-advertise-peer-urls http://10.0.1.12:2380 \ + --listen-peer-urls http://10.0.1.12:2380 \ + --listen-client-urls http://10.0.1.12:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.12:2379 \ + --discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +``` + +This will cause each member to register itself with the discovery service and begin the cluster once all members have been registered. + +Use the environment variable `ETCD_DISCOVERY_PROXY` to cause etcd to use an HTTP proxy to connect to the discovery service. + +#### Error and warning cases + +##### Discovery server errors + + +``` +$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \ + --listen-peer-urls http://10.0.1.10:2380 \ + --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.10:2379 \ + --discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +etcd: error: the cluster doesn’t have a size configuration value in https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de/_config +exit 1 +``` + +##### Warnings + +This is a harmless warning indicating the discovery URL will be ignored on this machine. + +``` +$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \ + --listen-peer-urls http://10.0.1.10:2380 \ + --listen-client-urls http://10.0.1.10:2379,http://127.0.0.1:2379 \ + --advertise-client-urls http://10.0.1.10:2379 \ + --discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de +etcdserver: discovery token ignored since a cluster has already been initialized. Valid log found at /var/lib/etcd +``` + +### DNS discovery + +DNS [SRV records][rfc-srv] can be used as a discovery mechanism. +The `--discovery-srv` flag can be used to set the DNS domain name where the discovery SRV records can be found. +Setting `--discovery-srv example.com` causes DNS SRV records to be looked up in the listed order: + +* _etcd-server-ssl._tcp.example.com +* _etcd-server._tcp.example.com + +If `_etcd-server-ssl._tcp.example.com` is found then etcd will attempt the bootstrapping process over TLS. 
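+
+To check which of these records a domain actually serves, query them directly with the same `dig` form used later in this guide, for example:
+
+```
+$ dig +noall +answer SRV _etcd-server-ssl._tcp.example.com
+$ dig +noall +answer SRV _etcd-server._tcp.example.com
+```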
+ +To help clients discover the etcd cluster, the following DNS SRV records are looked up in the listed order: + +* _etcd-client._tcp.example.com +* _etcd-client-ssl._tcp.example.com + +If `_etcd-client-ssl._tcp.example.com` is found, clients will attempt to communicate with the etcd cluster over SSL/TLS. + +If etcd is using TLS, the discovery SRV record (e.g. `example.com`) must be included in the SSL certificate DNS SAN along with the hostname, or clustering will fail with log messages like the following: + +``` +[...] rejected connection from "10.0.1.11:53162" (error "remote error: tls: bad certificate", ServerName "example.com") +``` + +If etcd is using TLS without a custom certificate authority, the discovery domain (e.g., example.com) must match the SRV record domain (e.g., infra1.example.com). This is to mitigate attacks that forge SRV records to point to a different domain; the domain would have a valid certificate under PKI but be controlled by an unknown third party. + +The `-discovery-srv-name` flag additionally configures a suffix to the SRV name that is queried during discovery. +Use this flag to differentiate between multiple etcd clusters under the same domain. +For example, if `discovery-srv=example.com` and `-discovery-srv-name=foo` are set, the following DNS SRV queries are made: + +* _etcd-server-ssl-foo._tcp.example.com +* _etcd-server-foo._tcp.example.com + +#### Create DNS SRV records + +``` +$ dig +noall +answer SRV _etcd-server._tcp.example.com +_etcd-server._tcp.example.com. 300 IN SRV 0 0 2380 infra0.example.com. +_etcd-server._tcp.example.com. 300 IN SRV 0 0 2380 infra1.example.com. +_etcd-server._tcp.example.com. 300 IN SRV 0 0 2380 infra2.example.com. +``` + +``` +$ dig +noall +answer SRV _etcd-client._tcp.example.com +_etcd-client._tcp.example.com. 300 IN SRV 0 0 2379 infra0.example.com. +_etcd-client._tcp.example.com. 300 IN SRV 0 0 2379 infra1.example.com. +_etcd-client._tcp.example.com. 300 IN SRV 0 0 2379 infra2.example.com. +``` + +``` +$ dig +noall +answer infra0.example.com infra1.example.com infra2.example.com +infra0.example.com. 300 IN A 10.0.1.10 +infra1.example.com. 300 IN A 10.0.1.11 +infra2.example.com. 300 IN A 10.0.1.12 +``` + +#### Bootstrap the etcd cluster using DNS + +etcd cluster members can advertise domain names or IP address, the bootstrap process will resolve DNS A records. +Since 3.2 (3.1 prints warnings) `--listen-peer-urls` and `--listen-client-urls` will reject domain name for the network interface binding. + +The resolved address in `--initial-advertise-peer-urls` *must match* one of the resolved addresses in the SRV targets. The etcd member reads the resolved address to find out if it belongs to the cluster defined in the SRV records. 
+ +``` +$ etcd --name infra0 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://infra0.example.com:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://infra0.example.com:2379 \ +--listen-client-urls http://0.0.0.0:2379 \ +--listen-peer-urls http://0.0.0.0:2380 +``` + +``` +$ etcd --name infra1 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://infra1.example.com:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://infra1.example.com:2379 \ +--listen-client-urls http://0.0.0.0:2379 \ +--listen-peer-urls http://0.0.0.0:2380 +``` + +``` +$ etcd --name infra2 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://infra2.example.com:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://infra2.example.com:2379 \ +--listen-client-urls http://0.0.0.0:2379 \ +--listen-peer-urls http://0.0.0.0:2380 +``` + +The cluster can also bootstrap using IP addresses instead of domain names: + +``` +$ etcd --name infra0 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://10.0.1.10:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://10.0.1.10:2379 \ +--listen-client-urls http://10.0.1.10:2379 \ +--listen-peer-urls http://10.0.1.10:2380 +``` + +``` +$ etcd --name infra1 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://10.0.1.11:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://10.0.1.11:2379 \ +--listen-client-urls http://10.0.1.11:2379 \ +--listen-peer-urls http://10.0.1.11:2380 +``` + +``` +$ etcd --name infra2 \ +--discovery-srv example.com \ +--initial-advertise-peer-urls http://10.0.1.12:2380 \ +--initial-cluster-token etcd-cluster-1 \ +--initial-cluster-state new \ +--advertise-client-urls http://10.0.1.12:2379 \ +--listen-client-urls http://10.0.1.12:2379 \ +--listen-peer-urls http://10.0.1.12:2380 +``` + +Since v3.1.0 (except v3.2.9), when `etcd --discovery-srv=example.com` is configured with TLS, server will only authenticate peers/clients when the provided certs have root domain `example.com` as an entry in Subject Alternative Name (SAN) field. See [Notes for DNS SRV][security-guide-dns-srv]. + +### Gateway + +etcd gateway is a simple TCP proxy that forwards network data to the etcd cluster. Please read [gateway guide][gateway] for more information. + +### Proxy + +When the `--proxy` flag is set, etcd runs in [proxy mode][proxy]. This proxy mode only supports the etcd v2 API; there are no plans to support the v3 API. Instead, for v3 API support, there will be a new proxy with enhanced features following the etcd 3.0 release. + +To setup an etcd cluster with proxies of v2 API, please read the the [clustering doc in etcd 2.3 release][clustering_etcd2]. 
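+
+As a minimal sketch of the gateway described above (the endpoints reuse this guide's example hosts and the listen address is illustrative; see the gateway guide for the full flag set):
+
+```
+$ etcd gateway start \
+  --endpoints=infra0.example.com:2379,infra1.example.com:2379,infra2.example.com:2379 \
+  --listen-addr=127.0.0.1:23790
+```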
+ +[clustering_etcd2]: https://github.com/etcd-io/etcd/blob/release-2.3/Documentation/clustering.md +[conf-adv-client]: ../configuration/#clustering +[conf-listen-client]: ../configuration/#member +[discovery-proto]: ../../dev-internal/discovery_protocol/ +[gateway]: ../gateway/ +[proxy]: https://github.com/etcd-io/etcd/blob/release-2.3/Documentation/proxy.md +[rfc-srv]: http://www.ietf.org/rfc/rfc2052.txt +[runtime-conf]: ../runtime-configuration/ +[runtime-reconf-design]: ../runtime-reconf-design/ +[security-guide-dns-srv]: ../security/#notes-for-dns-srv +[security-guide]: ../security/ +[tls-setup]: https://github.com/etcd-io/etcd/tree/main/hack/tls-setup diff --git a/.claude/commands/etcd/etcd-ops-guide/configuration.md b/.claude/commands/etcd/etcd-ops-guide/configuration.md new file mode 100644 index 0000000..b03fa3f --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/configuration.md @@ -0,0 +1,345 @@ +--- +title: Configuration options +weight: 4050 +description: etcd configuration files, flags, and environment variables +--- + +You can configure etcd through the following: + +- **[Command-line flags](#command-line-flags)** +- **Environment variables**: every flag has a corresponding environment variable + that has the same name but is prefixed with `ETCD_` and formatted in all caps and + [snake case][]. For example, `--some-flag` would be `ETCD_SOME_FLAG`. +- **[Configuration file](#configuration-file)** + +{{% alert color="warning" %}} + **Caution**: If you mix-and-match configuration options, then the following +rules apply. + +- Command-line flags take precedence over environment variables. +- If you provide a _configuration file_ all command-line flags and environment variables are **ignored**. +{{% /alert %}} + +## Command-line flags + +Flags are presented below using the format `--flag-name DEFAULT_VALUE`. + +The list of flags provided below may not be up-to-date due to ongoing development changes. For the latest available flags, run `etcd --help` or refer to the [etcd help][]. + +{{% alert color="info" %}} + **Note**: For details concerning new, updated, and deprecated {{< param version >}} flags, + see [CHANGELOG-{{< psubstr version 1 >}}.md][changelog]. + + [changelog]: https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-{{< psubstr version 1 >}}.md +{{% /alert %}} + +### Member + +```nocode +--name 'default' + Human-readable name for this member. +--data-dir '${name}.etcd' + Path to the data directory. +--wal-dir '' + Path to the dedicated wal directory. +--snapshot-count '10000' + Number of committed transactions to trigger a snapshot to disk. +--heartbeat-interval '100' + Time (in milliseconds) of a heartbeat interval. +--election-timeout '1000' + Time (in milliseconds) for an election to timeout. See tuning documentation for details. +--initial-election-tick-advance 'true' + Whether to fast-forward initial election ticks on boot for faster election. +--listen-peer-urls 'http://localhost:2380' + List of URLs to listen on for peer traffic. +--listen-client-urls 'http://localhost:2379' + List of URLs to listen on for client grpc traffic and http as long as --listen-client-http-urls is not specified. +--listen-client-http-urls '' + List of URLs to listen on for http only client traffic. Enabling this flag removes http services from --listen-client-urls. +--max-snapshots '5' + Maximum number of snapshot files to retain (0 is unlimited). +--max-wals '5' + Maximum number of wal files to retain (0 is unlimited). 
+--memory-mlock + Enable to enforce etcd pages (in particular bbolt) to stay in RAM. +--quota-backend-bytes '0' + Raise alarms when backend size exceeds the given quota (0 defaults to low space quota). +--backend-bbolt-freelist-type 'map' + BackendFreelistType specifies the type of freelist that boltdb backend uses(array and map are supported types). +--backend-batch-interval '' + BackendBatchInterval is the maximum time before commit the backend transaction. +--backend-batch-limit '0' + BackendBatchLimit is the maximum operations before commit the backend transaction. +--max-txn-ops '128' + Maximum number of operations permitted in a transaction. +--max-request-bytes '1572864' + Maximum client request size in bytes the server will accept. +--grpc-keepalive-min-time '5s' + Minimum duration interval that a client should wait before pinging server. +--grpc-keepalive-interval '2h' + Frequency duration of server-to-client ping to check if a connection is alive (0 to disable). +--grpc-keepalive-timeout '20s' + Additional duration of wait before closing a non-responsive connection (0 to disable). +--socket-reuse-port 'false' + Enable to set socket option SO_REUSEPORT on listeners allowing rebinding of a port already in use. +--socket-reuse-address 'false' + Enable to set socket option SO_REUSEADDR on listeners allowing binding to an address in TIME_WAIT state. +``` + +### Clustering + +```nocode +--initial-advertise-peer-urls 'http://localhost:2380' + List of this member's peer URLs to advertise to the rest of the cluster. +--initial-cluster 'default=http://localhost:2380' + Initial cluster configuration for bootstrapping. +--initial-cluster-state 'new' + Initial cluster state ('new' or 'existing'). +--initial-cluster-token 'etcd-cluster' + Initial cluster token for the etcd cluster during bootstrap. + Specifying this can protect you from unintended cross-cluster interaction when running multiple clusters. +--advertise-client-urls 'http://localhost:2379' + List of this member's client URLs to advertise to the public. + The client URLs advertised should be accessible to machines that talk to etcd cluster. etcd client libraries parse these URLs to connect to the cluster. +--discovery '' + Discovery URL used to bootstrap the cluster. +--discovery-fallback 'proxy' + Expected behavior ('exit' or 'proxy') when discovery services fails. + "proxy" supports v2 API only. +--discovery-proxy '' + HTTP proxy to use for traffic to discovery service. +--discovery-srv '' + DNS srv domain used to bootstrap the cluster. +--discovery-srv-name '' + Suffix to the dns srv name queried when bootstrapping. +--strict-reconfig-check 'true' + Reject reconfiguration requests that would cause quorum loss. +--pre-vote 'true' + Enable the raft Pre-Vote algorithm to prevent disruption when a node that has been partitioned away rejoins the cluster. +--auto-compaction-retention '0' + Auto compaction retention length. 0 means disable auto compaction. +--auto-compaction-mode 'periodic' + Interpret 'auto-compaction-retention' one of: periodic|revision. 'periodic' for duration based retention, defaulting to hours if no time unit is provided (e.g. '5m'). 'revision' for revision number based retention. +--enable-v2 'false' + Accept etcd V2 client requests. Deprecated and to be decommissioned in v3.6. +--v2-deprecation 'not-yet' + Phase of v2store deprecation. Allows to opt-in for higher compatibility mode. 
+ Supported values: + 'not-yet' // Issues a warning if v2store have meaningful content (default in v3.5) + 'write-only' // Custom v2 state is not allowed (planned default in v3.6) + 'write-only-drop-data' // Custom v2 state will get DELETED ! + 'gone' // v2store is not maintained any longer. (planned default in v3.7) +``` + +### Security + +```nocode +--cert-file '' + Path to the client server TLS cert file. +--key-file '' + Path to the client server TLS key file. +--client-cert-auth 'false' + Enable client cert authentication. + It's recommended to enable client cert authentication to prevent attacks from unauthenticated clients (e.g. CVE-2023-44487), especially when running etcd as a public service. +--client-crl-file '' + Path to the client certificate revocation list file. +--client-cert-allowed-hostname '' + Comma-separated list of SAN hostnames for client cert authentication. +--trusted-ca-file '' + Path to the client server TLS trusted CA cert file. + Note setting this parameter will also automatically enable client cert authentication no matter what value is set for `--client-cert-auth`. +--auto-tls 'false' + Client TLS using generated certificates. +--peer-cert-file '' + Path to the peer server TLS cert file. +--peer-key-file '' + Path to the peer server TLS key file. +--peer-client-cert-auth 'false' + Enable peer client cert authentication. + It's recommended to enable peer client cert authentication to prevent attacks from unauthenticated forged peers (e.g. CVE-2023-44487). +--peer-trusted-ca-file '' + Path to the peer server TLS trusted CA file. +--peer-cert-allowed-cn '' + Comma-separated list of allowed CNs for inter-peer TLS authentication. +--peer-cert-allowed-hostname '' + Comma-separated list of allowed SAN hostnames for inter-peer TLS authentication. +--peer-auto-tls 'false' + Peer TLS using self-generated certificates if --peer-key-file and --peer-cert-file are not provided. +--self-signed-cert-validity '1' + The validity period of the client and peer certificates that are automatically generated by etcd when you specify ClientAutoTLS and PeerAutoTLS, the unit is year, and the default is 1. +--peer-crl-file '' + Path to the peer certificate revocation list file. +--cipher-suites '' + Comma-separated list of supported TLS cipher suites between client/server and peers (empty will be auto-populated by Go). +--cors '*' + Comma-separated whitelist of origins for CORS, or cross-origin resource sharing, (empty or * means allow all). +--host-whitelist '*' + Acceptable hostnames from HTTP client requests, if server is not secure (empty or * means allow all). +--tls-min-version 'TLS1.2' + Minimum TLS version supported by etcd. +--tls-max-version '' + Maximum TLS version supported by etcd (empty will be auto-populated by Go). +``` + +### Auth + +```nocode +--auth-token 'simple' + Specify a v3 authentication token type and its options ('simple' or 'jwt'). +--bcrypt-cost 10 + Specify the cost / strength of the bcrypt algorithm for hashing auth passwords. Valid values are between 4 and 31. +--auth-token-ttl 300 + Time (in seconds) of the auth-token-ttl. +``` + +### Profiling and monitoring + +```nocode +--enable-pprof 'false' + Enable runtime profiling data via HTTP server. Address is at client URL + "/debug/pprof/" +--metrics 'basic' + Set level of detail for exported metrics, specify 'extensive' to include server side grpc histogram metrics. +--listen-metrics-urls '' + List of URLs to listen on for the metrics and health endpoints. 
+``` + +### Logging + +```nocode +--logger 'zap' + Currently only supports 'zap' for structured logging. +--log-outputs 'default' + Specify 'stdout' or 'stderr' to skip journald logging even when running under systemd, or list of comma separated output targets. +--log-level 'info' + Configures log level. Only supports debug, info, warn, error, panic, or fatal. +--log-format 'json' + Configures log format. Only supports json, console. +--enable-log-rotation 'false' + Enable log rotation of a single log-outputs file target. +--log-rotation-config-json '{"maxsize": 100, "maxage": 0, "maxbackups": 0, "localtime": false, "compress": false}' + Configures log rotation if enabled with a JSON logger config. MaxSize(MB), MaxAge(days,0=no limit), MaxBackups(0=no limit), LocalTime(use computers local time), Compress(gzip)". +--warning-unary-request-duration '300ms' + Set time duration after which a warning is logged if a unary request takes more than this duration. +``` + +{{% alert color="info" %}} + **Note**: Several `--experimental-*` flags have been promoted or renamed in v3.7. +Be sure to replace deprecated flags with their stable counterparts listed below. +{{% /alert %}} + +### Distributed tracing + +```nocode +--enable-distributed-tracing 'false' + Enable distributed tracing. +--distributed-tracing-address 'localhost:4317' + Distributed tracing collector address. +--distributed-tracing-service-name 'etcd' + Distributed tracing service name, must be the same across all etcd instances. +--distributed-tracing-instance-id '' + Distributed tracing instance ID, must be unique for each etcd instance. +--distributed-tracing-sampling-rate '0' + Number of samples to collect per million spans for distributed tracing. +``` + +### v2 Proxy + +{{% alert color="warning" %}} +** Note**: flags will be deprecated in v3.6. +{{% /alert %}} + +```nocode +--proxy 'off' + Proxy mode setting ('off', 'readonly' or 'on'). +--proxy-failure-wait 5000 + Time (in milliseconds) an endpoint will be held in a failed state. +--proxy-refresh-interval 30000 + Time (in milliseconds) of the endpoints refresh interval. +--proxy-dial-timeout 1000 + Time (in milliseconds) for a dial to timeout. +--proxy-write-timeout 5000 + Time (in milliseconds) for a write to timeout. +--proxy-read-timeout 0 + Time (in milliseconds) for a read to timeout. +``` + +### Features + +```nocode +--corrupt-check-time '0s' + Duration of time between cluster corruption check passes. +--compact-hash-check-time '1m' + Duration of time between leader checks followers compaction hashes. +--compaction-batch-limit 1000 + CompactionBatchLimit sets the maximum revisions deleted in each compaction batch. +--peer-skip-client-san-verification 'false' + Skip verification of SAN field in client certificate for peer connections. +--watch-progress-notify-interval '10m' + Duration of periodical watch progress notification. +--warning-apply-duration '100ms' + Warning is generated if requests take more than this duration. +--bootstrap-defrag-threshold-megabytes + Enable the defrag during etcd server bootstrap on condition that it will free at least the provided threshold of disk space. Needs to be set to non-zero value to take effect. +--max-learners '1' + Set the max number of learner members allowed in the cluster membership. +--compaction-sleep-interval + Sets the sleep interval between each compaction batch. +--downgrade-check-time + Duration of time between two downgrade status checks. 
+--snapshot-catchup-entries + Number of entries for a slow follower to catch up after compacting the raft storage entries. +``` + +### Feature Gates + +```nocode +--feature-gates=AllAlpha=true|false + Enables or disables all alpha features. Default is false. +--feature-gates=AllBeta=true|false + Enables or disables all beta features. Default is false. +--feature-gates=CompactHashCheck=true + Enables leader to periodically check follower compaction hashes. + Replaces: --experimental-compact-hash-check-enabled +--feature-gates=InitialCorruptCheck=true + Enables corruption check before serving client/peer traffic. + Replaces: --experimental-initial-corrupt-check +--feature-gates=LeaseCheckpoint=true + ExperimentalEnableLeaseCheckpoint enables primary lessor to persist lease remainingTTL to prevent indefinite auto-renewal of long lived leases. + Replaces: --experimental-enable-lease-checkpoint +--feature-gates=LeaseCheckpointPersist=true + Enable persisting remainingTTL to prevent indefinite auto-renewal of long lived leases. Always enabled in v3.6. Should be used to ensure smooth upgrade from v3.5 clusters with this feature enabled. + Replaces: --experimental-enable-lease-checkpoint-persist +--feature-gates=SetMemberLocalAddr=true + Allows setting a member’s local address. +--feature-gates=StopGRPCServiceOnDefrag=true + Enable etcd gRPC service to stop serving client requests on defragmentation. + Replaces: --experimental-stop-grpc-service-on-defrag +--feature-gates=TxnModeWriteWithSharedBuffer=true + Enable the write transaction to use a shared buffer in its readonly check operations. + Replaces: --experimental-txn-mode-write-with-shared-buffer +``` + +### Unsafe features + +{{% alert color="warning" %}} +** Warning**: using unsafe features may break the guarantees given by the consensus protocol! +{{% /alert %}} + +```nocode +--force-new-cluster 'false' + Force to create a new one-member cluster. +--unsafe-no-fsync 'false' + Disables fsync, unsafe, will cause data loss. +``` + +## Configuration file + +An etcd configuration file consists of a YAML map whose keys are command-line +flag names and values are the flag values. +In order to use this file, specify the file path as a value to the `--config-file` flag or `ETCD_CONFIG_FILE` environment variable. + +For an example, see the [etcd.conf.yml sample][]. + +[etcd help]: https://github.com/etcd-io/etcd/blob/main/server/etcdmain/help.go +[etcd.conf.yml sample]: https://github.com/etcd-io/etcd/blob/main/etcd.conf.yml.sample +[snake case]: https://en.wikipedia.org/wiki/Snake_case diff --git a/.claude/commands/etcd/etcd-ops-guide/container.md b/.claude/commands/etcd/etcd-ops-guide/container.md new file mode 100644 index 0000000..fa6b9c8 --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/container.md @@ -0,0 +1,165 @@ +--- +title: Run etcd clusters inside containers +weight: 4200 +description: Running etcd with Docker using static bootstrapping +--- +The following guide shows how to run etcd with Docker using the [static bootstrap process](../clustering/#static). + +## Docker + +In order to expose the etcd API to clients outside of Docker host, use the host IP address of the container. Please see [`docker inspect`](https://docs.docker.com/engine/reference/commandline/inspect) for more detail on how to get the IP address. Alternatively, specify `--net=host` flag to `docker run` command to skip placing the container inside of a separate network stack. 
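+
+For example, a running container's address can be read with a Go-template query against `docker inspect` (a sketch; the container name `etcd` matches the examples below):
+
+```
+docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' etcd
+```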
+ +### Running a single node etcd + +Use the host IP address when configuring etcd: + +``` +export NODE1=192.168.1.21 +``` + +Configure a Docker volume to store etcd data: + +``` +docker volume create --name etcd-data +export DATA_DIR="etcd-data" +``` + +Run the latest version of etcd (`{{< param git_version_tag >}}` at the time of +writing): + +``` +ETCD_VERSION={{< param git_version_tag >}} +REGISTRY=quay.io/coreos/etcd +# available from v3.2.5 +REGISTRY=gcr.io/etcd-development/etcd + +docker run \ + -p 2379:2379 \ + -p 2380:2380 \ + --volume=${DATA_DIR}:/etcd-data \ + --name etcd ${REGISTRY}:${ETCD_VERSION} \ + /usr/local/bin/etcd \ + --data-dir=/etcd-data --name node1 \ +  --initial-advertise-peer-urls http://${NODE1}:2380 --listen-peer-urls http://0.0.0.0:2380 \ +  --advertise-client-urls http://${NODE1}:2379 --listen-client-urls http://0.0.0.0:2379 \ + --initial-cluster node1=http://${NODE1}:2380 +``` + +List the cluster member: + +``` +etcdctl --endpoints=http://${NODE1}:2379 member list +``` + +### Running a 3 node etcd cluster + +``` +REGISTRY=quay.io/coreos/etcd +# available from v3.2.5 +REGISTRY=gcr.io/etcd-development/etcd + +# For each machine +ETCD_VERSION={{< param git_version_tag >}} +TOKEN=my-etcd-token +CLUSTER_STATE=new +NAME_1=etcd-node-0 +NAME_2=etcd-node-1 +NAME_3=etcd-node-2 +HOST_1=10.20.30.1 +HOST_2=10.20.30.2 +HOST_3=10.20.30.3 +CLUSTER=${NAME_1}=http://${HOST_1}:2380,${NAME_2}=http://${HOST_2}:2380,${NAME_3}=http://${HOST_3}:2380 +DATA_DIR=/var/lib/etcd + +# For node 1 +THIS_NAME=${NAME_1} +THIS_IP=${HOST_1} +docker run \ + -p 2379:2379 \ + -p 2380:2380 \ + --volume=${DATA_DIR}:/etcd-data \ + --name etcd ${REGISTRY}:${ETCD_VERSION} \ + /usr/local/bin/etcd \ + --data-dir=/etcd-data --name ${THIS_NAME} \ +  --initial-advertise-peer-urls http://${THIS_IP}:2380 --listen-peer-urls http://0.0.0.0:2380 \ +  --advertise-client-urls http://${THIS_IP}:2379 --listen-client-urls http://0.0.0.0:2379 \ + --initial-cluster ${CLUSTER} \ + --initial-cluster-state ${CLUSTER_STATE} --initial-cluster-token ${TOKEN} + +# For node 2 +THIS_NAME=${NAME_2} +THIS_IP=${HOST_2} +docker run \ + -p 2379:2379 \ + -p 2380:2380 \ + --volume=${DATA_DIR}:/etcd-data \ + --name etcd ${REGISTRY}:${ETCD_VERSION} \ + /usr/local/bin/etcd \ + --data-dir=/etcd-data --name ${THIS_NAME} \ +  --initial-advertise-peer-urls http://${THIS_IP}:2380 --listen-peer-urls http://0.0.0.0:2380 \ +  --advertise-client-urls http://${THIS_IP}:2379 --listen-client-urls http://0.0.0.0:2379 \ + --initial-cluster ${CLUSTER} \ + --initial-cluster-state ${CLUSTER_STATE} --initial-cluster-token ${TOKEN} + +# For node 3 +THIS_NAME=${NAME_3} +THIS_IP=${HOST_3} +docker run \ + -p 2379:2379 \ + -p 2380:2380 \ + --volume=${DATA_DIR}:/etcd-data \ + --name etcd ${REGISTRY}:${ETCD_VERSION} \ + /usr/local/bin/etcd \ + --data-dir=/etcd-data --name ${THIS_NAME} \ +  --initial-advertise-peer-urls http://${THIS_IP}:2380 --listen-peer-urls http://0.0.0.0:2380 \ +  --advertise-client-urls http://${THIS_IP}:2379 --listen-client-urls http://0.0.0.0:2379 \ + --initial-cluster ${CLUSTER} \ + --initial-cluster-state ${CLUSTER_STATE} --initial-cluster-token ${TOKEN} +``` + +To run `etcdctl` using API version 3: + +``` +docker exec etcd /usr/local/bin/etcdctl put foo bar +``` + +## Bare Metal + +To provision a 3 node etcd cluster on bare-metal, the examples in the [baremetal repo](https://github.com/coreos/coreos-baremetal/tree/master/examples) may be useful. 
+ +## Mounting a certificate volume + +The etcd release container does not include default root certificates. To use HTTPS with certificates trusted by a root authority (e.g., for discovery), mount a certificate directory into the etcd container: + +``` +ETCD_VERSION={{< param git_version_tag >}} +REGISTRY=quay.io/coreos/etcd +# available from v3.2.5 +REGISTRY=docker://gcr.io/etcd-development/etcd + +rkt run \ + --insecure-options=image \ + --volume etcd-ssl-certs-bundle,kind=host,source=/etc/ssl/certs/ca-certificates.crt \ + --mount volume=etcd-ssl-certs-bundle,target=/etc/ssl/certs/ca-certificates.crt \ + ${REGISTRY}:${ETCD_VERSION} -- --name my-name \ + --initial-advertise-peer-urls http://localhost:2380 --listen-peer-urls http://localhost:2380 \ + --advertise-client-urls http://localhost:2379 --listen-client-urls http://localhost:2379 \ + --discovery https://discovery.etcd.io/c11fbcdc16972e45253491a24fcf45e1 +``` + +``` +ETCD_VERSION={{< param git_version_tag >}} +REGISTRY=quay.io/coreos/etcd +# available from v3.2.5 +REGISTRY=gcr.io/etcd-development/etcd + +docker run \ + -p 2379:2379 \ + -p 2380:2380 \ + --volume=/etc/ssl/certs/ca-certificates.crt:/etc/ssl/certs/ca-certificates.crt \ + ${REGISTRY}:${ETCD_VERSION} \ + /usr/local/bin/etcd --name my-name \ + --initial-advertise-peer-urls http://localhost:2380 --listen-peer-urls http://localhost:2380 \ + --advertise-client-urls http://localhost:2379 --listen-client-urls http://localhost:2379 \ + --discovery https://discovery.etcd.io/86a9ff6c8cb8b4c4544c1a2f88f8b801 +``` diff --git a/.claude/commands/etcd/etcd-ops-guide/data_corruption.md b/.claude/commands/etcd/etcd-ops-guide/data_corruption.md new file mode 100644 index 0000000..ac33105 --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/data_corruption.md @@ -0,0 +1,71 @@ +--- +title: Data Corruption +weight: 5000 +description: etcd data corruption and recovery +--- + +etcd has built in automated data corruption detection to prevent member state from diverging. + +## Enabling data corruption detection + +Data corruption detection can be done using: +* Initial check, enabled with `--experimental-initial-corrupt-check` flag. +* Periodic check of: + * Compacted revision hash, enabled with `--experimental-compact-hash-check-enabled` flag. + * Latest revision hash, enabled with `--experimental-corrupt-check-time` flag. + +Initial check will be executed during bootstrap of etcd member. +Member will compare its persistent state vs other members and exit if there is a mismatch. + +Both periodic check will be executed by the cluster leader in a cluster that is already running. +Leader will compare its persistent state vs other members and raise a CORRUPT ALARM if there is a mismatch. +Both checks serve the same purpose, however they are both worth enabling to balance performance and time to detection. +* Compacted revision hash check - requires regular compaction, minimal performance cost, handles slow followers. +* Latest revision hash check - high performance cost, doesn't handle slow followers or frequent compactions. + +### Compacted revision hash check + +When enabled using `--experimental-compact-hash-check-enabled` flag, check will be executed once every minute. +This can be adjusted using `--experimental-compact-hash-check-time` flag using format: `1m` - every minute, `1h` - evey hour. +This check extends compaction to also calculate checksum that can be compared between cluster members. 
+It doesn't cause an additional database scan, making it very cheap, but it requires regular compaction in the cluster.
+
+### Latest revision hash check
+
+Enabled using the `--experimental-corrupt-check-time` flag, this check requires an execution period in the format: `1m` - every minute, `1h` - every hour.
+The recommended period is a couple of hours due to the high performance cost.
+Running a check requires computing a checksum by scanning the entire etcd content at a given revision.
+
+## Restoring a corrupted member
+
+There are three ways to restore a corrupted member:
+* Purge member persistent state
+* Replace member
+* Restore whole cluster
+
+After the corrupted member is restored, the CORRUPT ALARM can be removed.
+
+### Purge member persistent state
+
+A member's state can be purged by:
+1. Stopping the etcd instance.
+2. Backing up the etcd data directory.
+3. Moving out the `snap` subdirectory from the etcd data directory.
+4. Starting `etcd` with `--initial-cluster-state=existing` and cluster members listed in `--initial-cluster`.
+
+The etcd member is then expected to download an up-to-date snapshot from the leader.
+
+### Replace member
+
+A member can be replaced by:
+1. Stopping the etcd instance.
+2. Backing up the etcd data directory.
+3. Removing the data directory.
+4. Removing the member from the cluster by running `etcdctl member remove`.
+5. Adding it back by running `etcdctl member add`.
+6. Starting `etcd` with `--initial-cluster-state=existing` and cluster members listed in `--initial-cluster`.
+
+### Restore whole cluster
+
+The cluster can be restored by saving a snapshot from the current leader and restoring it to all members.
+Run `etcdctl snapshot save` against the leader and follow the [restoring a cluster procedure](/docs/v3.5/op-guide/recovery).
diff --git a/.claude/commands/etcd/etcd-ops-guide/failures.md b/.claude/commands/etcd/etcd-ops-guide/failures.md
new file mode 100644
index 0000000..15c47cf
--- /dev/null
+++ b/.claude/commands/etcd/etcd-ops-guide/failures.md
@@ -0,0 +1,48 @@
+---
+title: Failure modes
+weight: 4250
+description: Kinds of failures and etcd's tolerance for them
+---
+
+Failures are common in a large deployment of machines. A machine fails when its hardware or software malfunctions. Multiple machines fail together when there are power failures or network issues. Multiple kinds of failures can also happen at once; it is almost impossible to enumerate all possible failure cases.
+
+In this section, we catalog kinds of failures and discuss how etcd is designed to tolerate these failures. Most users, if not all, can map a particular failure into one kind of failure. To prepare for rare or [unrecoverable failures][unrecoverable], always [back up][backup] the etcd cluster.
+
+## Minor followers failure
+
+When fewer than half of the followers fail, the etcd cluster can still accept requests and make progress without any major disruption. For example, two follower failures will not affect a five member etcd cluster’s operation. However, clients will lose connectivity to the failed members. Client libraries should hide these interruptions from users for read requests by automatically reconnecting to other members. Operators should expect the system load on the other members to increase due to the reconnections.
+
+## Leader failure
+
+When a leader fails, the etcd cluster automatically elects a new leader. The election does not happen instantly once the leader fails. It takes about an election timeout to elect a new leader since the failure detection model is timeout based.
+
+During the leader election, the cluster cannot process any writes. Write requests sent during the election are queued for processing until a new leader is elected.
+
+Writes already sent to the old leader but not yet committed may be lost. The new leader has the power to rewrite any uncommitted entries from the previous leader. From the user perspective, some write requests might time out after a new leader election. However, no committed writes are ever lost.
+
+The new leader extends timeouts automatically for all leases. This mechanism ensures a lease will not expire before the granted TTL even if it was granted by the old leader.
+
+## Majority failure
+
+When the majority of the cluster members fail, the etcd cluster fails and cannot accept more writes.
+
+The etcd cluster can only recover from a majority failure once the majority of members become available. If a majority of members cannot come back online, then the operator must start [disaster recovery][unrecoverable] to recover the cluster.
+
+Once a majority of members is working again, the etcd cluster elects a new leader automatically and returns to a healthy state. The new leader extends timeouts automatically for all leases. This mechanism ensures no lease expires due to server side unavailability.
+
+## Network partition
+
+A network partition is similar to a minor followers failure or a leader failure. A network partition divides the etcd cluster into two parts; one with a member majority and the other with a member minority. The majority side becomes the available cluster and the minority side is unavailable. There is no “split-brain” in etcd because cluster members are explicitly added/removed, and each such change must be approved by the current majority of members.
+
+If the leader is on the majority side, then from the majority point of view the failure is a minority follower failure. If the leader is on the minority side, then it is a leader failure. The leader on the minority side steps down and the majority side elects a new leader.
+
+Once the network partition clears, the minority side automatically recognizes the leader from the majority side and recovers its state.
+
+## Failure during bootstrapping
+
+A cluster bootstrap is only successful if all required members successfully start. If any failure happens during bootstrapping, remove the data directories on all members and re-bootstrap the cluster with a new cluster-token or new discovery token.
+
+Of course, it is possible to recover a failed bootstrapped cluster like recovering a running cluster. However, it almost always takes more time and resources to recover that cluster than bootstrapping a new one, since there is no data to recover.
+
+[backup]: ../maintenance/#snapshot-backup
+[unrecoverable]: ../recovery/
diff --git a/.claude/commands/etcd/etcd-ops-guide/maintenance.md b/.claude/commands/etcd/etcd-ops-guide/maintenance.md
new file mode 100644
index 0000000..2053828
--- /dev/null
+++ b/.claude/commands/etcd/etcd-ops-guide/maintenance.md
@@ -0,0 +1,176 @@
+---
+title: Maintenance
+weight: 4450
+description: Periodic etcd cluster maintenance guide
+---
+
+## Overview
+
+An etcd cluster needs periodic maintenance to remain reliable. Depending on an etcd application's needs, this maintenance can usually be automated and performed without downtime or significantly degraded performance.
+
+All etcd maintenance manages storage resources consumed by the etcd keyspace. Failure to adequately control the keyspace size is guarded against by storage space quotas; if an etcd member runs low on space, a quota will trigger cluster-wide alarms which will put the system into a limited-operation maintenance mode. To avoid running out of space for writes to the keyspace, the etcd keyspace history must be compacted. Storage space itself may be reclaimed by defragmenting etcd members. Finally, periodic snapshot backups of etcd member state make it possible to recover any unintended logical data loss or corruption caused by operational error.
+
+## Raft log retention
+
+`etcd --snapshot-count` configures the number of applied Raft entries to hold in-memory before compaction. When the `--snapshot-count` threshold is reached, the server first persists snapshot data onto disk, and then truncates old entries. When a slow follower requests logs before a compacted index, the leader sends the snapshot, forcing the follower to overwrite its state.
+
+A higher `--snapshot-count` holds more Raft entries in memory until the snapshot, thus causing [recurrent higher memory usage](https://github.com/kubernetes/kubernetes/issues/60589#issuecomment-371977156). Since the leader retains the latest Raft entries for longer, a slow follower has more time to catch up before the leader snapshots. `--snapshot-count` is a tradeoff between higher memory usage and better availability for slow followers.
+
+Since v3.2, the default value of `--snapshot-count` has [changed from 10,000 to 100,000](https://github.com/etcd-io/etcd/pull/7160).
+
+Performance-wise, a `--snapshot-count` greater than 100,000 may impact write throughput. A higher number of in-memory objects can slow down the [Go GC mark phase `runtime.scanobject`](https://golang.org/src/runtime/mgc.go), and infrequent memory reclamation makes allocation slow. Performance varies depending on the workloads and system environments. However, in general, too frequent compaction affects cluster availability and write throughput, while too infrequent compaction is also harmful, placing too much pressure on the Go garbage collector. See https://www.slideshare.net/mitakeh/understanding-performance-aspects-of-etcd-and-raft for more research results.
+
+## History compaction: v3 API Key-Value Database
+
+Since etcd keeps an exact history of its keyspace, this history should be periodically compacted to avoid performance degradation and eventual storage space exhaustion. Compacting the keyspace history drops all information about keys superseded prior to a given keyspace revision. The space used by these keys then becomes available for additional writes to the keyspace.
+
+The keyspace can be compacted automatically with `etcd`'s time windowed history retention policy, or manually with `etcdctl`. The `etcdctl` method provides fine-grained control over the compacting process whereas automatic compacting fits applications that only need key history for some length of time.
+ +An `etcdctl` initiated compaction works as follows: + +```sh +# compact up to revision 3 +$ etcdctl compact 3 +``` + +Revisions prior to the compaction revision become inaccessible: + +```sh +$ etcdctl get --rev=2 somekey +Error: rpc error: code = 11 desc = etcdserver: mvcc: required revision has been compacted +``` + +### Auto Compaction + +`etcd` can be set to automatically compact the keyspace with the `--auto-compaction-*` option with a period of hours: + +```sh +# keep one hour of history +$ etcd --auto-compaction-retention=1 +``` + +[v3.0.0](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.0.md) and [v3.1.0](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.1.md) with `--auto-compaction-retention=10` run periodic compaction on v3 key-value store for every 10-hour. Compactor only supports periodic compaction. Compactor records latest revisions every 5-minute, until it reaches the first compaction period (e.g. 10-hour). In order to retain key-value history of last compaction period, it uses the last revision that was fetched before compaction period, from the revision records that were collected every 5-minute. When `--auto-compaction-retention=10`, compactor uses revision 100 for compact revision where revision 100 is the latest revision fetched from 10 hours ago. If compaction succeeds or requested revision has already been compacted, it resets period timer and starts over with new historical revision records (e.g. restart revision collect and compact for the next 10-hour period). If compaction fails, it retries in 5 minutes. + +[v3.2.0](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.2.md) compactor runs [every hour](https://github.com/etcd-io/etcd/pull/7875). Compactor only supports periodic compaction. Compactor continues to record latest revisions every 5-minute. For every hour, it uses the last revision that was fetched before compaction period, from the revision records that were collected every 5-minute. That is, for every hour, compactor discards historical data created before compaction period. The retention window of compaction period moves to next hour. For instance, when hourly writes are 100 and `--auto-compaction-retention=10`, v3.1 compacts revision 1000, 2000, and 3000 for every 10-hour, while v3.2.x, v3.3.0, v3.3.1, and v3.3.2 compact revision 1000, 1100, and 1200 for every 1-hour. If compaction succeeds or requested revision has already been compacted, it resets period timer and removes used compacted revision from historical revision records (e.g. start next revision collect and compaction from previously collected revisions). If compaction fails, it retries in 5 minutes. + +In [v3.3.0](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.3.md), [v3.3.1](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.3.md), and [v3.3.2](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.3.md), `--auto-compaction-mode=revision --auto-compaction-retention=1000` automatically `Compact` on `"latest revision" - 1000` every 5-minute (when latest revision is 30000, compact on revision 29000). For instance, `--auto-compaction-mode=periodic --auto-compaction-retention=72h` automatically `Compact` with 72-hour retention window, for every 7.2-hour. For instance, `--auto-compaction-mode=periodic --auto-compaction-retention=30m` automatically `Compact` with 30-minute retention window, for every 3-minute. Periodic compactor continues to record latest revisions for every 1/10 of given compaction period (e.g. 
1-hour when `--auto-compaction-mode=periodic --auto-compaction-retention=10h`). For every 1/10 of given compaction period, compactor uses the last revision that was fetched before compaction period, to discard historical data. The retention window of compaction period moves for every 1/10 of given compaction period. For instance, when hourly writes are 100 and `--auto-compaction-retention=10`, v3.1 compacts revision 1000, 2000, and 3000 for every 10-hour, while v3.2.x, v3.3.0, v3.3.1, and v3.3.2 compact revision 1000, 1100, and 1200 for every 1-hour. Furthermore, when writes per minute are 1000, v3.3.0, v3.3.1, and v3.3.2 with `--auto-compaction-mode=periodic --auto-compaction-retention=30m` compact revision 30000, 33000, and 36000, for every 3-minute with more finer granularity. + +When `--auto-compaction-retention=10h`, etcd first waits 10-hour for the first compaction, and then does compaction every hour (1/10 of 10-hour) afterwards like this: + +``` +0Hr (rev = 1) +1hr (rev = 10) +... +8hr (rev = 80) +9hr (rev = 90) +10hr (rev = 100, Compact(1)) +11hr (rev = 110, Compact(10)) +... +``` + +Whether compaction succeeds or not, this process repeats for every 1/10 of given compaction period. If compaction succeeds, it just removes compacted revision from historical revision records. + +In [v3.3.3](https://github.com/etcd-io/etcd/blob/main/CHANGELOG/CHANGELOG-3.3.md), `--auto-compaction-mode=revision --auto-compaction-retention=1000` automatically `Compact` on `"latest revision" - 1000` every 5-minute (when latest revision is 30000, compact on revision 29000). Previously, `--auto-compaction-mode=periodic --auto-compaction-retention=72h` automatically `Compact` with 72-hour retention window for every 7.2-hour. **Now, `Compact` happens, for every 1-hour but still with 72-hour retention window.** Previously, `--auto-compaction-mode=periodic --auto-compaction-retention=30m` automatically `Compact` with 30-minute retention window for every 3-minute. **Now, `Compact` happens, for every 30-minute but still with 30-minute retention window.** Periodic compactor keeps recording latest revisions for every compaction period when given period is less than 1-hour, or for every 1-hour when given compaction period is greater than 1-hour (e.g. 1-hour when `--auto-compaction-mode=periodic --auto-compaction-retention=24h`). For every compaction period or 1-hour, compactor uses the last revision that was fetched before compaction period, to discard historical data. The retention window of compaction period moves for every given compaction period or hour. For instance, when hourly writes are 100 and `--auto-compaction-mode=periodic --auto-compaction-retention=24h`, `v3.2.x`, `v3.3.0`, `v3.3.1`, and `v3.3.2` compact revision 2400, 2640, and 2880 for every 2.4-hour, while `v3.3.3` *or later* compacts revision 2400, 2500, 2600 for every 1-hour. Furthermore, when `--auto-compaction-mode=periodic --auto-compaction-retention=30m` and writes per minute are about 1000, `v3.3.0`, `v3.3.1`, and `v3.3.2` compact revision 30000, 33000, and 36000, for every 3-minute, while `v3.3.3` *or later* compacts revision 30000, 60000, and 90000, for every 30-minute. + +## Defragmentation + +After compacting the keyspace, the backend database may exhibit internal fragmentation. Any internal fragmentation is space that is free to use by the backend but still consumes storage space. Compacting old revisions internally fragments `etcd` by leaving gaps in backend database. 
Fragmented space is available for use by `etcd` but unavailable to the host filesystem. In other words, deleting application data does not reclaim the space on disk. + +The process of defragmentation releases this storage space back to the file system. Defragmentation is issued on a per-member basis so that cluster-wide latency spikes may be avoided. + +To defragment an etcd member, use the `etcdctl defrag` command: + +```sh +$ etcdctl defrag +Finished defragmenting etcd member[127.0.0.1:2379] +``` + +**Note that defragmentation to a live member blocks the system from reading and writing data while rebuilding its states**. + +**Note that defragmentation request does not get replicated over cluster. That is, the request is only applied to the local node. Specify all members in `--endpoints` flag or `--cluster` flag to automatically find all cluster members.** + +Run defragment operations for all endpoints in the cluster associated with the default endpoint: + +```bash +$ etcdctl defrag --cluster +Finished defragmenting etcd member[http://127.0.0.1:2379] +Finished defragmenting etcd member[http://127.0.0.1:22379] +Finished defragmenting etcd member[http://127.0.0.1:32379] +``` + +To defragment an etcd data directory directly, while etcd is not running, use the command: + +``` sh +$ etcdutl defrag --data-dir +``` + +## Space quota + +The space quota in `etcd` ensures the cluster operates in a reliable fashion. Without a space quota, `etcd` may suffer from poor performance if the keyspace grows excessively large, or it may simply run out of storage space, leading to unpredictable cluster behavior. If the keyspace's backend database for any member exceeds the space quota, `etcd` raises a cluster-wide alarm that puts the cluster into a maintenance mode which only accepts key reads and deletes. Only after freeing enough space in the keyspace and defragmenting the backend database, along with clearing the space quota alarm can the cluster resume normal operation. + +By default, `etcd` sets a conservative space quota suitable for most applications, but it may be configured on the command line, in bytes: + +```sh +# set a very small 16 MiB quota +$ etcd --quota-backend-bytes=$((16*1024*1024)) +``` + +The space quota can be triggered with a loop: + +```sh +# fill keyspace +$ while [ 1 ]; do dd if=/dev/urandom bs=1024 count=1024 | ETCDCTL_API=3 etcdctl put key || break; done +... 
+Error: rpc error: code = 8 desc = etcdserver: mvcc: database space exceeded +# confirm quota space is exceeded +$ ETCDCTL_API=3 etcdctl --write-out=table endpoint status ++----------------+------------------+-----------+---------+-----------+-----------+------------+ +| ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | RAFT TERM | RAFT INDEX | ++----------------+------------------+-----------+---------+-----------+-----------+------------+ +| 127.0.0.1:2379 | bf9071f4639c75cc | 2.3.0+git | 18 MB | true | 2 | 3332 | ++----------------+------------------+-----------+---------+-----------+-----------+------------+ +# confirm alarm is raised +$ ETCDCTL_API=3 etcdctl alarm list +memberID:13803658152347727308 alarm:NOSPACE +``` + +Removing excessive keyspace data and defragmenting the backend database will put the cluster back within the quota limits: + +```sh +# get current revision +$ rev=$(ETCDCTL_API=3 etcdctl --endpoints=:2379 endpoint status --write-out="json" | egrep -o '"revision":[0-9]*' | egrep -o '[0-9].*') +# compact away all old revisions +$ ETCDCTL_API=3 etcdctl compact $rev +compacted revision 1516 +# defragment away excessive space +$ ETCDCTL_API=3 etcdctl defrag +Finished defragmenting etcd member[127.0.0.1:2379] +# disarm alarm +$ ETCDCTL_API=3 etcdctl alarm disarm +memberID:13803658152347727308 alarm:NOSPACE +# test puts are allowed again +$ ETCDCTL_API=3 etcdctl put newkey 123 +OK +``` + +The metric `etcd_mvcc_db_total_size_in_use_in_bytes` indicates the actual database usage after a history compaction, while `etcd_debugging_mvcc_db_total_size_in_bytes` shows the database size including free space waiting for defragmentation. The latter increases only when the former is close to it, meaning when both of these metrics are close to the quota, a history compaction is required to avoid triggering the space quota. + +`etcd_debugging_mvcc_db_total_size_in_bytes` is renamed to `etcd_mvcc_db_total_size_in_bytes` from v3.4. + +**NOTE:** it is possible to get an `ErrGRPCNoSpace` error for a Put/Txn/LeaseGrant request, and still have the write request succeed in the backend, because etcd checks space quota at the API layer and the internal Apply layer, and the Apply layer will only raise the `NOSPACE` alarm without blocking the transaction from proceeding. + +## Snapshot backup + +Snapshotting the `etcd` cluster on a regular basis serves as a durable backup for an etcd keyspace. By taking periodic snapshots of an etcd member's backend database, an `etcd` cluster can be recovered to a point in time with a known good state. + +A snapshot is taken with `etcdctl`: + +```sh +$ etcdctl snapshot save backup.db +$ etcdutl --write-out=table snapshot status backup.db ++----------+----------+------------+------------+ +| HASH | REVISION | TOTAL KEYS | TOTAL SIZE | ++----------+----------+------------+------------+ +| fe01cf57 | 10 | 7 | 2.1 MB | ++----------+----------+------------+------------+ +``` diff --git a/.claude/commands/etcd/etcd-ops-guide/monitoring.md b/.claude/commands/etcd/etcd-ops-guide/monitoring.md new file mode 100644 index 0000000..cde0f24 --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/monitoring.md @@ -0,0 +1,188 @@ +--- +title: Monitoring etcd +weight: 4500 +description: Monitoring etcd for system health & cluster debugging +--- + +Each etcd server provides local monitoring information on its client port through http endpoints. The monitoring data is useful for both system health checking and cluster debugging. 
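+
+For example, assuming an etcd server is listening on the default client port on localhost, the simplest probe is a plain HTTP request to the `/health` endpoint:
+
+```sh
+# Query the member's health over its client URL
+curl -L http://127.0.0.1:2379/health
+```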
+ +## Debug endpoint + +If `--log-level=debug` is set, the etcd server exports debugging information on its client port under the `/debug` path. Take care when setting `--log-level=debug`, since there will be degraded performance and verbose logging. + +The `/debug/pprof` endpoint is the standard go runtime profiling endpoint. This can be used to profile CPU, heap, mutex, and goroutine utilization. For example, here `go tool pprof` gets the top 10 functions where etcd spends its time: + +```sh +$ go tool pprof http://localhost:2379/debug/pprof/profile +Fetching profile from http://localhost:2379/debug/pprof/profile +Please wait... (30s) +Saved profile in /home/etcd/pprof/pprof.etcd.localhost:2379.samples.cpu.001.pb.gz +Entering interactive mode (type "help" for commands) +(pprof) top10 +310ms of 480ms total (64.58%) +Showing top 10 nodes out of 157 (cum >= 10ms) + flat flat% sum% cum cum% + 130ms 27.08% 27.08% 130ms 27.08% runtime.futex + 70ms 14.58% 41.67% 70ms 14.58% syscall.Syscall + 20ms 4.17% 45.83% 20ms 4.17% github.com/coreos/etcd/vendor/golang.org/x/net/http2/hpack.huffmanDecode + 20ms 4.17% 50.00% 30ms 6.25% runtime.pcvalue + 20ms 4.17% 54.17% 50ms 10.42% runtime.schedule + 10ms 2.08% 56.25% 10ms 2.08% github.com/coreos/etcd/vendor/github.com/coreos/etcd/etcdserver.(*EtcdServer).AuthInfoFromCtx + 10ms 2.08% 58.33% 10ms 2.08% github.com/coreos/etcd/vendor/github.com/coreos/etcd/etcdserver.(*EtcdServer).Lead + 10ms 2.08% 60.42% 10ms 2.08% github.com/coreos/etcd/vendor/github.com/coreos/etcd/pkg/wait.(*timeList).Trigger + 10ms 2.08% 62.50% 10ms 2.08% github.com/coreos/etcd/vendor/github.com/prometheus/client_golang/prometheus.(*MetricVec).hashLabelValues + 10ms 2.08% 64.58% 10ms 2.08% github.com/coreos/etcd/vendor/golang.org/x/net/http2.(*Framer).WriteHeaders +``` + +The `/debug/requests` endpoint gives gRPC traces and performance statistics through a web browser. For example, here is a `Range` request for the key `abc`: + +``` +When Elapsed (s) +2017/08/18 17:34:51.999317 0.000244 /etcdserverpb.KV/Range +17:34:51.999382 . 65 ... RPC: from 127.0.0.1:47204 deadline:4.999377747s +17:34:51.999395 . 13 ... recv: key:"abc" +17:34:51.999499 . 104 ... OK +17:34:51.999535 . 36 ... sent: header: kvs: count:1 +``` + +## Metrics endpoint + +Each etcd server exports metrics under the `/metrics` path on its client port and optionally on locations given by `--listen-metrics-urls`. + +The metrics can be fetched with `curl`: + +```sh +$ curl -L http://localhost:2379/metrics | grep -v debugging # ignore unstable debugging metrics + +# HELP etcd_disk_backend_commit_duration_seconds The latency distributions of commit called by backend. +# TYPE etcd_disk_backend_commit_duration_seconds histogram +etcd_disk_backend_commit_duration_seconds_bucket{le="0.002"} 72756 +etcd_disk_backend_commit_duration_seconds_bucket{le="0.004"} 401587 +etcd_disk_backend_commit_duration_seconds_bucket{le="0.008"} 405979 +etcd_disk_backend_commit_duration_seconds_bucket{le="0.016"} 406464 +... +``` + +## Health Check + +Since v3.3.0, in addition to responding to the `/metrics` endpoint, any locations specified by `--listen-metrics-urls` will also respond to the `/health` endpoint. This can be useful if the standard endpoint is configured with mutual (client) TLS authentication, but a load balancer or monitoring service still needs access to the health check. + +Since v3.4, two new endpoints `/livez` and `/readyz` are added. + +* the `/livez` endpoint reflects whether the process is alive or if it needs a restart. 
+* the `/readyz` endpoint reflects whether the process is ready to serve traffic.
+
+Design details of the endpoints are documented in the [KEP](https://github.com/kubernetes/enhancements/tree/master/keps/sig-etcd/4331-livez-readyz).
+
+Each endpoint includes several individual health checks, and you can use the `verbose` parameter to print out the details of the checks and their status, for example:
+
+```bash
+curl -k http://localhost:2379/readyz?verbose
+```
+
+and you will see a response similar to:
+
+```text
+[+]data_corruption ok
+[+]serializable_read ok
+[+]linearizable_read ok
+ok
+```
+
+The HTTP API also supports excluding specific checks, for example:
+
+```bash
+curl -k http://localhost:2379/readyz?exclude=data_corruption
+```
+
+## Prometheus
+
+Running a [Prometheus][prometheus] monitoring service is the easiest way to ingest and record etcd's metrics.
+
+First, install Prometheus:
+
+```sh
+PROMETHEUS_VERSION="2.0.0"
+wget https://github.com/prometheus/prometheus/releases/download/v$PROMETHEUS_VERSION/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz -O /tmp/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz
+tar -xvzf /tmp/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz --directory /tmp/ --strip-components=1
+/tmp/prometheus -version
+```
+
+Set Prometheus's scraper to target the etcd cluster endpoints, then start Prometheus in the background (the target addresses below are examples; substitute the client URLs of the cluster):
+
+```sh
+cat > /tmp/test-etcd.yaml <<EOF
+global:
+  scrape_interval: 10s
+
+scrape_configs:
+  - job_name: test-etcd
+    static_configs:
+      - targets: ['10.0.1.10:2379','10.0.1.11:2379','10.0.1.12:2379']
+EOF
+
+nohup /tmp/prometheus \
+  --config.file=/tmp/test-etcd.yaml \
+  --web.listen-address=":9090" \
+  --storage.tsdb.path="/tmp/test-etcd.data" >> /tmp/test-etcd.log 2>&1 &
+```
+
+Now Prometheus will scrape etcd metrics every 10 seconds.
+
+
+### Alerting
+
+There is a set of [default alerts](https://github.com/etcd-io/etcd/tree/main/contrib/mixin) for etcd v3 clusters for Prometheus.
+
+{{% alert title="Note" color="info" %}}
+Note that `job` labels may need to be adjusted to fit a particular need. The rules were written to apply to a single cluster so it is recommended to choose labels unique to a cluster.
+{{% /alert %}}
+
+### Grafana
+
+[Grafana][grafana] has built-in Prometheus support; just add a Prometheus data source:
+
+```
+Name: test-etcd
+Type: Prometheus
+Url: http://localhost:9090
+Access: proxy
+```
+
+Then import the default [etcd dashboard template][template] and customize. For instance, if the Prometheus data source name is `my-etcd`, the `datasource` field values in the JSON also need to be `my-etcd`.
+
+Sample dashboard:
+
+![](../etcd-sample-grafana.png)
+
+## Distributed tracing
+
+In v3.5, etcd added support for distributed tracing using [OpenTelemetry](https://github.com/open-telemetry).
+
+{{% alert title="Note" color="info" %}}
+This feature is still experimental and can change at any time.
+{{% /alert %}}
+
+To enable this experimental feature, pass the `--experimental-enable-distributed-tracing=true` flag to the etcd server, along with the `--experimental-distributed-tracing-sampling-rate=` flag to choose how many samples to collect per million spans; the default sampling rate is `0`.
+
+Configure distributed tracing by starting the etcd server with the following optional flags:
+
+- `--experimental-distributed-tracing-address` - (Optional) - "localhost:4317" - Address of the tracing collector.
+
+- `--experimental-distributed-tracing-service-name` - (Optional) - "etcd" - Distributed tracing service name, must be the same across all etcd instances.
+
+- `--experimental-distributed-tracing-instance-id` - (Optional) - Instance ID; while optional, it is strongly recommended to set it, and it must be unique per etcd instance.
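+
+Put together, a tracing-enabled server invocation might look like the following sketch; the collector address, instance ID, and sampling rate shown here are placeholder values:
+
+```sh
+# Placeholder collector address, instance ID, and sampling rate
+etcd --experimental-enable-distributed-tracing=true \
+  --experimental-distributed-tracing-address=localhost:4317 \
+  --experimental-distributed-tracing-service-name=etcd \
+  --experimental-distributed-tracing-instance-id=e2fdcd95-b02b-4e52-a521-6b76e7a1f30b \
+  --experimental-distributed-tracing-sampling-rate=100
+```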
+ +Before enabling the distributed tracing, make sure to have the OpenTelemetry endpoint, if that address differs to the default one, override with the `--experimental-distributed-tracing-address` flag. Due to OpenTelemetry having different ways of running, refer to the [collector documentation](https://opentelemetry.io/docs/collector/getting-started/) to learn more. + +{{% alert title="Note" color="info" %}} +There is a resource overhead, as with any observability signal, according to our initial measurements that overhead could be between 2% - 4% CPU overhead. +{{% /alert %}} + +[grafana]: http://grafana.org/ +[prometheus]: https://prometheus.io/ +[template]: ../grafana.json diff --git a/.claude/commands/etcd/etcd-ops-guide/recovery.md b/.claude/commands/etcd/etcd-ops-guide/recovery.md new file mode 100644 index 0000000..a8a7a9b --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/recovery.md @@ -0,0 +1,148 @@ +--- +title: Disaster recovery +weight: 4275 +description: etcd v3 snapshot & restore facilities +--- + +etcd is designed to withstand machine failures. An etcd cluster automatically recovers from temporary failures (e.g., machine reboots) and tolerates up to *(N-1)/2* permanent failures for a cluster of N members. When a member permanently fails, whether due to hardware failure or disk corruption, it loses access to the cluster. If the cluster permanently loses more than *(N-1)/2* members then it disastrously fails, irrevocably losing quorum. Once quorum is lost, the cluster cannot reach consensus and therefore cannot continue accepting updates. + +To recover from disastrous failure, etcd v3 provides snapshot and restore facilities to recreate the cluster without v3 key data loss. To recover v2 keys, refer to the [v2 admin guide][v2_recover]. + +[v2_recover]: /docs/v2.3/admin_guide#disaster-recovery + +## Snapshotting the keyspace + +Recovering a cluster first needs a snapshot of the keyspace from an etcd member. A snapshot may either be taken from a live member with the `etcdctl snapshot save` command or by copying the `member/snap/db` file from an etcd data directory. For example, the following command snapshots the keyspace served by `$ENDPOINT` to the file `snapshot.db`: + +```sh +$ ETCDCTL_API=3 etcdctl --endpoints $ENDPOINT snapshot save snapshot.db +``` + +Note that taking the snapshot from the `member/snap/db` file might lose data that has not been written yet, but is included in the wal (write-ahead-log) folder. + +## Status of a snapshot + +To understand which revision and hash a given snapshot contains, you can use the `etcdutl snapshot status` command: + +```sh +$ etcdutl snapshot status snapshot.db -w table ++---------+----------+------------+------------+ +| HASH | REVISION | TOTAL KEYS | TOTAL SIZE | ++---------+----------+------------+------------+ +| 7ef846e | 485261 | 11642 | 94 MB | ++---------+----------+------------+------------+ +``` + +## Restoring a cluster + +### Revision Difference + +When you are restoring a cluster, existing clients may perceive the revision going back by many hundreds or thousands. This is due to the fact that a given snapshot only contains the data lineage up until the point of when it was taken, whereas the current state might already be further ahead. + +This is particularly a problem when running Kubernetes using etcd, where controllers and operators may use so called `informers` which act as local caches and get notified on updates using watches. 
Restoring to an older revision may not correctly refresh the caches, causing unpredictable and inconsistent behavior in the controllers.
+
+When restoring from a snapshot in the context of known consumers of the watch API, locally cached copies of etcd data, or Kubernetes in general, it is highly recommended to restore using the "revision bumps" described below.
+
+### Restoring from snapshot
+
+To restore a cluster, all that is needed is a single snapshot "db" file. A cluster restore with `etcdutl snapshot restore` creates new etcd data directories; all members should restore using the same snapshot. Restoring overwrites some snapshot metadata (specifically, the member ID and cluster ID); the member loses its former identity. This metadata overwrite prevents the new member from inadvertently joining an existing cluster. Therefore, in order to start a cluster from a snapshot, the restore must start a new logical cluster.
+
+A simple restore can be executed like this:
+
+```sh
+$ etcdutl snapshot restore snapshot.db --data-dir output-dir
+```
+
+### Integrity Checks
+
+Snapshot integrity may be optionally verified at restore time. If the snapshot is taken with `etcdctl snapshot save`, it will have an integrity hash that is checked by `etcdutl snapshot restore`. If the snapshot is copied from the data directory, there is no integrity hash and it will only restore by using `--skip-hash-check`.
+
+
+### Restoring with revision bump
+
+In order to ensure the revisions are never decreasing after a restore, you can supply the `--bump-revision` option. This option takes a 64-bit integer, which denotes how many revisions to add to the current revision of the snapshot. Since each write to etcd increases the revision by one, you may cover a week-old snapshot by bumping by 1'000'000'000, assuming that etcd runs with fewer than 1500 writes per second.
+
+In the context of Kubernetes controllers, it is important to also mark all the revisions, including the bump, as compacted using `--mark-compacted`. This ensures that all watches are terminated and etcd does not respond to requests about revisions that happened after taking the snapshot, effectively invalidating its informer caches.
+
+A full invocation may look like this:
+
+```sh
+$ etcdutl snapshot restore snapshot.db --bump-revision 1000000000 --mark-compacted --data-dir output-dir
+```
+
+### Restoring with updated membership
+
+The members of an etcd cluster are stored in etcd itself and maintained through the raft consensus algorithm. When quorum is lost entirely, you may want to reconsider where and how the new cluster is formed, for example, on an entirely new set of members.
+
+When restoring from a snapshot, you can directly supply the new membership into the datastore as follows:
+
+```sh
+$ etcdutl snapshot restore snapshot.db \
+  --name m1 \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster-token etcd-cluster-1 \
+  --initial-advertise-peer-urls http://host1:2380
+```
+
+This ensures that the newly constructed cluster only connects to the other restored members with the given token and not to older members that might still be alive and try to connect.
+
+Alternatively, when starting up etcd, you can supply `--force-new-cluster` to overwrite cluster membership while keeping existing application data. Note that this is strongly discouraged because it will panic if other members from the previous cluster are still alive. Make sure to save snapshots periodically.
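+
+As a sketch of that discouraged alternative, a surviving member would be restarted on its existing data directory with the flag set; the member name and data directory here are illustrative, and every other member of the old cluster is assumed to be stopped:
+
+```sh
+# Discouraged: force a new one-member cluster from the existing data directory
+$ etcd --name m1 --data-dir m1_data_dir.etcd --force-new-cluster
+```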
+
+
+### End-2-End Example
+
+Grab a snapshot from a live cluster using:
+
+```sh
+$ etcdctl snapshot save snapshot.db
+```
+
+Continuing from the previous example, the following creates new etcd data directories (`m1_data_dir.etcd`, `m2_data_dir.etcd`, `m3_data_dir.etcd`) for a three member cluster:
+
+```sh
+$ etcdutl snapshot restore snapshot.db \
+  --name m1 \
+  --data-dir m1_data_dir.etcd \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster-token etcd-cluster-1 \
+  --initial-advertise-peer-urls http://host1:2380
+$ etcdutl snapshot restore snapshot.db \
+  --name m2 \
+  --data-dir m2_data_dir.etcd \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster-token etcd-cluster-1 \
+  --initial-advertise-peer-urls http://host2:2380
+$ etcdutl snapshot restore snapshot.db \
+  --name m3 \
+  --data-dir m3_data_dir.etcd \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster-token etcd-cluster-1 \
+  --initial-advertise-peer-urls http://host3:2380
+```
+
+Next, start `etcd` with the new data directories:
+
+```sh
+$ etcd \
+  --name m1 \
+  --data-dir m1_data_dir.etcd \
+  --listen-client-urls http://host1:2379 \
+  --advertise-client-urls http://host1:2379 \
+  --listen-peer-urls http://host1:2380 &
+$ etcd \
+  --name m2 \
+  --data-dir m2_data_dir.etcd \
+  --listen-client-urls http://host2:2379 \
+  --advertise-client-urls http://host2:2379 \
+  --listen-peer-urls http://host2:2380 &
+$ etcd \
+  --name m3 \
+  --data-dir m3_data_dir.etcd \
+  --listen-client-urls http://host3:2379 \
+  --advertise-client-urls http://host3:2379 \
+  --listen-peer-urls http://host3:2380 &
+```
+
+Now the restored etcd cluster should be available and serving the keyspace from the snapshot.
+
+Starting from etcd v3.6, users can only use `etcdctl` to save a snapshot of the data, but must use `etcdutl` to restore data from a snapshot. If `--data-dir` is not specified, the default `--data-dir` value is `<name>.etcd` (where `<name>` is the value from `--name`). For example, if `--data-dir` was not provided and the members were named `m1`, `m2`, and `m3`, the `--data-dir` directories would be `m1.etcd`, `m2.etcd`, and `m3.etcd`.
diff --git a/.claude/commands/etcd/etcd-ops-guide/runtime-configuration.md b/.claude/commands/etcd/etcd-ops-guide/runtime-configuration.md
new file mode 100644
index 0000000..cf03d5b
--- /dev/null
+++ b/.claude/commands/etcd/etcd-ops-guide/runtime-configuration.md
@@ -0,0 +1,252 @@
+---
+title: Runtime reconfiguration
+weight: 4700
+description: etcd incremental runtime reconfiguration support
+---
+
+etcd comes with support for incremental runtime reconfiguration, which allows users to update the membership of the cluster at run time.
+
+Reconfiguration requests can only be processed when a majority of cluster members are functioning. It is **highly recommended** to always have a cluster size greater than two in production. It is unsafe to remove a member from a two member cluster. The majority of a two member cluster is also two. If there is a failure during the removal process, the cluster might not be able to make progress and will need to [restart from majority failure][majority failure].
+
+To better understand the design behind runtime reconfiguration, please read [the runtime reconfiguration document][runtime-reconf].
+
+## Reconfiguration use cases
+
+This section will walk through some common reasons for reconfiguring a cluster. Most of these reasons just involve combinations of adding or removing a member, which are explained below under [Cluster Reconfiguration Operations][cluster-reconf].
+
+### Cycle or upgrade multiple machines
+
+If multiple cluster members need to move due to planned maintenance (hardware upgrades, network downtime, etc.), it is recommended to modify members one at a time.
+
+It is safe to remove the leader, however there is a brief period of downtime while the election process takes place. If the cluster holds more than 50MB of v2 data, it is recommended to [migrate the member's data directory][member migration].
+
+### Change the cluster size
+
+Increasing the cluster size can enhance [failure tolerance][fault tolerance table] and provide better read performance. Since clients can read from any member, increasing the number of members increases the overall serialized read throughput.
+
+Decreasing the cluster size can improve the write performance of a cluster, with a trade-off of decreased resilience. Writes into the cluster are replicated to a majority of members of the cluster before being considered committed. Decreasing the cluster size lowers the majority, and each write is committed more quickly.
+
+### Replace a failed machine
+
+If a machine fails due to hardware failure, data directory corruption, or some other fatal situation, it should be replaced as soon as possible. Machines that have failed but haven't been removed adversely affect the quorum and reduce the tolerance for an additional failure.
+
+To replace the machine, follow the instructions for [removing the member][remove member] from the cluster, and then [add a new member][add member] in its place. If the cluster holds more than 50MB, it is recommended to [migrate the failed member's data directory][member migration] if it is still accessible.
+
+### Restart cluster from majority failure
+
+If the majority of the cluster is lost or all of the nodes have changed IP addresses, then manual action is necessary to recover safely. The basic steps in the recovery process include [creating a new cluster using the old data][disaster recovery], forcing a single member to act as the leader, and finally using runtime configuration to [add new members][add member] to this new cluster one at a time.
+
+### Recover cluster from minority failure
+
+If a specific member is lost, then it is equivalent to replacing a failed machine. The steps are mentioned in [Replace a failed machine](../runtime-configuration/#replace-a-failed-machine).
+
+## Cluster reconfiguration operations
+
+With these use cases in mind, the involved operations can be described for each.
+
+Before making any change, a simple majority (quorum) of etcd members must be available. This is essentially the same requirement for any kind of write to etcd.
+
+All changes to the cluster must be done sequentially:
+
+* To update a single member's peerURLs, issue an update operation
+* To replace a healthy single member, remove the old member then add a new member
+* To increase from 3 to 5 members, issue two add operations
+* To decrease from 5 to 3, issue two remove operations
+
+All of these examples use the `etcdctl` command line tool that ships with etcd. To change membership without `etcdctl`, use the [v2 HTTP members API][member-api] or the [v3 gRPC members API][member-api-grpc].
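+
+Before issuing any of the membership changes below, it can be worth confirming that a quorum is currently healthy; a minimal sketch, with an illustrative endpoint address:
+
+```sh
+# Check the health of every member listed in the cluster's member list
+$ etcdctl --endpoints=http://10.0.1.10:2379 endpoint health --cluster
+```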
+ +### Update a member + +#### Update advertise client URLs + +To update the advertise client URLs of a member, simply restart that member with updated client urls flag (`--advertise-client-urls`) or environment variable (`ETCD_ADVERTISE_CLIENT_URLS`). The restarted member will self publish the updated URLs. A wrongly updated client URL will not affect the health of the etcd cluster. + +#### Update advertise peer URLs + +To update the advertise peer URLs of a member, first update it explicitly via member command and then restart the member. The additional action is required since updating peer URLs changes the cluster wide configuration and can affect the health of the etcd cluster. + +To update the advertise peer URLs, first find the target member's ID. To list all members with `etcdctl`: + +```sh +$ etcdctl member list +6e3bd23ae5f1eae0: name=node2 peerURLs=http://localhost:23802 clientURLs=http://127.0.0.1:23792 +924e2e83e93f2560: name=node3 peerURLs=http://localhost:23803 clientURLs=http://127.0.0.1:23793 +a8266ecf031671f3: name=node1 peerURLs=http://localhost:23801 clientURLs=http://127.0.0.1:23791 +``` + +This example will `update` a8266ecf031671f3 member ID and change its peerURLs value to `http://10.0.1.10:2380`: + +```sh +$ etcdctl member update a8266ecf031671f3 --peer-urls=http://10.0.1.10:2380 +Updated member with ID a8266ecf031671f3 in cluster +``` + +### Remove a member + +Suppose the member ID to remove is a8266ecf031671f3. Use the `remove` command to perform the removal: + +```sh +$ etcdctl member remove a8266ecf031671f3 +Removed member a8266ecf031671f3 from cluster +``` + +The target member will stop itself at this point and print out the removal in the log: + +``` +etcd: this member has been permanently removed from the cluster. Exiting. +``` + +It is safe to remove the leader, however the cluster will be inactive while a new leader is elected. This duration is normally the period of election timeout plus the voting process. + +### Add a new member + +Adding a member is a two step process: + + * Add the new member to the cluster via the [HTTP members API][member-api], the [gRPC members API][member-api-grpc], or the `etcdctl member add` command. + * Start the new member with the new cluster configuration, including a list of the updated members (existing members + the new member). + +`etcdctl` adds a new member to the cluster by specifying the member's [name][conf-name] and [advertised peer URLs][conf-adv-peer]: + +```sh +$ etcdctl member add infra3 --peer-urls=http://10.0.1.13:2380 +added member 9bf1b35fc7761a23 to cluster + +ETCD_NAME="infra3" +ETCD_INITIAL_CLUSTER="infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra3=http://10.0.1.13:2380" +ETCD_INITIAL_CLUSTER_STATE=existing +``` + +`etcdctl` has informed the cluster about the new member and printed out the environment variables needed to successfully start it. 
Now start the new etcd process with the relevant flags for the new member: + +```sh +$ export ETCD_NAME="infra3" +$ export ETCD_INITIAL_CLUSTER="infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra3=http://10.0.1.13:2380" +$ export ETCD_INITIAL_CLUSTER_STATE=existing +$ etcd --listen-client-urls http://10.0.1.13:2379 --advertise-client-urls http://10.0.1.13:2379 --listen-peer-urls http://10.0.1.13:2380 --initial-advertise-peer-urls http://10.0.1.13:2380 --data-dir %data_dir% +``` + +The new member will run as a part of the cluster and immediately begin catching up with the rest of the cluster. + +If adding multiple members the best practice is to configure a single member at a time and verify it starts correctly before adding more new members. If adding a new member to a 1-node cluster, the cluster cannot make progress before the new member starts because it needs two members as majority to agree on the consensus. This behavior only happens between the time `etcdctl member add` informs the cluster about the new member and the new member successfully establishing a connection to the existing one. + +#### Add a new member as learner + +Starting from v3.4, etcd supports adding a new member as learner / non-voting member. +The motivation and design can be found in [design doc][design-learner]. +In order to make the process of adding a new member safer, +and to reduce cluster downtime when the new member is added, it is recommended that the new member is added to cluster +as a learner until it catches up. This can be described as a three step process: + + * Add the new member as learner via [gRPC members API][member-api-grpc] or the `etcdctl member add --learner` command. + + * Start the new member with the new cluster configuration, including a list of the updated members (existing members + the new member). + This step is exactly the same as before. + + * Promote the newly added learner to voting member via [gRPC members API][member-api-grpc] or the `etcdctl member promote` command. + etcd server validates promote request to ensure its operational safety. + Only after its raft log has caught up to leader’s can learner be promoted to a voting member. + If a learner member has not caught up to leader's raft log, member promote request will fail + (see [error cases when promoting a member] section for more details). + In this case, user should wait and retry later. + +In v3.4, etcd server limits the number of learners that cluster can have to one. The main consideration is to limit the +extra workload on leader due to propagating data from leader to learner. + +Use `etcdctl member add` with flag `--learner` to add new member to cluster as learner. + +```sh +$ etcdctl member add infra3 --peer-urls=http://10.0.1.13:2380 --learner +Member 9bf1b35fc7761a23 added to cluster a7ef944b95711739 + +ETCD_NAME="infra3" +ETCD_INITIAL_CLUSTER="infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra3=http://10.0.1.13:2380" +ETCD_INITIAL_CLUSTER_STATE=existing +``` + +After new etcd process is started for the newly added learner member, use `etcdctl member promote` to promote learner to voting member. +``` +$ etcdctl member promote 9bf1b35fc7761a23 +Member 9e29bbaa45d74461 promoted in cluster a7ef944b95711739 +``` + +#### Error cases when adding members + +In the following case a new host is not included in the list of enumerated nodes. If this is a new cluster, the node must be added to the list of initial cluster members. 
+
+```sh
+$ etcd --name infra3 \
+  --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \
+  --initial-cluster-state existing
+etcdserver: assign ids error: the member count is unequal
+exit 1
+```
+
+In this case, the new member is started with an address (10.0.1.14:2380) that differs from the one registered when it joined the cluster (10.0.1.13:2380):
+
+```sh
+$ etcd --name infra4 \
+  --initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra4=http://10.0.1.14:2380 \
+  --initial-cluster-state existing
+etcdserver: assign ids error: unmatched member while checking PeerURLs
+exit 1
+```
+
+If etcd starts using the data directory of a removed member, etcd automatically exits if it connects to any active member in the cluster:
+
+```sh
+$ etcd
+etcd: this member has been permanently removed from the cluster. Exiting.
+exit 1
+```
+
+#### Error cases when adding a learner member
+
+A learner cannot be added if the cluster already has one learner (v3.4):
+
+```sh
+$ etcdctl member add infra4 --peer-urls=http://10.0.1.14:2380 --learner
+Error: etcdserver: too many learner members in cluster
+```
+
+#### Error cases when promoting a learner member
+
+A learner can only be promoted to a voting member if it is in sync with the leader:
+
+```sh
+$ etcdctl member promote 9bf1b35fc7761a23
+Error: etcdserver: can only promote a learner member which is in sync with leader
+```
+
+Promoting a member that is not a learner will fail:
+
+```sh
+$ etcdctl member promote 9bf1b35fc7761a23
+Error: etcdserver: can only promote a learner member
+```
+
+Promoting a member that does not exist in the cluster will fail:
+
+```sh
+$ etcdctl member promote 12345abcde
+Error: etcdserver: member not found
+```
+
+### Strict reconfiguration check mode (`-strict-reconfig-check`)
+
+As described above, the best practice for adding new members is to configure a single member at a time and verify that it starts correctly before adding more. This step-by-step approach is important because a newly added member that is misconfigured (for example, with incorrect peer URLs) can cause the cluster to lose quorum: the new member is counted toward quorum even if it is not reachable from the other existing members. Quorum loss can also happen if there is a connectivity issue or other operational problems.
+
+To avoid this problem, etcd provides the `-strict-reconfig-check` option. If this option is passed to etcd, etcd rejects reconfiguration requests if the number of started members would be less than a quorum of the reconfigured cluster.
+
+It is enabled by default.
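+
+As an illustration, the check can be toggled explicitly at startup; the exact flag spelling can vary by version (recent releases also accept the double-dash form), and the member name and data directory below are placeholders:
+
+```sh
+# Start a member with strict reconfiguration checking explicitly disabled.
+# This weakens the quorum safety check and is generally not recommended.
+$ etcd --name infra0 \
+  --data-dir /var/lib/etcd \
+  --strict-reconfig-check=false
+```
+
+Leaving the option at its default (`true`) is the safer choice for the reasons described above.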
+ +[add member]: #add-a-new-member +[cluster-reconf]: #cluster-reconfiguration-operations +[conf-adv-peer]: ../configuration#clustering +[conf-name]: ../configuration#member +[design-learner]: ../../learning/design-learner +[disaster recovery]: ../recovery +[error cases when promoting a member]: #error-cases-when-promoting-a-learner-member +[fault tolerance table]: /docs/v2.3/admin_guide/#fault-tolerance-table +[majority failure]: #restart-cluster-from-majority-failure +[member migration]: /docs/v2.3/admin_guide/#member-migration +[member-api]: /docs/v2.3/members_api/ +[member-api-grpc]: ../../dev-guide/api_reference_v3/ +[remove member]: #remove-a-member +[runtime-reconf]: ../runtime-reconf-design/ diff --git a/.claude/commands/etcd/etcd-ops-guide/runtime-reconf-design.md b/.claude/commands/etcd/etcd-ops-guide/runtime-reconf-design.md new file mode 100644 index 0000000..7dd4d8e --- /dev/null +++ b/.claude/commands/etcd/etcd-ops-guide/runtime-reconf-design.md @@ -0,0 +1,54 @@ +--- +title: Design of runtime reconfiguration +weight: 4650 +description: The design of etcd’s runtime reconfiguration commands +--- + +Runtime reconfiguration is one of the hardest and most error prone features in a distributed system, especially in a consensus based system like etcd. + +Read on to learn about the design of etcd's runtime reconfiguration commands and how we tackled these problems. + +## Two phase config changes keep the cluster safe + +In etcd, every runtime reconfiguration has to go through [two phases][add-member] for safety reasons. For example, to add a member, first inform the cluster of the new configuration and then start the new member. + +Phase 1 - Inform cluster of new configuration + +To add a member into an etcd cluster, make an API call to request a new member to be added to the cluster. This is the only way to add a new member into an existing cluster. The API call returns when the cluster agrees on the configuration change. + +Phase 2 - Start new member + +To join the new etcd member into the existing cluster, specify the correct `initial-cluster` and set `initial-cluster-state` to `existing`. When the member starts, it will contact the existing cluster first and verify the current cluster configuration matches the expected one specified in `initial-cluster`. When the new member successfully starts, the cluster has reached the expected configuration. + +By splitting the process into two discrete phases users are forced to be explicit regarding cluster membership changes. This actually gives users more flexibility and makes things easier to reason about. For example, if there is an attempt to add a new member with the same ID as an existing member in an etcd cluster, the action will fail immediately during phase one without impacting the running cluster. Similar protection is provided to prevent adding new members by mistake. If a new etcd member attempts to join the cluster before the cluster has accepted the configuration change, it will not be accepted by the cluster. + +Without the explicit workflow around cluster membership etcd would be vulnerable to unexpected cluster membership changes. For example, if etcd is running under an init system such as systemd, etcd would be restarted after being removed via the membership API, and attempt to rejoin the cluster on startup. This cycle would continue every time a member is removed via the API and systemd is set to restart etcd after failing, which is unexpected. + +We expect runtime reconfiguration to be an infrequent operation. 
We decided to keep it explicit and user-driven to ensure configuration safety and to keep the cluster running smoothly under explicit control.
+
+## Permanent loss of quorum requires new cluster
+
+If a cluster permanently loses a majority of its members, a new cluster will need to be started from an old data directory to recover the previous state.
+
+It is entirely possible to recover by force-removing the failed members from the existing cluster. However, we decided not to support this method since it bypasses the normal consensus committing phase, which is unsafe. If the member to remove is not actually dead, or the force removal is issued through different members of the same cluster, etcd will end up with a diverged cluster sharing the same cluster ID. This is very dangerous and hard to debug or fix afterwards.
+
+With a correct deployment, the possibility of permanent majority loss is very low. But it is a severe enough problem that it is worth special care. We strongly suggest reading the [disaster recovery documentation][disaster-recovery] and preparing for permanent majority loss before putting etcd into production.
+
+## Do not use public discovery service for runtime reconfiguration
+
+The public discovery service should only be used for bootstrapping a cluster. To join a member to an existing cluster, use the runtime reconfiguration API.
+
+The discovery service is designed for bootstrapping an etcd cluster in a cloud environment, where the IP addresses of all the members are not known beforehand. After successfully bootstrapping a cluster, the IP addresses of all the members are known, so technically the discovery service should no longer be needed.
+
+Using the public discovery service may seem like a convenient way to do runtime reconfiguration, since the discovery service already holds all of the cluster configuration information. However, relying on the public discovery service brings several problems:
+
+1. It introduces an external dependency for the entire life cycle of the cluster, not just at bootstrap time. If there is a network issue between the cluster and the public discovery service, the cluster will suffer from it.
+
+2. The public discovery service must reflect the correct runtime configuration of the cluster throughout its life cycle. It has to provide security mechanisms to prevent bad actions, and that is hard.
+
+3. The public discovery service would have to keep tens of thousands of cluster configurations. Our public discovery service backend is not ready for that workload.
+
+To have a discovery service that supports runtime reconfiguration, the best choice is to build a private one.
+
+[add-member]: ../runtime-configuration/#add-a-new-member
+[disaster-recovery]: ../recovery/
diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/administrative.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/administrative.rst
new file mode 100644
index 0000000..6add435
--- /dev/null
+++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/administrative.rst
@@ -0,0 +1,150 @@
+.. index::
+   single: administrative mode
+
+Administrative Modes
+--------------------
+
+Intrusive administration can be performed on a Pacemaker cluster without
+causing resource failures, recovery, and fencing, by putting the cluster or a
+subset of it into an administrative mode.
+ +Pacemaker supports several administrative modes: + +* Maintenance mode for the entire cluster, specific nodes, or specific + resources +* Unmanaged resources +* Disabled configuration items +* Standby mode for specific nodes + +Rules may be used to automatically set any of these modes for specific times or +other conditions. + + +.. index:: + pair: administrative mode; maintenance mode + +.. _maintenance_mode: + +Maintenance Mode +################ + +In maintenance mode, the cluster will not start or stop resources. Recurring +monitors for affected resources will be paused, except those specifying +``role`` as ``Stopped``. + +To put a specific resource into maintenance mode, set the resource's +``maintenance`` meta-attribute to ``true``. + +To put all active resources on a specific node into maintenance mode, set the +node's ``maintenance`` node attribute to ``true``. When enabled, this overrides +resource-specific maintenance mode. + +.. warning:: + + Restarting Pacemaker on a node that is in single-node maintenance mode will + likely lead to undesirable effects. If ``maintenance`` is set as a transient + attribute, it will be erased when Pacemaker is stopped, which will + immediately take the node out of maintenance mode and likely get it fenced. + If set as a permanent attribute, any resources active on the node will have + their local history erased when Pacemaker is restarted, so the cluster will + no longer consider them running on the node and thus will consider them + managed again, allowing them to be started elsewhere. + +To put all resources in the cluster into maintenance mode, set the +``maintenance-mode`` cluster option to ``true``. When enabled, this overrides +node- or resource- specific maintenance mode. + +Maintenance mode, at any level, overrides other administrative modes. + + +.. index:: + pair: administrative mode; unmanaged resources + +.. _unmanaged_resources: + +Unmanaged Resources +################### + +An unmanaged resource will not be started or stopped by the cluster. A resource +may become unmanaged in several ways: + +* The administrator may set the ``is-managed`` resource meta-attribute to + ``false`` (whether for a specific resource, or all resources without an + explicit setting via ``rsc_defaults``) +* :ref:`Maintenance mode ` causes affected resources to + become unmanaged (and overrides any ``is-managed`` setting) +* Certain types of failure cause affected resources to become unmanaged. These + include: + + * Failed stop operations when the ``fencing-enabled`` cluster property is set + to ``false`` + * Failure of an operation that has ``on-fail`` set to ``block`` + * A resource detected as incorrectly active on more than one node when its + ``multiple-active`` meta-attribute is set to ``block`` + * A resource constrained by a revoked ``rsc_ticket`` with ``loss-policy`` set + to ``freeze`` + * Resources with ``requires`` set (or defaulting) to anything other than + ``nothing`` in a partition that loses quorum when the ``no-quorum-policy`` + cluster option is set to ``freeze`` + +Recurring actions are not affected by unmanaging a resource. + +.. warning:: + + Manually starting an unmanaged resource on a different node is strongly + discouraged. It will at least cause the cluster to consider the resource + failed, and may require the resource's ``target-role`` to be set to + ``Stopped`` then ``Started`` in order for recovery to succeed. + + +.. index:: + pair: administrative mode; disabled configuration + +.. 
_disabled_configuration: + +Disabled Configuration +###################### + +Some configuration elements disable particular behaviors: + +* The ``fencing-enabled`` cluster option, when set to ``false``, disables node + fencing. This is highly discouraged, as it can lead to data unavailability, + loss, or corruption. + +* The ``stop-all-resources`` cluster option, when set to ``true``, causes all + resources to be stopped. + +* Certain elements support an ``enabled`` meta-attribute, which if set to + ``false``, causes the cluster to act as if the specific element is not + configured. These include ``op``, ``alert`` *(since 2.1.6)*, and + ``recipient`` *(since 2.1.6)*. ``enabled`` may be set for specific ``op`` + elements, or all operations without an explicit setting via ``op_defaults``. + + +.. index:: + pair: administrative mode; standby + +.. _standby: + +Standby Mode +############ + +When a node is put into standby, all resources will be moved away from the +node, and all recurring operations will be stopped on the node, except those +specifying ``role`` as ``Stopped`` (which will be newly initiated if +appropriate). + +A node may be put into standby mode by setting its ``standby`` node attribute +to ``true``. The attribute may be queried and set using the ``crm_standby`` +tool. + + +.. index:: + pair: administrative mode; rules + +Rules +##### + +Rules may be used to set administrative mode options automatically according to +various criteria such as date and time. See the "Rules" chapter of the +*Pacemaker Explained* document for details. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/agents.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/agents.rst new file mode 100644 index 0000000..c85c14d --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/agents.rst @@ -0,0 +1,1182 @@ +.. index:: + single: resource agent + +Resource Agents +--------------- + + +Action Completion +################# + +If one resource depends on another resource via constraints, the cluster will +interpret an expected result as sufficient to continue with dependent actions. +This may cause timing issues if the resource agent start returns before the +service is not only launched but fully ready to perform its function, or if the +resource agent stop returns before the service has fully released all its +claims on system resources. At a minimum, the start or stop should not return +before a status command would return the expected (started or stopped) result. + + +.. index:: + single: OCF resource agent + single: resource agent; OCF + +OCF Resource Agents +################### + +.. index:: + single: OCF resource agent; location + +Location of Custom Scripts +__________________________ + +OCF Resource Agents are found in ``/usr/lib/ocf/resource.d/$PROVIDER`` + +When creating your own agents, you are encouraged to create a new directory +under ``/usr/lib/ocf/resource.d/`` so that they are not confused with (or +overwritten by) the agents shipped by existing providers. + +So, for example, if you choose the provider name of big-corp and want a new +resource named big-app, you would create a resource agent called +``/usr/lib/ocf/resource.d/big-corp/big-app`` and define a resource: + +.. code-block: xml + + + + +.. index:: + single: OCF resource agent; action + +Actions +_______ + +All OCF resource agents are required to implement the following actions. + +.. 
list-table:: **Required Actions for OCF Agents** + :class: longtable + :widths: 15 25 60 + :header-rows: 1 + + * - Action + - Description + - Instructions + * - .. _start_action: + + .. index:: + single: OCF resource agent; start + single: start action + + start + - Start the resource + - Return :ref:`OCF_SUCCESS ` on success and an appropriate + error code otherwise. Must not report success until the resource is fully + active. + * - .. _stop_action: + + .. index:: + single: OCF resource agent; stop + single: stop action + + stop + - Stop the resource + - Return :ref:`OCF_SUCCESS ` on success and an appropriate + error code otherwise. Must not report success until the resource is fully + stopped. + * - .. _monitor_action: + + .. index:: + single: OCF resource agent; monitor + single: monitor action + + monitor + - Check the resource's state + - Return :ref:`OCF_SUCCESS ` if the resource is running, + :ref:`OCF_NOT_RUNNING ` if it is stopped, and any other + :ref:`OCF exit code ` if it is failed. **Note:** The + monitor action should test the state of the resource on the local machine + only. + * - .. _meta_data_action: + + .. index:: + single: OCF resource agent; meta-data + single: meta-data action + + meta-data + - Describe the resource + - Provide information about this resource in the XML format defined by the + OCF standard. Return :ref:`OCF_SUCCESS `. **Note:** This is + *not* required to be performed as root. + +OCF resource agents may optionally implement additional actions. Some are used +only with advanced resource types such as clones. + +.. list-table:: **Optional Actions for OCF Resource Agents** + :class: longtable: + :widths: 15 45 40 + :header-rows: 1 + + * - Action + - Description + - Instructions + * - .. _validate_all_action: + + .. index:: + single: OCF resource agent; validate-all + single: validate-all action + + validate-all + - Validate the instance parameters provided. + - Return :ref:`OCF_SUCCESS ` if parameters are valid, + :ref:`OCF_ERR_ARGS ` if not valid, and + :ref:`OCF_ERR_CONFIGURED ` if resource is not + configured. + * - .. _promote_action: + + .. index:: + single: OCF resource agent; promote + single: promote action + + promote + - Bring the local instance of a promotable clone resource to the promoted + role. + - Return :ref:`OCF_SUCCESS ` on success. + * - .. _demote_action: + + .. index:: + single: OCF resource agent; demote + single: demote action + + demote + - Bring the local instance of a promotable clone resource to the unpromoted + role. + - Return :ref:`OCF_SUCCESS ` on success. + * - .. _notify_action: + + .. index:: + single: OCF resource agent; notify + single: notify action + + notify + - Used by the cluster to send the agent pre- and post-notification events + telling the resource what has happened and what will happen. + - Must not fail. Must return :ref:`OCF_SUCCESS `. + * - .. _reload_action: + + .. index:: + single: OCF resource agent; reload + single: reload action + + reload + - Reload the service's own configuration. + - Not used by Pacemaker. + * - .. _reload_agent_action: + + .. index:: + single: OCF resource agent; reload-agent + single: reload-agent action + + reload-agent + - Make effective any changes in instance parameters marked as reloadable in + the agent's meta-data. + - This is used when the agent can handle a change in some of its parameters + more efficiently than stopping and starting the resource. + * - .. _recover_action: + + .. 
index:: + single: OCF resource agent; recover + single: recover action + + recover + - Restart the service. + - Not used by Pacemaker. + +.. important:: + + If you create a new OCF resource agent, use `ocf-tester` to verify that the + agent complies with the OCF standard properly. + + +.. index:: + single: OCF resource agent; return code + +How Are OCF Return Codes Interpreted? +_____________________________________ + +The first thing the cluster does is to check the return code against the +expected result. If the result does not match the expected value, then the +operation is considered to have failed, and recovery action is initiated. + +There are three types of failure recovery: + +.. list-table:: **Types of Recovery Performed by the Cluster** + :class: longtable + :widths: 10 45 45 + :header-rows: 1 + + * - Type + - Description + - Action Taken by the Cluster + * - .. _soft_error: + + .. index:: + single: OCF resource agent; soft error + + soft + - A transient error + - Restart the resource or move it to a new location + * - .. _hard_error: + + .. index:: + single: OCF resource agent; hard error + + hard + - A non-transient error that may be specific to the current node + - Move the resource elsewhere and prevent it from being retried on the + current node + * - .. _fatal_error: + + .. index:: + single: OCF resource agent; fatal error + + fatal + - A non-transient error that will be common to all cluster nodes (for + example, a bad configuration was specified) + - Stop the resource and prevent it from being started on any cluster node + +.. _ocf_return_codes: + +OCF Return Codes +________________ + +The following table outlines the various OCF return codes and the type of +recovery the cluster will initiate when a failure code is received. Although +counterintuitive, even actions that return ``OCF_SUCCESS`` can be considered to +have failed, if ``OCF_SUCCESS`` was not the expected return value. + +.. list-table:: **OCF Exit Codes and Their Recovery Types** + :class: longtable + :widths: 8 32 50 10 + :header-rows: 1 + + * - Exit Code + - OCF Alias + - Description + - Recovery + * - .. _OCF_SUCCESS: + + .. index:: + single: OCF_SUCCESS + single: OCF return code; OCF_SUCCESS + pair: OCF return code; 0 + + 0 + - OCF_SUCCESS + - Success. The command completed successfully. This is the expected result + for all start, stop, promote, and demote actions. + - :ref:`soft ` + * - .. _OCF_ERR_GENERIC: + + .. index:: + single: OCF_ERR_GENERIC + single: OCF return code; OCF_ERR_GENERIC + pair: OCF return code; 1 + + 1 + - OCF_ERR_GENERIC + - Generic "there was a problem" error code. + - :ref:`hard ` + * - .. _OCF_ERR_ARGS: + + .. index:: + single: OCF_ERR_ARGS + single: OCF return code; OCF_ERR_ARGS + pair: OCF return code; 2 + + 2 + - OCF_ERR_ARGS + - The resource's parameter values are not valid on this machine (for + example, a value refers to a file not found on the local host). + - :ref:`hard ` + * - .. _OCF_ERR_UNIMPLEMENTED: + + .. index:: + single: OCF_ERR_UNIMPLEMENTED + single: OCF return code; OCF_ERR_UNIMPLEMENTED + pair: OCF return code; 3 + + 3 + - OCF_ERR_UNIMPLEMENTED + - The requested action is not implemented. + - :ref:`hard ` + * - .. _OCF_ERR_PERM: + + .. index:: + single: OCF_ERR_PERM + single: OCF return code; OCF_ERR_PERM + pair: OCF return code; 4 + + 4 + - OCF_ERR_PERM + - The resource agent does not have sufficient privileges to complete the + task. + - :ref:`hard ` + * - .. _OCF_ERR_INSTALLED: + + .. 
index:: + single: OCF_ERR_INSTALLED + single: OCF return code; OCF_ERR_INSTALLED + pair: OCF return code; 5 + + 5 + - OCF_ERR_INSTALLED + - The tools required by the resource are not installed on this machine. + - :ref:`hard ` + * - .. _OCF_ERR_CONFIGURED: + + .. index:: + single: OCF_ERR_CONFIGURED + single: OCF return code; OCF_ERR_CONFIGURED + pair: OCF return code; 6 + + 6 + - OCF_ERR_CONFIGURED + - The resource's parameter values are inherently invalid (for example, a + required parameter was not given). + - :ref:`fatal ` + * - .. _OCF_NOT_RUNNING: + + .. index:: + single: OCF_NOT_RUNNING + single: OCF return code; OCF_NOT_RUNNING + pair: OCF return code; 7 + + 7 + - OCF_NOT_RUNNING + - The resource is safely stopped. This should only be returned by monitor + actions, not stop actions. + - N/A + * - .. _OCF_RUNNING_PROMOTED: + + .. index:: + single: OCF_RUNNING_PROMOTED + single: OCF return code; OCF_RUNNING_PROMOTED + pair: OCF return code; 8 + + 8 + - OCF_RUNNING_PROMOTED + - The resource is running in the promoted role. + - :ref:`soft ` + * - .. _OCF_FAILED_PROMOTED: + + .. index:: + single: OCF_FAILED_PROMOTED + single: OCF return code; OCF_FAILED_PROMOTED + pair: OCF return code; 9 + + 9 + - OCF_FAILED_PROMOTED + - The resource is (or might be) in the promoted role but has failed. The + resource will be demoted, stopped, and then started (and possibly + promoted) again. + - :ref:`soft ` + * - .. _OCF_DEGRADED: + + .. index:: + single: OCF_DEGRADED + single: OCF return code; OCF_DEGRADED + pair: OCF return code; 190 + + 190 + - OCF_DEGRADED + - The resource is properly active, but in such a condition that future + failures are more likely. + - none + * - .. _OCF_DEGRADED_PROMOTED: + + .. index:: + single: OCF_DEGRADED_PROMOTED + single: OCF return code; OCF_DEGRADED_PROMOTED + pair: OCF return code; 191 + + 191 + - OCF_DEGRADED_PROMOTED + - The resource is properly active in the promoted role, but in such a + condition that future failures are more likely. + - none + * - other + - *none* + - Custom error code. + - soft + +Exceptions to the recovery handling described above: + +* Probes (non-recurring monitor actions) that find a resource active + (or in the promoted role) will not result in recovery action unless it is + also found active elsewhere. +* The recovery action taken when a resource is found active more than + once is determined by the resource's ``multiple-active`` property. +* Recurring actions that return ``OCF_ERR_UNIMPLEMENTED`` + do not cause any type of recovery. +* Actions that return one of the "degraded" codes will be treated the same as + if they had returned success, but status output will indicate that the + resource is degraded. + +.. _ocf_env_vars: + +Environment Variables +_____________________ + +Pacemaker sets certain environment variables when it executes an OCF resource +agent. Agents can check these variables to get information about resource +parameters or the execution environment. + +**Note:** Pacemaker may set other environment variables for its own purposes. +They may be present in the agent's environment, but Pacemaker is not providing +them for the agent's use, and so the agent should not rely on any variables not +listed in the table below. + +.. list-table:: **OCF Environment Variables** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Environment Variable + - Description + * - .. _OCF_CHECK_LEVEL: + + .. 
index:: + single: OCF_CHECK_LEVEL + single: environment variable; OCF_CHECK_LEVEL + + OCF_CHECK_LEVEL + - Requested intensity level of checks in ``monitor`` and ``validate-all`` + actions. Usually set as an operation attribute; see Pacemaker Explained + for an example. + * - .. _OCF_EXIT_REASON_PREFIX: + + .. index:: + single: OCF_EXIT_REASON_PREFIX + single: environment variable; OCF_EXIT_REASON_PREFIX + + OCF_EXIT_REASON_PREFIX + - Prefix for printing fatal error messages from the resource agent. + * - .. _OCF_RA_VERSION_MAJOR: + + .. index:: + single: OCF_RA_VERSION_MAJOR + single: environment variable; OCF_RA_VERSION_MAJOR + + OCF_RA_VERSION_MAJOR + - Major version number of the OCF Resource Agent API. If the script does + not support this revision, it should report an error. + See the `OCF specification `_ for an + explanation of the versioning scheme used. The version number is split + into two numbers for ease of use in shell scripts. These two may be used + by the agent to determine whether it is run under an OCF-compliant + resource manager. + * - .. _OCF_RA_VERSION_MINOR: + + .. index:: + single: OCF_RA_VERSION_MINOR + single: environment variable; OCF_RA_VERSION_MINOR + + OCF_RA_VERSION_MINOR + - Minor version number of the OCF Resource Agent API. See + :ref:`OCF_RA_VERSION_MAJOR ` for more details. + * - .. _OCF_RESKEY_crm_feature_set: + + .. index:: + single: OCF_RESKEY_crm_feature_set + single: environment variable; OCF_RESKEY_crm_feature_set + + OCF_RESKEY_crm_feature_set + - ``crm_feature_set`` on the DC (or on the local node, if the agent is run + by ``crm_resource``). + * - .. _OCF_RESKEY_CRM_meta_interval: + + .. index:: + single: OCF_RESKEY_CRM_meta_interval + single: environment variable; OCF_RESKEY_CRM_meta_interval + + OCF_RESKEY_CRM_meta_interval + - Interval (in milliseconds) of the current operation. + * - .. _OCF_RESKEY_CRM_meta_name: + + .. index:: + single: OCF_RESKEY_CRM_meta_name + single: environment variable; OCF_RESKEY_CRM_meta_name + + OCF_RESKEY_CRM_meta_name + - Name of the current operation. + * - .. _OCF_RESKEY_CRM_meta_notify: + + .. index:: + single: OCF_RESKEY_CRM_meta_notify_* + single: environment variable; OCF_RESKEY_CRM_meta_notify_* + + OCF_RESKEY_CRM_meta_notify_* + - See :ref:`Clone Notifications `. + * - .. _OCF_RESKEY_CRM_meta_on_node: + + .. index:: + single: OCF_RESKEY_CRM_meta_on_node + single: environment variable; OCF_RESKEY_CRM_meta_on_node + + OCF_RESKEY_CRM_meta_on_node + - Name of the node where the current operation is running. + * - .. _OCF_RESKEY_CRM_meta_on_node_uuid: + + .. index:: + single: OCF_RESKEY_CRM_meta_on_node_uuid + single: environment variable; OCF_RESKEY_CRM_meta_on_node_uuid + + OCF_RESKEY_CRM_meta_on_node_uuid + - Cluster-layer ID of the node where the current operation is running (or + node name for Pacemaker Remote nodes). + * - .. _OCF_RESKEY_CRM_meta_physical_host: + + .. index:: + single: OCF_RESKEY_CRM_meta_physical_host + single: environment variable; OCF_RESKEY_CRM_meta_physical_host + + OCF_RESKEY_CRM_meta_physical_host + - If the node where the current operation is running is a guest node, the + host on which the container is running. + * - .. _OCF_RESKEY_CRM_meta_timeout: + + .. index:: + single: OCF_RESKEY_CRM_meta_timeout + single: environment variable; OCF_RESKEY_CRM_meta_timeout + + OCF_RESKEY_CRM_meta_timeout + - Timeout (in milliseconds) of the current operation. + * - .. _OCF_RESKEY_CRM_meta: + + .. 
index:: + single: OCF_RESKEY_CRM_meta_* + single: environment variable; OCF_RESKEY_CRM_meta_* + + OCF_RESKEY_CRM_meta_* + - Each of a resource's meta-attributes is converted to an environment + variable prefixed with "OCF_RESKEY_CRM_meta\_". See Pacemaker Explained + for some meta-attributes that have special meaning to Pacemaker. + * - .. _OCF_RESKEY: + + .. index:: + single: OCF_RESKEY_* + single: environment variable; OCF_RESKEY_* + + OCF_RESKEY_* + - Each of a resource's instance parameters is converted to an environment + variable prefixed with "OCF_RESKEY\_". + * - .. _OCF_RESOURCE_INSTANCE: + + .. index:: + single: OCF_RESOURCE_INSTANCE + single: environment variable; OCF_RESOURCE_INSTANCE + + OCF_RESOURCE_INSTANCE + - The name of the resource instance. + * - .. _OCF_RESOURCE_PROVIDER: + + .. index:: + single: OCF_RESOURCE_PROVIDER + single: environment variable; OCF_RESOURCE_PROVIDER + + OCF_RESOURCE_PROVIDER + - The name of the resource agent provider. + * - .. _OCF_RESOURCE_TYPE: + + .. index:: + single: OCF_RESOURCE_TYPE + single: environment variable; OCF_RESOURCE_TYPE + + OCF_RESOURCE_TYPE + - The name of the resource type. + * - .. _OCF_ROOT: + + .. index:: + single: OCF_ROOT + single: environment variable; OCF_ROOT + + OCF_ROOT + - The root of the OCF directory hierarchy. + * - .. _OCF_TRACE_FILE: + + .. index:: + single: OCF_TRACE_FILE + single: environment variable; OCF_TRACE_FILE + + OCF_TRACE_FILE + - The absolute path or file descriptor to write trace output to, if + ``OCF_TRACE_RA`` is set to true. Pacemaker sets this only to + ``/dev/stderr`` and only when running a resource agent via + ``crm_resource``. + * - .. _OCF_TRACE_RA: + + .. index:: + single: OCF_TRACE_RA + single: environment variable; OCF_TRACE_RA + + OCF_TRACE_RA + - If set to true, enable tracing of the resource agent. Trace output is + written to ``OCF_TRACE_FILE`` if set; otherwise, it's written to a file + in ``OCF_RESKEY_trace_dir`` if set or in a default directory if not. + Pacemaker sets this to true only when running a resource agent via + ``crm_resource`` with one or more ``-V`` flags. + * - .. _PCMK_DEBUGLOG: + .. _HA_DEBUGLOG: + + .. index:: + single: PCMK_DEBUGLOG + single: environment variable; PCMK_DEBUGLOG + single: HA_DEBUGLOG + single: environment variable; HA_DEBUGLOG + + PCMK_DEBUGLOG (and HA_DEBUGLOG) + - Where to write resource agent debug logs. Pacemaker sets this to + ``PCMK_logfile`` if set to a value other than ``none`` and if debugging + is enabled for the executor. + * - .. _PCMK_LOGFACILITY: + .. _HA_LOGFACILITY: + + .. index:: + single: PCMK_LOGFACILITY + single: environment variable; PCMK_LOGFACILITY + single: HA_LOGFACILITY + single: environment variable; HA_LOGFACILITY + + PCMK_LOGFACILITY (and HA_LOGFACILITY) + - Syslog facility for resource agent logs. Pacemaker sets this to + ``PCMK_logfacility`` if set to a value other than ``none`` or + ``/dev/null``. + * - .. _PCMK_LOGFILE: + .. _HA_LOGFILE: + + .. index:: + single: PCMK_LOGFILE: + single: environment variable; PCMK_LOGFILE: + single: HA_LOGFILE: + single: environment variable; HA_LOGFILE: + + PCMK_LOGFILE (and HA_LOGFILE) + - Where to write resource agent logs. Pacemaker sets this to + ``PCMK_logfile`` if set to a value other than ``none``. + * - .. _PCMK_service: + + .. index:: + single: PCMK_service + single: environment variable; PCMK_service + + PCMK_service + - The name of the Pacemaker subsystem or command-line tool that's executing + the resource agent. 
Specific values are subject to change; useful mainly + for logging. + +Clone Resource Agent Requirements +_________________________________ + +Any resource can be used as an anonymous clone, as it requires no additional +support from the resource agent. Whether it makes sense to do so depends on your +resource and its resource agent. + +Resource Agent Requirements for Globally Unique Clones +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Globally unique clones require additional support in the resource agent. In +particular, it must respond with ``OCF_SUCCESS`` only if the node has that exact +instance active. All other probes for instances of the clone should result in +``OCF_NOT_RUNNING`` (or one of the other OCF error codes if they are failed). + +Individual instances of a clone are identified by appending a colon and a +numerical offset (for example, ``apache:2``). + +A resource agent can find out how many copies there are by examining the +``OCF_RESKEY_CRM_meta_clone_max`` environment variable and which instance it is +by examining ``OCF_RESKEY_CRM_meta_clone``. + +The resource agent must not make any assumptions (based on +``OCF_RESKEY_CRM_meta_clone``) about which numerical instances are active. In +particular, the list of active copies is not always an unbroken sequence, nor +does it always start at 0. + +Resource Agent Requirements for Promotable Clones +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Promotable clone resources require two extra actions, ``demote`` and ``promote``, +which are responsible for changing the state of the resource. Like ``start`` and +``stop``, they should return ``OCF_SUCCESS`` if they completed successfully or a +relevant error code if they did not. + +The states can mean whatever you wish, but when the resource is started, it must +begin in the unpromoted role. From there, the cluster will decide which +instances to promote. + +In addition to the clone requirements for monitor actions, agents must also +*accurately* report which state they are in. The cluster relies on the agent to +report its status (including role) accurately and does not indicate to the agent +what role it currently believes it to be in. + +.. list-table:: **Role Implications of OCF Return Codes** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Monitor Return Code + - Description + * - :ref:`OCF_NOT_RUNNING ` + - .. index:: + single: OCF_NOT_RUNNING + single: OCF return code; OCF_NOT_RUNNING + + Stopped + * - :ref:`OCF_SUCCESS ` + - .. index:: + single: OCF_SUCCESS + single: OCF return code; OCF_SUCCESS + + Running (Unpromoted) + * - :ref:`OCF_RUNNING_PROMOTED ` + - .. index:: + single: OCF_RUNNING_PROMOTED + single: OCF return code; OCF_RUNNING_PROMOTED + + Running (Promoted) + * - :ref:`OCF_FAILED_PROMOTED ` + - .. index:: + single: OCF_FAILED_PROMOTED + single: OCF return code; OCF_FAILED_PROMOTED + + Failed (Promoted) + * - Other + - Failed (Unpromoted) + +.. _clone_notifications: + +Clone Notifications +~~~~~~~~~~~~~~~~~~~ + +If the clone has the ``notify`` meta-attribute set to ``true`` and the resource +agent supports the ``notify`` action, Pacemaker will call the action when +appropriate, passing a number of extra variables. These variables, when combined +with additional context, can be used to calculate the current state of the +cluster and what is about to happen to it. + +.. index:: + single: clone; environment variables + single: notify; environment variables + +.. 
list-table:: **Environment Variables Supplied with Clone Notify Actions** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Variable + - Description + * - .. _OCF_RESKEY_CRM_meta_notify_type: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_type + single: OCF_RESKEY_CRM_meta_notify_type + + OCF_RESKEY_CRM_meta_notify_type + - Allowed values: ``pre``, ``post`` + * - .. _OCF_RESKEY_CRM_meta_notify_operation: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_operation + single: OCF_RESKEY_CRM_meta_notify_operation + + OCF_RESKEY_CRM_meta_notify_operation + - Allowed values: ``start``, ``stop`` + * - .. _OCF_RESKEY_CRM_meta_notify_start_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_start_resource + single: OCF_RESKEY_CRM_meta_notify_start_resource + + OCF_RESKEY_CRM_meta_notify_start_resource + - Resources to be started + * - .. _OCF_RESKEY_CRM_meta_notify_stop_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_stop_resource + single: OCF_RESKEY_CRM_meta_notify_stop_resource + + OCF_RESKEY_CRM_meta_notify_stop_resource + - Resources to be stopped + * - .. _OCF_RESKEY_CRM_meta_notify_active_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_active_resource + single: OCF_RESKEY_CRM_meta_notify_active_resource + + OCF_RESKEY_CRM_meta_notify_active_resource + - Resources that are running + * - .. _OCF_RESKEY_CRM_meta_notify_inactive_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_inactive_resource + single: OCF_RESKEY_CRM_meta_notify_inactive_resource + + OCF_RESKEY_CRM_meta_notify_inactive_resource + - Resources that are not running + * - .. _OCF_RESKEY_CRM_meta_notify_start_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_start_uname + single: OCF_RESKEY_CRM_meta_notify_start_uname + + OCF_RESKEY_CRM_meta_notify_start_uname + - Nodes on which resources will be started + * - .. _OCF_RESKEY_CRM_meta_notify_stop_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_stop_uname + single: OCF_RESKEY_CRM_meta_notify_stop_uname + + OCF_RESKEY_CRM_meta_notify_stop_uname + - Nodes on which resources will be stopped + * - .. _OCF_RESKEY_CRM_meta_notify_active_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_active_uname + single: OCF_RESKEY_CRM_meta_notify_active_uname + + OCF_RESKEY_CRM_meta_notify_active_uname + - Nodes on which resources are running + +The variables come in pairs, such as +``OCF_RESKEY_CRM_meta_notify_start_resource`` and +``OCF_RESKEY_CRM_meta_notify_start_uname``, and should be treated as an array of +whitespace-separated elements. + +``OCF_RESKEY_CRM_meta_notify_inactive_resource`` is an exception, as the +matching ``uname`` variable does not exist since inactive resources are not +running on any node. + +Thus, in order to indicate that ``clone:0`` will be started on ``sles-1``, +``clone:2`` will be started on ``sles-3``, and ``clone:3`` will be started +on ``sles-2``, the cluster would set: + +.. topic:: Notification Variables + + .. code-block:: none + + OCF_RESKEY_CRM_meta_notify_start_resource="clone:0 clone:2 clone:3" + OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2" + +.. note:: + + Pacemaker will log but otherwise ignore failures of notify actions. 
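+
+As a rough sketch (not taken from any shipped agent), a shell-based agent's
+``notify`` action might combine the type and operation variables and walk the
+whitespace-separated node lists like this. It assumes the standard OCF shell
+helpers and constants (``ocf_log``, ``$OCF_SUCCESS``) have been sourced:
+
+.. code-block:: bash
+
+   notify() {
+       # "pre" or "post"
+       local type="$OCF_RESKEY_CRM_meta_notify_type"
+       # "start" or "stop"
+       local op="$OCF_RESKEY_CRM_meta_notify_operation"
+
+       if [ "$type" = "post" ] && [ "$op" = "start" ]; then
+           # The *_uname variables are whitespace-separated lists of nodes.
+           for node in $OCF_RESKEY_CRM_meta_notify_start_uname; do
+               ocf_log info "clone instance started on $node"
+           done
+       fi
+       # notify must not fail
+       return "$OCF_SUCCESS"
+   }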
+ +Interpretation of Notification Variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Pre-notification (stop):** + +* Active resources: ``$OCF_RESKEY_CRM_meta_notify_active_resource`` +* Inactive resources: ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +**Post-notification (stop) / Pre-notification (start):** + +* Active resources + * ``$OCF_RESKEY_CRM_meta_notify_active_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Inactive resources + * ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources that were started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources that were stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +**Post-notification (start):** + +* Active resources: + * ``$OCF_RESKEY_CRM_meta_notify_active_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Inactive resources: + * ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources that were started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources that were stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +Extra Notifications for Promotable Clones +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. index:: + single: clone; environment variables + single: promotable; environment variables + +.. list-table:: **Extra Environment Variables Supplied for Promotable Clones** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Variable + - Description + * - .. _OCF_RESKEY_CRM_meta_notify_promoted_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_promoted_resource + single: OCF_RESKEY_CRM_meta_notify_promoted_resource + + OCF_RESKEY_CRM_meta_notify_promoted_resource + - Resources that are running in the promoted role + * - .. _OCF_RESKEY_CRM_meta_notify_unpromoted_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_unpromoted_resource + single: OCF_RESKEY_CRM_meta_notify_unpromoted_resource + + OCF_RESKEY_CRM_meta_notify_unpromoted_resource + - Resources that are running in the unpromoted role + * - .. _OCF_RESKEY_CRM_meta_notify_promote_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_promote_resource + single: OCF_RESKEY_CRM_meta_notify_promote_resource + + OCF_RESKEY_CRM_meta_notify_promote_resource + - Resources to be promoted + * - .. _OCF_RESKEY_CRM_meta_notify_demote_resource: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_demote_resource + single: OCF_RESKEY_CRM_meta_notify_demote_resource + + OCF_RESKEY_CRM_meta_notify_demote_resource + - Resources to be demoted + * - .. _OCF_RESKEY_CRM_meta_notify_promote_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_promote_uname + single: OCF_RESKEY_CRM_meta_notify_promote_uname + + OCF_RESKEY_CRM_meta_notify_promote_uname + - Nodes on which resources will be promoted + * - .. _OCF_RESKEY_CRM_meta_notify_demote_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_demote_uname + single: OCF_RESKEY_CRM_meta_notify_demote_uname + + OCF_RESKEY_CRM_meta_notify_demote_uname + - Nodes on which resources will be demoted + * - .. 
_OCF_RESKEY_CRM_meta_notify_promoted_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_promoted_uname + single: OCF_RESKEY_CRM_meta_notify_promoted_uname + + OCF_RESKEY_CRM_meta_notify_promoted_uname + - Nodes on which resources are running in the promoted role + * - .. _OCF_RESKEY_CRM_meta_notify_unpromoted_uname: + + .. index:: + single: environment variable; OCF_RESKEY_CRM_meta_notify_unpromoted_uname + single: OCF_RESKEY_CRM_meta_notify_unpromoted_uname + + OCF_RESKEY_CRM_meta_notify_unpromoted_uname + - Nodes on which resources are running in the unpromoted role + +Interpretation of Promotable Notification Variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Pre-notification (demote):** + +* Active resources: ``$OCF_RESKEY_CRM_meta_notify_active_resource`` +* Promoted resources: ``$OCF_RESKEY_CRM_meta_notify_promoted_resource`` +* Unpromoted resources: ``$OCF_RESKEY_CRM_meta_notify_unpromoted_resource`` +* Inactive resources: ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources to be demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +**Post-notification (demote) / Pre-notification (stop):** + +* Active resources: ``$OCF_RESKEY_CRM_meta_notify_active_resource`` +* Promoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_promoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Unpromoted resources: ``$OCF_RESKEY_CRM_meta_notify_unpromoted_resource`` +* Inactive resources: ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources to be demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources that were demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` + +**Post-notification (stop) / Pre-notification (start)** + +* Active resources: + * ``$OCF_RESKEY_CRM_meta_notify_active_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Promoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_promoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Unpromoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_unpromoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Inactive resources: + * ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources to be demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources that were demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources that were stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +**Post-notification (start) / Pre-notification (promote)** + +* Active resources: + * ``$OCF_RESKEY_CRM_meta_notify_active_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Promoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_promoted_resource`` + * minus 
``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Unpromoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_unpromoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Inactive resources: + * ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources to be demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources that were started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources that were demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources that were stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + +**Post-notification (promote)** + +* Active resources: + * ``$OCF_RESKEY_CRM_meta_notify_active_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Promoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_promoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Unpromoted resources: + * ``$OCF_RESKEY_CRM_meta_notify_unpromoted_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Inactive resources: + * ``$OCF_RESKEY_CRM_meta_notify_inactive_resource`` + * plus ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + * minus ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources to be promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources to be demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources to be stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` +* Resources that were started: ``$OCF_RESKEY_CRM_meta_notify_start_resource`` +* Resources that were promoted: ``$OCF_RESKEY_CRM_meta_notify_promote_resource`` +* Resources that were demoted: ``$OCF_RESKEY_CRM_meta_notify_demote_resource`` +* Resources that were stopped: ``$OCF_RESKEY_CRM_meta_notify_stop_resource`` + + +.. index:: + single: resource agent; LSB + single: LSB resource agent + single: init script + +LSB Resource Agents (Init Scripts) +################################## + +LSB Compliance +______________ + +The relevant part of the +`LSB specifications `_ +includes a description of all the return codes listed here. + +Assuming `some_service` is configured correctly and currently +inactive, the following sequence will help you determine if it is +LSB-compatible: + +#. Start (stopped): + + .. code-block:: none + + # /etc/init.d/some_service start ; echo "result: $?" + + * Did the service start? + * Did the echo command print ``result: 0`` (in addition to the init script's + usual output)? + +#. Status (running): + + .. code-block:: none + + # /etc/init.d/some_service status ; echo "result: $?" + + * Did the script accept the command? + * Did the script indicate the service was running? + * Did the echo command print ``result: 0`` (in addition to the init script's + usual output)? + +#. Start (running): + + .. code-block:: none + + # /etc/init.d/some_service start ; echo "result: $?" 
+ + * Is the service still running? + * Did the echo command print ``result: 0`` (in addition to the init + script's usual output)? + +#. Stop (running): + + .. code-block:: none + + # /etc/init.d/some_service stop ; echo "result: $?" + + * Was the service stopped? + * Did the echo command print ``result: 0`` (in addition to the init + script's usual output)? + +#. Status (stopped): + + .. code-block:: none + + # /etc/init.d/some_service status ; echo "result: $?" + + * Did the script accept the command? + * Did the script indicate the service was not running? + * Did the echo command print ``result: 3`` (in addition to the init + script's usual output)? + +#. Stop (stopped): + + .. code-block:: none + + # /etc/init.d/some_service stop ; echo "result: $?" + + * Is the service still stopped? + * Did the echo command print ``result: 0`` (in addition to the init + script's usual output)? + +#. Status (failed): + + This step is not readily testable and relies on manual inspection of the script. + + The script can use one of the error codes (other than 3) listed in the + LSB spec to indicate that it is active but failed. This tells the + cluster that before moving the resource to another node, it needs to + stop it on the existing one first. + +If the answer to any of the above questions is no, then the script is not +LSB-compliant. Your options are then to either fix the script or write an OCF +agent based on the existing script. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/alerts.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/alerts.rst new file mode 100644 index 0000000..7a421ef --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/alerts.rst @@ -0,0 +1,343 @@ +.. index:: + single: alert; agents + +Alert Agents +------------ + +.. index:: + single: alert; sample agents + +Using the Sample Alert Agents +############################# + +Pacemaker provides several sample alert agents, installed in +``/usr/share/pacemaker/alerts`` by default. + +While these sample scripts may be copied and used as-is, they are provided +mainly as templates to be edited to suit your purposes. See their source code +for the full set of instance attributes they support. + +.. topic:: Sending cluster events as SNMP v2c traps + + .. code-block:: xml + + + + + + + + + + + + + + + +.. note:: **SNMP alert agent attributes** + + The ``timestamp-format`` meta-attribute should always be set to + ``%Y-%m-%d,%H:%M:%S.%01N`` when using the SNMP agent, to match the SNMP + standard. + + The SNMP agent provides a number of instance attributes in addition to the + one used in the example above. The most useful are ``trap_version``, which + defaults to ``2c``, and ``trap_community``, which defaults to ``public``. + See the source code for more details. + +.. topic:: Sending cluster events as SNMP v3 traps + + .. code-block:: xml + + + + + + + + + + + + + + + + + + +.. note:: **SNMP v3 trap configuration** + + To use SNMP v3, ``trap_version`` must be set to ``3``. ``trap_community`` + will be ignored. + + The example above uses the ``trap_options`` instance attribute to override + the security level, authentication protocol, authentication user, and + authentication password from snmp.conf. These will be passed to the snmptrap + command. Passing the password on the command line is considered insecure; + specify authentication and privacy options suitable for your environment. + +.. topic:: Sending cluster events as e-mails + + .. 
code-block:: xml + + + + + + + + + + + + + +.. index:: + single: alert; agent development + +Writing an Alert Agent +###################### + +.. index:: + single: alert; environment variables + single: environment variable; alert agents + +.. list-table:: **Environment Variables Passed to Alert Agents** + :class: longtable + :widths: 30 50 20 + :header-rows: 1 + + * - Environment Variable + - Description + - Alert Types + * - .. _CRM_alert_kind: + + .. index:: + single: environment variable; CRM_alert_kind + single: CRM_alert_kind + + CRM_alert_kind + - The type of alert (``node``, ``fencing``, ``resource``, or + ``attribute``) + - all + * - .. _CRM_alert_node: + + .. index:: + single: environment variable; CRM_alert_node + single: CRM_alert_node + + CRM_alert_node + - Name of affected node + - all + * - .. _CRM_alert_node_sequence: + + .. index:: + single: environment variable; CRM_alert_node_sequence + single: CRM_alert_node_sequence + + CRM_alert_node_sequence + - A sequence number increased whenever an alert is being issued on the + local node, which can be used to reference the order in which alerts + have been issued by Pacemaker. An alert for an event that happened later + in time reliably has a higher sequence number than alerts for earlier + events. This number has no cluster-wide meaning. + - all + * - .. _CRM_alert_recipient: + + .. index:: + single: environment variable; CRM_alert_recipient + single: CRM_alert_recipient + + CRM_alert_recipient + - The configured recipient + - all + * - .. _CRM_alert_timestamp: + + .. index:: + single: environment variable; CRM_alert_timestamp + single: CRM_alert_timestamp + + CRM_alert_timestamp + - A timestamp created prior to executing the agent, in the format + specified by the ``timestamp-format`` meta-attribute. This allows the + agent to have a reliable, high-precision time of when the event + occurred, regardless of when the agent itself was invoked (which could + potentially be delayed due to system load, etc.). + - all + * - .. _CRM_alert_timestamp_epoch: + + .. index:: + single: environment variable; CRM_alert_timestamp_epoch + single: CRM_alert_timestamp_epoch + + CRM_alert_timestamp_epoch + - The same time as ``CRM_alert_timestamp``, expressed as the integer + number of seconds since January 1, 1970. This (along with + ``CRM_alert_timestamp_usec``) can be useful for alert agents that need + to format time in a specific way rather than let the user configure it. + - all + * - .. _CRM_alert_timestamp_usec: + + .. index:: + single: environment variable; CRM_alert_timestamp_usec + single: CRM_alert_timestamp_usec + + CRM_alert_timestamp_usec + - The same time as ``CRM_alert_timestamp``, expressed as the integer + number of microseconds since ``CRM_alert_timestamp_epoch``. + - all + * - .. _CRM_alert_version: + + .. index:: + single: environment variable; CRM_alert_version + single: CRM_alert_version + + CRM_alert_version + - The version of Pacemaker sending the alert + - all + * - .. _CRM_alert_desc: + + .. index:: + single: environment variable; CRM_alert_desc + single: CRM_alert_desc + + CRM_alert_desc + - Detail about event. For ``node`` alerts, this is the node's current + state (``member`` or ``lost``). For ``fencing`` alerts, this is a + summary of the requested fencing operation, including origin, target, + and fencing operation error code, if any. For ``resource`` alerts, this + is a readable string equivalent of ``CRM_alert_status``. + - ``node``, ``fencing``, ``resource`` + * - .. _CRM_alert_nodeid: + + .. 
index:: + single: environment variable; CRM_alert_nodeid + single: CRM_alert_nodeid + + CRM_alert_nodeid + - ID of node whose status changed + - ``node`` + * - .. _CRM_alert_rc: + + .. index:: + single: environment variable; CRM_alert_rc + single: CRM_alert_rc + + CRM_alert_rc + - The numerical return code of the fencing or resource operation + - ``fencing``, ``resource`` + * - .. _CRM_alert_task: + + .. index:: + single: environment variable; CRM_alert_task + single: CRM_alert_task + + CRM_alert_task + - The requested fencing or resource operation + - ``fencing``, ``resource`` + * - .. _CRM_alert_exec_time: + + .. index:: + single: environment variable; CRM_alert_exec_time + single: CRM_alert_exec_time + + CRM_alert_exec_time + - The (wall-clock) time, in milliseconds, that it took to execute the + action. If the action timed out, ``CRM_alert_status`` will be 2, + ``CRM_alert_desc`` will be "Timed Out", and this value will be the + action timeout. May not be supported on all platforms. *(since 2.0.1)* + - ``resource`` + * - .. _CRM_alert_interval: + + .. index:: + single: environment variable; CRM_alert_interval + single: CRM_alert_interval + + CRM_alert_interval + - The interval of the resource operation + - ``resource`` + * - .. _CRM_alert_rsc: + + .. index:: + single: environment variable; CRM_alert_rsc + single: CRM_alert_rsc + + CRM_alert_rsc + - The name of the affected resource + - ``resource`` + * - .. _CRM_alert_status: + + .. index:: + single: environment variable; CRM_alert_status + single: CRM_alert_status + + CRM_alert_status + - A numerical code used by Pacemaker to represent the operation result + - ``resource`` + * - .. _CRM_alert_target_rc: + + .. index:: + single: environment variable; CRM_alert_target_rc + single: CRM_alert_target_rc + + CRM_alert_target_rc + - The expected numerical return code of the operation + - ``resource`` + * - .. _CRM_alert_attribute_name: + + .. index:: + single: environment variable; CRM_alert_attribute_name + single: CRM_alert_attribute_name + + CRM_alert_attribute_name + - The name of the node attribute that changed + - ``attribute`` + * - .. _CRM_alert_attribute_value: + + .. index:: + single: environment variable; CRM_alert_attribute_value + single: CRM_alert_attribute_value + + CRM_alert_attribute_value + - The new value of the node attribute that changed + - ``attribute`` + +Special concerns when writing alert agents: + +* Alert agents may be called with no recipient (if none is configured), + so the agent must be able to handle this situation, even if it + only exits in that case. (Users may modify the configuration in + stages, and add a recipient later.) + +* If more than one recipient is configured for an alert, the alert agent will + be called once per recipient. If an agent is not able to run concurrently, it + should be configured with only a single recipient. The agent is free, + however, to interpret the recipient as a list. + +* When a cluster event occurs, all alerts are fired off at the same time as + separate processes. Depending on how many alerts and recipients are + configured, and on what is done within the alert agents, + a significant load burst may occur. The agent could be written to take + this into consideration, for example by queueing resource-intensive actions + into some other instance, instead of directly executing them. + +* Alert agents are run as the |CRM_DAEMON_USER| user, which has a minimal set + of permissions. 
If an agent requires additional privileges, it is + recommended to configure ``sudo`` to allow the agent to run the necessary + commands as another user with the appropriate privileges. + +* As always, take care to validate and sanitize user-configured parameters, + such as ``CRM_alert_timestamp`` (whose content is specified by the + user-configured ``timestamp-format``), ``CRM_alert_recipient,`` and all + instance attributes. Mostly this is needed simply to protect against + configuration errors, but if some user can modify the CIB without having + |CRM_DAEMON_USER| access to the cluster nodes, it is a potential security + concern as well, to avoid the possibility of code injection. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/cluster.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/cluster.rst new file mode 100644 index 0000000..3713733 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/cluster.rst @@ -0,0 +1,21 @@ +.. index:: + single: cluster layer + +The Cluster Layer +----------------- + +Pacemaker utilizes an underlying cluster layer for two purposes: + +* obtaining quorum +* messaging between nodes + +.. index:: + single: cluster layer; Corosync + single: Corosync + +Currently, only Corosync 2 and later is supported for this layer. + +This document assumes you have configured the cluster nodes in Corosync +already. High-level cluster management tools are available that can configure +Corosync for you. If you want the lower-level details, see the +`Corosync documentation `_. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/configuring.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/configuring.rst new file mode 100644 index 0000000..2132a44 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/configuring.rst @@ -0,0 +1,263 @@ +.. index:: + single: configuration + single: CIB + +Configuring Pacemaker +--------------------- + +Pacemaker's configuration, the CIB, is stored in XML format. Cluster +administrators have multiple options for modifying the configuration either via +the XML, or at a more abstract (and easier for humans to understand) level. + +Pacemaker reacts to configuration changes as soon as they are saved. +Pacemaker's command-line tools and most higher-level tools provide the ability +to batch changes together and commit them at once, rather than make a series of +small changes, which could cause avoid unnecessary actions as Pacemaker +responds to each change individually. + +Pacemaker tracks revisions to the configuration and will reject any update +older than the current revision. Thus, it is a good idea to serialize all +changes to the configuration. Avoid attempting simultaneous changes, whether on +the same node or different nodes, and whether manually or using some automated +configuration tool. + +.. note:: + + It is not necessary to update the configuration on all cluster nodes. + Pacemaker immediately synchronizes changes to all active members of the + cluster. To reduce bandwidth, the cluster only broadcasts the incremental + updates that result from your changes and uses checksums to ensure that each + copy is consistent. + + +Configuration Using Higher-level Tools +###################################### + +Most users will benefit from using higher-level tools provided by projects +separate from Pacemaker. Popular ones include the crm shell and pcs. 
[#]_ + +See those projects' documentation for details on how to configure Pacemaker +using them. + + +Configuration Using Pacemaker's Command-Line Tools +################################################## + +Pacemaker provides lower-level, command-line tools to manage the cluster. Most +configuration tasks can be performed with these tools, without needing any XML +knowledge. + +To enable STONITH for example, one could run: + +.. code-block:: none + + # crm_attribute --name fencing-enabled --update 1 + +Or, to check whether **node1** is allowed to run resources, there is: + +.. code-block:: none + + # crm_standby --query --node node1 + +Or, to change the failure threshold of **my-test-rsc**, one can use: + +.. code-block:: none + + # crm_resource -r my-test-rsc --set-parameter migration-threshold --parameter-value 3 --meta + +Examples of using these tools for specific cases will be given throughout this +document where appropriate. See the man pages for further details. + +See :ref:`cibadmin` for how to edit the CIB using XML. + +See :ref:`crm_shadow` for a way to make a series of changes, then commit them +all at once to the live cluster. + + +.. index:: + single: configuration; CIB properties + single: CIB; properties + single: CIB property + +Working with CIB Properties +___________________________ + +Although these fields can be written to by the user, in +most cases the cluster will overwrite any values specified by the +user with the "correct" ones. + +To change the ones that can be specified by the user, for example +``admin_epoch``, one should use: + +.. code-block:: none + + # cibadmin --modify --xml-text '' + +A complete set of CIB properties will look something like this: + +.. topic:: XML attributes set for a cib element + + .. code-block:: xml + + + + +.. index:: + single: configuration; cluster options + +Querying and Setting Cluster Options +____________________________________ + +Cluster options can be queried and modified using the ``crm_attribute`` tool. +To get the current value of ``cluster-delay``, you can run: + +.. code-block:: none + + # crm_attribute --query --name cluster-delay + +which is more simply written as + +.. code-block:: none + + # crm_attribute -G -n cluster-delay + +If a value is found, you'll see a result like this: + +.. code-block:: none + + # crm_attribute -G -n cluster-delay + scope=crm_config name=cluster-delay value=60s + +If no value is found, the tool will display an error: + +.. code-block:: none + + # crm_attribute -G -n clusta-deway + scope=crm_config name=clusta-deway value=(null) + Error performing operation: No such device or address + +To use a different value (for example, 30 seconds), simply run: + +.. code-block:: none + + # crm_attribute --name cluster-delay --update 30s + +To go back to the cluster's default value, you can delete the value, for example: + +.. code-block:: none + + # crm_attribute --name cluster-delay --delete + Deleted crm_config option: id=cib-bootstrap-options-cluster-delay name=cluster-delay + + +When Options are Listed More Than Once +______________________________________ + +If you ever see something like the following, it means that the option you're +modifying is present more than once. + +.. topic:: Deleting an option that is listed twice + + .. 
code-block:: none + + # crm_attribute --name batch-limit --delete + + Please choose from one of the matches below and supply the 'id' with --id + Multiple attributes match name=batch-limit in crm_config: + Value: 50 (set=cib-bootstrap-options, id=cib-bootstrap-options-batch-limit) + Value: 100 (set=custom, id=custom-batch-limit) + +In such cases, follow the on-screen instructions to perform the requested +action. To determine which value is currently being used by the cluster, refer +to the "Rules" chapter of *Pacemaker Explained*. + + +.. index:: + single: configuration; remote + +.. _remote_connection: + +Connecting from a Remote Machine +################################ + +It is possible to run configuration commands from a machine that is not part of +the cluster. + +For security reasons, this capability is disabled by default. If you wish to +allow remote access, set the ``remote-tls-port`` (encrypted) or +``remote-clear-port`` (unencrypted) CIB properties (attributes of the ``cib`` +element). Encrypted communication can be performed keyless (which makes it +subject to man-in-the-middle attacks), but a better option is to also use +TLS certificates. + +To enable TLS certificates, it is recommended to first set up your own +Certificate Authority (CA) and generate a root CA certificate. Then create a +public/private key pair and certificate signing request (CSR) for your server. +Use the CA to sign this CSR. + +Then, create a public/private key pair and CSR for each remote system that you +wish to have remote access. Use the CA to sign the CSRs. It is recommended to +use a unique certificate for each remote system so they can be revoked if +necessary. + +The server's public/private key pair and signed certificate should be installed +to the |PCMK_CONFIG_DIR| directory and owned by ``CIB_user``. Remember that +private keys should not be readable by anyone other than their owner. Finally, +edit the |PCMK_CONFIG_FILE| file to refer to these credentials: + +.. code-block:: none + + PCMK_ca_file="/etc/pacemaker/ca.cert.pem" + PCMK_cert_file="/etc/pacemaker/server.cert.pem" + PCMK_key_file="/etc/pacemaker/server.key.pem" + +The administrator's machine simply needs Pacemaker installed. To connect to the +cluster, set the following environment variables: + +* :ref:`CIB_port ` (required) +* :ref:`CIB_server ` +* :ref:`CIB_user ` +* :ref:`CIB_passwd ` +* :ref:`CIB_encrypted ` + +Only the Pacemaker daemon user (|CRM_DAEMON_USER|) may be used as ``CIB_user``. + +To use TLS certificates, the administrator's machine also needs their +public/private key pair, signed client certificate, and root CA certificate. +Those must additionally be specified with the following environment variables: + +* :ref:`CIB_ca_file ` +* :ref:`CIB_cert_file ` +* :ref:`CIB_key_file ` + +As an example, if **node1** is a cluster node, and the CIB is configured with +``remote-tls-port`` set to 1234, the administrator could read the current +cluster configuration using the following commands, and would be prompted for +the daemon user's password: + +.. code-block:: none + + # export CIB_server=node1; export CIB_port=1234; export CIB_encrypted=true + # export CIB_ca_file=/etc/pacemaker/ca.cert.pem + # export CIB_cert_file=/etc/pacemaker/admin.cert.pem + # export CIB_key_file=/etc/pacemaker/admin.key.pem + # cibadmin -Q + +Optionally, :ref:`CIB_crl_file ` may be set to the location of a +Certificate Revocation List in PEM format. + +.. note:: + + Pacemaker must have been built with PAM support for remote access to work. 
+ You can check by running ``pacemakerd --features``. If the output contains + **pam**, remote access is supported. *(since 3.0.0; before 3.0.0, in a build + without PAM support, all remote connections are accepted without any + authentication)* + +.. rubric:: Footnotes + +.. [#] For a list, see "Configuration Tools" at + https://clusterlabs.org/components.html diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/index.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/index.rst new file mode 100644 index 0000000..1b071e0 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/index.rst @@ -0,0 +1,28 @@ +Pacemaker Administration +======================== + +*Managing Pacemaker Clusters* + + +This document has instructions and tips for system administrators who manage +high-availability clusters using Pacemaker. + +.. toctree:: + :maxdepth: 3 + :numbered: + + intro + installing + cluster + options + configuring + tools + administrative + moving + troubleshooting + upgrading + alerts + agents + pcs-crmsh + :ref:`genindex` + :ref:`search` diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/installing.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/installing.rst new file mode 100644 index 0000000..feea962 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/installing.rst @@ -0,0 +1,9 @@ +Installing Cluster Software +--------------------------- + +.. index:: installation + +Most major Linux distributions have pacemaker packages in their standard +package repositories, or the software can be built from source code. See +`How to Install `_ +on the ClusterLabs wiki for details. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/intro.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/intro.rst new file mode 100644 index 0000000..aa1c2da --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/intro.rst @@ -0,0 +1,21 @@ +Introduction +------------ + +The Scope of this Document +########################## + +The purpose of this document is to help system administrators learn how to +manage a Pacemaker cluster. + +System administrators may be interested in other parts of the +`Pacemaker documentation set `_ +such as *Clusters from Scratch*, a step-by-step guide to setting up an example +cluster, and *Pacemaker Explained*, an exhaustive reference for cluster +configuration. + +Multiple higher-level tools (both command-line and GUI) are available to +simplify cluster management. However, this document focuses on the lower-level +command-line tools that come with Pacemaker itself. The concepts are applicable +to the higher-level tools, though the syntax would differ. + +.. include:: ../shared/pacemaker-intro.rst diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/moving.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/moving.rst new file mode 100644 index 0000000..2c3c444 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/moving.rst @@ -0,0 +1,303 @@ +Moving Resources +---------------- + +.. index:: + single: resource; move + +Moving Resources Manually +######################### + +There are primarily two occasions when you would want to move a resource from +its current location: when the whole node is under maintenance, and when a +single resource needs to be moved. + +.. 
index:: + single: standby mode + single: node; standby mode + +Standby Mode +____________ + +Since everything eventually comes down to a score, you could create constraints +for every resource to prevent them from running on one node. While Pacemaker +configuration can seem convoluted at times, not even we would require this of +administrators. + +Instead, you can set a special node attribute which tells the cluster "don't +let anything run here". There is even a helpful tool to help query and set it, +called ``crm_standby``. To check the standby status of the current machine, +run: + +.. code-block:: none + + # crm_standby -G + +A value of ``on`` indicates that the node is *not* able to host any resources, +while a value of ``off`` says that it *can*. + +You can also check the status of other nodes in the cluster by specifying the +`--node` option: + +.. code-block:: none + + # crm_standby -G --node sles-2 + +To change the current node's standby status, use ``-v`` instead of ``-G``: + +.. code-block:: none + + # crm_standby -v on + +Again, you can change another host's value by supplying a hostname with +``--node``. + +A cluster node in standby mode will not run resources, but still contributes to +quorum, and may fence or be fenced by nodes. + +Moving One Resource +___________________ + +When only one resource is required to move, we could do this by creating +location constraints. However, once again we provide a user-friendly shortcut +as part of the ``crm_resource`` command, which creates and modifies the extra +constraints for you. If ``Email`` were running on ``sles-1`` and you wanted it +moved to a specific location, the command would look something like: + +.. code-block:: none + + # crm_resource -M -r Email -H sles-2 + +Behind the scenes, the tool will create the following location constraint: + +.. code-block:: xml + + + +It is important to note that subsequent invocations of ``crm_resource -M`` are +not cumulative. So, if you ran these commands: + +.. code-block:: none + + # crm_resource -M -r Email -H sles-2 + # crm_resource -M -r Email -H sles-3 + +then it is as if you had never performed the first command. + +To allow the resource to move back again, use: + +.. code-block:: none + + # crm_resource -U -r Email + +Note the use of the word *allow*. The resource *can* move back to its original +location, but depending on ``resource-stickiness``, location constraints, and +so forth, it might stay where it is. + +To be absolutely certain that it moves back to ``sles-1``, move it there before +issuing the call to ``crm_resource -U``: + +.. code-block:: none + + # crm_resource -M -r Email -H sles-1 + # crm_resource -U -r Email + +Alternatively, if you only care that the resource should be moved from its +current location, try: + +.. code-block:: none + + # crm_resource -B -r Email + +which will instead create a negative constraint, like: + +.. code-block:: xml + + + +This will achieve the desired effect, but will also have long-term +consequences. As the tool will warn you, the creation of a ``-INFINITY`` +constraint will prevent the resource from running on that node until +``crm_resource -U`` is used. This includes the situation where every other +cluster node is no longer available! + +In some cases, such as when ``resource-stickiness`` is set to ``INFINITY``, it +is possible that you will end up with nodes with the same score, forcing the +cluster to choose one (which may not be the one you want). 
The tool can detect +some of these cases and deals with them by creating both positive and negative +constraints. For example: + +.. code-block:: xml + + + + +which has the same long-term consequences as discussed earlier. + +Moving Resources Due to Connectivity Changes +############################################ + +You can configure the cluster to move resources when external connectivity is +lost in two steps. + +.. index:: + single: ocf:pacemaker:ping resource + single: ping resource + +Tell Pacemaker to Monitor Connectivity +______________________________________ + +First, add an ``ocf:pacemaker:ping`` resource to the cluster. The ``ping`` +resource uses the system utility of the same name to a test whether a list of +machines (specified by DNS hostname or IP address) are reachable, and uses the +results to maintain a node attribute. + +The node attribute is called ``pingd`` by default, but is customizable in order +to allow multiple ping groups to be defined. + +Normally, the ping resource should run on all cluster nodes, which means that +you'll need to create a clone. A template for this can be found below, along +with a description of the most interesting parameters. + +.. list-table:: **Commonly Used ocf:pacemaker:ping Resource Parameters** + :widths: 20 80 + :header-rows: 1 + + * - Resource Parameter + - Description + * - dampen + - .. index:: + single: ocf:pacemaker:ping resource; dampen parameter + single: dampen; ocf:pacemaker:ping resource parameter + + The time to wait (dampening) for further changes to occur. Use this to + prevent a resource from bouncing around the cluster when cluster nodes + notice the loss of connectivity at slightly different times. + * - multiplier + - .. index:: + single: ocf:pacemaker:ping resource; multiplier parameter + single: multiplier; ocf:pacemaker:ping resource parameter + + The number of connected ping nodes gets multiplied by this value to get + a score. Useful when there are multiple ping nodes configured. + * - host_list + - .. index:: + single: ocf:pacemaker:ping resource; host_list parameter + single: host_list; ocf:pacemaker:ping resource parameter + + The machines to contact in order to determine the current connectivity + status. Allowed values include resolvable DNS connectivity host names, + IPv4 addresses, and IPv6 addresses. + +.. topic:: Example ping resource that checks node connectivity once every minute + + .. code-block:: xml + + + + + + + + + + + + + + +.. important:: + + You're only half done. The next section deals with telling Pacemaker how to + deal with the connectivity status that ``ocf:pacemaker:ping`` is recording. + +Tell Pacemaker How to Interpret the Connectivity Data +_____________________________________________________ + +.. important:: + + Before attempting the following, make sure you understand rules. See the + "Rules" chapter of the *Pacemaker Explained* document for details. + +There are a number of ways to use the connectivity data. + +The most common setup is for people to have a single ping target (for example, +the service network's default gateway), to prevent the cluster from running a +resource on any unconnected node. + +.. topic:: Don't run a resource on unconnected nodes + + .. code-block:: xml + + + + + + + +A more complex setup is to have a number of ping targets configured. You can +require the cluster to only run resources on nodes that can connect to all (or +a minimum subset) of them. + +.. topic:: Run only on nodes connected to three or more ping targets + + .. 
code-block:: xml + + + ... + + ... + + ... + + + + + + +Alternatively, you can tell the cluster only to *prefer* nodes with the best +connectivity, by using ``score-attribute`` in the rule. Just be sure to set +``multiplier`` to a value higher than that of ``resource-stickiness`` (and +don't set either of them to ``INFINITY``). + +.. topic:: Prefer node with most connected ping nodes + + .. code-block:: xml + + + + + + + +It is perhaps easier to think of this in terms of the simple constraints that +the cluster translates it into. For example, if ``sles-1`` is connected to all +five ping nodes but ``sles-2`` is only connected to two, then it would be as if +you instead had the following constraints in your configuration: + +.. topic:: How the cluster translates the above location constraint + + .. code-block:: xml + + + + +The advantage is that you don't have to manually update any constraints +whenever your network connectivity changes. + +You can also combine the concepts above into something even more complex. The +example below shows how you can prefer the node with the most connected ping +nodes provided they have connectivity to at least three (again assuming that +``multiplier`` is set to 1000). + +.. topic:: More complex example of choosing location based on connectivity + + .. code-block:: xml + + + + + + + + + diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/options.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/options.rst new file mode 100644 index 0000000..ea339dd --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/options.rst @@ -0,0 +1,232 @@ +.. index:: client options + +Client Options +-------------- + +Pacemaker uses several environment variables set on the client side. + +.. note:: Directory and file paths below may differ on your system depending on + your Pacemaker build settings. Check your Pacemaker configuration + file to find the correct paths. + +.. list-table:: **Client-side Environment Variables** + :class: longtable + :widths: 20 30 50 + :header-rows: 1 + + * - Environment Variable + - Default + - Description + * - .. _CIB_encrypted: + + .. index:: + single: CIB_encrypted + single: environment variable; CIB_encrypted + + CIB_encrypted + - true + - Whether to encrypt network traffic. Used with :ref:`CIB_port ` + for connecting to a remote CIB instance; ignored if + :ref:`CIB_port ` is not set. + * - .. _CIB_file: + + .. index:: + single: CIB_file + single: environment variable; CIB_file + + CIB_file + - + - If set, CIB connections are created against the named XML file. Clients + read an input CIB from, and write the result CIB to, the named file. + Ignored if :ref:`CIB_shadow ` is set. + * - .. _CIB_passwd: + + .. index:: + single: CIB_passwd + single: environment variable; CIB_passwd + + CIB_passwd + - + - :ref:`$CIB_user `'s password. Read from the command line if + unset. Used with :ref:`CIB_port ` for connecting to a remote + CIB instance; ignored if :ref:`CIB_port ` is not set. + * - .. _CIB_port: + + .. index:: + single: CIB_port + single: environment variable; CIB_port + + CIB_port + - + - If set, CIB connections are created as clients to a remote CIB instance + on :ref:`$CIB_server ` via this port. Ignored if + :ref:`CIB_shadow ` or :ref:`CIB_file ` is set. + * - .. _CIB_server: + + .. index:: + single: CIB_server + single: environment variable; CIB_server + + CIB_server + - localhost + - The host to connect to. 
Used with :ref:`CIB_port ` for + connecting to a remote CIB instance; ignored if + :ref:`CIB_port ` is not set. + * - .. _CIB_ca_file: + + .. index:: + single: CIB_ca_file + single: environment variable; CIB_ca_file + + CIB_ca_file + - + - If this, :ref:`CIB_cert_file `, and + :ref:`CIB_key_file ` are set, remote CIB administration + will be encrypted using X.509 (SSL/TLS) certificates, with this root + certificate for the certificate authority. Used with :ref:`CIB_port + ` for connecting to a remote CIB instance; ignored if + :ref:`CIB_port ` is not set. + * - .. _CIB_cert_file: + + .. index:: + single: CIB_cert_file + single: environment variable; CIB_cert_file + + CIB_cert_file + - + - If this, :ref:`CIB_ca_file `, and + :ref:`CIB_key_file ` are set, remote CIB administration + will be encrypted using X.509 (SSL/TLS) certificates, with this + certificate for the local host. Used with :ref:`CIB_port ` for + connecting to a remote CIB instance; ignored if + :ref:`CIB_port ` is not set. + * - .. _CIB_key_file: + + .. index:: + single: CIB_key_file + single: environment variable; CIB_key_file + + CIB_key_file + - + - If this, :ref:`CIB_ca_file `, and + :ref:`CIB_cert_file ` are set, remote CIB administration + will be encrypted using X.509 (SSL/TLS) certificates, with this + private key for the local host. Used with :ref:`CIB_port ` for + connecting to a remote CIB instance; ignored if + :ref:`CIB_port ` is not set. + * - .. _CIB_crl_file: + + .. index:: + single: CIB_crl_file + single: environment variable; CIB_crl_file + + CIB_crl_file + - + - If this, :ref:`CIB_ca_file `, + :ref:`CIB_cert_file `, and + :ref:`CIB_key_file ` are all set, then certificates listed + in this PEM-format Certificate Revocation List file will be rejected. + * - .. _CIB_shadow: + + .. index:: + single: CIB_shadow + single: environment variable; CIB_shadow + + CIB_shadow + - + - If set, CIB connections are created against a temporary working + ("shadow") CIB file called ``shadow.$CIB_shadow`` in + :ref:`$CIB_shadow_dir `. Should be set only to the name + of a shadow CIB created by :ref:`crm_shadow `. Otherwise, + behavior is undefined. + * - .. _CIB_shadow_dir: + + .. index:: + single: CIB_shadow_dir + single: environment variable; CIB_shadow_dir + + CIB_shadow_dir + - |CRM_CONFIG_DIR| if the current user is ``root`` or |CRM_DAEMON_USER|; + otherwise ``$HOME/.cib`` if :ref:`$HOME ` is set; otherwise + ``$TMPDIR/.cib`` if :ref:`$TMPDIR ` is set to an absolute path; + otherwise ``/tmp/.cib`` + - If set, shadow files are created in this directory. Ignored if + :ref:`CIB_shadow ` is not set. + * - .. _CIB_user: + + .. index:: + single: CIB_user + single: environment variable; CIB_user + + CIB_user + - |CRM_DAEMON_USER| if used with :ref:`CIB_port `, or the current + effective user otherwise + - If used with :ref:`CIB_port `, connect to + :ref:`$CIB_server ` as this user. Must be part of the + |CRM_DAEMON_GROUP| group on :ref:`$CIB_server `. Otherwise + (without :ref:`CIB_port `), this is used only for ACL and + display purposes. + * - .. _EDITOR: + + .. index:: + single: EDITOR + single: environment variable; EDITOR + + EDITOR + - + - Text editor to use for editing shadow files. Required for the ``--edit`` + command of :ref:`crm_shadow `. + * - .. _HOME: + + .. index:: + single: HOME + single: environment variable; HOME + + HOME + - Current user's home directory as configured in the passwd database, if an + entry exists + - Used to create a default :ref:`CIB_shadow_dir ` for non- + privileged users. + * - .. 
_PE_fail: + + .. index:: + single: PE_fail + single: environment variable; PE_fail + + PE_fail + - 0 + - Advanced use only: A dummy graph action with action ID matching this + option will be marked as failed. Primarily for developer use with + scheduler simulations. + * - .. _PS1: + + .. index:: + single: PS1 + single: environment variable; PS1 + + PS1 + - + - The shell's primary prompt string. Used by + :ref:`crm_shadow `: set to indicate that the user is in an + interactive shadow CIB session, and checked to determine whether the user + is already in an interactive session before creating a new one. + * - .. _SHELL: + + .. index:: + single: SHELL + single: environment variable; SHELL + + SHELL + - + - Absolute path to a shell. Used by :ref:`crm_shadow ` when + launching an interactive session. + * - .. _TMPDIR: + + .. index:: + single: TMPDIR + single: environment variable; TMPDIR + + TMPDIR + - /tmp + - Directory for temporary files. If not an absolute path, the default is + used instead. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/pcs-crmsh.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/pcs-crmsh.rst new file mode 100644 index 0000000..8baac83 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/pcs-crmsh.rst @@ -0,0 +1,444 @@ +Quick Comparison of pcs and crm shell +------------------------------------- + +``pcs`` and ``crm shell`` are two popular higher-level command-line interfaces +to Pacemaker. Each has its own syntax; this chapter gives a quick comparion of +how to accomplish the same tasks using either one. Some examples also show the +equivalent command using low-level Pacemaker command-line tools. + +These examples show the simplest syntax; see the respective man pages for all +possible options. + +Show Cluster Configuration and Status +##################################### + +.. topic:: Show Configuration (Raw XML) + + .. code-block:: none + + crmsh # crm configure show xml + pcs # pcs cluster cib + pacemaker # cibadmin -Q + +.. topic:: Show Configuration (Human-friendly) + + .. code-block:: none + + crmsh # crm configure show + pcs # pcs config + +.. topic:: Show Cluster Status + + .. code-block:: none + + crmsh # crm status + pcs # pcs status + pacemaker # crm_mon -1 + +Manage Nodes +############ + +.. topic:: Put node "pcmk-1" in standby mode + + .. code-block:: none + + crmsh # crm node standby pcmk-1 + pcs-0.9 # pcs cluster standby pcmk-1 + pcs-0.10 # pcs node standby pcmk-1 + pacemaker # crm_standby -N pcmk-1 -v on + +.. topic:: Remove node "pcmk-1" from standby mode + + .. code-block:: none + + crmsh # crm node online pcmk-1 + pcs-0.9 # pcs cluster unstandby pcmk-1 + pcs-0.10 # pcs node unstandby pcmk-1 + pacemaker # crm_standby -N pcmk-1 -v off + +Manage Cluster Properties +######################### + +.. topic:: Set the "fencing-enabled" cluster property to "false" + + .. code-block:: none + + crmsh # crm configure property fencing-enabled=false + pcs # pcs property set fencing-enabled=false + pacemaker # crm_attribute -n fencing-enabled -v false + +Show Resource Agent Information +############################### + +.. topic:: List Resource Agent (RA) Classes + + .. code-block:: none + + crmsh # crm ra classes + pcs # pcs resource standards + pacmaker # crm_resource --list-standards + +.. topic:: List Available Resource Agents (RAs) by Standard + + .. code-block:: none + + crmsh # crm ra list ocf + pcs # pcs resource agents ocf + pacemaker # crm_resource --list-agents ocf + +.. 
topic:: List Available Resource Agents (RAs) by OCF Provider + + .. code-block:: none + + crmsh # crm ra list ocf pacemaker + pcs # pcs resource agents ocf:pacemaker + pacemaker # crm_resource --list-agents ocf:pacemaker + +.. topic:: List Available Resource Agent Parameters + + .. code-block:: none + + crmsh # crm ra info IPaddr2 + pcs # pcs resource describe IPaddr2 + pacemaker # crm_resource --show-metadata ocf:heartbeat:IPaddr2 + +You can also use the full ``class:provider:type`` format with crmsh and pcs if +multiple RAs with the same name are available. + +.. topic:: Show Available Fence Agent Parameters + + .. code-block:: none + + crmsh # crm ra info stonith:fence_ipmilan + pcs # pcs stonith describe fence_ipmilan + +Manage Resources +################ + +.. topic:: Create a Resource + + .. code-block:: none + + crmsh # crm configure primitive ClusterIP IPaddr2 params ip=192.168.122.120 cidr_netmask=24 + pcs # pcs resource create ClusterIP IPaddr2 ip=192.168.122.120 cidr_netmask=24 + +Both crmsh and pcs determine the standard and provider (``ocf:heartbeat``) automatically +since ``IPaddr2`` is unique, and automatically create operations (including +monitor) based on the agent's meta-data. + +.. topic:: Show Configuration of All Resources + + .. code-block:: none + + crmsh # crm configure show + pcs-0.9 # pcs resource show --full + pcs-0.10 # pcs resource config + +.. topic:: Show Configuration of One Resource + + .. code-block:: none + + crmsh # crm configure show ClusterIP + pcs-0.9 # pcs resource show ClusterIP + pcs-0.10 # pcs resource config ClusterIP + +.. topic:: Show Configuration of Fencing Resources + + .. code-block:: none + + crmsh # crm resource status + pcs-0.9 # pcs stonith show --full + pcs-0.10 # pcs stonith config + +.. topic:: Start a Resource + + .. code-block:: none + + crmsh # crm resource start ClusterIP + pcs # pcs resource enable ClusterIP + pacemaker # crm_resource -r ClusterIP --set-parameter target-role --meta -v Started + +.. topic:: Stop a Resource + + .. code-block:: none + + crmsh # crm resource stop ClusterIP + pcs # pcs resource disable ClusterIP + pacemaker # crm_resource -r ClusterIP --set-parameter target-role --meta -v Stopped + +.. topic:: Remove a Resource + + .. code-block:: none + + crmsh # crm configure delete ClusterIP + pcs # pcs resource delete ClusterIP + +.. topic:: Modify a Resource's Instance Parameters + + .. code-block:: none + + crmsh # crm resource param ClusterIP set clusterip_hash=sourceip + pcs # pcs resource update ClusterIP clusterip_hash=sourceip + pacemaker # crm_resource -r ClusterIP --set-parameter clusterip_hash -v sourceip + +crmsh also has an `edit` command which edits the simplified CIB syntax +(same commands as the command line) via a configurable text editor. + +.. topic:: Modify a Resource's Instance Parameters Interactively + + .. code-block:: none + + crmsh # crm configure edit ClusterIP + +Using the interactive shell mode of crmsh, multiple changes can be +edited and verified before committing to the live configuration: + +.. topic:: Make Multiple Configuration Changes Interactively + + .. code-block:: none + + crmsh # crm configure + crmsh # edit + crmsh # verify + crmsh # commit + +.. topic:: Delete a Resource's Instance Parameters + + .. code-block:: none + + crmsh # crm resource param ClusterIP delete nic + pcs # pcs resource update ClusterIP nic= + pacemaker # crm_resource -r ClusterIP --delete-parameter nic + +.. topic:: List Current Resource Defaults + + .. 
code-block:: none + + crmsh # crm configure show type:rsc_defaults + pcs # pcs resource defaults + pacemaker # cibadmin -Q --scope rsc_defaults + +.. topic:: Set Resource Defaults + + .. code-block:: none + + crmsh # crm configure rsc_defaults resource-stickiness=100 + pcs # pcs resource defaults resource-stickiness=100 + +.. topic:: List Current Operation Defaults + + .. code-block:: none + + crmsh # crm configure show type:op_defaults + pcs # pcs resource op defaults + pacemaker # cibadmin -Q --scope op_defaults + +.. topic:: Set Operation Defaults + + .. code-block:: none + + crmsh # crm configure op_defaults timeout=240s + pcs # pcs resource op defaults timeout=240s + +.. topic:: Enable Resource Agent Tracing for a Resource + + .. code-block:: none + + crmsh # crm resource trace Website + +.. topic:: Clear Fail Counts for a Resource + + .. code-block:: none + + crmsh # crm resource cleanup Website + pcs # pcs resource cleanup Website + pacemaker # crm_resource --cleanup -r Website + +.. topic:: Create a Clone Resource + + .. code-block:: none + + crmsh # crm configure clone WebIP ClusterIP meta globally-unique=true clone-max=2 clone-node-max=2 + pcs # pcs resource clone ClusterIP globally-unique=true clone-max=2 clone-node-max=2 + +.. topic:: Create a Promotable Clone Resource + + .. code-block:: none + + crmsh # crm configure ms WebDataClone WebData \ + meta master-max=1 master-node-max=1 \ + clone-max=2 clone-node-max=1 notify=true + crmsh # crm configure clone WebDataClone WebData \ + meta promotable=true \ + promoted-max=1 promoted-node-max=1 \ + clone-max=2 clone-node-max=1 notify=true + pcs-0.9 # pcs resource master WebDataClone WebData \ + master-max=1 master-node-max=1 \ + clone-max=2 clone-node-max=1 notify=true + pcs-0.10 # pcs resource promotable WebData WebDataClone \ + promoted-max=1 promoted-node-max=1 \ + clone-max=2 clone-node-max=1 notify=true + +crmsh supports both ways ('configure ms' is deprecated) to configure promotable clone since crmsh 4.4.0. +pcs will generate the clone name automatically if it is omitted from the +command line. + + +Manage Constraints +################## + +.. topic:: Create a Colocation Constraint + + .. code-block:: none + + crmsh # crm configure colocation website-with-ip INFINITY: WebSite ClusterIP + pcs # pcs constraint colocation add ClusterIP with WebSite INFINITY + +.. topic:: Create a Colocation Constraint Based on Role + + .. code-block:: none + + crmsh # crm configure colocation another-ip-with-website inf: AnotherIP WebSite:Master + pcs # pcs constraint colocation add Started AnotherIP with Promoted WebSite INFINITY + +.. topic:: Create an Ordering Constraint + + .. code-block:: none + + crmsh # crm configure order apache-after-ip mandatory: ClusterIP WebSite + pcs # pcs constraint order ClusterIP then WebSite + +.. topic:: Create an Ordering Constraint Based on Role + + .. code-block:: none + + crmsh # crm configure order ip-after-website Mandatory: WebSite:Master AnotherIP + pcs # pcs constraint order promote WebSite then start AnotherIP + +.. topic:: Create a Location Constraint + + .. code-block:: none + + crmsh # crm configure location prefer-pcmk-1 WebSite 50: pcmk-1 + pcs # pcs constraint location WebSite prefers pcmk-1=50 + +.. topic:: Create a Location Constraint Based on Role + + .. code-block:: none + + crmsh # crm configure location prefer-pcmk-1 WebSite rule role=Master 50: \#uname eq pcmk-1 + pcs # pcs constraint location WebSite rule role=Promoted 50 \#uname eq pcmk-1 + +.. 
topic:: Move a Resource to a Specific Node (by Creating a Location Constraint) + + .. code-block:: none + + crmsh # crm resource move WebSite pcmk-1 + pcs # pcs resource move WebSite pcmk-1 + pacemaker # crm_resource -r WebSite --move -N pcmk-1 + +.. topic:: Move a Resource Away from Its Current Node (by Creating a Location Constraint) + + .. code-block:: none + + crmsh # crm resource ban Website pcmk-2 + pcs # pcs resource ban Website pcmk-2 + pacemaker # crm_resource -r WebSite --move + +.. topic:: Remove any Constraints Created by Moving a Resource + + .. code-block:: none + + crmsh # crm resource unmove WebSite + pcs # pcs resource clear WebSite + pacemaker # crm_resource -r WebSite --clear + +Advanced Configuration +###################### + +Manipulate Configuration Elements by Type +_________________________________________ + +.. topic:: List Constraints with IDs + + .. code-block:: none + + pcs # pcs constraint list --full + +.. topic:: Remove Constraint by ID + + .. code-block:: none + + pcs # pcs constraint remove cli-ban-Website-on-pcmk-1 + crmsh # crm configure remove cli-ban-Website-on-pcmk-1 + +crmsh's `show` and `edit` commands can be used to manage resources and +constraints by type: + +.. topic:: Show Configuration Elements + + .. code-block:: none + + crmsh # crm configure show type:primitive + crmsh # crm configure edit type:colocation + +Batch Changes +_____________ + +.. topic:: Make Multiple Changes and Apply Together + + .. code-block:: none + + crmsh # crm + crmsh # cib new drbd_cfg + crmsh # configure primitive WebData ocf:linbit:drbd params drbd_resource=wwwdata \ + op monitor interval=60s + crmsh # configure ms WebDataClone WebData meta master-max=1 master-node-max=1 \ + clone-max=2 clone-node-max=1 notify=true + crmsh # cib commit drbd_cfg + crmsh # quit + + pcs # pcs cluster cib drbd_cfg + pcs # pcs -f drbd_cfg resource create WebData ocf:linbit:drbd drbd_resource=wwwdata \ + op monitor interval=60s + pcs-0.9 # pcs -f drbd_cfg resource master WebDataClone WebData \ + master-max=1 master-node-max=1 clone-max=2 clone-node-max=1 notify=true + pcs-0.10 # pcs -f drbd_cfg resource promotable WebData WebDataClone \ + promoted-max=1 promoted-node-max=1 clone-max=2 clone-node-max=1 notify=true + pcs # pcs cluster cib-push drbd_cfg + +Template Creation +_________________ + +.. topic:: Create Resource Template Based on Existing Primitives of Same Type + + .. code-block:: none + + crmsh # crm configure assist template ClusterIP AdminIP + +Log Analysis +____________ + +.. topic:: Show Information About Recent Cluster Events + + .. code-block:: none + + crmsh # crm history + crmsh # peinputs + crmsh # transition pe-input-10 + crmsh # transition log pe-input-10 + +Configuration Scripts +_____________________ + +.. topic:: Script Multiple-step Cluster Configurations + + .. code-block:: none + + crmsh # crm script show apache + crmsh # crm script run apache \ + id=WebSite \ + install=true \ + virtual-ip:ip=192.168.0.15 \ + database:id=WebData \ + database:install=true diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/tools.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/tools.rst new file mode 100644 index 0000000..13a4534 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/tools.rst @@ -0,0 +1,576 @@ +.. index:: command-line tool + +Using Pacemaker Command-Line Tools +---------------------------------- + +.. index:: + single: command-line tool; output format + +.. 
_cmdline_output: + +Controlling Command Line Output +############################### + +Some of the pacemaker command line utilities have been converted to a new +output system. Among these tools are ``crm_mon`` and ``stonith_admin``. This +is an ongoing project, and more tools will be converted over time. This system +lets you control the formatting of output with ``--output-as=`` and the +destination of output with ``--output-to=``. + +The available formats vary by tool, but at least plain text and XML are +supported by all tools that use the new system. The default format is plain +text. The default destination is stdout but can be redirected to any file. +Some formats support command line options for changing the style of the output. +For instance: + +.. code-block:: none + + # crm_mon --help-output + Usage: + crm_mon [OPTION?] + + Provides a summary of cluster's current state. + + Outputs varying levels of detail in a number of different formats. + + Output Options: + --output-as=FORMAT Specify output format as one of: console (default), html, text, xml + --output-to=DEST Specify file name for output (or "-" for stdout) + --html-cgi Add text needed to use output in a CGI program + --html-stylesheet=URI Link to an external CSS stylesheet + --html-title=TITLE Page title + +.. index:: + single: crm_mon + single: command-line tool; crm_mon + +.. _crm_mon: + +Monitor a Cluster with crm_mon +############################## + +The ``crm_mon`` utility displays the current state of an active cluster. It can +show the cluster status organized by node or by resource, and can be used in +either single-shot or dynamically updating mode. It can also display operations +performed and information about failures. + +Using this tool, you can examine the state of the cluster for irregularities, +and see how it responds when you cause or simulate failures. + +See the manual page or the output of ``crm_mon --help`` for a full description +of its many options. + +.. topic:: Sample output from crm_mon -1 + + .. code-block:: none + + Cluster Summary: + * Stack: corosync + * Current DC: node2 (version 2.0.0-1) - partition with quorum + * Last updated: Mon Jan 29 12:18:42 2018 + * Last change: Mon Jan 29 12:18:40 2018 by root via crm_attribute on node3 + * 5 nodes configured + * 2 resources configured + + Node List: + * Online: [ node1 node2 node3 node4 node5 ] + + * Active resources: + * Fencing (stonith:fence_xvm): Started node1 + * IP (ocf:heartbeat:IPaddr2): Started node2 + +.. topic:: Sample output from crm_mon -n -1 + + .. code-block:: none + + Cluster Summary: + * Stack: corosync + * Current DC: node2 (version 2.0.0-1) - partition with quorum + * Last updated: Mon Jan 29 12:21:48 2018 + * Last change: Mon Jan 29 12:18:40 2018 by root via crm_attribute on node3 + * 5 nodes configured + * 2 resources configured + + * Node List: + * Node node1: online + * Fencing (stonith:fence_xvm): Started + * Node node2: online + * IP (ocf:heartbeat:IPaddr2): Started + * Node node3: online + * Node node4: online + * Node node5: online + +As mentioned in an earlier chapter, the DC is the node is where decisions are +made. The cluster elects a node to be DC as needed. The only significance of +the choice of DC to an administrator is the fact that its logs will have the +most information about why decisions were made. + +.. index:: + pair: crm_mon; CSS + +.. _crm_mon_css: + +Styling crm_mon HTML output +___________________________ + +Various parts of ``crm_mon``'s HTML output have a CSS class associated with +them. 
Not everything does, but some of the most interesting portions do. In +the following example, the status of each node has an ``online`` class and the +details of each resource have an ``rsc-ok`` class. + +.. code-block:: html + +

+   <h2>Node List</h2>
+   <ul>
+   <li>
+   <span>Node: cluster01 <span class="online">online</span></span>
+   <ul>
+   <li><span class="rsc-ok">ping (ocf::pacemaker:ping): Started</span></li>
+   </ul>
+   </li>
+   <li>
+   <span>Node: cluster02 <span class="online">online</span></span>
+   <ul>
+   <li><span class="rsc-ok">ping (ocf::pacemaker:ping): Started</span></li>
+   </ul>
+   </li>
+   </ul>
+ +By default, a stylesheet for styling these classes is included in the head of +the HTML output. The relevant portions of this stylesheet that would be used +in the above example is: + +.. code-block:: css + + + +If you want to override some or all of the styling, simply create your own +stylesheet, place it on a web server, and pass ``--html-stylesheet=`` +to ``crm_mon``. The link is added after the default stylesheet, so your +changes take precedence. You don't need to duplicate the entire default. +Only include what you want to change. + +.. index:: + single: cibadmin + single: command-line tool; cibadmin + +.. _cibadmin: + +Edit the CIB XML with cibadmin +############################## + +The most flexible tool for modifying the configuration is Pacemaker's +``cibadmin`` command. With ``cibadmin``, you can query, add, remove, update +or replace any part of the configuration. All changes take effect immediately, +so there is no need to perform a reload-like operation. + +The simplest way of using ``cibadmin`` is to use it to save the current +configuration to a temporary file, edit that file with your favorite +text or XML editor, and then upload the revised configuration. + +.. topic:: Safely using an editor to modify the cluster configuration + + .. code-block:: none + + # cibadmin --query > tmp.xml + # vi tmp.xml + # cibadmin --replace --xml-file tmp.xml + +Some of the better XML editors can make use of a RELAX NG schema to +help make sure any changes you make are valid. The schema describing +the configuration can be found in ``pacemaker.rng``, which may be +deployed in a location such as ``/usr/share/pacemaker`` depending on your +operating system distribution and how you installed the software. + +If you want to modify just one section of the configuration, you can +query and replace just that section to avoid modifying any others. + +.. topic:: Safely using an editor to modify only the resources section + + .. code-block:: none + + # cibadmin --query --scope resources > tmp.xml + # vi tmp.xml + # cibadmin --replace --scope resources --xml-file tmp.xml + +To quickly delete a part of the configuration, identify the object you wish to +delete by XML tag and id. For example, you might search the CIB for all +STONITH-related configuration: + +.. topic:: Searching for STONITH-related configuration items + + .. code-block:: none + + # cibadmin --query | grep stonith + + + + + + + + + + + +If you wanted to delete the ``primitive`` tag with id ``child_DoFencing``, +you would run: + +.. code-block:: none + + # cibadmin --delete --xml-text '' + +See the cibadmin man page for more options. + +.. warning:: + + Never edit the live ``cib.xml`` file directly. Pacemaker will detect such + changes and refuse to use the configuration. + + +.. index:: + single: crm_shadow + single: command-line tool; crm_shadow + +.. _crm_shadow: + +Batch Configuration Changes with crm_shadow +########################################### + +Often, it is desirable to preview the effects of a series of configuration +changes before updating the live configuration all at once. For this purpose, +``crm_shadow`` creates a "shadow" copy of the configuration and arranges for +all the command-line tools to use it. + +To begin, simply invoke ``crm_shadow --create`` with a name of your choice, +and follow the simple on-screen instructions. Shadow copies are identified with +a name to make it possible to have more than one. + +.. 
warning:: + + Read this section and the on-screen instructions carefully; failure to do so + could result in destroying the cluster's active configuration! + +.. topic:: Creating and displaying the active sandbox + + .. code-block:: none + + # crm_shadow --create test + Setting up shadow instance + Type Ctrl-D to exit the crm_shadow shell + shadow[test]: + shadow[test] # crm_shadow --which + test + +From this point on, all cluster commands will automatically use the shadow copy +instead of talking to the cluster's active configuration. Once you have +finished experimenting, you can either make the changes active via the +``--commit`` option, or discard them using the ``--delete`` option. Again, be +sure to follow the on-screen instructions carefully! + +For a full list of ``crm_shadow`` options and commands, invoke it with the +``--help`` option. + +.. topic:: Use sandbox to make multiple changes all at once, discard them, and verify real configuration is untouched + + .. code-block:: none + + shadow[test] # crm_failcount -r rsc_c001n01 -G + scope=status name=fail-count-rsc_c001n01 value=0 + shadow[test] # crm_standby --node c001n02 -v on + shadow[test] # crm_standby --node c001n02 -G + scope=nodes name=standby value=on + + shadow[test] # cibadmin --erase --force + shadow[test] # cibadmin --query + + + + + + + + + + shadow[test] # crm_shadow --delete test --force + Now type Ctrl-D to exit the crm_shadow shell + shadow[test] # exit + # crm_shadow --which + No active shadow configuration defined + # cibadmin -Q + + + + + + + +See the next section, :ref:`crm_simulate`, for how to test your changes before +committing them to the live cluster. + + +.. index:: + single: crm_simulate + single: command-line tool; crm_simulate + +.. _crm_simulate: + +Simulate Cluster Activity with crm_simulate +########################################### + +The command-line tool `crm_simulate` shows the results of the same logic +the cluster itself uses to respond to a particular cluster configuration and +status. + +As always, the man page is the primary documentation, and should be consulted +for further details. This section aims for a better conceptual explanation and +practical examples. + +Replaying cluster decision-making logic +_______________________________________ + +At any given time, one node in a Pacemaker cluster will be elected DC, and that +node will run Pacemaker's scheduler to make decisions. + +Each time decisions need to be made (a "transition"), the DC will have log +messages like "Calculated transition ... saving inputs in ..." with a file +name. You can grab the named file and replay the cluster logic to see why +particular decisions were made. The file contains the live cluster +configuration at that moment, so you can also look at it directly to see the +value of node attributes, etc., at that time. + +The simplest usage is (replacing $FILENAME with the actual file name): + +.. topic:: Simulate cluster response to a given CIB + + .. code-block:: none + + # crm_simulate --simulate --xml-file $FILENAME + +That will show the cluster state when the process started, the actions that +need to be taken ("Transition Summary"), and the resulting cluster state if the +actions succeed. Most actions will have a brief description of why they were +required. + +The transition inputs may be compressed. ``crm_simulate`` can handle these +compressed files directly, though if you want to edit the file, you'll need to +uncompress it first. 
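+
+For example, to edit a copy of a compressed transition input before replaying
+it (the file name below is only an illustration, and the directory may differ
+depending on your build settings):
+
+.. code-block:: none
+
+   # cp /var/lib/pacemaker/pengine/pe-input-10.bz2 /tmp/
+   # bunzip2 /tmp/pe-input-10.bz2
+   # vi /tmp/pe-input-10
+   # crm_simulate --simulate --xml-file /tmp/pe-input-10
+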
+ +You can do the same simulation for the live cluster configuration at the +current moment. This is useful mainly when using ``crm_shadow`` to create a +sandbox version of the CIB; the ``--live-check`` option will use the shadow CIB +if one is in effect. + +.. topic:: Simulate cluster response to current live CIB or shadow CIB + + .. code-block:: none + + # crm_simulate --simulate --live-check + + +Why decisions were made +_______________________ + +To get further insight into the "why", it gets user-unfriendly very quickly. If +you add the ``--show-scores`` option, you will also see all the scores that +went into the decision-making. The node with the highest cumulative score for a +resource will run it. You can look for ``-INFINITY`` scores in particular to +see where complete bans came into effect. + +You can also add ``-VVVV`` to get more detailed messages about what's happening +under the hood. You can add up to two more V's even, but that's usually useful +only if you're a masochist or tracing through the source code. + + +Visualizing the action sequence +_______________________________ + +Another handy feature is the ability to generate a visual graph of the actions +needed, using the ``--save-dotfile`` option. This relies on the separate +Graphviz [#]_ project. + +.. topic:: Generate a visual graph of cluster actions from a saved CIB + + .. code-block:: none + + # crm_simulate --simulate --xml-file $FILENAME --save-dotfile $FILENAME.dot + # dot $FILENAME.dot -Tsvg > $FILENAME.svg + +``$FILENAME.dot`` will contain a GraphViz representation of the cluster's +response to your changes, including all actions with their ordering +dependencies. + +``$FILENAME.svg`` will be the same information in a standard graphical format +that you can view in your browser or other app of choice. You could, of course, +use other ``dot`` options to generate other formats. + +How to interpret the graphical output: + + * Bubbles indicate actions, and arrows indicate ordering dependencies + * Resource actions have text of the form + ``__ `` indicating that the + specified action will be executed for the specified resource on the + specified node, once if interval is 0 or at specified recurring interval + otherwise + * Actions with black text will be sent to the executor (that is, the + appropriate agent will be invoked) + * Actions with orange text are "pseudo" actions that the cluster uses + internally for ordering but require no real activity + * Actions with a solid green border are part of the transition (that is, the + cluster will attempt to execute them in the given order -- though a + transition can be interrupted by action failure or new events) + * Dashed arrows indicate dependencies that are not present in the transition + graph + * Actions with a dashed border will not be executed. If the dashed border is + blue, the cluster does not feel the action needs to be executed. If the + dashed border is red, the cluster would like to execute the action but + cannot. Any actions depending on an action with a dashed border will not be + able to execute. + * Loops should not happen, and should be reported as a bug if found. + +.. topic:: Small Cluster Transition + + .. 
+
+In the above example, it appears that a new node, ``pcmk-2``, has come online
+and that the cluster is checking to make sure ``rsc1``, ``rsc2`` and ``rsc3``
+are not already running there (indicated by the ``rscN_monitor_0`` entries).
+Once it did that, and assuming the resources were not active there, it would
+have liked to stop ``rsc1`` and ``rsc2`` on ``pcmk-1`` and move them to
+``pcmk-2``. However, there appears to be some problem and the cluster cannot or
+is not permitted to perform the stop actions, which implies it also cannot
+perform the start actions. For some reason, the cluster does not want to start
+``rsc3`` anywhere.
+
+.. topic:: Complex Cluster Transition
+
+   .. image:: ../shared/images/Policy-Engine-big.png
+      :alt: Complex transition graph that you're not expected to be able to read
+      :align: center
+
+
+What-if scenarios
+_________________
+
+You can make changes to the saved or shadow CIB and simulate it again, to see
+how Pacemaker would react differently. You can edit the XML by hand, use
+command-line tools such as ``cibadmin`` with either a shadow CIB or the
+``CIB_file`` environment variable set to the filename, or use higher-level tool
+support (see the man pages of the specific tool you're using for how to perform
+actions on a saved CIB file rather than the live CIB).
+
+You can also inject node failures and/or action failures into the simulation;
+see the ``crm_simulate`` man page for more details.
+
+This capability is useful when using a shadow CIB to edit the configuration.
+Before committing the changes to the live cluster with ``crm_shadow --commit``,
+you can use ``crm_simulate`` to see how the cluster will react to the changes.
+
+.. _crm_attribute:
+
+.. index::
+   single: attrd_updater
+   single: command-line tool; attrd_updater
+   single: crm_attribute
+   single: command-line tool; crm_attribute
+
+Manage Node Attributes, Cluster Options and Defaults with crm_attribute and attrd_updater
+##########################################################################################
+
+``crm_attribute`` and ``attrd_updater`` are confusingly similar tools with subtle
+differences.
+
+``attrd_updater`` can query and update node attributes. ``crm_attribute`` can query
+and update not only node attributes, but also cluster options, resource
+defaults, and operation defaults.
+
+To understand the differences, it helps to understand the various types of node
+attribute.
+
+.. list-table:: **Types of Node Attributes**
+   :widths: 20 16 16 16 16 16
+   :header-rows: 1
+
+   * - Type
+     - Recorded in CIB?
+     - Recorded in attribute manager memory?
+     - Survive full cluster restart?
+     - Manageable by crm_attribute?
+     - Manageable by attrd_updater?
+   * - permanent
+     - yes
+     - no
+     - yes
+     - yes
+     - no
+   * - transient
+     - yes
+     - yes
+     - no
+     - yes
+     - yes
+   * - private
+     - no
+     - yes
+     - no
+     - no
+     - yes
+
+As you can see from the table above, ``crm_attribute`` can manage permanent and
+transient node attributes, while ``attrd_updater`` can manage transient and
+private node attributes.
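+
+For example (the node and attribute names here are only placeholders), a
+permanent attribute has to be set with ``crm_attribute``, while a private
+attribute has to be set with ``attrd_updater``:
+
+.. topic:: Set permanent, transient, and private node attributes
+
+   .. code-block:: none
+
+      # crm_attribute --type nodes --node node1 --name site --update raleigh
+      # attrd_updater --node node1 --name my-transient-attr --update 1
+      # attrd_updater --node node1 --name my-private-attr --update 1 --private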
+ +The difference between the two tools lies mainly in *how* they update node +attributes: ``attrd_updater`` always contacts the Pacemaker attribute manager +directly, while ``crm_attribute`` will contact the attribute manager only for +transient node attributes, and will instead modify the CIB directly for +permanent node attributes (and for transient node attributes when unable to +contact the attribute manager). + +By contacting the attribute manager directly, ``attrd_updater`` can change +an attribute's "dampening" (whether changes are immediately flushed to the CIB +or after a specified amount of time, to minimize disk writes for frequent +changes), set private node attributes (which are never written to the CIB), and +set attributes for nodes that don't yet exist. + +By modifying the CIB directly, ``crm_attribute`` can set permanent node +attributes (which are only in the CIB and not managed by the attribute +manager), and can be used with saved CIB files and shadow CIBs. + +However a transient node attribute is set, it is synchronized between the CIB +and the attribute manager, on all nodes. + + +.. index:: + single: crm_failcount + single: command-line tool; crm_failcount + single: crm_node + single: command-line tool; crm_node + single: crm_report + single: command-line tool; crm_report + single: crm_standby + single: command-line tool; crm_standby + single: crm_verify + single: command-line tool; crm_verify + single: stonith_admin + single: command-line tool; stonith_admin + +Other Commonly Used Tools +######################### + +Other command-line tools include: + +* ``crm_failcount``: query or delete resource fail counts +* ``crm_node``: manage cluster nodes +* ``crm_report``: generate a detailed cluster report for bug submissions +* ``crm_resource``: manage cluster resources +* ``crm_standby``: manage standby status of nodes +* ``crm_verify``: validate a CIB +* ``stonith_admin``: manage fencing devices + +See the manual pages for details. + +.. rubric:: Footnotes + +.. [#] Graph visualization software. See http://www.graphviz.org/ for details. diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/troubleshooting.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/troubleshooting.rst new file mode 100644 index 0000000..ac1b810 --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/troubleshooting.rst @@ -0,0 +1,128 @@ +.. index:: troubleshooting + +Troubleshooting Cluster Problems +-------------------------------- + +.. index:: logging, pacemaker.log + +Logging +####### + +Pacemaker by default logs messages of ``notice`` severity and higher to the +system log, and messages of ``info`` severity and higher to the detail log, +which by default is ``/var/log/pacemaker/pacemaker.log``. + +Logging options can be controlled via environment variables at Pacemaker +start-up. Where these are set varies by operating system (often +``/etc/sysconfig/pacemaker`` or ``/etc/default/pacemaker``). See the comments +in that file for details. + +Because cluster problems are often highly complex, involving multiple machines, +cluster daemons, and managed services, Pacemaker logs rather verbosely to +provide as much context as possible. It is an ongoing priority to make these +logs more user-friendly, but by necessity there is a lot of obscure, low-level +information that can make them difficult to follow. 
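+
+As a small illustration of the start-up environment variables mentioned above,
+entries such as the following may be set in that file (variable names and
+availability can vary by version; the comments in the file itself are
+authoritative):
+
+.. code-block:: none
+
+   # Example /etc/sysconfig/pacemaker (or /etc/default/pacemaker) entries
+   PCMK_logfile=/var/log/pacemaker/pacemaker.log
+   PCMK_debug=yes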
+ +The default log rotation configuration shipped with Pacemaker (typically +installed in ``/etc/logrotate.d/pacemaker``) rotates the log when it reaches +100MB in size, or weekly, whichever comes first. + +If you configure debug or (Heaven forbid) trace-level logging, the logs can +grow enormous quite quickly. Because rotated logs are by default named with the +year, month, and day only, this can cause name collisions if your logs exceed +100MB in a single day. You can add ``dateformat -%Y%m%d-%H`` to the rotation +configuration to avoid this. + +Reading the Logs +################ + +When troubleshooting, first check the system log or journal for errors or +warnings from Pacemaker components (conveniently, they will all have +"pacemaker" in their logged process name). For example: + +.. code-block:: none + + # grep 'pacemaker.*\(error\|warning\)' /var/log/messages + Mar 29 14:04:19 node1 pacemaker-controld[86636]: error: Result of monitor operation for rn2 on node1: Timed Out after 45s (Remote executor did not respond) + +If that doesn't give sufficient information, next look at the ``notice`` level +messages from ``pacemaker-controld``. These will show changes in the state of +cluster nodes. On the DC, this will also show resource actions attempted. For +example: + +.. code-block:: none + + # grep 'pacemaker-controld.*notice:' /var/log/messages + ... output skipped for brevity ... + Mar 29 14:05:36 node1 pacemaker-controld[86636]: notice: Node rn2 state is now lost + ... more output skipped for brevity ... + Mar 29 14:12:17 node1 pacemaker-controld[86636]: notice: Initiating stop operation rsc1_stop_0 on node4 + ... more output skipped for brevity ... + +Of course, you can use other tools besides ``grep`` to search the logs. + + +.. index:: transition + +Transitions +########### + +A key concept in understanding how a Pacemaker cluster functions is a +*transition*. A transition is a set of actions that need to be taken to bring +the cluster from its current state to the desired state (as expressed by the +configuration). + +Whenever a relevant event happens (a node joining or leaving the cluster, +a resource failing, etc.), the controller will ask the scheduler to recalculate +the status of the cluster, which generates a new transition. The controller +then performs the actions in the transition in the proper order. + +Each transition can be identified in the DC's logs by a line like: + +.. code-block:: none + + notice: Calculated transition 19, saving inputs in /var/lib/pacemaker/pengine/pe-input-1463.bz2 + +The file listed as the "inputs" is a snapshot of the cluster configuration and +state at that moment (the CIB). This file can help determine why particular +actions were scheduled. The ``crm_simulate`` command, described in +:ref:`crm_simulate`, can be used to replay the file. + +The log messages immediately before the "saving inputs" message will include +any actions that the scheduler thinks need to be done. + +.. important:: + + Any actions that have already been initiated must complete (or time out) + before a new transition can be calculated. + + +Node Failures +############# + +When a node fails, and looking at errors and warnings doesn't give an obvious +explanation, try to answer questions like the following based on log messages: + +* When and what was the last successful message on the node itself, or about + that node in the other nodes' logs? +* Did pacemaker-controld on the other nodes notice the node leave? 
+* Did pacemaker-controld on the DC invoke the scheduler and schedule a new + transition? +* Did the transition include fencing the failed node? +* Was fencing attempted? +* Did fencing succeed? + +Resource Failures +################# + +When a resource fails, and looking at errors and warnings doesn't give an +obvious explanation, try to answer questions like the following based on log +messages: + +* Did pacemaker-controld record the result of the failed resource action? +* What was the failed action's execution status and exit status? +* What code in the resource agent could result in those status codes? +* Did pacemaker-controld on the DC invoke the scheduler and schedule a new + transition? +* Did the new transition include recovery of the resource? +* Were the recovery actions initiated, and what were their results? diff --git a/.claude/commands/etcd/pacemaker/Pacemaker_Administration/upgrading.rst b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/upgrading.rst new file mode 100644 index 0000000..3dd46da --- /dev/null +++ b/.claude/commands/etcd/pacemaker/Pacemaker_Administration/upgrading.rst @@ -0,0 +1,579 @@ +.. index:: upgrade + +Upgrading a Pacemaker Cluster +----------------------------- + +.. index:: version + +Pacemaker Versioning +#################### + +Pacemaker has an overall release version, plus separate version numbers for +certain internal components. + +.. index:: + single: version; release + +* **Pacemaker release version:** This version consists of three numbers + (*x.y.z*). + + The major version number (the *x* in *x.y.z*) increases when at least some + rolling upgrades are not possible from the previous major version. For example, + a rolling upgrade from 1.0.8 to 1.1.15 should always be supported, but a + rolling upgrade from 1.0.8 to 2.0.0 may not be possible. + + The minor version (the *y* in *x.y.z*) increases when there are significant + changes in cluster default behavior, tool behavior, and/or the API interface + (for software that utilizes Pacemaker libraries). The main benefit is to alert + you to pay closer attention to the release notes, to see if you might be + affected. + + The release counter (the *z* in *x.y.z*) is increased with all public releases + of Pacemaker, which typically include both bug fixes and new features. + +.. index:: + single: feature set + single: version; feature set + +* **CRM feature set:** This version number applies to the communication between + full cluster nodes, and is used to avoid problems in mixed-version clusters. + + The major version number increases when nodes with different versions would not + work (rolling upgrades are not allowed). The minor version number increases + when mixed-version clusters are allowed only during rolling upgrades. The + minor-minor version number is ignored, but allows resource agents to detect + cluster support for various features. [#]_ + + Pacemaker ensures that the longest-running node is the cluster's DC. This + ensures new features are not enabled until all nodes are upgraded to support + them. + +.. index:: + single: version; Pacemaker Remote protocol + +* **Pacemaker Remote protocol version:** This version applies to communication + between a Pacemaker Remote node and the cluster. It increases when an older + cluster node would have problems hosting the connection to a newer + Pacemaker Remote node. To avoid these problems, Pacemaker Remote nodes will + accept connections only from cluster nodes with the same or newer + Pacemaker Remote protocol version. 
+
+  Unlike with CRM feature set differences between full cluster nodes,
+  mixed Pacemaker Remote protocol versions between Pacemaker Remote nodes and
+  full cluster nodes are fine, as long as the Pacemaker Remote nodes have the
+  older version. This can be useful, for example, to host a legacy application
+  in an older operating system version used as a Pacemaker Remote node.
+
+.. index::
+   single: version; XML schema
+
+* **XML schema version:** Pacemaker's configuration syntax — what's allowed in
+  the Cluster Information Base (CIB) — has its own version. This allows
+  the configuration syntax to evolve over time while still allowing clusters
+  with older configurations to work without change.
+
+
+.. index::
+   single: upgrade; methods
+
+Upgrading Cluster Software
+##########################
+
+There are three approaches to upgrading a cluster, each with advantages and
+disadvantages.
+
+.. list-table:: **Upgrade Methods**
+   :widths: 16 14 14 14 14 14 14
+   :header-rows: 1
+
+   * - Method
+     - Available between all versions
+     - Can be used with Pacemaker Remote nodes
+     - Service outage during upgrade
+     - Service recovery during upgrade
+     - Exercises failover logic
+     - Allows change of messaging layer [#]_
+   * - Complete cluster shutdown
+     - yes
+     - yes
+     - always
+     - N/A
+     - no
+     - yes
+   * - Rolling (node by node)
+     - no
+     - yes
+     - always [#]_
+     - yes
+     - yes
+     - no
+   * - Detach and reattach
+     - yes
+     - no
+     - only due to failure
+     - no
+     - no
+     - yes
+
+
+.. index::
+   single: upgrade; shutdown
+
+Complete Cluster Shutdown
+_________________________
+
+In this scenario, one shuts down all cluster nodes and resources,
+then upgrades all the nodes before restarting the cluster.
+
+#. On each node:
+
+   a. Shut down the cluster software (pacemaker and the messaging layer).
+   #. Upgrade the Pacemaker software. This may also include upgrading the
+      messaging layer and/or the underlying operating system.
+   #. Check the configuration with the ``crm_verify`` tool.
+
+#. On each node:
+
+   a. Start the cluster software.
+
+Currently, only Corosync version 2 and greater is supported as the cluster
+layer, but if another stack is supported in the future, the stack does not
+need to be the same one before the upgrade.
+
+One variation of this approach is to build a new cluster on new hosts.
+This allows the new version to be tested beforehand, and minimizes downtime by
+having the new nodes ready to be placed in production as soon as the old nodes
+are shut down.
+
+
+.. index::
+   single: upgrade; rolling upgrade
+
+Rolling (node by node)
+______________________
+
+In this scenario, each node is removed from the cluster, upgraded, and then
+brought back online, until all nodes are running the newest version.
+
+Special considerations when planning a rolling upgrade:
+
+* If you plan to upgrade other cluster software -- such as the messaging layer --
+  at the same time, consult that software's documentation for its compatibility
+  with a rolling upgrade.
+
+* If the major version number is changing in the Pacemaker version you are
+  upgrading to, a rolling upgrade may not be possible. Read the new version's
+  release notes (as well as the information here) for what limitations may exist.
+
+* If the CRM feature set is changing in the Pacemaker version you are upgrading
+  to, you should run a mixed-version cluster only during a small rolling
+  upgrade window. If one of the older nodes drops out of the cluster for any
+  reason, it will not be able to rejoin until it is upgraded.
+ +* If the Pacemaker Remote protocol version is changing, all cluster nodes + should be upgraded before upgrading any Pacemaker Remote nodes. + +See the +`Pacemaker release calendar +`_ +on the ClusterLabs wiki to figure out whether the CRM feature set and/or +Pacemaker Remote protocol version changed between the Pacemaker release versions +in your rolling upgrade. + +To perform a rolling upgrade, on each node in turn: + +#. Put the node into standby mode, and wait for any active resources + to be moved cleanly to another node. (This step is optional, but + allows you to deal with any resource issues before the upgrade.) +#. Shut down Pacemaker or ``pacemaker-remoted``. +#. If a cluster node, shut down the messaging layer. +#. Upgrade the Pacemaker software. This may also include upgrading the + messaging layer and/or the underlying operating system. +#. If this is the first node to be upgraded, check the configuration + with the ``crm_verify`` tool. +#. If a cluster node, start the messaging layer. + This must be the same messaging layer (currently only Corosync version 2 and + greater is supported) that the rest of the cluster is using. +#. Start Pacemaker or ``pacemaker-remoted``. + +.. note:: + + Even if a rolling upgrade from the current version of the cluster to the + newest version is not directly possible, it may be possible to perform a + rolling upgrade in multiple steps, by upgrading to an intermediate version + first. + +The following table lists compatible versions for all other nodes in the cluster +when upgrading a cluster node. + +.. list-table:: **Version Compatibility for Cluster Nodes** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Version Being Installed + - Minimum Compatible Version + * - Pacemaker 3.y.z + - Pacemaker 2.0.0 + * - Pacemaker 2.y.z + - Pacemaker 1.1.11 [#]_ + * - Pacemaker 1.y.z + - Pacemaker 1.0.0 + * - Pacemaker 0.6.z to 0.7.z + - Pacemaker 0.6.0 + +When upgrading a Pacemaker Remote node, all cluster nodes must be running at +least the minimum version listed in the table below. + +.. list-table:: **Cluster Node Version Compatibility for Pacemaker Remote Nodes** + :class: longtable + :widths: 50 50 + :header-rows: 1 + + * - Pacemaker Remote Version + - Minimum Cluster Node Version + * - Pacemaker 3.y.z + - Pacemaker 2.0.0 + * - Pacemaker 1.1.9 to 2.1.z + - Pacemaker 1.1.9 [#]_ + +.. index:: + single: upgrade; detach and reattach + +Detach and Reattach +___________________ + +The reattach method is a variant of a complete cluster shutdown, where the +resources are left active and get re-detected when the cluster is restarted. + +This method may not be used if the cluster contains any Pacemaker Remote nodes. + +#. Tell the cluster to stop managing services. This is required to allow the + services to remain active after the cluster shuts down. + + .. code-block:: none + + # crm_attribute --name maintenance-mode --update true + +#. On each node, shutdown the cluster software (pacemaker and the messaging + layer), and upgrade the Pacemaker software. This may also include upgrading + the messaging layer. While the underlying operating system may be upgraded + at the same time, that will be more likely to cause outages in the detached + services (certainly, if a reboot is required). +#. Check the configuration with the ``crm_verify`` tool. +#. On each node, start the cluster software. 
+ Currently, only Corosync version 2 and greater is supported as the cluster + layer, but if another stack is supported in the future, the stack does not + need to be the same one before the upgrade. +#. Verify that the cluster re-detected all resources correctly. +#. Allow the cluster to resume managing resources again: + + .. code-block:: none + + # crm_attribute --name maintenance-mode --delete + +.. note:: + + While the goal of the detach-and-reattach method is to avoid disturbing + running services, resources may still move after the upgrade if any + resource's location is governed by a rule based on transient node + attributes. Transient node attributes are erased when the node leaves the + cluster. A common example is using the ``ocf:pacemaker:ping`` resource to + set a node attribute used to locate other resources. + +.. index:: + pair: upgrade; CIB + +Upgrading the Configuration +########################### + +The CIB schema version can change from one Pacemaker version to another. + +After cluster software is upgraded, the cluster will continue to use the older +schema version that it was previously using. This can be useful, for example, +when administrators have written tools that modify the configuration, and are +based on the older syntax. [#]_ + +However, when using an older syntax, new features may be unavailable, and there +is a performance impact, since the cluster must do a non-persistent +configuration upgrade before each transition. So while using the old syntax is +possible, it is not advisable to continue using it indefinitely. + +Even if you wish to continue using the old syntax, it is a good idea to +follow the upgrade procedure outlined below, except for the last step, to ensure +that the new software has no problems with your existing configuration (since it +will perform much the same task internally). + +If you are brave, it is sufficient simply to run ``cibadmin --upgrade``. + +A more cautious approach would proceed like this: + +#. Create a shadow copy of the configuration. The later commands will + automatically operate on this copy, rather than the live configuration. + + .. code-block:: none + + # crm_shadow --create shadow + +.. index:: + single: configuration; verify + +#. Verify the configuration is valid with the new software (which may be + stricter about syntax mistakes, or may have dropped support for deprecated + features): + + .. code-block:: none + + # crm_verify --live-check + +#. Fix any errors or warnings. +#. Perform the upgrade: + + .. code-block:: none + + # cibadmin --upgrade + +#. If this step fails, there are three main possibilities: + + a. The configuration was not valid to start with (did you do steps 2 and + 3?). + #. The transformation failed; `report a bug `_. + #. The transformation was successful but produced an invalid result. + + If the result of the transformation is invalid, you may see a number of + errors from the validation library. If these are not helpful, try the manual + upgrade procedure described below. + +#. Check the changes: + + .. code-block:: none + + # crm_shadow --diff + + If at this point there is anything about the upgrade that you wish to + fine-tune (for example, to change some of the automatic IDs), now is the + time to do so: + + .. code-block:: none + + # crm_shadow --edit + + This will open the configuration in your favorite editor (whichever is + specified by the standard ``$EDITOR`` environment variable). + +#. Preview how the cluster will react: + + .. 
code-block:: none + + # crm_simulate --live-check --save-dotfile shadow.dot -S + # dot -Tsvg shadow.dot -o shadow.svg + + You can then view shadow.svg with any compatible image viewer or web + browser. Verify that either no resource actions will occur or that you are + happy with any that are scheduled. If the output contains actions you do + not expect (possibly due to changes to the score calculations), you may need + to make further manual changes. See :ref:`crm_simulate` for further details + on how to interpret the output of ``crm_simulate`` and ``dot``. + +#. Upload the changes: + + .. code-block:: none + + # crm_shadow --commit shadow --force + + In the unlikely event this step fails, please report a bug. + +.. note:: + + It is also possible to perform the configuration upgrade steps manually: + + #. Locate the ``upgrade*.xsl`` conversion scripts provided with the source + code. These will often be installed in a location such as + ``/usr/share/pacemaker``, or may be obtained from the + `source repository `_. + + #. Run the conversion scripts that apply to your older version, for example: + + .. code-block:: none + + # xsltproc /path/to/upgrade06.xsl config06.xml > config10.xml + + #. Locate the ``pacemaker.rng`` script (from the same location as the xsl + files). + #. Check the XML validity: + + .. code-block:: none + + # xmllint --relaxng /path/to/pacemaker.rng config10.xml + + The advantage of this method is that it can be performed without the cluster + running, and any validation errors are often more informative. + + +What Changed in 2.1 +################### + +The Pacemaker 2.1 release is fully backward-compatible in both the CIB XML and +the C API. Highlights: + +* Pacemaker now supports the **OCF Resource Agent API version 1.1**. + Most notably, the ``Master`` and ``Slave`` role names have been renamed to + ``Promoted`` and ``Unpromoted``. + +* Pacemaker now supports colocations where the dependent resource does not + affect the primary resource's placement (via a new ``influence`` colocation + constraint option and ``critical`` resource meta-attribute). This is intended + for cases where a less-important resource must be colocated with an essential + resource, but it is preferred to leave the less-important resource stopped if + it fails, rather than move both resources. + +* If Pacemaker is built with libqb 2.0 or later, the detail log will use + **millisecond-resolution timestamps**. + +* In addition to crm_mon and stonith_admin, the crmadmin, crm_resource, + crm_simulate, and crm_verify commands now support the ``--output-as`` and + ``--output-to`` options, including **XML output** (which scripts and + higher-level tools are strongly recommended to use instead of trying to parse + the text output, which may change from release to release). + +For a detailed list of changes, see the release notes and +`Pacemaker 2.1 Changes +`_ +on the ClusterLabs wiki. + + +What Changed in 2.0 +################### + +The main goal of the 2.0 release was to remove support for deprecated syntax, +along with some small changes in default configuration behavior and tool +behavior. Highlights: + +* Only Corosync version 2 and greater is now supported as the underlying + cluster layer. Support for Heartbeat and Corosync 1 (including CMAN) is + removed. + +* The Pacemaker detail log file is now stored in + ``/var/log/pacemaker/pacemaker.log`` by default. 
+ +* The record-pending cluster property now defaults to true, which + allows status tools such as crm_mon to show operations that are in + progress. + +* Support for a number of deprecated build options, environment variables, + and configuration settings has been removed. + +* The ``master`` tag has been deprecated in favor of using the ``clone`` tag + with the new ``promotable`` meta-attribute set to ``true``. "Master/slave" + clone resources are now referred to as "promotable" clone resources. + +* The public API for Pacemaker libraries that software applications can use + has changed significantly. + +For a detailed list of changes, see the release notes and +`Pacemaker 2.0 Changes +`_ +on the ClusterLabs wiki. + + +What Changed in 1.0 +################### + +New +___ + +* Failure timeouts. +* New section for resource and operation defaults. +* Tool for making offline configuration changes. +* ``Rules``, ``instance_attributes``, ``meta_attributes`` and sets of + operations can be defined once and referenced in multiple places. +* The CIB now accepts XPath-based create/modify/delete operations. See + ``cibadmin --help``. +* Multi-dimensional colocation and ordering constraints. +* The ability to connect to the CIB from non-cluster machines. +* Allow recurring actions to be triggered at known times. + + +Changed +_______ + +* Syntax + + * All resource and cluster options now use dashes (-) instead of underscores + (_) + * ``master_slave`` was renamed to ``master`` + * The ``attributes`` container tag was removed + * The operation field ``pre-req`` has been renamed ``requires`` + * All operations must have an ``interval``, ``start``/``stop`` must have it + set to zero + +* The ``fencing-enabled`` option now defaults to true. +* The cluster will refuse to start resources if ``fencing-enabled`` is true (or + unset) and no STONITH resources have been defined +* The attributes of colocation and ordering constraints were renamed for + clarity. +* ``resource-failure-stickiness`` has been replaced by ``migration-threshold``. +* The parameters for command-line tools have been made consistent +* Switched to 'RelaxNG' schema validation and 'libxml2' parser + + * id fields are now XML IDs which have the following limitations: + + * id's cannot contain colons (:) + * id's cannot begin with a number + * id's must be globally unique (not just unique for that tag) + + * Some fields (such as those in constraints that refer to resources) are + IDREFs. + + This means that they must reference existing resources or objects in + order for the configuration to be valid. Removing an object which is + referenced elsewhere will therefore fail. + + * The CIB representation, from which a MD5 digest is calculated to verify + CIBs on the nodes, has changed. + + This means that every CIB update will require a full refresh on any + upgraded nodes until the cluster is fully upgraded to 1.0. This will result + in significant performance degradation and it is therefore highly + inadvisable to run a mixed 1.0/0.6 cluster for any longer than absolutely + necessary. + +* Ping node information no longer needs to be added to ``ha.cf``. Simply + include the lists of hosts in your ping resource(s). + + +Removed +_______ + + +* Syntax + + * It is no longer possible to set resource meta options as top-level + attributes. Use meta-attributes instead. + * Resource and operation defaults are no longer read from ``crm_config``. + +.. rubric:: Footnotes + +.. 
[#] Before CRM feature set 3.1.0 (Pacemaker 2.0.0), the minor-minor version + number was treated the same as the minor version. + +.. [#] Currently, Corosync version 2 and greater is the only supported cluster + stack, but other stacks have been supported by past versions, and may be + supported by future versions. + +.. [#] Any active resources will be moved off the node being upgraded, so there + will be at least a brief outage unless all resources can be migrated + "live". + +.. [#] Rolling upgrades from Pacemaker 1.1.z to 2.y.z are possible only if the + cluster uses corosync version 2 or greater as its messaging layer, and + the Cluster Information Base (CIB) uses schema 1.0 or higher in its + ``validate-with`` property. + +.. [#] Pacemaker Remote versions 1.1.15 through 1.1.17 require cluster nodes to + be at least version 1.1.15. Version 1.1.15 introduced an accidental + remote protocol version bump, breaking rolling upgrade compatibility with + older versions. This was fixed in 1.1.18. + +.. [#] As of Pacemaker 2.0.0, only schema versions pacemaker-1.0 and higher + are supported (excluding pacemaker-1.1, which was a special case). diff --git a/.claude/commands/etcd/pacemaker/podman-etcd.sh b/.claude/commands/etcd/pacemaker/podman-etcd.sh new file mode 100644 index 0000000..b8dfb2f --- /dev/null +++ b/.claude/commands/etcd/pacemaker/podman-etcd.sh @@ -0,0 +1,2052 @@ +#!/bin/sh +# +# The podman etcd HA resource agent creates and launches a etcd podman +# container based off a supplied podman image. Containers managed by +# this agent are both created and removed upon the agent's start and +# stop actions. +# +# Based on the podman resource agent. +# +# Copyright (c) 2014 David Vossel +# Michele Baldessari +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults +OCF_RESKEY_image_default="default" +OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml" +OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs" +OCF_RESKEY_name_default="etcd" +OCF_RESKEY_nic_default="br-ex" +OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" +OCF_RESKEY_allow_pull_default="1" +OCF_RESKEY_reuse_default="0" +OCF_RESKEY_oom_default="-997" +OCF_RESKEY_config_location_default="/var/lib/etcd" +OCF_RESKEY_backup_location_default="/var/lib/etcd" + +: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} +: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} +: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} +: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} +: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}} +: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} +: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}} +: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}} +: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}} + + +####################################################################### + +meta_data() +{ + cat < + + +1.0 + + +The podman-etcd HA resource agent creates and launches a etcd podman +container based off a supplied podman image. Containers managed by +this agent are both created and removed upon the agent's start and +stop actions. + +Podman etcd container resource agent. + + + + +The Pod manifest with the configuration for Etcd. + +Etcd pod manifest + + + + + +The Etcd certificates directory mounted into the etcd container. +The agent will monitor this directory for changes and restart the etcd container if the certificates have changed. + +Etcd certificates directory + + + + + +The podman image to base this container off of. + +podman image + + + + + +The name to give the created container. By default this will +be that resource's instance name. + +podman container name + + + + + +A mapping of node names to IPs. + +This takes the form of: +n1:ip1;n2:ip2 + +where the etcd container on n1 would have IP ip1 + +Container node name to IP mapping + + + + + +Network interface to lookup interface for host. + +Network interface + + + + + +Path of the authentication file. + +The file is created by podman login. + +Path of the authentication file + + + + + +Allow the image to be pulled from the configured podman registry when +the image does not exist locally. NOTE, this can drastically increase +the time required to start the container if the image repository is +pulled over the network. + +Allow pulling non-local images + + + + + +Add options to be appended to the 'podman run' command which is used +when creating the container during the start action. This option allows +users to do things such as setting a custom entry point and injecting +environment variables into the newly created container. Note the '-d' +option is supplied regardless of this value to force containers to run +in the background. + +NOTE: Do not explicitly specify the --name argument in the run_opts. This +agent will set --name using either the resource's instance or the name +provided in the 'name' argument of this agent. + + +run options + + + + + +Specify a command to launch within the container once +it has initialized. 
+ +run command + + + + + +Options to be added to the 'run_cmd'. + +run command options + + + + + +A comma separated list of directories that the container is expecting to use. +The agent will ensure they exist by running 'mkdir -p' + +Required mount points + + + + + +Specify the full path of a command to launch within the container to check +the health of the container. This command must return 0 to indicate that +the container is healthy. A non-zero return code will indicate that the +container has failed and should be recovered. + +Note: Using this method for monitoring processes inside a container +is not recommended, as containerd tries to track processes running +inside the container and does not deal well with many short-lived +processes being spawned. Ensure that your container monitors its +own processes and terminates on fatal error rather than invoking +a command from the outside. + +monitor command + + + + + +Kill a container immediately rather than waiting for it to gracefully +shutdown + +force kill + + + + + +Allow the container to be reused once it is stopped. By default, +containers get removed once they are stopped. Enable this option +to have the particular one persist when this happens. + +reuse container + + + + + +Use transient drop-in files to add extra dependencies to the systemd +scopes associated to the container. During reboot, this prevents systemd +to stop the container before pacemaker. + +drop-in dependency + + + + + +Tune the host's Out-Of-Memory (OOM) preferences for containers (accepts values from -1000 to 1000). +Default to same OOM score as system-node-critical +https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior + +OOM for container + + + + + +The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest. + +Resource agent state directory + + + + + +The directory where the resource agent stores its backups. + +Resource agent backup directory + + + + + + + + + + + + + +END +} + +####################################################################### +REQUIRE_IMAGE_PULL=0 + +podman_usage() +{ + cat < "$ETCD_CERTS_HASH_FILE" + ocf_log info "created initial certificate hash: $current_hash" + return $OCF_SUCCESS + fi + + case "$action" in + "update") + if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then + ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE" + fi + ocf_log info "updated certificate hash: $current_hash" + ;; + "check") + if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then + ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE" + # This should not happen but if for some reason we can not read the stored hash, + # use the current hash and log the error but allow etcd to run as long as possible. + stored_hash="$current_hash" + fi + if [ "$current_hash" != "$stored_hash" ]; then + ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)" + return $OCF_ERR_GENERIC + fi + ;; + *) + ocf_log err "unsupported action: $action" + return $OCF_ERR_GENERIC + ;; + esac + + return $OCF_SUCCESS +} + +monitor_cmd_exec() +{ + local rc=$OCF_SUCCESS + local out + + out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? 
+ # 125: no container with name or ID ${CONTAINER} found + # 126: container state improper (not running) + # 127: any other error + # 255: podman 2+: container not running + case "$rc" in + 125|126|255) + rc=$OCF_NOT_RUNNING + ;; + 0) + ocf_log debug "monitor cmd passed: exit code = $rc" + ;; + *) + ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" + rc=$OCF_ERR_GENERIC + ;; + esac + + return $rc +} + +container_exists() +{ + local rc + local out + + out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? + # 125: no container with name or ID ${CONTAINER} found + if [ $rc -ne 125 ]; then + return 0 + fi + return 1 +} + +# archive_current_container archives the current +# podman etcd container and its configuration files. +archive_current_container() +{ + # don't attempt to archive a container that doesn't exist + if ! container_exists; then + return + fi + + # delete any container named "*-previous", or we won't be able to archive the current container. + if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then + ocf_log info "removing old archived container '$CONTAINER-previous'" + if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then + ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container" + return + fi + fi + + ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes" + if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then + ocf_log err "could not archive container '$CONTAINER', error code: $?" + return + fi + + # archive corresponding etcd configuration files + local files_to_archive="" + for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do + if [ -f "$file" ]; then + files_to_archive="$files_to_archive $file" + else + ocf_log warn "file '$file' is missing and won't be archived" + fi + done + + if [ -z "$files_to_archive" ]; then + ocf_log warn "could not find any file to archive." + return + fi + + # NOTE: tar will override any existing archive as wanted + # shellcheck disable=SC2086 + if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then + ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files" + else + ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'" + fi +} + +# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address. +ip_url() { + local ip_addr=$1 + local value + if echo "$ip_addr" | grep -q ":" ; then + value="[$ip_addr]" + else + value="$ip_addr" + fi + echo "https://$value" +} + +attribute_node_ip() +{ + local action="$1" + local attribute="node_ip" + local ip_addr name + + # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) + # ignore other nodes + if [ "$name" != "$NODENAME" ]; then + continue + fi + ip_addr=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 + done + + if [ -z "$ip_addr" ]; then + ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'" + return 1 + fi + + case "$action" in + get) + echo "$ip_addr" + ;; + update) + if ! 
crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then + rc="$?" + ocf_log err "could not set $attribute to $ip_addr, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for $attribute" + return $OCF_ERR_GENERIC + ;; + esac +} + +attribute_node_ip_peer() { + local peer_name + peer_name=$(get_peer_node_name) + crm_attribute --query --name "node_ip" --node "$peer_name" | awk -F"value=" '{print $2}' +} + +get_env_from_manifest() { + local env_var_name="$1" + local env_var_value + + # The agent waits for the manifest to exist before starting, so the + # file should exist already, but this check is included for robustness. + if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log err "external etcd pod manifest ($OCF_RESKEY_pod_manifest) not found" + exit "$OCF_ERR_INSTALLED" + fi + + if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then + rc=$? + ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc" + exit "$OCF_ERR_INSTALLED" + fi + + ocf_log debug "ETCD pod environment variable $env_var_name: $env_var_value" + + echo "$env_var_value" +} + +# etcd configuration file expects duration to be expressed in nanoseconds +convert_duration_in_nanoseconds() { + local duration=$1 + local value unit nanoseconds + + if [ -z "$duration" ]; then + ocf_log err "convert_duration_in_nanoseconds: no duration provided" + return 1 + fi + + if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then + ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: where unit is one of s, ms, us, µs, ns" + return 1 + fi + + # Extract numeric value and unit from duration string + value=$(echo "$duration" | sed 's/[^0-9]*$//') + unit=$(echo "$duration" | sed 's/^[0-9]*//') + + case "$unit" in + ns) + nanoseconds=$value + ;; + us|µs) + nanoseconds=$((value * 1000)) + ;; + ms) + nanoseconds=$((value * 1000000)) + ;; + s) + nanoseconds=$((value * 1000000000)) + ;; + *) + # this should not happen as the input is already validated + ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\"" + return 1 + ;; + esac + + echo "$nanoseconds" +} + +prepare_env() { + local name ip ipurl standalone_node + + NODEIP="$(attribute_node_ip get)" + NODEIPURL=$(ip_url $NODEIP) + + if is_force_new_cluster; then + ALL_ETCD_ENDPOINTS="$NODEIPURL:2379" + ETCD_INITIAL_CLUSTER_STATE="new" + ETCD_INITIAL_CLUSTER="$NODENAME=$NODEIPURL:2380" + else + ETCD_INITIAL_CLUSTER_STATE="existing" + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) + ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 + ipurl="$(ip_url $ip)" + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + + [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379" + [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=$ipurl:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=$ipurl:2380" + done + fi + + ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API") + ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES") + ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR") + if [ ! 
-d "$ETCD_DATA_DIR" ]; then + ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\"" + return "$OCF_ERR_ARGS" + else + ocf_log info "using data-dir: $ETCD_DATA_DIR" + fi + ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") + ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") + ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") + ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") + ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL") + ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES") + ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS") + + SERVER_CACERT=$(get_env_from_manifest "ETCDCTL_CACERT") + ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT") + ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY") + + LISTEN_CLIENT_URLS="0.0.0.0" + LISTEN_PEER_URLS="0.0.0.0" + LISTEN_METRICS_URLS="0.0.0.0" +} + + +generate_etcd_configuration() { + if is_force_new_cluster; then + # The embedded newline is required for correct YAML formatting. + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true +force-new-cluster-bump-amount: 1000000000" + else + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false" + fi + + cat > "$ETCD_CONFIGURATION_FILE" << EOF +logger: zap +log-level: info +snapshot-count: 10000 +name: $NODENAME +data-dir: $ETCD_DATA_DIR +$FORCE_NEW_CLUSTER_CONFIG +socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS +election-timeout: $ETCD_ELECTION_TIMEOUT +enable-pprof: $ETCD_ENABLE_PPROF +heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL +quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES +initial-advertise-peer-urls: "$NODEIPURL:2380" +listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380" +listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0" +initial-cluster: $ETCD_INITIAL_CLUSTER +initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE +client-transport-security: + cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt + key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key + client-cert-auth: true + trusted-ca-file: $SERVER_CACERT +peer-transport-security: + cert-file: $ETCD_PEER_CERT + key-file: $ETCD_PEER_KEY + client-cert-auth: true + trusted-ca-file: $SERVER_CACERT +advertise-client-urls: "$NODEIPURL:2379" +listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978" +metrics: extensive +experimental-initial-corrupt-check: true +experimental-max-learners: 1 +experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") +experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") +EOF + + { + if [ -n "$ETCD_CIPHER_SUITES" ]; then + echo "cipher-suites:" + echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do + echo " - \"$cipher\"" + done + fi + } >> "$ETCD_CONFIGURATION_FILE" +} + +archive_data_folder() +{ + # TODO: use etcd snapshots + local dest_dir_name + local data_dir="/var/lib/etcd/member" + + dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)" + if [ ! 
-d $data_dir ]; then + ocf_log info "no data dir to backup" + return $OCF_SUCCESS + fi + ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name" + mv "$data_dir" "$HA_RSCTMP/$dest_dir_name" + sync +} + +etcd_pod_container_exists() { + local count_matches + # Check whether the etcd pod exists on the same node (header line included) + count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) + if [ "$count_matches" -eq 1 ]; then + # etcd pod found + return 0 + fi + # etcd pod not found + return 1 +} + +attribute_node_cluster_id() +{ + local action="$1" + local value + if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then + rc=$? + ocf_log err "could not get cluster_id, error code: $rc" + return "$rc" + fi + + case "$action" in + get) + echo "$value" + ;; + update) + if ! crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then + rc=$? + ocf_log err "could not update cluster_id, error code: $rc" + return "$rc" + fi + ;; + *) + ocf_log err "unsupported $action for attribute_node_cluster_id" + return $OCF_ERR_GENERIC + ;; + esac +} + +attribute_node_cluster_id_peer() +{ + local nodename + + nodename=$(get_peer_node_name) + crm_attribute --query --type nodes --node "$nodename" --name "cluster_id" | awk -F"value=" '{print $2}' +} + +attribute_node_revision() +{ + local action="$1" + local value + local attribute="revision" + + if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then + rc=$? + ocf_log err "could not get $attribute, error code: $rc" + return "$rc" + fi + + case "$action" in + get) + echo "$value" + ;; + update) + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then + rc=$? + ocf_log err "could not update etcd $revision, error code: $rc" + return "$rc" + fi + ;; + *) + ocf_log err "unsupported $action for attribute_node_revision" + return "$OCF_ERR_GENERIC" + ;; + esac +} + +attribute_node_revision_peer() +{ + local nodename + nodename=$(get_peer_node_name) + crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}' +} + +# Converts a decimal number to hexadecimal format with validation +# Args: $1 - decimal number (test for non-negative integer too) +# Returns: 0 on success, OCF_ERR_GENERIC on invalid input +# Outputs: hexadecimal representation to stdout +decimal_to_hex() { + local dec=$1 + + if ! echo "$dec" | grep -q "^[1-9][0-9]*$"; then + ocf_log err "Invalid member ID format: '$dec' (expected decimal number)" + return $OCF_ERR_GENERIC + fi + + printf "%x" "$dec" + return $OCF_SUCCESS +} + +attribute_node_member_id() +{ + local action="$1" + local attribute="member_id" + + if ! container_exists; then + # we need a running container to execute etcdctl. + return 0 + fi + + case "$action" in + get) + # When we need this value at the agent startup we don't have a etcd + # container running, so we always get this value from CIB + crm_attribute --query --type nodes --node "$NODENAME" --name "$attribute" | awk -F"value=" '{print $2}' + ;; + update) + local member_list_json + member_list_json=$(get_member_list_json) + ocf_log info "member list: $member_list_json" + if [ -z "$member_list_json" ] ; then + ocf_log err "could not get $attribute: could not get member list JSON" + return "$rc" + fi + + local value value_hex + if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then + rc=$? 
+ ocf_log err "could not get $attribute from member list JSON, error code: $rc" + return "$rc" + fi + + # JSON member_id is decimal, while etcdctl command needs the hex version + if ! value_hex=$(decimal_to_hex "$value"); then + ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?" + return $OCF_ERR_GENERIC + fi + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then + rc=$? + ocf_log err "could not update etcd $attribute, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for attribute_node_member_id" + return "$OCF_ERR_GENERIC" + ;; + esac +} + +add_member_as_learner() +{ + local rc + local member_name=$1 + local member_ip=$2 + local endpoint_url=$(ip_url $(attribute_node_ip get)) + local peer_url=$(ip_url $member_ip) + + ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner" + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not add $member_name as learner, error code: $rc" + return $rc + fi + ocf_log info "$out" + + attribute_learner_node update "$member_name" + return $? +} + +set_force_new_cluster() +{ + local rc + crm_attribute --lifetime reboot --node "$NODENAME" --name "force_new_cluster" --update "$NODENAME" + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not set force_new_cluster attribute to $NODENAME" + fi + return $rc +} + +# get_force_new_cluster returns a space-separated list of nodes that have the force_new_cluster attribute set. +# Return values: +# - Exit code 0 with non-empty output: One or more nodes have the force_new_cluster attribute set +# - Exit code 0 with empty output: No nodes have the force_new_cluster attribute set +# - Exit code 1 with empty output: Error occurred while querying the cluster nodes +get_force_new_cluster() +{ + local node nodes value + local holders="" + + if ! nodes=$(crm_node -l | awk '{print $2}'); then + ocf_log err "could not get force_new_cluster attribute, crm_node error code: $?" + return 1 + fi + if [ -z "$nodes" ]; then + ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty" + return 1 + fi + + for node in $nodes; do + if ! value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null | awk -F'value=' '{print $2}' | tr -d "'"); then + ocf_log err "could not get force_new_cluster attribute, crm_attribut error code: $?" + return 1 + fi + if [ -n "$value" ]; then + holders="$holders$node " + fi + done + echo "$holders" +} + + +clear_force_new_cluster() +{ + # only the holder of "force_new_cluster" attribute can delete it + if ! is_force_new_cluster; then + ocf_log info "force_new_cluster unset or not owned by $NODENAME" + return $OCF_SUCCESS + fi + + if ! crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"; then + ocf_log err "could not clear force_new_cluster attribute, error code: $?" + return $OCF_ERR_GENERIC + fi + + ocf_log info "$NODENAME: force_new_cluster attribute cleared" + return $OCF_SUCCESS +} + + +is_force_new_cluster() +{ + # Return 0 if 'force_new_cluster' is set on the current node, 1 otherwise. + local fnc_holders + + if ! 
fnc_holders=$(get_force_new_cluster); then + ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders" + exit $OCF_ERR_GENERIC + fi + + if echo "$fnc_holders" | grep -q -w "$NODENAME"; then + ocf_log debug "$NODENAME has force_new_cluster set" + return 0 + fi + + ocf_log debug "force_new_cluster attribute is not set on $NODENAME" + return 1 +} + +is_standalone() +{ + local standalone_node + + standalone_node=$(get_standalone_node) + if [ -z "$standalone_node" ]; then + ocf_log debug "no node running standalone" + return 1 + fi + + if [ "$NODENAME" = "$standalone_node" ]; then + ocf_log debug "$NODENAME is set as standalone" + return 0 + fi + ocf_log debug "$NODENAME is set as learner" + return 1 + +} + +set_standalone_node() +{ + local rc + + ocf_log info "add $NODENAME as standalone" + crm_attribute --name "standalone_node" --update "$NODENAME" + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not set standalone_node attribute to $NODENAME" + fi + return $rc +} + +get_standalone_node() +{ + crm_attribute --query --name "standalone_node" | awk -F"value=" '{print $2}' +} + +clear_standalone_node() +{ + crm_attribute --name "standalone_node" --delete +} + + +# Promotes an etcd learner member to a voting member +# Args: $1 - learner member ID in decimal format +# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors +# Note: Promotion failures are expected and logged as info (peer may not be up-to-date) +promote_learner_member() +{ + local learner_member_id=$1 + + # JSON member_id is decimal, while etcdctl command needs the hex version + if ! learner_member_id_hex=$(decimal_to_hex "$learner_member_id"); then + ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $?" + return $OCF_ERR_GENERIC + fi + if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then + # promotion is expected to fail if the peer is not yet up-to-date + ocf_log info "could not promote member $learner_member_id_hex, error code: $?" + return $OCF_SUCCESS + fi + ocf_log info "successfully promoted member '$learner_member_id_hex'" + return $OCF_SUCCESS +} + +# Reconciles etcd cluster member states +# Promotes learner members or clears standalone/learner attributes as needed +# Args: $1 - member list JSON from etcdctl +# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors +# Note: Only operates when exactly 2 started members are present +reconcile_member_state() +{ + local rc + local member_list_json="$1" + + # count only the started members, which have the ".name" JSON field + number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l) + if [ "$number_of_started_members" -ne 2 ]; then + ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2" + return $OCF_SUCCESS + fi + + learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not get isLearner field from member list, error code: $rc" + return $rc + fi + + if [ -n "$learner_member_id" ]; then + promote_learner_member "$learner_member_id" + return $? + fi + + if [ -z "$learner_member_id" ]; then + if ! clear_standalone_node; then + ocf_log error "could not clear standalone_node attribute, error code: $?" + return $OCF_ERR_GENERIC + fi + if ! 
attribute_learner_node clear; then + ocf_log error "could not clear learner_node attribute, error code: $?" + return $OCF_ERR_GENERIC + fi + fi + + return $OCF_SUCCESS +} + +attribute_learner_node() +{ + local action="$1" + local value="$2" + local attribute="learner_node" + + case "$action" in + get) + crm_attribute --query --name "$attribute" | awk -F"value=" '{print $2}' + ;; + update) + if ! crm_attribute --name "$attribute" --update "$value"; then + rc="$?" + ocf_log err "could not set $attribute to $value, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for $attribute" + return $OCF_ERR_GENERIC + ;; + esac +} + +is_learner() +{ + if [ "$NODENAME" = "$(attribute_learner_node get)" ]; then + return 0 + fi + return 1 +} + +get_peer_node_name() { + crm_node -l | awk '{print $2}' | grep -v "$NODENAME" +} + +get_all_etcd_endpoints() { + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) + ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 + ipurl="$(ip_url $ip)" + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + + [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379" + done + echo "$ALL_ETCD_ENDPOINTS" +} + +get_endpoint_status_json() +{ + # Get the status of all endpoints + local all_etcd_endpoints + + all_etcd_endpoints=$(get_all_etcd_endpoints) + podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json +} + +get_member_list_json() { + # Get the list of members visible to the current node + local this_node_endpoint + + this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379" + podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json +} + +detect_cluster_leadership_loss() +{ + endpoint_status_json=$(get_endpoint_status_json) + ocf_log info "endpoint status: $endpoint_status_json" + + count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) + if [ "$count_endpoints" -eq 1 ]; then + ocf_log info "one endpoint only: checking status errors" + endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") + if echo "$endpoint_status_errors" | grep -q "no leader"; then + set_force_new_cluster + set_standalone_node + ocf_exit_reason "$NODENAME must force a new cluster" + return $OCF_ERR_GENERIC + fi + if [ "$endpoint_status_errors" != "null" ]; then + ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" + fi + fi + + return $OCF_SUCCESS +} + + +# Manages etcd peer membership by detecting and handling missing or rejoining peers +# Adds missing peers as learners and reconciles member states when peers rejoin +# Args: $1 - member list JSON from etcdctl +# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors +# Note: Iterates through all peer nodes to ensure proper cluster membership +manage_peer_membership() +{ + local member_list_json="$1" + + # Example of .members[] instance fields in member list json format: + # NOTE that "name" is present in voting members only, while "isLearner" in learner members only + # and the value is always true (not a string) in that case. 
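+    # A learner entry therefore has "isLearner": true and no "name" field, which is
+    # why the loop below matches peers by IP in peerURLs rather than by member name.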
+ # { + # "ID": , + # "name": "", + # "peerURLs": [ + # "https://:2380" + # ], + # "clientURLs": [ + # "https://:2379" + # ] + # } + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) + # do not check itself + if [ "$name" = "$NODENAME" ]; then + continue + fi + + # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. + ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 + peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") + if [ -z "$peer_member_id" ]; then + ocf_log info "$name is not in the members list" + add_member_as_learner "$name" "$ip" + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" + reconcile_member_state "$member_list_json" + fi + done +} + +check_peer() +{ + # Check peers endpoint status and locally accessible member list + local member_list_json + + # we need a running container to execute etcdctl. + if ! container_exists; then + return $OCF_SUCCESS + fi + + if ! member_list_json=$(get_member_list_json); then + ocf_log info "podman failed to get member list, error code: $?" + detect_cluster_leadership_loss + return $? + fi + + manage_peer_membership "$member_list_json" + return $OCF_SUCCESS +} + +podman_simple_status() +{ + local rc + + # simple status is implemented via podman exec + # everything besides success is considered "not running" + monitor_cmd_exec + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + rc=$OCF_NOT_RUNNING; + fi + return $rc +} + +podman_monitor() +{ + # We rely on running podman exec to monitor the container + # state because that command seems to be less prone to + # performance issue under IO load. + # + # For probes to work, we expect cmd_exec to be able to report + # when a container is not running. Here, we're not interested + # in distinguishing whether it's stopped or non existing + # (there's function container_exists for that) + monitor_cmd_exec + rc=$? + if [ $rc -ne 0 ]; then + return $rc + fi + + # Check if certificate files have changed, if they have, etcd needs to be restarted + if ! etcd_certificates_hash_manager "check"; then + return $OCF_ERR_GENERIC + fi + + if is_learner; then + ocf_log info "$NODENAME is learner. Cannot get member id" + return "$OCF_SUCCESS" + fi + # Failing to cache data and check member list should not cause the + # monitor operation to fail. + # TODO: move this inside check_peers where we already query member list json + attribute_node_member_id update + if ! check_peer; then + return $OCF_ERR_GENERIC + fi + + # node revision comes from the disk, so if it is not available is a fatal failure + attribute_node_revision update + return $? +} + +podman_create_mounts() { + oldIFS="$IFS" + IFS="," + for directory in $OCF_RESKEY_mount_points; do + mkdir -p "$directory" + done + IFS="$oldIFS" +} + +podman_container_id() +{ + # Retrieve the container ID by doing a "podman ps" rather than + # a "podman inspect", because the latter has performance issues + # under IO load. + # We could have run "podman start $CONTAINER" to get the ID back + # but if the container is stopped, the command will return a + # name instead of a container ID. This would break us. 
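+    # Example (hypothetical) output line of the command below: "<full-64-char-id> etcd";
+    # grep -F -w -m1 keeps the first exact name match and cut returns only the ID field.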
+ podman ps --no-trunc --format '{{.ID}} {{.Names}}' | grep -F -w -m1 "$CONTAINER" | cut -d' ' -f1 +} + + +create_transient_drop_in_dependency() +{ + local cid=$1 + local rc=$OCF_SUCCESS + + if [ -z "$cid" ]; then + ocf_exit_reason "Container ID not found for \"$CONTAINER\". Not creating drop-in dependency" + return $OCF_ERR_GENERIC + fi + + ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)" + for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do + if [ $rc -eq $OCF_SUCCESS ] && [ ! -d /run/systemd/transient/"$scope" ]; then + mkdir -p /run/systemd/transient/"$scope" && \ + printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \ + chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf + rc=$? + fi + done + + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "Could not create drop-in dependency for \"$CONTAINER\" ($cid)" + else + systemctl daemon-reload + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "Could not refresh service definition after creating drop-in for \"$CONTAINER\"" + fi + fi + + return $rc +} + + +run_new_container() +{ + local opts=$1 + local image=$2 + local cmd=$3 + local rc + + ocf_log info "running container $CONTAINER for the first time" + out=$(podman run $opts $image $cmd 2>&1) + rc=$? + + if [ -n "$out" ]; then + out="$(echo "$out" | tr -s ' \t\r\n' ' ')" + if [ $rc -eq 0 ]; then + ocf_log info "$out" + else + ocf_log err "$out" + fi + fi + + if [ $rc -eq 125 ]; then + # If an internal podman error occurred, it might be because + # the internal storage layer still references an old container + # with the same name, even though podman itself thinks there + # is no such container. If so, purge the storage layer to try + # to clean the corruption and try again. + if echo "$out" | grep -q "unknown.*flag"; then + ocf_exit_reason "$out" + return $rc + fi + + ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying." + ocf_run podman rm --storage "$CONTAINER" + ocf_run podman run $opts $image $cmd + rc=$? + elif [ $rc -eq 127 ]; then + # rhbz#1972209: podman 3.0.x seems to be hit by a race + # where the cgroup is not yet set up properly when the OCI + # runtime configures the container. If that happens, recreate + # the container as long as we get the same error code or + # until start timeout preempts us. + while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do + ocf_log warn "Internal podman error while assigning cgroup. Retrying." + # Arbitrary sleep to prevent consuming all CPU while looping + sleep 1 + podman rm -f "$CONTAINER" + out=$(podman run $opts $image $cmd 2>&1) + rc=$? + done + # Log the created container ID if it succeeded + if [ $rc -eq 0 ]; then + ocf_log info "$out" + fi + fi + + return $rc +} + +compare_revision() +{ + # Compare local revision (from disk) against peer revision (from CIB). 
+ # returns "older", "equal" or "newer" + local revision + local peer_node_name + local peer_revision + + revision=$(attribute_node_revision get) + peer_revision=$(attribute_node_revision_peer) + + if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then + ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'" + return "$OCF_ERR_GENERIC" + fi + + if [ "$revision" -gt "$peer_revision" ]; then + ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'" + echo "newer" + elif [ "$revision" -eq "$peer_revision" ]; then + ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'" + echo "equal" + else + ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'" + echo "older" + fi + return "$OCF_SUCCESS" +} + +ensure_pod_manifest_exists() +{ + local wait_timeout_sec=$((10 * 60)) + local poll_interval_sec=5 + local poll_retries=$((wait_timeout_sec/poll_interval_sec)) + + for try in $(seq "$poll_retries"); do + if [ -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log info "pod manifest ($OCF_RESKEY_pod_manifest) found" + break + fi + ocf_log debug "pod manifest ($OCF_RESKEY_pod_manifest) does not exist yet: retry in $poll_interval_sec seconds." + sleep "$poll_interval_sec" + done + + if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log err "pod manifest ($OCF_RESKEY_pod_manifest) still missing after $wait_timeout_sec seconds." + return "$OCF_ERR_CONFIGURED" + fi + + return "$OCF_SUCCESS" +} + +filter_pod_manifest() { + # Remove pod-version related fields from POD manifest + local pod_manifest="$1" + local temporary_file + local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))' + + if ! temporary_file=$(mktemp); then + ocf_log err "could not create temporary file for '$pod_manifest', error code: $?" + return $OCF_ERR_GENERIC + fi + if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then + ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?" + return $OCF_ERR_GENERIC + fi + echo "$temporary_file" +} + +can_reuse_container() { + # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes. + # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed. + local cp_rc + local diff_rc + local filtered_original_pod_manifest + local filtered_copy_pod_manifest + + + # If the container does not exist it cannot be reused + if ! container_exists; then + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi + + # If the manifest copy doesn't exist, we need a new container. + if [ ! -f "$POD_MANIFEST_COPY" ]; then + ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created." + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi + + if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then + return $OCF_ERR_GENERIC + fi + if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then + return $OCF_ERR_GENERIC + fi + + ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY" + ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" + diff_rc="$?" 
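+    # diff exit status: 0 = manifests identical, 1 = manifests differ, >1 = comparison error;
+    # the case statement below maps these to reuse, recreate and hard failure respectively.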
+ # clean up temporary files + rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" + case "$diff_rc" in + 0) + ocf_log info "Reusing the existing etcd container" + OCF_RESKEY_reuse=1 + ;; + 1) + ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes" + if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then + cp_rc="$?" + ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" + return "$OCF_ERR_GENERIC" + fi + ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created" + OCF_RESKEY_reuse=0 + ;; + *) + ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc" + return "$OCF_ERR_GENERIC" + ;; + esac + + return "$OCF_SUCCESS" +} + +ensure_pod_manifest_copy_exists() { + local cp_rc + + if [ -f "$POD_MANIFEST_COPY" ]; then + return "$OCF_SUCCESS" + fi + + # If the manifest copy doesn't exist, create it and ensure a new container. + if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then + cp_rc="$?" + ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" + return "$OCF_ERR_GENERIC" + fi + + ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created" + + return "$OCF_SUCCESS" +} + +podman_start() +{ + local cid + local rc + local etcd_pod_wait_timeout_sec=$((10 * 60)) + local etcd_pod_poll_interval_sec=10 + local etcd_pod_poll_retries=$((etcd_pod_wait_timeout_sec/etcd_pod_poll_interval_sec)) + local pod_was_running=false + + ocf_log notice "podman-etcd start" + attribute_node_ip update + attribute_node_cluster_id update + attribute_node_revision update + + # ensure the etcd pod is not running before starting the container + ocf_log info "ensure etcd pod is not running (retries: $etcd_pod_poll_retries, interval: $etcd_pod_poll_interval_sec)" + for try in $(seq $etcd_pod_poll_retries); do + if ! etcd_pod_container_exists; then + break + fi + ocf_log info "etcd pod running: retry in $etcd_pod_poll_interval_sec seconds." + pod_was_running=true + sleep $etcd_pod_poll_interval_sec + done + if etcd_pod_container_exists; then + ocf_exit_reason "etcd pod is still running after $etcd_pod_wait_timeout_sec seconds." + return $OCF_ERR_GENERIC + fi + + # Update the certificate hash after the container has started successfully + # this is to ensure that the certificate hash is updated after a restart is initiated + # by a cert rotation event from the monitor command. + if ! etcd_certificates_hash_manager "update"; then + ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash" + return $OCF_ERR_GENERIC + fi + + # check if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log info "the '$CONTAINER' has already started. Nothing to do" + return "$OCF_SUCCESS" + fi + + if ! ensure_pod_manifest_exists; then + ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)" + return "$OCF_ERR_GENERIC" + fi + + if ocf_is_true "$pod_was_running"; then + ocf_log info "static pod was running: start normally" + else + local fnc_holders + if ! 
fnc_holders=$(get_force_new_cluster); then + ocf_exit_reason "Failed to get force_new_cluster node holders" + return "$OCF_ERR_GENERIC" + fi + + local fnc_holder_count + fnc_holder_count=$(echo "$fnc_holders" | wc -w) + if [ "$fnc_holder_count" -gt 1 ]; then + ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)" + return "$OCF_ERR_GENERIC" + fi + + if [ "$fnc_holder_count" -eq 1 ]; then + if echo "$fnc_holders" | grep -q -w "$NODENAME"; then + # Attribute is set on the local node. + ocf_log notice "$NODENAME marked to force-new-cluster" + JOIN_AS_LEARNER=false + else + # Attribute is set on a peer node. + ocf_log info "$NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders" + JOIN_AS_LEARNER=true + fi + else + ocf_log info "no node is marked to force-new-cluster" + # When the local agent starts, we can infer the cluster state by counting + # how many agents are starting or already active: + # - 1 active agent: it's the peer (we are just starting) + # - 0 active agents, 1 starting: we are starting; the peer is not starting + # - 0 active agents, 2 starting: both agents are starting simultaneously + local active_resources_count + active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) + ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')" + case "$active_resources_count" in + 1) + if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then + ocf_log info "peer active but in learner mode: start normally" + else + ocf_log info "peer is active standalone: joining as learner" + JOIN_AS_LEARNER=true + fi + ;; + 0) + # count how many agents are starting now + local start_resources_count + start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) + ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')" + + # we need to compare the revisions in any of the following branches + # so call the function only once here + if ! revision_compare_result=$(compare_revision); then + ocf_log err "could not compare revisions, error code: $?" + return "$OCF_ERR_GENERIC" + fi + case "$start_resources_count" in + 1) + ocf_log debug "peer not starting: ensure we can start a new cluster" + if [ "$revision_compare_result" != "older" ]; then + # If our revision is the same as or newer than the peer's last saved + # revision, and the peer agent isn't currently starting, we can + # restore e-quorum by forcing a new cluster. + set_force_new_cluster + else + ocf_log err "local revision is older and peer is not starting: cannot start" + ocf_exit_reason "local revision is older and peer is not starting: cannot start" + return "$OCF_ERR_GENERIC" + fi + ;; + 2) + # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes? 
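+    # Both agents are starting at the same time; decide from the saved revisions:
+    #   newer -> this node forces a new cluster
+    #   older -> this node joins as a learner
+    #   equal -> start normally only if both nodes also report the same cluster_id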
+ ocf_log info "peer starting" + if [ "$revision_compare_result" = "newer" ]; then + set_force_new_cluster + elif [ "$revision_compare_result" = "older" ]; then + ocf_log info "$NODENAME shall join as learner" + JOIN_AS_LEARNER=true + else + if [ "$(attribute_node_cluster_id get)" = "$(attribute_node_cluster_id_peer)" ]; then + ocf_log info "same cluster_id and revision: start normal" + else + ocf_exit_reason "same revision but different cluster id" + return "$OCF_ERR_GENERIC" + fi + fi + ;; + *) + ocf_log err "Unexpected start resource count: $start_resources_count" + podman_notify + return "$OCF_ERR_GENERIC" + ;; + esac + ;; + *) + ocf_log err "Unexpected active resource count: $active_resources_count" + podman_notify + return "$OCF_ERR_GENERIC" + ;; + esac + fi + fi + + podman_create_mounts + local run_opts="--detach --name=${CONTAINER} --replace" + + run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + + # check to see if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + return "$OCF_SUCCESS" + fi + + if ocf_is_true "$JOIN_AS_LEARNER"; then + local wait_timeout_sec=$((10*60)) + local poll_interval_sec=5 + local retries=$(( wait_timeout_sec / poll_interval_sec )) + + ocf_log info "ensure the leader node added $NODENAME as learner member before continuing (timeout: $wait_timeout_sec seconds)" + for try in $(seq $retries); do + learner_node=$(attribute_learner_node get) + if [ "$NODENAME" != "$learner_node" ]; then + ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds." + sleep $poll_interval_sec + continue + fi + ocf_log info "learner node $learner_node in the member list" + break + done + if [ "$NODENAME" != "$(attribute_learner_node get)" ]; then + ocf_log err "wait for $NODENAME to be in the member list timed out" + return "$OCF_ERR_GENERIC" + fi + + archive_data_folder + fi + + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" + if ! can_reuse_container ; then + rc="$?" + ocf_log err "could not determine etcd container reuse strategy, rc: $rc" + return "$rc" + fi + + # Archive current container and its configuration before creating + # new configuration files. + if ! ocf_is_true "$OCF_RESKEY_reuse"; then + # Log archive container failures but don't block, as the priority + # is ensuring the etcd container starts successfully. + archive_current_container + fi + + if ! ensure_pod_manifest_copy_exists; then + return $OCF_ERR_GENERIC + fi + + if ! prepare_env; then + ocf_log err "Could not prepare environment for podman, error code: $?" + return $OCF_ERR_GENERIC + fi + + if ! generate_etcd_configuration; then + ocf_log err "Could not generate etcd configuration, error code: $?" 
+ return $OCF_ERR_GENERIC + fi + + run_opts="$run_opts \ + --network=host \ + -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ + -v /var/lib/etcd:/var/lib/etcd \ + --env ETCDCTL_API=$ETCDCTL_API \ + --env ETCDCTL_CACERT=$SERVER_CACERT \ + --env ETCDCTL_CERT=$ETCD_PEER_CERT \ + --env ETCDCTL_KEY=$ETCD_PEER_KEY \ + --authfile=$OCF_RESKEY_authfile \ + --security-opt label=disable" + if [ -n "$OCF_RESKEY_run_opts" ]; then + run_opts="$run_opts $OCF_RESKEY_run_opts" + fi + + if [ -f "$ETCD_CONFIGURATION_FILE" ]; then + ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE" + else + ocf_log err "could not find $ETCD_CONFIGURATION_FILE" + return "$OCF_ERR_GENERIC" + fi + + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE" + if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" + fi + + if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then + # no container image provided via input parameters. Read it from the pod manifest. + OCF_RESKEY_image=$(jq -r '.spec.containers[] | select( .name=="etcd").image' "$OCF_RESKEY_pod_manifest") + ocf_log info "using container image ($OCF_RESKEY_image) from Pod manifest ($OCF_RESKEY_pod_manifest)" + else + # use the container image provided as input parameter + ocf_log info "using container image ($OCF_RESKEY_image) via input parameters" + fi + + if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then + ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}" + if ! podman pull --authfile="$OCF_RESKEY_authfile" "${OCF_RESKEY_image}"; then + ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}" + return $OCF_ERR_GENERIC + fi + else + ocf_log notice "Pull image not required, ${OCF_RESKEY_image}" + fi + + if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then + ocf_log info "starting existing container $CONTAINER." + ocf_run podman start "$CONTAINER" + else + ocf_log info "starting new container $CONTAINER." + run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd" + if [ $? -eq 125 ]; then + return $OCF_ERR_GENERIC + fi + fi + rc=$? + + # if the container was stopped or didn't exist before, systemd + # removed the libpod* scopes. So always try to recreate the drop-ins + if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then + cid=$(podman_container_id) + create_transient_drop_in_dependency "$cid" + rc=$? + fi + + if [ $rc -ne 0 ]; then + ocf_exit_reason "podman failed to launch container (error code: $rc)" + return $OCF_ERR_GENERIC + fi + + # wait for monitor to pass before declaring that the container is started + while true; do + podman_simple_status + if [ $? -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Newly created podman container exited after start" + ocf_run podman logs --tail 20 "${CONTAINER}" + return $OCF_ERR_GENERIC + fi + + monitor_cmd_exec + if [ $? 
-eq $OCF_SUCCESS ]; then + ocf_log notice "Container $CONTAINER started successfully" + if is_force_new_cluster; then + clear_force_new_cluster + + local peer_node_name + local peer_node_ip + peer_node_name="$(get_peer_node_name)" + peer_node_ip="$(attribute_node_ip_peer)" + if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then + add_member_as_learner "$peer_node_name" "$peer_node_ip" + else + ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})" + fi + fi + return $OCF_SUCCESS + fi + + ocf_exit_reason "waiting on monitor_cmd to pass after start" + sleep 1 + done +} + +podman_stop() +{ + local timeout=60 + local rc + + ocf_log notice "podman-etcd stop" + podman_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "could not leave members list: etcd container not running" + return $OCF_SUCCESS + fi + + attribute_node_revision update + attribute_node_cluster_id update + + if ! member_id=$(attribute_node_member_id get); then + ocf_log err "error leaving members list: could not get member-id" + else + # TODO: is it worth/possible to check the current status instead than relying on cached attributes? + if is_standalone; then + ocf_log info "last member. Not leaving the member list" + else + ocf_log info "leaving members list as member with ID $member_id" + endpoint="$(ip_url $(attribute_node_ip get)):2379" + if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then + rc=$? + ocf_log err "error leaving members list, error code: $rc" + fi + fi + fi + attribute_node_member_id clear + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + timeout=$(((OCF_RESKEY_CRM_meta_timeout/1000) -10 )) + if [ $timeout -lt 10 ]; then + timeout=10 + fi + fi + + if ocf_is_true "$OCF_RESKEY_force_kill"; then + ocf_run podman kill "$CONTAINER" + rc=$? + else + ocf_log info "waiting $timeout second[s] before killing container" + ocf_run podman stop -t="$timeout" "$CONTAINER" + rc=$? + # on stop, systemd will automatically delete any transient + # drop-in conf that has been created earlier + fi + + if [ $rc -ne 0 ]; then + # If the stop failed, it could be because the controlling conmon + # process died unexpectedly. If so, a generic error code is returned + # but the associated container exit code is -1. If that's the case, + # assume there's no failure and continue with the rm as usual. + if [ $rc -eq 125 ] && \ + podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' "$CONTAINER" | grep -Eq '^(exited|stopped):-1$'; then + ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway." + else + ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." + return $OCF_ERR_GENERIC + fi + fi + + return $OCF_SUCCESS +} + +image_exists() +{ + if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then + # the actual container image was not defined yet. Nor by + # the user via OCF_RESKEY, nor by reading the Pod manifest + return 0 + fi + if podman image exists "${OCF_RESKEY_image}"; then + # image found + return 0 + fi + + if ocf_is_true "$OCF_RESKEY_allow_pull"; then + REQUIRE_IMAGE_PULL=1 + ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start" + return 0 + fi + # image not found. 
+ return 1 +} + +podman_validate() +{ + check_binary curl + check_binary crictl + check_binary oc + check_binary podman + check_binary jq + check_binary tar + + if [ -z "$OCF_RESKEY_node_ip_map" ]; then + ocf_exit_reason "'node_ip_map' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_pod_manifest" ]; then + ocf_exit_reason "'pod_manifest' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_image" ]; then + ocf_exit_reason "'image' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if ! image_exists; then + ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found." + exit $OCF_ERR_CONFIGURED + fi + + if [ "$OCF_RESKEY_oom" -lt -1000 ] || [ "$OCF_RESKEY_oom" -gt 1000 ]; then + ocf_exit_reason "'oom' value ${OCF_RESKEY_oom} is out of range [-1000:1000]" + exit $OCF_ERR_CONFIGURED + fi + + if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \ + || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \ + || ! rm "$ETCD_CERTS_HASH_FILE"; then + ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE" + exit $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +podman_notify() +{ + ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }" +} + +# TODO : +# When a user starts plural clones in a node in globally-unique, a user cannot appoint plural name parameters. +# When a user appoints reuse, the resource agent cannot connect plural clones with a container. + +if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then + if [ -n "$OCF_RESKEY_name" ]; then + if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] + then + ocf_exit_reason "Cannot make plural clones from the same name parameter." + exit $OCF_ERR_CONFIGURED + fi + if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] + then + ocf_exit_reason "Cannot make plural master from the same name parameter." + exit $OCF_ERR_CONFIGURED + fi + fi + : ${OCF_RESKEY_name=$(echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-')} +else + : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} +fi + +CONTAINER=$OCF_RESKEY_name +POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" +ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" +ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" +ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + +# Note: we currently monitor podman containers by with the "podman exec" +# command, so make sure that invocation is always valid by enforcing the +# exec command to be non-empty +: ${OCF_RESKEY_monitor_cmd:=/bin/true} + +# When OCF_RESKEY_drop_in_dependency is not populated, we +# look at another file-based way of enabling the option. +# Otherwise, consider it disabled. 
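+# For example, creating an empty /etc/sysconfig/podman_drop_in (or
+# /etc/default/podman_drop_in) file turns the drop-in dependency on.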
+if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then + if [ -f "/etc/sysconfig/podman_drop_in" ] || \ + [ -f "/etc/default/podman_drop_in" ]; then + OCF_RESKEY_drop_in_dependency=yes + fi +fi + + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS;; +usage|help) podman_usage + exit $OCF_SUCCESS + ;; +esac + +NODENAME=$(ocf_local_nodename) +JOIN_AS_LEARNER=false + +case $__OCF_ACTION in +start) + podman_validate || exit $? + podman_start;; +stop) podman_stop;; +monitor) podman_monitor;; +notify) podman_notify;; +validate-all) podman_validate;; +*) podman_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc From 50b0da920ad4a110353356a17350b2dc112aaedc Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Mon, 27 Oct 2025 12:51:29 +0100 Subject: [PATCH 03/19] Initial project file --- .claude/commands/etcd/PROJECT.md | 286 +++++++++++++++++++++++++++++++ website | 1 + 2 files changed, 287 insertions(+) create mode 100644 .claude/commands/etcd/PROJECT.md create mode 160000 website diff --git a/.claude/commands/etcd/PROJECT.md b/.claude/commands/etcd/PROJECT.md new file mode 100644 index 0000000..8cab38e --- /dev/null +++ b/.claude/commands/etcd/PROJECT.md @@ -0,0 +1,286 @@ +# Etcd Troubleshooting Slash Command - Project File + +## Project Overview + +This project is for developing a Claude Code slash command that helps troubleshoot etcd issues on two-node with fencing OpenShift clusters. The command will leverage Ansible to access cluster VMs directly and analyze etcd health, pacemaker status, and relevant system logs to provide diagnostic insights and troubleshooting procedures. + +## Objectives + +1. Create a slash command that validates direct Ansible access to cluster VMs +2. Gather etcd and Pacemaker status using appropriate commands +3. Collect and analyze journalctl logs for both Pacemaker and etcd services +4. Analyze the collected data to identify common issues +5. Propose a structured troubleshooting procedure to the user +6. Provide actionable recommendations based on the analysis + +## Target Environment + +- **Deployment Type**: Two-Node with Fencing (TNF) OpenShift cluster +- **Topology**: Two control plane nodes with BMC-based fencing +- **Access Method**: Ansible via inventory.ini +- **Key Components**: + - Pacemaker (cluster resource management) + - Corosync (cluster communication) + - Etcd (running as Podman containers managed by Pacemaker) + - Fencing agents (BMC/RedFish based) + +## Documentation Resources + +### Primary References + +1. **Fencing Documentation**: `docs/fencing/README.md` + - Overview of Two-Node with Fencing architecture + - Etcd management by Pacemaker + - Disruption handling (graceful and ungraceful) + - Quorum management principles + +2. **Etcd Operations Guide**: `.claude/commands/etcd/etcd-ops-guide/` + - `clustering.md` - Cluster membership and operations + - `configuration.md` - Configuration parameters + - `container.md` - Container-specific operations + - `data_corruption.md` - Data corruption detection and recovery + - `failures.md` - Failure scenarios and handling + - `maintenance.md` - Maintenance procedures + - `monitoring.md` - Monitoring and metrics + - `recovery.md` - Recovery procedures + - `runtime-configuration.md` - Runtime configuration changes + - `runtime-reconf-design.md` - Reconfiguration design patterns + +3. 
**Pacemaker Documentation**: `.claude/commands/etcd/pacemaker/` + - `podman-etcd.sh` - The resource agent managing etcd containers + - `Pacemaker_Administration/` - Comprehensive Pacemaker administration docs + - `administrative.rst` - Administrative tasks + - `agents.rst` - Resource agents overview + - `alerts.rst` - Alert configuration + - `configuring.rst` - Cluster configuration + - `tools.rst` - Pacemaker command-line tools + - `troubleshooting.rst` - Pacemaker troubleshooting guide + - `moving.rst` - Resource movement and migration + - `options.rst` - Configuration options + +## Technical Approach + +### Phase 1: Validation + +**1.1 Ansible Access Validation:** +- Verify Ansible inventory exists at `deploy/openshift-clusters/inventory.ini` +- Test SSH connectivity to cluster nodes via Ansible ping module +- Validate required tools are available on cluster nodes (pcs, podman, journalctl, crm_attribute) + +**1.2 OpenShift Cluster Access Validation:** +- Attempt to run `oc version` to test direct cluster access +- If direct access fails, check for proxy configuration: + - Look for `deploy/openshift-clusters/proxy.env` file + - If `proxy.env` exists: source it before running `oc` commands + - If `proxy.env` doesn't exist: warn user that cluster access requires proxy setup +- Verify cluster access by running `oc get nodes` (with proxy if needed) +- Store proxy requirement status for subsequent OpenShift API calls + +**Proxy Handling Pattern:** +```bash +# Direct access attempt +oc version + +# If fails, try with proxy +if [ -f deploy/openshift-clusters/proxy.env ]; then + source deploy/openshift-clusters/proxy.env && oc version +else + echo "WARNING: No direct cluster access and proxy.env not found" +fi +``` + +All subsequent `oc` commands must follow the same pattern (source proxy.env if required). + +### Phase 2: Data Collection +Commands to execute via Ansible on cluster VMs. + +**Important**: All commands must be executed with sudo privileges (using Ansible's `become: yes`). + +**Pacemaker Status:** +```bash +sudo pcs status +sudo pcs resource status +sudo pcs constraint list +sudo crm_mon -1 +``` + +**Etcd Container Status:** +```bash +sudo podman ps -a --filter name=etcd +sudo podman inspect etcd +sudo podman logs --tail 100 etcd +``` + +**Etcd Cluster Health:** +```bash +sudo podman exec etcd etcdctl member list -w table +sudo podman exec etcd etcdctl endpoint health -w table +sudo podman exec etcd etcdctl endpoint status -w table +``` + +**System Logs:** +```bash +sudo journalctl -u pacemaker --since "1 hour ago" -n 200 +sudo journalctl -u corosync --since "1 hour ago" -n 100 +sudo journalctl --grep etcd --since "1 hour ago" -n 200 +``` + +**Cluster Attributes:** +```bash +sudo crm_attribute --query --name standalone_node +sudo crm_attribute --query --name learner_node +sudo crm_attribute --query --name force_new_cluster --lifetime reboot +``` + +**OpenShift Cluster Status** (requires proxy.env if configured): +```bash +# Node status +oc get nodes -o wide + +# Etcd operator status +oc get co etcd -o yaml + +# Etcd pods (should not exist in TNF, managed by Pacemaker) +oc get pods -n openshift-etcd + +# Control plane machine config status +oc get mcp master -o yaml + +# Check for degraded operators +oc get co --no-headers | grep -v "True.*False.*False" + +# Etcd-related events +oc get events -n openshift-etcd --sort-by='.lastTimestamp' | tail -50 +``` + +### Phase 3: Analysis +The command should analyze collected data for: + +1. 
**Cluster Quorum Issues**: + - Corosync quorum status + - Pacemaker partition state + - Node online/offline status + +2. **Etcd Health**: + - Member list consistency + - Leader election status + - Endpoint health + - Learner vs. voting member status + +3. **Resource State**: + - Etcd resource running status + - Failed actions in Pacemaker + - Resource constraints violations + +4. **Common Error Patterns**: + - Certificate expiration/rotation issues + - Network connectivity problems + - Split-brain scenarios + - Fencing failures + - Data corruption indicators + +5. **Cluster ID Mismatches**: + - Detect different cluster IDs between nodes + - Force-new-cluster flag status + +6. **OpenShift Integration Issues**: + - Etcd operator status and conditions + - Unexpected etcd pods running in openshift-etcd namespace (should not exist in TNF) + - Machine config pool degradation + - Cluster operator degradation related to etcd + +### Phase 4: Troubleshooting Procedure +Based on analysis, provide: + +1. **Diagnosis Summary**: Clear statement of identified issues +2. **Root Cause Analysis**: Likely causes based on symptoms +3. **Step-by-Step Remediation**: + - Ordered steps to resolve issues + - Commands to execute + - Expected outcomes at each step + - Rollback procedures if available +4. **Verification Steps**: How to confirm the issue is resolved +5. **Prevention Recommendations**: How to avoid recurrence + +## Key Etcd/Pacemaker Concepts + +### Cluster States +- **Standalone**: Single node running as "cluster-of-one" +- **Learner**: Node rejoining cluster, not yet voting member +- **Force-new-cluster**: Flag to bootstrap new cluster from single node + +### Critical Attributes (stored in CIB) +- `standalone_node` - Which node is running standalone +- `learner_node` - Which node is rejoining as learner +- `force_new_cluster` - Bootstrap flag (lifetime: reboot) +- `node_ip` - Node IP addresses +- `member_id` - Etcd member ID +- `cluster_id` - Etcd cluster ID +- `revision` - Etcd raft index + +### Pacemaker Resource Agent +The `podman-etcd.sh` agent manages: +- Container lifecycle (start/stop) +- Member join/leave operations +- Certificate rotation monitoring +- Cluster ID reconciliation +- Learner promotion to voting member + +### Failure Scenarios + +**Graceful Disruption** (4.19+): +- Pacemaker intercepts reboot +- Removes node from etcd cluster +- Cluster continues as single node +- Node resyncs and rejoins on return + +**Ungraceful Disruption** (4.20+): +- Unreachable node is fenced (powered off) +- Surviving node restarts etcd as cluster-of-one +- New cluster ID is assigned +- Failed node discards old DB and resyncs on restart + +## Implementation Checklist + +- [ ] Create slash command file structure +- [ ] Implement Ansible inventory validation +- [ ] Implement SSH connectivity test via Ansible +- [ ] Implement OpenShift cluster access validation +- [ ] Implement proxy.env detection and handling +- [ ] Create Ansible playbook for data collection (VM-level) +- [ ] Create oc command wrapper for proxy.env sourcing +- [ ] Implement data collection orchestration +- [ ] Create analysis functions for each component +- [ ] Implement error pattern matching +- [ ] Build troubleshooting decision tree +- [ ] Create output formatting for diagnostics +- [ ] Implement remediation procedure generator +- [ ] Add verification steps to procedures +- [ ] Test with various failure scenarios +- [ ] Test with direct cluster access (no proxy) +- [ ] Test with proxy.env required +- [ ] Test with missing proxy.env 
(graceful degradation) +- [ ] Document slash command usage +- [ ] Add examples of common issues + +## Success Criteria + +1. Command successfully validates Ansible access to cluster VMs +2. Command successfully validates OpenShift cluster access (with or without proxy) +3. Gracefully handles missing proxy.env with clear user warnings +4. Collects comprehensive etcd and Pacemaker status from VMs +5. Collects OpenShift cluster operator and node status +6. Identifies common failure patterns accurately +7. Provides clear, actionable troubleshooting procedures +8. Includes verification steps for each remediation +9. Handles edge cases gracefully (e.g., nodes unreachable, partial data collection) +10. Provides useful output even when some data collection fails + +## Future Enhancements + +- Interactive mode for step-by-step troubleshooting +- Automated remediation for common issues (with user confirmation) +- Historical log analysis to identify patterns over time +- Integration with OpenShift cluster-wide diagnostics +- Export diagnostics bundle for support cases +- Comparison with known-good cluster state diff --git a/website b/website new file mode 160000 index 0000000..a022204 --- /dev/null +++ b/website @@ -0,0 +1 @@ +Subproject commit a022204f20cd90d892ed9a268e4929b491a5cad6 From f40483f78149ac9246869838c527da88cd3fd25d Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 28 Oct 2025 10:09:38 +0100 Subject: [PATCH 04/19] Add access validation playbook --- .../etcd/playbooks/validate-access.yml | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .claude/commands/etcd/playbooks/validate-access.yml diff --git a/.claude/commands/etcd/playbooks/validate-access.yml b/.claude/commands/etcd/playbooks/validate-access.yml new file mode 100644 index 0000000..d85693b --- /dev/null +++ b/.claude/commands/etcd/playbooks/validate-access.yml @@ -0,0 +1,63 @@ +--- +# Ansible playbook to validate access to cluster VMs and basic connectivity +# This is the first step in the etcd troubleshooting workflow + +- name: Validate Ansible Access to Cluster VMs + hosts: cluster_vms + gather_facts: yes + become: no + + tasks: + - name: Test basic connectivity + ansible.builtin.ping: + register: ping_result + + - name: Display connectivity result + ansible.builtin.debug: + msg: "Successfully connected to {{ inventory_hostname }} ({{ ansible_host }})" + when: ping_result is succeeded + + - name: Check if required tools are available + ansible.builtin.command: which {{ item }} + loop: + - pcs + - podman + - journalctl + - crm_attribute + - etcdctl + register: tool_check + failed_when: false + changed_when: false + + - name: Report available tools + ansible.builtin.debug: + msg: "{{ item.item }}: {{ 'Available' if item.rc == 0 else 'NOT FOUND' }}" + loop: "{{ tool_check.results }}" + loop_control: + label: "{{ item.item }}" + + - name: Check sudo access + ansible.builtin.command: sudo -n true + register: sudo_check + failed_when: false + changed_when: false + + - name: Report sudo access + ansible.builtin.debug: + msg: "Sudo access: {{ 'Available (passwordless)' if sudo_check.rc == 0 else 'Requires password or not available' }}" + + - name: Gather system information + ansible.builtin.setup: + gather_subset: + - network + - hardware + register: facts + + - name: Display node information + ansible.builtin.debug: + msg: + - "Hostname: {{ ansible_hostname }}" + - "IP Address: {{ ansible_default_ipv4.address | default('N/A') }}" + - "Distribution: {{ ansible_distribution }} {{ 
ansible_distribution_version }}" + - "Kernel: {{ ansible_kernel }}" + - "Memory: {{ (ansible_memtotal_mb / 1024) | round(1) }} GB" From 31e049aaea5bba3ce0d653c0e079f8862e9dbd84 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 28 Oct 2025 10:09:51 +0100 Subject: [PATCH 05/19] Swap from slash command to skill --- .claude/commands/etcd/PROJECT.md | 10 +- .../commands/etcd/TROUBLESHOOTING_SKILL.md | 182 ++++++++++++++++++ 2 files changed, 187 insertions(+), 5 deletions(-) create mode 100644 .claude/commands/etcd/TROUBLESHOOTING_SKILL.md diff --git a/.claude/commands/etcd/PROJECT.md b/.claude/commands/etcd/PROJECT.md index 8cab38e..1cbc119 100644 --- a/.claude/commands/etcd/PROJECT.md +++ b/.claude/commands/etcd/PROJECT.md @@ -1,17 +1,17 @@ -# Etcd Troubleshooting Slash Command - Project File +# Etcd Troubleshooting Skill - Project File ## Project Overview -This project is for developing a Claude Code slash command that helps troubleshoot etcd issues on two-node with fencing OpenShift clusters. The command will leverage Ansible to access cluster VMs directly and analyze etcd health, pacemaker status, and relevant system logs to provide diagnostic insights and troubleshooting procedures. +This project is for developing a Claude Code skill that helps troubleshoot etcd issues on two-node with fencing OpenShift clusters. The skill enables Claude to iteratively diagnose and resolve issues by leveraging Ansible to access cluster VMs directly and analyze etcd health, pacemaker status, and relevant system logs to provide diagnostic insights and troubleshooting procedures. ## Objectives -1. Create a slash command that validates direct Ansible access to cluster VMs +1. Provide troubleshooting expertise that validates direct Ansible access to cluster VMs 2. Gather etcd and Pacemaker status using appropriate commands 3. Collect and analyze journalctl logs for both Pacemaker and etcd services 4. Analyze the collected data to identify common issues -5. Propose a structured troubleshooting procedure to the user -6. Provide actionable recommendations based on the analysis +5. Propose and execute structured troubleshooting procedures iteratively +6. Provide actionable recommendations based on ongoing analysis ## Target Environment diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md new file mode 100644 index 0000000..1bd7de0 --- /dev/null +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -0,0 +1,182 @@ +# Etcd Troubleshooting Skill + +This document defines the Claude Code skill for troubleshooting etcd issues on two-node OpenShift clusters with fencing topology. When activated, Claude becomes an expert etcd/Pacemaker troubleshooter capable of iterative diagnosis and remediation. + +## Skill Overview + +This skill enables Claude to: +- Validate and test access to cluster components via Ansible and OpenShift CLI +- Iteratively collect diagnostic data from Pacemaker, etcd, and OpenShift +- Analyze symptoms and identify root causes +- Propose and execute remediation steps +- Verify fixes and adjust approach based on results +- Provide comprehensive troubleshooting throughout the diagnostic process + +## Step-by-Step Procedure + +### 1. 
Validate Access + +**1.1 Ansible Inventory Validation:** +- Check if `deploy/openshift-clusters/inventory.ini` exists +- Verify the inventory file has valid cluster node entries +- Test SSH connectivity to cluster nodes using Ansible ping module + +**1.2 OpenShift Cluster Access Validation:** +- Test direct cluster access with `oc version` +- If direct access fails, check for `deploy/openshift-clusters/proxy.env` +- If proxy.env exists, source it before running oc commands +- Verify cluster access with `oc get nodes` +- Remember proxy requirement for all subsequent oc commands + +### 2. Collect Data + +Use Ansible to execute commands on cluster VMs (all commands require sudo/become): + +**Pacemaker Status:** +```bash +sudo pcs status +sudo pcs resource status +sudo pcs constraint list +sudo crm_mon -1 +``` + +**Etcd Container Status:** +```bash +sudo podman ps -a --filter name=etcd +sudo podman inspect etcd +sudo podman logs --tail 100 etcd +``` + +**Etcd Cluster Health:** +```bash +sudo podman exec etcd etcdctl member list -w table +sudo podman exec etcd etcdctl endpoint health -w table +sudo podman exec etcd etcdctl endpoint status -w table +``` + +**System Logs:** +```bash +sudo journalctl -u pacemaker --since "1 hour ago" -n 200 +sudo journalctl -u corosync --since "1 hour ago" -n 100 +sudo journalctl --grep etcd --since "1 hour ago" -n 200 +``` + +**Cluster Attributes:** +```bash +sudo crm_attribute --query --name standalone_node +sudo crm_attribute --query --name learner_node +sudo crm_attribute --query --name force_new_cluster --lifetime reboot +``` + +**OpenShift Cluster Status** (use proxy.env if needed): +```bash +oc get nodes -o wide +oc get co etcd -o yaml +oc get pods -n openshift-etcd +oc get mcp master -o yaml +oc get co --no-headers | grep -v "True.*False.*False" +oc get events -n openshift-etcd --sort-by='.lastTimestamp' | tail -50 +``` + +### 3. Analyze Collected Data + +Look for these key issues: + +**Cluster Quorum:** +- Corosync quorum status +- Pacemaker partition state +- Node online/offline status + +**Etcd Health:** +- Member list consistency +- Leader election status +- Endpoint health +- Learner vs. voting member status +- Cluster ID mismatches between nodes + +**Resource State:** +- Etcd resource running status +- Failed actions in Pacemaker +- Resource constraint violations + +**Common Error Patterns:** +- Certificate expiration/rotation issues +- Network connectivity problems +- Split-brain scenarios +- Fencing failures +- Data corruption indicators + +**OpenShift Integration:** +- Etcd operator status and conditions +- Unexpected etcd pods in openshift-etcd namespace (should not exist in TNF) +- Machine config pool degradation +- Cluster operator degradation related to etcd + +### 4. Provide Troubleshooting Procedure + +Based on your analysis, provide: + +1. **Diagnosis Summary**: Clear statement of identified issues +2. **Root Cause Analysis**: Likely causes based on symptoms +3. **Step-by-Step Remediation**: + - Ordered steps to resolve issues + - Specific commands to execute + - Expected outcomes at each step + - Rollback procedures if available +4. **Verification Steps**: How to confirm the issue is resolved +5. 
**Prevention Recommendations**: How to avoid recurrence + +## Key Context + +### Cluster States +- **Standalone**: Single node running as "cluster-of-one" +- **Learner**: Node rejoining cluster, not yet voting member +- **Force-new-cluster**: Flag to bootstrap new cluster from single node + +### Critical Attributes +- `standalone_node` - Which node is running standalone +- `learner_node` - Which node is rejoining as learner +- `force_new_cluster` - Bootstrap flag (lifetime: reboot) +- `cluster_id` - Etcd cluster ID (must match on both nodes) + +### Failure Scenarios + +**Graceful Disruption** (4.19+): +- Pacemaker intercepts reboot +- Removes node from etcd cluster +- Cluster continues as single node +- Node resyncs and rejoins on return + +**Ungraceful Disruption** (4.20+): +- Unreachable node is fenced (powered off) +- Surviving node restarts etcd as cluster-of-one +- New cluster ID is assigned +- Failed node discards old DB and resyncs on restart + +## Reference Documentation + +You have access to these slash commands for detailed information: +- `/etcd:etcd-ops-guide:clustering` - Cluster membership operations +- `/etcd:etcd-ops-guide:recovery` - Recovery procedures +- `/etcd:etcd-ops-guide:monitoring` - Monitoring and health checks +- `/etcd:etcd-ops-guide:failures` - Failure scenarios +- `/etcd:etcd-ops-guide:data_corruption` - Data corruption handling + +Pacemaker documentation is available in `.claude/commands/etcd/pacemaker/` directory. + +## Output Format + +Provide clear, concise diagnostics with: +- Markdown formatting for readability +- Code blocks for commands +- Clear sections for diagnosis, remediation, and verification +- Actionable next steps +- Links to relevant files when referencing code or logs + +## Important Notes + +- Handle cases where some data collection fails gracefully +- Provide useful output even with partial data +- Warn user clearly if proxy.env is required but missing +- Always use sudo/become for commands on cluster VMs via Ansible +- Be specific about which node to run commands on when relevant From 19b8739b0eb64baab560ff40fc28eaceeb047425 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 28 Oct 2025 19:13:58 +0100 Subject: [PATCH 06/19] Add diagnosis tools --- .../etcd/playbooks/collect-diagnostics.yml | 311 ++++++++++++++++++ .../etcd/scripts/collect-all-diagnostics.sh | 246 ++++++++++++++ .claude/commands/etcd/scripts/oc-wrapper.sh | 69 ++++ .../etcd/scripts/validate-cluster-access.sh | 133 ++++++++ 4 files changed, 759 insertions(+) create mode 100644 .claude/commands/etcd/playbooks/collect-diagnostics.yml create mode 100755 .claude/commands/etcd/scripts/collect-all-diagnostics.sh create mode 100755 .claude/commands/etcd/scripts/oc-wrapper.sh create mode 100755 .claude/commands/etcd/scripts/validate-cluster-access.sh diff --git a/.claude/commands/etcd/playbooks/collect-diagnostics.yml b/.claude/commands/etcd/playbooks/collect-diagnostics.yml new file mode 100644 index 0000000..cc8a30c --- /dev/null +++ b/.claude/commands/etcd/playbooks/collect-diagnostics.yml @@ -0,0 +1,311 @@ +--- +# Comprehensive diagnostic data collection for etcd troubleshooting +# Collects Pacemaker status, etcd health, container status, and system logs + +- name: Collect Etcd and Pacemaker Diagnostics + hosts: cluster_vms + gather_facts: yes + become: yes + + vars: + log_timeframe: "1 hour ago" + log_lines_pacemaker: 200 + log_lines_corosync: 100 + log_lines_etcd: 200 + output_dir: "/tmp/etcd-diagnostics-{{ ansible_date_time.iso8601_basic_short }}" + + tasks: + - 
name: Create local output directory + ansible.builtin.file: + path: "{{ output_dir }}/{{ inventory_hostname }}" + state: directory + mode: '0755' + delegate_to: localhost + become: no + + # ============================================================ + # Pacemaker Status Collection + # ============================================================ + + - name: Get pcs status + ansible.builtin.command: pcs status + register: pcs_status + changed_when: false + failed_when: false + + - name: Save pcs status + ansible.builtin.copy: + content: "{{ pcs_status.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/pcs_status.txt" + delegate_to: localhost + become: no + + - name: Get pcs resource status + ansible.builtin.command: pcs resource status + register: pcs_resource_status + changed_when: false + failed_when: false + + - name: Save pcs resource status + ansible.builtin.copy: + content: "{{ pcs_resource_status.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/pcs_resource_status.txt" + delegate_to: localhost + become: no + + - name: Get pcs constraint list + ansible.builtin.command: pcs constraint list + register: pcs_constraints + changed_when: false + failed_when: false + + - name: Save pcs constraints + ansible.builtin.copy: + content: "{{ pcs_constraints.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/pcs_constraints.txt" + delegate_to: localhost + become: no + + - name: Get crm_mon output + ansible.builtin.command: crm_mon -1 + register: crm_mon + changed_when: false + failed_when: false + + - name: Save crm_mon output + ansible.builtin.copy: + content: "{{ crm_mon.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/crm_mon.txt" + delegate_to: localhost + become: no + + # ============================================================ + # CIB Attributes Collection + # ============================================================ + + - name: Query standalone_node attribute + ansible.builtin.command: crm_attribute --query --name standalone_node + register: attr_standalone + changed_when: false + failed_when: false + + - name: Query learner_node attribute + ansible.builtin.command: crm_attribute --query --name learner_node + register: attr_learner + changed_when: false + failed_when: false + + - name: Query force_new_cluster attribute + ansible.builtin.command: crm_attribute --query --name force_new_cluster --lifetime reboot + register: attr_force_new + changed_when: false + failed_when: false + + - name: Query node_ip attribute + ansible.builtin.command: crm_attribute --query --name node_ip + register: attr_node_ip + changed_when: false + failed_when: false + + - name: Query member_id attribute + ansible.builtin.command: crm_attribute --query --name member_id + register: attr_member_id + changed_when: false + failed_when: false + + - name: Query cluster_id attribute + ansible.builtin.command: crm_attribute --query --name cluster_id + register: attr_cluster_id + changed_when: false + failed_when: false + + - name: Query revision attribute + ansible.builtin.command: crm_attribute --query --name revision + register: attr_revision + changed_when: false + failed_when: false + + - name: Save CIB attributes + ansible.builtin.copy: + content: | + standalone_node: {{ attr_standalone.stdout | default('N/A') }} + learner_node: {{ attr_learner.stdout | default('N/A') }} + force_new_cluster: {{ attr_force_new.stdout | default('N/A') }} + node_ip: {{ attr_node_ip.stdout | default('N/A') }} + member_id: {{ attr_member_id.stdout | default('N/A') }} + cluster_id: {{ 
attr_cluster_id.stdout | default('N/A') }} + revision: {{ attr_revision.stdout | default('N/A') }} + dest: "{{ output_dir }}/{{ inventory_hostname }}/cib_attributes.txt" + delegate_to: localhost + become: no + + # ============================================================ + # Etcd Container Status Collection + # ============================================================ + + - name: Get podman ps for etcd + ansible.builtin.command: podman ps -a --filter name=etcd + register: podman_ps + changed_when: false + failed_when: false + + - name: Save podman ps output + ansible.builtin.copy: + content: "{{ podman_ps.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/podman_ps.txt" + delegate_to: localhost + become: no + + - name: Get podman inspect for etcd + ansible.builtin.command: podman inspect etcd + register: podman_inspect + changed_when: false + failed_when: false + + - name: Save podman inspect output + ansible.builtin.copy: + content: "{{ podman_inspect.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/podman_inspect.json" + delegate_to: localhost + become: no + when: podman_inspect.rc == 0 + + - name: Get podman logs for etcd + ansible.builtin.command: podman logs --tail 100 etcd + register: podman_logs + changed_when: false + failed_when: false + + - name: Save podman logs + ansible.builtin.copy: + content: "{{ podman_logs.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/podman_logs.txt" + delegate_to: localhost + become: no + when: podman_logs.rc == 0 + + # ============================================================ + # Etcd Cluster Health Collection + # ============================================================ + + - name: Get etcd member list + ansible.builtin.command: podman exec etcd etcdctl member list -w table + register: etcd_member_list + changed_when: false + failed_when: false + + - name: Save etcd member list + ansible.builtin.copy: + content: "{{ etcd_member_list.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/etcd_member_list.txt" + delegate_to: localhost + become: no + when: etcd_member_list.rc == 0 + + - name: Get etcd endpoint health + ansible.builtin.command: podman exec etcd etcdctl endpoint health -w table + register: etcd_health + changed_when: false + failed_when: false + + - name: Save etcd endpoint health + ansible.builtin.copy: + content: "{{ etcd_health.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/etcd_endpoint_health.txt" + delegate_to: localhost + become: no + when: etcd_health.rc == 0 + + - name: Get etcd endpoint status + ansible.builtin.command: podman exec etcd etcdctl endpoint status -w table + register: etcd_status + changed_when: false + failed_when: false + + - name: Save etcd endpoint status + ansible.builtin.copy: + content: "{{ etcd_status.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/etcd_endpoint_status.txt" + delegate_to: localhost + become: no + when: etcd_status.rc == 0 + + # ============================================================ + # System Logs Collection + # ============================================================ + + - name: Get Pacemaker logs + ansible.builtin.command: journalctl -u pacemaker --since "{{ log_timeframe }}" -n {{ log_lines_pacemaker }} + register: journal_pacemaker + changed_when: false + failed_when: false + + - name: Save Pacemaker logs + ansible.builtin.copy: + content: "{{ journal_pacemaker.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/journal_pacemaker.log" + delegate_to: localhost + become: no + + - name: 
Get Corosync logs + ansible.builtin.command: journalctl -u corosync --since "{{ log_timeframe }}" -n {{ log_lines_corosync }} + register: journal_corosync + changed_when: false + failed_when: false + + - name: Save Corosync logs + ansible.builtin.copy: + content: "{{ journal_corosync.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/journal_corosync.log" + delegate_to: localhost + become: no + + - name: Get etcd-related logs + ansible.builtin.command: journalctl --grep etcd --since "{{ log_timeframe }}" -n {{ log_lines_etcd }} + register: journal_etcd + changed_when: false + failed_when: false + + - name: Save etcd logs + ansible.builtin.copy: + content: "{{ journal_etcd.stdout }}" + dest: "{{ output_dir }}/{{ inventory_hostname }}/journal_etcd.log" + delegate_to: localhost + become: no + + # ============================================================ + # Summary + # ============================================================ + + - name: Display collection summary + ansible.builtin.debug: + msg: + - "Diagnostics collected for {{ inventory_hostname }}" + - "Output directory: {{ output_dir }}/{{ inventory_hostname }}" + - "Etcd running: {{ 'Yes' if podman_ps.stdout is search('etcd') else 'No' }}" + - "Cluster ID: {{ attr_cluster_id.stdout | default('N/A') }}" + + post_tasks: + - name: Create diagnostics summary + ansible.builtin.copy: + content: | + Etcd Diagnostics Collection Summary + ==================================== + Collection Time: {{ ansible_date_time.iso8601 }} + Output Directory: {{ output_dir }} + + Nodes Analyzed: + {% for host in groups['cluster_vms'] %} + - {{ host }}: {{ hostvars[host]['ansible_host'] }} + {% endfor %} + + Next Steps: + 1. Review the collected data in {{ output_dir }} + 2. Analyze Pacemaker status for resource failures + 3. Check etcd cluster health and member status + 4. Examine logs for error patterns + 5. Compare cluster_id values between nodes + dest: "{{ output_dir }}/README.txt" + delegate_to: localhost + become: no + run_once: yes diff --git a/.claude/commands/etcd/scripts/collect-all-diagnostics.sh b/.claude/commands/etcd/scripts/collect-all-diagnostics.sh new file mode 100755 index 0000000..5d0c787 --- /dev/null +++ b/.claude/commands/etcd/scripts/collect-all-diagnostics.sh @@ -0,0 +1,246 @@ +#!/usr/bin/bash +# Master orchestration script for collecting all etcd/Pacemaker diagnostics +# Collects both VM-level data (via Ansible) and cluster-level data (via oc) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" +INVENTORY_PATH="${INVENTORY_PATH:-deploy/openshift-clusters/inventory.ini}" +PROXY_ENV_PATH="${PROXY_ENV_PATH:-deploy/openshift-clusters/proxy.env}" +TIMESTAMP=$(date +%Y%m%dT%H%M%S) +OUTPUT_DIR="/tmp/etcd-diagnostics-${TIMESTAMP}" + +# Color codes +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +info() { + echo -e "${GREEN}✓${NC} $*" +} + +warn() { + echo -e "${YELLOW}⚠${NC} $*" +} + +error() { + echo -e "${RED}✗${NC} $*" +} + +section() { + echo -e "\n${BLUE}===${NC} $* ${BLUE}===${NC}" +} + +cd "${REPO_ROOT}" + +section "Etcd/Pacemaker Diagnostic Collection" +echo "Timestamp: ${TIMESTAMP}" +echo "Output Directory: ${OUTPUT_DIR}" + +# Create output directory +mkdir -p "${OUTPUT_DIR}/openshift" + +# ============================================================ +# Phase 1: Validate Access +# ============================================================ + +section "Phase 1: Validating Access" + +if ! 
"${SCRIPT_DIR}/validate-cluster-access.sh"; then + error "Access validation failed. Please resolve issues before collecting diagnostics." + exit 1 +fi + +# ============================================================ +# Phase 2: Collect VM-Level Data +# ============================================================ + +section "Phase 2: Collecting VM-Level Diagnostics (Pacemaker/Etcd)" + +if ! ansible-playbook "${SCRIPT_DIR}/../playbooks/collect-diagnostics.yml" \ + -i "${INVENTORY_PATH}" \ + -e "output_dir=${OUTPUT_DIR}"; then + error "VM-level data collection failed" + exit 1 +fi + +info "VM-level diagnostics collected successfully" + +# ============================================================ +# Phase 3: Collect OpenShift Cluster Data +# ============================================================ + +section "Phase 3: Collecting OpenShift Cluster-Level Diagnostics" + +# Determine if proxy is needed +PROXY_REQUIRED=false +if ! oc version --request-timeout=5s &>/dev/null; then + if [ -f "${PROXY_ENV_PATH}" ]; then + info "Sourcing proxy configuration for cluster access" + # shellcheck disable=SC1090 + source "${PROXY_ENV_PATH}" + PROXY_REQUIRED=true + else + warn "Cannot access cluster and no proxy.env found - skipping cluster-level collection" + PROXY_REQUIRED=skip + fi +fi + +if [ "${PROXY_REQUIRED}" != "skip" ]; then + # Collect node information + info "Collecting node status" + oc get nodes -o wide > "${OUTPUT_DIR}/openshift/nodes.txt" 2>&1 || warn "Failed to get nodes" + oc get nodes -o yaml > "${OUTPUT_DIR}/openshift/nodes.yaml" 2>&1 || warn "Failed to get nodes yaml" + + # Collect etcd operator status + info "Collecting etcd cluster operator status" + oc get co etcd > "${OUTPUT_DIR}/openshift/etcd_operator.txt" 2>&1 || warn "Failed to get etcd operator" + oc get co etcd -o yaml > "${OUTPUT_DIR}/openshift/etcd_operator.yaml" 2>&1 || warn "Failed to get etcd operator yaml" + + # Collect all cluster operators + info "Collecting all cluster operators" + oc get co > "${OUTPUT_DIR}/openshift/cluster_operators.txt" 2>&1 || warn "Failed to get cluster operators" + oc get co -o yaml > "${OUTPUT_DIR}/openshift/cluster_operators.yaml" 2>&1 || warn "Failed to get cluster operators yaml" + + # Check for degraded operators + info "Checking for degraded operators" + oc get co --no-headers | grep -v "True.*False.*False" > "${OUTPUT_DIR}/openshift/degraded_operators.txt" 2>&1 || true + + # Collect etcd pods (should not exist in TNF, but check anyway) + info "Checking for etcd pods in openshift-etcd namespace" + oc get pods -n openshift-etcd > "${OUTPUT_DIR}/openshift/etcd_pods.txt" 2>&1 || warn "Failed to get etcd pods" + oc get pods -n openshift-etcd -o yaml > "${OUTPUT_DIR}/openshift/etcd_pods.yaml" 2>&1 || true + + # Collect machine config pool status + info "Collecting machine config pool status" + oc get mcp master > "${OUTPUT_DIR}/openshift/mcp_master.txt" 2>&1 || warn "Failed to get MCP master" + oc get mcp master -o yaml > "${OUTPUT_DIR}/openshift/mcp_master.yaml" 2>&1 || warn "Failed to get MCP master yaml" + + # Collect recent events + info "Collecting recent etcd-related events" + oc get events -n openshift-etcd --sort-by='.lastTimestamp' > "${OUTPUT_DIR}/openshift/etcd_events.txt" 2>&1 || warn "Failed to get etcd events" + oc get events -A --sort-by='.lastTimestamp' | tail -100 > "${OUTPUT_DIR}/openshift/recent_events.txt" 2>&1 || warn "Failed to get recent events" + + info "OpenShift cluster-level diagnostics collected successfully" +else + warn "Skipped OpenShift cluster-level data 
collection (no cluster access)" +fi + +# ============================================================ +# Phase 4: Create Summary Report +# ============================================================ + +section "Phase 4: Creating Summary Report" + +cat > "${OUTPUT_DIR}/DIAGNOSTIC_REPORT.txt" </ (VM-level diagnostics for node 1) +│ ├── pcs_status.txt +│ ├── pcs_resource_status.txt +│ ├── cib_attributes.txt +│ ├── podman_ps.txt +│ ├── podman_inspect.json +│ ├── podman_logs.txt +│ ├── etcd_member_list.txt +│ ├── etcd_endpoint_health.txt +│ ├── etcd_endpoint_status.txt +│ ├── journal_pacemaker.log +│ ├── journal_corosync.log +│ └── journal_etcd.log +├── / (VM-level diagnostics for node 2) +│ └── (same structure as node-1) +└── openshift/ (cluster-level diagnostics) + ├── nodes.txt + ├── nodes.yaml + ├── etcd_operator.txt + ├── etcd_operator.yaml + ├── cluster_operators.txt + ├── degraded_operators.txt + ├── etcd_pods.txt + ├── mcp_master.txt + └── etcd_events.txt + +Access Configuration: +- Inventory: ${INVENTORY_PATH} +- Proxy Required: ${PROXY_REQUIRED} +EOF + +if [ "${PROXY_REQUIRED}" = "true" ]; then + echo "- Proxy Config: ${PROXY_ENV_PATH}" >> "${OUTPUT_DIR}/DIAGNOSTIC_REPORT.txt" +fi + +cat >> "${OUTPUT_DIR}/DIAGNOSTIC_REPORT.txt" <&2 +} + +warn() { + echo -e "${YELLOW}[WARN]${NC} $*" >&2 +} + +error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +# Change to repo root for consistent path resolution +cd "${REPO_ROOT}" + +# Check if oc is available +if ! command -v oc &> /dev/null; then + error "oc command not found in PATH" + exit 1 +fi + +# Try direct access first +if oc version --request-timeout=5s &>/dev/null; then + info "Direct cluster access available" + exec oc "$@" +fi + +# Direct access failed, check for proxy.env +if [ ! -f "${PROXY_ENV_PATH}" ]; then + error "No direct cluster access and proxy.env not found at: ${PROXY_ENV_PATH}" + error "Please ensure cluster is accessible or create proxy configuration" + exit 1 +fi + +# Source proxy.env and retry +info "Direct access failed, sourcing proxy configuration: ${PROXY_ENV_PATH}" + +# shellcheck disable=SC1090 +source "${PROXY_ENV_PATH}" + +# Verify proxy access works +if ! oc version --request-timeout=5s &>/dev/null; then + error "Cluster access failed even with proxy configuration" + error "Please verify proxy.env settings and cluster availability" + exit 1 +fi + +info "Cluster access via proxy successful" + +# Execute the oc command with all original arguments +exec oc "$@" diff --git a/.claude/commands/etcd/scripts/validate-cluster-access.sh b/.claude/commands/etcd/scripts/validate-cluster-access.sh new file mode 100755 index 0000000..9b88db0 --- /dev/null +++ b/.claude/commands/etcd/scripts/validate-cluster-access.sh @@ -0,0 +1,133 @@ +#!/usr/bin/bash +# Validate access to cluster VMs via Ansible and OpenShift cluster via oc +# This is the comprehensive validation script for etcd troubleshooting + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." 
&& pwd)" +INVENTORY_PATH="${INVENTORY_PATH:-deploy/openshift-clusters/inventory.ini}" +PROXY_ENV_PATH="${PROXY_ENV_PATH:-deploy/openshift-clusters/proxy.env}" + +# Color codes +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +info() { + echo -e "${GREEN}✓${NC} $*" +} + +warn() { + echo -e "${YELLOW}⚠${NC} $*" +} + +error() { + echo -e "${RED}✗${NC} $*" +} + +section() { + echo -e "\n${BLUE}===${NC} $* ${BLUE}===${NC}" +} + +cd "${REPO_ROOT}" + +EXIT_CODE=0 + +section "Validating Ansible Access to Cluster VMs" + +# Check if inventory exists +if [ ! -f "${INVENTORY_PATH}" ]; then + error "Inventory file not found: ${INVENTORY_PATH}" + EXIT_CODE=1 +else + info "Inventory file found: ${INVENTORY_PATH}" + + # Run Ansible validation playbook + if ansible-playbook "${SCRIPT_DIR}/../playbooks/validate-access.yml" \ + -i "${INVENTORY_PATH}" > /tmp/ansible-validation.log 2>&1; then + info "Ansible connectivity test passed" + echo " See /tmp/ansible-validation.log for details" + else + error "Ansible connectivity test failed" + echo " See /tmp/ansible-validation.log for details" + EXIT_CODE=1 + fi +fi + +section "Validating OpenShift Cluster Access" + +# Check if oc is available +if ! command -v oc &> /dev/null; then + error "oc command not found in PATH" + EXIT_CODE=1 +else + info "oc command found" + + # Try direct access + if oc version --request-timeout=5s &>/dev/null; then + info "Direct cluster access successful" + PROXY_REQUIRED=false + else + warn "Direct cluster access failed" + + # Check for proxy.env + if [ -f "${PROXY_ENV_PATH}" ]; then + info "Found proxy configuration: ${PROXY_ENV_PATH}" + + # Source and test proxy access + # shellcheck disable=SC1090 + source "${PROXY_ENV_PATH}" + + if oc version --request-timeout=5s &>/dev/null; then + info "Cluster access via proxy successful" + PROXY_REQUIRED=true + else + error "Cluster access failed even with proxy" + EXIT_CODE=1 + fi + else + error "No proxy.env found at: ${PROXY_ENV_PATH}" + error "Cluster access unavailable" + EXIT_CODE=1 + fi + fi + + # If we have cluster access, test basic operations + if [ ${EXIT_CODE} -eq 0 ]; then + section "Testing OpenShift Cluster Operations" + + if oc get nodes &>/dev/null; then + info "Successfully queried cluster nodes" + oc get nodes -o wide | sed 's/^/ /' + else + error "Failed to query cluster nodes" + EXIT_CODE=1 + fi + + if oc get co etcd &>/dev/null; then + info "Successfully queried etcd cluster operator" + oc get co etcd | sed 's/^/ /' + else + error "Failed to query etcd cluster operator" + EXIT_CODE=1 + fi + fi +fi + +section "Validation Summary" + +if [ ${EXIT_CODE} -eq 0 ]; then + info "All validation checks passed" + if [ "${PROXY_REQUIRED:-false}" = "true" ]; then + warn "NOTE: Cluster access requires proxy.env to be sourced" + echo " Use the oc-wrapper.sh script or source ${PROXY_ENV_PATH} before oc commands" + fi +else + error "Some validation checks failed" + echo "Please resolve the issues above before proceeding with troubleshooting" +fi + +exit ${EXIT_CODE} From 3dee48291d373932ca966c0323ce7385edd32284 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 28 Oct 2025 19:14:06 +0100 Subject: [PATCH 07/19] Update READMEs --- .claude/commands/etcd/PERMISSIONS.md | 212 +++++++++++ .claude/commands/etcd/PROJECT.md | 8 + .claude/commands/etcd/README.md | 358 ++++++++++++++++++ .../commands/etcd/TROUBLESHOOTING_SKILL.md | 317 ++++++++++++++++ 4 files changed, 895 insertions(+) create mode 100644 .claude/commands/etcd/PERMISSIONS.md create 
mode 100644 .claude/commands/etcd/README.md diff --git a/.claude/commands/etcd/PERMISSIONS.md b/.claude/commands/etcd/PERMISSIONS.md new file mode 100644 index 0000000..37dd48c --- /dev/null +++ b/.claude/commands/etcd/PERMISSIONS.md @@ -0,0 +1,212 @@ +# Etcd Troubleshooting Skill - Permission Configuration + +This document defines the permission grants for the etcd troubleshooting skill to enable faster diagnostics without requiring user approval for read-only operations. + +## Permission Philosophy + +**Automatic (No User Approval Required):** +- Read-only operations on diagnostic data +- File reading from diagnostic output directories +- Basic Ansible fact gathering (no changes) +- OpenShift cluster status queries (read-only) + +**Requires User Approval:** +- Any operation that modifies cluster state +- Running Ansible playbooks (except validation) +- Executing remediation scripts +- Pacemaker resource operations (cleanup, restart, etc.) + +## Granted Permissions + +### Bash Tool - Read-Only Commands + +The following Bash commands are automatically approved for execution without user permission: + +``` +# File reading operations +Bash(cat:*) # Read any file +Bash(head:*) # Read beginning of files +Bash(tail:*) # Read end of files +Bash(less:*) # Page through files + +# File searching and filtering +Bash(grep:*) # Search file contents +Bash(find:*) # Find files +Bash(ls:*) # List directory contents + +# Diagnostic data inspection +Bash(jq:*) # Parse JSON output +Bash(yq:*) # Parse YAML output + +# Git read-only operations +Bash(git log:*) # View git history +Bash(git status:*) # Check git status +Bash(git diff:*) # View differences + +# Ansible read-only operations +Bash(ansible cluster_vms -i * -m ping) # Test connectivity +Bash(ansible cluster_vms -i * -m setup) # Gather facts +Bash(ansible *_master_* -i * -m shell -a "cat *") # Read files via Ansible +Bash(ansible *_master_* -i * -m shell -a "grep *") # Search files via Ansible +Bash(ansible *_master_* -i * -m shell -a "tail *") # Read file ends via Ansible +Bash(ansible *_master_* -i * -m shell -a "pcs status*") # Read Pacemaker status +Bash(ansible *_master_* -i * -m shell -a "pcs resource status*") # Read resource status +Bash(ansible *_master_* -i * -m shell -a "podman ps*") # List containers +Bash(ansible *_master_* -i * -m shell -a "podman exec etcd etcdctl member list*") # Read member list +Bash(ansible *_master_* -i * -m shell -a "podman exec etcd etcdctl endpoint health*") # Check health +Bash(ansible *_master_* -i * -m shell -a "podman exec etcd etcdctl endpoint status*") # Check status +Bash(ansible *_master_* -i * -m shell -a "crm_attribute --query*") # Query CIB attributes +Bash(ansible *_master_* -i * -m shell -a "journalctl*") # Read system logs +Bash(ansible *_master_* -i * -m shell -a "systemctl status*") # Check service status + +# OpenShift read-only operations (via oc-wrapper or with proxy sourcing) +Bash(source deploy/openshift-clusters/proxy.env && oc get*) # Read cluster resources +Bash(source deploy/openshift-clusters/proxy.env && oc describe*) # Describe resources +Bash(source deploy/openshift-clusters/proxy.env && oc logs*) # Read pod logs +Bash(*oc-wrapper.sh get*) # Get resources via wrapper +Bash(*oc-wrapper.sh describe*) # Describe resources via wrapper +Bash(*oc-wrapper.sh logs*) # Read logs via wrapper +``` + +### Read Tool - Diagnostic Directories + +The following paths are automatically approved for reading: + +``` +Read(/tmp/etcd-diagnostics-*/*) # All diagnostic collection outputs 
+Read(/tmp/ansible-validation.log) # Ansible validation output +Read(deploy/openshift-clusters/inventory.ini) # Inventory file (read-only) +Read(deploy/openshift-clusters/proxy.env) # Proxy configuration (read-only) +Read(.claude/commands/etcd/**) # Skill documentation +``` + +### Validation Scripts (Read-Only) + +These scripts only validate access and don't modify state: + +``` +Bash(.claude/commands/etcd/scripts/validate-cluster-access.sh) +``` + +## Operations Requiring User Approval + +The following operations will always prompt for user approval: + +### Ansible Playbooks + +``` +ansible-playbook */collect-diagnostics.yml # Requires approval (executes many commands) +ansible-playbook */validate-access.yml # Requires approval +ansible-playbook helpers/force-new-cluster.yml # ALWAYS requires approval (destructive) +ansible-playbook * # Any other playbook +``` + +### Orchestration Scripts + +``` +.claude/commands/etcd/scripts/collect-all-diagnostics.sh # Requires approval (runs playbook) +``` + +### Pacemaker Operations (Write) + +``` +ansible * -m shell -a "pcs resource cleanup*" # Requires approval (clears failures) +ansible * -m shell -a "pcs resource restart*" # Requires approval (restarts resources) +ansible * -m shell -a "pcs resource disable*" # Requires approval (disables resources) +ansible * -m shell -a "pcs resource enable*" # Requires approval (enables resources) +ansible * -m shell -a "pcs property set*" # Requires approval (changes config) +ansible * -m shell -a "crm_attribute --delete*" # Requires approval (modifies CIB) +ansible * -m shell -a "crm_attribute --update*" # Requires approval (modifies CIB) +``` + +### Etcd Operations (Write) + +``` +ansible * -m shell -a "podman exec etcd etcdctl member remove*" # Requires approval +ansible * -m shell -a "podman exec etcd etcdctl member add*" # Requires approval +ansible * -m shell -a "podman exec etcd etcdctl put*" # Requires approval +ansible * -m shell -a "podman exec etcd etcdctl del*" # Requires approval +``` + +### System Operations + +``` +ansible * -m shell -a "systemctl restart*" # Requires approval +ansible * -m shell -a "systemctl stop*" # Requires approval +ansible * -m shell -a "systemctl start*" # Requires approval +ansible * -m shell -a "reboot*" # Requires approval +``` + +## Usage in Claude Code + +To apply these permissions, they need to be added to the Claude Code system configuration. This is typically done in one of two ways: + +1. **Project-level**: In `.claude/settings.json` or project configuration +2. 
**User-level**: In global Claude Code settings + +### Example Configuration Format + +```json +{ + "autoApprove": { + "bash": [ + "cat:*", + "tail:*", + "head:*", + "grep:*", + "ls:*", + "git status:*", + "git log:*", + "ansible cluster_vms -i * -m ping", + "source deploy/openshift-clusters/proxy.env && oc get*" + ], + "read": [ + "/tmp/etcd-diagnostics-*/**", + "/tmp/ansible-validation.log", + "deploy/openshift-clusters/inventory.ini", + "deploy/openshift-clusters/proxy.env", + ".claude/commands/etcd/**" + ] + } +} +``` + +## Safety Considerations + +### Why These Permissions Are Safe + +**Read-only Bash commands:** +- Cannot modify cluster state +- Cannot delete data +- Cannot change configurations +- Only inspect and report + +**Read tool permissions:** +- Limited to diagnostic output and documentation +- No write access to sensitive files +- Inventory and proxy.env are read-only copies + +**Validation scripts:** +- Only test connectivity +- Don't execute remediation actions +- Safe to run repeatedly + +### What Remains Protected + +**Anything that changes state:** +- Resource operations (cleanup, restart, etc.) +- CIB attribute modifications +- Playbook executions +- Service restarts +- Member additions/removals + +This ensures the skill can quickly gather and analyze diagnostic information while still requiring explicit user approval for any corrective actions. + +## Updating Permissions + +As the skill evolves, this document should be updated to reflect: +1. New safe read-only operations that can be auto-approved +2. New operations that require user approval +3. Any changes to the permission boundaries + +When in doubt, default to requiring user approval - it's better to ask permission than to execute an unexpected operation. diff --git a/.claude/commands/etcd/PROJECT.md b/.claude/commands/etcd/PROJECT.md index 1cbc119..4fce476 100644 --- a/.claude/commands/etcd/PROJECT.md +++ b/.claude/commands/etcd/PROJECT.md @@ -58,6 +58,14 @@ This project is for developing a Claude Code skill that helps troubleshoot etcd - `moving.rst` - Resource movement and migration - `options.rst` - Configuration options +4. **Remediation Tools**: `helpers/` + - `force-new-cluster.yml` - Ansible playbook for automated cluster recovery + - Sets force_new_cluster CIB attribute on leader node + - Clears conflicting attributes (learner_node, standalone_node) + - Removes follower from etcd member list + - Creates etcd snapshots before recovery + - Handles both scenarios: etcd running on leader, or etcd stopped on both nodes + ## Technical Approach ### Phase 1: Validation diff --git a/.claude/commands/etcd/README.md b/.claude/commands/etcd/README.md new file mode 100644 index 0000000..e3c83e1 --- /dev/null +++ b/.claude/commands/etcd/README.md @@ -0,0 +1,358 @@ +# Etcd Troubleshooting Skill for Two-Node Clusters + +This directory contains the etcd/Pacemaker troubleshooting skill for Claude Code, designed specifically for two-node OpenShift clusters with fencing topology. + +## Overview + +The etcd troubleshooting skill enables Claude to interactively diagnose and resolve etcd and Pacemaker issues on two-node clusters. 
It provides: + +- Automated diagnostic data collection from cluster VMs and OpenShift +- Systematic analysis frameworks for identifying root causes +- Step-by-step remediation procedures +- Verification and prevention recommendations + +## Directory Structure + +``` +.claude/commands/etcd/ +├── README.md # This file +├── PROJECT.md # Project specification and checklist +├── TROUBLESHOOTING_SKILL.md # Skill definition and guidelines +├── playbooks/ # Ansible playbooks +│ ├── validate-access.yml # Validate Ansible connectivity +│ └── collect-diagnostics.yml # Collect VM-level diagnostics +├── scripts/ # Helper scripts +│ ├── validate-cluster-access.sh # Validate both Ansible and oc access +│ ├── collect-all-diagnostics.sh # Master orchestration script +│ └── oc-wrapper.sh # oc wrapper with proxy.env handling +├── etcd-ops-guide/ # Etcd operations documentation +│ ├── clustering.md +│ ├── recovery.md +│ ├── monitoring.md +│ ├── failures.md +│ └── ... (other etcd docs) +└── pacemaker/ # Pacemaker documentation + ├── podman-etcd.sh # The resource agent script + └── Pacemaker_Administration/ # Pacemaker admin guides +``` + +## Quick Start + +### Activating the Skill + +In Claude Code, the etcd troubleshooting skill is activated by reading the TROUBLESHOOTING_SKILL.md file or referencing it in your request: + +``` +"Help me troubleshoot etcd issues on my two-node cluster. Use the etcd troubleshooting skill." +``` + +### Running Diagnostic Collection + +The fastest way to gather all diagnostics: + +```bash +# From repository root +./.claude/commands/etcd/scripts/collect-all-diagnostics.sh +``` + +This will: +1. Validate Ansible access to cluster VMs +2. Validate OpenShift cluster access (with proxy detection) +3. Collect VM-level diagnostics (Pacemaker, etcd, containers, logs) +4. Collect OpenShift cluster-level diagnostics (operators, nodes, events) +5. Generate a summary report with analysis guidance + +### Individual Components + +**Validate Access Only:** +```bash +./.claude/commands/etcd/scripts/validate-cluster-access.sh +``` + +**Collect VM-Level Diagnostics Only:** +```bash +ansible-playbook .claude/commands/etcd/playbooks/collect-diagnostics.yml \ + -i deploy/openshift-clusters/inventory.ini +``` + +**Use oc with Automatic Proxy Handling:** +```bash +./.claude/commands/etcd/scripts/oc-wrapper.sh get nodes +./.claude/commands/etcd/scripts/oc-wrapper.sh get co etcd +``` + +## Prerequisites + +### Environment Requirements + +- Ansible inventory at `deploy/openshift-clusters/inventory.ini` +- SSH access to cluster VMs (usually via ProxyJump through bastion) +- `oc` command in PATH +- Optional: `deploy/openshift-clusters/proxy.env` for cluster access + +### Cluster Requirements + +- Two-node OpenShift cluster with fencing topology +- Pacemaker and Corosync running on both nodes +- Etcd running as Podman containers managed by Pacemaker +- Stonith (fencing) configured + +## Usage Patterns + +### Pattern 1: Interactive Troubleshooting + +When working with Claude interactively: + +1. **Describe the issue** to Claude +2. Claude will **follow the decision tree** in TROUBLESHOOTING_SKILL.md +3. Claude will **collect necessary data** using playbooks/scripts +4. Claude will **analyze** the data systematically +5. Claude will **propose remediation** steps +6. You **execute or approve** the remediation +7. Claude helps **verify** the fix worked +8. Claude provides **prevention** recommendations + +### Pattern 2: Automated Diagnostics + +When you want to gather all data first: + +1. 
Run `collect-all-diagnostics.sh` +2. Share the output directory with Claude +3. Claude analyzes the collected data +4. Claude provides diagnosis and remediation plan + +### Pattern 3: Specific Issue Investigation + +When you know the general area of the problem: + +1. Tell Claude the symptoms (e.g., "etcd container won't start on node-1") +2. Claude uses targeted data collection +3. Claude applies component-specific analysis (see TROUBLESHOOTING_SKILL.md) +4. Claude provides focused remediation + +## Key Features + +### Proxy Handling + +All scripts automatically detect and handle proxy requirements: + +- Direct cluster access is tried first +- Falls back to `proxy.env` if needed +- Gracefully handles missing proxy.env with warnings +- `oc-wrapper.sh` can be used for all oc commands + +### Comprehensive Data Collection + +The collect-diagnostics playbook gathers: + +**Pacemaker:** +- Cluster status and resource status +- Constraints and failed actions +- CIB attributes (cluster_id, standalone_node, etc.) + +**Etcd:** +- Container status and logs +- Member list and endpoint health +- Cluster health and leadership info + +**Logs:** +- Pacemaker, Corosync, and etcd journalctl logs +- Configurable timeframe and line limits + +**OpenShift:** +- Node status and conditions +- Etcd operator status +- Cluster operator health +- Recent events + +### Systematic Analysis + +Claude follows structured analysis frameworks (see TROUBLESHOOTING_SKILL.md): + +- Component-specific analysis functions +- Decision tree for systematic diagnosis +- Error pattern matching guidelines +- Common issue recognition + +## Common Scenarios + +### Scenario: Different Cluster IDs + +**Symptoms:** Etcd won't start, nodes show different cluster_id in CIB attributes + +**Quick Fix:** +```bash +# Use the force-new-cluster helper +ansible-playbook helpers/force-new-cluster.yml \ + -i deploy/openshift-clusters/inventory.ini +``` + +This designates the first node in inventory as leader and forces follower to resync. + +### Scenario: Resource Failed to Start + +**Symptoms:** pcs status shows "Failed Resource Actions" + +**Quick Fix:** +```bash +# On cluster VMs via Ansible +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini \ + -m shell -a "pcs resource cleanup etcd" -b +``` + +### Scenario: Fencing Failures + +**Symptoms:** Node shows UNCLEAN, fencing failed errors + +**Investigation:** +```bash +# Check stonith status +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini \ + -m shell -a "pcs stonith status" -b + +# Test fence agent manually +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini \ + -m shell -a "fence_redfish -a -l -p -o status" -b +``` + +## Remediation Tools + +### pcs resource cleanup + +Clears failed resource states and retries operations: + +```bash +sudo pcs resource cleanup etcd # All nodes +sudo pcs resource cleanup etcd # Specific node +``` + +**When to use:** +- After fixing underlying issues +- Resource shows as failed but root cause is resolved +- After manual CIB attribute changes + +### force-new-cluster Helper + +Automated cluster recovery playbook at `helpers/force-new-cluster.yml`: + +```bash +ansible-playbook helpers/force-new-cluster.yml \ + -i deploy/openshift-clusters/inventory.ini +``` + +**When to use:** +- Different etcd cluster IDs between nodes +- Etcd won't start on either node +- After ungraceful disruptions +- Manual recovery attempts failed + +**See TROUBLESHOOTING_SKILL.md** for detailed documentation. 
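+### Verifying a Recovery
+
+After either remediation above, confirm that both nodes actually converged before handing the cluster back to workloads. A minimal verification pass (a sketch, assuming the default inventory path and `cluster_vms` group used throughout this README):
+
+```bash
+INV=deploy/openshift-clusters/inventory.ini
+
+# Both nodes online and the etcd resource started
+ansible cluster_vms -i "$INV" -m shell -a "pcs status" -b
+
+# cluster_id should now match on both nodes
+ansible cluster_vms -i "$INV" -m shell -a "crm_attribute --query --name cluster_id" -b
+
+# Exactly two started members, no stale learners
+ansible cluster_vms -i "$INV" -m shell -a "podman exec etcd etcdctl member list -w table" -b
+
+# Etcd operator should report Available=True (proxy handled automatically)
+./.claude/commands/etcd/scripts/oc-wrapper.sh get co etcd
+```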
+ +## Reference Documentation + +### Etcd Operations + +Slash commands for detailed etcd information: +- `/etcd:etcd-ops-guide:clustering` - Cluster membership operations +- `/etcd:etcd-ops-guide:recovery` - Recovery procedures +- `/etcd:etcd-ops-guide:monitoring` - Monitoring and health checks +- `/etcd:etcd-ops-guide:failures` - Failure scenarios +- `/etcd:etcd-ops-guide:data_corruption` - Data corruption handling + +Or read files directly in `.claude/commands/etcd/etcd-ops-guide/` + +### Pacemaker Administration + +Documentation in `.claude/commands/etcd/pacemaker/Pacemaker_Administration/`: +- `troubleshooting.rst` - Pacemaker troubleshooting guide +- `tools.rst` - Command-line tools +- `agents.rst` - Resource agents +- `administrative.rst` - Administrative tasks + +## Development and Testing + +See [PROJECT.md](PROJECT.md) for: +- Implementation checklist +- Technical approach and architecture +- Testing scenarios +- Success criteria + +## Environment Variables + +**INVENTORY_PATH**: Override inventory location (default: `deploy/openshift-clusters/inventory.ini`) +```bash +INVENTORY_PATH=/custom/path/inventory.ini ./scripts/validate-cluster-access.sh +``` + +**PROXY_ENV_PATH**: Override proxy.env location (default: `deploy/openshift-clusters/proxy.env`) +```bash +PROXY_ENV_PATH=/custom/path/proxy.env ./scripts/oc-wrapper.sh get nodes +``` + +## Troubleshooting the Troubleshooter + +If the diagnostic scripts themselves fail: + +**Ansible connectivity issues:** +```bash +# Test basic connectivity +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m ping + +# Check inventory syntax +ansible-inventory -i deploy/openshift-clusters/inventory.ini --list +``` + +**oc access issues:** +```bash +# Test direct access +oc version + +# Test with proxy +source deploy/openshift-clusters/proxy.env && oc version + +# Verify KUBECONFIG +echo $KUBECONFIG +``` + +**Permission issues:** +```bash +# Ensure scripts are executable +chmod +x .claude/commands/etcd/scripts/*.sh +``` + +## Permission Configuration + +To speed up diagnostics, you can configure Claude Code to automatically approve read-only operations without prompting for permission. See [PERMISSIONS.md](PERMISSIONS.md) for: + +- Complete list of safe read-only commands that can be auto-approved +- Operations that always require user approval +- How to configure permissions in Claude Code +- Safety considerations and boundaries + +**Quick summary of auto-approved operations:** +- File reading: `cat`, `tail`, `head`, `grep`, `ls` +- Ansible read-only: `pcs status`, `podman ps`, `etcdctl` queries, `journalctl` +- OpenShift read-only: `oc get`, `oc describe`, `oc logs` +- Validation scripts (no state changes) + +**Always requires approval:** +- Ansible playbooks (including diagnostics collection) +- Pacemaker operations: `pcs resource cleanup`, restart, disable/enable +- Etcd operations: member add/remove, put/delete +- Force-new-cluster recovery +- Any system modifications + +## Contributing + +When adding new diagnostic capabilities: + +1. Update TROUBLESHOOTING_SKILL.md with new analysis patterns +2. Add collection steps to collect-diagnostics.yml if needed +3. Update decision tree and error patterns +4. Document new remediation tools +5. Add examples to this README +6. Update PROJECT.md checklist + +## License + +This is part of the two-node-toolbox project. See repository root for license information. 
diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md index 1bd7de0..29a96e5 100644 --- a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -126,6 +126,273 @@ Based on your analysis, provide: 4. **Verification Steps**: How to confirm the issue is resolved 5. **Prevention Recommendations**: How to avoid recurrence +## Analysis Guidelines + +### Component-Specific Analysis Functions + +#### Pacemaker Cluster Analysis + +**Key Indicators:** +- Quorum status: `pcs status` shows "quorum" or "no quorum" +- Node status: online, standby, offline, UNCLEAN +- Resource status: Started, Stopped, Failed, Master/Slave +- Failed actions count and descriptions + +**Analysis Questions:** +1. Do both nodes show as online in the cluster? +2. Is quorum achieved? (Should show quorum with 2 nodes) +3. Are there any failed actions for the etcd resource? +4. Is stonith enabled and configured correctly? +5. Are there any location/order/colocation constraints preventing etcd from starting? + +**Common Issues:** +- **No quorum**: One node is offline or network partition - check corosync logs +- **Failed actions**: Resource failed to start - examine failure reason and run `pcs resource cleanup` +- **UNCLEAN node**: Fencing failed - check fence agent configuration and BMC access + +#### Etcd Container Analysis + +**Key Indicators:** +- Container state: running, stopped, exited +- Exit code if stopped (0 = clean, non-zero = error) +- Container restart count +- Last log messages from container + +**Analysis Questions:** +1. Is the etcd container running on both nodes? +2. If stopped, what was the exit code? +3. What do the last 20 lines of container logs show? +4. Are there certificate errors in the logs? +5. Are there network connectivity errors? + +**Common Issues:** +- **Container not found**: Pacemaker hasn't created it yet or resource is stopped +- **Exit code 1**: Check logs for specific error (certs, permissions, corruption) +- **Repeated restarts**: Likely configuration or persistent error - check logs + +#### Etcd Cluster Health Analysis + +**Key Indicators:** +- Member list: number of members, their status (started/unstarted) +- Endpoint health: healthy/unhealthy, latency +- Endpoint status: leader election, raft index, DB size +- Cluster ID consistency across nodes + +**Analysis Questions:** +1. How many members are in the member list? +2. Are all members started? +3. Is there a leader elected? +4. Do both nodes show the same cluster ID in CIB attributes? +5. Are raft indices progressing or stuck? + +**Common Issues:** +- **Different cluster IDs**: Nodes are in different etcd clusters - need force-new-cluster +- **No leader**: Split-brain or quorum loss - check network and member list +- **Unstarted member**: Node hasn't joined yet or failed to join - check logs +- **3+ members**: Unexpected member entries from previous configs - need cleanup + +#### CIB Attributes Analysis + +**Key Indicators:** +- standalone_node: which node (if any) is running alone +- learner_node: which node (if any) is rejoining +- force_new_cluster: which node should bootstrap new cluster +- cluster_id: must match between nodes in healthy state +- member_id: etcd member ID for each node + +**Analysis Questions:** +1. Are there conflicting attributes set (e.g., both standalone and learner)? +2. Do cluster_id values match between both nodes? +3. Is force_new_cluster set when it shouldn't be? +4. 
Are learner/standalone attributes stuck from previous operations? + +**Common Issues:** +- **Stuck learner_node**: Previous rejoin didn't complete - may need manual cleanup +- **Mismatched cluster_id**: Nodes diverged - need force-new-cluster recovery +- **Stale force_new_cluster**: Attribute survived reboot when it shouldn't - manual cleanup needed + +#### System Logs Analysis + +**Key Patterns to Search:** + +**Pacemaker Logs:** +- "Failed" - resource failures +- "fencing" - stonith operations +- "could not" - operation failures +- "timeout" - timing issues +- "certificate" - cert problems + +**Corosync Logs:** +- "quorum" - quorum changes +- "lost" - connection losses +- "join" - membership changes +- "totem" - ring protocol issues + +**Etcd Logs:** +- "panic" - fatal errors +- "error" - general errors +- "certificate" - cert issues +- "member" - membership changes +- "leader" - leadership changes +- "database space exceeded" - quota issues +- "mvcc: database space exceeded" - DB full + +### Troubleshooting Decision Tree + +Use this decision tree to systematically diagnose issues: + +``` +START: Etcd not working as expected +│ +├─> Can you access cluster VMs via Ansible? +│ ├─ NO → Fix Ansible connectivity first (check inventory, SSH keys, ProxyJump) +│ └─ YES → Continue +│ +├─> Is Pacemaker running on both nodes? (systemctl status pacemaker) +│ ├─ NO → Start Pacemaker: systemctl start pacemaker +│ └─ YES → Continue +│ +├─> Do both nodes show as online in pcs status? +│ ├─ NO → Check which node is offline +│ │ ├─ Node shows UNCLEAN → Fencing failed +│ │ │ └─ ACTION: Check stonith status, fence agent config, BMC access +│ │ └─ Node shows offline → Network or Pacemaker issue +│ │ └─ ACTION: Check corosync logs, network connectivity +│ └─ YES → Continue +│ +├─> Does cluster have quorum? (pcs status shows "quorum") +│ ├─ NO → Investigate corosync/quorum issues +│ │ └─ ACTION: Check corosync logs for membership changes +│ └─ YES → Continue +│ +├─> Is etcd resource started? (pcs resource status) +│ ├─ NO → Check for failed actions +│ │ ├─ Failed actions present → Resource failed to start +│ │ │ └─ ACTION: Check failure reason, fix root cause, run pcs resource cleanup +│ │ └─ No failed actions → Check constraints +│ │ └─ ACTION: Review pcs constraint list, check node attributes +│ └─ YES → Continue +│ +├─> Is etcd container running on expected nodes? (podman ps) +│ ├─ NO → Container not started or crashed +│ │ └─ ACTION: Check podman logs for errors (certs, corruption, config) +│ └─ YES → Continue +│ +├─> Check cluster IDs in CIB attributes on both nodes +│ ├─ DIFFERENT → Nodes are in separate etcd clusters! 
+│ │ └─ ACTION: Use force-new-cluster helper to recover +│ └─ SAME → Continue +│ +├─> Check etcd member list (podman exec etcd etcdctl member list) +│ ├─ Lists unexpected members (>2 members) → Stale members from previous config +│ │ └─ ACTION: Remove stale members with etcdctl member remove +│ ├─ Shows "unstarted" members → Node hasn't joined yet +│ │ └─ ACTION: Check logs on unstarted node, may need cleanup and rejoin +│ └─ Lists 2 members, both started → Continue +│ +├─> Check etcd endpoint health (podman exec etcd etcdctl endpoint health) +│ ├─ Unhealthy → Network or performance issues +│ │ └─ ACTION: Check network latency, system load, disk I/O +│ └─ Healthy → Continue +│ +├─> Check etcd endpoint status (podman exec etcd etcdctl endpoint status) +│ ├─ No leader → Leadership election failing +│ │ └─ ACTION: Check logs for raft errors, verify member communication +│ ├─ Leader elected but errors in logs → Operational issues +│ │ └─ ACTION: Investigate specific errors (disk full, corruption, etc.) +│ └─ Leader elected, no errors → Cluster appears healthy +│ +└─> If still experiencing issues → Check OpenShift integration + ├─ Etcd operator degraded? (oc get co etcd) + │ └─ ACTION: Review operator conditions, check for cert rotation, API issues + └─ Check for related operator degradation (oc get co) + └─ ACTION: Review degraded operators, may indicate cluster-wide issues +``` + +### Error Pattern Matching Guidelines + +When analyzing logs and status output, look for these common patterns: + +#### Certificate Issues +**Symptoms:** +- "certificate has expired" in logs +- "x509: certificate" errors +- etcd container exits immediately +- TLS handshake failures + +**Diagnosis:** +```bash +# Check cert expiration on nodes +sudo podman exec etcd ls -la /etc/kubernetes/static-pod-resources/etcd-certs/ +# Look at recent cert-related log messages +sudo journalctl --grep certificate --since "2 hours ago" +``` + +**Resolution:** +- Wait for automatic cert rotation (if in progress) +- Verify etcd operator is healthy and can rotate certs +- Check machine config pool status for cert updates + +#### Split-Brain / Cluster ID Mismatch +**Symptoms:** +- Different cluster_id in CIB attributes between nodes +- Nodes can't join each other's cluster +- "cluster ID mismatch" in logs +- Etcd won't start on one or both nodes + +**Diagnosis:** +```bash +# Compare cluster IDs +ansible cluster_vms -i inventory.ini -m shell \ + -a "crm_attribute --query --name cluster_id" -b +``` + +**Resolution:** +- Use force-new-cluster helper playbook +- Designate one node as leader (first in inventory) +- Follower will resync from leader + +#### Resource Failures / Failed Actions +**Symptoms:** +- pcs status shows "Failed Resource Actions" +- Resource shows as "Stopped" but should be running +- Migration failures + +**Diagnosis:** +```bash +# Check detailed failure info +sudo pcs resource status --full +sudo pcs resource failcount show etcd +``` + +**Resolution:** +1. Identify and fix root cause (see logs) +2. Run: `sudo pcs resource cleanup etcd` +3. 
Verify resource starts successfully + +#### Fencing Failures +**Symptoms:** +- Node shows as "UNCLEAN" in pcs status +- "fencing failed" in logs +- Stonith errors +- Cluster can't recover from node failure + +**Diagnosis:** +```bash +# Check stonith status and configuration +sudo pcs stonith status +sudo pcs stonith show +# Check fence agent logs +sudo journalctl -u pacemaker --grep fence --since "1 hour ago" +``` + +**Resolution:** +- Verify BMC/RedFish access from both nodes +- Check fence agent credentials +- Ensure network connectivity to BMC interfaces +- Review stonith timeout settings +- Test fence agent manually: `sudo fence_redfish -a -l -p -o status` + ## Key Context ### Cluster States @@ -153,6 +420,56 @@ Based on your analysis, provide: - New cluster ID is assigned - Failed node discards old DB and resyncs on restart +## Available Remediation Tools + +### Pacemaker Resource Cleanup +Use `pcs resource cleanup` to clear failed resource states and retry operations: + +```bash +# Clean up etcd resource on specific node +sudo pcs resource cleanup etcd + +# Clean up etcd resource on all nodes +sudo pcs resource cleanup etcd +``` + +**When to use:** +- After fixing underlying issues (certificates, network, etc.) +- When resource shows as failed but root cause is resolved +- To retry resource start after transient failures +- After manual CIB attribute changes + +### Force New Cluster Helper +Ansible playbook at `helpers/force-new-cluster.yml` automates cluster recovery when both nodes have stopped etcd or cluster IDs are mismatched. + +**What it does:** +1. Disables stonith temporarily for safety +2. Takes etcd snapshots on both nodes (if etcd not running) +3. Clears conflicting CIB attributes (learner_node, standalone_node) +4. Sets force_new_cluster attribute on leader node (first in inventory) +5. Removes follower from etcd member list (if etcd running on leader) +6. Runs `pcs resource cleanup etcd` on both nodes +7. Re-enables stonith +8. 
Verifies recovery + +**Usage:** +```bash +ansible-playbook helpers/force-new-cluster.yml -i deploy/openshift-clusters/inventory.ini +``` + +**When to use:** +- Both nodes show different etcd cluster IDs +- Etcd is not running on either node and won't start +- After ungraceful disruptions that left cluster in inconsistent state +- Manual recovery attempts have failed +- Need to bootstrap from one node as new cluster + +**Precautions:** +- Only use when normal recovery procedures fail +- Ensure follower node can afford to lose its etcd data +- Leader (first node in inventory) will become the source of truth +- This creates a NEW cluster, follower will resync from leader + ## Reference Documentation You have access to these slash commands for detailed information: From 98818bde7b01c7eb1ed67df83ccd0d668ca7a556 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 28 Oct 2025 19:14:26 +0100 Subject: [PATCH 08/19] Update gitignore to ignore create rpm files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cb85e8e..bc3e58c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ deploy/openshift-clusters/roles/install-dev/files/pull-secret.json deploy/openshift-clusters/vars/kcli.yml deploy/openshift-clusters/inventory.ini.* logs/* +helpers/*.rpm From ea5fa2db44964f7cbb478e842db83b2ff4f61d9e Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Wed, 29 Oct 2025 13:05:48 +0100 Subject: [PATCH 09/19] Update to add small blurb to README --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aec24f1..5c6cda3 100644 --- a/README.md +++ b/README.md @@ -60,4 +60,8 @@ See [deploy/openshift-clusters/README-external-host.md](deploy/openshift-cluster **Two-Node with Arbiter (TNA)**: Two master nodes with a separate arbiter node for quorum. See [docs/arbiter/README.md](docs/arbiter/README.md) -**Two-Node with Fencing (TNF)**: Two master nodes with BMC-based fencing for automated node recovery. See [docs/fencing/README.md](docs/fencing/README.md) \ No newline at end of file +**Two-Node with Fencing (TNF)**: Two master nodes with BMC-based fencing for automated node recovery. See [docs/fencing/README.md](docs/fencing/README.md) + +## Troubleshooting with Claude Code + +If you're using [Claude Code](https://claude.ai/code), it can help you troubleshoot etcd issues on two-node fencing clusters. Simply ask Claude to diagnose your etcd problems and it will automatically collect diagnostics, analyze the cluster state, and recommend remediation steps. See [.claude/commands/etcd/README.md](.claude/commands/etcd/README.md) for details. 
\ No newline at end of file From 9994f8e225e1a9b4852b440ccf57e69ba5f6f9ca Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Wed, 29 Oct 2025 13:58:26 +0100 Subject: [PATCH 10/19] Remove gitsubmodule --- website | 1 - 1 file changed, 1 deletion(-) delete mode 160000 website diff --git a/website b/website deleted file mode 160000 index a022204..0000000 --- a/website +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a022204f20cd90d892ed9a268e4929b491a5cad6 From 95dafe9d7dceae0ee2c6a0b3f804957c85931b21 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Wed, 29 Oct 2025 14:24:24 +0100 Subject: [PATCH 11/19] Update podman-etcd filename to avoid shell check --- .claude/commands/etcd/PROJECT.md | 4 ++-- .claude/commands/etcd/README.md | 2 +- .../etcd/pacemaker/{podman-etcd.sh => podman-etcd.txt} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename .claude/commands/etcd/pacemaker/{podman-etcd.sh => podman-etcd.txt} (100%) diff --git a/.claude/commands/etcd/PROJECT.md b/.claude/commands/etcd/PROJECT.md index 4fce476..9ab2e10 100644 --- a/.claude/commands/etcd/PROJECT.md +++ b/.claude/commands/etcd/PROJECT.md @@ -47,7 +47,7 @@ This project is for developing a Claude Code skill that helps troubleshoot etcd - `runtime-reconf-design.md` - Reconfiguration design patterns 3. **Pacemaker Documentation**: `.claude/commands/etcd/pacemaker/` - - `podman-etcd.sh` - The resource agent managing etcd containers + - `podman-etcd.txt` - The resource agent script (reference only, not executable) - `Pacemaker_Administration/` - Comprehensive Pacemaker administration docs - `administrative.rst` - Administrative tasks - `agents.rst` - Resource agents overview @@ -227,7 +227,7 @@ Based on analysis, provide: - `revision` - Etcd raft index ### Pacemaker Resource Agent -The `podman-etcd.sh` agent manages: +The `podman-etcd.txt` resource agent (reference) manages: - Container lifecycle (start/stop) - Member join/leave operations - Certificate rotation monitoring diff --git a/.claude/commands/etcd/README.md b/.claude/commands/etcd/README.md index e3c83e1..9f60775 100644 --- a/.claude/commands/etcd/README.md +++ b/.claude/commands/etcd/README.md @@ -32,7 +32,7 @@ The etcd troubleshooting skill enables Claude to interactively diagnose and reso │ ├── failures.md │ └── ... 
(other etcd docs) └── pacemaker/ # Pacemaker documentation - ├── podman-etcd.sh # The resource agent script + ├── podman-etcd.txt # The resource agent script (reference) └── Pacemaker_Administration/ # Pacemaker admin guides ``` diff --git a/.claude/commands/etcd/pacemaker/podman-etcd.sh b/.claude/commands/etcd/pacemaker/podman-etcd.txt similarity index 100% rename from .claude/commands/etcd/pacemaker/podman-etcd.sh rename to .claude/commands/etcd/pacemaker/podman-etcd.txt From cb5b1b0dac61334df55829610bc843ee7e2b38c6 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Mon, 3 Nov 2025 12:01:35 +0100 Subject: [PATCH 12/19] Add quick reference --- .claude/commands/etcd/QUICK_REFERENCE.md | 397 +++++++++++++++++++++++ .claude/commands/etcd/README.md | 59 +++- 2 files changed, 440 insertions(+), 16 deletions(-) create mode 100644 .claude/commands/etcd/QUICK_REFERENCE.md diff --git a/.claude/commands/etcd/QUICK_REFERENCE.md b/.claude/commands/etcd/QUICK_REFERENCE.md new file mode 100644 index 0000000..0198463 --- /dev/null +++ b/.claude/commands/etcd/QUICK_REFERENCE.md @@ -0,0 +1,397 @@ +# Etcd TNF Quick Reference Guide + +**Fast troubleshooting for common etcd issues on Two-Node with Fencing clusters** + +Use this guide for quick diagnosis and remediation. For detailed analysis, refer to [TROUBLESHOOTING_SKILL.md](TROUBLESHOOTING_SKILL.md). + +--- + +## Quick Diagnostics + +**Collect all diagnostics automatically:** +```bash +source .claude/commands/etcd/scripts/collect-all-diagnostics.sh +``` + +**Check cluster health quickly:** +```bash +# Pacemaker status +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b + +# Etcd member list +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo podman exec etcd etcdctl member list -w table" -b + +# OpenShift etcd operator +oc get co etcd -o yaml | grep -A10 "status:" +``` + +--- + +## Common Issues + +### 1. Etcd Start Failure: "No such device or address" + +**Symptoms:** +- `pcs status` shows: `etcd start on returned 'error'` +- Pacemaker logs show: `crm_attribute: Error performing operation: No such device or address` +- Member list shows member as "unstarted" with `IS_LEARNER: true` + +**Root Cause:** +Stale etcd data directory with mismatched member ID. The node is trying to rejoin with old credentials that don't match the current cluster configuration. + +**Diagnosis:** +```bash +# Check for member ID mismatch in logs +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo journalctl -u pacemaker --since '1 hour ago' | grep -i 'member.*id'" + +# Check member list from working node +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl member list -w table" +``` + +**Fix:** +```bash +# Clean stale etcd data on failed node +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo rm -rf /var/lib/etcd/*" -b + +# Cleanup Pacemaker failure state +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource cleanup etcd" -b + +# Monitor recovery +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs status" -b +``` + +**Expected Outcome:** +- Etcd container starts on failed node +- Member joins as learner and gets promoted to voting member +- `oc get co etcd` shows Available=True within 5-10 minutes + +--- + +### 2. 
Split-Brain: "master-X must force a new cluster" + +**Symptoms:** +- `pcs status` shows: `etcd monitor returned 'error' (master-X must force a new cluster)` +- Both nodes have etcd running but with different cluster IDs +- CIB attributes show different `cluster_id` values + +**Root Cause:** +Network partition or simultaneous failures caused both nodes to start independent etcd clusters. + +**Diagnosis:** +```bash +# Check cluster IDs on both nodes +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo crm_attribute -G -n cluster_id" -b + +# Check which node is standalone +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo crm_attribute -G -n standalone_node" -b +``` + +**Fix:** +```bash +# Identify the node with more recent data (higher revision) +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl endpoint status -w table" -b + +# On the node with LESS data, clean etcd +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource ban etcd && \ + sudo rm -rf /var/lib/etcd/* && \ + sudo pcs resource clear etcd" -b + +# Clear the force_new_cluster flag from CIB +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo crm_attribute -D -n force_new_cluster" -b + +# Cleanup and let Pacemaker recover +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource cleanup etcd" -b +``` + +**Expected Outcome:** +- One node becomes standalone, other joins as learner +- Cluster IDs match after recovery +- Both nodes show "started" in member list + +--- + +### 3. Quorum Loss: "no quorum" + +**Symptoms:** +- `pcs status` shows: "partition WITHOUT quorum" +- Etcd resources stopped or failed +- One or both nodes may be offline + +**Root Cause:** +Corosync cluster lost quorum (needs 2 nodes, has <2). + +**Diagnosis:** +```bash +# Check which nodes are online +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs status | grep -A5 'Node List'" -b + +# Check corosync membership +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo corosync-cmapctl | grep members" -b +``` + +**Fix:** + +**If one node is offline:** +```bash +# Restart Pacemaker/Corosync on offline node +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs cluster start" -b + +# Wait for quorum to be established +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs status" -b +``` + +**If both nodes online but no quorum (network issue):** +```bash +# Check firewall/network connectivity between nodes +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo firewall-cmd --list-all" -b + +# Restart corosync cluster-wide +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs cluster stop --all && sudo pcs cluster start --all" -b +``` + +**Expected Outcome:** +- Both nodes show as "Online" in pcs status +- Quorum achieved +- Resources start automatically + +--- + +### 4. Learner Stuck: Member won't promote + +**Symptoms:** +- Member list shows learner with `STATUS: started, IS_LEARNER: true` for >10 minutes +- OpenShift etcd operator shows "member is a learner, waiting for promotion" +- No errors in logs, just stuck waiting + +**Root Cause:** +OpenShift etcd operator learner promotion workflow stalled or conditions not met. 
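+
+A common reason the promotion appears stuck is that the learner simply has not caught
+up with the leader's raft log yet; etcd will typically refuse to promote a learner that
+is still syncing. A quick, hedged check (run on a cluster VM, assuming the same `etcd`
+container and `etcdctl` setup used by the other commands in this guide) is to compare
+the RAFT INDEX column across members:
+
+```bash
+# If the learner's raft index trails the leader's by a large margin, promotion is
+# waiting on replication rather than being stuck, and forcing it would fail anyway.
+sudo podman exec etcd etcdctl endpoint status --cluster -w table
+```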
+ +**Diagnosis:** +```bash +# Check learner status +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl member list -w table" -b + +# Check etcd operator conditions +oc get co etcd -o yaml | grep -A20 "conditions:" + +# Check for revision controller errors +oc logs -n openshift-etcd-operator deployment/etcd-operator | tail -50 +``` + +**Fix:** +```bash +# Manually promote learner (only if stuck >15 minutes) +# First, get the learner member ID +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl member list -w table | grep true" -b + +# Promote the learner (replace MEMBER_ID) +ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl member promote " -b + +# Restart etcd operator to reset state machine +oc delete pod -n openshift-etcd-operator -l name=etcd-operator +``` + +**Expected Outcome:** +- Member shows `IS_LEARNER: false` in member list +- Etcd operator shows "2 of 2 members are available" +- Cluster becomes Available + +--- + +### 5. Certificate Issues + +**Symptoms:** +- Etcd logs show: "tls: bad certificate" or "certificate has expired" +- Etcd container fails to start with cert validation errors +- Pacemaker shows etcd start failures + +**Root Cause:** +Expired or incorrect TLS certificates for etcd communication. + +**Diagnosis:** +```bash +# Check certificate expiration +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo openssl x509 -in /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-$(hostname).crt -noout -dates" -b + +# Check for cert errors in logs +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman logs etcd 2>&1 | grep -i 'certificate\|tls'" -b +``` + +**Fix:** +```bash +# Force certificate regeneration via machine config +oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "cert-refresh-$(date +%s)"}}' --type=merge + +# Or manually trigger cert rotation +oc delete secret -n openshift-etcd etcd-all-certs +oc delete pod -n openshift-etcd-operator -l name=etcd-operator + +# Wait for operator to regenerate certs and restart etcd +oc get pods -n openshift-etcd -w +``` + +**Expected Outcome:** +- New certificates generated +- Etcd pods restart with valid certs +- No more TLS errors in logs + +--- + +### 6. Pacemaker Resource Ban + +**Symptoms:** +- `pcs status` shows: `etcd Stopped` on one or both nodes +- `pcs constraint list` shows location constraints preventing start +- Resource cleanup doesn't fix it + +**Root Cause:** +Resource was manually banned or reached failure threshold causing automatic ban. 
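+
+Knowing which kind of ban you are looking at helps pick the right fix: constraints whose
+ID begins with `cli-ban-` come from a manual `pcs resource ban`, while bans added by
+Pacemaker itself usually follow a failed start (with the default
+`start-failure-is-fatal=true`) or a failcount that reached the resource's
+migration-threshold. A quick, hedged check on a cluster VM:
+
+```bash
+# A "cli-ban-*" constraint means someone banned etcd manually and it must be cleared;
+# no such constraint but a high failcount points at failure handling, which cleanup resets.
+sudo pcs constraint list --full | grep -i 'cli-ban'
+sudo pcs resource failcount show etcd
+```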
+ +**Diagnosis:** +```bash +# Check for location constraints (bans) +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs constraint list --full" -b + +# Check failure count +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource failcount show etcd" -b +``` + +**Fix:** +```bash +# Remove all location constraints for etcd +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs constraint list --full | grep 'location.*etcd' | cut -d' ' -f1 | xargs -I {} sudo pcs constraint remove {}" -b + +# Or clear specific node ban +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource clear etcd" -b + +# Reset failure counts +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs resource cleanup etcd" -b +``` + +**Expected Outcome:** +- No location constraints shown +- Etcd starts on appropriate node(s) +- Failure counts reset to 0 + +--- + +### 7. Stonith/Fencing Failures + +**Symptoms:** +- `pcs status` shows: "UNCLEAN" node status +- Logs show: "fence_redfish failed" or stonith timeout +- Resources won't start due to unclean node + +**Root Cause:** +Fencing agent can't reach BMC or authentication failure. + +**Diagnosis:** +```bash +# Check stonith configuration +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs stonith config" -b + +# Test fencing manually +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs stonith fence " -b + +# Check redfish connectivity +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "curl -k -u : https:///redfish/v1/Systems" -b +``` + +**Fix:** +```bash +# Update stonith credentials if needed +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs stonith update _redfish password=" -b + +# Confirm unclean node (if safe - node is really down) +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs stonith confirm " -b + +# Restart cluster after fencing fix +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo pcs cluster stop --all && sudo pcs cluster start --all" -b +``` + +**Expected Outcome:** +- Fencing test succeeds +- No UNCLEAN nodes +- Resources start normally + +--- + +## Quick Verification Checklist + +After any fix, verify: + +```bash +# 1. Pacemaker cluster healthy +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b +# Expected: Both nodes Online, quorum achieved, no failed actions + +# 2. Etcd members healthy +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl endpoint health -w table" -b +# Expected: All endpoints healthy + +# 3. Etcd member list correct +ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ + "sudo podman exec etcd etcdctl member list -w table" -b +# Expected: 2 members, both started, both IS_LEARNER=false + +# 4. OpenShift etcd operator healthy +oc get co etcd +# Expected: Available=True, Progressing=False, Degraded=False + +# 5. 
No degraded operators +oc get co --no-headers | grep -v "True.*False.*False" +# Expected: Empty output (all operators healthy) +``` + +--- + +## When to Escalate + +Use the full [TROUBLESHOOTING_SKILL.md](TROUBLESHOOTING_SKILL.md) methodology when: + +- Issue doesn't match any pattern above +- Fix attempts don't resolve the problem after 2-3 iterations +- Data corruption is suspected +- Multiple components are failing simultaneously +- Need to understand deeper architectural details + +## Additional Resources + +- **Detailed troubleshooting**: [TROUBLESHOOTING_SKILL.md](TROUBLESHOOTING_SKILL.md) +- **Etcd operations**: Slash commands like `/etcd:etcd-ops-guide:recovery` +- **Pacemaker administration**: [pacemaker/Pacemaker_Administration/](pacemaker/Pacemaker_Administration/) +- **Diagnostic collection**: [scripts/collect-all-diagnostics.sh](scripts/collect-all-diagnostics.sh) diff --git a/.claude/commands/etcd/README.md b/.claude/commands/etcd/README.md index 9f60775..966b8d9 100644 --- a/.claude/commands/etcd/README.md +++ b/.claude/commands/etcd/README.md @@ -17,7 +17,8 @@ The etcd troubleshooting skill enables Claude to interactively diagnose and reso .claude/commands/etcd/ ├── README.md # This file ├── PROJECT.md # Project specification and checklist -├── TROUBLESHOOTING_SKILL.md # Skill definition and guidelines +├── QUICK_REFERENCE.md # Fast troubleshooting guide (START HERE) +├── TROUBLESHOOTING_SKILL.md # Detailed skill definition and guidelines ├── playbooks/ # Ansible playbooks │ ├── validate-access.yml # Validate Ansible connectivity │ └── collect-diagnostics.yml # Collect VM-level diagnostics @@ -38,9 +39,27 @@ The etcd troubleshooting skill enables Claude to interactively diagnose and reso ## Quick Start +### For Fast Troubleshooting + +**Start with [QUICK_REFERENCE.md](QUICK_REFERENCE.md)** for common issues and immediate fixes. + +The quick reference covers: +- Common failure patterns with instant fixes +- One-command diagnostics +- Step-by-step remediation for 7 most frequent issues +- Quick verification checklist + +### For Complex Issues + +Use the detailed [TROUBLESHOOTING_SKILL.md](TROUBLESHOOTING_SKILL.md) when: +- Issue doesn't match common patterns +- Multiple components are failing +- Need deeper architectural understanding +- Automated fixes don't resolve the problem + ### Activating the Skill -In Claude Code, the etcd troubleshooting skill is activated by reading the TROUBLESHOOTING_SKILL.md file or referencing it in your request: +In Claude Code, reference the troubleshooting skill in your request: ``` "Help me troubleshoot etcd issues on my two-node cluster. Use the etcd troubleshooting skill." @@ -250,24 +269,32 @@ ansible-playbook helpers/force-new-cluster.yml \ ## Reference Documentation -### Etcd Operations +### Troubleshooting Guides (by detail level) + +1. **[QUICK_REFERENCE.md](QUICK_REFERENCE.md)** - Start here for common issues + - 7 most frequent failure patterns with fixes + - Quick diagnostics commands + - Fast verification checklist -Slash commands for detailed etcd information: -- `/etcd:etcd-ops-guide:clustering` - Cluster membership operations -- `/etcd:etcd-ops-guide:recovery` - Recovery procedures -- `/etcd:etcd-ops-guide:monitoring` - Monitoring and health checks -- `/etcd:etcd-ops-guide:failures` - Failure scenarios -- `/etcd:etcd-ops-guide:data_corruption` - Data corruption handling +2. 
**[TROUBLESHOOTING_SKILL.md](TROUBLESHOOTING_SKILL.md)** - Detailed methodology + - Systematic analysis frameworks + - Component-specific diagnosis + - Decision trees and error patterns -Or read files directly in `.claude/commands/etcd/etcd-ops-guide/` +3. **Etcd Operations** - Deep reference via slash commands: + - `/etcd:etcd-ops-guide:clustering` - Cluster membership operations + - `/etcd:etcd-ops-guide:recovery` - Recovery procedures + - `/etcd:etcd-ops-guide:monitoring` - Monitoring and health checks + - `/etcd:etcd-ops-guide:failures` - Failure scenarios + - `/etcd:etcd-ops-guide:data_corruption` - Data corruption handling -### Pacemaker Administration + Or read files directly in `.claude/commands/etcd/etcd-ops-guide/` -Documentation in `.claude/commands/etcd/pacemaker/Pacemaker_Administration/`: -- `troubleshooting.rst` - Pacemaker troubleshooting guide -- `tools.rst` - Command-line tools -- `agents.rst` - Resource agents -- `administrative.rst` - Administrative tasks +4. **Pacemaker Administration** - Deep reference in `.claude/commands/etcd/pacemaker/Pacemaker_Administration/`: + - `troubleshooting.rst` - Pacemaker troubleshooting guide + - `tools.rst` - Command-line tools + - `agents.rst` - Resource agents + - `administrative.rst` - Administrative tasks ## Development and Testing From fa3fe6bbe3095b6d55a0c9ae2f61ea6e8cd0093c Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 4 Nov 2025 10:33:47 +0100 Subject: [PATCH 13/19] Make cluster access optional --- .../commands/etcd/TROUBLESHOOTING_SKILL.md | 18 +++++++++++++ .../etcd/scripts/validate-cluster-access.sh | 25 ++++++++++--------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md index 29a96e5..823c9b2 100644 --- a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -28,6 +28,24 @@ This skill enables Claude to: - Verify cluster access with `oc get nodes` - Remember proxy requirement for all subsequent oc commands +**IMPORTANT: No Cluster Access Scenario** + +If OpenShift cluster API access is unavailable (which is expected when etcd is down), **all diagnostics and remediation must be performed via Ansible** using direct VM access. The troubleshooting workflow remains fully functional using only: + +- Ansible ad-hoc commands to cluster VMs +- Ansible playbooks for diagnostics collection +- Direct SSH access to nodes via Ansible + +When cluster access is unavailable: +- ✓ You can still diagnose and fix etcd issues completely +- ✓ All Pacemaker operations work via Ansible +- ✓ All etcd container operations work via Ansible (podman commands) +- ✓ All logs are accessible via Ansible (journalctl commands) +- ✗ Cannot query OpenShift operators or cluster-level resources +- ✗ Cannot use oc commands for verification (use Ansible equivalents instead) + +This is a **normal scenario** when etcd is down - proceed with VM-based troubleshooting. + ### 2. Collect Data Use Ansible to execute commands on cluster VMs (all commands require sudo/become): diff --git a/.claude/commands/etcd/scripts/validate-cluster-access.sh b/.claude/commands/etcd/scripts/validate-cluster-access.sh index 9b88db0..8ccac1f 100755 --- a/.claude/commands/etcd/scripts/validate-cluster-access.sh +++ b/.claude/commands/etcd/scripts/validate-cluster-access.sh @@ -61,8 +61,8 @@ section "Validating OpenShift Cluster Access" # Check if oc is available if ! 
command -v oc &> /dev/null; then - error "oc command not found in PATH" - EXIT_CODE=1 + warn "oc command not found in PATH" + warn "Cluster API access will not be available (this is OK if etcd is down)" else info "oc command found" @@ -70,6 +70,7 @@ else if oc version --request-timeout=5s &>/dev/null; then info "Direct cluster access successful" PROXY_REQUIRED=false + CLUSTER_ACCESS=true else warn "Direct cluster access failed" @@ -84,35 +85,35 @@ else if oc version --request-timeout=5s &>/dev/null; then info "Cluster access via proxy successful" PROXY_REQUIRED=true + CLUSTER_ACCESS=true else - error "Cluster access failed even with proxy" - EXIT_CODE=1 + warn "Cluster access failed even with proxy" + warn "This is expected if etcd is down - will rely on direct VM access" + CLUSTER_ACCESS=false fi else - error "No proxy.env found at: ${PROXY_ENV_PATH}" - error "Cluster access unavailable" - EXIT_CODE=1 + warn "No proxy.env found at: ${PROXY_ENV_PATH}" + warn "Cluster access unavailable - this is OK if etcd is down" + CLUSTER_ACCESS=false fi fi # If we have cluster access, test basic operations - if [ ${EXIT_CODE} -eq 0 ]; then + if [ "${CLUSTER_ACCESS:-false}" = "true" ]; then section "Testing OpenShift Cluster Operations" if oc get nodes &>/dev/null; then info "Successfully queried cluster nodes" oc get nodes -o wide | sed 's/^/ /' else - error "Failed to query cluster nodes" - EXIT_CODE=1 + warn "Failed to query cluster nodes (may be expected if etcd is down)" fi if oc get co etcd &>/dev/null; then info "Successfully queried etcd cluster operator" oc get co etcd | sed 's/^/ /' else - error "Failed to query etcd cluster operator" - EXIT_CODE=1 + warn "Failed to query etcd cluster operator (may be expected if etcd is down)" fi fi fi From 14a115fdca094e990a1b79127a077bcf20d629c9 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 4 Nov 2025 10:37:30 +0100 Subject: [PATCH 14/19] Update to make sure only cluster_vms are targeted with pcs commands --- .../commands/etcd/TROUBLESHOOTING_SKILL.md | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md index 823c9b2..3c1979e 100644 --- a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -48,9 +48,23 @@ This is a **normal scenario** when etcd is down - proceed with VM-based troubles ### 2. 
Collect Data -Use Ansible to execute commands on cluster VMs (all commands require sudo/become): +**IMPORTANT: Target the Correct Host Group** -**Pacemaker Status:** +- **All etcd/Pacemaker commands** must target the `cluster_vms` host group (the OpenShift cluster nodes) +- **VM lifecycle commands** (start/stop VMs) target the hypervisor host +- Use Ansible ad-hoc commands with `-m shell` or run playbooks that target `cluster_vms` +- All commands on cluster VMs require sudo/become privileges + +**Example Ansible targeting:** +```bash +# Correct - targets cluster VMs +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a "pcs status" -b + +# Incorrect - would target hypervisor +ansible hypervisor -i deploy/openshift-clusters/inventory.ini -m shell -a "pcs status" -b +``` + +**Pacemaker Status (on cluster_vms):** ```bash sudo pcs status sudo pcs resource status @@ -58,28 +72,28 @@ sudo pcs constraint list sudo crm_mon -1 ``` -**Etcd Container Status:** +**Etcd Container Status (on cluster_vms):** ```bash sudo podman ps -a --filter name=etcd sudo podman inspect etcd sudo podman logs --tail 100 etcd ``` -**Etcd Cluster Health:** +**Etcd Cluster Health (on cluster_vms):** ```bash sudo podman exec etcd etcdctl member list -w table sudo podman exec etcd etcdctl endpoint health -w table sudo podman exec etcd etcdctl endpoint status -w table ``` -**System Logs:** +**System Logs (on cluster_vms):** ```bash sudo journalctl -u pacemaker --since "1 hour ago" -n 200 sudo journalctl -u corosync --since "1 hour ago" -n 100 sudo journalctl --grep etcd --since "1 hour ago" -n 200 ``` -**Cluster Attributes:** +**Cluster Attributes (on cluster_vms):** ```bash sudo crm_attribute --query --name standalone_node sudo crm_attribute --query --name learner_node From d7fdab97ec9aba6f38ab212a5aabeb3a02f97aaa Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 4 Nov 2025 10:40:46 +0100 Subject: [PATCH 15/19] Update files to use the right host group --- .claude/commands/etcd/QUICK_REFERENCE.md | 78 +++++++++++-------- .claude/commands/etcd/README.md | 24 ++++++ .../commands/etcd/TROUBLESHOOTING_SKILL.md | 7 ++ 3 files changed, 76 insertions(+), 33 deletions(-) diff --git a/.claude/commands/etcd/QUICK_REFERENCE.md b/.claude/commands/etcd/QUICK_REFERENCE.md index 0198463..df6cbb2 100644 --- a/.claude/commands/etcd/QUICK_REFERENCE.md +++ b/.claude/commands/etcd/QUICK_REFERENCE.md @@ -6,6 +6,18 @@ Use this guide for quick diagnosis and remediation. For detailed analysis, refer --- +## CRITICAL: Target the Correct Hosts + +**Always use `cluster_vms` host group for etcd/Pacemaker commands:** + +- ✓ **Correct:** `ansible cluster_vms -i inventory.ini -m shell -a "pcs status" -b` +- ✗ **Wrong:** `ansible all -i inventory.ini ...` (would include hypervisor) +- ✗ **Wrong:** `ansible hypervisor -i inventory.ini ...` (hypervisor has no etcd) + +The `hypervisor` is only for VM lifecycle (virsh/kcli). All etcd operations run on `cluster_vms`. 
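+
+If you are not sure how the inventory is laid out, listing the group structure before
+running anything is a cheap sanity check. This is a generic Ansible command, not
+specific to this repository, shown with the inventory path used elsewhere in this guide:
+
+```bash
+# Print host groups and their members; confirm both OpenShift nodes sit under
+# cluster_vms and the hypervisor is in its own group.
+ansible-inventory -i deploy/openshift-clusters/inventory.ini --graph
+```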
+ +--- + ## Quick Diagnostics **Collect all diagnostics automatically:** @@ -15,13 +27,13 @@ source .claude/commands/etcd/scripts/collect-all-diagnostics.sh **Check cluster health quickly:** ```bash -# Pacemaker status -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b +# Pacemaker status (on cluster VMs) +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b -# Etcd member list -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo podman exec etcd etcdctl member list -w table" -b +# Etcd member list (on cluster VMs) +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo podman exec etcd etcdctl member list -w table" -b -# OpenShift etcd operator +# OpenShift etcd operator (if cluster access available) oc get co etcd -o yaml | grep -A10 "status:" ``` @@ -61,7 +73,7 @@ ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs resource cleanup etcd" -b # Monitor recovery -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs status" -b ``` @@ -85,18 +97,18 @@ Network partition or simultaneous failures caused both nodes to start independen **Diagnosis:** ```bash # Check cluster IDs on both nodes -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo crm_attribute -G -n cluster_id" -b # Check which node is standalone -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo crm_attribute -G -n standalone_node" -b ``` **Fix:** ```bash # Identify the node with more recent data (higher revision) -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo podman exec etcd etcdctl endpoint status -w table" -b # On the node with LESS data, clean etcd @@ -106,11 +118,11 @@ ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ sudo pcs resource clear etcd" -b # Clear the force_new_cluster flag from CIB -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo crm_attribute -D -n force_new_cluster" -b # Cleanup and let Pacemaker recover -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs resource cleanup etcd" -b ``` @@ -134,11 +146,11 @@ Corosync cluster lost quorum (needs 2 nodes, has <2). 
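+
+It can also be worth asking Corosync directly whether it believes quorum exists; the
+summary output is easier to read than the raw cmap keys used below (a sketch, using the
+same inventory path as the other commands):
+
+```bash
+# "Quorate: Yes/No" plus expected vs. total votes shows at a glance whether this is a
+# membership problem or something further up the stack.
+ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \
+  "sudo corosync-quorumtool -s" -b
+```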
**Diagnosis:** ```bash # Check which nodes are online -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs status | grep -A5 'Node List'" -b # Check corosync membership -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo corosync-cmapctl | grep members" -b ``` @@ -151,18 +163,18 @@ ansible -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs cluster start" -b # Wait for quorum to be established -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs status" -b ``` **If both nodes online but no quorum (network issue):** ```bash # Check firewall/network connectivity between nodes -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo firewall-cmd --list-all" -b # Restart corosync cluster-wide -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs cluster stop --all && sudo pcs cluster start --all" -b ``` @@ -186,7 +198,7 @@ OpenShift etcd operator learner promotion workflow stalled or conditions not met **Diagnosis:** ```bash # Check learner status -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo podman exec etcd etcdctl member list -w table" -b # Check etcd operator conditions @@ -231,11 +243,11 @@ Expired or incorrect TLS certificates for etcd communication. **Diagnosis:** ```bash # Check certificate expiration -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo openssl x509 -in /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-$(hostname).crt -noout -dates" -b # Check for cert errors in logs -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo podman logs etcd 2>&1 | grep -i 'certificate\|tls'" -b ``` @@ -272,26 +284,26 @@ Resource was manually banned or reached failure threshold causing automatic ban. 
**Diagnosis:** ```bash # Check for location constraints (bans) -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs constraint list --full" -b # Check failure count -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs resource failcount show etcd" -b ``` **Fix:** ```bash # Remove all location constraints for etcd -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs constraint list --full | grep 'location.*etcd' | cut -d' ' -f1 | xargs -I {} sudo pcs constraint remove {}" -b # Or clear specific node ban -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs resource clear etcd" -b # Reset failure counts -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs resource cleanup etcd" -b ``` @@ -315,30 +327,30 @@ Fencing agent can't reach BMC or authentication failure. **Diagnosis:** ```bash # Check stonith configuration -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs stonith config" -b # Test fencing manually -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs stonith fence " -b # Check redfish connectivity -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "curl -k -u : https:///redfish/v1/Systems" -b ``` **Fix:** ```bash # Update stonith credentials if needed -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs stonith update _redfish password=" -b # Confirm unclean node (if safe - node is really down) -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs stonith confirm " -b # Restart cluster after fencing fix -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo pcs cluster stop --all && sudo pcs cluster start --all" -b ``` @@ -355,16 +367,16 @@ After any fix, verify: ```bash # 1. Pacemaker cluster healthy -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a "sudo pcs status" -b # Expected: Both nodes Online, quorum achieved, no failed actions # 2. Etcd members healthy -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo podman exec etcd etcdctl endpoint health -w table" -b # Expected: All endpoints healthy # 3. 
Etcd member list correct -ansible all -i deploy/openshift-clusters/inventory.ini -m shell -a \ +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo podman exec etcd etcdctl member list -w table" -b # Expected: 2 members, both started, both IS_LEARNER=false diff --git a/.claude/commands/etcd/README.md b/.claude/commands/etcd/README.md index 966b8d9..cea8f69 100644 --- a/.claude/commands/etcd/README.md +++ b/.claude/commands/etcd/README.md @@ -151,6 +151,30 @@ When you know the general area of the problem: ## Key Features +### Host Group Targeting + +**IMPORTANT:** All etcd and Pacemaker diagnostics must target the correct Ansible host group: + +- **`cluster_vms`** - Use for all etcd, Pacemaker, and cluster diagnostics + - All pcs commands + - All podman commands for etcd containers + - All etcdctl commands + - All journalctl commands for cluster logs + +- **`hypervisor`** - Only for VM lifecycle management + - virsh commands to start/stop VMs + - kcli commands for cluster management + - Do NOT use for etcd-related operations + +**Example:** +```bash +# Correct - targets cluster VMs +ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a "pcs status" -b + +# Incorrect - would target hypervisor instead of cluster nodes +ansible hypervisor -i deploy/openshift-clusters/inventory.ini -m shell -a "pcs status" -b +``` + ### Proxy Handling All scripts automatically detect and handle proxy requirements: diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md index 3c1979e..6fb93be 100644 --- a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -529,3 +529,10 @@ Provide clear, concise diagnostics with: - Warn user clearly if proxy.env is required but missing - Always use sudo/become for commands on cluster VMs via Ansible - Be specific about which node to run commands on when relevant +- **CRITICAL: Always target the `cluster_vms` host group for all etcd/Pacemaker operations** + - Never target the `hypervisor` host for etcd-related commands + - The hypervisor is only for VM lifecycle management (virsh, kcli commands) + - All Pacemaker, etcd container, and cluster diagnostics run on cluster VMs +- When cluster API access is unavailable, rely exclusively on Ansible-based VM access + - This is normal and expected when etcd is down + - All troubleshooting can be completed without oc commands From 7089cc1c5d31c275d498f95e062daca94aaf7fc9 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Tue, 4 Nov 2025 10:43:37 +0100 Subject: [PATCH 16/19] Update references to force-new-cluster playbook and other automated tools --- .claude/commands/etcd/QUICK_REFERENCE.md | 16 +++++++++++- .../commands/etcd/TROUBLESHOOTING_SKILL.md | 26 ++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/.claude/commands/etcd/QUICK_REFERENCE.md b/.claude/commands/etcd/QUICK_REFERENCE.md index df6cbb2..7c555d9 100644 --- a/.claude/commands/etcd/QUICK_REFERENCE.md +++ b/.claude/commands/etcd/QUICK_REFERENCE.md @@ -105,7 +105,21 @@ ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ "sudo crm_attribute -G -n standalone_node" -b ``` -**Fix:** +**Fix (RECOMMENDED - Use Automated Playbook):** +```bash +# Use the automated force-new-cluster helper playbook +ansible-playbook helpers/force-new-cluster.yml \ + -i deploy/openshift-clusters/inventory.ini +``` + +This playbook automatically: +- Takes snapshots for safety +- Clears 
conflicting CIB attributes +- Designates leader (first node in inventory) to force new cluster +- Removes follower from member list +- Cleans up and re-enables stonith + +**Fix (MANUAL - Only if playbook unavailable):** ```bash # Identify the node with more recent data (higher revision) ansible cluster_vms -i deploy/openshift-clusters/inventory.ini -m shell -a \ diff --git a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md index 6fb93be..5b93bf9 100644 --- a/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md +++ b/.claude/commands/etcd/TROUBLESHOOTING_SKILL.md @@ -379,10 +379,18 @@ ansible cluster_vms -i inventory.ini -m shell \ -a "crm_attribute --query --name cluster_id" -b ``` -**Resolution:** -- Use force-new-cluster helper playbook -- Designate one node as leader (first in inventory) -- Follower will resync from leader +**Resolution (RECOMMENDED):** +```bash +# Use the automated force-new-cluster helper playbook +ansible-playbook helpers/force-new-cluster.yml -i deploy/openshift-clusters/inventory.ini +``` + +This playbook: +- Takes snapshots for safety +- Clears conflicting CIB attributes +- Designates leader (first node in inventory) to force new cluster +- Removes follower from member list +- Handles all cleanup and recovery steps automatically #### Resource Failures / Failed Actions **Symptoms:** @@ -454,6 +462,16 @@ sudo journalctl -u pacemaker --grep fence --since "1 hour ago" ## Available Remediation Tools +**IMPORTANT: Prefer Automated Tools** + +When dealing with cluster recovery scenarios (split-brain, mismatched cluster IDs, both nodes down), **always use the automated helper playbook first** before attempting manual recovery: + +```bash +ansible-playbook helpers/force-new-cluster.yml -i deploy/openshift-clusters/inventory.ini +``` + +This playbook handles all the complex steps safely and is the recommended approach. Manual steps should only be used if the playbook is unavailable or fails. 
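+
+If you want to review what the playbook will do before letting it touch the cluster, a
+dry listing of its tasks is a low-risk first step (generic ansible-playbook flag, same
+paths as above; remember the first host in `cluster_vms` is treated as the leader):
+
+```bash
+# Lists plays and tasks without executing anything, so the recovery steps can be
+# reviewed before committing to them.
+ansible-playbook helpers/force-new-cluster.yml \
+  -i deploy/openshift-clusters/inventory.ini --list-tasks
+```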
+ ### Pacemaker Resource Cleanup Use `pcs resource cleanup` to clear failed resource states and retry operations: From 579f9fe22d683372c87556f35b02ec6a06602ff9 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Wed, 5 Nov 2025 16:36:52 +0100 Subject: [PATCH 17/19] Update force-new-cluster helper --- helpers/force-new-cluster.yml | 125 +++++----------------------------- 1 file changed, 18 insertions(+), 107 deletions(-) diff --git a/helpers/force-new-cluster.yml b/helpers/force-new-cluster.yml index c5ae132..5089ce2 100644 --- a/helpers/force-new-cluster.yml +++ b/helpers/force-new-cluster.yml @@ -54,42 +54,27 @@ run_once: true changed_when: true - - name: Check if etcd is running on leader node - ansible.builtin.command: podman ps + - name: Disable etcd resource on leader node delegate_to: "{{ leader_node }}" - register: leader_etcd_status - changed_when: false - run_once: true - failed_when: false - - - name: Determine recovery scenario - ansible.builtin.set_fact: - leader_has_etcd: "{{ 'etcd' in leader_etcd_status.stdout }}" run_once: true - - - name: Handle scenario where no etcd is running on leader - when: not leader_has_etcd block: - - name: Take etcd snapshot on both nodes - ansible.builtin.copy: - src: "/var/lib/etcd/member/snap/db" - dest: "{{ snapshot_dir }}/{{ snapshot_name }}" - remote_src: true - owner: core - group: core - mode: '0644' + - name: Disable etcd resource + ansible.builtin.command: pcs resource disable etcd + changed_when: true - - name: Clean up old snapshots (keep last {{ snapshot_retention_count }}) + - name: Wait for etcd to stop ansible.builtin.shell: | - ls -1t {{ snapshot_dir }}/etcd-snapshot-*.db 2>/dev/null | tail -n +{{ snapshot_retention_count + 1 }} | xargs -r rm -f - args: - executable: /bin/bash - changed_when: true + pcs status resources | grep etcd -A 1 | grep -E 'Started|Stopping' + register: etcd_stopping + changed_when: false failed_when: false + until: etcd_stopping.rc != 0 + retries: 60 + delay: 5 - - name: Display snapshot location + - name: Display etcd stopped confirmation ansible.builtin.debug: - msg: "✓ etcd snapshot saved on {{ inventory_hostname }} to: {{ snapshot_dir }}/{{ snapshot_name }}" + msg: "Etcd resource is now stopped." - name: Clear CIB attributes on all nodes block: @@ -186,90 +171,17 @@ Unexpected force_new_cluster attribute on {{ follower_hostname }} Output: {{ follower_reboot_attrs.stdout }} - - name: Remove follower from etcd member list - delegate_to: "{{ leader_node }}" - run_once: true - when: leader_has_etcd - block: - - name: Get etcd member list - ansible.builtin.command: podman exec etcd etcdctl member list - register: etcd_member_list - changed_when: false - - - name: Extract follower member ID by hostname - ansible.builtin.set_fact: - follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', follower_hostname) | first | split(','))[0] | default('') }}" - when: follower_hostname in etcd_member_list.stdout - - - name: Extract follower member ID by unstarted state (fallback) - ansible.builtin.set_fact: - follower_member_id: "{{ (etcd_member_list.stdout_lines | select('search', 'unstarted') | first | split(','))[0] | default('') }}" - when: - - follower_hostname not in etcd_member_list.stdout - - "'unstarted' in etcd_member_list.stdout" - - - name: Display etcd member list if follower not found - ansible.builtin.debug: - msg: | - Could not find follower {{ follower_hostname }} in etcd member list. Nothing to do. 
- Member list: - {{ etcd_member_list.stdout }} - when: follower_member_id is not defined or follower_member_id == '' - - - name: Remove follower from etcd cluster - ansible.builtin.command: podman exec etcd etcdctl member remove {{ follower_member_id }} - when: - - follower_member_id is defined - - follower_member_id != '' - changed_when: true - - - name: Display removal confirmation - ansible.builtin.debug: - msg: "Removing follower member ID: {{ follower_member_id }} ({{ follower_hostname }})" - when: - - follower_member_id is defined - - follower_member_id != '' - - - name: Cleanup etcd resource on leader node - ansible.builtin.command: pcs resource cleanup etcd + - name: Enable etcd resource on leader node + ansible.builtin.command: pcs resource enable etcd delegate_to: "{{ leader_node }}" run_once: true changed_when: true - - name: Cleanup etcd resource on follower node + - name: Cleanup etcd resource to restore the cluster ansible.builtin.command: pcs resource cleanup etcd - delegate_to: "{{ follower_node }}" - run_once: true - changed_when: true - - - name: Wait for etcd to potentially start (no-etcd scenario) - ansible.builtin.pause: - seconds: 10 - when: not leader_has_etcd - run_once: true - - - name: Re-check etcd status after cleanup (no-etcd scenario) - ansible.builtin.command: podman ps delegate_to: "{{ leader_node }}" - register: leader_etcd_recheck - changed_when: false - run_once: true - when: not leader_has_etcd - - - name: Display etcd recovery status - ansible.builtin.debug: - msg: | - {% if not leader_has_etcd %} - {% if 'etcd' in leader_etcd_recheck.stdout %} - ✓ Leader etcd is now running after cleanup. - {% else %} - ⚠ Leader etcd is still not running after cleanup. Manual intervention may be required. - CIB attributes have been set for force-new-cluster on {{ leader_hostname }} - {% endif %} - {% else %} - ✓ All force-new-cluster operations completed successfully. - {% endif %} run_once: true + changed_when: true - name: Re-enable stonith on leader node ansible.builtin.command: pcs property set stonith-enabled=true @@ -287,6 +199,5 @@ post_tasks: - name: Display completion message ansible.builtin.debug: - msg: "✓ Force new cluster operation completed. All tests passed." + msg: "✓ Force new cluster operation completed. Etcd cluster recovery initiated." 
run_once: true - when: leader_has_etcd From 017ca8c536f81829eaed084ad52eaf65b6c7905e Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Fri, 7 Nov 2025 15:32:39 +0100 Subject: [PATCH 18/19] Add log location --- helpers/collect-tnf-logs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/helpers/collect-tnf-logs.yml b/helpers/collect-tnf-logs.yml index 5d221f3..421fa7e 100644 --- a/helpers/collect-tnf-logs.yml +++ b/helpers/collect-tnf-logs.yml @@ -81,3 +81,4 @@ Log collection complete for {{ inventory_hostname }}: - Pacemaker logs: {{ 'collected' if pacemaker_logs.rc == 0 else 'failed' }} - Etcd logs: {{ 'collected' if etcd_logs.rc == 0 else 'failed/timeout' }} + - Logs saved to: {{ playbook_dir }}/../logs/{{ log_timestamp }}/ From 8f455fb04eba28103f8c8fac5c84e5f8a050af63 Mon Sep 17 00:00:00 2001 From: Pablo Fontanilla Date: Mon, 17 Nov 2025 18:04:47 +0100 Subject: [PATCH 19/19] Update force new cluster playbook --- helpers/force-new-cluster.yml | 42 +++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/helpers/force-new-cluster.yml b/helpers/force-new-cluster.yml index 5089ce2..8b7533a 100644 --- a/helpers/force-new-cluster.yml +++ b/helpers/force-new-cluster.yml @@ -88,19 +88,33 @@ failed_when: false changed_when: true - - name: Clear force_new_cluster attribute from leader node - ansible.builtin.command: crm_attribute --delete --node "{{ leader_hostname }}" --lifetime reboot --name "force_new_cluster" + - name: Clear force_new_cluster attribute from leader node (all scopes) delegate_to: "{{ leader_node }}" run_once: true - failed_when: false - changed_when: true + block: + - name: Clear reboot-lifetime force_new_cluster from leader + ansible.builtin.command: crm_attribute --delete --node "{{ leader_hostname }}" --lifetime reboot --name "force_new_cluster" + failed_when: false + changed_when: true - - name: Clear force_new_cluster attribute from follower node - ansible.builtin.command: crm_attribute --delete --node "{{ follower_hostname }}" --lifetime reboot --name "force_new_cluster" + - name: Clear status force_new_cluster from leader using attrd_updater + ansible.builtin.command: attrd_updater -D -n force_new_cluster -N "{{ leader_hostname }}" + failed_when: false + changed_when: true + + - name: Clear force_new_cluster attribute from follower node (all scopes) delegate_to: "{{ follower_node }}" run_once: true - failed_when: false - changed_when: true + block: + - name: Clear reboot-lifetime force_new_cluster from follower + ansible.builtin.command: crm_attribute --delete --node "{{ follower_hostname }}" --lifetime reboot --name "force_new_cluster" + failed_when: false + changed_when: true + + - name: Clear status force_new_cluster from follower using attrd_updater + ansible.builtin.command: attrd_updater -D -n force_new_cluster -N "{{ follower_hostname }}" + failed_when: false + changed_when: true - name: Set force_new_cluster attribute on leader node ansible.builtin.command: crm_attribute --lifetime reboot --node "{{ leader_hostname }}" --name "force_new_cluster" --update "{{ leader_hostname }}" @@ -157,18 +171,18 @@ Unexpected standalone or learner attributes on {{ follower_hostname }} Output: {{ follower_cib_attrs.stdout }} - - name: Query reboot-lifetime CIB attributes on follower - ansible.builtin.command: crm_attribute --query --lifetime reboot --node "{{ follower_hostname }}" + - name: Query reboot-lifetime CIB attributes on follower (with proper scope filter) + ansible.builtin.shell: | + crm_attribute --query --lifetime reboot 
--node "{{ follower_hostname }}" --name "force_new_cluster" 2>&1 | grep 'scope=reboot' || true register: follower_reboot_attrs changed_when: false - failed_when: false - - name: Verify force_new_cluster attribute is NOT present on follower + - name: Verify force_new_cluster attribute is NOT present on follower in reboot scope ansible.builtin.assert: that: - - "'force_new_cluster' not in follower_reboot_attrs.stdout" + - follower_reboot_attrs.stdout == "" fail_msg: | - Unexpected force_new_cluster attribute on {{ follower_hostname }} + Unexpected force_new_cluster attribute (reboot scope) on {{ follower_hostname }} Output: {{ follower_reboot_attrs.stdout }} - name: Enable etcd resource on leader node