From 4cea22d3d5533e5a7793dbcc1d6a6d8b76ebd45f Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 26 Jan 2026 16:18:49 -0300 Subject: [PATCH 1/7] wip --- .../controller/postgrescluster/pgbouncer.go | 102 ++++++++++++++++++ internal/pgbouncer/config.go | 13 ++- internal/pgbouncer/reconnect.go | 57 ++++++++++ .../v1beta1/pgbouncer_types.go | 6 ++ 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 internal/pgbouncer/reconnect.go diff --git a/internal/controller/postgrescluster/pgbouncer.go b/internal/controller/postgrescluster/pgbouncer.go index 2fcd2802ff..b98c42113b 100644 --- a/internal/controller/postgrescluster/pgbouncer.go +++ b/internal/controller/postgrescluster/pgbouncer.go @@ -54,6 +54,11 @@ func (r *Reconciler) reconcilePGBouncer( if err == nil { err = r.reconcilePGBouncerInPostgreSQL(ctx, cluster, instances, secret) } + if err == nil { + // Trigger RECONNECT if primary has changed to force new server connections. + // This prevents stale connections from routing traffic to a demoted replica. + err = r.reconcilePGBouncerReconnect(ctx, cluster, instances) + } return err } @@ -584,3 +589,100 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget( } return err } + +// reconcilePGBouncerReconnect triggers a RECONNECT command on all PgBouncer +// pods when the primary has changed. This forces PgBouncer to establish new +// server connections to the correct primary, preventing stale connections +// from routing traffic to a demoted replica after failover. +// +// Note: RECONNECT closes server connections when they are "released" according +// to the pool mode. In transaction mode, this happens after each transaction. +// In session mode, this happens when the client disconnects - so persistent +// clients may continue hitting the old primary until they reconnect. 
+func (r *Reconciler) reconcilePGBouncerReconnect( + ctx context.Context, cluster *v1beta1.PostgresCluster, + instances *observedInstances, +) error { + log := logging.FromContext(ctx) + + // Skip if PgBouncer is disabled + if cluster.Spec.Proxy == nil || cluster.Spec.Proxy.PGBouncer == nil { + return nil + } + + var primaryPod *corev1.Pod + for _, instance := range instances.forCluster { + // Same condition as writablePod fn + if writable, known := instance.IsWritable(); writable && known && len(instance.Pods) > 0 { + primaryPod = instance.Pods[0] + break + } + } + + if primaryPod == nil { + // We will retry later. + log.V(1).Info("No writable instance found, skipping PgBouncer RECONNECT") + return nil + } + + currentPrimaryUID := string(primaryPod.UID) + lastReconnectUID := cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID + + if currentPrimaryUID == lastReconnectUID { + // Primary hasn't changed, no need to Reconnect. + return nil + } + + log.Info("Primary changed, triggering PgBouncer RECONNECT", + "previousPrimaryUID", lastReconnectUID, + "currentPrimaryUID", currentPrimaryUID, + "currentPrimaryName", primaryPod.Name) + + pgbouncerPods := &corev1.PodList{} + selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster)) + if err != nil { + return errors.WithStack(err) + } + + if err := r.Client.List(ctx, pgbouncerPods, + client.InNamespace(cluster.Namespace), + client.MatchingLabelsSelector{Selector: selector}); err != nil { + return errors.WithStack(err) + } + + // Send RECONNECT to each running PgBouncer pod + var reconnectErr error + successCount := 0 + + for i := range pgbouncerPods.Items { + pod := &pgbouncerPods.Items[i] + if pod.Status.Phase != corev1.PodRunning { + continue + } + + exec := func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error { + return r.PodExec(ctx, pod.Namespace, pod.Name, naming.ContainerPGBouncer, stdin, stdout, stderr, command...) 
+ } + + if err := pgbouncer.Reconnect(ctx, exec); err != nil { + log.Error(err, "PgBouncer RECONNECT: failed to issue command to pod.", "pod", pod.Name) + reconnectErr = err + } else { + successCount++ + } + } + + // If we can't send a RECONNECT command to one of the pods, we won't update the LastReconnectPrimaryUID. + // This means this will run again in the next reconciliation loop. + if reconnectErr == nil { + cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID = currentPrimaryUID + } + + log.Info("PgBouncer RECONNECT: done", + "failed", reconnectErr != nil, + "successCount", successCount, + "totalPods", len(pgbouncerPods.Items), + ) + + return reconnectErr +} diff --git a/internal/pgbouncer/config.go b/internal/pgbouncer/config.go index 3ac90b5bfe..4f4b8d7c5c 100644 --- a/internal/pgbouncer/config.go +++ b/internal/pgbouncer/config.go @@ -120,8 +120,17 @@ func clusterINI(cluster *v1beta1.PostgresCluster) string { "server_tls_sslmode": "verify-full", "server_tls_ca_file": certBackendAuthorityAbsolutePath, - // Disable Unix sockets to keep the filesystem read-only. - "unix_socket_dir": "", + // Enable Unix socket for admin console access. The special user + // "pgbouncer" can connect without a password when using a Unix socket + // from the same UID as the running process. This allows the operator + // to send admin commands like RECONNECT after failover. + // Ref.: https://www.pgbouncer.org/usage.html#admin-console + "unix_socket_dir": "/tmp/pgbouncer", + + // Allow the "pgbouncer" user to run admin commands (PAUSE, RESUME, + // RECONNECT, etc.) on the admin console. Combined with unix_socket_dir, + // this enables password-free admin access from within the container. + "admin_users": "pgbouncer", } // Override the above with any specified settings. 
diff --git a/internal/pgbouncer/reconnect.go b/internal/pgbouncer/reconnect.go new file mode 100644 index 0000000000..e527850d48 --- /dev/null +++ b/internal/pgbouncer/reconnect.go @@ -0,0 +1,57 @@ +package pgbouncer + +import ( + "bytes" + "context" + + "github.com/percona/percona-postgresql-operator/internal/logging" + "github.com/percona/percona-postgresql-operator/internal/postgres" +) + +// Reconnect sends the RECONNECT command to PgBouncer's admin console. +// This closes all server connections at the next opportunity, forcing +// PgBouncer to establish new connections to the current primary. +// +// From PgBouncer docs: "Close each open server connection for the given +// database, or all databases, at the next opportunity." +// +// This is non-disruptive: PgBouncer waits for the connection to be +// "released" before closing it. In transaction pooling mode, this means +// waiting for the current transaction to complete. In session pooling +// mode, this means waiting for the client to disconnect. +// +// The command connects via Unix socket as the special "pgbouncer" user, +// which is allowed without password when the client has the same UID +// as the running PgBouncer process. +// +// Ref.: https://www.pgbouncer.org/usage.html#admin-console +func Reconnect(ctx context.Context, exec postgres.Executor) error { + log := logging.FromContext(ctx) + log.Info("Triggering PgBouncer RECONNECT to force new server connections") + + stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{} + + // Connect to PgBouncer admin console via Unix socket. + // The "pgbouncer" user can connect without password from same UID. + // The "pgbouncer" database is the virtual admin console. 
+ // Ref.: + err := exec(ctx, nil, stdout, stderr, + "psql", + "-h", "/tmp/pgbouncer", + "-p", "6432", + "-U", "pgbouncer", + "-d", "pgbouncer", + "-c", "RECONNECT", + ) + + if err != nil { + log.Error(err, "RECONNECT failed", + "stdout", stdout.String(), + "stderr", stderr.String()) + } else { + log.V(1).Info("RECONNECT succeeded", + "stdout", stdout.String()) + } + + return err +} diff --git a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go index 9119cd6fe3..8b2066b5b5 100644 --- a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go +++ b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go @@ -173,4 +173,10 @@ type PGBouncerPodStatus struct { // Total number of non-terminated pods. Replicas int32 `json:"replicas,omitempty"` + + // Identifies the primary pod UID when RECONNECT was last triggered. + // Used to detect failovers and force PgBouncer to establish new + // server connections to the correct primary. 
+ // +optional + LastReconnectPrimaryUID string `json:"lastReconnectPrimaryUID,omitempty"` } From f25243264d607f127b1ab622c5e78bf458ad8312 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 26 Jan 2026 20:30:06 -0300 Subject: [PATCH 2/7] wip --- .../controller/postgrescluster/instance.go | 4 +- .../postgrescluster/instance_test.go | 10 +- .../controller/postgrescluster/pgbouncer.go | 105 +++++++++--------- .../controller/postgrescluster/pgmonitor.go | 2 +- .../controller/postgrescluster/postgres.go | 4 +- internal/pgbouncer/config.go | 11 -- internal/pgbouncer/reconnect.go | 57 ---------- internal/pgbouncer/signal_failover.go | 36 ++++++ .../v1beta1/pgbouncer_types.go | 8 +- 9 files changed, 102 insertions(+), 135 deletions(-) delete mode 100644 internal/pgbouncer/reconnect.go create mode 100644 internal/pgbouncer/signal_failover.go diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index ee616a0395..fdb61e7a14 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -262,11 +262,11 @@ func newObservedInstances( return &observed } -// writablePod looks at observedInstances and finds an instance that matches +// WritablePod looks at observedInstances and finds an instance that matches // a few conditions. The instance should be non-terminating, running, and // writable i.e. the instance with the primary. If such an instance exists, it // is returned along with the instance pod. 
-func (observed *observedInstances) writablePod(container string) (*corev1.Pod, *Instance) { +func (observed *observedInstances) WritablePod(container string) (*corev1.Pod, *Instance) { if observed == nil { return nil, nil } diff --git a/internal/controller/postgrescluster/instance_test.go b/internal/controller/postgrescluster/instance_test.go index c4067ad303..02631e21c1 100644 --- a/internal/controller/postgrescluster/instance_test.go +++ b/internal/controller/postgrescluster/instance_test.go @@ -381,7 +381,7 @@ func TestWritablePod(t *testing.T) { t.Run("empty observed", func(t *testing.T) { observed := &observedInstances{} - pod, instance := observed.writablePod("container") + pod, instance := observed.WritablePod("container") assert.Assert(t, pod == nil) assert.Assert(t, instance == nil) }) @@ -415,7 +415,7 @@ func TestWritablePod(t *testing.T) { terminating, known := observed.forCluster[0].IsTerminating() assert.Assert(t, terminating && known) - pod, instance := observed.writablePod("container") + pod, instance := observed.WritablePod("container") assert.Assert(t, pod == nil) assert.Assert(t, instance == nil) }) @@ -447,7 +447,7 @@ func TestWritablePod(t *testing.T) { running, known := observed.forCluster[0].IsRunning(container) assert.Check(t, !running && known) - pod, instance := observed.writablePod("container") + pod, instance := observed.WritablePod("container") assert.Assert(t, pod == nil) assert.Assert(t, instance == nil) }) @@ -480,7 +480,7 @@ func TestWritablePod(t *testing.T) { writable, known := observed.forCluster[0].IsWritable() assert.Check(t, !writable && known) - pod, instance := observed.writablePod("container") + pod, instance := observed.WritablePod("container") assert.Assert(t, pod == nil) assert.Assert(t, instance == nil) }) @@ -517,7 +517,7 @@ func TestWritablePod(t *testing.T) { running, known := observed.forCluster[0].IsRunning(container) assert.Check(t, running && known) - pod, instance := observed.writablePod("container") + pod, 
instance := observed.WritablePod("container") assert.Assert(t, pod != nil) assert.Assert(t, instance != nil) }) diff --git a/internal/controller/postgrescluster/pgbouncer.go b/internal/controller/postgrescluster/pgbouncer.go index b98c42113b..5bb798dd33 100644 --- a/internal/controller/postgrescluster/pgbouncer.go +++ b/internal/controller/postgrescluster/pgbouncer.go @@ -55,7 +55,8 @@ func (r *Reconciler) reconcilePGBouncer( err = r.reconcilePGBouncerInPostgreSQL(ctx, cluster, instances, secret) } if err == nil { - // Trigger RECONNECT if primary has changed to force new server connections. + // Send SIGTERM to PgBouncer if primary has changed, triggering graceful + // shutdown and container restart. New process will do fresh DNS lookup. // This prevents stale connections from routing traffic to a demoted replica. err = r.reconcilePGBouncerReconnect(ctx, cluster, instances) } @@ -116,18 +117,9 @@ func (r *Reconciler) reconcilePGBouncerInPostgreSQL( ) error { log := logging.FromContext(ctx) - var pod *corev1.Pod - // Find the PostgreSQL instance that can execute SQL that writes to every // database. When there is none, return early. - - for _, instance := range instances.forCluster { - writable, known := instance.IsWritable() - if writable && known && len(instance.Pods) > 0 { - pod = instance.Pods[0] - break - } - } + pod, _ := instances.WritablePod(naming.ContainerDatabase) if pod == nil { return nil } @@ -590,8 +582,24 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget( return err } -// reconcilePGBouncerReconnect triggers a RECONNECT command on all PgBouncer -// pods when the primary has changed. This forces PgBouncer to establish new +// pgbouncerPods returns a list of PgBouncer pods for the given cluster. 
+func (r *Reconciler) pgbouncerPods(ctx context.Context, cluster *v1beta1.PostgresCluster) (*corev1.PodList, error) { + pgbouncerPods := &corev1.PodList{} + selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster)) + if err != nil { + return nil, errors.WithStack(err) + } + + if err := r.Client.List(ctx, pgbouncerPods, + client.InNamespace(cluster.Namespace), + client.MatchingLabelsSelector{Selector: selector}); err != nil { + return nil, errors.WithStack(err) + } + return pgbouncerPods, nil +} + +// reconcilePGBouncerReconnect is a sub-reconciler that signals PgBouncer pods +// when the primary has changed. This forces PgBouncer to establish new // server connections to the correct primary, preventing stale connections // from routing traffic to a demoted replica after failover. // @@ -599,6 +607,7 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget( // to the pool mode. In transaction mode, this happens after each transaction. // In session mode, this happens when the client disconnects - so persistent // clients may continue hitting the old primary until they reconnect. +// It returns error for integration with the parent reconciler's error handling chain. func (r *Reconciler) reconcilePGBouncerReconnect( ctx context.Context, cluster *v1beta1.PostgresCluster, instances *observedInstances, @@ -610,79 +619,69 @@ func (r *Reconciler) reconcilePGBouncerReconnect( return nil } - var primaryPod *corev1.Pod - for _, instance := range instances.forCluster { - // Same condition as writablePod fn - if writable, known := instance.IsWritable(); writable && known && len(instance.Pods) > 0 { - primaryPod = instance.Pods[0] - break - } - } - + primaryPod, _ := instances.WritablePod(naming.ContainerDatabase) if primaryPod == nil { // We will retry later. 
- log.V(1).Info("No writable instance found, skipping PgBouncer RECONNECT") + log.V(1).Info("No writable instance found, skipping PgBouncer failover signal") return nil } currentPrimaryUID := string(primaryPod.UID) - lastReconnectUID := cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID + lastFailoverUID := cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID - if currentPrimaryUID == lastReconnectUID { - // Primary hasn't changed, no need to Reconnect. + if currentPrimaryUID == lastFailoverUID { + // Primary hasn't changed, no need to trigger failover. return nil } - log.Info("Primary changed, triggering PgBouncer RECONNECT", - "previousPrimaryUID", lastReconnectUID, + log.Info("Primary changed, triggering PgBouncer failover signal (SIGTERM)", + "previousPrimaryUID", lastFailoverUID, "currentPrimaryUID", currentPrimaryUID, "currentPrimaryName", primaryPod.Name) - pgbouncerPods := &corev1.PodList{} - selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster)) + pgbouncerPods, err := r.pgbouncerPods(ctx, cluster) if err != nil { - return errors.WithStack(err) - } - - if err := r.Client.List(ctx, pgbouncerPods, - client.InNamespace(cluster.Namespace), - client.MatchingLabelsSelector{Selector: selector}); err != nil { - return errors.WithStack(err) + return err } - // Send RECONNECT to each running PgBouncer pod - var reconnectErr error + // Send SIGTERM to each running PgBouncer pod to trigger graceful shutdown + // and container restart. New PgBouncer process will do fresh DNS lookup. 
+ var failoverErrs []error successCount := 0 for i := range pgbouncerPods.Items { - pod := &pgbouncerPods.Items[i] + pod := pgbouncerPods.Items[i] // Copy value to avoid closure reference issues if pod.Status.Phase != corev1.PodRunning { continue } - exec := func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error { + if err := pgbouncer.SignalFailover(ctx, func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error { return r.PodExec(ctx, pod.Namespace, pod.Name, naming.ContainerPGBouncer, stdin, stdout, stderr, command...) - } - - if err := pgbouncer.Reconnect(ctx, exec); err != nil { - log.Error(err, "PgBouncer RECONNECT: failed to issue command to pod.", "pod", pod.Name) - reconnectErr = err + }); err != nil { + log.Error(err, "PgBouncer failover signal: failed to send SIGTERM to pod", "pod", pod.Name) + failoverErrs = append(failoverErrs, fmt.Errorf("pod %s: %w", pod.Name, err)) } else { successCount++ } } - // If we can't send a RECONNECT command to one of the pods, we won't update the LastReconnectPrimaryUID. - // This means this will run again in the next reconciliation loop. - if reconnectErr == nil { - cluster.Status.Proxy.PGBouncer.LastReconnectPrimaryUID = currentPrimaryUID + // Update status only if all pods were successfully signaled. + // Partial failures will be retried in the next reconciliation loop. 
+ if len(failoverErrs) == 0 { + cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID = currentPrimaryUID } - log.Info("PgBouncer RECONNECT: done", - "failed", reconnectErr != nil, + log.Info("PgBouncer failover signal: done", + "failed", len(failoverErrs) > 0, "successCount", successCount, + "failureCount", len(failoverErrs), "totalPods", len(pgbouncerPods.Items), ) - return reconnectErr + // Return aggregated errors if any pods failed + if len(failoverErrs) > 0 { + return fmt.Errorf("failed to signal %d of %d pgbouncer pods: %w", + len(failoverErrs), len(pgbouncerPods.Items), failoverErrs[0]) + } + return nil } diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index c8b3d400ba..935a16dd7d 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -58,7 +58,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, // Find the PostgreSQL instance that can execute SQL that writes to every // database. When there is none, return early. - writablePod, writableInstance = instances.writablePod(naming.ContainerDatabase) + writablePod, writableInstance = instances.WritablePod(naming.ContainerDatabase) if writableInstance == nil || writablePod == nil { return nil } diff --git a/internal/controller/postgrescluster/postgres.go b/internal/controller/postgrescluster/postgres.go index b9a35d62d3..ada1108a2f 100644 --- a/internal/controller/postgrescluster/postgres.go +++ b/internal/controller/postgrescluster/postgres.go @@ -204,7 +204,7 @@ func (r *Reconciler) reconcilePostgresDatabases( // Find the PostgreSQL instance that can execute SQL that writes system // catalogs. When there is none, return early. 
- pod, _ := instances.writablePod(container) + pod, _ := instances.WritablePod(container) if pod == nil { return nil } @@ -1047,7 +1047,7 @@ func (r *Reconciler) reconcileDatabaseInitSQL(ctx context.Context, // Now that we have the data provided by the user. We can check for a // writable pod and get the podExecutor for the pod's database container var podExecutor postgres.Executor - pod, _ := instances.writablePod(naming.ContainerDatabase) + pod, _ := instances.WritablePod(naming.ContainerDatabase) if pod == nil { log.V(1).Info("Could not find a pod with a writable database container.") return nil diff --git a/internal/pgbouncer/config.go b/internal/pgbouncer/config.go index 4f4b8d7c5c..4b6a4afb08 100644 --- a/internal/pgbouncer/config.go +++ b/internal/pgbouncer/config.go @@ -120,17 +120,6 @@ func clusterINI(cluster *v1beta1.PostgresCluster) string { "server_tls_sslmode": "verify-full", "server_tls_ca_file": certBackendAuthorityAbsolutePath, - // Enable Unix socket for admin console access. The special user - // "pgbouncer" can connect without a password when using a Unix socket - // from the same UID as the running process. This allows the operator - // to send admin commands like RECONNECT after failover. - // Ref.: https://www.pgbouncer.org/usage.html#admin-console - "unix_socket_dir": "/tmp/pgbouncer", - - // Allow the "pgbouncer" user to run admin commands (PAUSE, RESUME, - // RECONNECT, etc.) on the admin console. Combined with unix_socket_dir, - // this enables password-free admin access from within the container. - "admin_users": "pgbouncer", } // Override the above with any specified settings. 
diff --git a/internal/pgbouncer/reconnect.go b/internal/pgbouncer/reconnect.go deleted file mode 100644 index e527850d48..0000000000 --- a/internal/pgbouncer/reconnect.go +++ /dev/null @@ -1,57 +0,0 @@ -package pgbouncer - -import ( - "bytes" - "context" - - "github.com/percona/percona-postgresql-operator/internal/logging" - "github.com/percona/percona-postgresql-operator/internal/postgres" -) - -// Reconnect sends the RECONNECT command to PgBouncer's admin console. -// This closes all server connections at the next opportunity, forcing -// PgBouncer to establish new connections to the current primary. -// -// From PgBouncer docs: "Close each open server connection for the given -// database, or all databases, at the next opportunity." -// -// This is non-disruptive: PgBouncer waits for the connection to be -// "released" before closing it. In transaction pooling mode, this means -// waiting for the current transaction to complete. In session pooling -// mode, this means waiting for the client to disconnect. -// -// The command connects via Unix socket as the special "pgbouncer" user, -// which is allowed without password when the client has the same UID -// as the running PgBouncer process. -// -// Ref.: https://www.pgbouncer.org/usage.html#admin-console -func Reconnect(ctx context.Context, exec postgres.Executor) error { - log := logging.FromContext(ctx) - log.Info("Triggering PgBouncer RECONNECT to force new server connections") - - stdout, stderr := &bytes.Buffer{}, &bytes.Buffer{} - - // Connect to PgBouncer admin console via Unix socket. - // The "pgbouncer" user can connect without password from same UID. - // The "pgbouncer" database is the virtual admin console. 
- // Ref.: - err := exec(ctx, nil, stdout, stderr, - "psql", - "-h", "/tmp/pgbouncer", - "-p", "6432", - "-U", "pgbouncer", - "-d", "pgbouncer", - "-c", "RECONNECT", - ) - - if err != nil { - log.Error(err, "RECONNECT failed", - "stdout", stdout.String(), - "stderr", stderr.String()) - } else { - log.V(1).Info("RECONNECT succeeded", - "stdout", stdout.String()) - } - - return err -} diff --git a/internal/pgbouncer/signal_failover.go b/internal/pgbouncer/signal_failover.go new file mode 100644 index 0000000000..9e7d855c3b --- /dev/null +++ b/internal/pgbouncer/signal_failover.go @@ -0,0 +1,36 @@ +package pgbouncer + +import ( + "context" + + "github.com/percona/percona-postgresql-operator/internal/logging" + "github.com/percona/percona-postgresql-operator/internal/postgres" +) + +// SignalFailover sends a SIGTERM to PgBouncer to trigger SHUTDOWN WAIT_FOR_CLIENTS [1] +// mode, waiting for clients to gracefully disconnect [2]. This approach was +// suggested by a PgBouncer maintainer [3] to deal with failovers in Kubernetes. +// +// What happens: +// 1. Operator sends SIGTERM [2] to PgBouncer process (PID 1 in container) +// 2. PgBouncer enters SHUTDOWN WAIT_FOR_CLIENTS mode [1]. +// 3. After Kubernetes grace period (default 30s), SIGKILL is sent if process still hasn't exited +// 4. Container is terminated and restarted by Kubernetes Deployment controller. +// 5. New PgBouncer process does fresh DNS lookup → connects to current primary. +// +// This approach is more effective than RECONNECT command for session mode with persistent +// clients (MPG clusters) because RECONNECT waits for clients to disconnect, which never happens +// for persistent clients. SIGTERM will guarantee termination and restarts after a grace period. 
+// +// [1] https://www.pgbouncer.org/usage.html#shutdown +// [2] https://www.pgbouncer.org/usage.html#signals +// [3] https://github.com/pgbouncer/pgbouncer/issues/1361 +func SignalFailover(ctx context.Context, exec postgres.Executor) error { + log := logging.FromContext(ctx) + log.Info("SignalFailover: sending SIGTERM to force container restart") + + err := exec(ctx, nil, nil, nil, "kill", "-TERM", "1") + + log.Info("SignalFailover: SIGTERM sent.", "failed", err != nil) + return err +} diff --git a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go index 8b2066b5b5..8a66a0a45e 100644 --- a/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go +++ b/pkg/apis/postgres-operator.crunchydata.com/v1beta1/pgbouncer_types.go @@ -174,9 +174,9 @@ type PGBouncerPodStatus struct { // Total number of non-terminated pods. Replicas int32 `json:"replicas,omitempty"` - // Identifies the primary pod UID when RECONNECT was last triggered. - // Used to detect failovers and force PgBouncer to establish new - // server connections to the correct primary. + // Identifies the primary pod UID when failover signal (SIGTERM) was last triggered. + // Used to detect failovers and trigger PgBouncer container restart for fresh + // connection pool and DNS lookup to the correct primary. 
// +optional - LastReconnectPrimaryUID string `json:"lastReconnectPrimaryUID,omitempty"` + LastFailoverPrimaryUID string `json:"lastFailoverPrimaryUID,omitempty"` } From 9ef6137d106b120700acf1486061cfeeb9f23adc Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 26 Jan 2026 20:35:11 -0300 Subject: [PATCH 3/7] wip --- internal/pgbouncer/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/pgbouncer/config.go b/internal/pgbouncer/config.go index 4b6a4afb08..3ac90b5bfe 100644 --- a/internal/pgbouncer/config.go +++ b/internal/pgbouncer/config.go @@ -120,6 +120,8 @@ func clusterINI(cluster *v1beta1.PostgresCluster) string { "server_tls_sslmode": "verify-full", "server_tls_ca_file": certBackendAuthorityAbsolutePath, + // Disable Unix sockets to keep the filesystem read-only. + "unix_socket_dir": "", } // Override the above with any specified settings. From 6c7ebcdb93b8be226720c8051a3f2b78bf589922 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 26 Jan 2026 22:50:09 -0300 Subject: [PATCH 4/7] wip --- ...ator.crunchydata.com_postgresclusters.yaml | 6 ++ ...ator.crunchydata.com_postgresclusters.yaml | 6 ++ deploy/crd.yaml | 6 ++ .../controller/postgrescluster/pgbouncer.go | 100 ++++++++++-------- internal/pgbouncer/signal_failover.go | 36 ------- 5 files changed, 76 insertions(+), 78 deletions(-) delete mode 100644 internal/pgbouncer/signal_failover.go diff --git a/build/crd/crunchy/generated/postgres-operator.crunchydata.com_postgresclusters.yaml b/build/crd/crunchy/generated/postgres-operator.crunchydata.com_postgresclusters.yaml index 8d1bf8dbec..9fb80ba5d2 100644 --- a/build/crd/crunchy/generated/postgres-operator.crunchydata.com_postgresclusters.yaml +++ b/build/crd/crunchy/generated/postgres-operator.crunchydata.com_postgresclusters.yaml @@ -22972,6 +22972,12 @@ spec: properties: pgBouncer: properties: + lastFailoverPrimaryUID: + description: |- + Identifies the primary pod UID when failover signal (SIGTERM) was last triggered. 
+ Used to detect failovers and trigger PgBouncer container restart for fresh + connection pool and DNS lookup to the correct primary. + type: string postgresRevision: description: |- Identifies the revision of PgBouncer assets that have been installed into diff --git a/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml b/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml index 17a1908ccf..6bffe1b60a 100644 --- a/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml +++ b/config/crd/bases/postgres-operator.crunchydata.com_postgresclusters.yaml @@ -22870,6 +22870,12 @@ spec: properties: pgBouncer: properties: + lastFailoverPrimaryUID: + description: |- + Identifies the primary pod UID when failover signal (SIGTERM) was last triggered. + Used to detect failovers and trigger PgBouncer container restart for fresh + connection pool and DNS lookup to the correct primary. + type: string postgresRevision: description: |- Identifies the revision of PgBouncer assets that have been installed into diff --git a/deploy/crd.yaml b/deploy/crd.yaml index c06debdaec..9bc5532465 100644 --- a/deploy/crd.yaml +++ b/deploy/crd.yaml @@ -51472,6 +51472,12 @@ spec: properties: pgBouncer: properties: + lastFailoverPrimaryUID: + description: |- + Identifies the primary pod UID when failover signal (SIGTERM) was last triggered. + Used to detect failovers and trigger PgBouncer container restart for fresh + connection pool and DNS lookup to the correct primary. 
+ type: string postgresRevision: description: |- Identifies the revision of PgBouncer assets that have been installed into diff --git a/internal/controller/postgrescluster/pgbouncer.go b/internal/controller/postgrescluster/pgbouncer.go index 5bb798dd33..7b9e2c9b75 100644 --- a/internal/controller/postgrescluster/pgbouncer.go +++ b/internal/controller/postgrescluster/pgbouncer.go @@ -6,10 +6,11 @@ package postgrescluster import ( "context" + "errors" "fmt" "io" - "github.com/pkg/errors" + pkgerrors "github.com/pkg/errors" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" @@ -55,7 +56,7 @@ func (r *Reconciler) reconcilePGBouncer( err = r.reconcilePGBouncerInPostgreSQL(ctx, cluster, instances, secret) } if err == nil { - // Send SIGTERM to PgBouncer if primary has changed, triggering graceful + // Stop PgBouncer pods if primary has changed, triggering graceful // shutdown and container restart. New process will do fresh DNS lookup. // This prevents stale connections from routing traffic to a demoted replica. err = r.reconcilePGBouncerReconnect(ctx, cluster, instances) @@ -77,14 +78,14 @@ func (r *Reconciler) reconcilePGBouncerConfigMap( // PgBouncer is disabled; delete the ConfigMap if it exists. Check the // client cache first using Get. 
key := client.ObjectKeyFromObject(configmap) - err := errors.WithStack(r.Client.Get(ctx, key, configmap)) + err := pkgerrors.WithStack(r.Client.Get(ctx, key, configmap)) if err == nil { - err = errors.WithStack(r.deleteControlled(ctx, cluster, configmap)) + err = pkgerrors.WithStack(r.deleteControlled(ctx, cluster, configmap)) } return nil, client.IgnoreNotFound(err) } - err := errors.WithStack(r.setControllerReference(cluster, configmap)) + err := pkgerrors.WithStack(r.setControllerReference(cluster, configmap)) configmap.Annotations = naming.Merge( cluster.Spec.Metadata.GetAnnotationsOrNil(), @@ -101,7 +102,7 @@ func (r *Reconciler) reconcilePGBouncerConfigMap( pgbouncer.ConfigMap(cluster, configmap) } if err == nil { - err = errors.WithStack(r.apply(ctx, configmap)) + err = pkgerrors.WithStack(r.apply(ctx, configmap)) } return configmap, err @@ -137,12 +138,12 @@ func (r *Reconciler) reconcilePGBouncerInPostgreSQL( } action := func(ctx context.Context, exec postgres.Executor) error { - return errors.WithStack(pgbouncer.EnableInPostgreSQL(ctx, exec, clusterSecret, exposeSuperusers)) + return pkgerrors.WithStack(pgbouncer.EnableInPostgreSQL(ctx, exec, clusterSecret, exposeSuperusers)) } if cluster.Spec.Proxy == nil || cluster.Spec.Proxy.PGBouncer == nil { // PgBouncer is disabled. 
action = func(ctx context.Context, exec postgres.Executor) error { - return errors.WithStack(pgbouncer.DisableInPostgreSQL(ctx, exec)) + return pkgerrors.WithStack(pgbouncer.DisableInPostgreSQL(ctx, exec)) } } @@ -198,7 +199,7 @@ func (r *Reconciler) reconcilePGBouncerSecret( root *pki.RootCertificateAuthority, service *corev1.Service, ) (*corev1.Secret, error) { existing := &corev1.Secret{ObjectMeta: naming.ClusterPGBouncer(cluster)} - err := errors.WithStack( + err := pkgerrors.WithStack( r.Client.Get(ctx, client.ObjectKeyFromObject(existing), existing)) if client.IgnoreNotFound(err) != nil { return nil, err @@ -207,7 +208,7 @@ func (r *Reconciler) reconcilePGBouncerSecret( if cluster.Spec.Proxy == nil || cluster.Spec.Proxy.PGBouncer == nil { // PgBouncer is disabled; delete the Secret if it exists. if err == nil { - err = errors.WithStack(r.deleteControlled(ctx, cluster, existing)) + err = pkgerrors.WithStack(r.deleteControlled(ctx, cluster, existing)) } return nil, client.IgnoreNotFound(err) } @@ -221,7 +222,7 @@ func (r *Reconciler) reconcilePGBouncerSecret( // K8SPG-330: Keep this commented in case of conflicts. // We don't want to delete TLS secrets on cluster deletion. 
// if err == nil { - // err = errors.WithStack(r.setControllerReference(cluster, intent)) + // err = pkgerrors.WithStack(r.setControllerReference(cluster, intent)) // } intent.Annotations = naming.Merge( @@ -239,7 +240,7 @@ func (r *Reconciler) reconcilePGBouncerSecret( err = pgbouncer.Secret(ctx, cluster, root, existing, service, intent) } if err == nil { - err = errors.WithStack(r.apply(ctx, intent)) + err = pkgerrors.WithStack(r.apply(ctx, intent)) } return intent, err @@ -320,7 +321,7 @@ func (r *Reconciler) generatePGBouncerService( } service.Spec.Ports = []corev1.ServicePort{servicePort} - err := errors.WithStack(r.setControllerReference(cluster, service)) + err := pkgerrors.WithStack(r.setControllerReference(cluster, service)) return service, true, err } @@ -338,15 +339,15 @@ func (r *Reconciler) reconcilePGBouncerService( // PgBouncer is disabled; delete the Service if it exists. Check the client // cache first using Get. key := client.ObjectKeyFromObject(service) - err := errors.WithStack(r.Client.Get(ctx, key, service)) + err := pkgerrors.WithStack(r.Client.Get(ctx, key, service)) if err == nil { - err = errors.WithStack(r.deleteControlled(ctx, cluster, service)) + err = pkgerrors.WithStack(r.deleteControlled(ctx, cluster, service)) } return nil, client.IgnoreNotFound(err) } if err == nil { - err = errors.WithStack(r.apply(ctx, service)) + err = pkgerrors.WithStack(r.apply(ctx, service)) } return service, err } @@ -454,7 +455,7 @@ func (r *Reconciler) generatePGBouncerDeployment( // set the image pull secrets, if any exist deploy.Spec.Template.Spec.ImagePullSecrets = cluster.Spec.ImagePullSecrets - err := errors.WithStack(r.setControllerReference(cluster, deploy)) + err := pkgerrors.WithStack(r.setControllerReference(cluster, deploy)) if err == nil { pgbouncer.Pod(ctx, cluster, configmap, primaryCertificate, secret, &deploy.Spec.Template.Spec) @@ -509,15 +510,15 @@ func (r *Reconciler) reconcilePGBouncerDeployment( // PgBouncer is disabled; delete the 
Deployment if it exists. Check the // client cache first using Get. key := client.ObjectKeyFromObject(deploy) - err := errors.WithStack(r.Client.Get(ctx, key, deploy)) + err := pkgerrors.WithStack(r.Client.Get(ctx, key, deploy)) if err == nil { - err = errors.WithStack(r.deleteControlled(ctx, cluster, deploy)) + err = pkgerrors.WithStack(r.deleteControlled(ctx, cluster, deploy)) } return client.IgnoreNotFound(err) } if err == nil { - err = errors.WithStack(r.apply(ctx, deploy)) + err = pkgerrors.WithStack(r.apply(ctx, deploy)) } return err } @@ -534,9 +535,9 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget( ) error { deleteExistingPDB := func(cluster *v1beta1.PostgresCluster) error { existing := &policyv1.PodDisruptionBudget{ObjectMeta: naming.ClusterPGBouncer(cluster)} - err := errors.WithStack(r.Client.Get(ctx, client.ObjectKeyFromObject(existing), existing)) + err := pkgerrors.WithStack(r.Client.Get(ctx, client.ObjectKeyFromObject(existing), existing)) if err == nil { - err = errors.WithStack(r.deleteControlled(ctx, cluster, existing)) + err = pkgerrors.WithStack(r.deleteControlled(ctx, cluster, existing)) } return client.IgnoreNotFound(err) } @@ -577,7 +578,7 @@ func (r *Reconciler) reconcilePGBouncerPodDisruptionBudget( } if err == nil { - err = errors.WithStack(r.apply(ctx, pdb)) + err = pkgerrors.WithStack(r.apply(ctx, pdb)) } return err } @@ -587,27 +588,45 @@ func (r *Reconciler) pgbouncerPods(ctx context.Context, cluster *v1beta1.Postgre pgbouncerPods := &corev1.PodList{} selector, err := naming.AsSelector(naming.ClusterPGBouncerSelector(cluster)) if err != nil { - return nil, errors.WithStack(err) + return nil, pkgerrors.WithStack(err) } if err := r.Client.List(ctx, pgbouncerPods, client.InNamespace(cluster.Namespace), client.MatchingLabelsSelector{Selector: selector}); err != nil { - return nil, errors.WithStack(err) + return nil, pkgerrors.WithStack(err) } return pgbouncerPods, nil } -// reconcilePGBouncerReconnect is a sub-reconciler that 
signals PgBouncer pods +// reconcilePGBouncerReconnect is a sub-reconciler that deletes PgBouncer pods // when the primary has changed. This forces PgBouncer to establish new // server connections to the correct primary, preventing stale connections // from routing traffic to a demoted replica after failover. // -// Note: RECONNECT closes server connections when they are "released" according -// to the pool mode. In transaction mode, this happens after each transaction. -// In session mode, this happens when the client disconnects - so persistent -// clients may continue hitting the old primary until they reconnect. -// It returns error for integration with the parent reconciler's error handling chain. +// Deleting the pod sends a SIGTERM to PgBouncer to trigger SHUTDOWN WAIT_FOR_CLIENTS [1] +// mode, waiting for clients to gracefully disconnect [2]. This approach was +// suggested by a PgBouncer maintainer [3] to deal with failovers in Kubernetes. +// +// What happens: +// 1. Kubernetes sends SIGTERM [2] to PgBouncer +// 2. PgBouncer enters SHUTDOWN WAIT_FOR_CLIENTS mode [1]. +// 3. After Kubernetes grace period (default 30s), SIGKILL is sent if process still hasn't exited +// 4. Container is terminated and restarted by Kubernetes Deployment controller. +// 5. New PgBouncer process does fresh DNS lookup → connects to current primary. +// +// This approach is more effective than the RECONNECT command for session mode with persistent +// clients (MPG clusters) because RECONNECT waits for clients to disconnect, which never happens +// for persistent clients. SIGTERM guarantees termination and restart after the grace period. +// +// [1] https://www.pgbouncer.org/usage.html#shutdown +// [2] https://www.pgbouncer.org/usage.html#signals +// [3] https://github.com/pgbouncer/pgbouncer/issues/1361 +// +// Note: We don't use RECONNECT because it closes server connections when they are "released" +// according to the pool mode.
In transaction mode, this happens after each transaction. In session +// mode, this happens when the client disconnects - so persistent clients may continue hitting +// the old primary until they reconnect. func (r *Reconciler) reconcilePGBouncerReconnect( ctx context.Context, cluster *v1beta1.PostgresCluster, instances *observedInstances, @@ -644,28 +663,24 @@ func (r *Reconciler) reconcilePGBouncerReconnect( return err } - // Send SIGTERM to each running PgBouncer pod to trigger graceful shutdown - // and container restart. New PgBouncer process will do fresh DNS lookup. var failoverErrs []error successCount := 0 - for i := range pgbouncerPods.Items { - pod := pgbouncerPods.Items[i] // Copy value to avoid closure reference issues + for _, pod := range pgbouncerPods.Items { if pod.Status.Phase != corev1.PodRunning { continue } - if err := pgbouncer.SignalFailover(ctx, func(ctx context.Context, stdin io.Reader, stdout, stderr io.Writer, command ...string) error { - return r.PodExec(ctx, pod.Namespace, pod.Name, naming.ContainerPGBouncer, stdin, stdout, stderr, command...) - }); err != nil { - log.Error(err, "PgBouncer failover signal: failed to send SIGTERM to pod", "pod", pod.Name) + if err := r.Client.Delete(ctx, &pod); err != nil { + log.Error(err, "PgBouncer failover signal: failed to delete pod", "pod", pod.Name) failoverErrs = append(failoverErrs, fmt.Errorf("pod %s: %w", pod.Name, err)) } else { + log.Info("PgBouncer failover signal: deleted pod for recreation", "pod", pod.Name) successCount++ } } - // Update status only if all pods were successfully signaled. + // Update status only if all pods were successfully stopped. // Partial failures will be retried in the next reconciliation loop. 
if len(failoverErrs) == 0 { cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID = currentPrimaryUID @@ -678,10 +693,11 @@ func (r *Reconciler) reconcilePGBouncerReconnect( "totalPods", len(pgbouncerPods.Items), ) - // Return aggregated errors if any pods failed + // Return aggregated errors if any failed if len(failoverErrs) > 0 { return fmt.Errorf("failed to signal %d of %d pgbouncer pods: %w", - len(failoverErrs), len(pgbouncerPods.Items), failoverErrs[0]) + len(failoverErrs), len(pgbouncerPods.Items), errors.Join(failoverErrs...)) } + return nil } diff --git a/internal/pgbouncer/signal_failover.go b/internal/pgbouncer/signal_failover.go deleted file mode 100644 index 9e7d855c3b..0000000000 --- a/internal/pgbouncer/signal_failover.go +++ /dev/null @@ -1,36 +0,0 @@ -package pgbouncer - -import ( - "context" - - "github.com/percona/percona-postgresql-operator/internal/logging" - "github.com/percona/percona-postgresql-operator/internal/postgres" -) - -// SignalFailover sends a SIGTERM to PgBouncer to trigger SHUTDOWN WAIT_FOR_CLIENTS [1] -// mode, waiting for clients to gracefully disconnect [2]. This approach was -// suggested by a PgBouncer maintainer [3] to deal with failovers in Kubernetes. -// -// What happens: -// 1. Operator sends SIGTERM [2] to PgBouncer process (PID 1 in container) -// 2. PgBouncer enters SHUTDOWN WAIT_FOR_CLIENTS mode [1]. -// 3. After Kubernetes grace period (default 30s), SIGKILL is sent if process still hasn't exited -// 4. Container is terminated and restarted by Kubernetes Deployment controller. -// 5. New PgBouncer process does fresh DNS lookup → connects to current primary. -// -// This approach is more effective than RECONNECT command for session mode with persistent -// clients (MPG clusters) because RECONNECT waits for clients to disconnect, which never happens -// for persistent clients. SIGTERM will guarantee termination and restarts after a grace period. 
-// -// [1] https://www.pgbouncer.org/usage.html#shutdown -// [2] https://www.pgbouncer.org/usage.html#signals -// [3] https://github.com/pgbouncer/pgbouncer/issues/1361 -func SignalFailover(ctx context.Context, exec postgres.Executor) error { - log := logging.FromContext(ctx) - log.Info("SignalFailover: sending SIGTERM to force container restart") - - err := exec(ctx, nil, nil, nil, "kill", "-TERM", "1") - - log.Info("SignalFailover: SIGTERM sent.", "failed", err != nil) - return err -} From 9a9eca73f8b4a36c8680ff62fb5d5aa0a31d237f Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 26 Jan 2026 22:53:31 -0300 Subject: [PATCH 5/7] wip --- internal/controller/postgrescluster/pgbouncer.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/internal/controller/postgrescluster/pgbouncer.go b/internal/controller/postgrescluster/pgbouncer.go index 7b9e2c9b75..e9cadd74f6 100644 --- a/internal/controller/postgrescluster/pgbouncer.go +++ b/internal/controller/postgrescluster/pgbouncer.go @@ -622,11 +622,6 @@ func (r *Reconciler) pgbouncerPods(ctx context.Context, cluster *v1beta1.Postgre // [1] https://www.pgbouncer.org/usage.html#shutdown // [2] https://www.pgbouncer.org/usage.html#signals // [3] https://github.com/pgbouncer/pgbouncer/issues/1361 -// -// Note: We don't use RECONNECT because it closes server connections when they are "released" -// according to the pool mode. In transaction mode, this happens after each transaction. In session -// mode, this happens when the client disconnects - so persistent clients may continue hitting -// the old primary until they reconnect. 
func (r *Reconciler) reconcilePGBouncerReconnect( ctx context.Context, cluster *v1beta1.PostgresCluster, instances *observedInstances, From 929a98d57c77beec0d9a9d54062f67c3f54f9dd4 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Tue, 27 Jan 2026 14:51:13 -0300 Subject: [PATCH 6/7] test --- e2e-tests/functions | 1 + .../tests/pgbouncer-failover/00-assert.yaml | 12 ++++ .../00-deploy-operator.yaml | 14 +++++ .../tests/pgbouncer-failover/01-assert.yaml | 16 ++++++ .../pgbouncer-failover/01-create-cluster.yaml | 15 +++++ .../tests/pgbouncer-failover/02-assert.yaml | 17 ++++++ .../02-trigger-switchover.yaml | 55 +++++++++++++++++++ .../tests/pgbouncer-failover/03-cleanup.yaml | 21 +++++++ 8 files changed, 151 insertions(+) create mode 100644 e2e-tests/tests/pgbouncer-failover/00-assert.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/00-deploy-operator.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/01-assert.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/01-create-cluster.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/02-assert.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/02-trigger-switchover.yaml create mode 100644 e2e-tests/tests/pgbouncer-failover/03-cleanup.yaml diff --git a/e2e-tests/functions b/e2e-tests/functions index f27292760d..2a100bea25 100644 --- a/e2e-tests/functions +++ b/e2e-tests/functions @@ -54,6 +54,7 @@ deploy_operator() { disable_telemetry=false fi yq eval '.spec.template.spec.containers[0].image = "'${IMAGE}'"' "${DEPLOY_DIR}/${cw_prefix}operator.yaml" \ + | yq eval '.spec.template.spec.containers[0].imagePullPolicy = "IfNotPresent"' - \ | yq eval '(.spec.template.spec.containers[] | select(.name=="operator") | .env[] | select(.name=="DISABLE_TELEMETRY") | .value) = "'${disable_telemetry}'"' - \ | kubectl -n "${OPERATOR_NS:-$NAMESPACE}" apply -f - } diff --git a/e2e-tests/tests/pgbouncer-failover/00-assert.yaml b/e2e-tests/tests/pgbouncer-failover/00-assert.yaml new file mode 
100644 index 0000000000..14beb86c55 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/00-assert.yaml @@ -0,0 +1,12 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 120 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: percona-postgresql-operator +status: + readyReplicas: 1 + replicas: 1 + updatedReplicas: 1 diff --git a/e2e-tests/tests/pgbouncer-failover/00-deploy-operator.yaml b/e2e-tests/tests/pgbouncer-failover/00-deploy-operator.yaml new file mode 100644 index 0000000000..7faf4da852 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/00-deploy-operator.yaml @@ -0,0 +1,14 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +timeout: 10 +commands: + - script: |- + set -o errexit + set -o xtrace + + source ../../functions + init_temp_dir # do this only in the first TestStep + + deploy_operator + deploy_client + deploy_s3_secrets diff --git a/e2e-tests/tests/pgbouncer-failover/01-assert.yaml b/e2e-tests/tests/pgbouncer-failover/01-assert.yaml new file mode 100644 index 0000000000..d92e1f49c5 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/01-assert.yaml @@ -0,0 +1,16 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 120 +--- +apiVersion: pgv2.percona.com/v2 +kind: PerconaPGCluster +metadata: + name: pgbouncer-failover +status: + postgres: + ready: 2 + size: 2 + pgbouncer: + ready: 2 + size: 2 + state: ready diff --git a/e2e-tests/tests/pgbouncer-failover/01-create-cluster.yaml b/e2e-tests/tests/pgbouncer-failover/01-create-cluster.yaml new file mode 100644 index 0000000000..585b620930 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/01-create-cluster.yaml @@ -0,0 +1,15 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +timeout: 10 +commands: + - script: |- + set -o errexit + set -o xtrace + + source ../../functions + + get_cr \ + | yq eval ' + .spec.proxy.pgBouncer.replicas=2 | + .spec.instances[].replicas=2' - \ + | kubectl -n "${NAMESPACE}" apply -f - diff --git 
a/e2e-tests/tests/pgbouncer-failover/02-assert.yaml b/e2e-tests/tests/pgbouncer-failover/02-assert.yaml new file mode 100644 index 0000000000..cd0f5a0773 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/02-assert.yaml @@ -0,0 +1,17 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 120 +--- +apiVersion: pgv2.percona.com/v2 +kind: PerconaPGCluster +metadata: + name: pgbouncer-failover +status: + postgres: + ready: 2 + size: 2 + pgbouncer: + # PgBouncer should be back to ready state with 2 replicas + ready: 2 + size: 2 + state: ready diff --git a/e2e-tests/tests/pgbouncer-failover/02-trigger-switchover.yaml b/e2e-tests/tests/pgbouncer-failover/02-trigger-switchover.yaml new file mode 100644 index 0000000000..a0d47edbc6 --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/02-trigger-switchover.yaml @@ -0,0 +1,55 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +timeout: 60 +commands: + - script: |- + set -o errexit + set -o xtrace + + source ../../functions + + LEADER=$(kubectl get pods -n "${NAMESPACE}" \ + -l postgres-operator.crunchydata.com/role=primary \ + -o jsonpath='{.items[0].metadata.name}') + + echo "Current leader: $LEADER" + + REPLICA=$(kubectl get pods -n "${NAMESPACE}" \ + -l postgres-operator.crunchydata.com/role=replica \ + -o jsonpath='{.items[0].metadata.name}') + + echo "Replica to promote: $REPLICA" + + kubectl get pods -n "${NAMESPACE}" \ + -l postgres-operator.crunchydata.com/role=pgbouncer \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \ + > /tmp/pgbouncer-pods-before.txt + + echo "PgBouncer pods before switchover:" + cat /tmp/pgbouncer-pods-before.txt + + kubectl exec "$LEADER" -c database -n "${NAMESPACE}" -- \ + patronictl switchover \ + --leader "$LEADER" \ + --candidate "$REPLICA" \ + --force + + echo "Switchover triggered successfully" + + # Wait for PgBouncer pods to be recreated (they should be deleted and recreated) + sleep 15 + + kubectl get pods -n "${NAMESPACE}" \ + -l 
postgres-operator.crunchydata.com/role=pgbouncer \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \ + > /tmp/pgbouncer-pods-after.txt + + echo "PgBouncer pods after switchover:" + cat /tmp/pgbouncer-pods-after.txt + + if diff /tmp/pgbouncer-pods-before.txt /tmp/pgbouncer-pods-after.txt > /dev/null; then + echo "ERROR: PgBouncer pods were NOT recreated after failover!" + exit 1 + else + echo "SUCCESS: PgBouncer pods were recreated after failover" + fi diff --git a/e2e-tests/tests/pgbouncer-failover/03-cleanup.yaml b/e2e-tests/tests/pgbouncer-failover/03-cleanup.yaml new file mode 100644 index 0000000000..c6196ff4cf --- /dev/null +++ b/e2e-tests/tests/pgbouncer-failover/03-cleanup.yaml @@ -0,0 +1,21 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +delete: + - apiVersion: pgv2.percona.com/v2 + kind: PerconaPGCluster + metadata: + name: pgbouncer-failover + - apiVersion: postgres-operator.crunchydata.com/v1beta1 + kind: PostgresCluster + metadata: + name: pgbouncer-failover +commands: + - script: |- + set -o errexit + set -o xtrace + + source ../../functions + + remove_all_finalizers + destroy_operator + timeout: 60 From bc5d95709a21745662280b4874a1e6835816d0e7 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Thu, 29 Jan 2026 18:34:29 -0300 Subject: [PATCH 7/7] fix: do not restart on first run Signed-off-by: Juliana Oliveira --- internal/controller/postgrescluster/pgbouncer.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/internal/controller/postgrescluster/pgbouncer.go b/internal/controller/postgrescluster/pgbouncer.go index e9cadd74f6..e1aad93f1a 100644 --- a/internal/controller/postgrescluster/pgbouncer.go +++ b/internal/controller/postgrescluster/pgbouncer.go @@ -648,6 +648,17 @@ func (r *Reconciler) reconcilePGBouncerReconnect( return nil } + if lastFailoverUID == "" { + // First time seeing this cluster or status field was just added. + // Initialize with current primary UID without triggering pod restart. 
+ log.V(1).Info("Initializing PgBouncer failover tracking", + "currentPrimaryUID", currentPrimaryUID, + "currentPrimaryName", primaryPod.Name) + + cluster.Status.Proxy.PGBouncer.LastFailoverPrimaryUID = currentPrimaryUID + return nil + } + log.Info("Primary changed, triggering PgBouncer failover signal (SIGTERM)", "previousPrimaryUID", lastFailoverUID, "currentPrimaryUID", currentPrimaryUID,