File pacemaker#3878-0001-Fix-libcrmcommon-Add-retries-on-connect-to-avoid-fat.patch of Package pacemaker
From e438946787b9cceec766fa4a721138d3b4e72956 Mon Sep 17 00:00:00 2001
From: Thomas Jones <thomas.jones@ibm.com>
Date: Fri, 30 May 2025 16:40:13 -0400
Subject: [PATCH] Fix: libcrmcommon: Add retries on connect to avoid fatal
errors when sub-daemons communicate. Add pcmk__connect_ipc_retry_conrefused()
and use it where it makes sense. Add a retry loop to
connect_and_send_attrd_request() that retries both connect and send.
---
daemons/controld/controld_schedulerd.c | 2 +-
include/crm/common/ipc_internal.h | 4 +++-
lib/common/ipc_attrd.c | 19 ++++++++++++----
lib/common/ipc_client.c | 31 ++++++++++++++++++++++++++
lib/pacemaker/pcmk_cluster_queries.c | 2 +-
5 files changed, 51 insertions(+), 7 deletions(-)
Index: pacemaker-2.1.10+20250718.fdf796ebc8/daemons/controld/controld_schedulerd.c
===================================================================
--- pacemaker-2.1.10+20250718.fdf796ebc8.orig/daemons/controld/controld_schedulerd.c
+++ pacemaker-2.1.10+20250718.fdf796ebc8/daemons/controld/controld_schedulerd.c
@@ -197,7 +197,7 @@ new_schedulerd_ipc_connection(void)
pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
- rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
+ rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api, pcmk_ipc_dispatch_main, 3);
if (rc != pcmk_rc_ok) {
crm_err("Error connecting to %s: %s",
pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
Index: pacemaker-2.1.10+20250718.fdf796ebc8/include/crm/common/ipc_internal.h
===================================================================
--- pacemaker-2.1.10+20250718.fdf796ebc8.orig/include/crm/common/ipc_internal.h
+++ pacemaker-2.1.10+20250718.fdf796ebc8/include/crm/common/ipc_internal.h
@@ -100,7 +100,9 @@ int pcmk__connect_generic_ipc(crm_ipc_t
int pcmk__ipc_fd(crm_ipc_t *ipc, int *fd);
int pcmk__connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type,
int attempts);
-
+int pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+ enum pcmk_ipc_dispatch dispatch_type,
+ int attempts);
/*
* Server-related
*/
Index: pacemaker-2.1.10+20250718.fdf796ebc8/lib/common/ipc_attrd.c
===================================================================
--- pacemaker-2.1.10+20250718.fdf796ebc8.orig/lib/common/ipc_attrd.c
+++ pacemaker-2.1.10+20250718.fdf796ebc8/lib/common/ipc_attrd.c
@@ -152,6 +152,8 @@ create_attrd_op(const char *user_name)
static int
connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
{
+ static const int max_retries = 5;
+ int remaining_attempts = max_retries;
int rc = pcmk_rc_ok;
bool created_api = false;
@@ -163,10 +165,19 @@ connect_and_send_attrd_request(pcmk_ipc_
created_api = true;
}
- rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, 5);
- if (rc == pcmk_rc_ok) {
- rc = pcmk__send_ipc_request(api, request);
- }
+ // If attrd has been killed and is being restarted, we temporarily get
+ // ECONNREFUSED on connect while it is down, or ENOTCONN if it died after
+ // we connected to it. Wait a bit and retry in either case.
+ do {
+ if (rc == ENOTCONN || rc == ECONNREFUSED) {
+ sleep(max_retries - remaining_attempts);
+ }
+ rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, remaining_attempts);
+ if (rc == pcmk_rc_ok) {
+ rc = pcmk__send_ipc_request(api, request);
+ }
+ remaining_attempts--;
+ } while ((rc == ENOTCONN || rc == ECONNREFUSED) && remaining_attempts >= 0);
if (created_api) {
pcmk_free_ipc_api(api);
Index: pacemaker-2.1.10+20250718.fdf796ebc8/lib/common/ipc_client.c
===================================================================
--- pacemaker-2.1.10+20250718.fdf796ebc8.orig/lib/common/ipc_client.c
+++ pacemaker-2.1.10+20250718.fdf796ebc8/lib/common/ipc_client.c
@@ -490,6 +490,37 @@ connect_without_main_loop(pcmk_ipc_api_t
/*!
* \internal
+ * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors
+ * and ECONNREFUSED)
+ *
+ * \param[in,out] api IPC API instance
+ * \param[in] dispatch_type How IPC replies should be dispatched
+ * \param[in] attempts How many times to try (in case of soft error)
+ *
+ * \return Standard Pacemaker return code
+ */
+int
+pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+                                   enum pcmk_ipc_dispatch dispatch_type,
+                                   int attempts)
+{
+    int remaining = attempts;   // attempt budget handed to the next connect
+    int rc = pcmk_rc_ok;
+
+    do {
+        if (rc == ECONNREFUSED) {
+            pcmk__sleep_ms((attempts - remaining) * 500); // growing delay
+        }
+        rc = pcmk__connect_ipc(api, dispatch_type, remaining);
+        remaining--;
+    } while ((rc == ECONNREFUSED) && (remaining > 0)); // never pass 0 attempts
+
+    return rc;
+}
+
+
+/*!
+ * \internal
* \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors)
*
* \param[in,out] api IPC API instance
Index: pacemaker-2.1.10+20250718.fdf796ebc8/lib/pacemaker/pcmk_cluster_queries.c
===================================================================
--- pacemaker-2.1.10+20250718.fdf796ebc8.orig/lib/pacemaker/pcmk_cluster_queries.c
+++ pacemaker-2.1.10+20250718.fdf796ebc8/lib/pacemaker/pcmk_cluster_queries.c
@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_
pcmk_register_ipc_callback(api, cb, data);
}
- rc = pcmk__connect_ipc(api, dispatch_type, 5);
+ rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
if (rc != pcmk_rc_ok) {
if (rc == EREMOTEIO) {
data->pcmkd_state = pcmk_pacemakerd_state_remote;