File pacemaker#3878-0001-Fix-libcrmcommon-Add-retries-on-connect-to-avoid-fat.patch of Package pacemaker.41101
From e438946787b9cceec766fa4a721138d3b4e72956 Mon Sep 17 00:00:00 2001
From: Thomas Jones <thomas.jones@ibm.com>
Date: Fri, 30 May 2025 16:40:13 -0400
Subject: [PATCH] Fix: libcrmcommon: Add retries on connect to avoid fatal
errors when sub-daemons communicate. Add pcmk__connect_ipc_retry_conrefused()
and use it where it makes sense. Add a retry loop to
connect_and_send_attrd_request() that retries both the connect and the send.
---
daemons/controld/controld_schedulerd.c | 2 +-
include/crm/common/ipc_internal.h | 4 +++-
lib/common/ipc_attrd.c | 19 ++++++++++++----
lib/common/ipc_client.c | 31 ++++++++++++++++++++++++++
lib/pacemaker/pcmk_cluster_queries.c | 2 +-
5 files changed, 51 insertions(+), 7 deletions(-)
Index: pacemaker-2.1.7+20231219.0f7f88312/daemons/controld/controld_schedulerd.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/daemons/controld/controld_schedulerd.c
+++ pacemaker-2.1.7+20231219.0f7f88312/daemons/controld/controld_schedulerd.c
@@ -197,7 +197,7 @@ new_schedulerd_ipc_connection(void)
pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
- rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
+ rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api, pcmk_ipc_dispatch_main, 3);
if (rc != pcmk_rc_ok) {
crm_err("Error connecting to %s: %s",
pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
Index: pacemaker-2.1.7+20231219.0f7f88312/include/crm/common/ipc_internal.h
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/include/crm/common/ipc_internal.h
+++ pacemaker-2.1.7+20231219.0f7f88312/include/crm/common/ipc_internal.h
@@ -100,7 +100,9 @@ int pcmk__connect_generic_ipc(crm_ipc_t
int pcmk__ipc_fd(crm_ipc_t *ipc, int *fd);
int pcmk__connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type,
int attempts);
-
+int pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+ enum pcmk_ipc_dispatch dispatch_type,
+ int attempts);
/*
* Server-related
*/
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_attrd.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/common/ipc_attrd.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_attrd.c
@@ -171,23 +171,33 @@ destroy_api(pcmk_ipc_api_t *api)
static int
connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
{
+ static const int max_retries = 5;
+ int remaining_attempts = max_retries;
int rc = pcmk_rc_ok;
- rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, 5);
- if (rc != pcmk_rc_ok) {
- crm_err("Could not connect to %s: %s",
- pcmk_ipc_name(api, true), pcmk_rc_str(rc));
- return rc;
- }
+ // If attrd is killed and is being restarted, we will temporarily get
+ // ECONNREFUSED on connect if it is already dead, or ENOTCONN if it died
+ // after we connected to it. Wait a bit and retry in those cases.
+ do {
+ if (rc == ENOTCONN || rc == ECONNREFUSED) {
+ sleep(max_retries - remaining_attempts);
+ }
+ rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, remaining_attempts);
+ if (rc != pcmk_rc_ok) {
+ crm_err("Could not connect to %s: %s",
+ pcmk_ipc_name(api, true), pcmk_rc_str(rc));
- rc = pcmk__send_ipc_request(api, request);
- if (rc != pcmk_rc_ok) {
- crm_err("Could not send request to %s: %s",
- pcmk_ipc_name(api, true), pcmk_rc_str(rc));
- return rc;
- }
+ } else {
+ rc = pcmk__send_ipc_request(api, request);
+ if (rc != pcmk_rc_ok) {
+ crm_err("Could not send request to %s: %s",
+ pcmk_ipc_name(api, true), pcmk_rc_str(rc));
+ }
+ }
+ remaining_attempts--;
+ } while ((rc == ENOTCONN || rc == ECONNREFUSED) && remaining_attempts >= 0);
- return pcmk_rc_ok;
+ return rc;
}
static int
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_client.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/common/ipc_client.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_client.c
@@ -489,6 +489,37 @@ connect_without_main_loop(pcmk_ipc_api_t
/*!
* \internal
+ * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors
+ * and ECONNREFUSED)
+ *
+ * \param[in,out] api IPC API instance
+ * \param[in] dispatch_type How IPC replies should be dispatched
+ * \param[in] attempts How many times to try (in case of soft error or ECONNREFUSED)
+ *
+ * \return Standard Pacemaker return code
+ */
+int
+pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+ enum pcmk_ipc_dispatch dispatch_type,
+ int attempts)
+{
+ int remaining = attempts;
+ int rc = pcmk_rc_ok;
+
+ do {
+ if (rc == ECONNREFUSED) {
+ pcmk__sleep_ms((attempts - remaining) * 500);
+ }
+ rc = pcmk__connect_ipc(api, dispatch_type, remaining);
+ remaining--;
+ } while (rc == ECONNREFUSED && remaining >= 0);
+
+ return rc;
+}
+
+
+/*!
+ * \internal
* \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors)
*
* \param[in,out] api IPC API instance
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/pacemaker/pcmk_cluster_queries.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/pacemaker/pcmk_cluster_queries.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/pacemaker/pcmk_cluster_queries.c
@@ -361,7 +361,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_
pcmk_register_ipc_callback(api, cb, data);
}
- rc = pcmk__connect_ipc(api, dispatch_type, 5);
+ rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
if (rc != pcmk_rc_ok) {
if (rc == EREMOTEIO) {
data->pcmkd_state = pcmk_pacemakerd_state_remote;