File pacemaker#3878-0001-Fix-libcrmcommon-Add-retries-on-connect-to-avoid-fat.patch of Package pacemaker.41101

From e438946787b9cceec766fa4a721138d3b4e72956 Mon Sep 17 00:00:00 2001
From: Thomas Jones <thomas.jones@ibm.com>
Date: Fri, 30 May 2025 16:40:13 -0400
Subject: [PATCH] Fix: libcrmcommon: Add retries on connect to avoid fatal
 errors when sub-daemons communicate. Add pcmk__connect_ipc_retry_conrefused()
 and use it where it makes sense. Add retry loop to
 connect_and_send_attrd_request() that retries connect and send.

---
 daemons/controld/controld_schedulerd.c |  2 +-
 include/crm/common/ipc_internal.h      |  4 +++-
 lib/common/ipc_attrd.c                 | 19 ++++++++++++----
 lib/common/ipc_client.c                | 31 ++++++++++++++++++++++++++
 lib/pacemaker/pcmk_cluster_queries.c   |  2 +-
 5 files changed, 51 insertions(+), 7 deletions(-)

Index: pacemaker-2.1.7+20231219.0f7f88312/daemons/controld/controld_schedulerd.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/daemons/controld/controld_schedulerd.c
+++ pacemaker-2.1.7+20231219.0f7f88312/daemons/controld/controld_schedulerd.c
@@ -197,7 +197,7 @@ new_schedulerd_ipc_connection(void)
 
     pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
 
-    rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
+    rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api, pcmk_ipc_dispatch_main, 3);
     if (rc != pcmk_rc_ok) {
         crm_err("Error connecting to %s: %s",
                 pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
Index: pacemaker-2.1.7+20231219.0f7f88312/include/crm/common/ipc_internal.h
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/include/crm/common/ipc_internal.h
+++ pacemaker-2.1.7+20231219.0f7f88312/include/crm/common/ipc_internal.h
@@ -100,7 +100,9 @@ int pcmk__connect_generic_ipc(crm_ipc_t
 int pcmk__ipc_fd(crm_ipc_t *ipc, int *fd);
 int pcmk__connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type,
                       int attempts);
-
+int pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+                                       enum pcmk_ipc_dispatch dispatch_type,
+                                       int attempts);
 /*
  * Server-related
  */
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_attrd.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/common/ipc_attrd.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_attrd.c
@@ -171,23 +171,33 @@ destroy_api(pcmk_ipc_api_t *api)
 static int
 connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
 {
+    static const int max_retries = 5;
+    int remaining_attempts = max_retries;
     int rc = pcmk_rc_ok;
 
-    rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, 5);
-    if (rc != pcmk_rc_ok) {
-        crm_err("Could not connect to %s: %s",
-                pcmk_ipc_name(api, true), pcmk_rc_str(rc));
-        return rc;
-    }
+    // If attrd is being restarted, we temporarily get ECONNREFUSED while it is
+    // down, or ENOTCONN if it died after we connected. Back off and retry in
+    // those cases, but never call pcmk__connect_ipc() with 0 attempts left.
+    do {
+        if (rc == ENOTCONN || rc == ECONNREFUSED) {
+            sleep(max_retries - remaining_attempts); // back off 1s, 2s, ...
+        }
+        rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, remaining_attempts);
+        if (rc != pcmk_rc_ok) {
+            crm_err("Could not connect to %s: %s",
+                    pcmk_ipc_name(api, true), pcmk_rc_str(rc));
 
-    rc = pcmk__send_ipc_request(api, request);
-    if (rc != pcmk_rc_ok) {
-        crm_err("Could not send request to %s: %s",
-                pcmk_ipc_name(api, true), pcmk_rc_str(rc));
-        return rc;
-    }
+        } else {
+            rc = pcmk__send_ipc_request(api, request);
+            if (rc != pcmk_rc_ok) {
+                crm_err("Could not send request to %s: %s",
+                        pcmk_ipc_name(api, true), pcmk_rc_str(rc));
+            }
+        }
+        remaining_attempts--;
+    } while ((rc == ENOTCONN || rc == ECONNREFUSED) && remaining_attempts > 0);
 
-    return pcmk_rc_ok;
+    return rc;
 }
 
 static int
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_client.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/common/ipc_client.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/common/ipc_client.c
@@ -489,6 +489,37 @@ connect_without_main_loop(pcmk_ipc_api_t
 
 /*!
  * \internal
+ * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors
+ *        and ECONNREFUSED)
+ *
+ * \param[in,out] api            IPC API instance
+ * \param[in]     dispatch_type  How IPC replies should be dispatched
+ * \param[in]     attempts       How many times to try (in case of soft error)
+ *
+ * \return Standard Pacemaker return code
+ */
+int
+pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
+                                   enum pcmk_ipc_dispatch dispatch_type,
+                                   int attempts)
+{
+    int remaining = attempts;
+    int rc = pcmk_rc_ok;
+
+    do {
+        if (rc == ECONNREFUSED) {
+            pcmk__sleep_ms((attempts - remaining) * 500); // linear backoff
+        }
+        rc = pcmk__connect_ipc(api, dispatch_type, remaining);
+        remaining--;
+        // Stop at 0 so the budget is never exceeded or passed on as 0 attempts
+    } while ((rc == ECONNREFUSED) && (remaining > 0));
+    return rc;
+}
+
+
+/*!
+ * \internal
  * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors)
  *
  * \param[in,out] api            IPC API instance
Index: pacemaker-2.1.7+20231219.0f7f88312/lib/pacemaker/pcmk_cluster_queries.c
===================================================================
--- pacemaker-2.1.7+20231219.0f7f88312.orig/lib/pacemaker/pcmk_cluster_queries.c
+++ pacemaker-2.1.7+20231219.0f7f88312/lib/pacemaker/pcmk_cluster_queries.c
@@ -361,7 +361,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_
         pcmk_register_ipc_callback(api, cb, data);
     }
 
-    rc = pcmk__connect_ipc(api, dispatch_type, 5);
+    rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
     if (rc != pcmk_rc_ok) {
         if (rc == EREMOTEIO) {
             data->pcmkd_state = pcmk_pacemakerd_state_remote;
openSUSE Build Service is sponsored by