Package not found: home:mcalabkova:branches:devel:languages:python/cloud-init

File pacemaker-libcrmcommon-avoid-evicting-IPC-client.patch of Package pacemaker.14737

commit 5d94da7af947810ca4b12dc1d9dce18a7d638f73
Author: Ken Gaillot <kgaillot@redhat.com>
Date:   Mon May 1 17:39:02 2017 -0500

    Fix: libcrmcommon: avoid evicting IPC client if messages spike briefly
    
    Before, an IP server would evict a client if its event queue grew to 500
    messages. Now, we avoid evicting if the backlog is new (a quick spike of many
    messages that immediately crosses the threshold).

diff --git a/include/crm/common/ipcs.h b/include/crm/common/ipcs.h
index 2fec93178..ba1ccefc7 100644
--- a/include/crm/common/ipcs.h
+++ b/include/crm/common/ipcs.h
@@ -73,6 +73,8 @@ struct crm_client_s {
     char *name;
     char *user;
 
+    /* Provided for server use (not used by library) */
+    /* @TODO merge options, flags, and kind (reserving lower bits for server) */
     long long options;
 
     int request_id;
@@ -80,7 +82,7 @@ struct crm_client_s {
     void *userdata;
 
     int event_timer;
-    GList *event_queue;
+    GList *event_queue; /* @TODO use GQueue instead */
 
     /* Depending on the value of kind, only some of the following
      * will be populated/valid
@@ -91,6 +93,7 @@ struct crm_client_s {
 
     struct crm_remote_s *remote;        /* TCP/TLS */
 
+    unsigned int backlog_len; /* IPC queue length after last flush */
 };
 
 extern GHashTable *client_connections;
diff --git a/lib/common/ipc.c b/lib/common/ipc.c
index c9e3da2bb..d32e37390 100644
--- a/lib/common/ipc.c
+++ b/lib/common/ipc.c
@@ -37,6 +37,9 @@
 
 #define PCMK_IPC_VERSION 1
 
+/* Evict clients whose event queue grows this large */
+#define PCMK_IPC_MAX_QUEUE 500
+
 struct crm_ipc_response_header {
     struct qb_ipc_response_header qb;
     uint32_t size_uncompressed;
@@ -523,24 +526,44 @@ crm_ipcs_flush_events(crm_client_t * c)
     }
 
     queue_len -= sent;
-    if (sent > 0 || c->event_queue) {
+    if (sent > 0 || queue_len) {
         crm_trace("Sent %d events (%d remaining) for %p[%d]: %s (%lld)",
                   sent, queue_len, c->ipcs, c->pid,
                   pcmk_strerror(rc < 0 ? rc : 0), (long long) rc);
     }
 
-    if (c->event_queue) {
-        if (queue_len % 100 == 0 && queue_len > 99) {
-            crm_warn("Event queue for %p[%d] has grown to %d", c->ipcs, c->pid, queue_len);
+    if (queue_len) {
+        /* We want to allow clients to briefly fall behind on processing
+         * incoming messages, but drop completely unresponsive clients so the
+         * connection doesn't consume resources indefinitely.
+         *
+         * @TODO It is possible that the queue could reasonably grow large in a
+         * short time. An example is a reprobe of hundreds of resources on many
+         * nodes resulting in a surge of CIB replies to the crmd. We could
+         * possibly give cluster daemons a higher threshold here, and/or prevent
+         * such a surge by throttling LRM history writes in the crmd.
+         */
 
-        } else if (queue_len > 500) {
-            crm_err("Evicting slow client %p[%d]: event queue reached %d entries",
-                    c->ipcs, c->pid, queue_len);
-            qb_ipcs_disconnect(c->ipcs);
-            return rc;
+        if (queue_len > PCMK_IPC_MAX_QUEUE) {
+            if ((c->backlog_len <= 1) || (queue_len < c->backlog_len)) {
+                /* Don't evict for a new or shrinking backlog */
+                crm_warn("Client with process ID %u has a backlog of %u messages "
+                         CRM_XS " %p", c->pid, queue_len, c->ipcs);
+            } else {
+                crm_err("Evicting client with process ID %u due to backlog of %u messages "
+                         CRM_XS " %p", c->pid, queue_len, c->ipcs);
+                c->backlog_len = 0;
+                qb_ipcs_disconnect(c->ipcs);
+                return rc;
+            }
         }
+
+        c->backlog_len = queue_len;
         delay_next_flush(c, queue_len);
 
+    } else {
+        /* Event queue is empty, there is no backlog */
+        c->backlog_len = 0;
     }
 
     return rc;
openSUSE Build Service is sponsored by