File pacemaker-libcrmcommon-avoid-evicting-IPC-client.patch of Package pacemaker.14737
commit 5d94da7af947810ca4b12dc1d9dce18a7d638f73
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Mon May 1 17:39:02 2017 -0500
Fix: libcrmcommon: avoid evicting IPC client if messages spike briefly
Before, an IP server would evict a client if its event queue grew to 500
messages. Now, we avoid evicting if the backlog is new (a quick spike of many
messages that immediately crosses the threshold).
diff --git a/include/crm/common/ipcs.h b/include/crm/common/ipcs.h
index 2fec93178..ba1ccefc7 100644
--- a/include/crm/common/ipcs.h
+++ b/include/crm/common/ipcs.h
@@ -73,6 +73,8 @@ struct crm_client_s {
char *name;
char *user;
+ /* Provided for server use (not used by library) */
+ /* @TODO merge options, flags, and kind (reserving lower bits for server) */
long long options;
int request_id;
@@ -80,7 +82,7 @@ struct crm_client_s {
void *userdata;
int event_timer;
- GList *event_queue;
+ GList *event_queue; /* @TODO use GQueue instead */
/* Depending on the value of kind, only some of the following
* will be populated/valid
@@ -91,6 +93,7 @@ struct crm_client_s {
struct crm_remote_s *remote; /* TCP/TLS */
+ unsigned int backlog_len; /* IPC queue length after last flush */
};
extern GHashTable *client_connections;
diff --git a/lib/common/ipc.c b/lib/common/ipc.c
index c9e3da2bb..d32e37390 100644
--- a/lib/common/ipc.c
+++ b/lib/common/ipc.c
@@ -37,6 +37,9 @@
#define PCMK_IPC_VERSION 1
+/* Evict clients whose event queue grows this large */
+#define PCMK_IPC_MAX_QUEUE 500
+
struct crm_ipc_response_header {
struct qb_ipc_response_header qb;
uint32_t size_uncompressed;
@@ -523,24 +526,44 @@ crm_ipcs_flush_events(crm_client_t * c)
}
queue_len -= sent;
- if (sent > 0 || c->event_queue) {
+ if (sent > 0 || queue_len) {
crm_trace("Sent %d events (%d remaining) for %p[%d]: %s (%lld)",
sent, queue_len, c->ipcs, c->pid,
pcmk_strerror(rc < 0 ? rc : 0), (long long) rc);
}
- if (c->event_queue) {
- if (queue_len % 100 == 0 && queue_len > 99) {
- crm_warn("Event queue for %p[%d] has grown to %d", c->ipcs, c->pid, queue_len);
+ if (queue_len) {
+ /* We want to allow clients to briefly fall behind on processing
+ * incoming messages, but drop completely unresponsive clients so the
+ * connection doesn't consume resources indefinitely.
+ *
+ * @TODO It is possible that the queue could reasonably grow large in a
+ * short time. An example is a reprobe of hundreds of resources on many
+ * nodes resulting in a surge of CIB replies to the crmd. We could
+ * possibly give cluster daemons a higher threshold here, and/or prevent
+ * such a surge by throttling LRM history writes in the crmd.
+ */
- } else if (queue_len > 500) {
- crm_err("Evicting slow client %p[%d]: event queue reached %d entries",
- c->ipcs, c->pid, queue_len);
- qb_ipcs_disconnect(c->ipcs);
- return rc;
+ if (queue_len > PCMK_IPC_MAX_QUEUE) {
+ if ((c->backlog_len <= 1) || (queue_len < c->backlog_len)) {
+ /* Don't evict for a new or shrinking backlog */
+ crm_warn("Client with process ID %u has a backlog of %u messages "
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
+ } else {
+ crm_err("Evicting client with process ID %u due to backlog of %u messages "
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
+ c->backlog_len = 0;
+ qb_ipcs_disconnect(c->ipcs);
+ return rc;
+ }
}
+
+ c->backlog_len = queue_len;
delay_next_flush(c, queue_len);
+ } else {
+ /* Event queue is empty, there is no backlog */
+ c->backlog_len = 0;
}
return rc;