File bsc#1198767-0002-Fix-scheduler-Do-not-fence-a-pending-node-that-doesn.patch of Package pacemaker.38495
From 00dd1989fdad9ceafbc2e385d5171ddb18b1dda9 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 21 Feb 2023 17:08:11 +0100
Subject: [PATCH 2/3] Fix: scheduler: Do not fence a pending node that doesn't
have an uname in node state yet
If a joining peer makes the cluster acquire quorum from corosync before
it has joined the CPG membership of pacemaker-controld, it's possible
that the created node_state entry doesn't have a uname yet. Previously
in that case, the node would be considered `UNCLEAN (offline)` and get
unnecessarily fenced before it got a chance to join the CPG
membership.
The fix resolves that by recognizing the node as `pending` and waiting
for it to join the CPG.
---
lib/pengine/unpack.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
Index: pacemaker-2.0.5+20201202.ba59be712/lib/pengine/unpack.c
===================================================================
--- pacemaker-2.0.5+20201202.ba59be712.orig/lib/pengine/unpack.c
+++ pacemaker-2.0.5+20201202.ba59be712/lib/pengine/unpack.c
@@ -1156,10 +1156,17 @@ unpack_status(xmlNode * status, pe_worki
this_node = pe_find_node_any(data_set->nodes, id, uname);
if (uname == NULL) {
- /* error */
- continue;
+ /* If a joining peer makes the cluster acquire quorum from corosync
+ * before it has joined the CPG membership of pacemaker-controld,
+ * it's possible that the created node_state entry doesn't have a uname
+ * yet. We should recognize the node as `pending` and wait for it to
+ * join the CPG.
+ */
+ crm_trace("Handling " XML_CIB_TAG_STATE " entry with id=\"%s\" without "
+ XML_ATTR_UNAME, id);
+ }
- } else if (this_node == NULL) {
+ if (this_node == NULL) {
pcmk__config_warn("Ignoring recorded node status for '%s' "
"because no longer in configuration", uname);
continue;
@@ -1174,7 +1181,7 @@ unpack_status(xmlNode * status, pe_worki
continue;
}
- crm_trace("Processing node id=%s, uname=%s", id, uname);
+ crm_trace("Processing node id=%s, uname=%s", id, crm_str(uname));
/* Mark the node as provisionally clean
* - at least we have seen it in the current cluster's lifetime
@@ -1185,19 +1192,19 @@ unpack_status(xmlNode * status, pe_worki
add_node_attrs(attrs, this_node, TRUE, data_set);
if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) {
- crm_info("Node %s is in standby-mode", this_node->details->uname);
+ crm_info("Node %s is in standby-mode", crm_str(this_node->details->uname));
this_node->details->standby = TRUE;
}
if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance"))) {
- crm_info("Node %s is in maintenance-mode", this_node->details->uname);
+ crm_info("Node %s is in maintenance-mode", crm_str(this_node->details->uname));
this_node->details->maintenance = TRUE;
}
resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY);
if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) {
crm_warn("ignoring %s attribute on node %s, disabling resource discovery is not allowed on cluster nodes",
- XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname);
+ XML_NODE_ATTR_RSC_DISCOVERY, crm_str(this_node->details->uname));
}
crm_trace("determining node state");