File bsc#1196340-0001-Fix-scheduler-do-not-enforce-resource-stop-on-a-rejo.patch of Package pacemaker.26927
From 2a1d446ddf034136c9af0b816d0dd2bd3cfe13fb Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Mon, 9 May 2022 09:05:45 +0200
Subject: [PATCH 1/9] Fix: scheduler: do not enforce resource stop on a
rejoined node that was the target of a failed migrate_to
Previously, given the following scenario:
- rscA failed to migrate from node1 to node2 with a failed migrate_to on
node1
- rscA failed to stop on node2
- node2 got fenced and rscA recovered on node1
- node2 rejoined
rscA would then be considered "multiple-active", and an unnecessary
full recovery would be issued.
With this commit, when node2 rejoins, rather than enforcing a likely
unnecessary stop of rscA on it, we wait for the probe of rscA there to
complete; then we will have a better idea of what we should do with it.
---
lib/pengine/unpack.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 426022013..a8d8f3708 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2593,6 +2593,41 @@ find_lrm_op(const char *resource, const char *op, const char *node, const char *
return xml;
}
+/* Look up the XML_LRM_TAG_RESOURCE element with id rsc_id below the node_state
+ * whose uname is node_name; NULL if the XPath would not fit in the buffer. */
+static xmlNode *
+find_lrm_resource(const char *rsc_id, const char *node_name,
+                  pe_working_set_t *data_set)
+{
+    char xpath[STATUS_PATH_MAX];
+    int len = snprintf(xpath, sizeof(xpath),
+                       "//node_state[@uname='%s']"
+                       "//" XML_LRM_TAG_RESOURCE "[@id='%s']",
+                       node_name, rsc_id);
+
+    /* snprintf() returns the untruncated length, so detect overflow here */
+    CRM_LOG_ASSERT((len > 0) && ((size_t) len < sizeof(xpath)));
+    if ((len <= 0) || ((size_t) len >= sizeof(xpath))) {
+        return NULL;
+    }
+    return get_xpath_object(xpath, data_set->input, LOG_DEBUG);
+}
+
+/* Tell whether the state of rsc_id is unknown on node_name, which is the case
+ * when the resource has no lrm_rsc_op history recorded on that node.
+ */
+static bool
+unknown_on_node(const char *rsc_id, const char *node_name,
+                pe_working_set_t *data_set)
+{
+    xmlNode *history = find_lrm_resource(rsc_id, node_name, data_set);
+
+    if (history == NULL) {
+        return true;
+    }
+    return first_named_child(history, XML_LRM_TAG_RSC_OP) == NULL;
+}
+
static int
pe__call_id(xmlNode *op_xml)
{
@@ -2764,7 +2799,13 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
source, TRUE, data_set);
target_migrate_from_id = pe__call_id(target_migrate_from);
- if ((target_stop == NULL) || (target_stop_id < target_migrate_from_id)) {
+ if (/* If the resource state is unknown on the target, it will likely be
+ * probed there.
+ * Don't just consider it running there. We will get back here anyway in
+ * case the probe detects it's running there.
+ */
+ !unknown_on_node(rsc->id, target, data_set)
+ && ((target_stop == NULL) || (target_stop_id < target_migrate_from_id))) {
/* There was no stop on the target, or a stop that happened before a
* migrate_from, so assume the resource is still active on the target
* (if it is up).
--
2.35.3