File bsc#1196340-0001-Fix-scheduler-do-not-enforce-resource-stop-on-a-rejo.patch of Package pacemaker.34782
From 2a1d446ddf034136c9af0b816d0dd2bd3cfe13fb Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Mon, 9 May 2022 09:05:45 +0200
Subject: [PATCH 1/9] Fix: scheduler: do not enforce resource stop on a
 rejoined node that was the target of a failed migrate_to
Previously, given the following scenario:
- rscA failed to migrate from node1 to node2 with a failed migrate_to on
node1
- rscA failed to stop on node2
- node2 got fenced and rscA recovered on node1
- node2 rejoined
rscA would be considered "multiple-active" and an unnecessary
full recovery would be issued.
With this commit, when node2 rejoins, rather than enforcing a likely
unnecessary stop of rscA on it, we wait for the probe of it to
complete; then we will have a better idea about what we should do with it.
---
 lib/pengine/unpack.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 426022013..a8d8f3708 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2593,6 +2593,41 @@ find_lrm_op(const char *resource, const char *op, const char *node, const char *
     return xml;
 }
 
+/* Find the XML_LRM_TAG_RESOURCE entry for rsc_id in node_name's node_state */
+static xmlNode *
+find_lrm_resource(const char *rsc_id, const char *node_name,
+                  pe_working_set_t *data_set)
+{
+    char xpath[STATUS_PATH_MAX];
+    int len = snprintf(xpath, sizeof(xpath),
+                       "//node_state[@uname='%s']"
+                       "//" XML_LRM_TAG_RESOURCE "[@id='%s']",
+                       node_name, rsc_id);
+
+    /* Never search with a failed or truncated XPath; report no match instead */
+    if ((len <= 0) || ((size_t) len >= sizeof(xpath))) {
+        CRM_LOG_ASSERT((len > 0) && ((size_t) len < sizeof(xpath)));
+        return NULL;
+    }
+
+    return get_xpath_object(xpath, data_set->input, LOG_DEBUG);
+}
+
+/* Check whether a resource has no recorded operation history on a node */
+static bool
+unknown_on_node(const char *rsc_id, const char *node_name,
+                pe_working_set_t *data_set)
+{
+    xmlNode *history = find_lrm_resource(rsc_id, node_name, data_set);
+
+    if (history == NULL) {
+        return true;    /* no resource history entry found for this node */
+    }
+
+    /* An entry without any XML_LRM_TAG_RSC_OP child also counts as unknown */
+    return first_named_child(history, XML_LRM_TAG_RSC_OP) == NULL;
+}
+
 static int
 pe__call_id(xmlNode *op_xml)
 {
@@ -2764,7 +2799,13 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
                                       source, TRUE, data_set);
     target_migrate_from_id = pe__call_id(target_migrate_from);
 
-    if ((target_stop == NULL) || (target_stop_id < target_migrate_from_id)) {
+    if (/* If the resource state is unknown on the target, it will likely be
+         * probed there.
+         * Don't just consider it running there. We will get back here anyway in
+         * case the probe detects it's running there.
+         */
+        !unknown_on_node(rsc->id, target, data_set)
+        && ((target_stop == NULL) || (target_stop_id < target_migrate_from_id))) {
         /* There was no stop on the target, or a stop that happened before a
          * migrate_from, so assume the resource is still active on the target
          * (if it is up).
-- 
2.35.3