File bsc#1177212-0001-Fix-scheduler-don-t-schedule-a-dangling-migration-st.patch of Package pacemaker.26925
From e8704c172c5e68aae00abec68f153d0f74159650 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 18 Aug 2020 14:15:31 -0500
Subject: [PATCH 1/2] Fix: scheduler: don't schedule a dangling migration stop
 if one already occurred

See the 2020-08-18 thread on the users@clusterlabs.org list
"why is node fenced ?". If a node had a dangling migration, later had a
successful stop to recover, and is now shutting down, the scheduler would
schedule another stop, which would be unrunnable due to the shutdown, causing
unnecessary fencing.
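
As a rough illustration (not pacemaker code; the struct, field names, and
call-ID values below are invented), the decision being fixed boils down to
comparing resource-history call IDs on the source node: a dangling-migration
stop should only be scheduled when no successful stop, start, or migrate_from
is newer than the failed migrate_to.

    /* Hypothetical, self-contained model of the check */
    #include <stdbool.h>
    #include <stdio.h>

    struct source_history {
        int migrate_to;   /* call ID of the failed migrate_to */
        int stop;         /* newest successful stop (-1 if none) */
        int start;        /* newest successful start (-1 if none) */
        int migrate_from; /* newest successful migrate_from (-1 if none) */
    };

    static bool needs_dangling_stop(const struct source_history *h)
    {
        /* any newer successful operation means the resource was already
         * recovered on the source, so no extra stop should be scheduled */
        return !(h->stop > h->migrate_to
                 || h->start > h->migrate_to
                 || h->migrate_from > h->migrate_to);
    }

    int main(void)
    {
        /* scenario from the report: migrate_to failed (call 10), and a
         * later successful stop (call 12) already recovered the resource */
        struct source_history h = { 10, 12, -1, -1 };
        printf("schedule dangling stop? %s\n",
               needs_dangling_stop(&h) ? "yes" : "no");   /* prints "no" */
        return 0;
    }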
---
 lib/pengine/unpack.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 3ea6b2d03..a09061f0d 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2627,6 +2627,17 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
     }
 }
 
+// Is there an action_name in node_name's rsc history newer than call_id?
+static bool
+newer_op(pe_resource_t *rsc, const char *action_name, const char *node_name,
+         int call_id, pe_working_set_t *data_set)
+{
+    xmlNode *action = find_lrm_op(rsc->id, action_name, node_name, NULL, TRUE,
+                                  data_set);
+
+    return pe__call_id(action) > call_id;
+}
+
 static void
 unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
                           pe_working_set_t *data_set)
@@ -2657,7 +2668,7 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
     target_migrate_from_id = pe__call_id(target_migrate_from);
 
     if ((target_stop == NULL) || (target_stop_id < target_migrate_from_id)) {
-        /* There was no stop on the source, or a stop that happened before a
+        /* There was no stop on the target, or a stop that happened before a
          * migrate_from, so assume the resource is still active on the target
          * (if it is up).
          */
@@ -2675,24 +2686,19 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
      * scheduled or attempted).
      *
      * That means this could be a "dangling" migration. But first, check
-     * whether there is a newer migrate_from or start on the source node --
-     * it's possible the failed migration was followed by a successful
-     * full restart or migration in the reverse direction, in which case we
-     * don't want to force it to stop.
+     * whether there is a newer successful stop, start, or migrate_from on
+     * the source node -- it's possible the failed migration was followed by
+     * a successful stop, full restart, or migration in the reverse
+     * direction, in which case we don't want to force a stop.
      */
-    xmlNode *source_migrate_from = NULL;
-    xmlNode *source_start = NULL;
     int source_migrate_to_id = pe__call_id(xml_op);
 
-    source_migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, source,
-                                      NULL, TRUE, data_set);
-    if (pe__call_id(source_migrate_from) > source_migrate_to_id) {
-        return;
-    }
-
-    source_start = find_lrm_op(rsc->id, CRMD_ACTION_START, source, NULL,
-                               TRUE, data_set);
-    if (pe__call_id(source_start) > source_migrate_to_id) {
+    if (newer_op(rsc, CRMD_ACTION_MIGRATED, source, source_migrate_to_id,
+                 data_set)
+        || newer_op(rsc, CRMD_ACTION_START, source, source_migrate_to_id,
+                    data_set)
+        || newer_op(rsc, CRMD_ACTION_STOP, source, source_migrate_to_id,
+                    data_set)) {
         return;
     }
 
--
2.26.2