File bsc#1130122-0001-Fix-scheduler-wait-for-probe-actions-to-complete-to-.patch of Package pacemaker.openSUSE_Leap_15.1_Update
From b94438943aa6bf2756d4654e470a3719ee30ba21 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Mon, 8 Apr 2019 15:01:20 +0200
Subject: [PATCH 1/2] Fix: scheduler: wait for probe actions to complete to
prevent unnecessary restart/re-promote of dependent resources
This addresses the issue raised in:
https://github.com/ClusterLabs/pacemaker/commit/faf44d811e4f5598dae085c61fdef410c8d18882#commitcomment-22262090
Given an ordering chain in a transition graph like:
A.probe -> A.start -> [...] -> B.start
, if B was already started, it would be scheduled to restart.
Previously, B would be stopped right away, which could turn out to be
unnecessary if the probe found that A was already started as well. Such
an unnecessary restart could be very expensive for heavy workloads.
With this commit, a new order will be created:
A.probe -> B.stop
So that any potential restart of B will wait for A.probe to complete. If
the probe finds that A is already started, the transition will be aborted
and the restart of B will no longer need to be performed.
Similarly for an ordering chain like:
A.probe -> A.start -> [...] -> B.promote
A new order will be created to prevent unnecessary re-promote:
A.probe -> B.demote
---
lib/pacemaker/pcmk_sched_allocate.c | 195 +++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 2 deletions(-)
diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c
index 9f82c0025..a9b0f3efb 100644
--- a/lib/pacemaker/pcmk_sched_allocate.c
+++ b/lib/pacemaker/pcmk_sched_allocate.c
@@ -2268,9 +2268,8 @@ order_first_probe_unneeded(pe_action_t * probe, pe_action_t * rh_action)
return FALSE;
}
-
static void
-order_first_probes(pe_working_set_t * data_set)
+order_first_probes_imply_stops(pe_working_set_t * data_set)
{
GListPtr gIter = NULL;
@@ -2393,6 +2392,198 @@ order_first_probes(pe_working_set_t * data_set)
}
}
+static void
+order_first_probe_then_restart_repromote(pe_action_t * probe,
+ pe_action_t * after,
+ pe_working_set_t * data_set)
+{ /* Walk the actions ordered after a probe and order the probe before the
+ * stop (when the visited action is a start) or the demote (when it is a
+ * promote) of each pe_native resource other than the probe's own, using
+ * pe_order_optional.
+ *
+ * probe:    action the new orderings start from; nothing is done unless it
+ *           belongs to a pe_native resource and its task matches RSC_STATUS
+ * after:    action currently being visited; nothing is done if it is NULL
+ *           or is already flagged pe_action_tracking
+ * data_set: passed on to find_compatible_child() and to the recursive calls
+ *
+ * Every visited action gets flagged pe_action_tracking so that it is
+ * processed at most once per walk. The flag is not cleared here; see
+ * clear_actions_tracking_flag().
+ */
+ GListPtr gIter = NULL;
+ bool interleave = FALSE;
+ pe_resource_t *compatible_rsc = NULL;
+
+ if (probe == NULL
+ || probe->rsc == NULL
+ || probe->rsc->variant != pe_native) {
+ return;
+ }
+
+ if (after == NULL
+ // Avoid running into any possible loop
+ || is_set(after->flags, pe_action_tracking)) {
+ return;
+ }
+ // Only a first action whose task matches RSC_STATUS is handled here
+ if (safe_str_neq(probe->task, RSC_STATUS)) {
+ return;
+ }
+ // Flag this action as visited before going any further down the chain
+ pe_set_action_bit(after, pe_action_tracking);
+
+ crm_trace("Processing based on %s %s -> %s %s",
+ probe->uuid,
+ probe->node ? probe->node->details->uname: "",
+ after->uuid,
+ after->node ? after->node->details->uname : "");
+
+ if (after->rsc
+ /* Better not build a dependency directly with a clone/group.
+ * We are going to proceed through the ordering chain and build
+ * dependencies with its children.
+ */
+ && after->rsc->variant == pe_native
+ && probe->rsc != after->rsc) {
+
+ GListPtr then_actions = NULL;
+ enum pe_ordering probe_order_type = pe_order_optional;
+
+ if (safe_str_eq(after->task, RSC_START)) {
+ then_actions = pe__resource_actions(after->rsc, NULL, RSC_STOP, FALSE);
+
+ } else if (safe_str_eq(after->task, RSC_PROMOTE)) {
+ then_actions = pe__resource_actions(after->rsc, NULL, RSC_DEMOTE, FALSE);
+ }
+
+ for (gIter = then_actions; gIter != NULL; gIter = gIter->next) {
+ pe_action_t *then = (pe_action_t *) gIter->data;
+
+ // Skip any pseudo action which for example is implied by fencing
+ if (is_set(then->flags, pe_action_pseudo)) {
+ continue;
+ }
+
+ order_actions(probe, then, probe_order_type);
+ }
+ g_list_free(then_actions);
+ }
+ // For variants above pe_group, look up the interleave meta-attribute
+ if (after->rsc
+ && after->rsc->variant > pe_group) {
+ const char *interleave_s = g_hash_table_lookup(after->rsc->meta,
+ XML_RSC_ATTR_INTERLEAVE);
+
+ interleave = crm_is_true(interleave_s);
+
+ if (interleave) {
+ /* For an interleaved clone, we should build a dependency only
+ * with the relevant clone child.
+ */
+ compatible_rsc = find_compatible_child(probe->rsc,
+ after->rsc,
+ RSC_ROLE_UNKNOWN,
+ FALSE, data_set);
+ }
+ }
+ // Visit the actions ordered after this one, recursing where appropriate
+ for (gIter = after->actions_after; gIter != NULL; gIter = gIter->next) {
+ pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) gIter->data;
+ /* pe_order_implies_then is the reason why a required A.start
+ * implies/enforces B.start to be required too, which is the cause of
+ * B.restart/re-promote.
+ *
+ * Not sure about pe_order_implies_then_on_node though. It's now only
+ * used for unfencing case, which tends to introduce transition
+ * loops...
+ */
+
+ if (is_not_set(after_wrapper->type, pe_order_implies_then)) {
+ /* The order type between a group/clone and its child such as
+ * B.start-> B_child.start is:
+ * pe_order_implies_first_printed | pe_order_runnable_left
+ *
+ * Proceed through the ordering chain and build dependencies with
+ * its children.
+ *
+ * Links of any other type are followed only when this action's
+ * resource has variant >= pe_group, is not the parent of the
+ * probe's resource, and is the parent of the next action's
+ * resource (whose own variant must be <= pe_group).
+ */
+ if (after->rsc == NULL
+ || after->rsc->variant < pe_group
+ || probe->rsc->parent == after->rsc
+ || after_wrapper->action->rsc == NULL
+ || after_wrapper->action->rsc->variant > pe_group
+ || after->rsc != after_wrapper->action->rsc->parent) {
+ continue;
+ }
+
+ /* Proceed to the children of a group or a non-interleaved clone.
+ * For an interleaved clone, proceed only to the relevant child.
+ */
+ if (after->rsc->variant > pe_group
+ && interleave == TRUE
+ && (compatible_rsc == NULL
+ || compatible_rsc != after_wrapper->action->rsc)) {
+ continue;
+ }
+ }
+
+ crm_trace("Proceeding through %s %s -> %s %s (type=0x%.6x)",
+ after->uuid,
+ after->node ? after->node->details->uname: "",
+ after_wrapper->action->uuid,
+ after_wrapper->action->node ? after_wrapper->action->node->details->uname : "",
+ after_wrapper->type);
+
+ order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set);
+ }
+}
+
+/* Remove the pe_action_tracking flag from every action in
+ * data_set->actions that currently has it set.
+ */
+static void
+clear_actions_tracking_flag(pe_working_set_t * data_set)
+{
+    GListPtr item = data_set->actions;
+
+    while (item != NULL) {
+        pe_action_t *candidate = (pe_action_t *) item->data;
+
+        // Leave actions that were never flagged untouched
+        if (is_set(candidate->flags, pe_action_tracking)) {
+            pe_clear_action_bit(candidate, pe_action_tracking);
+        }
+
+        item = item->next;
+    }
+}
+
+/* Recursively visit rsc and all of its descendants. For each pe_native
+ * resource, take every RSC_STATUS action it has and hand each action ordered
+ * after it to order_first_probe_then_restart_repromote(), clearing the
+ * tracking flags again after every such walk.
+ */
+static void
+order_first_rsc_probes(pe_resource_t * rsc, pe_working_set_t * data_set)
+{
+    GListPtr child_iter = NULL;
+    GListPtr probe_iter = NULL;
+    GListPtr probe_list = NULL;
+
+    // Children are handled before the resource itself
+    for (child_iter = rsc->children; child_iter != NULL; child_iter = child_iter->next) {
+        order_first_rsc_probes((pe_resource_t *) child_iter->data, data_set);
+    }
+
+    if (rsc->variant == pe_native) {
+        probe_list = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE);
+
+        for (probe_iter = probe_list; probe_iter != NULL; probe_iter = probe_iter->next) {
+            pe_action_t *probe = (pe_action_t *) probe_iter->data;
+            GListPtr next_iter = probe->actions_after;
+
+            while (next_iter != NULL) {
+                pe_action_wrapper_t *wrapper = (pe_action_wrapper_t *) next_iter->data;
+
+                order_first_probe_then_restart_repromote(probe, wrapper->action, data_set);
+                clear_actions_tracking_flag(data_set);
+
+                // Advance only after the walk, exactly as the list stands then
+                next_iter = next_iter->next;
+            }
+        }
+
+        g_list_free(probe_list);
+    }
+}
+
+/* Run order_first_rsc_probes() on each entry of data_set->resources, then
+ * call order_first_probes_imply_stops() on the same working set.
+ */
+static void
+order_first_probes(pe_working_set_t * data_set)
+{
+    GListPtr rsc_iter = data_set->resources;
+
+    while (rsc_iter != NULL) {
+        order_first_rsc_probes((pe_resource_t *) rsc_iter->data, data_set);
+        rsc_iter = rsc_iter->next;
+    }
+
+    order_first_probes_imply_stops(data_set);
+}
+
static void
order_then_probes(pe_working_set_t * data_set)
{
--
2.16.4