File bsc#1130122-0001-Fix-scheduler-wait-for-probe-actions-to-complete-to-.patch of Package pacemaker.26927

From b94438943aa6bf2756d4654e470a3719ee30ba21 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Mon, 8 Apr 2019 15:01:20 +0200
Subject: [PATCH 1/2] Fix: scheduler: wait for probe actions to complete to
 prevent unnecessary restart/re-promote of dependent resources

This addresses the issue raised in:
https://github.com/ClusterLabs/pacemaker/commit/faf44d811e4f5598dae085c61fdef410c8d18882#commitcomment-22262090

Given an ordering chain in a transition graph like:

A.probe -> A.start -> [...] -> B.start

if B was already started, it would be scheduled to restart.

Previously, B would be stopped right away, which could turn out to be
unnecessary if the probe found that A was already started as well. Such
an unnecessary restart could be very expensive for a heavy workload.

With this commit, a new order will be created:

A.probe -> B.stop

This way, any potential restart of B will wait for A.probe to complete.
If A is already started, the transition will abort and the restart of B
will no longer need to be performed.

Similarly, for an ordering chain like:

A.probe -> A.start -> [...] -> B.promote

a new order will be created to prevent an unnecessary re-promote:
A.probe -> B.demote
---
 pengine/allocate.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 202 insertions(+), 2 deletions(-)

Index: pacemaker-1.1.18+20180430.b12c320f5/pengine/allocate.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/pengine/allocate.c
+++ pacemaker-1.1.18+20180430.b12c320f5/pengine/allocate.c
@@ -2208,9 +2208,8 @@ order_first_probe_unneeded(pe_action_t *
     return FALSE;
 }
 
-
 static void
-order_first_probes(pe_working_set_t * data_set)
+order_first_probes_imply_stops(pe_working_set_t * data_set)
 {
     GListPtr gIter = NULL;
 
@@ -2338,6 +2337,207 @@ order_first_probes(pe_working_set_t * da
 }
 
 static void
+order_first_probe_then_restart_repromote(pe_action_t * probe,
+                                         pe_action_t * after,
+                                         pe_working_set_t * data_set)
+{
+    GListPtr gIter = NULL;
+    bool interleave = FALSE;
+    pe_resource_t *compatible_rsc = NULL;
+
+    if (probe == NULL
+        || probe->rsc == NULL
+        || probe->rsc->variant != pe_native) {
+        return;
+    }
+
+    if (after == NULL
+        // Avoid running into any possible loop
+        || is_set(after->flags, pe_action_tracking)) {
+        return;
+    }
+
+    if (safe_str_neq(probe->task, RSC_STATUS)) {
+        return;
+    }
+
+    pe_set_action_bit(after, pe_action_tracking);
+
+    crm_trace("Processing based on %s %s -> %s %s",
+              probe->uuid,
+              probe->node ? probe->node->details->uname: "",
+              after->uuid,
+              after->node ? after->node->details->uname : "");
+
+    if (after->rsc
+        /* Better not build a dependency directly with a clone/group.
+         * We are going to proceed through the ordering chain and build
+         * dependencies with its children.
+         */
+        && after->rsc->variant == pe_native
+        && probe->rsc != after->rsc) {
+
+            GListPtr then_actions = NULL;
+            enum pe_ordering probe_order_type = pe_order_optional;
+
+            if (safe_str_eq(after->task, RSC_START)) {
+                char *key = generate_op_key(after->rsc->id, RSC_STOP, 0);
+
+                then_actions = find_actions(after->rsc->actions, key, NULL);
+                free(key);
+
+            } else if (safe_str_eq(after->task, RSC_PROMOTE)) {
+                char *key = generate_op_key(after->rsc->id, RSC_DEMOTE, 0);
+
+                then_actions = find_actions(after->rsc->actions, key, NULL);
+                free(key);
+            }
+
+            for (gIter = then_actions; gIter != NULL; gIter = gIter->next) {
+                pe_action_t *then = (pe_action_t *) gIter->data;
+
+                // Skip any pseudo action which for example is implied by fencing
+                if (is_set(then->flags, pe_action_pseudo)) {
+                    continue;
+                }
+
+                order_actions(probe, then, probe_order_type);
+            }
+            g_list_free(then_actions);
+    }
+
+    if (after->rsc
+        && after->rsc->variant > pe_group) {
+        const char *interleave_s = g_hash_table_lookup(after->rsc->meta,
+                                                       XML_RSC_ATTR_INTERLEAVE);
+
+        interleave = crm_is_true(interleave_s);
+
+        if (interleave) {
+            /* For an interleaved clone, we should build a dependency only
+             * with the relevant clone child.
+             */
+            compatible_rsc = find_compatible_child(probe->rsc,
+                                                   after->rsc,
+                                                   RSC_ROLE_UNKNOWN,
+                                                   FALSE);
+        }
+    }
+
+    for (gIter = after->actions_after; gIter != NULL; gIter = gIter->next) {
+        pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) gIter->data;
+        /* pe_order_implies_then is the reason why a required A.start
+         * implies/enforces B.start to be required too, which is the cause of
+         * B.restart/re-promote.
+         *
+         * Not sure about pe_order_implies_then_on_node though. It's now only
+         * used for unfencing case, which tends to introduce transition
+         * loops...
+         */
+
+        if (is_not_set(after_wrapper->type, pe_order_implies_then)) {
+            /* The order type between a group/clone and its child such as
+             * B.start-> B_child.start is:
+             * pe_order_implies_first_printed | pe_order_runnable_left
+             *
+             * Proceed through the ordering chain and build dependencies with
+             * its children.
+             */
+            if (after->rsc == NULL
+                || after->rsc->variant < pe_group
+                || probe->rsc->parent == after->rsc
+                || after_wrapper->action->rsc == NULL
+                || after_wrapper->action->rsc->variant > pe_group
+                || after->rsc != after_wrapper->action->rsc->parent) {
+                continue;
+            }
+
+            /* Proceed to the children of a group or a non-interleaved clone.
+             * For an interleaved clone, proceed only to the relevant child.
+             */
+            if (after->rsc->variant > pe_group
+                && interleave == TRUE
+                && (compatible_rsc == NULL
+                    || compatible_rsc != after_wrapper->action->rsc)) {
+                continue;
+            }
+        }
+
+        crm_trace("Proceeding through %s %s -> %s %s (type=0x%.6x)",
+                  after->uuid,
+                  after->node ? after->node->details->uname: "",
+                  after_wrapper->action->uuid,
+                  after_wrapper->action->node ? after_wrapper->action->node->details->uname : "",
+                  after_wrapper->type);
+
+        order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set);
+    }
+}
+
+static void clear_actions_tracking_flag(pe_working_set_t * data_set)
+{
+    GListPtr gIter = NULL;
+
+    for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
+        pe_action_t *action = (pe_action_t *) gIter->data;
+
+        if (is_set(action->flags, pe_action_tracking)) {
+            pe_clear_action_bit(action, pe_action_tracking);
+        }
+    }
+}
+
+static void
+order_first_rsc_probes(pe_resource_t * rsc, pe_working_set_t * data_set)
+{
+    GListPtr gIter = NULL;
+    GListPtr probes = NULL;
+    char *key = NULL;
+
+    for (gIter = rsc->children; gIter != NULL; gIter = gIter->next) {
+        pe_resource_t * child = (pe_resource_t *) gIter->data;
+
+        order_first_rsc_probes(child, data_set);
+    }
+
+    if (rsc->variant != pe_native) {
+        return;
+    }
+
+    key = generate_op_key(rsc->id, RSC_STATUS, 0);
+    probes = find_actions(rsc->actions, key, NULL);
+    free(key);
+
+    for (gIter = probes; gIter != NULL; gIter= gIter->next) {
+        pe_action_t *probe = (pe_action_t *) gIter->data;
+        GListPtr aIter = NULL;
+
+        for (aIter = probe->actions_after; aIter != NULL; aIter = aIter->next) {
+            pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) aIter->data;
+
+            order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set);
+            clear_actions_tracking_flag(data_set);
+        }
+    }
+
+    g_list_free(probes);
+}
+
+static void
+order_first_probes(pe_working_set_t * data_set)
+{
+    GListPtr gIter = NULL;
+
+    for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
+        pe_resource_t *rsc = (pe_resource_t *) gIter->data;
+
+        order_first_rsc_probes(rsc, data_set);
+    }
+
+    order_first_probes_imply_stops(data_set);
+}
+
+static void
 order_then_probes(pe_working_set_t * data_set)
 {
 #if 0
openSUSE Build Service is sponsored by