File jsc#ECO-1611-0002-Feature-scheduler-implement-priority-fencing-delay.patch of Package pacemaker.19271

From 79ded22a9cc7dcb074fdac3174e504502bea147f Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 17 Mar 2020 14:33:35 +0100
Subject: [PATCH 2/9] Feature: scheduler: implement priority-fencing-delay

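If the new cluster option priority-fencing-delay is configured, fencing
of the node that is considered most important is delayed by the given
duration, so that it can win a fencing race (for example after a
split-brain between two cluster nodes).

A node's priority is calculated as the sum of the priorities of the
resources running on it; a promoted (master) instance counts as its
base priority + 1, and the priority of a resource running on a guest
node is credited to the cluster node hosting the guest's container as
well.

The delay is skipped if the target is not a lost cluster member, if the
local partition holds more than half of the cluster members, or if the
target does not hold the highest calculated priority. If all member
priorities are equal, any configured pcmk_delay_base/max applies as
usual; otherwise the calculated delay (even 0) takes precedence over
pcmk_delay_base/max.

As an illustration (the option itself and its default are defined
elsewhere in this patch series), the delay could be configured with,
e.g.:

    crm_attribute --type crm_config --name priority-fencing-delay --update 60s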
---
 include/crm/pengine/internal.h      |  4 +-
 include/crm/pengine/pe_types.h      |  2 +
 lib/pacemaker/pcmk_sched_allocate.c | 14 ++---
 lib/pacemaker/pcmk_sched_native.c   |  8 +--
 lib/pengine/native.c                | 47 ++++++++++++++
 lib/pengine/unpack.c                | 43 ++++++++-----
 lib/pengine/utils.c                 | 96 ++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 31 deletions(-)

Index: pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/internal.h
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/include/crm/pengine/internal.h
+++ pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/internal.h
@@ -318,7 +318,7 @@ typedef struct op_digest_cache_s {
 op_digest_cache_t *rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node,
                                          pe_working_set_t * data_set);
 
-action_t *pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set);
+pe_action_t *pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason, bool priority_delay, pe_working_set_t * data_set);
 void trigger_unfencing(
     resource_t * rsc, node_t *node, const char *reason, action_t *dependency, pe_working_set_t * data_set);
 
@@ -335,7 +335,7 @@ gboolean add_tag_ref(GHashTable * tags,
 
 void print_rscs_brief(GListPtr rsc_list, const char * pre_text, long options,
                       void * print_data, gboolean print_all);
-void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason);
+void pe_fence_node(pe_working_set_t * data_set, pe_node_t * node, const char *reason, bool priority_delay);
 
 node_t *pe_create_node(const char *id, const char *uname, const char *type,
                        const char *score, pe_working_set_t * data_set);
Index: pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/pe_types.h
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/include/crm/pengine/pe_types.h
+++ pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/pe_types.h
@@ -160,6 +160,7 @@ struct pe_working_set_s {
 
     GList *param_check; // History entries that need to be checked
     GList *stop_needed; // Containers that need stop actions
+    int priority_fencing_delay; // `priority-fencing-delay` in seconds, or -1 if disabled
 };
 
 enum pe_check_parameters {
@@ -204,6 +205,7 @@ struct pe_node_shared_s {
     GHashTable *attrs;          /* char* => char* */
     GHashTable *utilization;
     GHashTable *digest_cache;   //!< cache of calculated resource digests
+    int priority; // calculated from the priorities of the resources running on the node
 };
 
 struct pe_node_s {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_allocate.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pacemaker/pcmk_sched_allocate.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_allocate.c
@@ -928,7 +928,7 @@ probe_resources(pe_working_set_t * data_
             if (pe__is_remote_node(node) && node->details->remote_rsc
                 && (get_remote_node_state(node) == remote_state_failed)) {
 
-                pe_fence_node(data_set, node, "the connection is unrecoverable");
+                pe_fence_node(data_set, node, "the connection is unrecoverable", FALSE);
             }
             continue;
 
@@ -1465,7 +1465,7 @@ fence_guest(pe_node_t *node, pe_working_
     /* Create a fence pseudo-event, so we have an event to order actions
      * against, and the controller can always detect it.
      */
-    stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", data_set);
+    stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", FALSE, data_set);
     update_action_flags(stonith_op, pe_action_pseudo | pe_action_runnable,
                         __FUNCTION__, __LINE__);
 
@@ -1474,7 +1474,7 @@ fence_guest(pe_node_t *node, pe_working_
      * (even though start might be closer to what is done for a real reboot).
      */
     if(stop && is_set(stop->flags, pe_action_pseudo)) {
-        pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, data_set);
+        pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, FALSE, data_set);
         crm_info("Implying guest node %s is down (action %d) after %s fencing",
                  node->details->uname, stonith_op->id, stop->node->details->uname);
         order_actions(parent_stonith_op, stonith_op,
@@ -1566,7 +1566,7 @@ stage6(pe_working_set_t * data_set)
         if (node->details->unclean
             && need_stonith && pe_can_fence(data_set, node)) {
 
-            stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", data_set);
+            stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", FALSE, data_set);
             pe_warn("Scheduling Node %s for STONITH", node->details->uname);
 
             stonith_constraints(node, stonith_op, data_set);
@@ -1864,7 +1864,7 @@ apply_container_ordering(action_t *actio
     CRM_ASSERT(container);
 
     if(is_set(container->flags, pe_rsc_failed)) {
-        pe_fence_node(data_set, action->node, "container failed");
+        pe_fence_node(data_set, action->node, "container failed", FALSE);
     }
 
     crm_trace("Order %s action %s relative to %s%s for %s%s",
@@ -2069,7 +2069,7 @@ apply_remote_ordering(action_t *action,
                  * way to stop it, it is necessary to fence the
                  * node.
                  */
-                pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable");
+                pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable", FALSE);
                 order_action_then_stop(action, remote_rsc,
                                        pe_order_implies_first, data_set);
 
@@ -2119,7 +2119,7 @@ apply_remote_ordering(action_t *action,
                      * Since we have no way to find out, it is
                      * necessary to fence the node.
                      */
-                    pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable");
+                    pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable", FALSE);
                 }
 
                 if(cluster_node && state == remote_state_stopped) {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_native.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pacemaker/pcmk_sched_native.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_native.c
@@ -1410,7 +1410,7 @@ native_internal_constraints(resource_t *
 
         for (GList *item = allowed_nodes; item; item = item->next) {
             pe_node_t *node = item->data;
-            pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+            pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
 
             crm_debug("Ordering any stops of %s before %s, and any starts after",
                       rsc->id, unfence->uuid);
@@ -1881,7 +1881,7 @@ rsc_ticket_constraint(resource_t * rsc_l
                 for (gIter = rsc_lh->running_on; gIter != NULL; gIter = gIter->next) {
                     node_t *node = (node_t *) gIter->data;
 
-                    pe_fence_node(data_set, node, "deadman ticket was lost");
+                    pe_fence_node(data_set, node, "deadman ticket was lost", FALSE);
                 }
                 break;
 
@@ -2588,7 +2588,7 @@ StopRsc(resource_t * rsc, node_t * next,
         }
 
         if(is_set(rsc->flags, pe_rsc_needs_unfencing)) {
-            action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, data_set);
+            pe_action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, FALSE, data_set);
 
             order_actions(stop, unfence, pe_order_implies_first);
             if (!node_has_been_unfenced(current)) {
@@ -2618,7 +2618,7 @@ order_after_unfencing(resource_t *rsc, p
          * the node being unfenced, and all its resources being stopped,
          * whenever a new resource is added -- which would be highly suboptimal.
          */
-        action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+        pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
 
         order_actions(unfence, action, order);
 
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/native.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/native.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/native.c
@@ -33,6 +33,51 @@ is_multiply_active(pe_resource_t *rsc)
     return count > 1;
 }
 
+static void
+native_priority_to_node(pe_resource_t * rsc, pe_node_t * node)
+{
+    int priority = 0;
+
+    if (rsc->priority == 0) {
+        return;
+    }
+
+    if (rsc->role == RSC_ROLE_MASTER) {
+        // Promoted instance takes base priority + 1
+        priority = rsc->priority + 1;
+
+    } else {
+        priority = rsc->priority;
+    }
+
+    node->details->priority += priority;
+    pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s)",
+                 node->details->uname, node->details->priority,
+                 rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+                 rsc->id, rsc->priority,
+                 rsc->role == RSC_ROLE_MASTER ? " + 1" : "");
+
+    /* The priority of a resource running on a guest node is also added to
+     * the cluster node(s) hosting the guest's container. */
+    if (node->details->remote_rsc
+        && node->details->remote_rsc->container) {
+        GListPtr gIter = node->details->remote_rsc->container->running_on;
+
+        for (; gIter != NULL; gIter = gIter->next) {
+            pe_node_t *a_node = gIter->data;
+
+            a_node->details->priority += priority;
+            pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s) "
+                         "from guest node '%s'",
+                         a_node->details->uname, a_node->details->priority,
+                         rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+                         rsc->id, rsc->priority,
+                         rsc->role == RSC_ROLE_MASTER ? " + 1" : "",
+                         node->details->uname);
+        }
+    }
+}
+
 void
 native_add_running(resource_t * rsc, node_t * node, pe_working_set_t * data_set)
 {
@@ -54,6 +99,8 @@ native_add_running(resource_t * rsc, nod
     rsc->running_on = g_list_append(rsc->running_on, node);
     if (rsc->variant == pe_native) {
         node->details->running_rsc = g_list_append(node->details->running_rsc, rsc);
+
+        native_priority_to_node(rsc, node);
     }
 
     if (rsc->variant == pe_native && node->details->maintenance) {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/unpack.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/unpack.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/unpack.c
@@ -65,9 +65,11 @@ is_dangling_guest_node(node_t *node)
  * \param[in,out] data_set  Current working set of cluster
  * \param[in,out] node      Node to fence
  * \param[in]     reason    Text description of why fencing is needed
+ * \param[in]     priority_delay  Whether to consider `priority-fencing-delay`
  */
 void
-pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
+pe_fence_node(pe_working_set_t * data_set, pe_node_t * node,
+              const char *reason, bool priority_delay)
 {
     CRM_CHECK(node, return);
 
@@ -117,7 +119,8 @@ pe_fence_node(pe_working_set_t * data_se
                      reason);
         }
         node->details->unclean = TRUE;
-        pe_fence_op(node, NULL, TRUE, reason, data_set);
+        // No need to apply `priority-fencing-delay` for remote nodes
+        pe_fence_op(node, NULL, TRUE, reason, FALSE, data_set);
 
     } else if (node->details->unclean) {
         crm_trace("Cluster node %s %s because %s",
@@ -131,7 +134,7 @@ pe_fence_node(pe_working_set_t * data_se
                  pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
                  reason);
         node->details->unclean = TRUE;
-        pe_fence_op(node, NULL, TRUE, reason, data_set);
+        pe_fence_op(node, NULL, TRUE, reason, priority_delay, data_set);
     }
 }
 
@@ -215,6 +218,15 @@ unpack_config(xmlNode * config, pe_worki
     crm_debug("Concurrent fencing is %s",
               is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled");
 
+    // Default value -1 means `priority-fencing-delay` is disabled
+    data_set->priority_fencing_delay = -1;
+    value = pe_pref(data_set->config_hash,
+                    XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
+    if (value) {
+        data_set->priority_fencing_delay = crm_parse_interval_spec(value) / 1000;
+        crm_trace("Priority fencing delay is %ds", data_set->priority_fencing_delay);
+    }
+
     set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything);
     crm_debug("Stop all active resources: %s",
               is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false");
@@ -1140,7 +1152,7 @@ unpack_status(xmlNode * status, pe_worki
                 /* Everything else should flow from this automatically
                  * At least until the PE becomes able to migrate off healthy resources
                  */
-                pe_fence_node(data_set, this_node, "cluster does not have quorum");
+                pe_fence_node(data_set, this_node, "cluster does not have quorum", FALSE);
             }
         }
     }
@@ -1212,7 +1224,7 @@ determine_online_status_no_fencing(pe_wo
 
     } else {
         /* mark it unclean */
-        pe_fence_node(data_set, this_node, "peer is unexpectedly down");
+        pe_fence_node(data_set, this_node, "peer is unexpectedly down", FALSE);
         crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s",
                  crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state));
     }
@@ -1268,10 +1280,10 @@ determine_online_status_fencing(pe_worki
         online = crmd_online;
 
     } else if (in_cluster == NULL) {
-        pe_fence_node(data_set, this_node, "peer has not been seen by the cluster");
+        pe_fence_node(data_set, this_node, "peer has not been seen by the cluster", FALSE);
 
     } else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) {
-        pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria");
+        pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria", FALSE);
 
     } else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) {
 
@@ -1290,14 +1302,15 @@ determine_online_status_fencing(pe_worki
         online = FALSE;
 
     } else if (crm_is_true(in_cluster) == FALSE) {
-        pe_fence_node(data_set, this_node, "peer is no longer part of the cluster");
+        // Consider `priority-fencing-delay` for lost nodes
+        pe_fence_node(data_set, this_node, "peer is no longer part of the cluster", TRUE);
 
     } else if (!crmd_online) {
-        pe_fence_node(data_set, this_node, "peer process is no longer available");
+        pe_fence_node(data_set, this_node, "peer process is no longer available", FALSE);
 
         /* Everything is running at this point, now check join state */
     } else if (do_terminate) {
-        pe_fence_node(data_set, this_node, "termination was requested");
+        pe_fence_node(data_set, this_node, "termination was requested", FALSE);
 
     } else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) {
         crm_info("Node %s is active", this_node->details->uname);
@@ -1309,7 +1322,7 @@ determine_online_status_fencing(pe_worki
         this_node->details->pending = TRUE;
 
     } else {
-        pe_fence_node(data_set, this_node, "peer was in an unknown state");
+        pe_fence_node(data_set, this_node, "peer was in an unknown state", FALSE);
         crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d",
                  this_node->details->uname, crm_str(in_cluster), crm_str(is_peer),
                  crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown);
@@ -1897,7 +1910,7 @@ process_rsc_state(resource_t * rsc, node
             if (reason == NULL) {
                reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
             }
-            pe_fence_node(data_set, node, reason);
+            pe_fence_node(data_set, node, reason, FALSE);
         }
         free(reason);
     }
@@ -1919,7 +1932,7 @@ process_rsc_state(resource_t * rsc, node
              * but also mark the node as unclean
              */
             reason = crm_strdup_printf("%s failed there", rsc->id);
-            pe_fence_node(data_set, node, reason);
+            pe_fence_node(data_set, node, reason, FALSE);
             free(reason);
             break;
 
@@ -1987,7 +2000,7 @@ process_rsc_state(resource_t * rsc, node
                      * should result in fencing the remote node.
                      */
                     pe_fence_node(data_set, tmpnode,
-                                  "remote connection is unrecoverable");
+                                  "remote connection is unrecoverable", FALSE);
                 }
             }
 
@@ -2988,7 +3001,7 @@ static bool check_operation_expiry(resou
                  * after unpack_node_loop() is done).
                  */
                 pe_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL,
-                                                 data_set);
+                                                 FALSE, data_set);
 
                 crm_info("Clearing %s failure will wait until any scheduled "
                          "fencing of %s completes", task, rsc->id);
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/utils.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/utils.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/utils.c
@@ -578,7 +578,7 @@ custom_action(resource_t * rsc, char *ke
             if (is_set(action->rsc->flags, pe_rsc_managed)
                 && save_action && a_task == stop_rsc
                 && action->node->details->unclean == FALSE) {
-                pe_fence_node(data_set, action->node, "resource actions are unrunnable");
+                pe_fence_node(data_set, action->node, "resource actions are unrunnable", FALSE);
             }
 
         } else if (action->node->details->pending) {
@@ -2188,9 +2188,76 @@ find_unfencing_devices(GListPtr candidat
     return matches;
 }
 
+static int
+node_priority_fencing_delay(pe_node_t * node, pe_working_set_t * data_set)
+{
+    int member_count = 0;
+    int online_count = 0;
+    int top_priority = 0;
+    int lowest_priority = 0;
+    GListPtr gIter = NULL;
+
+    // `priority-fencing-delay` is disabled
+    if (data_set->priority_fencing_delay < 0) {
+        return -1;
+    }
+
+    /* No need to delay fencing if the fencing target is not a normal cluster
+     * member, for example if it's a remote node or a guest node. */
+    if (node->details->type != node_member) {
+        return 0;
+    }
+
+    // No need to delay fencing if the fencing target is in our partition
+    if (node->details->online) {
+        return 0;
+    }
+
+    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
+        pe_node_t *n = gIter->data;
+
+        if (n->details->type != node_member) {
+            continue;
+        }
+
+        member_count++;
+
+        if (n->details->online) {
+            online_count++;
+        }
+
+        if (member_count == 1
+            || n->details->priority > top_priority) {
+            top_priority = n->details->priority;
+        }
+
+        if (member_count == 1
+            || n->details->priority < lowest_priority) {
+            lowest_priority = n->details->priority;
+        }
+    }
+
+    // No need to delay if our partition holds more than half of the cluster members
+    if (online_count > member_count / 2) {
+        return 0;
+    }
+
+    /* All the cluster members have equal priority; any configured
+     * `pcmk_delay_base/max` will be applied as usual. */
+    if (lowest_priority == top_priority) {
+        return -1;
+    }
+
+    if (node->details->priority < top_priority) {
+        return 0;
+    }
+
+    return data_set->priority_fencing_delay;
+}
 
 action_t *
-pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set)
+pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason,
+            bool priority_delay, pe_working_set_t * data_set)
 {
     char *op_key = NULL;
     action_t *stonith_op = NULL;
@@ -2262,6 +2329,29 @@ pe_fence_op(node_t * node, const char *o
         free(op_key);
     }
 
+    if (data_set->priority_fencing_delay >= 0
+
+            /* This is a case where `priority-fencing-delay` applies, so at
+             * least add the `priority-fencing-delay` field as an indicator. */
+        && (priority_delay
+
+            /* Recalculate the priority delay if pe_fence_op() is called again
+             * by stage6(), after the node priorities have actually been
+             * calculated by native_add_running() */
+            || g_hash_table_lookup(stonith_op->meta,
+                                   XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY) != NULL)) {
+
+            /* Add `priority-fencing-delay` to the fencing op even if it's 0
+             * for the target node, so that it takes precedence over any
+             * possible `pcmk_delay_base/max`.
+             */
+            char *delay_s = crm_itoa(node_priority_fencing_delay(node, data_set));
+
+            g_hash_table_insert(stonith_op->meta,
+                                strdup(XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY),
+                                delay_s);
+    }
+
     if(optional == FALSE && pe_can_fence(data_set, node)) {
         pe_action_required(stonith_op, NULL, reason);
     } else if(reason && stonith_op->reason == NULL) {
@@ -2287,7 +2377,7 @@ trigger_unfencing(
               && node->details->online
               && node->details->unclean == FALSE
               && node->details->shutdown == FALSE) {
-        action_t *unfence = pe_fence_op(node, "on", FALSE, reason, data_set);
+        pe_action_t *unfence = pe_fence_op(node, "on", FALSE, reason, FALSE, data_set);
 
         if(dependency) {
             order_actions(unfence, dependency, pe_order_optional);