File jsc#ECO-1611-0002-Feature-scheduler-implement-priority-fencing-delay.patch of Package pacemaker.22684
From 79ded22a9cc7dcb074fdac3174e504502bea147f Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 17 Mar 2020 14:33:35 +0100
Subject: [PATCH 2/9] Feature: scheduler: implement priority-fencing-delay
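
Add a new cluster option, priority-fencing-delay, which delays fencing
targeting the node with the highest combined resource priority. In a
fencing race (for example when the two nodes of a two-node cluster lose
sight of each other), this gives the node running the more important
resources a better chance of surviving.

The scheduler sums the priorities of the resources running on each
node: a promoted instance counts as its base priority plus 1, and the
priority of a resource running on a guest node also counts toward the
cluster node hosting the guest. When a full cluster member is to be
fenced because it has been lost from the membership, and the surviving
partition does not hold more than half of the cluster members, the
fencing operation against the highest-priority node gets the configured
delay, while other targets get a delay of 0; in both cases the value is
recorded as priority-fencing-delay meta of the fencing op and takes
precedence over any pcmk_delay_base/max. If all members have equal
priority, pcmk_delay_base/max applies as before.

The option takes an interval specification (stored internally in
seconds) and is disabled by default (-1). Illustrative snippet only
(the nvpair id is an example; the option is assumed to be set like any
other cluster property in crm_config):

    <cluster_property_set id="cib-bootstrap-options">
      <nvpair id="priority-fencing-delay" name="priority-fencing-delay"
              value="60s"/>
    </cluster_property_set>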
---
include/crm/pengine/internal.h | 4 +-
include/crm/pengine/pe_types.h | 2 +
lib/pacemaker/pcmk_sched_allocate.c | 14 ++---
lib/pacemaker/pcmk_sched_native.c | 8 +--
lib/pengine/native.c | 47 ++++++++++++++
lib/pengine/unpack.c | 43 ++++++++-----
lib/pengine/utils.c | 96 ++++++++++++++++++++++++++++-
7 files changed, 183 insertions(+), 31 deletions(-)
Index: pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/internal.h
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/include/crm/pengine/internal.h
+++ pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/internal.h
@@ -318,7 +318,7 @@ typedef struct op_digest_cache_s {
op_digest_cache_t *rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node,
pe_working_set_t * data_set);
-action_t *pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set);
+pe_action_t *pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason, bool priority_delay, pe_working_set_t * data_set);
void trigger_unfencing(
resource_t * rsc, node_t *node, const char *reason, action_t *dependency, pe_working_set_t * data_set);
@@ -335,7 +335,7 @@ gboolean add_tag_ref(GHashTable * tags,
void print_rscs_brief(GListPtr rsc_list, const char * pre_text, long options,
void * print_data, gboolean print_all);
-void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason);
+void pe_fence_node(pe_working_set_t * data_set, pe_node_t * node, const char *reason, bool priority_delay);
node_t *pe_create_node(const char *id, const char *uname, const char *type,
const char *score, pe_working_set_t * data_set);
Index: pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/pe_types.h
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/include/crm/pengine/pe_types.h
+++ pacemaker-2.0.1+20190417.13d370ca9/include/crm/pengine/pe_types.h
@@ -160,6 +160,7 @@ struct pe_working_set_s {
GList *param_check; // History entries that need to be checked
GList *stop_needed; // Containers that need stop actions
+ int priority_fencing_delay; // Configured priority-fencing-delay in seconds (-1 means disabled)
};
enum pe_check_parameters {
@@ -204,6 +205,7 @@ struct pe_node_shared_s {
GHashTable *attrs; /* char* => char* */
GHashTable *utilization;
GHashTable *digest_cache; //!< cache of calculated resource digests
+ int priority; // Calculated from the priorities of the resources running on the node
};
struct pe_node_s {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_allocate.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pacemaker/pcmk_sched_allocate.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_allocate.c
@@ -928,7 +928,7 @@ probe_resources(pe_working_set_t * data_
if (pe__is_remote_node(node) && node->details->remote_rsc
&& (get_remote_node_state(node) == remote_state_failed)) {
- pe_fence_node(data_set, node, "the connection is unrecoverable");
+ pe_fence_node(data_set, node, "the connection is unrecoverable", FALSE);
}
continue;
@@ -1465,7 +1465,7 @@ fence_guest(pe_node_t *node, pe_working_
/* Create a fence pseudo-event, so we have an event to order actions
* against, and the controller can always detect it.
*/
- stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", data_set);
+ stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", FALSE, data_set);
update_action_flags(stonith_op, pe_action_pseudo | pe_action_runnable,
__FUNCTION__, __LINE__);
@@ -1474,7 +1474,7 @@ fence_guest(pe_node_t *node, pe_working_
* (even though start might be closer to what is done for a real reboot).
*/
if(stop && is_set(stop->flags, pe_action_pseudo)) {
- pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, data_set);
+ pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, FALSE, data_set);
crm_info("Implying guest node %s is down (action %d) after %s fencing",
node->details->uname, stonith_op->id, stop->node->details->uname);
order_actions(parent_stonith_op, stonith_op,
@@ -1566,7 +1566,7 @@ stage6(pe_working_set_t * data_set)
if (node->details->unclean
&& need_stonith && pe_can_fence(data_set, node)) {
- stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", data_set);
+ stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", FALSE, data_set);
pe_warn("Scheduling Node %s for STONITH", node->details->uname);
stonith_constraints(node, stonith_op, data_set);
@@ -1864,7 +1864,7 @@ apply_container_ordering(action_t *actio
CRM_ASSERT(container);
if(is_set(container->flags, pe_rsc_failed)) {
- pe_fence_node(data_set, action->node, "container failed");
+ pe_fence_node(data_set, action->node, "container failed", FALSE);
}
crm_trace("Order %s action %s relative to %s%s for %s%s",
@@ -2069,7 +2069,7 @@ apply_remote_ordering(action_t *action,
* way to stop it, it is necessary to fence the
* node.
*/
- pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable");
+ pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable", FALSE);
order_action_then_stop(action, remote_rsc,
pe_order_implies_first, data_set);
@@ -2119,7 +2119,7 @@ apply_remote_ordering(action_t *action,
* Since we have no way to find out, it is
* necessary to fence the node.
*/
- pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable");
+ pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable", FALSE);
}
if(cluster_node && state == remote_state_stopped) {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_native.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pacemaker/pcmk_sched_native.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pacemaker/pcmk_sched_native.c
@@ -1410,7 +1410,7 @@ native_internal_constraints(resource_t *
for (GList *item = allowed_nodes; item; item = item->next) {
pe_node_t *node = item->data;
- pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+ pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
crm_debug("Ordering any stops of %s before %s, and any starts after",
rsc->id, unfence->uuid);
@@ -1881,7 +1881,7 @@ rsc_ticket_constraint(resource_t * rsc_l
for (gIter = rsc_lh->running_on; gIter != NULL; gIter = gIter->next) {
node_t *node = (node_t *) gIter->data;
- pe_fence_node(data_set, node, "deadman ticket was lost");
+ pe_fence_node(data_set, node, "deadman ticket was lost", FALSE);
}
break;
@@ -2588,7 +2588,7 @@ StopRsc(resource_t * rsc, node_t * next,
}
if(is_set(rsc->flags, pe_rsc_needs_unfencing)) {
- action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, data_set);
+ pe_action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, FALSE, data_set);
order_actions(stop, unfence, pe_order_implies_first);
if (!node_has_been_unfenced(current)) {
@@ -2618,7 +2618,7 @@ order_after_unfencing(resource_t *rsc, p
* the node being unfenced, and all its resources being stopped,
* whenever a new resource is added -- which would be highly suboptimal.
*/
- action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+ pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
order_actions(unfence, action, order);
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/native.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/native.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/native.c
@@ -33,6 +33,51 @@ is_multiply_active(pe_resource_t *rsc)
return count > 1;
}
+static void
+native_priority_to_node(pe_resource_t * rsc, pe_node_t * node)
+{
+ int priority = 0;
+
+ if (rsc->priority == 0) {
+ return;
+ }
+
+ if (rsc->role == RSC_ROLE_MASTER) {
+ // Promoted instance takes base priority + 1
+ priority = rsc->priority + 1;
+
+ } else {
+ priority = rsc->priority;
+ }
+
+ node->details->priority += priority;
+ pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s)",
+ node->details->uname, node->details->priority,
+ rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+ rsc->id, rsc->priority,
+ rsc->role == RSC_ROLE_MASTER ? " + 1" : "");
+
+ /* The priority of a resource running on a guest node is also counted
+ * toward the cluster node hosting the guest. */
+ if (node->details->remote_rsc
+ && node->details->remote_rsc->container) {
+ GListPtr gIter = node->details->remote_rsc->container->running_on;
+
+ for (; gIter != NULL; gIter = gIter->next) {
+ pe_node_t *a_node = gIter->data;
+
+ a_node->details->priority += priority;
+ pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s) "
+ "from guest node '%s'",
+ a_node->details->uname, a_node->details->priority,
+ rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+ rsc->id, rsc->priority,
+ rsc->role == RSC_ROLE_MASTER ? " + 1" : "",
+ node->details->uname);
+ }
+ }
+}
+
void
native_add_running(resource_t * rsc, node_t * node, pe_working_set_t * data_set)
{
@@ -54,6 +99,8 @@ native_add_running(resource_t * rsc, nod
rsc->running_on = g_list_append(rsc->running_on, node);
if (rsc->variant == pe_native) {
node->details->running_rsc = g_list_append(node->details->running_rsc, rsc);
+
+ native_priority_to_node(rsc, node);
}
if (rsc->variant == pe_native && node->details->maintenance) {
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/unpack.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/unpack.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/unpack.c
@@ -65,9 +65,11 @@ is_dangling_guest_node(node_t *node)
* \param[in,out] data_set Current working set of cluster
* \param[in,out] node Node to fence
* \param[in] reason Text description of why fencing is needed
+ * \param[in] priority_delay Whether to consider `priority-fencing-delay`
*/
void
-pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
+pe_fence_node(pe_working_set_t * data_set, pe_node_t * node,
+ const char *reason, bool priority_delay)
{
CRM_CHECK(node, return);
@@ -117,7 +119,8 @@ pe_fence_node(pe_working_set_t * data_se
reason);
}
node->details->unclean = TRUE;
- pe_fence_op(node, NULL, TRUE, reason, data_set);
+ // No need to apply `priority-fencing-delay` for remote nodes
+ pe_fence_op(node, NULL, TRUE, reason, FALSE, data_set);
} else if (node->details->unclean) {
crm_trace("Cluster node %s %s because %s",
@@ -131,7 +134,7 @@ pe_fence_node(pe_working_set_t * data_se
pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
reason);
node->details->unclean = TRUE;
- pe_fence_op(node, NULL, TRUE, reason, data_set);
+ pe_fence_op(node, NULL, TRUE, reason, priority_delay, data_set);
}
}
@@ -215,6 +218,15 @@ unpack_config(xmlNode * config, pe_worki
crm_debug("Concurrent fencing is %s",
is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled");
+ // Default value -1 means `priority-fencing-delay` is disabled
+ data_set->priority_fencing_delay = -1;
+ value = pe_pref(data_set->config_hash,
+ XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
+ if (value) {
+ data_set->priority_fencing_delay = crm_parse_interval_spec(value) / 1000;
+ crm_trace("Priority fencing delay is %ds", data_set->priority_fencing_delay);
+ }
+
set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything);
crm_debug("Stop all active resources: %s",
is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false");
@@ -1140,7 +1152,7 @@ unpack_status(xmlNode * status, pe_worki
/* Everything else should flow from this automatically
* At least until the PE becomes able to migrate off healthy resources
*/
- pe_fence_node(data_set, this_node, "cluster does not have quorum");
+ pe_fence_node(data_set, this_node, "cluster does not have quorum", FALSE);
}
}
}
@@ -1212,7 +1224,7 @@ determine_online_status_no_fencing(pe_wo
} else {
/* mark it unclean */
- pe_fence_node(data_set, this_node, "peer is unexpectedly down");
+ pe_fence_node(data_set, this_node, "peer is unexpectedly down", FALSE);
crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s",
crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state));
}
@@ -1268,10 +1280,10 @@ determine_online_status_fencing(pe_worki
online = crmd_online;
} else if (in_cluster == NULL) {
- pe_fence_node(data_set, this_node, "peer has not been seen by the cluster");
+ pe_fence_node(data_set, this_node, "peer has not been seen by the cluster", FALSE);
} else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) {
- pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria");
+ pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria", FALSE);
} else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) {
@@ -1290,14 +1302,15 @@ determine_online_status_fencing(pe_worki
online = FALSE;
} else if (crm_is_true(in_cluster) == FALSE) {
- pe_fence_node(data_set, this_node, "peer is no longer part of the cluster");
+ // Consider `priority-fencing-delay` for lost nodes
+ pe_fence_node(data_set, this_node, "peer is no longer part of the cluster", TRUE);
} else if (!crmd_online) {
- pe_fence_node(data_set, this_node, "peer process is no longer available");
+ pe_fence_node(data_set, this_node, "peer process is no longer available", FALSE);
/* Everything is running at this point, now check join state */
} else if (do_terminate) {
- pe_fence_node(data_set, this_node, "termination was requested");
+ pe_fence_node(data_set, this_node, "termination was requested", FALSE);
} else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) {
crm_info("Node %s is active", this_node->details->uname);
@@ -1309,7 +1322,7 @@ determine_online_status_fencing(pe_worki
this_node->details->pending = TRUE;
} else {
- pe_fence_node(data_set, this_node, "peer was in an unknown state");
+ pe_fence_node(data_set, this_node, "peer was in an unknown state", FALSE);
crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d",
this_node->details->uname, crm_str(in_cluster), crm_str(is_peer),
crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown);
@@ -1897,7 +1910,7 @@ process_rsc_state(resource_t * rsc, node
if (reason == NULL) {
reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
}
- pe_fence_node(data_set, node, reason);
+ pe_fence_node(data_set, node, reason, FALSE);
}
free(reason);
}
@@ -1919,7 +1932,7 @@ process_rsc_state(resource_t * rsc, node
* but also mark the node as unclean
*/
reason = crm_strdup_printf("%s failed there", rsc->id);
- pe_fence_node(data_set, node, reason);
+ pe_fence_node(data_set, node, reason, FALSE);
free(reason);
break;
@@ -1987,7 +2000,7 @@ process_rsc_state(resource_t * rsc, node
* should result in fencing the remote node.
*/
pe_fence_node(data_set, tmpnode,
- "remote connection is unrecoverable");
+ "remote connection is unrecoverable", FALSE);
}
}
@@ -2988,7 +3001,7 @@ static bool check_operation_expiry(resou
* after unpack_node_loop() is done).
*/
pe_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL,
- data_set);
+ FALSE, data_set);
crm_info("Clearing %s failure will wait until any scheduled "
"fencing of %s completes", task, rsc->id);
Index: pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/utils.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/lib/pengine/utils.c
+++ pacemaker-2.0.1+20190417.13d370ca9/lib/pengine/utils.c
@@ -578,7 +578,7 @@ custom_action(resource_t * rsc, char *ke
if (is_set(action->rsc->flags, pe_rsc_managed)
&& save_action && a_task == stop_rsc
&& action->node->details->unclean == FALSE) {
- pe_fence_node(data_set, action->node, "resource actions are unrunnable");
+ pe_fence_node(data_set, action->node, "resource actions are unrunnable", FALSE);
}
} else if (action->node->details->pending) {
@@ -2188,9 +2188,76 @@ find_unfencing_devices(GListPtr candidat
return matches;
}
+static int
+node_priority_fencing_delay(pe_node_t * node, pe_working_set_t * data_set)
+{
+ int member_count = 0;
+ int online_count = 0;
+ int top_priority = 0;
+ int lowest_priority = 0;
+ GListPtr gIter = NULL;
+
+ // `priority-fencing-delay` is disabled
+ if (data_set->priority_fencing_delay < 0) {
+ return -1;
+ }
+
+ /* No need to delay fencing if the fencing target is not a normal cluster
+ * member, for example if it's a remote node or a guest node. */
+ if (node->details->type != node_member) {
+ return 0;
+ }
+
+ // No need to delay fencing if the fencing target is in our partition
+ if (node->details->online) {
+ return 0;
+ }
+
+ for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
+ pe_node_t *n = gIter->data;
+
+ if (n->details->type != node_member) {
+ continue;
+ }
+
+ member_count++;
+
+ if (n->details->online) {
+ online_count++;
+ }
+
+ if (member_count == 1
+ || n->details->priority > top_priority) {
+ top_priority = n->details->priority;
+ }
+
+ if (member_count == 1
+ || n->details->priority < lowest_priority) {
+ lowest_priority = n->details->priority;
+ }
+ }
+
+ // No need to delay if we have more than half of the cluster members
+ if (online_count > member_count / 2) {
+ return 0;
+ }
+
+ /* All the cluster members have equal priority.
+ * Any configured `pcmk_delay_base/max` will still apply. */
+ if (lowest_priority == top_priority) {
+ return -1;
+ }
+
+ if (node->details->priority < top_priority) {
+ return 0;
+ }
+
+ return data_set->priority_fencing_delay;
+}
action_t *
-pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set)
+pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason,
+ bool priority_delay, pe_working_set_t * data_set)
{
char *op_key = NULL;
action_t *stonith_op = NULL;
@@ -2262,6 +2329,29 @@ pe_fence_op(node_t * node, const char *o
free(op_key);
}
+ if (data_set->priority_fencing_delay >= 0
+
+ /* The caller flagged this as a case where `priority-fencing-delay`
+ * applies; add the `priority-fencing-delay` field at least as an indicator. */
+ && (priority_delay
+
+ /* Recalculate the priority delay when pe_fence_op() is called again
+ * by stage6(), after node priorities have actually been calculated
+ * via native_add_running(). */
+ || g_hash_table_lookup(stonith_op->meta,
+ XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY) != NULL)) {
+
+ /* Add `priority-fencing-delay` to the fencing op even if it is 0 for
+ * the target node, so that it takes precedence over any configured
+ * `pcmk_delay_base/max`.
+ */
+ char *delay_s = crm_itoa(node_priority_fencing_delay(node, data_set));
+
+ g_hash_table_insert(stonith_op->meta,
+ strdup(XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY),
+ delay_s);
+ }
+
if(optional == FALSE && pe_can_fence(data_set, node)) {
pe_action_required(stonith_op, NULL, reason);
} else if(reason && stonith_op->reason == NULL) {
@@ -2287,7 +2377,7 @@ trigger_unfencing(
&& node->details->online
&& node->details->unclean == FALSE
&& node->details->shutdown == FALSE) {
- action_t *unfence = pe_fence_op(node, "on", FALSE, reason, data_set);
+ pe_action_t *unfence = pe_fence_op(node, "on", FALSE, reason, FALSE, data_set);
if(dependency) {
order_actions(unfence, dependency, pe_order_optional);