File pacemaker-pengine-resources-on-remote-nodes-not-restarted-until-necessary.patch of Package pacemaker.8397
commit 3f72a028f9a12a6623a7b88733a6fe7ecc879230
Author: Andrew Beekhof <andrew@beekhof.net>
Date: Wed May 24 11:21:44 2017 +1000
PE: Assume resources on remote nodes do not need to be restarted until absolutely necessary
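
Previously, any action on a resource hosted on a remote node was ordered
after a (re)start of the node's connection resource with
pe_order_implies_then, so recovering the connection forced a restart of
every resource behind it. Split apply_remote_node_ordering() into
apply_container_ordering() for guest (container/VM) nodes and
apply_remote_ordering() for baremetal remote nodes, and apply the
forced-recovery ordering only when the connection is known to be dead,
i.e. has nowhere left to run. Recurring monitors are still restarted
whenever the connection restarts.
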
Index: pacemaker/pengine/allocate.c
===================================================================
--- pacemaker.orig/pengine/allocate.c
+++ pacemaker/pengine/allocate.c
@@ -1726,6 +1726,244 @@ rsc_order_first(resource_t * lh_rsc, ord
extern gboolean update_action(action_t * action);
extern void update_colo_start_chain(action_t * action);
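+/* States of a remote connection resource, as classified in
+ * apply_remote_ordering() below:
+ * unknown - recoverable, but not currently running anywhere
+ * alive - running on a healthy cluster node
+ * resting - its host is unclean/offline or mid-migration; it is
+ * expected to start elsewhere before dependent actions run
+ * dead - nowhere left to run; assume the remote target has failed
+ */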
+enum remote_connection_state
+{
+ remote_state_unknown = 0,
+ remote_state_alive = 1,
+ remote_state_resting = 2,
+ remote_state_dead = 3
+};
+
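+/* A nonzero interval means the action is recurring, e.g. a recurring monitor */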
+static int
+is_recurring_action(action_t *action)
+{
+ const char *interval_s = g_hash_table_lookup(action->meta, XML_LRM_ATTR_INTERVAL);
+ int interval = crm_parse_int(interval_s, "0");
+ if(interval > 0) {
+ return TRUE;
+ }
+ return FALSE;
+}
+
+static void
+apply_container_ordering(action_t *action, pe_working_set_t *data_set)
+{
+ /* VMs are also classified as containers for these purposes... in
+ * that they both involve a 'thing' running on a real or remote
+ * cluster node.
+ *
+ * This allows us to be smarter about the type and extent of
+ * recovery actions required in various scenarios
+ */
+ resource_t *remote_rsc = NULL;
+ resource_t *container = NULL;
+ enum action_tasks task = text2task(action->task);
+
+ if (action->rsc == NULL) {
+ return;
+ }
+
+ CRM_ASSERT(action->node);
+ CRM_ASSERT(is_remote_node(action->node));
+ CRM_ASSERT(action->node->details->remote_rsc);
+
+ remote_rsc = action->node->details->remote_rsc;
+ CRM_ASSERT(remote_rsc);
+
+ container = remote_rsc->container;
+ CRM_ASSERT(container);
+
+ crm_trace("%s %s %s %s %d", action->uuid, action->task, remote_rsc->id, container->id, is_set(container->flags, pe_rsc_failed));
+ switch (task) {
+ case start_rsc:
+ case action_promote:
+ /* Force resource recovery if the container is recovered */
+ custom_action_order(container, generate_op_key(container->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
+
+ /* Wait for the connection resource to be up too */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left, data_set);
+ break;
+ case stop_rsc:
+ if(is_set(container->flags, pe_rsc_failed)) {
+ /* When the container representing a guest node fails,
+ * the stop action for all the resources living in
+ * that container is implied by the container
+ * stopping. This is similar to how fencing operations
+ * work for cluster nodes.
+ */
+ custom_action_order(container, generate_op_key(container->id, RSC_STOP, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
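+ /* Mark the stop as a pseudo-action: it completes implicitly, without anything being executed on the lost node */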
+ pe_set_action_bit(action, pe_action_pseudo);
+ } else {
+ /* Otherwise, ensure the operation happens before the connection is brought down */
+ custom_action_order(action->rsc, NULL, action,
+ remote_rsc, generate_op_key(remote_rsc->id, RSC_STOP, 0), NULL,
+ pe_order_preserve, data_set);
+ }
+ break;
+ case action_demote:
+ if(is_set(container->flags, pe_rsc_failed)) {
+ /* Just like a stop, the demote is implied by the
+ * container having failed/stopped.
+ *
+ * If we really wanted to, we would order the demote
+ * after the stop, IFF the container's current role was
+ * stopped (otherwise we would re-introduce an ordering
+ * loop)
+ */
+ pe_set_action_bit(action, pe_action_pseudo);
+ } else {
+ /* Otherwise, ensure the operation happens before the connection is brought down */
+ custom_action_order(action->rsc, NULL, action,
+ remote_rsc, generate_op_key(remote_rsc->id, RSC_STOP, 0), NULL,
+ pe_order_preserve, data_set);
+ }
+ break;
+ default:
+ /* Wait for the connection resource to be up */
+ if (is_recurring_action(action)) {
+ /* In case we ever get the recovery logic wrong, force
+ * recurring monitors to be restarted, even if only
+ * the connection was re-established
+ */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left | pe_order_implies_then, data_set);
+ } else {
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left, data_set);
+ }
+ break;
+ }
+}
+
+static void
+apply_remote_ordering(action_t *action, pe_working_set_t *data_set)
+{
+ resource_t *remote_rsc = NULL;
+ node_t *cluster_node = NULL;
+ enum action_tasks task = text2task(action->task);
+ enum remote_connection_state state = remote_state_unknown;
+
+ if (action->rsc == NULL) {
+ return;
+ }
+
+ CRM_ASSERT(action->node);
+ CRM_ASSERT(is_remote_node(action->node));
+ CRM_ASSERT(action->node->details->remote_rsc);
+
+ remote_rsc = action->node->details->remote_rsc;
+ CRM_ASSERT(remote_rsc);
+
+ if(remote_rsc->running_on) {
+ cluster_node = remote_rsc->running_on->data;
+ }
+
+ /* If the cluster node hosting the remote connection resource
+ * is unclean or has gone offline, we can't process any operations
+ * on that remote node until the connection starts elsewhere.
+ */
+ if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
+ /* There is nowhere left to run the connection resource.
+ * We must assume the target has failed
+ */
+ state = remote_state_dead;
+
+ } else if (cluster_node == NULL) {
+ /* Connection is recoverable but not currently running anywhere; see if we can recover it first */
+ state = remote_state_unknown;
+
+ } else if(cluster_node->details->unclean == TRUE
+ || cluster_node->details->online == FALSE) {
+ /* Connection is running on a dead node; see if we can recover it first */
+ state = remote_state_resting;
+
+ } else if (g_list_length(remote_rsc->running_on) > 1
+ && remote_rsc->partial_migration_source
+ && remote_rsc->partial_migration_target) {
+ /* We're in the middle of migrating a connection resource, so
+ * wait until the migration completes before performing any
+ * actions.
+ */
+ state = remote_state_resting;
+
+ } else {
+ state = remote_state_alive;
+ }
+
+ crm_trace("%s %s %d", action->uuid, action->task, state);
+ switch (task) {
+ case start_rsc:
+ case action_promote:
+ if(state == remote_state_dead) {
+ /* Wait for the connection resource to be up and force recovery */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set);
+ } else {
+ /* Ensure the connection resource is up and assume everything is as we left it */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left, data_set);
+ }
+ break;
+ case stop_rsc:
+ /* Handle the special case, for remote nodes, where stop actions
+ * need to be ordered after the connection resource starts
+ * somewhere else.
+ */
+ if(state == remote_state_resting) {
+ /* Wait for the connection resource to be up and assume everything is as we left it */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left, data_set);
+ } else {
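+ /* Resource stop must precede the connection stop; pe_order_implies_first means a required connection stop also forces the resource to stop */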
+ custom_action_order(action->rsc, NULL, action,
+ remote_rsc, generate_op_key(remote_rsc->id, RSC_STOP, 0), NULL,
+ pe_order_preserve | pe_order_implies_first, data_set);
+ }
+ break;
+ case action_demote:
+
+ /* If the connection is being torn down, we don't want
+ * to build a constraint between a resource's demotion and
+ * the connection resource starting... because the connection
+ * resource cannot start. The connection might already be up,
+ * but the "start" action would not be allowed, which in turn
+ * would block the demotion of any resources living on the node.
+ */
+
+ if(state == remote_state_resting || state == remote_state_unknown) {
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve, data_set);
+ } /* Otherwise we can rely on the stop ordering */
+ break;
+ default:
+ /* Wait for the connection resource to be up */
+ if (is_recurring_action(action)) {
+ /* In case we ever get the recovery logic wrong, force
+ * recurring monitors to be restarted, even if only
+ * the connection was re-established
+ */
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left | pe_order_implies_then, data_set);
+ } else {
+ custom_action_order(remote_rsc, generate_op_key(remote_rsc->id, RSC_START, 0), NULL,
+ action->rsc, NULL, action,
+ pe_order_preserve | pe_order_runnable_left, data_set);
+ }
+ break;
+ }
+}
+
static void
apply_remote_node_ordering(pe_working_set_t *data_set)
{
@@ -1734,10 +1972,9 @@ apply_remote_node_ordering(pe_working_se
if (is_set(data_set->flags, pe_flag_have_remote_nodes) == FALSE) {
return;
}
+
for (; gIter != NULL; gIter = gIter->next) {
action_t *action = (action_t *) gIter->data;
- resource_t *remote_rsc = NULL;
- resource_t *container = NULL;
if (action->rsc == NULL) {
continue;
@@ -1761,7 +1998,7 @@ apply_remote_node_ordering(pe_working_se
pe_order_optional,
data_set);
- continue;
+ continue;
}
/* detect if the action occurs on a remote node. if so create
@@ -1772,134 +2009,13 @@ apply_remote_node_ordering(pe_working_se
is_remote_node(action->node) == FALSE ||
action->node->details->remote_rsc == NULL ||
is_set(action->flags, pe_action_pseudo)) {
- continue;
- }
-
- remote_rsc = action->node->details->remote_rsc;
- container = remote_rsc->container;
-
- if (safe_str_eq(action->task, "monitor") ||
- safe_str_eq(action->task, "start") ||
- safe_str_eq(action->task, "promote") ||
- safe_str_eq(action->task, "notify") ||
- safe_str_eq(action->task, CRM_OP_LRM_REFRESH) ||
- safe_str_eq(action->task, CRM_OP_CLEAR_FAILCOUNT) ||
- safe_str_eq(action->task, "delete")) {
-
- custom_action_order(remote_rsc,
- generate_op_key(remote_rsc->id, RSC_START, 0),
- NULL,
- action->rsc,
- NULL,
- action,
- pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
- data_set);
+ crm_trace("Nothing required for %s", action->uuid);
- } else if (safe_str_eq(action->task, "demote")) {
+ } else if(action->node->details->remote_rsc->container) {
+ apply_container_ordering(action, data_set);
- /* If the connection is being torn down, we don't want
- * to build a constraint between a resource's demotion and
- * the connection resource starting... because the connection
- * resource can not start. The connection might already be up,
- * but the START action would not be allowed which in turn would
- * block the demotion of any resournces living in the remote-node.
- *
- * In this case, only build the constraint between the demotion and
- * the connection's stop action. This allows the connection and all the
- * resources within the remote-node to be torn down properly. */
- if (remote_rsc->next_role == RSC_ROLE_STOPPED) {
- custom_action_order(action->rsc,
- NULL,
- action,
- remote_rsc,
- generate_op_key(remote_rsc->id, RSC_STOP, 0),
- NULL,
- pe_order_preserve | pe_order_implies_first,
- data_set);
- } else if(container == NULL) {
- custom_action_order(remote_rsc,
- generate_op_key(remote_rsc->id, RSC_START, 0),
- NULL,
- action->rsc,
- NULL,
- action,
- pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
- data_set);
- }
-
- if(container && is_set(container->flags, pe_rsc_failed)) {
- /* Just like a stop, the demote is implied by the
- * container having failed/stopped
- *
- * If we really wanted to we would order the demote
- * after the stop, IFF the containers current role was
- * stopped (otherwise we re-introduce an ordering
- * loop)
- */
- pe_set_action_bit(action, pe_action_pseudo);
- }
-
- } else if (safe_str_eq(action->task, "stop") &&
- container &&
- is_set(container->flags, pe_rsc_failed)) {
-
- /* when the container representing a remote node fails, the stop
- * action for all the resources living in that container is implied
- * by the container stopping. This is similar to how fencing operations
- * work for cluster nodes. */
- pe_set_action_bit(action, pe_action_pseudo);
- custom_action_order(container,
- generate_op_key(container->id, RSC_STOP, 0),
- NULL,
- action->rsc,
- NULL,
- action,
- pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
- data_set);
- } else if (safe_str_eq(action->task, "stop")) {
- gboolean after_start = FALSE;
-
- /* handle special case with baremetal remote where stop actions need to be
- * ordered after the connection resource starts somewhere else. */
- if (is_baremetal_remote_node(action->node)) {
- node_t *cluster_node = remote_rsc->running_on ? remote_rsc->running_on->data : NULL;
-
- /* if the current cluster node a baremetal connection resource
- * is residing on is unclean or went offline we can't process any
- * operations on that remote node until after it starts somewhere else. */
- if (cluster_node == NULL ||
- cluster_node->details->unclean == TRUE ||
- cluster_node->details->online == FALSE) {
- after_start = TRUE;
- } else if (g_list_length(remote_rsc->running_on) > 1 &&
- remote_rsc->partial_migration_source &&
- remote_rsc->partial_migration_target) {
- /* if we're caught in the middle of migrating a connection resource,
- * then we have to wait until after the resource migrates before performing
- * any actions. */
- after_start = TRUE;
- }
- }
-
- if (after_start) {
- custom_action_order(remote_rsc,
- generate_op_key(remote_rsc->id, RSC_START, 0),
- NULL,
- action->rsc,
- NULL,
- action,
- pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
- data_set);
- } else {
- custom_action_order(action->rsc,
- NULL,
- action,
- remote_rsc,
- generate_op_key(remote_rsc->id, RSC_STOP, 0),
- NULL,
- pe_order_preserve | pe_order_implies_first,
- data_set);
- }
+ } else {
+ apply_remote_ordering(action, data_set);
}
}
}