File pacemaker-crmd-check-too-many-stonith-failures.patch of Package pacemaker.14737
commit 100dd5fda476ef526ac1964260252b30864d5ca7
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Fri Apr 7 16:51:29 2017 -0500
Fix: crmd: check for too many stonith failures only when aborting for that reason
Previously, crmd would check for too many stonith failures whenever aborting
a transition. This would lead to a new transition not being triggered when
aborting for some other unrelated reason, such as a configuration change.
Now, crmd checks for too many stonith failures only when aborting due to a new
stonith failure.
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
index a1aaad32d..d2f8eb2ab 100644
--- a/crmd/crmd_utils.h
+++ b/crmd/crmd_utils.h
@@ -100,8 +100,8 @@ int crmd_join_phase_count(enum crm_join_phase phase);
void crmd_join_phase_log(int level);
const char *get_timer_desc(fsa_timer_t * timer);
-gboolean too_many_st_failures(void);
void st_fail_count_reset(const char * target);
+void abort_for_stonith_failure(xmlNode *reason);
void crmd_peer_down(crm_node_t *peer, bool full);
/* Convenience macro for registering a CIB callback
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index a8ad86f94..66dd16ebc 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -726,15 +726,11 @@ notify_crmd(crm_graph_t * graph)
case tg_restart:
type = "restart";
if (fsa_state == S_TRANSITION_ENGINE) {
- if (too_many_st_failures() == FALSE) {
- if (transition_timer->period_ms > 0) {
- crm_timer_stop(transition_timer);
- crm_timer_start(transition_timer);
- } else {
- event = I_PE_CALC;
- }
+ if (transition_timer->period_ms > 0) {
+ crm_timer_stop(transition_timer);
+ crm_timer_start(transition_timer);
} else {
- event = I_TE_SUCCESS;
+ event = I_PE_CALC;
}
} else if (fsa_state == S_POLICY_ENGINE) {
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
index 6c0670c96..a0aa0813f 100644
--- a/crmd/te_callbacks.c
+++ b/crmd/te_callbacks.c
@@ -635,7 +635,7 @@ struct st_fail_rec {
int count;
};
-gboolean
+static gboolean
too_many_st_failures(void)
{
GHashTableIter iter;
@@ -694,6 +694,26 @@ st_fail_count_increment(const char *target, int rc)
}
}
+/*!
+ * \internal
+ * \brief Abort the current transition in response to a stonith failure
+ *
+ * \param[in] reason  XML of the failed stonith action, or NULL
+ */
+void
+abort_for_stonith_failure(xmlNode *reason)
+{
+    /* A stonith failure normally aborts with tg_restart. Once
+     * too_many_st_failures() reports that stonith has failed too often,
+     * abort with tg_stop instead, so that we give up on starting a new
+     * transition for that reason.
+     */
+    const gboolean give_up = too_many_st_failures();
+    enum transition_action how = give_up ? tg_stop : tg_restart;
+
+    abort_transition(INFINITY, how, "Stonith failed", reason);
+}
+
void
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
@@ -759,7 +779,7 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
action->failed = TRUE;
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
call_id, target, pcmk_strerror(rc));
- abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
+ abort_for_stonith_failure(NULL);
st_fail_count_increment(target, rc);
}
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index 3b67afe15..4603307bd 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t * graph)
if (last_action != NULL) {
crm_warn("STONITHd failure resulted in un-runnable actions");
- abort_transition(INFINITY, tg_restart, "Stonith failure", last_action);
+ abort_for_stonith_failure(last_action);
return TRUE;
}