File pacemaker-crmd-consider-target-checking-stonith-failures.patch of Package pacemaker.14737
commit 3c49a1cf86cb819eca18c841661d90fa65bcb185
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Fri Apr 7 21:03:31 2017 -0500
Low: crmd: consider target when checking stonith failures
Previously, if the crmd aborted a transition due to failure to fence a
particular node, a new transition would not be started if *any* node had
failed to be fenced too many times. Now, only failures of the particular
target are checked in that situation.
Index: pacemaker/crmd/crmd_utils.h
===================================================================
--- pacemaker.orig/crmd/crmd_utils.h
+++ pacemaker/crmd/crmd_utils.h
@@ -97,7 +97,7 @@ void crmd_join_phase_log(int level);
const char *get_timer_desc(fsa_timer_t * timer);
void st_fail_count_reset(const char * target);
-void abort_for_stonith_failure(xmlNode *reason);
+void abort_for_stonith_failure(const char *target, xmlNode *reason);
void crmd_peer_down(crm_node_t *peer, bool full);
/* Convenience macro for registering a CIB callback
Index: pacemaker/crmd/te_callbacks.c
===================================================================
--- pacemaker.orig/crmd/te_callbacks.c
+++ pacemaker/crmd/te_callbacks.c
@@ -622,7 +622,7 @@ struct st_fail_rec {
};
static gboolean
-too_many_st_failures(void)
+too_many_st_failures(const char *target)
{
GHashTableIter iter;
const char *key = NULL;
@@ -632,14 +632,26 @@ too_many_st_failures(void)
return FALSE;
}
- g_hash_table_iter_init(&iter, stonith_failures);
- while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
- if (value->count > 10) {
- crm_warn("Too many failures to fence %s (%d), giving up", key, value->count);
- return TRUE;
+ if (target == NULL) {
+ g_hash_table_iter_init(&iter, stonith_failures);
+ while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
+ if (value->count > 10) {
+ target = (const char*)key;
+ goto too_many;
+ }
+ }
+ } else {
+ value = g_hash_table_lookup(stonith_failures, target);
+ if ((value != NULL) && (value->count > 10)) {
+ goto too_many;
}
}
return FALSE;
+
+too_many:
+ crm_warn("Too many failures (%d) to fence %s, giving up",
+ value->count, target);
+ return TRUE;
}
void
@@ -684,17 +696,18 @@ st_fail_count_increment(const char *targ
* \internal
* \brief Abort transition due to stonith failure
*
- * \param[in] reason Failed stonith action XML, or NULL
+ * \param[in] target Don't restart if this (NULL for any) has too many failures
+ * \param[in] reason Log this stonith action XML as abort reason (or NULL)
*/
void
-abort_for_stonith_failure(xmlNode *reason)
+abort_for_stonith_failure(const char *target, xmlNode *reason)
{
enum transition_action abort_action = tg_restart;
/* If stonith repeatedly fails, we eventually give up on starting a new
* transition for that reason.
*/
- if (too_many_st_failures()) {
+ if (too_many_st_failures(target)) {
abort_action = tg_stop;
}
abort_transition(INFINITY, abort_action, "Stonith failed", reason);
@@ -765,7 +778,7 @@ tengine_stonith_callback(stonith_t * sto
action->failed = TRUE;
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
call_id, target, pcmk_strerror(rc));
- abort_for_stonith_failure(NULL);
+ abort_for_stonith_failure(target, NULL);
st_fail_count_increment(target, rc);
}
Index: pacemaker/crmd/te_utils.c
===================================================================
--- pacemaker.orig/crmd/te_utils.c
+++ pacemaker/crmd/te_utils.c
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t *
if (last_action != NULL) {
crm_warn("STONITHd failure resulted in un-runnable actions");
- abort_for_stonith_failure(last_action);
+ abort_for_stonith_failure(NULL, last_action);
return TRUE;
}