File pacemaker-crmd-track-stonith-fail-counts-on-all-nodes.patch of Package pacemaker.14737
commit 515424f01b8ac5eb8705cecb26a60e17de3a7df6
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Mon Apr 10 15:23:46 2017 -0500
Fix: crmd: track stonith fail counts on all nodes
Previously, the stonith fail count was incremented in
tengine_stonith_callback(), which is called only on the DC. Now, it is
incremented in tengine_stonith_notify() instead, which is called on all nodes,
ensuring the count is correct when a new node takes over as DC.
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
index f0289d461..fd8fe7672 100644
--- a/crmd/crmd_utils.h
+++ b/crmd/crmd_utils.h
@@ -101,6 +101,7 @@ void crmd_join_phase_log(int level);
const char *get_timer_desc(fsa_timer_t * timer);
void st_fail_count_reset(const char * target);
+void st_fail_count_increment(const char *target);
void abort_for_stonith_failure(const char *target, xmlNode *reason);
void crmd_peer_down(crm_node_t *peer, bool full);
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
index 6e306fde2..aa4a1417d 100644
--- a/crmd/te_callbacks.c
+++ b/crmd/te_callbacks.c
@@ -682,8 +682,8 @@ st_fail_count_reset(const char *target)
}
}
-static void
-st_fail_count_increment(const char *target, int rc)
+void
+st_fail_count_increment(const char *target)
{
struct st_fail_rec *rec = NULL;
@@ -793,7 +793,6 @@ tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
call_id, target, pcmk_strerror(rc));
abort_for_stonith_failure(target, NULL);
- st_fail_count_increment(target, rc);
}
update_graph(transition_graph, action);
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index 66b088349..32ddae198 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -259,9 +259,12 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
return;
}
- if (st_event->result == pcmk_ok &&
- safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
- st_fail_count_reset(st_event->target);
+ if (safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
+ if (st_event->result == pcmk_ok) {
+ st_fail_count_reset(st_event->target);
+ } else {
+ st_fail_count_increment(st_event->target);
+ }
}
crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",