File pacemaker-crmd-consider-target-checking-stonith-failures.patch of Package pacemaker.14737

commit 3c49a1cf86cb819eca18c841661d90fa65bcb185
Author: Ken Gaillot <kgaillot@redhat.com>
Date:   Fri Apr 7 21:03:31 2017 -0500

    Low: crmd: consider target when checking stonith failures
    
    Previously, if the crmd aborted a transition due to failure to fence a
    particular node, a new transition would not be started if fencing of *any*
    node had failed too many times. Now, only fencing failures of that
    particular target are checked in that situation.

Index: pacemaker/crmd/crmd_utils.h
===================================================================
--- pacemaker.orig/crmd/crmd_utils.h
+++ pacemaker/crmd/crmd_utils.h
@@ -97,7 +97,7 @@ void crmd_join_phase_log(int level);
 
 const char *get_timer_desc(fsa_timer_t * timer);
 void st_fail_count_reset(const char * target);
-void abort_for_stonith_failure(xmlNode *reason);
+void abort_for_stonith_failure(const char *target, xmlNode *reason);
 void crmd_peer_down(crm_node_t *peer, bool full);
 
 /* Convenience macro for registering a CIB callback
Index: pacemaker/crmd/te_callbacks.c
===================================================================
--- pacemaker.orig/crmd/te_callbacks.c
+++ pacemaker/crmd/te_callbacks.c
@@ -622,7 +622,7 @@ struct st_fail_rec {
 };
 
 static gboolean
-too_many_st_failures(void)
+too_many_st_failures(const char *target)
 {
     GHashTableIter iter;
     const char *key = NULL;
@@ -632,14 +632,26 @@ too_many_st_failures(void)
         return FALSE;
     }
 
-    g_hash_table_iter_init(&iter, stonith_failures);
-    while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
-        if (value->count > 10) {
-            crm_warn("Too many failures to fence %s (%d), giving up", key, value->count);
-            return TRUE;
+    if (target == NULL) {
+        g_hash_table_iter_init(&iter, stonith_failures);
+        while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
+            if (value->count > 10) {
+                target = (const char*)key;
+                goto too_many;
+            }
+        }
+    } else {
+        value = g_hash_table_lookup(stonith_failures, target);
+        if ((value != NULL) && (value->count > 10)) {
+            goto too_many;
         }
     }
     return FALSE;
+
+too_many:
+    crm_warn("Too many failures (%d) to fence %s, giving up",
+             value->count, target);
+    return TRUE;
 }
 
 void
@@ -684,17 +696,18 @@ st_fail_count_increment(const char *targ
  * \internal
  * \brief Abort transition due to stonith failure
  *
- * \param[in] reason  Failed stonith action XML, or NULL
+ * \param[in] target  Don't restart if this node (NULL = any) has too many failures
+ * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
  */
 void
-abort_for_stonith_failure(xmlNode *reason)
+abort_for_stonith_failure(const char *target, xmlNode *reason)
 {
     enum transition_action abort_action = tg_restart;
 
     /* If stonith repeatedly fails, we eventually give up on starting a new
      * transition for that reason.
      */
-    if (too_many_st_failures()) {
+    if (too_many_st_failures(target)) {
         abort_action = tg_stop;
     }
     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
@@ -765,7 +778,7 @@ tengine_stonith_callback(stonith_t * sto
         action->failed = TRUE;
         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
                    call_id, target, pcmk_strerror(rc));
-        abort_for_stonith_failure(NULL);
+        abort_for_stonith_failure(target, NULL);
         st_fail_count_increment(target, rc);
     }
 
Index: pacemaker/crmd/te_utils.c
===================================================================
--- pacemaker.orig/crmd/te_utils.c
+++ pacemaker/crmd/te_utils.c
@@ -162,7 +162,7 @@ fail_incompletable_stonith(crm_graph_t *
 
     if (last_action != NULL) {
         crm_warn("STONITHd failure resulted in un-runnable actions");
-        abort_for_stonith_failure(last_action);
+        abort_for_stonith_failure(NULL, last_action);
         return TRUE;
     }
 
openSUSE Build Service is sponsored by