File bsc#1181744-0004-Fix-fence-history-resync-fence-history-after-fenced-.patch of Package pacemaker.21298

From 03c4455fced74f093deb782198b1ba3076e52015 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 18 Jun 2019 14:12:27 +0200
Subject: [PATCH 4/7] Fix: fence-history: resync fence-history after fenced
 crash

Set up a 30s fallback timer that triggers a local history-sync if the
sync via the DC doesn't happen.
---
 daemons/controld/controld_callbacks.c |  2 +-
 daemons/controld/controld_control.c   |  2 +
 daemons/controld/controld_te_utils.c   | 86 +++++++++++++++++++++++----
 daemons/controld/controld_transition.h   |  3 +-
 4 files changed, 79 insertions(+), 14 deletions(-)

Index: pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_callbacks.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/daemons/controld/controld_callbacks.c
+++ pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_callbacks.c
@@ -210,7 +210,7 @@ peer_update_callback(enum crm_status_typ
 
             } else if(AM_I_DC) {
                 if (appeared) {
-                    te_trigger_stonith_history_sync();
+                    te_trigger_stonith_history_sync(FALSE);
                 } else {
                     erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
                 }
Index: pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_control.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/daemons/controld/controld_control.c
+++ pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_control.c
@@ -118,7 +118,12 @@ do_shutdown(long long action,
         clear_bit(fsa_input_register, R_ST_REQUIRED);
 
         crm_info("Disconnecting from fencer");
-        stonith_api->cmds->disconnect(stonith_api);
+        if (stonith_api->state != stonith_disconnected) {
+            stonith_api->cmds->disconnect(stonith_api);
+        }
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
     }
 }
 
@@ -272,6 +277,8 @@ crmd_exit(crm_exit_t exit_code)
     crm_timer_stop(wait_timer);
     crm_timer_stop(recheck_timer);
 
+    te_cleanup_stonith_history_sync(NULL, TRUE);
+
     free(transition_timer); transition_timer = NULL;
     free(integration_timer); integration_timer = NULL;
     free(finalization_timer); finalization_timer = NULL;
Index: pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_te_utils.c
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/daemons/controld/controld_te_utils.c
+++ pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_te_utils.c
@@ -22,7 +22,33 @@
 
 crm_trigger_t *stonith_reconnect = NULL;
 static crm_trigger_t *stonith_history_sync_trigger = NULL;
-static mainloop_timer_t *stonith_history_sync_timer = NULL;
+static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
+static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
+
+void
+te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
+{
+    if (free_timers) {
+        mainloop_timer_del(stonith_history_sync_timer_short);
+        stonith_history_sync_timer_short = NULL;
+        mainloop_timer_del(stonith_history_sync_timer_long);
+        stonith_history_sync_timer_long = NULL;
+    } else {
+        mainloop_timer_stop(stonith_history_sync_timer_short);
+        mainloop_timer_stop(stonith_history_sync_timer_long);
+    }
+
+    if (st) {
+        st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
+    }
+}
+
+static void
+tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
+{
+    te_cleanup_stonith_history_sync(st, FALSE);
+    crm_debug("Fence-history synced - cancel all timers");
+}
 
 /*
  * stonith cleanup list
@@ -163,6 +189,8 @@ fail_incompletable_stonith(crm_graph_t *
 static void
 tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e)
 {
+    te_cleanup_stonith_history_sync(st, FALSE);
+
     if (is_set(fsa_input_register, R_ST_REQUIRED)) {
         crm_crit("Fencing daemon connection failed");
         mainloop_set_trigger(stonith_reconnect);
@@ -176,11 +204,12 @@ tengine_stonith_connection_destroy(stoni
         /* the client API won't properly reconnect notifications
          * if they are still in the table - so remove them
          */
-        stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT);
-        stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE);
         if (stonith_api->state != stonith_disconnected) {
             stonith_api->cmds->disconnect(st);
         }
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
+        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
     }
 
     if (AM_I_DC) {
@@ -197,6 +226,9 @@ char *te_client_id = NULL;
 #endif
 
 static void
+tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
+
+static void
 tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
 {
     if(te_client_id == NULL) {
@@ -345,6 +377,7 @@ do_stonith_history_sync(gpointer user_da
     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
         stonith_history_t *history = NULL;
 
+        te_cleanup_stonith_history_sync(stonith_api, FALSE);
         stonith_api->cmds->history(stonith_api,
                                    st_opt_sync_call | st_opt_broadcast,
                                    NULL, &history, 5);
@@ -364,11 +397,18 @@ stonith_history_sync_set_trigger(gpointe
 }
 
 void
-te_trigger_stonith_history_sync(void)
+te_trigger_stonith_history_sync(bool long_timeout)
 {
     /* trigger a sync in 5s to give more nodes the
      * chance to show up so that we don't create
      * unnecessary stonith-history-sync traffic
+     *
+     * the long timeout of 30s is a fallback: after a
+     * successful connection to fenced we wait up to
+     * 30s for the DC to trigger a history-sync;
+     * if that doesn't happen, we trigger a sync
+     * locally (e.g. after fenced segfaults and is
+     * restarted by pacemakerd)
      */
 
     /* as we are finally checking the stonith-connection
@@ -382,14 +422,26 @@ te_trigger_stonith_history_sync(void)
                                  do_stonith_history_sync, NULL);
     }
 
-    if(stonith_history_sync_timer == NULL) {
-        stonith_history_sync_timer =
-            mainloop_timer_add("history_sync", 5000,
-                               FALSE, stonith_history_sync_set_trigger,
-                               NULL);
+    if (long_timeout) {
+        if(stonith_history_sync_timer_long == NULL) {
+            stonith_history_sync_timer_long =
+                mainloop_timer_add("history_sync_long", 30000,
+                                   FALSE, stonith_history_sync_set_trigger,
+                                   NULL);
+        }
+        crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
+        mainloop_timer_start(stonith_history_sync_timer_long);
+    } else {
+        if(stonith_history_sync_timer_short == NULL) {
+            stonith_history_sync_timer_short =
+                mainloop_timer_add("history_sync_short", 5000,
+                                   FALSE, stonith_history_sync_set_trigger,
+                                   NULL);
+        }
+        crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
+        mainloop_timer_start(stonith_history_sync_timer_short);
     }
-    crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
-    mainloop_timer_start(stonith_history_sync_timer);
+
 }
 
 gboolean
@@ -437,6 +489,11 @@ te_connect_stonith(gpointer user_data)
 
     stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE,
                                              tengine_stonith_notify);
+    stonith_api->cmds->register_notification(stonith_api,
+                                             T_STONITH_NOTIFY_HISTORY_SYNCED,
+                                             tengine_stonith_history_synced);
+
+    te_trigger_stonith_history_sync(TRUE);
 
     crm_trace("Connected");
     return TRUE;
Index: pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_transition.h
===================================================================
--- pacemaker-2.0.1+20190417.13d370ca9.orig/daemons/controld/controld_transition.h
+++ pacemaker-2.0.1+20190417.13d370ca9/daemons/controld/controld_transition.h
@@ -70,7 +70,8 @@ extern void abort_transition_graph(int a
 
 extern gboolean te_connect_stonith(gpointer user_data);
 
-extern void te_trigger_stonith_history_sync(void);
+extern void te_trigger_stonith_history_sync(bool long_timeout);
+extern void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers);
 
 extern crm_trigger_t *transition_trigger;
 extern crm_trigger_t *stonith_reconnect;
openSUSE Build Service is sponsored by