File pacemaker-pengine-re-enable-unrecoverable-remote-fencing.patch of Package pacemaker.8397

commit a72ef2be2240aac04b784710ae81c4671a8189cd
Author: Ken Gaillot <kgaillot@redhat.com>
Date:   Sat Aug 19 15:45:27 2017 -0500

    Fix: pengine: re-enable unrecoverable remote fencing
    
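    The fence loop referenced by 00fed62 was due to get_remote_node_state()
    considering a failed remote connection resource unrecoverable, even if it was
    waiting for its reconnect interval to expire before attempting reconnection.

    get_remote_node_state() now returns remote_state_unknown for a failed
    connection that is stopped, has a reconnect interval configured, and whose
    remote node was fenced, so the fencing of unrecoverable remote nodes in
    probe_resources() can be re-enabled.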

Index: pacemaker-1.1.16+20170320.77ea74d/pengine/allocate.c
===================================================================
--- pacemaker-1.1.16+20170320.77ea74d.orig/pengine/allocate.c
+++ pacemaker-1.1.16+20170320.77ea74d/pengine/allocate.c
@@ -891,13 +891,10 @@ probe_resources(pe_working_set_t * data_
             continue;
 
         } else if (node->details->online == FALSE && node->details->remote_rsc) {
-            // TODO figure out why this results in fence loop
-            /*
             enum remote_connection_state state = get_remote_node_state(node);
             if(state == remote_state_failed) {
                 pe_fence_node(data_set, node, "the connection is unrecoverable");
             }
-            */
             continue;
 
         } else if(node->details->online == FALSE) {
@@ -1889,22 +1886,38 @@ get_remote_node_state(pe_node_t *node)
      * on that remote node until after it starts elsewhere.
      */
     if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) {
-        /* There is nowhere left to run the connection resource,
-         * and the resource is in a failed state (either directly
-         * or because it is located on a failed node).
-         *
-         * If there are any resources known to be active on it (stop),
-         * or if there are resources in an unknown state (probe), we
-         * must assume the worst and fence it.
-         */
-        if (is_set(remote_rsc->flags, pe_rsc_failed)) {
-            return remote_state_failed;
-        } else if(cluster_node && cluster_node->details->unclean) {
+        /* The connection resource is not going to run anywhere */
+
+        if (cluster_node && cluster_node->details->unclean) {
+            /* The remote connection is failed because its resource is on a
+             * failed node and can't be recovered elsewhere, so we must fence.
+             */
             return remote_state_failed;
-        } else {
+        }
+
+        if (is_not_set(remote_rsc->flags, pe_rsc_failed)) {
+            /* Connection resource is cleanly stopped */
             return remote_state_stopped;
         }
 
+        /* Connection resource is failed */
+
+        if ((remote_rsc->next_role == RSC_ROLE_STOPPED)
+            && remote_rsc->remote_reconnect_interval
+            && node->details->remote_was_fenced) {
+
+            /* We won't know whether the connection is recoverable until the
+             * reconnect interval expires and we reattempt connection.
+             */
+            return remote_state_unknown;
+        }
+
+        /* The remote connection is in a failed state. If there are any
+         * resources known to be active on it (stop) or in an unknown state
+         * (probe), we must assume the worst and fence it.
+         */
+        return remote_state_failed;
+
     } else if (cluster_node == NULL) {
         /* Connection is recoverable but not currently running anywhere, see if we can recover it first */
         return remote_state_unknown;
openSUSE Build Service is sponsored by