File make-ha-ocf-reliable.patch of Package rabbitmq-server
From 1bf543bb7b3f36cf4240f8ea57cc34a825dc48bf Mon Sep 17 00:00:00 2001
From: Vincent Untz <vuntz@suse.com>
Date: Wed, 22 Nov 2017 11:41:49 +0100
Subject: [PATCH] Make OCF RA more reliable for old versions of rabbitmq
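
With old versions of rabbitmq, the rabbitmqctl eval call used by get_status()
can fail transiently ("TCP connection succeeded but Erlang distribution
failed") or exit with code 2 even though the diagnostics output reports the
node as up and the 'rabbit' application as running. Retry the call once on the
known transient error, accept the rc=2 case when the diagnostics look fine,
and comment out the node_health_check and partition checks in get_monitor(),
which are not reliable with these versions.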
---
scripts/rabbitmq-server-ha.ocf | 42 +++++++++++++++++++++++++++++-------------
1 file changed, 29 insertions(+), 13 deletions(-)
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index f8e51ae..e1900b2 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -1413,6 +1413,12 @@ get_status() {
body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
rc=$?
+ # Transient failure that happens from time to time with old rabbitmq
+ if echo "$body" | grep "TCP connection succeeded but Erlang distribution failed" > /dev/null 2>&1; then
+ ocf_log info "${LH} re-running command once after transient failure"
+ body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
+ rc=$?
+ fi
pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
beam_running=$?
@@ -1422,6 +1428,13 @@ get_status() {
return $OCF_NOT_RUNNING
# return a generic error, if there were errors and beam is found running
elif [ $rc -ne 0 ] ; then
+ # The call is not fully reliable with old versions of rabbitmq, so check the diagnostics output before giving up
+ if [ $rc -eq 2 ] ; then
+ if echo "$body" | grep "node rabbit@.* up, 'rabbit' application running" > /dev/null 2>&1; then
+ ocf_log info "${LH} temporary failure, but diagnostics tell things are okay"
+ return $OCF_SUCCESS
+ fi
+ fi
ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}"
return $OCF_ERR_GENERIC
fi
@@ -1628,11 +1641,12 @@ get_monitor() {
return $rc
fi
- # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there
- # is some error uncovered by node_health_check
- if ! node_health_check; then
- rc=$OCF_ERR_GENERIC
- fi
+ # Not reliable with an old version of rabbitmq, so commenting this out
+ ## rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there
+ ## is some error uncovered by node_health_check
+ #if ! node_health_check; then
+ # rc=$OCF_ERR_GENERIC
+ #fi
if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
# If we are the master and healthy, perform various
@@ -1640,7 +1654,8 @@ get_monitor() {
# Order a member to restart if something fishy happens with it.
# All cross-node checks MUST happen only here.
- partitions_report="$(partitions_report)"
+ # Not reliable with an old version of rabbitmq, so commenting this out
+ #partitions_report="$(partitions_report)"
for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE); do
# Restart node if we don't consider ourselves clustered with it
@@ -1650,13 +1665,14 @@ get_monitor() {
continue
fi
- # Restart node if it has any unresolved partitions
- node_partitions=$(grep_partitions_report $node "$partitions_report")
- if [ ! -z "$node_partitions" ]; then
- ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions"
- order_node_restart "$node"
- continue
- fi
+ # Not reliable with an old version of rabbitmq, so commenting this out
+ ## Restart node if it has any unresolved partitions
+ #node_partitions=$(grep_partitions_report $node "$partitions_report")
+ #if [ ! -z "$node_partitions" ]; then
+ # ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions"
+ # order_node_restart "$node"
+ # continue
+ #fi
done
fi
--
2.15.1