File 0019-High-galera-Backport-patches-from-upstream-bsc-10550.patch of Package resource-agents.5203
From 223d99f2016b187298b0cb4df8c726cf34799423 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristoffer=20Gr=C3=B6nlund?= <krig@koru.se>
Date: Tue, 5 Sep 2017 09:49:53 +0200
Subject: [PATCH 19/21] High: galera: Backport patches from upstream
(bsc#1055017) (bsc#1056635)
* galera: Honor "safe_to_bootstrap" flag in grastate.dat (bsc#1055017)
* galera: Fix instance name in master_exists() (bsc#1056635)
---
heartbeat/galera | 569 +++++++++++++++++++++++++++----------------------------
1 file changed, 278 insertions(+), 291 deletions(-)
diff --git a/heartbeat/galera b/heartbeat/galera
index e4495bec..dc681a47 100755
--- a/heartbeat/galera
+++ b/heartbeat/galera
@@ -32,7 +32,7 @@
# Slave vs Master role:
#
# During the 'Slave' role, galera instances are in read-only mode and
-# will not attempt to connect to the cluster. This role exists as
+# will not attempt to connect to the cluster. This role exists only as
# a means to determine which galera instance is the most up-to-date. The
# most up-to-date node will be used to bootstrap a galera cluster that
# has no current members.
@@ -40,12 +40,9 @@
# The galera instances will only begin to be promoted to the Master role
# once all the nodes in the 'wsrep_cluster_address' connection address
# have entered read-only mode. At that point the node containing the
-# database that is most current will be promoted to Master.
-#
-# Once the first Master instance bootstraps the galera cluster, the
-# other nodes will join the cluster and start synchronizing via SST.
-# They will stay in Slave role as long as the SST is running. Their
-# promotion to Master will happen once synchronization is finished.
+# database that is most current will be promoted to Master. Once the first
+# Master instance bootstraps the galera cluster, the other nodes will be
+# promoted to Master as well.
#
# Example: Create a galera cluster using nodes rhel7-node1 rhel7-node2 rhel7-node3
#
@@ -76,6 +73,8 @@
# in this file
if [ -f "/etc/sysconfig/clustercheck" ]; then
. /etc/sysconfig/clustercheck
+elif [ -f "/etc/default/clustercheck" ]; then
+ . /etc/default/clustercheck
fi
#######################################################################
@@ -206,13 +205,30 @@ The galera cluster address. This takes the form of:
gcomm://node,node,node
Only nodes present in this node list will be allowed to start a galera instance.
-It is expected that the galera node names listed in this address match valid
-pacemaker node names.
+The galera node names listed in this address are expected to match valid
+pacemaker node names. If both names need to differ, you must provide a
+mapping in option cluster_host_map.
</longdesc>
<shortdesc lang="en">Galera cluster address</shortdesc>
<content type="string" default=""/>
</parameter>
+<parameter name="cluster_host_map" unique="0" required="0">
+<longdesc lang="en">
+A mapping of pacemaker node names to galera node names.
+
+To be used when both pacemaker and galera names need to differ,
+(e.g. when galera names map to IP from a specific network interface)
+This takes the form of:
+pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera
+
+where the galera resource started on node pcmk1 would be named
+node.1.galera in the wsrep_cluster_address
+</longdesc>
+<shortdesc lang="en">Pacemaker to Galera name mapping</shortdesc>
+<content type="string" default=""/>
+</parameter>
+
<parameter name="check_user" unique="0" required="0">
<longdesc lang="en">
Cluster check user.
@@ -316,6 +332,27 @@ get_last_commit()
fi
}
+# Delete the local node's "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" reboot attribute.
+clear_safe_to_bootstrap()
+{
+    "${HA_SBIN_DIR}/crm_attribute" -N "$NODENAME" -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -D
+}
+
+# Store $1 as the local node's "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" reboot attribute.
+set_safe_to_bootstrap()
+{
+    "${HA_SBIN_DIR}/crm_attribute" -N "$NODENAME" -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -v "$1"
+}
+
+# Query the safe-to-bootstrap attribute of node $1, or of the local node when
+# $1 is empty or missing. The value goes to stdout; stderr is discarded.
+get_safe_to_bootstrap()
+{
+    local node="${1:-$NODENAME}"
+
+    "${HA_SBIN_DIR}/crm_attribute" -N "$node" -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -Q 2>/dev/null
+}
+
wait_for_sync()
{
local state=$(get_status_variable "wsrep_local_state")
@@ -328,56 +365,6 @@ wait_for_sync()
ocf_log info "Database synced."
}
-set_sync_needed()
-{
- ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -v "true"
-}
-
-clear_sync_needed()
-{
- ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -D
-}
-
-check_sync_needed()
-{
- ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -Q 2>/dev/null
-}
-
-
-# this function is called when attribute sync-needed is set in the CIB
-check_sync_status()
-{
- # if the pidfile is created, mysqld is up and running
- # an IST might still be in progress, check wsrep status
- if [ -e $OCF_RESKEY_pid ]; then
- local cluster_status=$(get_status_variable "wsrep_cluster_status")
- local state=$(get_status_variable "wsrep_local_state")
- local ready=$(get_status_variable "wsrep_ready")
-
- if [ -z "$cluster_status" -o -z "$state" -o -z "$ready" ]; then
- ocf_exit_reason "Unable to retrieve state transfer status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status"
- return $OCF_ERR_GENERIC
- fi
-
- if [ "$cluster_status" != "Primary" ]; then
- ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
- return $OCF_ERR_GENERIC
- fi
-
- if [ "$state" = "4" -a "$ready" = "ON" ]; then
- ocf_log info "local node synced with the cluster"
- # when sync is finished, we are ready to switch to Master
- clear_sync_needed
- set_master_score
- return $OCF_SUCCESS
- fi
- fi
-
- # if we pass here, an IST or SST is still in progress
- ocf_log info "local node syncing"
- return $OCF_SUCCESS
-}
-
is_primary()
{
cluster_status=$(get_status_variable "wsrep_cluster_status")
@@ -420,7 +407,7 @@ master_exists()
return 1
fi
# determine if a master instance is already up and is healthy
- crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1
+ crm_mon --as-xml | grep "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1
return $?
}
@@ -445,6 +432,22 @@ set_master_score()
fi
}
+# Call set_master_score for every node listed in wsrep_cluster_address.
+# Stops at the first galera name that cannot be mapped to a pacemaker node.
+promote_everyone()
+{
+    local node pcmk_node
+    for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do
+        pcmk_node=$(galera_to_pcmk_name "$node")
+        if [ -z "$pcmk_node" ]; then
+            ocf_log err "Could not determine pacemaker node from galera name <${node}>."
+            return
+        fi
+
+        set_master_score "$pcmk_node"
+    done
+}
+
greater_than_equal_long()
{
# there are values we need to compare in this script
@@ -452,17 +455,57 @@ greater_than_equal_long()
echo | awk -v n1="$1" -v n2="$2" '{if (n1>=n2) printf ("true"); else printf ("false");}' | grep -q "true"
}
+# Print the pacemaker name mapped to galera node $1 in cluster_host_map ($1 itself if no map is set).
+galera_to_pcmk_name()
+{
+    if [ -z "$OCF_RESKEY_cluster_host_map" ]; then
+        echo "$1"
+    else
+        echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' -v name="$1" '$2 == (name "") {print $1; exit}'
+    fi
+}
+
+# Print the galera name mapped to pacemaker node $1 in cluster_host_map ($1 itself if no map is set).
+pcmk_to_galera_name()
+{
+    if [ -z "$OCF_RESKEY_cluster_host_map" ]; then
+        echo "$1"
+    else
+        echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' -v name="$1" '$1 == (name "") {print $2; exit}'
+    fi
+}
+
+
detect_first_master()
{
local best_commit=0
- local best_node="$NODENAME"
local last_commit=0
local missing_nodes=0
local nodes=""
local nodes_recovered=""
+ local all_nodes
+ local best_node_gcomm
+ local best_node
+ local safe_to_bootstrap
+
+ all_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ')
+ best_node_gcomm=$(echo "$all_nodes" | sed 's/^.* \(.*\)$/\1/')
+ best_node=$(galera_to_pcmk_name $best_node_gcomm)
+ if [ -z "$best_node" ]; then
+ ocf_log err "Could not determine initial best node from galera name <${best_node_gcomm}>."
+ return
+ fi
# avoid selecting a recovered node as bootstrap if possible
- for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do
+ for node in $all_nodes; do
+ local pcmk_node=$(galera_to_pcmk_name $node)
+ if [ -z "$pcmk_node" ]; then
+ ocf_log err "Could not determine pacemaker node from galera name <${node}>."
+ return
+ else
+ node=$pcmk_node
+ fi
+
if is_no_grastate $node; then
nodes_recovered="$nodes_recovered $node"
else
@@ -471,6 +514,19 @@ detect_first_master()
done
for node in $nodes_recovered $nodes; do
+ safe_to_bootstrap=$(get_safe_to_bootstrap $node)
+
+ if [ "$safe_to_bootstrap" = "1" ]; then
+ # Galera marked the node as safe to bootstrap during shutdown. Let's just
+ # pick it as our bootstrap node.
+ ocf_log info "Node <${node}> is marked as safe to bootstrap."
+ best_node=$node
+
+ # We don't need to wait for the other nodes to report state in this case
+ missing_nodes=0
+ break
+ fi
+
last_commit=$(get_last_commit $node)
if [ -z "$last_commit" ]; then
@@ -501,155 +557,20 @@ detect_first_master()
set_bootstrap_node $best_node
}
-detect_galera_pid()
+detect_safe_to_bootstrap()
{
- ps auxww | grep -v -e "${OCF_RESKEY_binary}" -e grep | grep -qe "--pid-file=$OCF_RESKEY_pid"
-}
+ local safe_to_bootstrap=""
-galera_status()
-{
- local loglevel=$1
- local rc
- local running
-
- if [ -e $OCF_RESKEY_pid ]; then
- mysql_common_status $loglevel
- rc=$?
- else
- # if pidfile is not created, the server may
- # still be starting up, e.g. running SST
- detect_galera_pid
- running=$?
- if [ $running -eq 0 ]; then
- rc=$OCF_SUCCESS
- else
- ocf_log $loglevel "MySQL is not running"
- rc=$OCF_NOT_RUNNING
- fi
+ if [ -f ${OCF_RESKEY_datadir}/grastate.dat ]; then
+ ocf_log info "attempting to read safe_to_bootstrap flag from ${OCF_RESKEY_datadir}/grastate.dat"
+ safe_to_bootstrap=$(sed -n 's/^safe_to_bootstrap:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat)
fi
- return $rc
-}
-
-galera_start_nowait()
-{
- local mysql_extra_params="$1"
- local pid
- local running
-
- ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \
- --pid-file=$OCF_RESKEY_pid \
- --socket=$OCF_RESKEY_socket \
- --datadir=$OCF_RESKEY_datadir \
- --log-error=$OCF_RESKEY_log \
- --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \
- $mysql_extra_params >/dev/null 2>&1 &
- pid=$!
-
- # Spin waiting for the server to be spawned.
- # Let the CRM/LRM time us out if required.
- start_wait=1
- while [ $start_wait = 1 ]; do
- if ! ps $pid > /dev/null 2>&1; then
- wait $pid
- ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation"
- return $OCF_ERR_GENERIC
- fi
- detect_galera_pid
- running=$?
- if [ $running -eq 0 ]; then
- start_wait=0
- else
- ocf_log info "MySQL is not running"
- fi
- sleep 2
- done
-
- return $OCF_SUCCESS
-}
-
-galera_start_local_node()
-{
- local rc
- local extra_opts
- local bootstrap
-
- bootstrap=$(is_bootstrap)
-
- master_exists
- if [ $? -eq 0 ]; then
- # join without bootstrapping
- ocf_log info "Node <${NODENAME}> is joining the cluster"
- extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}"
- elif ocf_is_true $bootstrap; then
- ocf_log info "Node <${NODENAME}> is bootstrapping the cluster"
- extra_opts="--wsrep-cluster-address=gcomm://"
+ if [ "$safe_to_bootstrap" = "1" ] || [ "$safe_to_bootstrap" = "0" ]; then
+ set_safe_to_bootstrap $safe_to_bootstrap
else
- ocf_exit_reason "Failure, Attempted to join cluster of $OCF_RESOURCE_INSTANCE before master node has been detected."
- clear_last_commit
- return $OCF_ERR_GENERIC
+ clear_safe_to_bootstrap
fi
-
- # clear last_commit before we start galera to make sure there
- # won't be discrepency between the cib and galera if this node
- # processes a few transactions and fails before we detect it
- clear_last_commit
-
- mysql_common_prepare_dirs
-
- # At start time, if galera requires a SST rather than an IST, the
- # mysql server's pidfile won't be available until SST finishes,
- # which can be longer than the start timeout. So we only check
- # bootstrap node extensively. Joiner nodes are monitored in the
- # "monitor" op
- if ocf_is_true $bootstrap; then
- # start server and wait until it's up and running
- mysql_common_start "$extra_opts"
- rc=$?
- if [ $rc != $OCF_SUCCESS ]; then
- return $rc
- fi
-
- mysql_common_status info
- rc=$?
-
- if [ $rc != $OCF_SUCCESS ]; then
- ocf_exit_reason "Failed initial monitor action"
- return $rc
- fi
-
- is_readonly
- if [ $? -eq 0 ]; then
- ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration."
- return $OCF_ERR_GENERIC
- fi
-
- is_primary
- if [ $? -ne 0 ]; then
- ocf_exit_reason "Failure. Master instance started, but is not in Primary mode."
- return $OCF_ERR_GENERIC
- fi
-
- clear_bootstrap_node
- # clear attribute no-grastate. if last shutdown was
- # not clean, we cannot be extra-cautious by requesting a SST
- # since this is the bootstrap node
- clear_no_grastate
- else
- # only start server, defer full checks to "monitor" op
- galera_start_nowait "$extra_opts"
- rc=$?
- if [ $rc != $OCF_SUCCESS ]; then
- return $rc
- fi
-
- set_sync_needed
- # attribute no-grastate will be cleared once the joiner
- # has finished syncing and is promoted to Master
- fi
-
- ocf_log info "Galera started"
- return $OCF_SUCCESS
}
detect_last_commit()
@@ -660,13 +581,14 @@ detect_last_commit()
--socket=$OCF_RESKEY_socket \
--datadir=$OCF_RESKEY_datadir \
--user=$OCF_RESKEY_user"
+ local recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat"
last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
local tmp=$(mktemp)
- local tmperr=$(mktemp)
+ chown $OCF_RESKEY_user:$OCF_RESKEY_group $tmp
# if we pass here because grastate.dat doesn't exist,
# try not to bootstrap from this node if possible
@@ -676,33 +598,36 @@ detect_last_commit()
ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'"
- ${OCF_RESKEY_binary} $recover_args --wsrep-recover > $tmp 2> $tmperr
+ ${OCF_RESKEY_binary} $recover_args --wsrep-recover --log-error=$tmp 2>/dev/null
- last_commit="$(cat $tmp | sed -n $recovered_position_regex)"
+ last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
if [ -z "$last_commit" ]; then
# Galera uses InnoDB's 2pc transactions internally. If
# server was stopped in the middle of a replication, the
# recovery may find a "prepared" XA transaction in the
# redo log, and mysql won't recover automatically
- cat $tmperr | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null
- if [ $? -eq 0 ]; then
- # we can only rollback the transaction, but that's OK
- # since the DB will get resynchronized anyway
- ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover"
- ${OCF_RESKEY_binary} $recover_args --wsrep-recover \
- --tc-heuristic-recover=rollback > $tmp 2>/dev/null
+ local recovery_file="$(cat $tmp | sed -n $recovery_file_regex)"
+ if [ -e "$recovery_file" ]; then # quoted: an empty name must fail the test, not degrade to `[ -e ]`
+ grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' "$recovery_file" 2>/dev/null
+ if [ $? -eq 0 ]; then
+ # we can only rollback the transaction, but that's OK
+ # since the DB will get resynchronized anyway
+ ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover"
+ ${OCF_RESKEY_binary} $recover_args --wsrep-recover \
+ --tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null
- last_commit="$(cat $tmp | sed -n $recovered_position_regex)"
- if [ ! -z "$last_commit" ]; then
- ocf_log warn "State recovered. force SST at next restart for full resynchronization"
- rm -f ${OCF_RESKEY_datadir}/grastate.dat
- # try not to bootstrap from this node if possible
- set_no_grastate
+ last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
+ if [ ! -z "$last_commit" ]; then
+ ocf_log warn "State recovered. force SST at next restart for full resynchronization"
+ rm -f ${OCF_RESKEY_datadir}/grastate.dat
+ # try not to bootstrap from this node if possible
+ set_no_grastate
+ fi
fi
fi
fi
- rm -f $tmp $tmperr
+ rm -f $tmp
fi
if [ ! -z "$last_commit" ]; then
@@ -716,35 +641,95 @@ detect_last_commit()
fi
}
+# For galera, promote is really start
galera_promote()
{
local rc
local extra_opts
local bootstrap
-
+ local safe_to_bootstrap
master_exists
+ if [ $? -eq 0 ]; then
+ # join without bootstrapping
+ extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}"
+ else
+ bootstrap=$(is_bootstrap)
+
+ if ocf_is_true $bootstrap; then
+ # The best node for bootstrapping wasn't cleanly shutdown. Allow
+ # bootstrapping anyways ("-i -e", not "-ie": the latter leaves a grastate.date backup)
+ if [ "$(get_safe_to_bootstrap)" = "0" ]; then
+ sed -i -e 's/^\(safe_to_bootstrap:\) 0/\1 1/' "${OCF_RESKEY_datadir}/grastate.dat"
+ fi
+ ocf_log info "Node <${NODENAME}> is bootstrapping the cluster"
+ extra_opts="--wsrep-cluster-address=gcomm://"
+ else
+ ocf_exit_reason "Failure, Attempted to promote Master instance of $OCF_RESOURCE_INSTANCE before bootstrap node has been detected."
+ clear_last_commit
+ return $OCF_ERR_GENERIC
+ fi
+ fi
+
+ galera_monitor
+ if [ $? -eq $OCF_RUNNING_MASTER ]; then
+ if ocf_is_true $bootstrap; then
+ promote_everyone
+ clear_bootstrap_node
+ ocf_log info "boostrap node already up, promoting the rest of the galera instances."
+ fi
+ clear_safe_to_bootstrap
+ clear_last_commit
+ return $OCF_SUCCESS
+ fi
+
+ # last commit/safe_to_bootstrap flag are no longer relevant once promoted
+ clear_last_commit
+ clear_safe_to_bootstrap
+
+ mysql_common_prepare_dirs
+ mysql_common_start "$extra_opts"
+ rc=$?
+ if [ $rc != $OCF_SUCCESS ]; then
+ return $rc
+ fi
+
+ galera_monitor
+ rc=$?
+ if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
+ ocf_exit_reason "Failed initial monitor action"
+ return $rc
+ fi
+
+ is_readonly
+ if [ $? -eq 0 ]; then
+ ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration."
+ return $OCF_ERR_GENERIC
+ fi
+
+ is_primary
if [ $? -ne 0 ]; then
- # promoting the first master will bootstrap the cluster
- if is_bootstrap; then
- galera_start_local_node
- rc=$?
- return $rc
- else
- ocf_exit_reason "Attempted to start the cluster without being a bootstrap node."
- return $OCF_ERR_GENERIC
- fi
+ ocf_exit_reason "Failure. Master instance started, but is not in Primary mode."
+ return $OCF_ERR_GENERIC
+ fi
+
+ if ocf_is_true $bootstrap; then
+ promote_everyone
+ clear_bootstrap_node
+ # clear attribute no-grastate. if last shutdown was
+ # not clean, we cannot be extra-cautious by requesting a SST
+ # since this is the bootstrap node
+ clear_no_grastate
+ ocf_log info "Bootstrap complete, promoting the rest of the galera instances."
else
- # promoting other masters only performs sanity checks
- # as the joining nodes were started during the "monitor" op
- if ! check_sync_needed; then
- # sync is done, clear info about last startup
- clear_no_grastate
- return $OCF_SUCCESS
- else
- ocf_exit_reason "Attempted to promote local node while sync was still needed."
- return $OCF_ERR_GENERIC
- fi
+ # if this is not the bootstrap node, make sure this instance
+ # syncs with the rest of the cluster before promotion returns.
+ wait_for_sync
+ # sync is done, clear info about last startup
+ clear_no_grastate
fi
+
+ ocf_log info "Galera started"
+ return $OCF_SUCCESS
}
galera_demote()
@@ -759,10 +744,18 @@ galera_demote()
# if this node was previously a bootstrap node, that is no longer the case.
clear_bootstrap_node
clear_last_commit
- clear_sync_needed
clear_no_grastate
+ clear_safe_to_bootstrap
+
+ # Clear master score here rather than letting pacemaker do so once
+ # demote finishes. This way a promote cannot take place right
+ # after this demote even if pacemaker is requested to do so. It
+ # will first have to run a start/monitor op, to reprobe the state
+ # of the other galera nodes and act accordingly.
+ clear_master_score
# record last commit for next promotion
+ detect_safe_to_bootstrap
detect_last_commit
rc=$?
return $rc
@@ -771,21 +764,29 @@ galera_demote()
galera_start()
{
local rc
+ local galera_node
- echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME
+ galera_node=$(pcmk_to_galera_name $NODENAME)
+ if [ -z "$galera_node" ]; then
+ ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>."
+ return $OCF_ERR_CONFIGURED
+ fi
+
+ echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node
if [ $? -ne 0 ]; then
- ocf_exit_reason "local node <${NODENAME}> must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>to start this galera instance"
+ ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}> to start this galera instance"
return $OCF_ERR_CONFIGURED
fi
- galera_status info
- if [ $? -ne $OCF_NOT_RUNNING ]; then
+ galera_monitor
+ if [ $? -eq $OCF_RUNNING_MASTER ]; then
ocf_exit_reason "master galera instance started outside of the cluster's control"
return $OCF_ERR_GENERIC
fi
mysql_common_prepare_dirs
+ detect_safe_to_bootstrap
detect_last_commit
rc=$?
if [ $rc -ne $OCF_SUCCESS ]; then
@@ -794,7 +795,8 @@ galera_start()
master_exists
if [ $? -eq 0 ]; then
- ocf_log info "Master instances are already up, local node will join in when started"
+ ocf_log info "Master instances are already up, setting master score so this instance will join galera cluster."
+ set_master_score $NODENAME
else
clear_master_score
detect_first_master
@@ -806,6 +808,7 @@ galera_start()
galera_monitor()
{
local rc
+ local galera_node
local status_loglevel="err"
# Set loglevel to info during probe
@@ -813,29 +816,22 @@ galera_monitor()
status_loglevel="info"
fi
- # Check whether mysql is running or about to start after sync
- galera_status $status_loglevel
+ mysql_common_status $status_loglevel
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
last_commit=$(get_last_commit $NODENAME)
- if [ -n "$last_commit" ];then
+ if [ -n "$last_commit" ]; then
+ # if last commit is set, this instance is considered started in slave mode
rc=$OCF_SUCCESS
-
- if ocf_is_probe; then
- # prevent state change during probe
- return $rc
- fi
-
master_exists
if [ $? -ne 0 ]; then
detect_first_master
else
- # a master instance exists and is healthy.
- # start this node and mark it as "pending sync"
- ocf_log info "cluster is running. start local node to join in"
- galera_start_local_node
- rc=$?
+ # a master instance exists and is healthy, promote this
+ # local read only instance
+ # so it can join the master galera cluster.
+ set_master_score
fi
fi
return $rc
@@ -843,40 +839,31 @@ galera_monitor()
return $rc
fi
- # if we make it here, mysql is running or about to start after sync.
- # Check cluster status now.
+ # if we make it here, mysql is running. Check cluster status now.
+ galera_node=$(pcmk_to_galera_name $NODENAME)
+ if [ -z "$galera_node" ]; then
+ ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>."
+ return $OCF_ERR_CONFIGURED
+ fi
- echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME
+ echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node
if [ $? -ne 0 ]; then
- ocf_exit_reason "local node <${NODENAME}> is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>"
+ ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>"
return $OCF_ERR_GENERIC
fi
- check_sync_needed
+ is_primary
if [ $? -eq 0 ]; then
- # galera running and sync is needed: slave state
+
if ocf_is_probe; then
- # prevent state change during probe
- rc=$OCF_SUCCESS
- else
- check_sync_status
- rc=$?
+ # restore master score during probe
+ # if we detect this is a master instance
+ set_master_score
fi
+ rc=$OCF_RUNNING_MASTER
else
- is_primary
- if [ $? -ne 0 ]; then
- ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
- rc=$OCF_ERR_GENERIC
- else
- # galera running, no need to sync: master state and everything's clear
- rc=$OCF_RUNNING_MASTER
-
- if ocf_is_probe; then
- # restore master score during probe
- # if we detect this is a master instance
- set_master_score
- fi
- fi
+ ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
+ rc=$OCF_ERR_GENERIC
fi
return $rc
@@ -887,12 +874,12 @@ galera_stop()
local rc
# make sure the process is stopped
mysql_common_stop
rc=$?
+ clear_safe_to_bootstrap
clear_last_commit
clear_master_score
clear_bootstrap_node
- clear_sync_needed
clear_no_grastate
return $rc
}
@@ -962,7 +949,7 @@ fi
case "$1" in
start) galera_start;;
stop) galera_stop;;
- status) galera_status err;;
+ status) mysql_common_status err;;
monitor) galera_monitor;;
promote) galera_promote;;
demote) galera_demote;;
--
2.14.1