File 0002-Dev-ui_sbd-Add-new-crm-sbd-sublevel-jsc-PED-8256.patch of Package crmsh
From a14c9f4b0725de025848597fd423d9a72677ddbc Mon Sep 17 00:00:00 2001
From: xin liang <xliang@suse.com>
Date: Fri, 14 Jun 2024 14:40:31 +0800
Subject: [PATCH 2/4] Dev: ui_sbd: Add new 'crm sbd' sublevel (jsc#PED-8256)
** Motivation
The main configurations for sbd use cases are scattered among sysconfig,
on-disk meta data, CIB, and could even be related to other OS components,
e.g. coredump, SCSI, multipath.
It's desirable to reduce the management complexity among them and to
streamline the workflow for the main use case scenarios.
** Changes include
**** Disk-based SBD scenarios
1. Show usage when syntax error
2. Completion
3. Display SBD related configuration (UC4 in PED-8256)
4. Change the on-disk meta data of the existing sbd disks (UC2.1 in
PED-8256)
5. Add a sbd disk with the existing sbd configuration (UC2.2 in
PED-8256)
6. Remove a sbd disk (UC2.3 in PED-8256)
7. Remove sbd from cluster
8. Replace the storage for a sbd disk (UC2.4 in PED-8256)
9. Display status (focusing on the runtime information only) (UC5 in
PED-8256)
**** Disk-less SBD scenarios
1. Show usage when syntax error (diskless)
2. Completion (diskless)
3. Display SBD related configuration (UC4 in PED-8256, diskless)
4. Manipulate the basic diskless sbd configuration (UC3.1 in PED-8256)
---
crmsh/bootstrap.py | 22 +-
crmsh/constants.py | 3 +
crmsh/ocfs2.py | 2 +-
crmsh/qdevice.py | 8 +-
crmsh/sbd.py | 789 +++++++++++++++++++++++++--------------------
crmsh/ui_root.py | 5 +
crmsh/ui_sbd.py | 445 +++++++++++++++++++++++++
crmsh/utils.py | 23 +-
crmsh/watchdog.py | 22 +-
9 files changed, 938 insertions(+), 381 deletions(-)
create mode 100644 crmsh/ui_sbd.py
diff --git a/crmsh/bootstrap.py b/crmsh/bootstrap.py
index f68ffaa9..9a958c58 100644
--- a/crmsh/bootstrap.py
+++ b/crmsh/bootstrap.py
@@ -216,8 +216,11 @@ class Context(object):
"""
Validate sbd options
"""
+ from .sbd import SBDUtils
if self.sbd_devices and self.diskless_sbd:
utils.fatal("Can't use -s and -S options together")
+ if self.sbd_devices:
+ SBDUtils.verify_sbd_device(self.sbd_devices)
if self.stage == "sbd":
if not self.sbd_devices and not self.diskless_sbd and self.yes_to_all:
utils.fatal("Stage sbd should specify sbd device by -s or diskless sbd by -S option")
@@ -299,7 +302,7 @@ class Context(object):
def init_sbd_manager(self):
from .sbd import SBDManager
- self.sbd_manager = SBDManager(self)
+ self.sbd_manager = SBDManager(bootstrap_context=self)
def detect_platform(self):
"""
@@ -401,7 +404,7 @@ def prompt_for_string(msg, match=None, default='', valid_func=None, prev_value=[
def confirm(msg):
- if _context.yes_to_all:
+ if config.core.force or (_context and _context.yes_to_all):
return True
disable_completion()
rc = logger_utils.confirm(msg)
@@ -411,12 +414,12 @@ def confirm(msg):
def disable_completion():
- if _context.ui_context:
+ if _context and _context.ui_context:
_context.ui_context.disable_completion()
def enable_completion():
- if _context.ui_context:
+ if _context and _context.ui_context:
_context.ui_context.setup_readline()
@@ -501,7 +504,7 @@ def is_online():
return False
# if peer_node is None, this is in the init process
- if _context.cluster_node is None:
+ if not _context or _context.cluster_node is None:
return True
# In join process
# If the joining node is already online but can't find the init node
@@ -1397,7 +1400,7 @@ def init_sbd():
import crmsh.sbd
if _context.stage == "sbd":
crmsh.sbd.clean_up_existing_sbd_resource()
- _context.sbd_manager.sbd_init()
+ _context.sbd_manager.init_and_deploy_sbd()
def init_upgradeutil():
@@ -2785,7 +2788,12 @@ def sync_file(path):
"""
Sync files between cluster nodes
"""
- if _context.skip_csync2:
+ if _context:
+ skip_csync2 = _context.skip_csync2
+ else:
+ skip_csync2 = not ServiceManager().service_is_active(CSYNC2_SERVICE)
+
+ if skip_csync2:
utils.cluster_copy_file(path, nodes=_context.node_list_in_cluster, output=False)
else:
csync2_update(path)
diff --git a/crmsh/constants.py b/crmsh/constants.py
index c49a69b2..8971eff9 100644
--- a/crmsh/constants.py
+++ b/crmsh/constants.py
@@ -448,4 +448,7 @@ DLM_PORT = 21064
# Commands that are deprecated and hidden from UI
HIDDEN_COMMANDS = {'ms'}
+
+PCMK_SERVICE = "pacemaker.service"
+SBD_SERVICE = "sbd.service"
# vim:ts=4:sw=4:et:
diff --git a/crmsh/ocfs2.py b/crmsh/ocfs2.py
index 346cc5c2..6b5414a4 100644
--- a/crmsh/ocfs2.py
+++ b/crmsh/ocfs2.py
@@ -119,7 +119,7 @@ e.g. crm cluster init ocfs2 -o <ocfs2_device>
"""
from . import sbd
if ServiceManager().service_is_enabled("sbd.service"):
- sbd_device_list = sbd.SBDManager.get_sbd_device_from_config()
+ sbd_device_list = sbd.SBDUtils.get_sbd_device_from_config()
for dev in self.ocfs2_devices:
if dev in sbd_device_list:
self._dynamic_raise_error("{} cannot be the same with SBD device".format(dev))
diff --git a/crmsh/qdevice.py b/crmsh/qdevice.py
index 982d7a68..e81fae41 100644
--- a/crmsh/qdevice.py
+++ b/crmsh/qdevice.py
@@ -614,15 +614,15 @@ class QDevice(object):
"""
Adjust SBD_WATCHDOG_TIMEOUT when configuring qdevice and diskless SBD
"""
- from .sbd import SBDManager, SBDTimeout
+ from .sbd import SBDManager, SBDTimeout, SBDUtils
utils.check_all_nodes_reachable()
- self.using_diskless_sbd = SBDManager.is_using_diskless_sbd()
+ self.using_diskless_sbd = SBDUtils.is_using_diskless_sbd()
# add qdevice after diskless sbd started
if self.using_diskless_sbd:
- res = SBDManager.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
+ res = SBDUtils.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
if not res or int(res) < SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE:
sbd_watchdog_timeout_qdevice = SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
- SBDManager.update_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout_qdevice)})
+ SBDManager.update_sbd_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout_qdevice)})
utils.set_property("stonith-timeout", SBDTimeout.get_stonith_timeout())
@qnetd_lock_for_same_cluster_name
diff --git a/crmsh/sbd.py b/crmsh/sbd.py
index d7f569e6..792cf545 100644
--- a/crmsh/sbd.py
+++ b/crmsh/sbd.py
@@ -1,5 +1,6 @@
import os
import re
+import typing
from . import utils, sh
from . import bootstrap
from .bootstrap import SYSCONFIG_SBD, SBD_SYSTEMD_DELAY_START_DIR
@@ -7,6 +8,7 @@ from . import log
from . import constants
from . import corosync
from . import xmlutil
+from . import watchdog
from .service_manager import ServiceManager
from .sh import ShellUtils
@@ -14,10 +16,122 @@ logger = log.setup_logger(__name__)
logger_utils = log.LoggerUtils(logger)
+class SBDUtils:
+ '''
+ Consolidate sbd related utility methods
+ '''
+ @staticmethod
+ def get_sbd_device_metadata(dev, timeout_only=False, remote=None) -> dict:
+ '''
+ Extract metadata from sbd device header
+ '''
+ sbd_info = {}
+ try:
+ out = sh.cluster_shell().get_stdout_or_raise_error(f"sbd -d {dev} dump", remote)
+ except:
+ return sbd_info
+
+ pattern = r"UUID\s+:\s+(\S+)|Timeout\s+\((\w+)\)\s+:\s+(\d+)"
+ matches = re.findall(pattern, out)
+ for uuid, timeout_type, timeout_value in matches:
+ if uuid and not timeout_only:
+ sbd_info["uuid"] = uuid
+ elif timeout_type and timeout_value:
+ sbd_info[timeout_type] = int(timeout_value)
+ return sbd_info
+
+ @staticmethod
+ def get_device_uuid(dev, node=None):
+ '''
+ Get UUID for specific device and node
+ '''
+ res = SBDUtils.get_sbd_device_metadata(dev, remote=node).get("uuid")
+ if not res:
+ raise ValueError(f"Cannot find sbd device UUID for {dev}")
+ return res
+
+ @staticmethod
+ def compare_device_uuid(dev, node_list):
+ '''
+ Compare local sbd device UUID with other node's sbd device UUID
+ '''
+ if not node_list:
+ return
+ local_uuid = SBDUtils.get_device_uuid(dev)
+ for node in node_list:
+ remote_uuid = SBDUtils.get_device_uuid(dev, node)
+ if local_uuid != remote_uuid:
+ raise ValueError(f"Device {dev} doesn't have the same UUID with {node}")
+
+ @staticmethod
+ def verify_sbd_device(dev_list, compare_node_list=[]):
+ if len(dev_list) > SBDManager.SBD_DEVICE_MAX:
+ raise ValueError(f"Maximum number of SBD device is {SBDManager.SBD_DEVICE_MAX}")
+ for dev in dev_list:
+ if not utils.is_block_device(dev):
+ raise ValueError(f"{dev} doesn't look like a block device")
+ SBDUtils.compare_device_uuid(dev, compare_node_list)
+
+ @staticmethod
+ def get_sbd_value_from_config(key):
+ '''
+ Get value from /etc/sysconfig/sbd
+ '''
+ return utils.parse_sysconfig(SYSCONFIG_SBD).get(key)
+
+ @staticmethod
+ def get_sbd_device_from_config():
+ '''
+ Get sbd device list from config
+ '''
+ res = SBDUtils.get_sbd_value_from_config("SBD_DEVICE")
+ return res.split(';') if res else []
+
+ @staticmethod
+ def is_using_diskless_sbd():
+ '''
+ Check if using diskless SBD
+ '''
+ dev_list = SBDUtils.get_sbd_device_from_config()
+ return not dev_list and ServiceManager().service_is_active(constants.SBD_SERVICE)
+
+ @staticmethod
+ def has_sbd_device_already_initialized(dev) -> bool:
+ '''
+ Check if sbd device already initialized
+ '''
+ cmd = "sbd -d {} dump".format(dev)
+ rc, _, _ = ShellUtils().get_stdout_stderr(cmd)
+ return rc == 0
+
+ @staticmethod
+ def no_overwrite_device_check(dev) -> bool:
+ '''
+ Check if device already initialized and ask if need to overwrite
+ '''
+ initialized = SBDUtils.has_sbd_device_already_initialized(dev)
+ return initialized and \
+ not bootstrap.confirm(f"{dev} has already been initialized by SBD, do you want to overwrite it?")
+
+ @staticmethod
+ def check_devices_metadata_consistent(dev_list) -> bool:
+ '''
+ Check if all devices have the same metadata
+ '''
+ consistent = True
+ if len(dev_list) < 2:
+ return consistent
+ for dev in dev_list[1:]:
+ if SBDUtils.get_sbd_device_metadata(dev) != SBDUtils.get_sbd_device_metadata(dev_list[0]):
+ logger.warning(f"Device {dev} doesn't have the same metadata as {dev_list[0]}")
+ consistent = False
+ return consistent
+
+
class SBDTimeout(object):
- """
+ '''
Consolidate sbd related timeout methods and constants
- """
+ '''
STONITH_WATCHDOG_TIMEOUT_DEFAULT = -1
SBD_WATCHDOG_TIMEOUT_DEFAULT = 5
SBD_WATCHDOG_TIMEOUT_DEFAULT_S390 = 15
@@ -25,15 +139,14 @@ class SBDTimeout(object):
QDEVICE_SYNC_TIMEOUT_MARGIN = 5
def __init__(self, context=None):
- """
+ '''
Init function
- """
+ '''
self.context = context
self.sbd_msgwait = None
self.stonith_timeout = None
self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT
self.stonith_watchdog_timeout = self.STONITH_WATCHDOG_TIMEOUT_DEFAULT
- self.sbd_delay_start = None
self.two_node_without_qdevice = False
def initialize_timeout(self):
@@ -44,10 +157,10 @@ class SBDTimeout(object):
self._set_sbd_msgwait()
def _set_sbd_watchdog_timeout(self):
- """
+ '''
Set sbd_watchdog_timeout from profiles.yml if exists
Then adjust it if in s390 environment
- """
+ '''
if "sbd.watchdog_timeout" in self.context.profiles_dict:
self.sbd_watchdog_timeout = int(self.context.profiles_dict["sbd.watchdog_timeout"])
if self.context.is_s390 and self.sbd_watchdog_timeout < self.SBD_WATCHDOG_TIMEOUT_DEFAULT_S390:
@@ -55,10 +168,10 @@ class SBDTimeout(object):
self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT_S390
def _set_sbd_msgwait(self):
- """
+ '''
Set sbd msgwait from profiles.yml if exists
Default is 2 * sbd_watchdog_timeout
- """
+ '''
sbd_msgwait_default = 2 * self.sbd_watchdog_timeout
sbd_msgwait = sbd_msgwait_default
if "sbd.msgwait" in self.context.profiles_dict:
@@ -68,10 +181,25 @@ class SBDTimeout(object):
sbd_msgwait = sbd_msgwait_default
self.sbd_msgwait = sbd_msgwait
+ @classmethod
+ def get_advised_sbd_timeout(cls, diskless=False) -> typing.Tuple[int, int]:
+ '''
+ Get suitable sbd_watchdog_timeout and sbd_msgwait
+ '''
+ ctx = bootstrap.Context()
+ ctx.diskless_sbd = diskless
+ ctx.load_profiles()
+ time_inst = cls(ctx)
+ time_inst.initialize_timeout()
+
+ sbd_watchdog_timeout = time_inst.sbd_watchdog_timeout
+ sbd_msgwait = None if diskless else time_inst.sbd_msgwait
+ return sbd_watchdog_timeout, sbd_msgwait
+
def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self):
- """
+ '''
When using diskless SBD with Qdevice, adjust value of sbd_watchdog_timeout
- """
+ '''
# add sbd after qdevice started
if corosync.is_qdevice_configured() and ServiceManager().service_is_active("corosync-qdevice.service"):
qdevice_sync_timeout = utils.get_qdevice_sync_timeout()
@@ -87,44 +215,42 @@ class SBDTimeout(object):
@staticmethod
def get_sbd_msgwait(dev):
- """
+ '''
Get msgwait for sbd device
- """
- out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev))
- # Format like "Timeout (msgwait) : 30"
- res = re.search("\(msgwait\)\s+:\s+(\d+)", out)
+ '''
+ res = SBDUtils.get_sbd_device_metadata(dev).get("msgwait")
if not res:
- raise ValueError("Cannot get sbd msgwait for {}".format(dev))
- return int(res.group(1))
+ raise ValueError(f"Cannot get sbd msgwait for {dev}")
+ return res
@staticmethod
def get_sbd_watchdog_timeout():
- """
+ '''
Get SBD_WATCHDOG_TIMEOUT from /etc/sysconfig/sbd
- """
- res = SBDManager.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
+ '''
+ res = SBDUtils.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
if not res:
raise ValueError("Cannot get the value of SBD_WATCHDOG_TIMEOUT")
return int(res)
@staticmethod
def get_stonith_watchdog_timeout():
- """
+ '''
For non-bootstrap case, get stonith-watchdog-timeout value from cluster property
- """
+ '''
default = SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT
- if not ServiceManager().service_is_active("pacemaker.service"):
+ if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
return default
value = utils.get_property("stonith-watchdog-timeout")
return int(value.strip('s')) if value else default
def _load_configurations(self):
- """
+ '''
Load necessary configurations for both disk-based/disk-less sbd
- """
+ '''
self.two_node_without_qdevice = utils.is_2node_cluster_without_qdevice()
- dev_list = SBDManager.get_sbd_device_from_config()
+ dev_list = SBDUtils.get_sbd_device_from_config()
if dev_list: # disk-based
self.disk_based = True
self.msgwait = SBDTimeout.get_sbd_msgwait(dev_list[0])
@@ -134,19 +260,19 @@ class SBDTimeout(object):
self.sbd_watchdog_timeout = SBDTimeout.get_sbd_watchdog_timeout()
self.stonith_watchdog_timeout = SBDTimeout.get_stonith_watchdog_timeout()
self.sbd_delay_start_value_expected = self.get_sbd_delay_start_expected() if utils.detect_virt() else "no"
- self.sbd_delay_start_value_from_config = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
+ self.sbd_delay_start_value_from_config = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")
logger.debug("Inspect SBDTimeout: %s", vars(self))
def get_stonith_timeout_expected(self):
- """
+ '''
Get stonith-timeout value for sbd cases, formulas are:
value_from_sbd = 1.2 * (pcmk_delay_max + msgwait) # for disk-based sbd
value_from_sbd = 1.2 * max (stonith_watchdog_timeout, 2*SBD_WATCHDOG_TIMEOUT) # for disk-less sbd
stonith_timeout = max(value_from_sbd, constants.STONITH_TIMEOUT_DEFAULT) + token + consensus
- """
+ '''
if self.disk_based:
value_from_sbd = int(1.2*(self.pcmk_delay_max + self.msgwait))
else:
@@ -163,12 +289,12 @@ class SBDTimeout(object):
return cls_inst.get_stonith_timeout_expected()
def get_sbd_delay_start_expected(self):
- """
+ '''
Get the value for SBD_DELAY_START, formulas are:
SBD_DELAY_START = (token + consensus + pcmk_delay_max + msgwait) # for disk-based sbd
SBD_DELAY_START = (token + consensus + 2*SBD_WATCHDOG_TIMEOUT) # for disk-less sbd
- """
+ '''
token_and_consensus_timeout = corosync.token_and_consensus_timeout()
if self.disk_based:
value = token_and_consensus_timeout + self.pcmk_delay_max + self.msgwait
@@ -178,34 +304,38 @@ class SBDTimeout(object):
@staticmethod
def get_sbd_delay_start_sec_from_sysconfig():
- """
+ '''
Get suitable systemd start timeout for sbd.service
- """
+ '''
# TODO 5ms, 5us, 5s, 5m, 5h are also valid for sbd sysconfig
- value = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
+ value = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")
if utils.is_boolean_true(value):
return 2*SBDTimeout.get_sbd_watchdog_timeout()
return int(value)
@staticmethod
def is_sbd_delay_start():
- """
+ '''
Check if SBD_DELAY_START is not no or not set
- """
- res = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
+ '''
+ res = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")
return res and res != "no"
+ @staticmethod
+ def get_sbd_systemd_start_timeout() -> int:
+ cmd = "systemctl show -p TimeoutStartUSec sbd --value"
+ out = sh.cluster_shell().get_stdout_or_raise_error(cmd)
+ return utils.get_systemd_timeout_start_in_sec(out)
+
def adjust_systemd_start_timeout(self):
- """
+ '''
Adjust start timeout for sbd when set SBD_DELAY_START
- """
- sbd_delay_start_value = SBDManager.get_sbd_value_from_config("SBD_DELAY_START")
+ '''
+ sbd_delay_start_value = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")
if sbd_delay_start_value == "no":
return
- cmd = "systemctl show -p TimeoutStartUSec sbd --value"
- out = sh.cluster_shell().get_stdout_or_raise_error(cmd)
- start_timeout = utils.get_systemd_timeout_start_in_sec(out)
+ start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()
if start_timeout > int(sbd_delay_start_value):
return
@@ -216,15 +346,15 @@ class SBDTimeout(object):
utils.cluster_run_cmd("systemctl daemon-reload")
def adjust_stonith_timeout(self):
- """
+ '''
Adjust stonith-timeout property
- """
+ '''
utils.set_property("stonith-timeout", self.get_stonith_timeout_expected(), conditional=True)
def adjust_sbd_delay_start(self):
- """
+ '''
Adjust SBD_DELAY_START in /etc/sysconfig/sbd
- """
+ '''
expected_value = str(self.sbd_delay_start_value_expected)
config_value = self.sbd_delay_start_value_from_config
if expected_value == config_value:
@@ -232,13 +362,13 @@ class SBDTimeout(object):
if expected_value == "no" \
or (not re.search(r'\d+', config_value)) \
or (int(expected_value) > int(config_value)):
- SBDManager.update_configuration({"SBD_DELAY_START": expected_value})
+ SBDManager.update_sbd_configuration({"SBD_DELAY_START": expected_value})
@classmethod
def adjust_sbd_timeout_related_cluster_configuration(cls):
- """
+ '''
Adjust sbd timeout related configurations
- """
+ '''
cls_inst = cls()
cls_inst._load_configurations()
@@ -249,12 +379,9 @@ class SBDTimeout(object):
cls_inst.adjust_systemd_start_timeout()
-class SBDManager(object):
- """
- Class to manage sbd configuration and services
- """
+class SBDManager:
SYSCONFIG_SBD_TEMPLATE = "/usr/share/fillup-templates/sysconfig.sbd"
- SBD_STATUS_DESCRIPTION = """Configure SBD:
+ SBD_STATUS_DESCRIPTION = '''Configure SBD:
If you have shared storage, for example a SAN or iSCSI target,
you can use it avoid split-brain scenarios by configuring SBD.
This requires a 1 MB partition, accessible to all nodes in the
@@ -262,91 +389,181 @@ class SBDManager(object):
across all nodes in the cluster, so /dev/disk/by-id/* devices
are a good choice. Note that all data on the partition you
specify here will be destroyed.
-"""
- SBD_WARNING = "Not configuring SBD - STONITH will be disabled."
+'''
+ NO_SBD_WARNING = "Not configuring SBD - STONITH will be disabled."
+ DISKLESS_SBD_MIN_EXPECTED_VOTE = 3
DISKLESS_SBD_WARNING = "Diskless SBD requires cluster with three or more nodes. If you want to use diskless SBD for 2-node cluster, should be combined with QDevice."
- PARSE_RE = "[; ]"
- DISKLESS_CRM_CMD = "crm configure property stonith-enabled=true stonith-watchdog-timeout={} stonith-timeout={}"
SBD_RA = "stonith:fence_sbd"
SBD_RA_ID = "stonith-sbd"
+ SBD_DEVICE_MAX = 3
+
+ def __init__(
+ self,
+ device_list_to_init: typing.List[str] | None = None,
+ timeout_dict: typing.Dict[str, int] | None = None,
+ update_dict: typing.Dict[str, str] | None = None,
+ no_overwrite_dev_map: typing.Dict[str, bool] | None = None,
+ diskless_sbd: bool = False,
+ bootstrap_context: bootstrap.Context | None = None
+ ):
+ '''
+ Init function which can be called from crm sbd subcommand or bootstrap
+ '''
+ self.package_installed = utils.package_is_installed("sbd")
+ if not self.package_installed:
+ return
- def __init__(self, context):
- """
- Init function
+ self.device_list_to_init = device_list_to_init or []
+ self.timeout_dict = timeout_dict or {}
+ self.update_dict = update_dict or {}
+ self.diskless_sbd = diskless_sbd
+ self.cluster_is_running = ServiceManager().service_is_active(constants.PCMK_SERVICE)
+ self.bootstrap_context = bootstrap_context
+ self.no_overwrite_dev_map = no_overwrite_dev_map or {}
+
+ # From bootstrap init or join process, override the values
+ if self.bootstrap_context:
+ self.device_list_to_init = self.bootstrap_context.sbd_devices
+ self.diskless_sbd = self.bootstrap_context.diskless_sbd
+ self.cluster_is_running = self.bootstrap_context.cluster_is_running
+
+ def _load_attributes_from_bootstrap(self):
+ if not self.bootstrap_context:
+ return
+ timeout_inst = SBDTimeout(self.bootstrap_context)
+ timeout_inst.initialize_timeout()
+ self.timeout_dict["watchdog"] = timeout_inst.sbd_watchdog_timeout
+ if not self.diskless_sbd:
+ self.timeout_dict["msgwait"] = timeout_inst.sbd_msgwait
+ self.update_dict["SBD_WATCHDOG_TIMEOUT"] = str(timeout_inst.sbd_watchdog_timeout)
+ self.update_dict["SBD_WATCHDOG_DEV"] = watchdog.Watchdog.get_watchdog_device(self.bootstrap_context.watchdog)
+
+ @staticmethod
+ def convert_timeout_dict_to_opt_str(timeout_dict: typing.Dict[str, int]) -> str:
+ timeout_option_map = {
+ "watchdog": "-1",
+ "allocate": "-2",
+ "loop": "-3",
+ "msgwait": "-4"
+ }
+ return ' '.join([f"{timeout_option_map[k]} {v}" for k, v in timeout_dict.items()
+ if k in timeout_option_map])
+
+ def update_configuration(self, new_config=False) -> None:
+ '''
+ Update and sync sbd configuration
+ '''
+ if not self.update_dict:
+ return
+ if new_config:
+ utils.copy_local_file(self.SYSCONFIG_SBD_TEMPLATE, SYSCONFIG_SBD)
+
+ for key, value in self.update_dict.items():
+ logger.info("Update %s in %s: %s", key, SYSCONFIG_SBD, value)
+ utils.sysconfig_set(SYSCONFIG_SBD, **self.update_dict)
+ bootstrap.sync_file(SYSCONFIG_SBD)
+ logger.info("Already synced %s to all nodes", SYSCONFIG_SBD)
+
+ @classmethod
+ def update_sbd_configuration(cls, update_dict: typing.Dict[str, str]) -> None:
+ inst = cls(update_dict=update_dict)
+ inst.update_configuration()
+
+ def initialize_sbd(self):
+ if self.diskless_sbd:
+ logger.info("Configuring diskless SBD")
+ self._warn_diskless_sbd()
+ return
+ elif not all(self.no_overwrite_dev_map.values()):
+ logger.info("Configuring disk-based SBD")
+
+ opt_str = SBDManager.convert_timeout_dict_to_opt_str(self.timeout_dict)
+ shell = sh.cluster_shell()
+ for dev in self.device_list_to_init:
+ # skip if device already initialized and not overwrite
+ if dev in self.no_overwrite_dev_map and self.no_overwrite_dev_map[dev]:
+ continue
+ logger.info("Initializing SBD device %s", dev)
+ cmd = f"sbd {opt_str} -d {dev} create"
+ logger.debug("Running command: %s", cmd)
+ shell.get_stdout_or_raise_error(cmd)
- sbd_devices is provided by '-s' option on init process
- diskless_sbd is provided by '-S' option on init process
- """
- self.sbd_devices_input = context.sbd_devices
- self.diskless_sbd = context.diskless_sbd
- self._sbd_devices = None
- self._watchdog_inst = None
- self._context = context
- self._delay_start = False
- self.timeout_inst = None
- self.no_overwrite_map = {}
- self.no_update_config = False
+ SBDUtils.check_devices_metadata_consistent(self.device_list_to_init)
@staticmethod
- def _get_device_uuid(dev, node=None):
- """
- Get UUID for specific device and node
- """
- out = sh.cluster_shell().get_stdout_or_raise_error("sbd -d {} dump".format(dev), node)
- res = re.search("UUID\s*:\s*(.*)\n", out)
- if not res:
- raise ValueError("Cannot find sbd device UUID for {}".format(dev))
- return res.group(1)
+ def enable_sbd_service():
+ cluster_nodes = utils.list_cluster_nodes() or [utils.this_node()]
+ service_manager = ServiceManager()
- def _compare_device_uuid(self, dev, node_list):
- """
- Compare local sbd device UUID with other node's sbd device UUID
- """
- if not node_list:
+ for node in cluster_nodes:
+ if not service_manager.service_is_enabled(constants.SBD_SERVICE, node):
+ logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
+ service_manager.enable_service(constants.SBD_SERVICE, node)
+
+ @staticmethod
+ def restart_cluster_if_possible():
+ if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
return
- local_uuid = self._get_device_uuid(dev)
- for node in node_list:
- remote_uuid = self._get_device_uuid(dev, node)
- if local_uuid != remote_uuid:
- raise ValueError("Device {} doesn't have the same UUID with {}".format(dev, node))
-
- def _verify_sbd_device(self, dev_list, compare_node_list=[]):
- """
- Verify sbd device
- """
- if len(dev_list) > 3:
- raise ValueError("Maximum number of SBD device is 3")
- for dev in dev_list:
- if not utils.is_block_device(dev):
- raise ValueError("{} doesn't look like a block device".format(dev))
- self._compare_device_uuid(dev, compare_node_list)
+ if xmlutil.CrmMonXmlParser().is_any_resource_running():
+ logger.warning("Resource is running, need to restart cluster service manually on each node")
+ else:
+ logger.info("Restarting cluster service")
+ utils.cluster_run_cmd("crm cluster restart")
+ bootstrap.wait_for_cluster()
+
+ def configure_sbd_resource_and_properties(self):
+ '''
+ Configure stonith-sbd resource and related properties
+ '''
+ if not self.package_installed or \
+ not ServiceManager().service_is_enabled(constants.SBD_SERVICE) or \
+ xmlutil.CrmMonXmlParser().is_resource_configured(self.SBD_RA):
+ return
+ if self.diskless_sbd:
+ utils.set_property("stonith-watchdog-timeout", SBDTimeout.STONITH_WATCHDOG_TIMEOUT_DEFAULT)
+ else:
+ all_device_list = SBDUtils.get_sbd_device_from_config()
+ devices_param_str = f"params devices=\"{','.join(all_device_list)}\""
+ cmd = f"crm configure primitive {self.SBD_RA_ID} {self.SBD_RA} {devices_param_str}"
+ sh.cluster_shell().get_stdout_or_raise_error(cmd)
+ utils.set_property("stonith-enabled", "true")
- def _no_overwrite_check(self, dev):
- """
- Check if device already initialized and if need to overwrite
- """
- return SBDManager.has_sbd_device_already_initialized(dev) and not bootstrap.confirm("SBD is already configured to use {} - overwrite?".format(dev))
+ # continue to adjust properties if cluster is running
+ # should not be called from bootstrap init process
+ if self.cluster_is_running:
+ bootstrap.adjust_properties()
- def _get_sbd_device_interactive(self):
- """
+ def _warn_diskless_sbd(self, peer=None):
+ '''
+ Give warning when configuring diskless sbd
+ '''
+ # When in sbd stage or join process
+ if (self.diskless_sbd and self.cluster_is_running) or peer:
+ vote_dict = utils.get_quorum_votes_dict(peer)
+ expected_vote = int(vote_dict.get('Expected', 0))
+ if expected_vote < self.DISKLESS_SBD_MIN_EXPECTED_VOTE:
+ logger.warning(self.DISKLESS_SBD_WARNING)
+ # When in init process
+ elif self.diskless_sbd:
+ logger.warning(self.DISKLESS_SBD_WARNING)
+
+ def get_sbd_device_interactive(self):
+ '''
Get sbd device on interactive mode
- """
- if self._context.yes_to_all:
- logger.warning(self.SBD_WARNING)
+ '''
+ if self.bootstrap_context.yes_to_all:
+ logger.warning(self.NO_SBD_WARNING)
return
-
logger.info(self.SBD_STATUS_DESCRIPTION)
-
if not bootstrap.confirm("Do you wish to use SBD?"):
- logger.warning(self.SBD_WARNING)
+ logger.warning(self.NO_SBD_WARNING)
return
- configured_dev_list = self._get_sbd_device_from_config()
- for dev in configured_dev_list:
- self.no_overwrite_map[dev] = self._no_overwrite_check(dev)
- if self.no_overwrite_map and all(self.no_overwrite_map.values()):
- self.no_update_config = True
- return configured_dev_list
+ configured_devices = SBDUtils.get_sbd_device_from_config()
+ for dev in configured_devices:
+ self.no_overwrite_dev_map[dev] = SBDUtils.no_overwrite_device_check(dev)
+ if self.no_overwrite_dev_map and all(self.no_overwrite_dev_map.values()):
+ return configured_devices
dev_list = []
dev_looks_sane = False
@@ -356,21 +573,20 @@ class SBDManager(object):
self.diskless_sbd = True
return
- dev_list = utils.re_split_string(self.PARSE_RE, dev)
+ dev_list = utils.re_split_string("[; ]", dev)
try:
- self._verify_sbd_device(dev_list)
- except ValueError as err_msg:
- logger.error(str(err_msg))
+ SBDUtils.verify_sbd_device(dev_list)
+ except ValueError as e:
+ logger.error(e)
continue
-
for dev in dev_list:
- if dev not in self.no_overwrite_map:
- self.no_overwrite_map[dev] = self._no_overwrite_check(dev)
- if self.no_overwrite_map[dev]:
+ if dev not in self.no_overwrite_dev_map:
+ self.no_overwrite_dev_map[dev] = SBDUtils.no_overwrite_device_check(dev)
+ if self.no_overwrite_dev_map[dev]:
if dev == dev_list[-1]:
return dev_list
continue
- logger.warning("All data on {} will be destroyed!".format(dev))
+ logger.warning("All data on %s will be destroyed", dev)
if bootstrap.confirm('Are you sure you wish to use this device?'):
dev_looks_sane = True
else:
@@ -379,250 +595,72 @@ class SBDManager(object):
return dev_list
- def _get_sbd_device(self):
- """
- Get sbd device from options or interactive mode
- """
- dev_list = []
- if self.sbd_devices_input:
- dev_list = self.sbd_devices_input
- self._verify_sbd_device(dev_list)
- for dev in dev_list:
- self.no_overwrite_map[dev] = self._no_overwrite_check(dev)
- if all(self.no_overwrite_map.values()) and dev_list == self._get_sbd_device_from_config():
- self.no_update_config = True
+ def get_sbd_device_from_bootstrap(self):
+ '''
+ Handle sbd device input from 'crm cluster init' with -s or -S option
+ -s is for disk-based sbd
+ -S is for diskless sbd
+ '''
+ # specified sbd device with -s option
+ if self.device_list_to_init:
+ self.update_dict["SBD_DEVICE"] = ';'.join(self.device_list_to_init)
+ # no -s and no -S option
elif not self.diskless_sbd:
- dev_list = self._get_sbd_device_interactive()
- self._sbd_devices = dev_list
-
- def _initialize_sbd(self):
- """
- Initialize SBD parameters according to profiles.yml, or the crmsh defined defaulst as the last resort.
- This covers both disk-based-sbd, and diskless-sbd scenarios.
- For diskless-sbd, set sbd_watchdog_timeout then return;
- For disk-based-sbd, also calculate the msgwait value, then initialize the SBD device.
- """
- msg = ""
- if self.diskless_sbd:
- msg = "Configuring diskless SBD"
- elif not all(self.no_overwrite_map.values()):
- msg = "Initializing SBD"
- if msg:
- logger.info(msg)
- self.timeout_inst = SBDTimeout(self._context)
- self.timeout_inst.initialize_timeout()
- if self.diskless_sbd:
+ self.device_list_to_init = self.get_sbd_device_interactive()
+
+ def init_and_deploy_sbd(self):
+ '''
+ The process of deploying sbd includes:
+ 1. Initialize sbd device
+ 2. Write config file /etc/sysconfig/sbd
+ 3. Enable sbd.service
+ 4. Restart cluster service if possible
+ 5. Configure stonith-sbd resource and related properties
+ '''
+ if not self.package_installed:
return
- opt = "-4 {} -1 {}".format(self.timeout_inst.sbd_msgwait, self.timeout_inst.sbd_watchdog_timeout)
-
- for dev in self._sbd_devices:
- if dev in self.no_overwrite_map and self.no_overwrite_map[dev]:
- continue
- rc, _, err = bootstrap.invoke("sbd {} -d {} create".format(opt, dev))
- if not rc:
- utils.fatal("Failed to initialize SBD device {}: {}".format(dev, err))
-
- def _update_sbd_configuration(self):
- """
- Update /etc/sysconfig/sbd
- """
- if self.no_update_config:
- bootstrap.sync_file(SYSCONFIG_SBD)
- return
-
- utils.copy_local_file(self.SYSCONFIG_SBD_TEMPLATE, SYSCONFIG_SBD)
- sbd_config_dict = {
- "SBD_WATCHDOG_DEV": self._watchdog_inst.watchdog_device_name,
- "SBD_WATCHDOG_TIMEOUT": str(self.timeout_inst.sbd_watchdog_timeout)
- }
- if self._sbd_devices:
- sbd_config_dict["SBD_DEVICE"] = ';'.join(self._sbd_devices)
- utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
- bootstrap.sync_file(SYSCONFIG_SBD)
+ if self.bootstrap_context:
+ self.get_sbd_device_from_bootstrap()
+ if not self.device_list_to_init and not self.diskless_sbd:
+ ServiceManager().disable_service(constants.SBD_SERVICE)
+ return
+ self._load_attributes_from_bootstrap()
- def _get_sbd_device_from_config(self):
- """
- Gets currently configured SBD device, i.e. what's in /etc/sysconfig/sbd
- """
- res = SBDManager.get_sbd_value_from_config("SBD_DEVICE")
- if res:
- return utils.re_split_string(self.PARSE_RE, res)
- else:
- return []
+ self.initialize_sbd()
+ self.update_configuration(new_config=True if self.bootstrap_context else False)
+ SBDManager.enable_sbd_service()
- def _restart_cluster_and_configure_sbd_ra(self):
- """
- Try to configure sbd resource, restart cluster on needed
- """
- if not xmlutil.CrmMonXmlParser().is_any_resource_running():
- logger.info("Restarting cluster service")
- utils.cluster_run_cmd("crm cluster restart")
- bootstrap.wait_for_cluster()
+ if self.cluster_is_running:
+ SBDManager.restart_cluster_if_possible()
self.configure_sbd_resource_and_properties()
- else:
- logger.warning("To start sbd.service, need to restart cluster service manually on each node")
- if self.diskless_sbd:
- cmd = self.DISKLESS_CRM_CMD.format(self.timeout_inst.stonith_watchdog_timeout, SBDTimeout.get_stonith_timeout())
- logger.warning("Then run \"{}\" on any node".format(cmd))
- else:
- self.configure_sbd_resource_and_properties()
-
- def _enable_sbd_service(self):
- """
- Try to enable sbd service
- """
- if self._context.cluster_is_running:
- # in sbd stage, enable sbd.service on cluster wide
- utils.cluster_run_cmd("systemctl enable sbd.service")
- self._restart_cluster_and_configure_sbd_ra()
- else:
- # in init process
- bootstrap.invoke("systemctl enable sbd.service")
-
- def _warn_diskless_sbd(self, peer=None):
- """
- Give warning when configuring diskless sbd
- """
- # When in sbd stage or join process
- if (self.diskless_sbd and self._context.cluster_is_running) or peer:
- vote_dict = utils.get_quorum_votes_dict(peer)
- expected_vote = int(vote_dict['Expected'])
- if (expected_vote < 2 and peer) or (expected_vote < 3 and not peer):
- logger.warning(self.DISKLESS_SBD_WARNING)
- # When in init process
- elif self.diskless_sbd:
- logger.warning(self.DISKLESS_SBD_WARNING)
-
- def sbd_init(self):
- """
- Function sbd_init includes these steps:
- 1. Get sbd device from options or interactive mode
- 2. Initialize sbd device
- 3. Write config file /etc/sysconfig/sbd
- """
- from .watchdog import Watchdog
-
- if not utils.package_is_installed("sbd"):
- return
- self._watchdog_inst = Watchdog(_input=self._context.watchdog)
- self._watchdog_inst.init_watchdog()
- self._get_sbd_device()
- if not self._sbd_devices and not self.diskless_sbd:
- bootstrap.invoke("systemctl disable sbd.service")
- return
- self._warn_diskless_sbd()
- self._initialize_sbd()
- self._update_sbd_configuration()
- self._enable_sbd_service()
-
- def configure_sbd_resource_and_properties(self):
- """
- Configure stonith-sbd resource and related properties
- """
- if not utils.package_is_installed("sbd") or \
- not ServiceManager().service_is_enabled("sbd.service") or \
- xmlutil.CrmMonXmlParser().is_resource_configured(self.SBD_RA):
- return
- shell = sh.cluster_shell()
-
- # disk-based sbd
- if self._get_sbd_device_from_config():
- devices_param_str = f"params devices=\"{','.join(self._sbd_devices)}\""
- cmd = f"crm configure primitive {self.SBD_RA_ID} {self.SBD_RA} {devices_param_str}"
- shell.get_stdout_or_raise_error(cmd)
- utils.set_property("stonith-enabled", "true")
- # disk-less sbd
- else:
- if self.timeout_inst is None:
- self.timeout_inst = SBDTimeout(self._context)
- self.timeout_inst.initialize_timeout()
- cmd = self.DISKLESS_CRM_CMD.format(self.timeout_inst.stonith_watchdog_timeout, constants.STONITH_TIMEOUT_DEFAULT)
- shell.get_stdout_or_raise_error(cmd)
-
- # in sbd stage
- if self._context.cluster_is_running:
- bootstrap.adjust_properties()
def join_sbd(self, remote_user, peer_host):
- """
+ '''
Function join_sbd running on join process only
On joining process, check whether peer node has enabled sbd.service
If so, check prerequisites of SBD and verify sbd device on join node
- """
- from .watchdog import Watchdog
-
- if not utils.package_is_installed("sbd"):
+ '''
+ if not self.package_installed:
return
- if not os.path.exists(SYSCONFIG_SBD) or not ServiceManager().service_is_enabled("sbd.service", peer_host):
- bootstrap.invoke("systemctl disable sbd.service")
+
+ service_manager = ServiceManager()
+ if not os.path.exists(SYSCONFIG_SBD) or not service_manager.service_is_enabled(constants.SBD_SERVICE, peer_host):
+ service_manager.disable_service(constants.SBD_SERVICE)
return
+
+ from .watchdog import Watchdog
self._watchdog_inst = Watchdog(remote_user=remote_user, peer_host=peer_host)
self._watchdog_inst.join_watchdog()
- dev_list = self._get_sbd_device_from_config()
+
+ dev_list = SBDUtils.get_sbd_device_from_config()
if dev_list:
- self._verify_sbd_device(dev_list, [peer_host])
+ SBDUtils.verify_sbd_device(dev_list, [peer_host])
else:
self._warn_diskless_sbd(peer_host)
- logger.info("Got {}SBD configuration".format("" if dev_list else "diskless "))
- bootstrap.invoke("systemctl enable sbd.service")
-
- @classmethod
- def verify_sbd_device(cls):
- """
- This classmethod is for verifying sbd device on a running cluster
- Raise ValueError for exceptions
- """
- inst = cls(bootstrap.Context())
- dev_list = inst._get_sbd_device_from_config()
- if not dev_list:
- raise ValueError("No sbd device configured")
- inst._verify_sbd_device(dev_list, utils.list_cluster_nodes_except_me())
-
- @classmethod
- def get_sbd_device_from_config(cls):
- """
- Get sbd device list from config
- """
- inst = cls(bootstrap.Context())
- return inst._get_sbd_device_from_config()
-
- @classmethod
- def is_using_diskless_sbd(cls):
- """
- Check if using diskless SBD
- """
- inst = cls(bootstrap.Context())
- dev_list = inst._get_sbd_device_from_config()
- if not dev_list and ServiceManager().service_is_active("sbd.service"):
- return True
- return False
-
- @staticmethod
- def update_configuration(sbd_config_dict):
- """
- Update and sync sbd configuration
- """
- utils.sysconfig_set(SYSCONFIG_SBD, **sbd_config_dict)
- bootstrap.sync_file(SYSCONFIG_SBD)
-
- @staticmethod
- def get_sbd_value_from_config(key):
- """
- Get value from /etc/sysconfig/sbd
- """
- conf = utils.parse_sysconfig(SYSCONFIG_SBD)
- res = conf.get(key)
- return res
- @staticmethod
- def has_sbd_device_already_initialized(dev):
- """
- Check if sbd device already initialized
- """
- cmd = "sbd -d {} dump".format(dev)
- rc, _, _ = ShellUtils().get_stdout_stderr(cmd)
- return rc == 0
+ logger.info("Got {}SBD configuration".format("" if dev_list else "diskless "))
+ service_manager.enable_service(constants.SBD_SERVICE)
def clean_up_existing_sbd_resource():
@@ -630,5 +668,40 @@ def clean_up_existing_sbd_resource():
sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(SBDManager.SBD_RA)
if xmlutil.CrmMonXmlParser().is_resource_started(SBDManager.SBD_RA):
for sbd_id in sbd_id_list:
+ logger.info("Stop sbd resource '%s'(%s)", sbd_id, SBDManager.SBD_RA)
utils.ext_cmd("crm resource stop {}".format(sbd_id))
+ logger.info("Remove sbd resource '%s'", ';' .join(sbd_id_list))
utils.ext_cmd("crm configure delete {}".format(' '.join(sbd_id_list)))
+
+
+def enable_sbd_on_cluster():
+    '''
+    Enable constants.SBD_SERVICE on every cluster node
+    where it is not enabled yet
+    '''
+    nodes = utils.list_cluster_nodes()
+    manager = ServiceManager()
+    for node in nodes:
+        # nothing to do for a node which already has the service enabled
+        if manager.service_is_enabled(constants.SBD_SERVICE, node):
+            continue
+        logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
+        manager.enable_service(constants.SBD_SERVICE, node)
+
+
+def disable_sbd_from_cluster():
+    '''
+    Disable SBD from cluster, the process includes:
+    - stop and remove the sbd resource(s)
+    - disable constants.SBD_SERVICE on every node where it is enabled
+    - clean up the stonith related properties when "stonith_admin -L"
+      reports at most one fence device afterwards
+    '''
+    clean_up_existing_sbd_resource()
+
+    nodes = utils.list_cluster_nodes()
+    manager = ServiceManager()
+    for node in nodes:
+        if not manager.service_is_enabled(constants.SBD_SERVICE, node):
+            continue
+        logger.info("Disable %s on node %s", constants.SBD_SERVICE, node)
+        manager.disable_service(constants.SBD_SERVICE, node)
+
+    # after disable sbd.service, check if sbd is the last stonith device
+    stonith_list_out = sh.cluster_shell().get_stdout_or_raise_error("stonith_admin -L")
+    found = re.search("([0-9]+) fence device[s]* found", stonith_list_out)
+    if found is None:
+        return
+    if int(found.group(1)) <= 1:
+        utils.cleanup_stonith_related_properties()
diff --git a/crmsh/ui_root.py b/crmsh/ui_root.py
index 12d0f2e1..19dd5bd1 100644
--- a/crmsh/ui_root.py
+++ b/crmsh/ui_root.py
@@ -33,6 +33,7 @@ from . import ui_ra
from . import ui_resource
from . import ui_script
from . import ui_site
+from . import ui_sbd
class Root(command.UI):
@@ -150,6 +151,10 @@ program.
def do_resource(self):
pass
+ @command.level(ui_sbd.SBD)
+ def do_sbd(self):
+ pass
+
@command.level(ui_script.Script)
@command.help('''Cluster scripts
Cluster scripts can perform cluster-wide configuration,
diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py
new file mode 100644
index 00000000..37d563b0
--- /dev/null
+++ b/crmsh/ui_sbd.py
@@ -0,0 +1,445 @@
+import logging
+import typing
+import re
+import os
+
+from crmsh import sbd
+from crmsh import watchdog
+from crmsh import command
+from crmsh import utils
+from crmsh import bootstrap
+from crmsh import completers
+from crmsh import sh
+from crmsh import xmlutil
+from crmsh import constants
+from crmsh.service_manager import ServiceManager
+from crmsh.bootstrap import SYSCONFIG_SBD
+
+
+logger = logging.getLogger(__name__)
+
+
+def sbd_devices_completer(completed_list: typing.List[str]) -> typing.List[str]:
+    '''
+    Completion for sbd devices: offer the devices from the sbd config
+    which are not part of completed_list yet; offer nothing while
+    constants.SBD_SERVICE is not active
+    '''
+    if not ServiceManager().service_is_active(constants.SBD_SERVICE):
+        return []
+    configured_devices = sbd.SBDUtils.get_sbd_device_from_config() or []
+    return [dev for dev in configured_devices if dev not in completed_list]
+
+
+def sbd_configure_completer(completed_list: typing.List[str]) -> typing.List[str]:
+    '''
+    completion for sbd configure command
+
+    Returns the candidates for the word being completed:
+    - nothing while constants.PCMK_SERVICE is not active
+    - after "show": the show types not typed yet, for the third word only
+    - otherwise: "show" (only while the first argument is still empty),
+      the "<type>-timeout=" and "watchdog-device=" keys not typed yet and,
+      for disk-based sbd, "device=" until SBD_DEVICE_MAX is reached
+    '''
+    service_manager = ServiceManager()
+    if not service_manager.service_is_active(constants.PCMK_SERVICE):
+        return []
+    sbd_service_is_enabled = service_manager.service_is_enabled(constants.SBD_SERVICE)
+    dev_list = sbd.SBDUtils.get_sbd_device_from_config()
+    # Show disk-based sbd configure options
+    # if there are devices in config or sbd.service is not enabled
+    is_diskbased = bool(dev_list) or not sbd_service_is_enabled
+
+    parameters_pool = []
+    # NOTE(review): presumably completed_list[0] is the command name and
+    # completed_list[1] the first argument -- confirm against the completer caller
+    if completed_list[1] == '':
+        parameters_pool = ["show"]
+    elif completed_list[1] == "show":
+        # "show" takes at most one type, so only the third word gets candidates
+        if len(completed_list) == 3:
+            show_types = SBD.SHOW_TYPES if is_diskbased else SBD.DISKLESS_SHOW_TYPES
+            return [t for t in show_types if t not in completed_list]
+        else:
+            return []
+    # no candidates are offered for the value of "device="
+    if completed_list[-1] == "device=":
+        return []
+
+    timeout_types = SBD.TIMEOUT_TYPES if is_diskbased else SBD.DISKLESS_TIMEOUT_TYPES
+    parameters_pool.extend([f"{t}-timeout=" for t in timeout_types])
+    parameters_pool.append("watchdog-device=")
+    # drop every key which already starts one of the typed words
+    parameters_pool = [
+        p
+        for p in parameters_pool
+        if not any(c.startswith(p) for c in completed_list)
+    ]
+
+    # "device=" may be repeated, up to SBD_DEVICE_MAX times
+    if is_diskbased:
+        dev_count = sum(1 for c in completed_list if c.startswith("device="))
+        if dev_count < sbd.SBDManager.SBD_DEVICE_MAX:
+            parameters_pool.append("device=")
+
+    return parameters_pool
+
+
+class SBD(command.UI):
+ '''
+ Class for sbd sub-level
+
+ Includes commands:
+ - sbd configure
+ - sbd remove
+ - sbd status
+ '''
+ name = "sbd"
+ TIMEOUT_TYPES = ("watchdog", "allocate", "loop", "msgwait")
+ DISKLESS_TIMEOUT_TYPES = ("watchdog",)
+ SHOW_TYPES = ("disk_metadata", "sysconfig", "property")
+ DISKLESS_SHOW_TYPES = ("sysconfig", "property")
+ SYNCED_INFO = f"Already synced {SYSCONFIG_SBD} to all nodes"
+ RESTART_INFO = "Requires to restart cluster service to take effect"
+ PCMK_ATTRS = (
+ "have-watchdog",
+ "stonith-timeout",
+ "stonith-watchdog-timeout",
+ "stonith-enabled",
+ "priority-fencing-delay",
+ "pcmk_delay_max"
+ )
+    # Pattern for one command line argument, six groups:
+    # (device key, device value, key, key suffix, value, standalone device path)
+    # The value classes include '-', '.' and ':' because persistent device
+    # names such as /dev/disk/by-id/<id> or /dev/disk/by-path/<path> use them;
+    # without these characters such a path would only be matched up to the
+    # first unsupported character.
+    PARSE_RE = re.compile(
+        # Match "device" key with any value, including empty
+        r'(device)=("[^"]*"|[\w/.:;-]*)'
+        # Match other keys with non-empty values, capturing possible suffix
+        r'|(\w+)(?:-(\w+))?=("[^"]+"|[\w/.:;-]+)'
+        # Match standalone device path
+        r'|(/dev/[\w/.:-]+)'
+    )
+
+ class SyntaxError(Exception):
+ pass
+
+    def __init__(self):
+        command.UI.__init__(self)
+
+        # devices currently listed in the sbd config
+        self.device_list_from_config = sbd.SBDUtils.get_sbd_device_from_config()
+        # metadata (timeout_only=True) of the first configured device;
+        # stays empty when no device is configured
+        self.device_meta_dict_runtime = {}
+        if self.device_list_from_config:
+            self.device_meta_dict_runtime = sbd.SBDUtils.get_sbd_device_metadata(self.device_list_from_config[0], timeout_only=True)
+        try:
+            self.watchdog_timeout_from_config = sbd.SBDTimeout.get_sbd_watchdog_timeout()
+        except Exception:
+            # best effort: no usable watchdog timeout in the config.
+            # Not a bare "except:", so that KeyboardInterrupt and SystemExit
+            # are not swallowed here.
+            self.watchdog_timeout_from_config = None
+        self.watchdog_device_from_config = watchdog.Watchdog.get_watchdog_device_from_sbd_config()
+
+        self.service_manager = ServiceManager()
+        self.cluster_shell = sh.cluster_shell()
+        # fall back to the local node when no cluster node is reported
+        self.cluster_nodes = utils.list_cluster_nodes() or [utils.this_node()]
+
+    def _pre_check(self, need_sbd_service=False) -> bool:
+        '''
+        Check the prerequisites of a sub-command, in this order:
+        constants.PCMK_SERVICE is active, the sbd package is installed and,
+        only when need_sbd_service is set, constants.SBD_SERVICE is active
+
+        Log an error for the first failed check and return False;
+        return True when all checks pass
+        '''
+        failure = None
+        if not self.service_manager.service_is_active(constants.PCMK_SERVICE):
+            failure = ("%s is not active", constants.PCMK_SERVICE)
+        elif not utils.package_is_installed("sbd"):
+            failure = ("sbd is not installed",)
+        elif need_sbd_service and not self.service_manager.service_is_active(constants.SBD_SERVICE):
+            failure = ("%s is not active", constants.SBD_SERVICE)
+
+        if failure is not None:
+            logger.error(*failure)
+            return False
+        return True
+
+ @property
+    def configure_usage(self) -> str:
+        '''
+        Usage text of the sbd configure command
+
+        The disk-based timeout and show types are listed when devices are
+        found in the config, the diskless ones otherwise
+        '''
+        if self.device_list_from_config:
+            timeout_types, show_types = self.TIMEOUT_TYPES, self.SHOW_TYPES
+        else:
+            timeout_types, show_types = self.DISKLESS_TIMEOUT_TYPES, self.DISKLESS_SHOW_TYPES
+        timeout_usage_str = " ".join(f"[{t}-timeout=<integer>]" for t in timeout_types)
+        show_usage_str = f"[{'|'.join(show_types)}]"
+        return ("Usage:\n"
+                f"crm sbd configure show {show_usage_str}\n"
+                f"crm sbd configure [device=<dev>]... [watchdog-device=<dev>] {timeout_usage_str}\n")
+
+ @staticmethod
+    def _show_sysconfig() -> None:
+        '''
+        Print the settings found in SYSCONFIG_SBD, one stripped line each;
+        empty lines and comment lines are skipped
+        '''
+        with open(SYSCONFIG_SBD) as f:
+            for line in f:
+                stripped = line.strip()
+                # test the stripped line, so that a comment with leading
+                # whitespace is skipped as well
+                if stripped and not stripped.startswith("#"):
+                    print(stripped)
+
+    def _show_disk_metadata(self) -> None:
+        '''
+        Print the output of "sbd -d <device> dump" for every device from
+        the config, each followed by an empty line
+        '''
+        for device in self.device_list_from_config:
+            dump_output = self.cluster_shell.get_stdout_or_raise_error(f"sbd -d {device} dump")
+            print(dump_output)
+            print()
+
+    def _show_property(self) -> None:
+        '''
+        Print every "<name>=<value>" pair from the output of
+        "crm configure show" whose name is listed in PCMK_ATTRS,
+        followed by the systemd start timeout as TimeoutStartUSec
+        '''
+        if self.service_manager.service_is_active(constants.PCMK_SERVICE):
+            cmd = "crm configure show"
+        else:
+            # pacemaker is not active: read the CIB file instead and
+            # print nothing at all when that file does not exist
+            cib_path = os.getenv("CIB_file", constants.CIB_RAW_FILE)
+            if not os.path.exists(cib_path):
+                return
+            cmd = f"CIB_file={cib_path} crm configure show"
+        out = self.cluster_shell.get_stdout_or_raise_error(cmd)
+        # raw f-string: the backslash of "\S" has to reach the regex engine,
+        # in a plain string literal it is an invalid escape sequence
+        regex = rf"({'|'.join(self.PCMK_ATTRS)})=(\S+)"
+        for name, value in re.findall(regex, out):
+            print(f"{name}={value}")
+        systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout()
+        print(f"TimeoutStartUSec={systemd_start_timeout}")
+
+    def _configure_show(self, args) -> bool:
+        '''
+        Handle "sbd configure show [<type>]"; args[0] is "show"
+
+        Without a type the disk metadata, the sysconfig and the properties
+        are shown in this order. Raise self.SyntaxError for more than one
+        type or for an unknown type. Return True otherwise
+        '''
+        if len(args) > 2:
+            raise self.SyntaxError("Invalid argument")
+
+        if len(args) < 2:
+            self._show_disk_metadata()
+            if self.device_list_from_config:
+                print()
+            SBD._show_sysconfig()
+            print()
+            self._show_property()
+            return True
+
+        show_type = args[1]
+        if show_type == "disk_metadata":
+            self._show_disk_metadata()
+        elif show_type == "sysconfig":
+            SBD._show_sysconfig()
+        elif show_type == "property":
+            self._show_property()
+        else:
+            raise self.SyntaxError(f"Unknown argument: {args[1]}")
+        return True
+
+    def _parse_args(self, args: typing.List[str]) -> dict[str, int|str|list[str]]:
+        '''
+        Parse arguments and verify them
+
+        Possible arguments format like:
+        device="/dev/sdb5;/dev/sda6"
+        device="" watchdog-timeout=10
+        /dev/sda5 watchdog-timeout=10 watchdog-device=/dev/watchdog
+        device=/dev/sdb5 device=/dev/sda6 watchdog-timeout=10 msgwait-timeout=20
+
+        Return a dict with these keys:
+        - "device-list": the given devices; the key is absent when an empty
+          "device=" was given while no device had been collected yet
+          (the diskless case)
+        - one of TIMEOUT_TYPES for each given "<type>-timeout", as int
+        - "watchdog-device": the result of Watchdog.get_watchdog_device()
+          for the given watchdog device, always set
+
+        Raise self.SyntaxError for an argument which does not match PARSE_RE
+        completely, for an unknown key or for a non-numeric timeout
+        '''
+        parameter_dict = {"device-list": []}
+
+        for arg in args:
+            # fullmatch instead of match: an argument with a tail the pattern
+            # does not cover is rejected, rather than used in truncated form
+            match = self.PARSE_RE.fullmatch(arg)
+            if not match:
+                raise self.SyntaxError(f"Invalid argument: {arg}")
+            device_key, device_value, key, suffix, value, device_path = match.groups()
+
+            # device=<device name> parameter
+            if device_key:
+                # drop the optional double quotes and empty list entries
+                devices = [dev for dev in device_value.strip('"').split(";") if dev]
+                if devices:
+                    parameter_dict.setdefault("device-list", []).extend(devices)
+                # explicitly set empty value, stands for diskless sbd
+                elif not parameter_dict.get("device-list"):
+                    parameter_dict.pop("device-list", None)
+            # standalone device parameter
+            elif device_path:
+                parameter_dict.setdefault("device-list", []).append(device_path)
+            # timeout related parameters
+            elif key in self.TIMEOUT_TYPES and suffix == "timeout":
+                value = value.strip('"')
+                if not value.isdigit():
+                    raise self.SyntaxError(f"Invalid timeout value: {value}")
+                parameter_dict[key] = int(value)
+            # watchdog device parameter
+            elif key == "watchdog" and suffix == "device":
+                parameter_dict["watchdog-device"] = value.strip('"')
+            else:
+                raise self.SyntaxError(f"Unknown argument: {arg}")
+
+        # called with None when no watchdog device was given
+        watchdog_device = parameter_dict.get("watchdog-device")
+        parameter_dict["watchdog-device"] = watchdog.Watchdog.get_watchdog_device(watchdog_device)
+
+        logger.debug("Parsed arguments: %s", parameter_dict)
+        return parameter_dict
+
+ @staticmethod
+    def _adjust_timeout_dict(timeout_dict: dict, diskless: bool = False) -> dict:
+        '''
+        Complete timeout_dict in place and return it
+
+        A missing "watchdog" timeout is set to the advised value. For the
+        disk-based case a missing "msgwait" timeout is set to 2*watchdog,
+        and a given one below 2*watchdog triggers a warning
+        '''
+        if not timeout_dict.get("watchdog"):
+            advised_watchdog, _ = sbd.SBDTimeout.get_advised_sbd_timeout(diskless)
+            logger.info("No watchdog timeout specified, use advised value: %s", advised_watchdog)
+            timeout_dict["watchdog"] = advised_watchdog
+
+        # diskless sbd has no msgwait timeout
+        if diskless:
+            return timeout_dict
+
+        doubled_watchdog = 2*timeout_dict["watchdog"]
+        if not timeout_dict.get("msgwait"):
+            logger.info("No msgwait timeout specified, use 2*watchdog timeout: %s", doubled_watchdog)
+            timeout_dict["msgwait"] = doubled_watchdog
+        elif timeout_dict["msgwait"] < doubled_watchdog:
+            logger.warning("It's recommended to set msgwait timeout >= 2*watchdog timeout")
+
+        return timeout_dict
+
+    def _configure_diskbase(self, parameter_dict: dict):
+        '''
+        Configure disk-based SBD based on input parameters and runtime config
+
+        parameter_dict is the result of _parse_args(). Raise self.SyntaxError
+        when no device is given and none is configured, or when a device is
+        given more than once
+        '''
+        update_dict = {}
+        device_list = parameter_dict.get("device-list", [])
+        if not device_list and not self.device_list_from_config:
+            raise self.SyntaxError("No device specified")
+        if len(device_list) > len(set(device_list)):
+            raise self.SyntaxError("Duplicate device")
+        watchdog_device = parameter_dict.get("watchdog-device")
+        if watchdog_device != self.watchdog_device_from_config:
+            update_dict["SBD_WATCHDOG_DEV"] = watchdog_device
+        timeout_dict = {k: v for k, v in parameter_dict.items() if k in self.TIMEOUT_TYPES}
+
+        # configured devices first, then the given ones, without duplicates
+        all_device_list = list(
+            dict.fromkeys(self.device_list_from_config + device_list)
+        )
+        sbd.SBDUtils.verify_sbd_device(all_device_list)
+
+        # keep the order of the command line; a set difference would make
+        # the order of the devices to initialize arbitrary
+        new_device_list = [
+            dev for dev in device_list if dev not in self.device_list_from_config
+        ]
+        no_overwrite_dev_map : dict[str, bool] = {
+            dev: sbd.SBDUtils.no_overwrite_device_check(dev) for dev in new_device_list
+        }
+        if new_device_list:
+            update_dict["SBD_DEVICE"] = ";".join(all_device_list)
+
+        device_list_to_init = []
+        # initialize new devices only if no timeout parameter specified or timeout parameter is already in runtime config
+        if not timeout_dict or utils.is_subdict(timeout_dict, self.device_meta_dict_runtime):
+            device_list_to_init = new_device_list
+        # initialize all devices
+        else:
+            device_list_to_init = all_device_list
+
+        # merge runtime timeout dict with new timeout dict
+        timeout_dict = self.device_meta_dict_runtime | timeout_dict
+        # adjust watchdog and msgwait timeout
+        timeout_dict = self._adjust_timeout_dict(timeout_dict)
+        watchdog_timeout = timeout_dict.get("watchdog")
+        if watchdog_timeout != self.watchdog_timeout_from_config:
+            update_dict["SBD_WATCHDOG_TIMEOUT"] = str(watchdog_timeout)
+
+        sbd_manager = sbd.SBDManager(
+            device_list_to_init=device_list_to_init,
+            timeout_dict=timeout_dict,
+            update_dict=update_dict,
+            no_overwrite_dev_map=no_overwrite_dev_map
+        )
+        sbd_manager.init_and_deploy_sbd()
+
+    def _configure_diskless(self, parameter_dict: dict):
+        '''
+        Configure diskless SBD from the parsed parameters: only the values
+        which differ from the current config are handed to SBDManager
+        '''
+        parameter_dict = self._adjust_timeout_dict(parameter_dict, diskless=True)
+
+        update_dict = {}
+        new_watchdog_timeout = parameter_dict.get("watchdog")
+        if new_watchdog_timeout and new_watchdog_timeout != self.watchdog_timeout_from_config:
+            update_dict["SBD_WATCHDOG_TIMEOUT"] = str(new_watchdog_timeout)
+        new_watchdog_device = parameter_dict.get("watchdog-device")
+        if new_watchdog_device != self.watchdog_device_from_config:
+            update_dict["SBD_WATCHDOG_DEV"] = new_watchdog_device
+
+        sbd.SBDManager(update_dict=update_dict, diskless_sbd=True).init_and_deploy_sbd()
+
+ @command.completers_repeating(sbd_configure_completer)
+    def do_configure(self, context, *args) -> bool:
+        '''
+        Implement sbd configure command
+        '''
+        # NOTE(review): the pre-check is commented out, so this command also
+        # runs while pacemaker is not active -- confirm this is intended
+        #if not self._pre_check():
+        #    return False
+
+        try:
+            if not args:
+                raise self.SyntaxError("No argument")
+
+            if args[0] == "show":
+                return self._configure_show(args)
+
+            parameter_dict = self._parse_args(args)
+            # NOTE: neither _configure_diskbase() nor _configure_diskless()
+            # has a return statement, so None is returned on both paths
+            # disk-based sbd case
+            if "device-list" in parameter_dict:
+                return self._configure_diskbase(parameter_dict)
+            # diskless sbd case
+            else:
+                return self._configure_diskless(parameter_dict)
+
+        # a syntax error is reported together with the usage text
+        except self.SyntaxError as e:
+            logger.error(str(e))
+            print(self.configure_usage)
+            return False
+
+ @command.completers_repeating(sbd_devices_completer)
+    def do_remove(self, context, *args) -> bool:
+        '''
+        Implement sbd remove command
+        '''
+        if not self._pre_check(need_sbd_service=True):
+            return False
+
+        # report a malformed argument as an error, like do_configure does,
+        # instead of letting self.SyntaxError escape from the command
+        try:
+            parameter_dict = self._parse_args(args)
+        except self.SyntaxError as e:
+            logger.error(str(e))
+            return False
+
+        dev_list = parameter_dict.get("device-list", [])
+        if dev_list:
+            if not self.device_list_from_config:
+                logger.error("No sbd device found in config")
+                return False
+            for dev in dev_list:
+                if dev not in self.device_list_from_config:
+                    logger.error("Device %s is not in config", dev)
+                    return False
+            # devices which stay in the config, in their configured order
+            # (a set difference would write them back in arbitrary order)
+            remaining_dev_list = [
+                dev for dev in self.device_list_from_config if dev not in dev_list
+            ]
+            # remove part of devices from config
+            if remaining_dev_list:
+                logger.info("Remove '%s' from %s", ";".join(dev_list), SYSCONFIG_SBD)
+                sbd.SBDManager.update_sbd_configuration({"SBD_DEVICE": ";".join(remaining_dev_list)})
+                logger.info(self.SYNCED_INFO)
+            # remove all devices, equivalent to stop sbd.service
+            else:
+                sbd.disable_sbd_from_cluster()
+        else:
+            sbd.disable_sbd_from_cluster()
+
+        logger.info(self.RESTART_INFO)
+        return True
+
+    def do_status(self, context) -> bool:
+        '''
+        Implement sbd status command
+        '''
+        #if not self._pre_check():
+        #    return False
+
+        # section 1: state of the sbd service on each node
+        print(f"{constants.SBD_SERVICE} status: (active|enabled|since)")
+        for node in self.cluster_nodes:
+            is_active = self.service_manager.service_is_active(constants.SBD_SERVICE, node)
+            is_active_str = "YES" if is_active else "NO"
+            is_enabled = self.service_manager.service_is_enabled(constants.SBD_SERVICE, node)
+            is_enabled_str = "YES" if is_enabled else "NO"
+            # an inactive service reports when it left the active state
+            systemd_property = "ActiveEnterTimestamp" if is_active else "ActiveExitTimestamp"
+            since_str_prefix = "active since" if is_active else "inactive since"
+            systemctl_show_cmd = f"systemctl show {constants.SBD_SERVICE} --property={systemd_property} --value"
+            since = self.cluster_shell.get_stdout_or_raise_error(systemctl_show_cmd, node) or "N/A"
+            print(f"{node}: {is_active_str:<4}|{is_enabled_str:<4}|{since_str_prefix}: {since}")
+        print()
+
+        # section 2: the watchdog device which is busy with sbd, on each node
+        print("watchdog info: (device|driver|kernel timeout)")
+        # raw string: "\[" and "\]" are regex escapes, not string escapes
+        watchdog_sbd_re = r"\[[0-9]+\] (/dev/.*)\nIdentity: Busy: .*sbd.*\nDriver: (.*)"
+        for node in self.cluster_nodes:
+            out = self.cluster_shell.get_stdout_or_raise_error("sbd query-watchdog", node)
+            res = re.search(watchdog_sbd_re, out)
+            if res:
+                device, driver = res.groups()
+                kernel_timeout = self.cluster_shell.get_stdout_or_raise_error("cat /proc/sys/kernel/watchdog_thresh", node)
+                print(f"{node}: {device}|{driver}|{kernel_timeout}")
+            else:
+                logger.error("Failed to get watchdog info from %s", node)
+        print()
+
+        # section 3: state of the configured sbd fence resource(s), if any
+        if xmlutil.CrmMonXmlParser().is_resource_configured(sbd.SBDManager.SBD_RA):
+            print("fence_sbd status: ")
+            sbd_id_list = xmlutil.CrmMonXmlParser().get_resource_id_list_via_type(sbd.SBDManager.SBD_RA)
+            for sbd_id in sbd_id_list:
+                out = self.cluster_shell.get_stdout_or_raise_error(f"crm resource status {sbd_id}")
+                print(out)
+        # explicit result, as declared by the "-> bool" annotation
+        return True
diff --git a/crmsh/utils.py b/crmsh/utils.py
index 0b19e10b..5ad096a9 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -2522,7 +2522,7 @@ def has_stonith_running():
from . import sbd
out = sh.cluster_shell().get_stdout_or_raise_error("stonith_admin -L")
has_stonith_device = re.search("[1-9]+ fence device[s]* found", out) is not None
- using_diskless_sbd = sbd.SBDManager.is_using_diskless_sbd()
+ using_diskless_sbd = sbd.SBDUtils.is_using_diskless_sbd()
return has_stonith_device or using_diskless_sbd
@@ -2782,13 +2782,15 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
return 0
-def get_property(name, property_type="crm_config", peer=None):
+def get_property(name, property_type="crm_config", peer=None, get_default=True):
"""
Get cluster properties
"property_type" can be crm_config|rsc_defaults|op_defaults
+ "get_default" is used to get the default value from cluster metadata,
+ when it is False, the property value will be got from cib
"""
- if property_type == "crm_config":
+ if property_type == "crm_config" and get_default:
cib_path = os.getenv('CIB_file', constants.CIB_RAW_FILE)
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
else:
@@ -3161,4 +3163,19 @@ def ansible_facts(module_name) -> dict:
out = out[bracket_pos:]
json_tree = json.loads(out)
return json_tree['ansible_facts']
+
+
+def cleanup_stonith_related_properties():
+    """
+    Delete the stonith related properties which have a value in the cib,
+    then set "stonith-enabled" to "false" if it is "true"
+    """
+    removable = ("stonith-watchdog-timeout", "stonith-timeout", "priority-fencing-delay")
+    for name in removable:
+        # get_default=False: only a value from the cib counts
+        if not get_property(name, get_default=False):
+            continue
+        delete_property(name)
+    if get_property("stonith-enabled") == "true":
+        set_property("stonith-enabled", "false")
+
+
+def is_subdict(sub_dict, main_dict):
+    """
+    Return True when every key/value pair of sub_dict is in main_dict, too
+    """
+    # subset test on the items views of both dictionaries
+    return sub_dict.items() <= main_dict.items()
# vim:ts=4:sw=4:et:
diff --git a/crmsh/watchdog.py b/crmsh/watchdog.py
index 6d0d2cff..00e0f60a 100644
--- a/crmsh/watchdog.py
+++ b/crmsh/watchdog.py
@@ -27,7 +27,7 @@ class Watchdog(object):
return self._watchdog_device_name
@staticmethod
- def _verify_watchdog_device(dev, ignore_error=False):
+ def verify_watchdog_device(dev, ignore_error=False):
"""
Use wdctl to verify watchdog device
"""
@@ -48,7 +48,7 @@ class Watchdog(object):
invoke("systemctl restart systemd-modules-load")
@staticmethod
- def _get_watchdog_device_from_sbd_config():
+ def get_watchdog_device_from_sbd_config():
"""
Try to get watchdog device name from sbd config file
"""
@@ -81,7 +81,7 @@ class Watchdog(object):
Get watchdog device name which has driver_name
"""
for device, driver in self._watchdog_info_dict.items():
- if driver == driver_name and self._verify_watchdog_device(device):
+ if driver == driver_name and self.verify_watchdog_device(device):
return device
return None
@@ -108,7 +108,7 @@ class Watchdog(object):
Get first unused watchdog device name
"""
for dev in self._watchdog_info_dict:
- if self._verify_watchdog_device(dev, ignore_error=True):
+ if self.verify_watchdog_device(dev, ignore_error=True):
return dev
return None
@@ -120,8 +120,8 @@ class Watchdog(object):
3. Set the self._input as softdog
"""
if not self._input:
- dev = self._get_watchdog_device_from_sbd_config()
- if dev and self._verify_watchdog_device(dev, ignore_error=True):
+ dev = self.get_watchdog_device_from_sbd_config()
+ if dev and self.verify_watchdog_device(dev, ignore_error=True):
self._input = dev
return
first_unused = self._get_first_unused_device()
@@ -131,7 +131,7 @@ class Watchdog(object):
"""
Is an unused watchdog device
"""
- if dev in self._watchdog_info_dict and self._verify_watchdog_device(dev):
+ if dev in self._watchdog_info_dict and self.verify_watchdog_device(dev):
return True
return False
@@ -142,7 +142,7 @@ class Watchdog(object):
"""
self._set_watchdog_info()
- res = self._get_watchdog_device_from_sbd_config()
+ res = self.get_watchdog_device_from_sbd_config()
if not res:
utils.fatal("Failed to get watchdog device from {}".format(SYSCONFIG_SBD))
self._input = res
@@ -177,3 +177,9 @@ class Watchdog(object):
if res:
self._watchdog_device_name = res
return
+
+ @classmethod
+    def get_watchdog_device(cls, dev_or_driver=None):
+        """
+        Run init_watchdog() on a new instance created with dev_or_driver
+        as input and return the resulting watchdog device name
+        """
+        watchdog_inst = cls(_input=dev_or_driver)
+        watchdog_inst.init_watchdog()
+        return watchdog_inst.watchdog_device_name
--
2.45.2