Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP5:GA
pacemaker.9287
bug-970733_pacemaker-fencing-random-delay.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File bug-970733_pacemaker-fencing-random-delay.patch of Package pacemaker.9287
commit e965dde0027517cf2c634dadb380daa0b5b2ffd5 Author: Gao,Yan <ygao@suse.com> Date: Wed Feb 11 15:48:44 2015 +0100 Feature: fencing: cl#5134 - Support random fencing delay to avoid double fencing This prevents double fencing when using slow fencing devices such as sbd. A "pcmk_delay_max" parameter can be configured for a fencing resource to enable random delay for stonith actions and specify the maximum of random delay. diff --git a/fencing/commands.c b/fencing/commands.c index c193a9d..742b81e 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -81,8 +81,11 @@ typedef struct async_command_s { int pid; int fd_stdout; int options; - int default_timeout; - int timeout; + int default_timeout; /* seconds */ + int timeout; /* seconds */ + + int start_delay; /* milliseconds */ + int delay_id; char *op; char *origin; @@ -128,6 +131,24 @@ is_action_required(const char *action, stonith_device_t *device) } static int +get_action_delay_max(stonith_device_t * device, const char * action) +{ + const char *value = NULL; + int delay_max_ms = 0; + + if (safe_str_neq(action, "off") && safe_str_neq(action, "reboot")) { + return 0; + } + + value = g_hash_table_lookup(device->params, STONITH_ATTR_DELAY_MAX); + if (value) { + delay_max_ms = crm_get_msec(value); + } + + return delay_max_ms; +} + +static int get_action_timeout(stonith_device_t * device, const char *action, int default_timeout) { char buffer[512] = { 0, }; @@ -155,6 +176,11 @@ free_async_command(async_command_t * cmd) if (!cmd) { return; } + + if (cmd->delay_id) { + g_source_remove(cmd->delay_id); + } + cmd_list = g_list_remove(cmd_list, cmd); g_list_free_full(cmd->device_list, free); @@ -222,8 +248,16 @@ stonith_device_execute(stonith_device_t * device) if (device->pending_ops) { GList *first = device->pending_ops; - device->pending_ops = g_list_remove_link(device->pending_ops, first); cmd = first->data; + if (cmd && cmd->delay_id) { + crm_trace + ("Operation %s%s%s on %s was asked to run too early, waiting for 
start_delay timeout of %dms", + cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "", + device->id, cmd->start_delay); + return TRUE; + } + + device->pending_ops = g_list_remove_link(device->pending_ops, first); g_list_free_1(first); } @@ -301,9 +335,27 @@ stonith_device_dispatch(gpointer user_data) return stonith_device_execute(user_data); } +static gboolean +start_delay_helper(gpointer data) +{ + async_command_t *cmd = data; + stonith_device_t *device = NULL; + + cmd->delay_id = 0; + device = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL; + + if (device) { + mainloop_set_trigger(device->work); + } + + return FALSE; +} + static void schedule_stonith_command(async_command_t * cmd, stonith_device_t * device) { + int delay_max = 0; + CRM_CHECK(cmd != NULL, return); CRM_CHECK(device != NULL, return); @@ -330,6 +382,14 @@ schedule_stonith_command(async_command_t * cmd, stonith_device_t * device) device->pending_ops = g_list_append(device->pending_ops, cmd); mainloop_set_trigger(device->work); + + delay_max = get_action_delay_max(device, cmd->action); + if (delay_max > 0) { + cmd->start_delay = rand() % delay_max; /* int milliseconds in [0, delay_max) */ + crm_notice("Delaying %s on %s for %dms (timeout=%ds)", + cmd->action, device->id, cmd->start_delay, cmd->timeout); + cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); + } } void @@ -1333,6 +1393,7 @@ stonith_query_capable_device_cb(GList * devices, void *user_data) for (lpc = devices; lpc != NULL; lpc = lpc->next) { stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); int action_specific_timeout; + int delay_max; if (!device) { /* It is possible the device got unregistered while @@ -1354,6 +1415,12 @@ stonith_query_capable_device_cb(GList * devices, void *user_data) if (action_specific_timeout) { crm_xml_add_int(dev, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); } + + delay_max = get_action_delay_max(device, query->action); + if (delay_max > 0) { + 
crm_xml_add_int(dev, F_STONITH_DELAY_MAX, delay_max / 1000); + } + if (query->target == NULL) { xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); diff --git a/fencing/main.c b/fencing/main.c index 70b5bde..5249664 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -1294,6 +1294,15 @@ main(int argc, char **argv) printf(" <content type=\"string\" default=\"dynamic-list\"/>\n"); printf(" </parameter>\n"); + printf(" <parameter name=\"%s\" unique=\"0\">\n", STONITH_ATTR_DELAY_MAX); + printf + (" <shortdesc lang=\"en\">Enable random delay for stonith actions and specify the maximum of random delay</shortdesc>\n"); + printf + (" <longdesc lang=\"en\">This prevents double fencing when using slow devices such as sbd.\n" + "Use this to enable random delay for stonith actions and specify the maximum of random delay.</longdesc>\n"); + printf(" <content type=\"time\" default=\"0s\"/>\n"); + printf(" </parameter>\n"); + for (lpc = 0; lpc < DIMOF(actions); lpc++) { printf(" <parameter name=\"pcmk_%s_action\" unique=\"0\">\n", actions[lpc]); printf diff --git a/fencing/remote.c b/fencing/remote.c index 63c0274..8331cec 100644 --- a/fencing/remote.c +++ b/fencing/remote.c @@ -53,6 +53,7 @@ typedef struct st_query_result_s { gboolean tried; GListPtr device_list; GHashTable *custom_action_timeouts; + GHashTable *delay_maxes; /* Subset of devices that peer has verified connectivity on */ GHashTable *verified_devices; @@ -84,6 +85,7 @@ free_remote_query(gpointer data) free(query->host); g_list_free_full(query->device_list, free); g_hash_table_destroy(query->custom_action_timeouts); + g_hash_table_destroy(query->delay_maxes); g_hash_table_destroy(query->verified_devices); free(query); } @@ -879,14 +881,20 @@ static int get_device_timeout(st_query_result_t * peer, const char *device, int default_timeout) { gpointer res; + int delay_max = 0; if (!peer || !device) { return default_timeout; } + res = g_hash_table_lookup(peer->delay_maxes, device); + if (res && GPOINTER_TO_INT(res) > 
0) { + delay_max = GPOINTER_TO_INT(res); + } + res = g_hash_table_lookup(peer->custom_action_timeouts, device); - return res ? GPOINTER_TO_INT(res) : default_timeout; + return res ? GPOINTER_TO_INT(res) + delay_max : default_timeout + delay_max; } static int @@ -1244,17 +1252,20 @@ process_remote_stonith_query(xmlNode * msg) result->host = strdup(host); result->devices = devices; result->custom_action_timeouts = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); + result->delay_maxes = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); result->verified_devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) { const char *device = ID(child); int action_timeout = 0; + int delay_max = 0; int verified = 0; int required = 0; if (device) { result->device_list = g_list_prepend(result->device_list, strdup(device)); crm_element_value_int(child, F_STONITH_ACTION_TIMEOUT, &action_timeout); + crm_element_value_int(child, F_STONITH_DELAY_MAX, &delay_max); crm_element_value_int(child, F_STONITH_DEVICE_VERIFIED, &verified); crm_element_value_int(child, F_STONITH_DEVICE_REQUIRED, &required); if (action_timeout) { @@ -1263,6 +1274,12 @@ process_remote_stonith_query(xmlNode * msg) g_hash_table_insert(result->custom_action_timeouts, strdup(device), GINT_TO_POINTER(action_timeout)); } + if (delay_max > 0) { + crm_trace("Peer %s with device %s returned maximum of random delay %d", + result->host, device, delay_max); + g_hash_table_insert(result->delay_maxes, + strdup(device), GINT_TO_POINTER(delay_max)); + } if (verified) { crm_trace("Peer %s has confirmed a verified device %s", result->host, device); g_hash_table_insert(result->verified_devices, diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h index 6d9eb12..7f00cfd 100644 --- a/include/crm/fencing/internal.h +++ b/include/crm/fencing/internal.h @@ -63,6 +63,8 @@ xmlNode 
*create_device_registration_xml(const char *id, const char *namespace, c # define F_STONITH_TOLERANCE "st_tolerance" /*! Action specific timeout period returned in query of fencing devices. */ # define F_STONITH_ACTION_TIMEOUT "st_action_timeout" +/*! Maximum of random fencing delay for a device */ +# define F_STONITH_DELAY_MAX "st_delay_max" /*! Has this device been verified using a monitor type * operation (monitor, list, status) */ # define F_STONITH_DEVICE_VERIFIED "st_monitor_verified" @@ -103,6 +105,7 @@ xmlNode *create_device_registration_xml(const char *id, const char *namespace, c # define STONITH_ATTR_HOSTMAP "pcmk_host_map" # define STONITH_ATTR_HOSTLIST "pcmk_host_list" # define STONITH_ATTR_HOSTCHECK "pcmk_host_check" +# define STONITH_ATTR_DELAY_MAX "pcmk_delay_max" # define STONITH_ATTR_ACTION_OP "action"
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor