Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP5:GA
pacemaker.27560
bsc#1133866-0002-Low-controller-be-more-toleran...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File bsc#1133866-0002-Low-controller-be-more-tolerant-of-malformed-executo-1.1.patch of Package pacemaker.27560
From 77dd44e214401d4dd953a8bafa2469b36d70948e Mon Sep 17 00:00:00 2001 From: Ken Gaillot <kgaillot@redhat.com> Date: Tue, 27 Nov 2018 17:02:36 -0600 Subject: [PATCH 2/2] Low: controller: be more tolerant of malformed executor events b3f9a5bb was overzealous in discarding faked executor results without any resource information. Since that commit, synthesize_lrmd_failure() would check for resource information, and send a CIB update if the synthesized operation were recordable, but would otherwise (such as for notifications) discard the result. This means the fix was complete, because non-recordable actions for a resource behind a just-died remote connection would get lost. It also exposed two pre-existing bugs regarding notifications mis-scheduled on the wrong node. Any of these would block the transition from completing. Now, process_lrm_event() can handle missing lrm_state or resource information, so it can be called by synthesize_lrmd_failure() without any checking. This leads to all the normal handling for non-recordable operations, which doesn't require resource information. We log an assertion if the resource information is not found, so that we can still get some visibility into bugs. This won't be of use in the case of mis-scheduled notifications, but it could help in other situations. --- crmd/crmd_lrm.h | 2 +- crmd/lrm.c | 148 ++++++++++++++++++++++++++++++-------------------- crmd/lrm_state.c | 2 +- crmd/remote_lrmd_ra.c | 2 +- 4 files changed, 93 insertions(+), 61 deletions(-) diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index 3e1596d8f..087081773 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -171,4 +171,4 @@ void remote_ra_process_maintenance_nodes(xmlNode *xml); gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state); void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, - struct recurring_op_s *pending); + struct recurring_op_s *pending, xmlNode *action_xml); diff --git a/crmd/lrm.c b/crmd/lrm.c index 5e5af9f68..0d64f59be 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -314,7 +314,7 @@ lrm_op_callback(lrmd_event_data_t * op) lrm_state = lrm_state_find(nodename); CRM_ASSERT(lrm_state != NULL); - process_lrm_event(lrm_state, op, NULL); + process_lrm_event(lrm_state, op, NULL, NULL); } /* A_LRM_CONNECT */ @@ -1434,7 +1434,6 @@ static void synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc) { lrmd_event_data_t *op = NULL; - lrmd_rsc_info_t *rsc_info = NULL; const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK); const char *target_node = crm_element_value(action, XML_LRM_ATTR_TARGET); xmlNode *xml_rsc = find_xml_node(action, XML_CIB_TAG_RESOURCE, TRUE); @@ -1464,35 +1463,8 @@ synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc) crm_info("Faking %s_%s_%d result (%d) on %s", op->rsc_id, op->op_type, op->interval, op->rc, target_node); - /* Process the result as if it came from the LRM, if possible - * (i.e. resource info can be obtained from the lrm_state). - */ - if (lrm_state) { - rsc_info = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0); - } - if (rsc_info) { - lrmd_free_rsc_info(rsc_info); - process_lrm_event(lrm_state, op, NULL); - - } else if (controld_action_is_recordable(op->op_type)) { - /* If we can't process the result normally, at least write it to the CIB - * if possible, so the PE can act on it. - */ - const char *standard = crm_element_value(xml_rsc, XML_AGENT_ATTR_CLASS); - const char *provider = crm_element_value(xml_rsc, XML_AGENT_ATTR_PROVIDER); - const char *type = crm_element_value(xml_rsc, XML_ATTR_TYPE); - - if (standard && type) { - rsc_info = lrmd_new_rsc_info(op->rsc_id, standard, provider, type); - do_update_resource(target_node, rsc_info, op); - lrmd_free_rsc_info(rsc_info); - } else { - // @TODO Should we direct ack? - crm_info("Can't fake %s failure (%d) on %s without resource standard and type", - crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc, - target_node); - } - } + // Process the result as if it came from the LRM + process_lrm_event(lrm_state, op, NULL, action); lrmd_free_event(op); } @@ -1555,7 +1527,7 @@ fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name, if (get_lrm_resource(lrm_state, xml_rsc, TRUE, &rsc) == pcmk_ok) { crm_info("Failing resource %s...", rsc->id); op->exit_reason = strdup("Simulated failure"); - process_lrm_event(lrm_state, op, NULL); + process_lrm_event(lrm_state, op, NULL, xml); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_OCF_OK; lrmd_free_rsc_info(rsc); @@ -2315,7 +2287,7 @@ do_lrm_rsc_op(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *operat crm_err("Operation %s on resource %s failed to execute on remote node %s: %d", operation, rsc->id, lrm_state->node_name, call_id); fake_op_status(lrm_state, op, PCMK_LRM_OP_DONE, PCMK_OCF_UNKNOWN_ERROR); - process_lrm_event(lrm_state, op, NULL); + process_lrm_event(lrm_state, op, NULL, NULL); } else { /* record all operations so we can wait @@ -2516,7 +2488,8 @@ unescape_newlines(const char *string) } void -process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending) +process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, + struct recurring_op_s *pending, xmlNode *action_xml) { char *op_id = NULL; char *op_key = NULL; @@ -2525,16 +2498,49 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr gboolean remove = FALSE; gboolean removed = FALSE; lrmd_rsc_info_t *rsc = NULL; + const char *node_name = NULL; CRM_CHECK(op != NULL, return); CRM_CHECK(op->rsc_id != NULL, return); op_id = make_stop_id(op->rsc_id, op->call_id); op_key = generate_op_key(op->rsc_id, op->op_type, op->interval); - rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0); + + // Get resource info if available (from executor state or action XML) + if (lrm_state) { + rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0); + } + if ((rsc == NULL) && action_xml) { + xmlNode *xml = find_xml_node(action_xml, XML_CIB_TAG_RESOURCE, TRUE); + + const char *standard = crm_element_value(xml, XML_AGENT_ATTR_CLASS); + const char *provider = crm_element_value(xml, XML_AGENT_ATTR_PROVIDER); + const char *type = crm_element_value(xml, XML_ATTR_TYPE); + + if (standard && type) { + crm_info("%s agent information not cached, using %s%s%s:%s from action XML", + op->rsc_id, standard, + (provider? ":" : ""), (provider? provider : ""), type); + rsc = lrmd_new_rsc_info(op->rsc_id, standard, provider, type); + } else { + crm_err("Can't process %s result because %s agent information not cached or in XML", + op_key, op->rsc_id); + } + } + CRM_LOG_ASSERT(rsc != NULL); // If it's still NULL, there's a bug somewhere + + // Get node name if available (from executor state or action XML) + if (lrm_state) { + node_name = lrm_state->node_name; + } else if (action_xml) { + node_name = crm_element_value(action_xml, XML_LRM_ATTR_TARGET); + } + if(pending == NULL) { remove = TRUE; - pending = g_hash_table_lookup(lrm_state->pending_ops, op_id); + if (lrm_state) { + pending = g_hash_table_lookup(lrm_state->pending_ops, op_id); + } } if (op->op_status == PCMK_LRM_OP_ERROR) { @@ -2554,7 +2560,14 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr if (op->op_status != PCMK_LRM_OP_CANCELLED) { if (controld_action_is_recordable(op->op_type)) { - update_id = do_update_resource(lrm_state->node_name, rsc, op); + if (node_name && rsc) { + update_id = do_update_resource(node_name, rsc, op); + } else { + // @TODO Should we direct ack? + crm_err("Unable to record %s result in CIB: %s", + op_key, + (node_name? "No resource information" : "No node name")); + } } else { send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } @@ -2575,7 +2588,9 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr } else if (pending->remove) { /* The tengine canceled this op, we have been waiting for the cancel to finish. */ - erase_lrm_history_by_op(lrm_state, op); + if (lrm_state) { + erase_lrm_history_by_op(lrm_state, op); + } } else if (op->rsc_deleted) { /* The tengine initiated this op, but it was cancelled outside of the @@ -2595,14 +2610,23 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr /* The caller will do this afterwards, but keep the logging consistent */ removed = TRUE; - } else if ((op->interval == 0) && g_hash_table_remove(lrm_state->pending_ops, op_id)) { - removed = TRUE; - crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed", - op_key, op->call_id, op_id, g_hash_table_size(lrm_state->pending_ops)); + } else if (lrm_state && ((op->interval == 0) + || (op->op_status == PCMK_LRM_OP_CANCELLED))) { - } else if(op->interval != 0 && op->op_status == PCMK_LRM_OP_CANCELLED) { - removed = TRUE; - g_hash_table_remove(lrm_state->pending_ops, op_id); + gboolean found = g_hash_table_remove(lrm_state->pending_ops, op_id); + + if (op->interval != 0) { + removed = TRUE; + } else if (found) { + removed = TRUE; + crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed", + op_key, op->call_id, op_id, + g_hash_table_size(lrm_state->pending_ops)); + } + } + + if (node_name == NULL) { + node_name = "unknown node"; // for logging } switch (op->op_status) { @@ -2610,7 +2634,7 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr crm_info("Result of %s operation for %s on %s: %s " CRM_XS " call=%d key=%s confirmed=%s", crm_action_str(op->op_type, op->interval), - op->rsc_id, lrm_state->node_name, + op->rsc_id, node_name, services_lrm_status_str(op->op_status), op->call_id, op_key, (removed? "true" : "false")); break; @@ -2620,7 +2644,7 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr "Result of %s operation for %s on %s: %d (%s) " CRM_XS " call=%d key=%s confirmed=%s cib-update=%d", crm_action_str(op->op_type, op->interval), - op->rsc_id, lrm_state->node_name, + op->rsc_id, node_name, op->rc, services_ocf_exitcode_str(op->rc), op->call_id, op_key, (removed? "true" : "false"), update_id); @@ -2630,7 +2654,7 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr crm_err("Result of %s operation for %s on %s: %s " CRM_XS " call=%d key=%s timeout=%dms", crm_action_str(op->op_type, op->interval), - op->rsc_id, lrm_state->node_name, + op->rsc_id, node_name, services_lrm_status_str(op->op_status), op->call_id, op_key, op->timeout); break; @@ -2639,14 +2663,16 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr crm_err("Result of %s operation for %s on %s: %s " CRM_XS " call=%d key=%s confirmed=%s status=%d cib-update=%d", crm_action_str(op->op_type, op->interval), - op->rsc_id, lrm_state->node_name, + op->rsc_id, node_name, services_lrm_status_str(op->op_status), op->call_id, op_key, (removed? "true" : "false"), op->op_status, update_id); } if (op->output) { char *prefix = - crm_strdup_printf("%s-%s_%s_%d:%d", lrm_state->node_name, op->rsc_id, op->op_type, op->interval, op->call_id); + crm_strdup_printf("%s-%s_%s_%d:%d", node_name, + op->rsc_id, op->op_type, op->interval, + op->call_id); if (op->rc) { crm_log_output(LOG_NOTICE, prefix, op->output); @@ -2656,25 +2682,31 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr free(prefix); } - if (safe_str_neq(op->op_type, RSC_METADATA)) { - crmd_alert_resource_op(lrm_state->node_name, op); - } else if (op->rc == PCMK_OCF_OK) { - char *metadata = unescape_newlines(op->output); + if (lrm_state) { + if (safe_str_neq(op->op_type, RSC_METADATA)) { + crmd_alert_resource_op(lrm_state->node_name, op); + } else if (rsc && (op->rc == PCMK_OCF_OK)) { + char *metadata = unescape_newlines(op->output); - metadata_cache_update(lrm_state->metadata_cache, rsc, metadata); - free(metadata); + metadata_cache_update(lrm_state->metadata_cache, rsc, metadata); + free(metadata); + } } if (op->rsc_deleted) { crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key); - delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL); + if (lrm_state) { + delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL); + } } /* If a shutdown was escalated while operations were pending, * then the FSA will be stalled right now... allow it to continue */ mainloop_set_trigger(fsa_source); - update_history_cache(lrm_state, rsc, op); + if (lrm_state && rsc) { + update_history_cache(lrm_state, rsc, op); + } lrmd_free_rsc_info(rsc); free(op_key); diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c index 40da762ee..d8a003936 100644 --- a/crmd/lrm_state.c +++ b/crmd/lrm_state.c @@ -96,7 +96,7 @@ fail_pending_op(gpointer key, gpointer value, gpointer user_data) event.remote_nodename = lrm_state->node_name; event.params = op->params; - process_lrm_event(lrm_state, &event, op); + process_lrm_event(lrm_state, &event, op, NULL); return TRUE; } diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index 6fa05f67a..2d04588a1 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -519,7 +519,7 @@ synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char * op.t_run = time(NULL); op.t_rcchange = op.t_run; op.call_id = generate_callid(); - process_lrm_event(lrm_state, &op, NULL); + process_lrm_event(lrm_state, &op, NULL, NULL); } void -- 2.16.4
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor