File pacemaker-crmd-always-write-faked-failures-to-CIB-whenever-possible.patch of Package pacemaker.14737
commit b3f9a5bbb8e0f910315b152bd35d35e072e53457
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Thu Mar 1 15:31:37 2018 -0600
Fix: crmd: always write faked failures to CIB whenever possible
Previously, when the crmd had to fake an LRM result, it would call
process_lrm_event() as long as an lrm_state was available. However, if the
lrm_state was disconnected and did not have the resource info cached (e.g. when
attempting to recover a resource on a remote node whose connection has just
died), then the eventual call to do_update_resource() would be unable to write
the result to the CIB, and the PE would never learn about it.
Now, when faking a result, we check that the resource info is available from
the lrm_state before attempting to process the event normally. If it is not, we
call do_update_resource() directly with resource info built from the action's
resource XML (provided it supplies at least the resource standard and type),
the same as is done when no lrm_state is available.
Index: pacemaker/crmd/lrm.c
===================================================================
--- pacemaker.orig/crmd/lrm.c
+++ pacemaker/crmd/lrm.c
@@ -1446,20 +1446,22 @@ static void
synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action, int rc)
{
lrmd_event_data_t *op = NULL;
+ lrmd_rsc_info_t *rsc_info = NULL;
const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK);
const char *target_node = crm_element_value(action, XML_LRM_ATTR_TARGET);
xmlNode *xml_rsc = find_xml_node(action, XML_CIB_TAG_RESOURCE, TRUE);
- if(xml_rsc == NULL) {
- /* Do something else? driect_ack? */
- crm_info("Skipping %s=%d on %s (%p): no resource",
- crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc, target_node, lrm_state);
+ if ((xml_rsc == NULL) || (ID(xml_rsc) == NULL)) {
+ /* @TODO Should we do something else, like direct ack? */
+ crm_info("Can't fake %s failure (%d) on %s without resource configuration",
+ crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
+ target_node);
return;
} else if(operation == NULL) {
/* This probably came from crm_resource -C, nothing to do */
- crm_info("Skipping %s=%d on %s (%p): no operation",
- crm_element_value(action, XML_ATTR_TRANSITION_KEY), rc, target_node, lrm_state);
+ crm_info("Can't fake %s failure (%d) on %s without operation",
+ ID(xml_rsc), rc, target_node);
return;
}
@@ -1471,25 +1473,36 @@ synthesize_lrmd_failure(lrm_state_t *lrm
fake_op_status(lrm_state, op, PCMK_LRM_OP_ERROR, rc);
}
- crm_info("Faking result %d for %s_%s_%d on %s (%p)", op->rc, op->rsc_id, op->op_type, op->interval, target_node, lrm_state);
+ crm_info("Faking %s_%s_%d result (%d) on %s",
+ op->rsc_id, op->op_type, op->interval, op->rc, target_node);
- if(lrm_state) {
+ /* Process the result as if it came from the LRM, if possible
+ * (i.e. resource info can be obtained from the lrm_state).
+ */
+ if (lrm_state) {
+ rsc_info = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0);
+ }
+ if (rsc_info) {
process_lrm_event(lrm_state, op, NULL);
} else {
- lrmd_rsc_info_t rsc;
-
- rsc.id = strdup(op->rsc_id);
- rsc.type = crm_element_value_copy(xml_rsc, XML_ATTR_TYPE);
- rsc.class = crm_element_value_copy(xml_rsc, XML_AGENT_ATTR_CLASS);
- rsc.provider = crm_element_value_copy(xml_rsc, XML_AGENT_ATTR_PROVIDER);
-
- do_update_resource(target_node, &rsc, op);
-
- free(rsc.id);
- free(rsc.type);
- free(rsc.class);
- free(rsc.provider);
+ /* If we can't process the result normally, at least write it to the CIB
+ * if possible, so the PE can act on it.
+ */
+ char *standard = crm_element_value_copy(xml_rsc, XML_AGENT_ATTR_CLASS);
+ char *provider = crm_element_value_copy(xml_rsc, XML_AGENT_ATTR_PROVIDER);
+ char *type = crm_element_value_copy(xml_rsc, XML_ATTR_TYPE);
+
+ if (standard && type) {
+ rsc_info = lrmd_new_rsc_info(op->rsc_id, standard, provider, type);
+ do_update_resource(target_node, rsc_info, op);
+ lrmd_free_rsc_info(rsc_info);
+ } else {
+ // @TODO Should we direct ack?
+ crm_info("Can't fake %s failure (%d) on %s without resource standard and type",
+ crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
+ target_node);
+ }
}
lrmd_free_event(op);
}
@@ -1774,10 +1787,8 @@ do_lrm_invoke(long long action,
lrm_state = lrm_state_find(target_node);
if ((lrm_state == NULL) && is_remote_node) {
- crm_err("Failing action because remote node %s has no connection to cluster node %s",
- target_node, fsa_our_uname);
-
- /* The action must be recorded here and in the CIB as failed */
+ crm_err("Failing action because local node has never had connection to remote node %s",
+ target_node);
synthesize_lrmd_failure(NULL, input->xml, PCMK_OCF_CONNECTION_DIED);
return;
}