File pacemaker-crmd-timeout-older-peers-cancelling-resource-operation.patch of Package pacemaker.3577
commit a50be1651ef6f03ed3ef5bf78e0a68255a047b52
Author: Ken Gaillot <kgaillot@redhat.com>
Date: Fri Jun 10 12:09:39 2016 -0500
Fix: crmd: avoid timeout on older peers when cancelling a resource operation
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 8ae3e18..ef0109d 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -274,6 +274,32 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_
}
}
+/*
+ * \internal
+ * \brief Send a direct OK ack for a resource task
+ *
+ * \param[in] lrm_state LRM connection
+ * \param[in] input Input message being ack'ed
+ * \param[in] rsc_id ID of affected resource
+ * \param[in] rsc Affected resource (if available)
+ * \param[in] task Operation task being ack'ed
+ * \param[in] ack_host Name of host to send ack to
+ * \param[in] ack_sys IPC system name to ack
+ */
+static void
+send_task_ok_ack(lrm_state_t *lrm_state, ha_msg_input_t *input,
+ const char *rsc_id, lrmd_rsc_info_t *rsc, const char *task,
+ const char *ack_host, const char *ack_sys)
+{
+ lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc_id, task);
+
+ CRM_ASSERT(op != NULL);
+ op->rc = PCMK_OCF_OK;
+ op->op_status = PCMK_LRM_OP_DONE;
+ send_direct_ack(ack_host, ack_sys, rsc, op, rsc_id);
+ lrmd_free_event(op);
+}
+
void
lrm_op_callback(lrmd_event_data_t * op)
{
@@ -1625,20 +1651,12 @@ do_lrm_invoke(long long action,
synthesize_lrmd_failure(lrm_state, input->xml, PCMK_OCF_NOT_CONFIGURED);
} else if (rsc == NULL) {
- lrmd_event_data_t *op = NULL;
-
crm_notice("Not creating resource for a %s event: %s", operation, ID(input->xml));
delete_rsc_entry(lrm_state, input, ID(xml_rsc), NULL, pcmk_ok, user_name);
- op = construct_op(lrm_state, input->xml, ID(xml_rsc), operation);
-
/* Deleting something that does not exist is a success */
- op->op_status = PCMK_LRM_OP_DONE;
- op->rc = PCMK_OCF_OK;
- CRM_ASSERT(op != NULL);
-
- send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc));
- lrmd_free_event(op);
+ send_task_ok_ack(lrm_state, input, ID(xml_rsc), NULL, operation,
+ from_host, from_sys);
} else if (safe_str_eq(operation, CRMD_ACTION_CANCEL)) {
char *op_key = NULL;
@@ -1685,24 +1703,35 @@ do_lrm_invoke(long long action,
/* Acknowledge the cancellation operation if it's for a remote connection resource */
if (in_progress == FALSE || is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
- lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc->id, op_task);
char *op_id = make_stop_id(rsc->id, call);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id) == FALSE) {
crm_info("Nothing known about operation %d for %s", call, op_key);
}
delete_op_entry(lrm_state, NULL, rsc->id, op_key, call);
+ send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
+ from_host, from_sys);
- CRM_ASSERT(op != NULL);
-
- op->rc = PCMK_OCF_OK;
- op->op_status = PCMK_LRM_OP_DONE;
- send_direct_ack(from_host, from_sys, rsc, op, rsc->id);
- lrmd_free_event(op);
-
- /* needed?? yes for the cancellation operation of a remote connection resource */
+ /* needed at least for cancellation of a remote operation */
g_hash_table_remove(lrm_state->pending_ops, op_id);
free(op_id);
+
+ } else {
+ /* No ack is needed since abcdaa8, but peers with older versions
+ * in a rolling upgrade need one. We didn't bump the feature set
+ * at that commit, so we can only compare against the previous
+ * CRM version (3.0.8). If any peers have feature set 3.0.9 but
+ * not abcdaa8, they will time out waiting for the ack (no
+ * released versions of Pacemaker are affected).
+ */
+ const char *peer_version = crm_element_value(params, XML_ATTR_CRM_VERSION);
+
+ if (compare_version(peer_version, "3.0.8") <= 0) {
+ crm_info("Sending compatibility ack for %s cancellation to %s (CRM version %s)",
+ op_key, from_host, peer_version);
+ send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
+ from_host, from_sys);
+ }
}
free(op_key);