File srp_daemon-fix-CQ-handling.patch of Package rdma-core.7736

commit 0aaa1e9439c6992189f520b1a7e6510f3a1b230a
Author: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
Date:   Mon Dec 11 15:37:28 2017 +0100

    srp_daemon: fix CQ handling
    
    SM traps are polled through poll_cq which waited for a CQ event
    before polling the CQ itself.
    However, it may happen that multiple completions are attached
    to a single event. As stated in the ibv_get_cq_event documentation,
    it is required to poll the CQ to get those completions after the
    call to ibv_req_notify_cq.
    
    As completions need to be handled one by one in an outer function,
    start by polling the CQ and return the completion (if any) before
    waiting for the next completion event.
    
    The buggy use case seems to appear when the master SM is switched multiple
    times between two nodes. As the number of ping-pongs between the SMs increases,
    the number of traps sent to notify that the SM just became master increases
    too. This causes bursts of completions linked to a single event.
    Note that the race condition is also possible in other scenarios.
    
    Signed-off-by: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>

diff --git srp_daemon/srp_handle_traps.c srp_daemon/srp_handle_traps.c
index 25f2b9ab4ac0..ea08b311a111 100644
--- srp_daemon/srp_handle_traps.c
+++ srp_daemon/srp_handle_traps.c
@@ -496,6 +496,27 @@ static int stop_threads(struct sync_resources *sync_res)
 	return result;
 }
 
+static int poll_cq_once(struct sync_resources *sync_res, struct ibv_cq *cq,
+			struct ibv_wc *wc)
+{	/* Poll cq exactly once; this helper never waits on a completion channel. */
+	int ret;	/* <0: error, 0: nothing polled, >0: completion stored in *wc */
+	ret = ibv_poll_cq(cq, 1, wc);	/* request at most one work completion */
+	if (ret < 0) {
+		pr_err("poll CQ failed\n");
+		return ret;	/* hand the negative ibv_poll_cq() result to the caller */
+	}
+
+	if (ret > 0 && wc->status != IBV_WC_SUCCESS) {	/* polled a failed completion */
+		if (!stop_threads(sync_res))	/* log only if stop_threads() returned 0 */
+			pr_err("got bad completion with status: 0x%x\n",
+			       wc->status);
+		return -ret;	/* ret > 0 here, so the failure is reported as negative */
+	}
+
+	return ret;	/* 0 when nothing was polled, else the positive poll count */
+}
+
+
 static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
 		   struct ibv_wc *wc, struct ibv_comp_channel *channel)
 {
@@ -504,6 +525,12 @@ static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
 	void          *ev_ctx;
 
 	if (channel) {
+		/* Poll CQ once. There may be extra completions that
+		 * were associated with the previous event */
+		ret = poll_cq_once(sync_res, cq, wc);
+		if (ret)
+			return ret;
+
 		if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
 			pr_err("Failed to get cq_event\n");
 			return -1;
@@ -524,18 +551,7 @@ static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
 	}
 
 	do {
-		ret = ibv_poll_cq(cq, 1, wc);
-		if (ret < 0) {
-			pr_err("poll CQ failed\n");
-			return ret;
-		}
-
-		if (ret > 0 && wc->status != IBV_WC_SUCCESS) {
-			if (!stop_threads(sync_res))
-				pr_err("got bad completion with status: 0x%x\n",
-				       wc->status);
-			return -ret;
-		}
+		ret = poll_cq_once(sync_res, cq, wc);
 
 		if (ret == 0 && channel) {
 			pr_err("Weird poll returned no cqe after CQ event\n");
openSUSE Build Service is sponsored by