File remove_rece_deadlock_conn_lost_in_resync.patch of Package drbd.24674

commit ccc813791bddae892c4c8708d39d21f522827923
Author: Philipp Reisner <philipp.reisner@linbit.com>
Date:   Wed Apr 4 10:14:59 2018 +0200

    drbd: Remove a possible receiver deadlock when connection is lost while resyncing
    
    We got two reports of incidents where DRBD was stuck in NetworkFailure state.
    It turns out that this happens when a resync runs, application IO is
    ongoing, and the connection is lost while the receiver thread is in the
    process of receiving a barrier:
    
    Feb 13 13:08:10 vm17 kernel: drbd t1 mqhavm16.hursley.ibm.com: PingAck did not arrive in time.
    Feb 13 13:08:10 vm17 kernel: drbd t1 mqhavm16.hursley.ibm.com: conn( Connected -> NetworkFailure ) peer( Primary -> Unknown )
    Feb 13 13:08:10 vm17 kernel: drbd t1/0 drbd1122 mqhavm16.hursley.ibm.com: pdsk( UpToDate -> DUnknown ) repl( SyncTarget -> Off )
    Feb 13 13:08:10 vm17 kernel: drbd t1 mqhavm16.hursley.ibm.com: ack_receiver terminated
    Feb 13 13:08:10 vm17 kernel: drbd t1 mqhavm16.hursley.ibm.com: Terminating ack_recv thread
    
    The ack_receiver terminated itself, since the connection is dead. Now the receiver
    should terminate as well, but it is blocked:
    
    Feb 13 13:09:39 vm17 kernel: INFO: task drbd_r_t1:4536 blocked for more than 120 seconds.
    Feb 13 13:09:39 vm17 kernel: drbd_r_t1       D ffff880217690000     0  4536      2 0x00000084
    Feb 13 13:09:39 vm17 kernel: Call Trace:
    Feb 13 13:09:39 vm17 kernel: [<ffffffff816ab6d9>] schedule+0x29/0x70
    Feb 13 13:09:39 vm17 kernel: [<ffffffffc0506d97>] __conn_wait_ee_empty+0x97/0xe0 [drbd]
    Feb 13 13:09:39 vm17 kernel: [<ffffffff810b34b0>] ? wake_up_atomic_t+0x30/0x30
    Feb 13 13:09:39 vm17 kernel: [<ffffffffc0506e0d>] conn_wait_ee_empty+0x2d/0x50 [drbd]
    Feb 13 13:09:39 vm17 kernel: [<ffffffffc050fbb3>] receive_Barrier+0x143/0x1f0 [drbd]
    
    The receiver waits for active_ee to become empty. That should be done by the submitter.
    The submitter waits for a resync extent to go out of the way:
    
    Feb 13 13:11:39 vm17 kernel: Workqueue: drbd1122_submit do_submit [drbd]
    Feb 13 13:11:39 vm17 kernel: Call Trace:
    Feb 13 13:11:39 vm17 kernel: [<ffffffffc0518905>] ? prepare_al_transaction_nonblock+0x1f5/0x210 [drbd]
    Feb 13 13:11:39 vm17 kernel: [<ffffffff816ab6d9>] schedule+0x29/0x70
    Feb 13 13:11:39 vm17 kernel: [<ffffffffc051cb63>] do_submit+0x363/0x620 [drbd]
    
    That resync extent would be removed by the ack receiver when it receives the
    P_RS_CANCEL packet from the Primary/SyncSource node. But that thread has already
    ceased its activity.
    
    Fix that by waiting for ee_empty_or_disconnect.
    
    Credits go to Lars Ellenberg. He explained the deadlock, I just wrote the
    patch.

diff --git a/drbd/drbd_receiver.c b/drbd/drbd_receiver.c
index 2762917a..4fe0f6ad 100644
--- a/drbd/drbd_receiver.c
+++ b/drbd/drbd_receiver.c
@@ -1416,6 +1416,12 @@ static void conn_wait_ee_empty(struct drbd_connection *connection, struct list_h
 	wait_event(connection->ee_wait, conn_wait_ee_cond(connection, head));
 }
 
+static void conn_wait_ee_empty_or_disconnect(struct drbd_connection *connection, struct list_head *head)
+{
+	wait_event(connection->ee_wait,
+		   conn_wait_ee_cond(connection, head) || connection->cstate[NOW] < C_CONNECTED);
+}
+
 /**
  * drbd_submit_peer_request()
  * @device:	DRBD device.
@@ -1701,7 +1707,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	case WO_DRAIN_IO:
 		if (rv == FE_STILL_LIVE) {
 			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &connection->current_epoch->flags);
-			conn_wait_ee_empty(connection, &connection->active_ee);
+			conn_wait_ee_empty_or_disconnect(connection, &connection->active_ee);
 			rv = drbd_flush_after_epoch(connection, connection->current_epoch);
 		}
 		if (rv == FE_RECYCLED)
@@ -1719,7 +1725,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	if (!epoch) {
 		drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
 		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &connection->current_epoch->flags);
-		conn_wait_ee_empty(connection, &connection->active_ee);
+		conn_wait_ee_empty_or_disconnect(connection, &connection->active_ee);
 		if (issue_flush) {
 			rv = drbd_flush_after_epoch(connection, connection->current_epoch);
 			if (rv == FE_RECYCLED)
diff --git a/drbd/drbd_state.c b/drbd/drbd_state.c
index 611bed24..e3e9e0f8 100644
--- a/drbd/drbd_state.c
+++ b/drbd/drbd_state.c
@@ -629,6 +629,7 @@ static enum drbd_state_rv ___end_state_change(struct drbd_resource *resource, st
 		connection->susp_fen[NOW] = connection->susp_fen[NEW];
 
 		wake_up(&connection->ping_wait);
+		wake_up(&connection->ee_wait);
 	}
 
 	idr_for_each_entry(&resource->devices, device, vnr) {
Places

File remove_rece_deadlock_conn_lost_in_resync.patch of Package drbd.24674

Places