File 0241-erts-Improve-hash-shrinking.patch of Package erlang
From 4172488e16506ecbf9d66a0fd55a4a91a205e66d Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Mon, 1 Jul 2019 18:38:49 +0200
Subject: [PATCH 1/2] erts: Improve hash shrinking

* Release is_resizing as soon as possible to improve shrink concurrency.
  - Join the bucket lists after the release, while still holding
    WLOCK_HASH.
  - Deallocate seg and segtab only after releasing both is_resizing
    and WLOCK_HASH.
* Initialize buckets in extended segments lazily.
  - Mark inactive buckets with a poison value in DEBUG builds
    (see the sketches below).
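
The reworked shrink() flow is, in outline: take is_resizing and
WLOCK_HASH, unhook the "src" bucket and (when a whole segment has
drained) remove_seg() it, release is_resizing, splice the bucket
lists while still holding WLOCK_HASH, and run dealloc_seg() only
after WUNLOCK_HASH. A minimal standalone sketch of the deferred
deallocation part of that pattern (pthread-based; all names are
illustrative stand-ins, not OTP code):

    #include <pthread.h>
    #include <stdlib.h>

    #define SEGSZ 256

    struct segment { void *buckets[SEGSZ]; };

    struct table {
        pthread_rwlock_t lock;   /* stands in for WLOCK_HASH */
        struct segment *top_seg; /* topmost segment, removal candidate */
        int nslots;
    };

    /* Filled in under the lock, executed after it is released;
     * plays the role of the patch's struct dealloc_seg_ops. */
    struct dealloc_ops { struct segment *segp; };

    static void shrink_step(struct table *tb)
    {
        struct dealloc_ops ops = { NULL };

        pthread_rwlock_wrlock(&tb->lock);
        ops.segp = tb->top_seg;  /* unlink only; no free() while locked */
        tb->top_seg = NULL;
        tb->nslots -= SEGSZ;
        pthread_rwlock_unlock(&tb->lock);

        if (ops.segp)
            free(ops.segp);      /* deferred until after the unlock */
    }

    int main(void)
    {
        struct table tb;
        pthread_rwlock_init(&tb.lock, NULL);
        tb.top_seg = malloc(sizeof(struct segment));
        tb.nslots = SEGSZ;
        shrink_step(&tb);
        pthread_rwlock_destroy(&tb.lock);
        return 0;
    }

The gain is that threads contending on the hash lock are no longer
held up by the deallocation work; in the real patch the ext_segtab
free may even be deferred further via thread progress (see
dealloc_seg() below).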
---
 erts/emulator/beam/erl_db_hash.c | 200 +++++++++++++++++++++++++++------------
 1 file changed, 142 insertions(+), 58 deletions(-)
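
Note on the lazy-initialization part: alloc_seg() no longer
zero-fills new extended segments; DEBUG builds instead poison every
bucket with DBG_BUCKET_INACTIVE (0xdead5107), and grow() asserts the
poison before activating a slot. A self-contained illustration of
the idea (SEGSZ, POISON, alloc_seg() and activate_bucket() are
hypothetical stand-ins, not the OTP functions):

    #include <assert.h>
    #include <stdlib.h>

    #define SEGSZ  256
    #define POISON ((void*)0xdead5107)  /* same value the patch picks */

    struct segment { void *buckets[SEGSZ]; };

    static struct segment *alloc_seg(void)
    {
        struct segment *segp = malloc(sizeof(*segp));
        /* Debug builds stamp every bucket instead of zero-filling;
         * release builds leave them uninitialized until activation. */
    #ifndef NDEBUG
        for (int i = 0; i < SEGSZ; i++)
            segp->buckets[i] = POISON;
    #endif
        return segp;
    }

    static void activate_bucket(struct segment *segp, int ix)
    {
        assert(segp->buckets[ix] == POISON); /* never touched before */
        segp->buckets[ix] = NULL;            /* now a live, empty bucket */
    }

    int main(void)
    {
        struct segment *segp = alloc_seg();
        activate_bucket(segp, 0);
        free(segp);
        return 0;
    }

The trade-off: release builds skip a sys_memset() over the whole
segment on every alloc_seg(), while debug builds still catch any
read of a bucket that was never activated.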
diff --git a/erts/emulator/beam/erl_db_hash.c b/erts/emulator/beam/erl_db_hash.c
index ceaccf7e44..0302b9f1a1 100644
--- a/erts/emulator/beam/erl_db_hash.c
+++ b/erts/emulator/beam/erl_db_hash.c
@@ -137,6 +137,11 @@
 
 #define BUCKET(tb, i) SEGTAB(tb)[SLOT_IX_TO_SEG_IX(i)]->buckets[(i) & EXT_SEGSZ_MASK]
 
+#ifdef DEBUG
+#  define DBG_BUCKET_INACTIVE ((HashDbTerm*)0xdead5107)
+#endif
+
+
 /*
  * When deleting a table, the number of records to delete.
  * Approximate number, because we must delete entire buckets.
@@ -377,7 +382,7 @@ typedef int (*extra_match_validator_t)(int keypos, Eterm match, Eterm guard, Ete
 */
 static struct ext_segtab* alloc_ext_segtab(DbTableHash* tb, unsigned seg_ix);
 static void alloc_seg(DbTableHash *tb);
-static int free_seg(DbTableHash *tb, int free_records);
+static int free_seg(DbTableHash *tb);
 static HashDbTerm* next_live(DbTableHash *tb, Uint *iptr, erts_rwmtx_t** lck_ptr,
 			     HashDbTerm *list);
 static HashDbTerm* search_list(DbTableHash* tb, Eterm key, 
@@ -2466,7 +2471,7 @@ static SWord db_free_table_continue_hash(DbTable *tbl, SWord reds)
     erts_atomic_set_relb(&tb->fixdel, (erts_aint_t)NULL);
 
     while(tb->nslots != 0) {
-	reds -= EXT_SEGSZ/64 + free_seg(tb, 1);
+	reds -= EXT_SEGSZ/64 + free_seg(tb);
 
 	/*
 	 * If we have done enough work, get out here.
@@ -2682,7 +2687,14 @@ static void alloc_seg(DbTableHash *tb)
     segtab[seg_ix] = (struct segment*) erts_db_alloc(ERTS_ALC_T_DB_SEG,
                                                      (DbTable *) tb,
                                                      SIZEOF_SEGMENT(EXT_SEGSZ));
-    sys_memset(segtab[seg_ix], 0, SIZEOF_SEGMENT(EXT_SEGSZ));
+#ifdef DEBUG
+    {
+        int i;
+        for (i = 0; i < EXT_SEGSZ; i++) {
+            segtab[seg_ix]->buckets[i] = DBG_BUCKET_INACTIVE;
+        }
+    }
+#endif
     tb->nslots += EXT_SEGSZ;
 }
 
@@ -2693,10 +2705,19 @@ static void dealloc_ext_segtab(void* lop_data)
     erts_free(ERTS_ALC_T_DB_SEG, est);
 }
 
-/* Shrink table by freeing the top segment
+struct dealloc_seg_ops {
+    struct segment* segp;
+    Uint seg_sz;
+
+    struct ext_segtab* est;
+};
+
+/* Shrink table by removing the top segment
 ** free_records: 1=free any records in segment, 0=assume segment is empty 
+** ds_ops: (out) Instructions for dealloc_seg().
 */
-static int free_seg(DbTableHash *tb, int free_records)
+static int remove_seg(DbTableHash *tb, int free_records,
+                      struct dealloc_seg_ops *ds_ops)
 {
     const int seg_ix = SLOT_IX_TO_SEG_IX(tb->nslots) - 1;
     struct segment** const segtab = SEGTAB(tb);
@@ -2704,24 +2725,47 @@ static int free_seg(DbTableHash *tb, int free_records)
     Uint seg_sz;
     int nrecords = 0;
 
+    ERTS_LC_ASSERT(IS_TAB_WLOCKED(tb) || tb->common.status & DB_DELETE
+                   || erts_atomic_read_nob(&tb->is_resizing));
+
     ASSERT(segp != NULL);
-#ifndef DEBUG
-    if (free_records)
-#endif
-    {	
-	int i = (seg_ix == 0) ? FIRST_SEGSZ : EXT_SEGSZ;
-	while (i--) {
-	    HashDbTerm* p = segp->buckets[i];
+    if (free_records) {
+        int ix, n;
+        if (seg_ix == 0) {
+            /* First segment (always fully active) */
+            n = FIRST_SEGSZ;
+            ix = FIRST_SEGSZ-1;
+        }
+        else if (NACTIVE(tb) < tb->nslots) {
+            /* Last extended segment partially active */
+            n = (NACTIVE(tb) - FIRST_SEGSZ) & EXT_SEGSZ_MASK;
+            ix = (NACTIVE(tb)-1) & EXT_SEGSZ_MASK;
+        }
+        else {
+            /* Full extended segment */
+            n = EXT_SEGSZ;
+            ix = EXT_SEGSZ - 1;
+        }
+        for ( ; n > 0; n--, ix--) {
+	    HashDbTerm* p = segp->buckets[ix & EXT_SEGSZ_MASK];
 	    while(p != 0) {		
 		HashDbTerm* nxt = p->next;
-		ASSERT(free_records); /* segment not empty as assumed? */
 		free_term(tb, p);
 		p = nxt;
 		++nrecords;
 	    }
 	}
     }
-    
+#ifdef DEBUG
+    else {
+        int ix = (seg_ix == 0) ? FIRST_SEGSZ-1 : EXT_SEGSZ-1;
+        for ( ; ix >= 0; ix--) {
+            ASSERT(segp->buckets[ix] == DBG_BUCKET_INACTIVE);
+        }
+    }
+#endif
+
+    ds_ops->est = NULL;
     if (seg_ix >= NSEG_1) {
         struct ext_segtab* est = ErtsContainerStruct_(segtab,struct ext_segtab,segtab);
 
@@ -2730,35 +2774,63 @@ static int free_seg(DbTableHash *tb, int free_records)
             SET_SEGTAB(tb, est->prev_segtab);
             tb->nsegs = est->prev_nsegs;
 
-            if (!tb->common.is_thread_safe) {
-                /*
-                 * Table is doing a graceful shrink operation and we must avoid
-                 * deallocating this segtab while it may still be read by other
-                 * threads. Schedule deallocation with thread progress to make
-                 * sure no lingering threads are still hanging in BUCKET macro
-                 * with an old segtab pointer.
-                 */
-                erts_schedule_db_free(&tb->common, dealloc_ext_segtab,
-                                      est, &est->lop,
-                                      SIZEOF_EXT_SEGTAB(est->nsegs));
-            }
-            else
-                erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable*)tb, est,
-                             SIZEOF_EXT_SEGTAB(est->nsegs));
+            ds_ops->est = est;
         }
     }
+
     seg_sz = (seg_ix == 0) ? FIRST_SEGSZ : EXT_SEGSZ;
-    erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable *)tb, segp, SIZEOF_SEGMENT(seg_sz));
+    tb->nslots -= seg_sz;
+    ASSERT(tb->nslots >= 0);
+
+    ds_ops->segp = segp;
+    ds_ops->seg_sz = seg_sz;
     
 #ifdef DEBUG
     if (seg_ix < tb->nsegs)
         SEGTAB(tb)[seg_ix] = NULL;
 #endif
-    tb->nslots -= seg_sz;
-    ASSERT(tb->nslots >= 0);
     return nrecords;
 }
 
+/*
+ * Deallocate segment removed by remove_seg()
+ */
+static void dealloc_seg(DbTableHash *tb, struct dealloc_seg_ops* ds_ops)
+{
+    struct ext_segtab* est = ds_ops->est;
+
+    if (est) {
+        if (!tb->common.is_thread_safe) {
+            /*
+             * Table is doing a graceful shrink operation and we must avoid
+             * deallocating this segtab while it may still be read by other
+             * threads. Schedule deallocation with thread progress to make
+             * sure no lingering threads are still hanging in BUCKET macro
+             * with an old segtab pointer.
+             */
+            erts_schedule_db_free(&tb->common, dealloc_ext_segtab,
+                                  est, &est->lop,
+                                  SIZEOF_EXT_SEGTAB(est->nsegs));
+        }
+        else
+            erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable*)tb, est,
+                         SIZEOF_EXT_SEGTAB(est->nsegs));
+    }
+
+    erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable *)tb,
+                 ds_ops->segp, SIZEOF_SEGMENT(ds_ops->seg_sz));
+}
+
+/* Remove and deallocate top segment and all its contained objects */
+static int free_seg(DbTableHash *tb)
+{
+    struct dealloc_seg_ops ds_ops;
+    int reds;
+
+    reds = remove_seg(tb, 1, &ds_ops);
+    dealloc_seg(tb, &ds_ops);
+    return reds;
+}
 
 /*
 ** Copy terms from ptr1 until ptr2
@@ -2880,6 +2952,7 @@ static void grow(DbTableHash* tb, int nitems)
         pnext = &BUCKET(tb, from_ix);
         p = *pnext;
         to_pnext = &BUCKET(tb, to_ix);
+        ASSERT(*to_pnext == DBG_BUCKET_INACTIVE);
         while (p != NULL) {
             if (is_pseudo_deleted(p)) { /* rare but possible with fine locking */
                 *pnext = p->next;
@@ -2916,14 +2989,16 @@ abort:
 */
 static void shrink(DbTableHash* tb, int nitems)
 {
-    HashDbTerm** src_bp;
-    HashDbTerm** dst_bp;
+    struct dealloc_seg_ops ds_ops;
+    HashDbTerm* src;
+    HashDbTerm* tail;
     HashDbTerm** bp;
     erts_rwmtx_t* lck;
     int src_ix, dst_ix, low_szm;
     int nactive;
     int loop_limit = 5;
 
+    ds_ops.segp = NULL;
     do {
         if (!begin_resizing(tb))
             return; /* already in progress */
@@ -2945,39 +3020,48 @@ static void shrink(DbTableHash* tb, int nitems)
             goto abort;
         }
 
-        src_bp = &BUCKET(tb, src_ix);
-        dst_bp = &BUCKET(tb, dst_ix);
-        bp = src_bp;
-
-        /*
-         * We join lists by appending "dst" at the end of "src"
-         * as we must step through "src" anyway to purge pseudo deleted.
-         */
-        while(*bp != NULL) {
-            if (is_pseudo_deleted(*bp)) {
-                HashDbTerm* deleted = *bp;
-                *bp = deleted->next;
-                free_term(tb, deleted);
-            } else {
-                bp = &(*bp)->next;
-            }
-        }
-        *bp = *dst_bp;
-        *dst_bp = *src_bp;
-        *src_bp = NULL;
-
+        src = BUCKET(tb, src_ix);
+#ifdef DEBUG
+        BUCKET(tb, src_ix) = DBG_BUCKET_INACTIVE;
+#endif
         nactive = src_ix;
         erts_atomic_set_nob(&tb->nactive, nactive);
         if (dst_ix == 0) {
             erts_atomic_set_relb(&tb->szm, low_szm);
         }
-        WUNLOCK_HASH(lck);
-
         if (tb->nslots - src_ix >= EXT_SEGSZ) {
-            free_seg(tb, 0);
+            remove_seg(tb, 0, &ds_ops);
         }
         done_resizing(tb);
 
+        if (src) {
+            /*
+             * We join buckets by appending "dst" list at the end of "src" list
+             * as we must step through "src" anyway to purge pseudo deleted.
+             */
+            bp = &BUCKET(tb, dst_ix);
+            tail = *bp;
+            *bp = src;
+
+            while(*bp != NULL) {
+                if (is_pseudo_deleted(*bp)) {
+                    HashDbTerm* deleted = *bp;
+                    *bp = deleted->next;
+                    free_term(tb, deleted);
+                } else {
+                    bp = &(*bp)->next;
+                }
+            }
+            *bp = tail;
+        }
+
+        WUNLOCK_HASH(lck);
+
+        if (ds_ops.segp) {
+            dealloc_seg(tb, &ds_ops);
+            ds_ops.segp = NULL;
+        }
+
     } while (--loop_limit
              && nactive > FIRST_SEGSZ && nitems < SHRINK_LIMIT(nactive));
     return;
-- 
2.16.4