File 3521-Use-maps-from_keys-2-and-heuristics-in-new-sets.patch of Package erlang

From 1a9df67497d2a80d463b85a6dd11ba444607982e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?= <jose.valim@dashbit.co>
Date: Fri, 18 Dec 2020 14:53:36 +0100
Subject: [PATCH] Use maps:from_keys/2 and heuristics in new sets

maps:from_keys/2 speeds up sets creation by avoiding
a list traversal and reducing memory allocation.

Both intersection/2 and subtract/2 operations were
also augmented with a new heuristic that chooses
between creating a set from scratch or deleting some
of the existing keys, in order to reduce memory
allocation and speed up operations.

As of this patch, I would say the guidelines for
choosing between ordsets, gb_sets and sets (v2) is
the following:

  1. If you only want to traverse, perform unions,
    intersections, or subtractions of values that
    are quick to compare (integers, atoms, etc)
    and they are up to 1k-10k in size, pick ordsets;

  2. If you need take_smallest and take_largest and
    you don't want to do external bookkeeping, use
    gb_sets;

  3. For everything else, choose sets (v2).

Benchmarks for different operations and implementations
can be found at https://github.com/josevalim/sets_bench.
---
 lib/stdlib/src/sets.erl | 71 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 67 insertions(+), 4 deletions(-)

diff --git a/lib/stdlib/src/sets.erl b/lib/stdlib/src/sets.erl
index 344976cd4f..086de0f202 100644
--- a/lib/stdlib/src/sets.erl
+++ b/lib/stdlib/src/sets.erl
@@ -115,7 +115,7 @@ from_list(Ls) ->
       List :: [Element],
       Set :: set(Element).
 from_list(Ls, [{version, 2}]) ->
-    maps:from_list([{K,?VALUE}||K<-Ls]);
+    maps:from_keys(Ls, ?VALUE);
 from_list(Ls, Opts) ->
     case proplists:get_value(version, Opts, 1) of
         1 -> from_list(Ls);
@@ -258,7 +258,14 @@ union1(S1, []) -> S1.
       Set2 :: set(Element),
       Set3 :: set(Element).
 intersection(#{}=S1, #{}=S2) ->
-    maps:intersect(S1, S2);
+    case map_size(S1) < map_size(S2) of
+        true ->
+            Next = maps:next(maps:iterator(S1)),
+            intersection_heuristic(Next, [], [], floor(map_size(S1) * 0.75), S1, S2);
+        false ->
+            Next = maps:next(maps:iterator(S2)),
+            intersection_heuristic(Next, [], [], floor(map_size(S2) * 0.75), S2, S1)
+    end;
 intersection(S1, S2) ->
     case size(S1) < size(S2) of
         true ->
@@ -267,6 +274,33 @@ intersection(S1, S2) ->
 	    filter(fun (E) -> is_element(E, S1) end, S2)
     end.
 
+%% If we are keeping more than 75% of the keys, then it is
+%% cheaper to delete them. Stop accumulating and start deleting.
+intersection_heuristic(Next, _Keep, Delete, 0, Acc, Reference) ->
+  intersection_decided(Next, remove_keys(Delete, Acc), Reference);
+intersection_heuristic({Key, _Value, Iterator}, Keep, Delete, KeepCount, Acc, Reference) ->
+    Next = maps:next(Iterator),
+    case Reference of
+        #{Key := _} ->
+            intersection_heuristic(Next, [Key | Keep], Delete, KeepCount - 1, Acc, Reference);
+        _ ->
+            intersection_heuristic(Next, Keep, [Key | Delete], KeepCount, Acc, Reference)
+    end;
+intersection_heuristic(none, Keep, _Delete, _Count, _Acc, _Reference) ->
+    maps:from_keys(Keep, ?VALUE).
+
+intersection_decided({Key, _Value, Iterator}, Acc0, Reference) ->
+    Acc1 = case Reference of
+        #{Key := _} -> Acc0;
+        #{} -> maps:remove(Key, Acc0)
+    end,
+    intersection_decided(maps:next(Iterator), Acc1, Reference);
+intersection_decided(none, Acc, _Reference) ->
+    Acc.
+
+remove_keys([K | Ks], Map) -> remove_keys(Ks, maps:remove(K, Map));
+remove_keys([], Map) -> Map.
+
 %% intersection([Set]) -> Set.
 %%  Return the intersection of the list of sets.
 -spec intersection(SetList) -> Set when
@@ -323,9 +357,38 @@ is_disjoint_1(Set, Iter) ->
       Set1 :: set(Element),
       Set2 :: set(Element),
       Set3 :: set(Element).
+
+subtract(#{}=S1, #{}=S2) ->
+    Next = maps:next(maps:iterator(S1)),
+    subtract_heuristic(Next, [], [], floor(map_size(S1) * 0.75), S1, S2);
 subtract(S1, S2) ->
     filter(fun (E) -> not is_element(E, S2) end, S1).
 
+%% If we are keeping more than 75% of the keys, then it is
+%% cheaper to delete them. Stop accumulating and start deleting.
+subtract_heuristic(Next, _Keep, Delete, 0, Acc, Reference) ->
+  subtract_decided(Next, remove_keys(Delete, Acc), Reference);
+subtract_heuristic({Key, _Value, Iterator}, Keep, Delete, KeepCount, Acc, Reference) ->
+    Next = maps:next(Iterator),
+    case Reference of
+        #{Key := _} ->
+            subtract_heuristic(Next, Keep, [Key | Delete], KeepCount, Acc, Reference);
+        _ ->
+            subtract_heuristic(Next, [Key | Keep], Delete, KeepCount - 1, Acc, Reference)
+    end;
+subtract_heuristic(none, Keep, _Delete, _Count, _Acc, _Reference) ->
+    maps:from_keys(Keep, ?VALUE).
+
+subtract_decided({Key, _Value, Iterator}, Acc, Reference) ->
+    case Reference of
+        #{Key := _} ->
+            subtract_decided(maps:next(Iterator), maps:remove(Key, Acc), Reference);
+        _ ->
+            subtract_decided(maps:next(Iterator), Acc, Reference)
+    end;
+subtract_decided(none, Acc, _Reference) ->
+    Acc.
+
 %% is_subset(Set1, Set2) -> boolean().
 %%  Return 'true' when every element of Set1 is also a member of
 %%  Set2, else 'false'.
@@ -380,7 +443,7 @@ fold_1(Fun, Acc, Iter) ->
       Pred :: fun((Element) -> boolean()),
       Set1 :: set(Element),
       Set2 :: set(Element).
-filter(F, #{}=D) -> maps:from_list(filter_1(F, maps:iterator(D)));
+filter(F, #{}=D) -> maps:from_keys(filter_1(F, maps:iterator(D)), ?VALUE);
 filter(F, #set{}=D) -> filter_set(F, D).
 
 filter_1(Fun, Iter) ->
@@ -388,7 +451,7 @@ filter_1(Fun, Iter) ->
         {K, _, NextIter} ->
             case Fun(K) of
                 true ->
-                    [{K,?VALUE} | filter_1(Fun, NextIter)];
+                    [K | filter_1(Fun, NextIter)];
                 false ->
                     filter_1(Fun, NextIter)
             end;
-- 
2.26.2
Places

File 3521-Use-maps-from_keys-2-and-heuristics-in-new-sets.patch of Package erlang

Places