File 3521-Use-maps-from_keys-2-and-heuristics-in-new-sets.patch of Package erlang
From 1a9df67497d2a80d463b85a6dd11ba444607982e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?= <jose.valim@dashbit.co>
Date: Fri, 18 Dec 2020 14:53:36 +0100
Subject: [PATCH] Use maps:from_keys/2 and heuristics in new sets
maps:from_keys/2 speeds up sets creation by avoiding
a list traversal and reducing memory allocation.
Both intersection/2 and subtract/2 operations were
also augmented with a new heuristic that chooses
between creating a set from scratch or deleting some
of the existing keys, in order to reduce memory
allocation and speed up operations.
As of this patch, I would say the guidelines for
choosing between ordsets, gb_sets and sets (v2) are
the following:
1. If you only want to traverse, perform unions,
intersections, or subtractions of values that
are quick to compare (integers, atoms, etc.)
and the sets hold up to 1k-10k elements, pick ordsets;
2. If you need take_smallest and take_largest and
you don't want to do external bookkeeping, use
gb_sets;
3. For everything else, choose sets (v2).
Benchmarks for different operations and implementations
can be found at https://github.com/josevalim/sets_bench.
---
lib/stdlib/src/sets.erl | 71 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 67 insertions(+), 4 deletions(-)
diff --git a/lib/stdlib/src/sets.erl b/lib/stdlib/src/sets.erl
index 344976cd4f..086de0f202 100644
--- a/lib/stdlib/src/sets.erl
+++ b/lib/stdlib/src/sets.erl
@@ -115,7 +115,7 @@ from_list(Ls) ->
List :: [Element],
Set :: set(Element).
from_list(Ls, [{version, 2}]) ->
- maps:from_list([{K,?VALUE}||K<-Ls]);
+ maps:from_keys(Ls, ?VALUE);
from_list(Ls, Opts) ->
case proplists:get_value(version, Opts, 1) of
1 -> from_list(Ls);
@@ -258,7 +258,14 @@ union1(S1, []) -> S1.
Set2 :: set(Element),
Set3 :: set(Element).
intersection(#{}=S1, #{}=S2) ->
- maps:intersect(S1, S2);
+ case map_size(S1) < map_size(S2) of
+ true ->
+ Next = maps:next(maps:iterator(S1)),
+ intersection_heuristic(Next, [], [], floor(map_size(S1) * 0.75), S1, S2);
+ false ->
+ Next = maps:next(maps:iterator(S2)),
+ intersection_heuristic(Next, [], [], floor(map_size(S2) * 0.75), S2, S1)
+ end;
intersection(S1, S2) ->
case size(S1) < size(S2) of
true ->
@@ -267,6 +274,33 @@ intersection(S1, S2) ->
filter(fun (E) -> is_element(E, S1) end, S2)
end.
+%% Walks the iterated set (Acc), splitting its keys into Keep and Delete lists.
+%% Once the keep budget (floor of 75% of its size) hits 0, deleting is deemed cheaper.
+intersection_heuristic(Next, _Keep, Delete, 0, Acc, Reference) ->
+ intersection_decided(Next, remove_keys(Delete, Acc), Reference); % switch to deleting
+intersection_heuristic({Key, _Value, Iterator}, Keep, Delete, KeepCount, Acc, Reference) ->
+ Next = maps:next(Iterator),
+ case Reference of
+ #{Key := _} -> % Key is in both sets: keep it and use up one unit of budget
+ intersection_heuristic(Next, [Key | Keep], Delete, KeepCount - 1, Acc, Reference);
+ _ -> % Key is missing from Reference: remember it for deletion
+ intersection_heuristic(Next, Keep, [Key | Delete], KeepCount, Acc, Reference)
+ end;
+intersection_heuristic(none, Keep, _Delete, _Count, _Acc, _Reference) ->
+ maps:from_keys(Keep, ?VALUE). % budget never ran out: build a new set from Keep
+
+intersection_decided({Key, _Value, Iterator}, Acc0, Reference) -> % deletion phase: prune Acc0 key by key
+ Acc1 = case Reference of
+ #{Key := _} -> Acc0; % present in both sets: leave it in place
+ #{} -> maps:remove(Key, Acc0) % absent from Reference: drop it
+ end,
+ intersection_decided(maps:next(Iterator), Acc1, Reference);
+intersection_decided(none, Acc, _Reference) ->
+ Acc. % iterator exhausted: return the pruned map
+
+remove_keys([K | Ks], Map) -> remove_keys(Ks, maps:remove(K, Map)); % drop each listed key from Map
+remove_keys([], Map) -> Map. % no keys left: return the reduced map
+
%% intersection([Set]) -> Set.
%% Return the intersection of the list of sets.
-spec intersection(SetList) -> Set when
@@ -323,9 +357,38 @@ is_disjoint_1(Set, Iter) ->
Set1 :: set(Element),
Set2 :: set(Element),
Set3 :: set(Element).
+
+subtract(#{}=S1, #{}=S2) -> % both sets are maps: use the keep/delete heuristic
+ Next = maps:next(maps:iterator(S1)),
+ subtract_heuristic(Next, [], [], floor(map_size(S1) * 0.75), S1, S2); % keep budget: 75% of S1
subtract(S1, S2) ->
filter(fun (E) -> not is_element(E, S2) end, S1).
+%% Walks the first set (Acc), splitting its keys into Keep and Delete lists.
+%% Once the keep budget (floor of 75% of its size) hits 0, deleting is deemed cheaper.
+subtract_heuristic(Next, _Keep, Delete, 0, Acc, Reference) ->
+ subtract_decided(Next, remove_keys(Delete, Acc), Reference); % switch to deleting
+subtract_heuristic({Key, _Value, Iterator}, Keep, Delete, KeepCount, Acc, Reference) ->
+ Next = maps:next(Iterator),
+ case Reference of
+ #{Key := _} -> % Key is also in Reference: remember it for deletion
+ subtract_heuristic(Next, Keep, [Key | Delete], KeepCount, Acc, Reference);
+ _ -> % Key is only in the first set: keep it and use up one unit of budget
+ subtract_heuristic(Next, [Key | Keep], Delete, KeepCount - 1, Acc, Reference)
+ end;
+subtract_heuristic(none, Keep, _Delete, _Count, _Acc, _Reference) ->
+ maps:from_keys(Keep, ?VALUE). % budget never ran out: build a new set from Keep
+
+subtract_decided({Key, _Value, Iterator}, Acc, Reference) -> % deletion phase: prune Acc key by key
+ case Reference of
+ #{Key := _} -> % Key is in Reference: remove it from the result
+ subtract_decided(maps:next(Iterator), maps:remove(Key, Acc), Reference);
+ _ -> % Key is not in Reference: it stays
+ subtract_decided(maps:next(Iterator), Acc, Reference)
+ end;
+subtract_decided(none, Acc, _Reference) ->
+ Acc. % iterator exhausted: return the pruned map
+
%% is_subset(Set1, Set2) -> boolean().
%% Return 'true' when every element of Set1 is also a member of
%% Set2, else 'false'.
@@ -380,7 +443,7 @@ fold_1(Fun, Acc, Iter) ->
Pred :: fun((Element) -> boolean()),
Set1 :: set(Element),
Set2 :: set(Element).
-filter(F, #{}=D) -> maps:from_list(filter_1(F, maps:iterator(D)));
+filter(F, #{}=D) -> maps:from_keys(filter_1(F, maps:iterator(D)), ?VALUE);
filter(F, #set{}=D) -> filter_set(F, D).
filter_1(Fun, Iter) ->
@@ -388,7 +451,7 @@ filter_1(Fun, Iter) ->
{K, _, NextIter} ->
case Fun(K) of
true ->
- [{K,?VALUE} | filter_1(Fun, NextIter)];
+ [K | filter_1(Fun, NextIter)];
false ->
filter_1(Fun, NextIter)
end;
--
2.26.2