File 3841-Initial-implementation-of-EEP-50.patch of Package erlang
From e66941e8d7c47b973dff94c0308ea85a6be1958e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?= <jose.valim@dashbit.co>
Date: Thu, 12 Nov 2020 13:27:00 +0100
Subject: [PATCH] Initial implementation of EEP 50
This pull request adds support for maps in the
sets module. According to my [benchmarks][bench],
using maps is faster in the vast majority of cases,
sometimes by multiple orders of magnitude, and in
the few cases where it is slower, it is by less than 10%.
[bench]: https://github.com/josevalim/sets_bench
---
lib/stdlib/doc/src/sets.xml | 30 +++++
lib/stdlib/src/sets.erl | 187 ++++++++++++++++++++++++------
lib/stdlib/test/sets_SUITE.erl | 56 +++++++--
lib/stdlib/test/sets_test_lib.erl | 16 ++-
4 files changed, 241 insertions(+), 48 deletions(-)
diff --git a/lib/stdlib/doc/src/sets.xml b/lib/stdlib/doc/src/sets.xml
index 291425c35b..8f19175d98 100644
--- a/lib/stdlib/doc/src/sets.xml
+++ b/lib/stdlib/doc/src/sets.xml
@@ -44,6 +44,20 @@
that while this module considers two elements as different if they
do not match (<c>=:=</c>), <c>ordsets</c> considers two elements as
different if and only if they do not compare equal (<c>==</c>).</p>
+
+ <p>Erlang/OTP 24.0 introduced a new internal representation for sets
+ which is more performant. Developers can use this new representation
+ by passing the <c>{version, 2}</c> flag to <seealso marker="#new/1"><c>new/1</c></seealso>
+ and <seealso marker="#from_list/2"><c>from_list/2</c></seealso>, such
+ as <c>sets:new([{version, 2}])</c>. This new representation will
+ become the default in future Erlang/OTP versions. Functions that
+ take two sets, such as <seealso marker="#union/2"><c>union/2</c></seealso>,
+ will work with sets of different versions. In such cases,
+ there is no guarantee about the version of the returned set. Explicit
+ conversion from the old version to the new one can be done with
+ <c>sets:from_list(sets:to_list(Old), [{version,2}])</c>.
+ </p>
+
</description>
<datatypes>
@@ -103,6 +117,14 @@
</desc>
</func>
+ <func>
+ <name name="from_list" arity="2" since="OTP 24.0"/>
+ <fsummary>Convert a list into a <c>Set</c> at the given version.</fsummary>
+ <desc>
+ <p>Returns a set of the elements in <c><anno>List</anno></c> at the given version.</p>
+ </desc>
+ </func>
+
<func>
<name name="intersection" arity="1" since=""/>
<fsummary>Return the intersection of a list of <c>Sets</c>.</fsummary>
@@ -174,6 +196,14 @@
</desc>
</func>
+ <func>
+ <name name="new" arity="1" since="OTP 24.0"/>
+ <fsummary>Return an empty set at the given version.</fsummary>
+ <desc>
+ <p>Returns a new empty set at the given version.</p>
+ </desc>
+ </func>
+
<func>
<name name="size" arity="1" since=""/>
<fsummary>Return the number of elements in a set.</fsummary>
diff --git a/lib/stdlib/src/sets.erl b/lib/stdlib/src/sets.erl
index 7ee3454efa..344976cd4f 100644
--- a/lib/stdlib/src/sets.erl
+++ b/lib/stdlib/src/sets.erl
@@ -18,10 +18,13 @@
%% %CopyrightEnd%
%%
-%% We use the dynamic hashing techniques by Per-Åke Larsson as
-%% described in "The Design and Implementation of Dynamic Hashing for
-%% Sets and Tables in Icon" by Griswold and Townsend. Much of the
-%% terminology comes from that paper as well.
+%% Version 2 sets are represented as maps under the hood: whenever a
+%% function is given a map, it uses the map-based implementation.
+
+%% The previous version (version 1) uses the dynamic hashing techniques
+%% by Per-Åke Larsson as described in "The Design and Implementation
+%% of Dynamic Hashing for Sets and Tables in Icon" by Griswold and
+%% Townsend. Much of the terminology comes from that paper as well.
%% The segments are all of the same fixed size and we just keep
%% increasing the size of the top tuple as the table grows. At the
@@ -44,9 +47,15 @@
-export([is_disjoint/2]).
-export([subtract/2,is_subset/2]).
-export([fold/3,filter/2]).
+-export([new/1, from_list/2]).
-export_type([set/0, set/1]).
+%% This is the value used when sets are represented as maps.
+%% We use an empty list instead of an atom as it is cheaper
+%% to serialize.
+-define(VALUE, []).
+
%% Note: mk_seg/1 must be changed too if seg_size is changed.
-define(seg_size, 16).
-define(max_seg, 32).
@@ -54,6 +63,7 @@
-define(contract_load, 3).
-define(exp_size, ?seg_size * ?expand_load).
-define(con_size, ?seg_size * ?contract_load).
+-compile({no_auto_import,[size/1]}).
%%------------------------------------------------------------------------------
@@ -74,7 +84,7 @@
-type set() :: set(_).
--opaque set(Element) :: #set{segs :: segs(Element)}.
+-opaque set(Element) :: #set{segs :: segs(Element)} | #{Element => ?VALUE}.
%%------------------------------------------------------------------------------
@@ -84,10 +94,41 @@ new() ->
Empty = mk_seg(?seg_size),
#set{empty = Empty, segs = {Empty}}.
+-spec new([{version, 1..2}]) -> set().
+new([{version, 2}]) ->
+ #{};
+new(Opts) ->
+ case proplists:get_value(version, Opts, 1) of
+ 1 -> new();
+ 2 -> new([{version, 2}])
+ end.
+
+%% from_list([Elem]) -> Set.
+%% Build a set from the elements in List.
+-spec from_list(List) -> Set when
+ List :: [Element],
+ Set :: set(Element).
+from_list(Ls) ->
+ lists:foldl(fun (E, S) -> add_element(E, S) end, new(), Ls).
+
+-spec from_list(List, [{version, 1..2}]) -> Set when
+ List :: [Element],
+ Set :: set(Element).
+from_list(Ls, [{version, 2}]) ->
+ maps:from_list([{K,?VALUE}||K<-Ls]);
+from_list(Ls, Opts) ->
+ case proplists:get_value(version, Opts, 1) of
+ 1 -> from_list(Ls);
+ 2 -> from_list(Ls, [{version, 2}])
+ end.
+
+%%------------------------------------------------------------------------------
+
%% is_set(Set) -> boolean().
%% Return 'true' if Set is a set of elements, else 'false'.
-spec is_set(Set) -> boolean() when
Set :: term().
+is_set(#{}) -> true;
is_set(#set{}) -> true;
is_set(_) -> false.
@@ -95,35 +136,36 @@ is_set(_) -> false.
%% Return the number of elements in Set.
-spec size(Set) -> non_neg_integer() when
Set :: set().
-size(S) -> S#set.size.
+size(#{}=S) -> map_size(S);
+size(#set{size=Size}) -> Size.
%% is_empty(Set) -> boolean().
%% Return 'true' if Set is an empty set, otherwise 'false'.
-spec is_empty(Set) -> boolean() when
Set :: set().
-is_empty(S) -> S#set.size=:=0.
+is_empty(#{}=S) -> map_size(S)=:=0;
+is_empty(#set{size=Size}) -> Size=:=0.
%% to_list(Set) -> [Elem].
%% Return the elements in Set as a list.
-spec to_list(Set) -> List when
Set :: set(Element),
List :: [Element].
-to_list(S) ->
+to_list(#{}=S) ->
+ maps:keys(S);
+to_list(#set{} = S) ->
fold(fun (Elem, List) -> [Elem|List] end, [], S).
-%% from_list([Elem]) -> Set.
-%% Build a set from the elements in List.
--spec from_list(List) -> Set when
- List :: [Element],
- Set :: set(Element).
-from_list(L) ->
- lists:foldl(fun (E, S) -> add_element(E, S) end, new(), L).
-
%% is_element(Element, Set) -> boolean().
%% Return 'true' if Element is an element of Set, else 'false'.
-spec is_element(Element, Set) -> boolean() when
Set :: set(Element).
-is_element(E, S) ->
+is_element(E, #{}=S) ->
+ case S of
+ #{E := _} -> true;
+ _ -> false
+ end;
+is_element(E, #set{}=S) ->
Slot = get_slot(S, E),
Bkt = get_bucket(S, Slot),
lists:member(E, Bkt).
@@ -133,7 +175,9 @@ is_element(E, S) ->
-spec add_element(Element, Set1) -> Set2 when
Set1 :: set(Element),
Set2 :: set(Element).
-add_element(E, S0) ->
+add_element(E, #{}=S) ->
+ S#{E=>?VALUE};
+add_element(E, #set{}=S0) ->
Slot = get_slot(S0, E),
Bkt = get_bucket(S0, Slot),
case lists:member(E, Bkt) of
@@ -149,7 +193,9 @@ add_element(E, S0) ->
-spec del_element(Element, Set1) -> Set2 when
Set1 :: set(Element),
Set2 :: set(Element).
-del_element(E, S0) ->
+del_element(E, #{}=S) ->
+ maps:remove(E, S);
+del_element(E, #set{}=S0) ->
Slot = get_slot(S0, E),
Bkt = get_bucket(S0, Slot),
case lists:member(E, Bkt) of
@@ -180,10 +226,15 @@ update_bucket(Set, Slot, NewBucket) ->
Set1 :: set(Element),
Set2 :: set(Element),
Set3 :: set(Element).
-union(S1, S2) when S1#set.size < S2#set.size ->
- fold(fun (E, S) -> add_element(E, S) end, S2, S1);
+union(#{}=S1, #{}=S2) ->
+ maps:merge(S1,S2);
union(S1, S2) ->
- fold(fun (E, S) -> add_element(E, S) end, S1, S2).
+ case size(S1) < size(S2) of
+ true ->
+ fold(fun (E, S) -> add_element(E, S) end, S2, S1);
+ false ->
+ fold(fun (E, S) -> add_element(E, S) end, S1, S2)
+ end.
%% union([Set]) -> Set
%% Return the union of the list of sets.
@@ -206,10 +257,15 @@ union1(S1, []) -> S1.
Set1 :: set(Element),
Set2 :: set(Element),
Set3 :: set(Element).
-intersection(S1, S2) when S1#set.size < S2#set.size ->
- filter(fun (E) -> is_element(E, S2) end, S1);
+intersection(#{}=S1, #{}=S2) ->
+ maps:intersect(S1, S2);
intersection(S1, S2) ->
- filter(fun (E) -> is_element(E, S1) end, S2).
+ case size(S1) < size(S2) of
+ true ->
+ filter(fun (E) -> is_element(E, S2) end, S1);
+ false ->
+ filter(fun (E) -> is_element(E, S1) end, S2)
+ end.
%% intersection([Set]) -> Set.
%% Return the intersection of the list of sets.
@@ -230,14 +286,35 @@ intersection1(S1, []) -> S1.
-spec is_disjoint(Set1, Set2) -> boolean() when
Set1 :: set(Element),
Set2 :: set(Element).
-is_disjoint(S1, S2) when S1#set.size < S2#set.size ->
- fold(fun (_, false) -> false;
- (E, true) -> not is_element(E, S2)
- end, true, S1);
+is_disjoint(#{}=S1, #{}=S2) ->
+ if
+ map_size(S1) < map_size(S2) ->
+ is_disjoint_1(S2, maps:iterator(S1));
+ true ->
+ is_disjoint_1(S1, maps:iterator(S2))
+ end;
is_disjoint(S1, S2) ->
- fold(fun (_, false) -> false;
- (E, true) -> not is_element(E, S1)
- end, true, S2).
+ case size(S1) < size(S2) of
+ true ->
+ fold(fun (_, false) -> false;
+ (E, true) -> not is_element(E, S2)
+ end, true, S1);
+ false ->
+ fold(fun (_, false) -> false;
+ (E, true) -> not is_element(E, S1)
+ end, true, S2)
+ end.
+
+is_disjoint_1(Set, Iter) ->
+ case maps:next(Iter) of
+ {K, _, NextIter} ->
+ case Set of
+ #{K := _} -> false;
+ #{} -> is_disjoint_1(Set, NextIter)
+ end;
+ none ->
+ true
+ end.
%% subtract(Set1, Set2) -> Set.
%% Return all and only the elements of Set1 which are not also in
@@ -255,9 +332,28 @@ subtract(S1, S2) ->
-spec is_subset(Set1, Set2) -> boolean() when
Set1 :: set(Element),
Set2 :: set(Element).
+
+is_subset(#{}=S1, #{}=S2) ->
+ if
+ map_size(S1) > map_size(S2) ->
+ false;
+ true ->
+ is_subset_1(S2, maps:iterator(S1))
+ end;
is_subset(S1, S2) ->
fold(fun (E, Sub) -> Sub andalso is_element(E, S2) end, true, S1).
+is_subset_1(Set, Iter) ->
+ case maps:next(Iter) of
+ {K, _, NextIter} ->
+ case Set of
+ #{K := _} -> is_subset_1(Set, NextIter);
+ #{} -> false
+ end;
+ none ->
+ true
+ end.
+
%% fold(Fun, Accumulator, Set) -> Accumulator.
%% Fold function Fun over all elements in Set and return Accumulator.
-spec fold(Function, Acc0, Set) -> Acc1 when
@@ -267,7 +363,16 @@ is_subset(S1, S2) ->
Acc1 :: Acc,
AccIn :: Acc,
AccOut :: Acc.
-fold(F, Acc, D) -> fold_set(F, Acc, D).
+fold(F, Acc, #{}=D) -> fold_1(F, Acc, maps:iterator(D));
+fold(F, Acc, #set{}=D) -> fold_set(F, Acc, D).
+
+fold_1(Fun, Acc, Iter) ->
+ case maps:next(Iter) of
+ {K, _, NextIter} ->
+ fold_1(Fun, Fun(K,Acc), NextIter);
+ none ->
+ Acc
+ end.
%% filter(Fun, Set) -> Set.
%% Filter Set with Fun.
@@ -275,7 +380,21 @@ fold(F, Acc, D) -> fold_set(F, Acc, D).
Pred :: fun((Element) -> boolean()),
Set1 :: set(Element),
Set2 :: set(Element).
-filter(F, D) -> filter_set(F, D).
+filter(F, #{}=D) -> maps:from_list(filter_1(F, maps:iterator(D)));
+filter(F, #set{}=D) -> filter_set(F, D).
+
+filter_1(Fun, Iter) ->
+ case maps:next(Iter) of
+ {K, _, NextIter} ->
+ case Fun(K) of
+ true ->
+ [{K,?VALUE} | filter_1(Fun, NextIter)];
+ false ->
+ filter_1(Fun, NextIter)
+ end;
+ none ->
+ []
+ end.
%% get_slot(Hashdb, Key) -> Slot.
%% Get the slot. First hash on the new range, if we hit a bucket
diff --git a/lib/stdlib/test/sets_SUITE.erl b/lib/stdlib/test/sets_SUITE.erl
index 2c1b388d52..140c2e4b43 100644
--- a/lib/stdlib/test/sets_SUITE.erl
+++ b/lib/stdlib/test/sets_SUITE.erl
@@ -28,7 +28,7 @@
init_per_testcase/2,end_per_testcase/2,
create/1,add_element/1,del_element/1,
subtract/1,intersection/1,union/1,is_subset/1,
- is_set/1,is_empty/1,fold/1,filter/1,
+ is_disjoint/1,is_set/1,is_empty/1,fold/1,filter/1,
take_smallest/1,take_largest/1, iterate/1]).
-include_lib("common_test/include/ct.hrl").
@@ -48,7 +48,7 @@ suite() ->
all() ->
[create, add_element, del_element, subtract,
intersection, union, is_subset, is_set, fold, filter,
- take_smallest, take_largest, iterate, is_empty].
+ take_smallest, take_largest, iterate, is_empty, is_disjoint].
groups() ->
[].
@@ -123,7 +123,7 @@ del_element(Config) when is_list(Config) ->
del_element_1(List, M) ->
S0 = M(from_list, List),
Empty = foldl(fun(El, Set) -> M(del_element, {El,Set}) end, S0, List),
- Empty = M(empty, []),
+ true = M(equal, {Empty,M(empty, [])}),
true = M(is_empty, Empty),
S1 = foldl(fun(El, Set) ->
M(add_element, {El,Set})
@@ -299,6 +299,22 @@ is_subset_1(List, M) ->
],
res_to_set(Res, M, 0, []).
+is_disjoint(Config) when is_list(Config) ->
+ test_all([{1,132},{253,270},{299,311}], fun is_disjoint_1/2).
+
+is_disjoint_1(List, M) ->
+ S = M(from_list, List),
+ Empty = M(empty, []),
+
+ true = M(is_disjoint, {Empty,Empty}),
+ true = M(is_disjoint, {Empty,S}),
+ true = M(is_disjoint, {S,Empty}),
+ false = M(is_disjoint, {S,S}),
+
+ true = M(is_disjoint, {M(singleton, make_ref()),S}),
+ true = M(is_disjoint, {S,M(singleton, make_ref())}),
+ S.
+
check_subset(X, Y, M) ->
check_one_subset(Y, X, M),
check_one_subset(X, Y, M).
@@ -481,13 +497,37 @@ iterate_set_1(M, {E, I}, R) ->
sets_mods() ->
Ordsets = sets_test_lib:new(ordsets, fun(X, Y) -> X == Y end),
- Sets = sets_test_lib:new(sets, fun(X, Y) ->
- lists:sort(sets:to_list(X)) ==
- lists:sort(sets:to_list(Y)) end),
+
+ NewSets = sets_test_lib:new(sets, fun(X, Y) -> X == Y end,
+ fun() -> sets:new([{version,2}]) end,
+ fun(X) -> sets:from_list(X, [{version,2}]) end),
+
+ MixSets = sets_test_lib:new(sets, fun(X, Y) ->
+ lists:sort(sets:to_list(X)) ==
+ lists:sort(sets:to_list(Y)) end,
+ fun mixed_new/0, fun mixed_from_list/1),
+
+ OldSets = sets_test_lib:new(sets, fun(X, Y) ->
+ lists:sort(sets:to_list(X)) ==
+ lists:sort(sets:to_list(Y)) end,
+ fun sets:new/0, fun sets:from_list/1),
+
Gb = sets_test_lib:new(gb_sets, fun(X, Y) ->
- gb_sets:to_list(X) ==
+ gb_sets:to_list(X) ==
gb_sets:to_list(Y) end),
- [Ordsets,Sets,Gb].
+ [Ordsets,OldSets,MixSets,NewSets,Gb].
+
+mixed_new() ->
+ case erlang:erase(sets_type) of
+ undefined -> erlang:put(sets_type, deprecated), sets:new([{version,2}]);
+ deprecated -> sets:new()
+ end.
+
+mixed_from_list(L) ->
+ case erlang:erase(sets_type) of
+ undefined -> erlang:put(sets_type, deprecated), sets:from_list(L, [{version,2}]);
+ deprecated -> sets:from_list(L)
+ end.
test_all(Tester) ->
Res = [begin
diff --git a/lib/stdlib/test/sets_test_lib.erl b/lib/stdlib/test/sets_test_lib.erl
index e4d476ba54..076e310206 100644
--- a/lib/stdlib/test/sets_test_lib.erl
+++ b/lib/stdlib/test/sets_test_lib.erl
@@ -20,18 +20,22 @@
-module(sets_test_lib).
--export([new/2]).
+-export([new/2, new/4]).
new(Mod, Eq) ->
+ new(Mod, Eq, fun Mod:new/0, fun Mod:from_list/1).
+
+new(Mod, Eq, New, FromList) ->
fun (add_element, {El,S}) -> add_element(Mod, El, S);
(del_element, {El,S}) -> del_element(Mod, El, S);
- (empty, []) -> Mod:new();
+ (empty, []) -> New();
(equal, {S1,S2}) -> Eq(S1, S2);
(filter, {F,S}) -> filter(Mod, F, S);
(fold, {F,A,S}) -> fold(Mod, F, A, S);
- (from_list, L) -> Mod:from_list(L);
+ (from_list, L) -> FromList(L);
(intersection, {S1,S2}) -> intersection(Mod, Eq, S1, S2);
(intersection, Ss) -> intersection(Mod, Eq, Ss);
+ (is_disjoint, {S,Set}) -> Mod:is_disjoint(S, Set);
(is_empty, S) -> Mod:is_empty(S);
(is_set, S) -> Mod:is_set(S);
(is_subset, {S,Set}) -> is_subset(Mod, Eq, S, Set);
@@ -39,7 +43,7 @@ new(Mod, Eq) ->
(iterator_from, {Start, S}) -> Mod:iterator_from(Start, S);
(module, []) -> Mod;
(next, I) -> Mod:next(I);
- (singleton, E) -> singleton(Mod, E);
+ (singleton, E) -> singleton(Mod, FromList, E);
(size, S) -> Mod:size(S);
(subtract, {S1,S2}) -> subtract(Mod, S1, S2);
(to_list, S) -> Mod:to_list(S);
@@ -47,10 +51,10 @@ new(Mod, Eq) ->
(union, Ss) -> union(Mod, Eq, Ss)
end.
-singleton(Mod, E) ->
+singleton(Mod, FromList, E) ->
case erlang:function_exported(Mod, singleton, 1) of
true -> Mod:singleton(E);
- false -> Mod:from_list([E])
+ false -> FromList([E])
end.
add_element(Mod, El, S0) ->
--
2.26.2