File 1410-Retry-to-early-add_table_copy.patch of Package erlang
From d17819b9d053027657497b9e4692d2bbfaeed48d Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Wed, 8 Nov 2023 16:03:17 +0100
Subject: [PATCH] Retry to early add_table_copy
add_table copy/3 could abort with system_limit as reason
if the "added" node was starting, i.e. alive but not merged
schema yet.
Now abort with node_not_running, which should restart the transaction
automaticly after a short sleep.
Also improve debug (verbose) printouts and saved coredump info.
---
lib/mnesia/src/mnesia_lib.erl | 2 +-
lib/mnesia/src/mnesia_loader.erl | 3 +++
lib/mnesia/src/mnesia_schema.erl | 3 ++-
lib/mnesia/src/mnesia_tm.erl | 30 ++++++++++++++++++++++++++----
4 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/lib/mnesia/src/mnesia_lib.erl b/lib/mnesia/src/mnesia_lib.erl
index 4dce5cefc0..b1ca9ebf35 100644
--- a/lib/mnesia/src/mnesia_lib.erl
+++ b/lib/mnesia/src/mnesia_lib.erl
@@ -1065,7 +1065,7 @@ save2(DbgInfo) ->
Key = {'$$$_report', current_pos},
P =
case ?ets_lookup_element(mnesia_gvar, Key, 2) of
- 30 -> -1;
+ 100 -> -1;
I -> I
end,
set({'$$$_report', current_pos}, P+1),
diff --git a/lib/mnesia/src/mnesia_loader.erl b/lib/mnesia/src/mnesia_loader.erl
index b16d00da61..1a65dac222 100644
--- a/lib/mnesia/src/mnesia_loader.erl
+++ b/lib/mnesia/src/mnesia_loader.erl
@@ -226,6 +226,7 @@ do_get_network_copy(Tab, Reason, Ns, Storage, Cs) ->
dbg_out("Table ~tp copied from ~p to ~p~n", [Tab, Node, node()]),
{loaded, ok};
Err = {error, _} when element(1, Reason) == dumper ->
+ verbose("Copy failed: ~tp ~p~n", [Tab, Err]),
{not_loaded,Err};
restart ->
try_net_load_table(Tab, Reason, Tail ++ [Node], Cs);
@@ -339,6 +340,7 @@ start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,{dumper,{add_table_copy
Init = table_init_fun(SenderPid, Storage),
case do_init_table(Tab,Storage,Cs,SenderPid,TabSize,DetsData,self(), Init) of
Err = {error, _} ->
+ verbose("Init table failed: ~tp ~p~n", [Tab, Err]),
SenderPid ! {copier_done, node()},
Err;
Else ->
@@ -363,6 +365,7 @@ wait_on_load_complete(Pid) ->
{Pid, Res} ->
Res;
{'EXIT', Pid, Reason} ->
+ verbose("Loader crashed : ~tp ~p~n", [Pid, Reason]),
error(Reason);
Else ->
Pid ! Else,
diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl
index e0f561937e..27a7b118b5 100644
--- a/lib/mnesia/src/mnesia_schema.erl
+++ b/lib/mnesia/src/mnesia_schema.erl
@@ -2466,13 +2466,14 @@ prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}, _WaitFor) ->
_ ->
ok
end,
- mnesia_lib:verbose("~w:~w Adding table~n",[?MODULE,?LINE]),
case mnesia_controller:get_network_copy(Tid, Tab, Cs) of
{loaded, ok} ->
%% Tables are created by mnesia_loader get_network code
insert_cstruct(Tid, Cs, true),
{true, optional};
+ {not_loaded, {not_active, schema, Node}} ->
+ mnesia:abort({node_not_running, Node});
{not_loaded, ErrReason} ->
Reason = {system_limit, Tab, {Node, ErrReason}},
mnesia:abort(Reason)
diff --git a/lib/mnesia/src/mnesia_tm.erl b/lib/mnesia/src/mnesia_tm.erl
index 5a070cf0cd..3a080e1d32 100644
--- a/lib/mnesia/src/mnesia_tm.erl
+++ b/lib/mnesia/src/mnesia_tm.erl
@@ -892,27 +892,26 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
return_abort(Fun, Args, Why),
Factor = 1,
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
- dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
+ log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
timer:sleep(SleepTime),
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
{node_not_running, _N} -> %% Avoids hanging in receive_release_tid_ack
return_abort(Fun, Args, Why),
Factor = 1,
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
- dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
+ log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
timer:sleep(SleepTime),
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
_ ->
SleepTime = mnesia_lib:random_time(Factor0, Tid#tid.counter),
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
-
+
if
Factor0 /= 10 ->
ignore;
true ->
%% Our serial may be much larger than other nodes ditto
AllNodes = val({current, db_nodes}),
- verbose("Sync serial ~p~n", [Tid]),
rpc:abcast(AllNodes, ?MODULE, {sync_trans_serial, Tid})
end,
intercept_friends(Tid, Ts),
@@ -931,6 +930,24 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
end
end.
+log_restart(F,A) ->
+ case get(transaction_client) of
+ undefined ->
+ dbg_out(F,A);
+ _ ->
+ case get(transaction_count) of
+ undefined ->
+ put(transaction_count, 1),
+ verbose(F,A);
+ N when (N rem 10) == 0 ->
+ put(transaction_count, N+1),
+ verbose(F,A);
+ N ->
+ put(transaction_count, N+1),
+ dbg_out(F,A)
+ end
+ end.
+
get_restarted(Tid) ->
case Res = rec() of
{restarted, Tid} ->
@@ -2086,6 +2103,7 @@ new_cr_format(#commit{ext=Snmp}=Cr) ->
Cr#commit{ext=[{snmp,Snmp}]}.
rec_all([Node | Tail], Tid, Res, Pids) ->
+ put({?MODULE, ?FUNCTION_NAME}, {Node, Tail}),
receive
{?MODULE, Node, {vote_yes, Tid}} ->
rec_all(Tail, Tid, Res, Pids);
@@ -2104,8 +2122,12 @@ rec_all([Node | Tail], Tid, Res, Pids) ->
Abort = {do_abort, {bad_commit, Node}},
?SAFE({?MODULE, Node} ! {Tid, Abort}),
rec_all(Tail, Tid, Abort, Pids)
+ after 15000 ->
+ mnesia_lib:verbose("~p: trans ~p waiting ~p~n", [self(), Tid, Node]),
+ rec_all([Node | Tail], Tid, Res, Pids)
end;
rec_all([], _Tid, Res, Pids) ->
+ erase({?MODULE, ?FUNCTION_NAME}),
{Res, Pids}.
get_transactions() ->
--
2.35.3