File 9371-erl_tar-Stream-file-entries-to-disk-instead-of-loadi.patch of Package erlang
From fbf2c7194f6fce4b7b66dc0bcaa4f6b4b85e088f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?=
<eric.meadows.jonsson@gmail.com>
Date: Sun, 8 Mar 2026 20:04:58 +0100
Subject: [PATCH] erl_tar: Stream file entries to disk instead of loading into
memory
When extracting tar entries to disk (using the {cwd, Dir} option),
erl_tar previously read each file entry fully into memory before
writing it to disk. For large files this causes unnecessary memory
usage.
This change makes the disk extraction path stream file entries in
chunks (default 64KB) directly from the tar reader to the output
file. The in-memory extraction path (using the memory option) is
unchanged.
A new {chunks, N} option is added for extract/2 to control the
chunk size, matching the existing {chunks, N} option for add/4.
Also fixes the chunk_size comment in add_opts which incorrectly
stated "0=do not chunk" when 0 actually means "use default (65536)".
---
lib/stdlib/doc/src/erl_tar.xml |  12 +++++-----
lib/stdlib/src/erl_tar.erl | 104 ++++++++++++++++++++++++++++++----
lib/stdlib/src/erl_tar.hrl | 8 ++-
lib/stdlib/test/tar_SUITE.erl | 83 ++++++++++++++++++++++++++-
4 files changed, 184 insertions(+), 23 deletions(-)
diff --git a/lib/stdlib/doc/src/erl_tar.xml b/lib/stdlib/doc/src/erl_tar.xml
index fb57044c3b..78a2b236e4 100644
--- a/lib/stdlib/doc/src/erl_tar.xml
+++ b/lib/stdlib/doc/src/erl_tar.xml
@@ -165,11 +165,8 @@
</item>
<tag><c>{chunks,ChunkSize}</c></tag>
<item>
- <p>Reads data in parts from the file. This is intended for
- memory-limited machines that, for example, builds a tar file
- on a remote machine over SFTP, see
- <seemfa marker="ssh:ssh_sftp#open_tar/3">
- <c>ssh_sftp:open_tar/3</c></seemfa>.</p>
+ <p>Sets the chunk size, in bytes, for reading data from the file.
+ Defaults to 65536 bytes.</p>
</item>
<tag><c>{atime,non_neg_integer()}</c></tag>
<item>
@@ -358,6 +355,11 @@
<item>
<p>Prints an informational message for each extracted file.</p>
</item>
+ <tag><c>{chunks,ChunkSize}</c></tag>
+ <item>
+ <p>Sets the chunk size, in bytes, for writing extracted file data to disk.
+ Defaults to 65536 bytes.</p>
+ </item>
</taglist>
<warning>
<p>The <c>compressed</c> and <c>cooked</c> flags are invalid when
diff --git a/lib/stdlib/src/erl_tar.erl b/lib/stdlib/src/erl_tar.erl
index fb57044c3b..78a2b236e4 100644
--- a/lib/stdlib/src/erl_tar.erl
+++ b/lib/stdlib/src/erl_tar.erl
@@ -345,12 +348,18 @@ extract1(eof, Reader, _, Acc) ->
extract1(#tar_header{name=Name,size=Size}=Header, Reader0, Opts, Acc0) ->
case check_extract(Name, Opts) of
true ->
- case do_read(Reader0, Size) of
- {ok, Bin, Reader1} ->
- Acc = extract2(Header, Bin, Opts, Acc0),
- {ok, Acc, Reader1};
- {error, _} = Err ->
- throw(Err)
+ case Opts#read_opts.output of
+ memory ->
+ case do_read(Reader0, Size) of
+ {ok, Bin, Reader1} ->
+ Acc = extract2(Header, Bin, Opts, Acc0),
+ {ok, Acc, Reader1};
+ {error, _} = Err ->
+ throw(Err)
+ end;
+ file ->
+ Reader1 = extract_to_file(Header, Reader0, Opts),
+ {ok, Acc0, Reader1}
end;
false ->
{ok, Acc0, skip_file(Reader0)}
@@ -371,6 +380,79 @@ extract2(Header, Bin, Opts, Acc) ->
throw(Err)
end.
+extract_to_file(#tar_header{name=Name0}=Header, Reader0, Opts) ->
+ case typeflag(Header#tar_header.typeflag) of
+ regular ->
+ Name1 = make_safe_path(Name0, Opts),
+ case stream_to_file(Name1, Reader0, Opts) of
+ {ok, Reader1} ->
+ read_verbose(Opts, "x ~ts~n", [Name0]),
+ _ = set_extracted_file_info(Name1, Header),
+ Reader1;
+ {error, _} = Err ->
+ throw(Err)
+ end;
+ _ ->
+ Reader1 = skip_file(Reader0),
+ _ = write_extracted_element(Header, <<>>, Opts),
+ Reader1
+ end.
+
+stream_to_file(Name, Reader0, Opts) ->
+ Write =
+ case Opts#read_opts.keep_old_files of
+ true ->
+ case file:read_file_info(Name) of
+ {ok, _} -> false;
+ _ -> true
+ end;
+ false -> true
+ end,
+ case Write of
+ true ->
+ ChunkSize = Opts#read_opts.chunk_size,
+ case open_output_file(Name) of
+ {ok, Fd} ->
+ try
+ stream_to_file_loop(Fd, Reader0, ChunkSize)
+ after
+ file:close(Fd)
+ end;
+ {error, _} = Err ->
+ Err
+ end;
+ false ->
+ {ok, skip_file(Reader0)}
+ end.
+
+open_output_file(Name) ->
+ case file:open(Name, [write, raw, binary]) of
+ {ok, _} = Ok ->
+ Ok;
+ {error, enoent} ->
+ ok = make_dirs(Name, file),
+ file:open(Name, [write, raw, binary]);
+ {error, _} = Err ->
+ Err
+ end.
+
+stream_to_file_loop(_Fd, #reg_file_reader{num_bytes=0}=Reader, _ChunkSize) ->
+ {ok, Reader};
+stream_to_file_loop(_Fd, #sparse_file_reader{num_bytes=0}=Reader, _ChunkSize) ->
+ {ok, Reader};
+stream_to_file_loop(Fd, Reader, ChunkSize) ->
+ case do_read(Reader, ChunkSize) of
+ {ok, Bin, Reader1} ->
+ case file:write(Fd, Bin) of
+ ok ->
+ stream_to_file_loop(Fd, Reader1, ChunkSize);
+ {error, _} = Err ->
+ Err
+ end;
+ {error, _} = Err ->
+ Err
+ end.
+
%% Checks if the file Name should be extracted.
check_extract(_, #read_opts{files=all}) ->
true;
@@ -2135,9 +2216,6 @@ do_write(#reader{handle=Handle,func=Fun}=Reader0, Data)
Err
end.
-do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=0}=Opts)
- when is_function(Fun, 2) ->
- do_copy(Reader, Source, Opts#add_opts{chunk_size=65536});
do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=ChunkSize})
when is_function(Fun, 2) ->
case file:open(Source, [read, binary]) of
@@ -2311,6 +2389,8 @@ extract_opts([cooked|Rest], Opts=#read_opts{open_mode=OpenMode}) ->
extract_opts(Rest, Opts#read_opts{open_mode=[cooked|OpenMode]});
extract_opts([verbose|Rest], Opts) ->
extract_opts(Rest, Opts#read_opts{verbose=true});
+extract_opts([{chunks,N}|Rest], Opts) ->
+ extract_opts(Rest, Opts#read_opts{chunk_size=N});
extract_opts([Other|Rest], Opts) ->
extract_opts(Rest, read_opts([Other], Opts));
extract_opts([], Opts) ->
diff --git a/lib/stdlib/src/erl_tar.hrl b/lib/stdlib/src/erl_tar.hrl
index 38bd6834d0..aa48b45cc9 100644
--- a/lib/stdlib/src/erl_tar.hrl
+++ b/lib/stdlib/src/erl_tar.hrl
@@ -21,8 +21,8 @@
%% Options used when adding files to a tar archive.
-record(add_opts, {
- read_info, %% Fun to use for read file/link info.
- chunk_size = 0, %% For file reading when sending to sftp. 0=do not chunk
+ read_info, %% Fun to use for read file/link info.
+ chunk_size = 65536, %% Chunk size for reading files.
verbose = false, %% Verbose on/off.
atime = undefined,
mtime = undefined,
@@ -39,7 +39,8 @@
files = all, %% Set of files to extract (or all)
output = file :: 'file' | 'memory',
open_mode = [], %% Open mode options.
- verbose = false :: boolean()}). %% Verbose on/off.
+ verbose = false :: boolean(), %% Verbose on/off.
+ chunk_size = 65536}). %% Chunk size for streaming to disk.
-type read_opts() :: #read_opts{}.
-type add_opt() :: dereference |
@@ -56,6 +57,7 @@
-type extract_opt() :: {cwd, string()} |
{files, [name_in_archive()]} |
+ {chunks, pos_integer()} |
compressed |
cooked |
memory |
diff --git a/lib/stdlib/test/tar_SUITE.erl b/lib/stdlib/test/tar_SUITE.erl
index 397e8ad01a..89e0b3a5d5 100644
--- a/lib/stdlib/test/tar_SUITE.erl
+++ b/lib/stdlib/test/tar_SUITE.erl
@@ -21,7 +21,7 @@
%%
-module(tar_SUITE).
--export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
+-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
init_per_group/2, end_per_group/2,
init_per_testcase/2,
borderline/1, atomic/1, long_names/1,
@@ -31,7 +31,8 @@
memory/1,unicode/1,read_other_implementations/1,bsdtgz/1,
sparse/1, init/1, leading_slash/1, dotdot/1,
roundtrip_metadata/1, apply_file_info_opts/1,
- incompatible_options/1, table_absolute_names/1]).
+ incompatible_options/1, table_absolute_names/1,
+ streamed_extract/1]).
-include_lib("common_test/include/ct.hrl").
-include_lib("kernel/include/file.hrl").
@@ -46,7 +47,8 @@ all() ->
symlinks, open_add_close, cooked_compressed, memory, unicode,
read_other_implementations, bsdtgz,
sparse,init,leading_slash,dotdot,roundtrip_metadata,
- apply_file_info_opts,incompatible_options, table_absolute_names].
+ apply_file_info_opts,incompatible_options, table_absolute_names,
+ streamed_extract].
groups() ->
[].
@@ -1093,6 +1095,81 @@ table_absolute_names(Config) ->
ok.
+%% Test that extracting to disk streams file entries in chunks
+%% instead of loading them fully into memory.
+streamed_extract(Config) ->
+ PrivDir = proplists:get_value(priv_dir, Config),
+ Dir = filename:join(PrivDir, ?FUNCTION_NAME),
+ ok = file:make_dir(Dir),
+
+ %% Create test files of various sizes.
+ EmptyFile = filename:join(Dir, "empty"),
+ ok = file:write_file(EmptyFile, <<>>),
+
+ %% A file larger than the default chunk size (65536 bytes).
+ LargeSize = 200000,
+ LargeData = crypto:strong_rand_bytes(LargeSize),
+ LargeFile = filename:join(Dir, "large"),
+ ok = file:write_file(LargeFile, LargeData),
+
+ %% A file exactly equal to a small chunk size we'll use (1024 bytes).
+ ChunkSize = 1024,
+ BoundaryData = crypto:strong_rand_bytes(ChunkSize),
+ BoundaryFile = filename:join(Dir, "boundary"),
+ ok = file:write_file(BoundaryFile, BoundaryData),
+
+ %% A small file (less than one chunk).
+ SmallData = <<"hello">>,
+ SmallFile = filename:join(Dir, "small"),
+ ok = file:write_file(SmallFile, SmallData),
+
+ %% Create a tar archive containing all test files.
+ TarFile = filename:join(Dir, "test.tar"),
+ ok = erl_tar:create(TarFile, [
+ {"empty", EmptyFile},
+ {"large", LargeFile},
+ {"boundary", BoundaryFile},
+ {"small", SmallFile}
+ ]),
+
+ %% Extract with default chunk size and verify contents.
+ ExtractDir1 = filename:join(Dir, "extract_default"),
+ ok = file:make_dir(ExtractDir1),
+ ok = erl_tar:extract(TarFile, [{cwd, ExtractDir1}]),
+ {ok, <<>>} = file:read_file(filename:join(ExtractDir1, "empty")),
+ {ok, LargeData} = file:read_file(filename:join(ExtractDir1, "large")),
+ {ok, BoundaryData} = file:read_file(filename:join(ExtractDir1, "boundary")),
+ {ok, SmallData} = file:read_file(filename:join(ExtractDir1, "small")),
+
+ %% Extract with a small {chunks, N} to exercise multi-chunk streaming.
+ ExtractDir2 = filename:join(Dir, "extract_chunked"),
+ ok = file:make_dir(ExtractDir2),
+ ok = erl_tar:extract(TarFile, [{cwd, ExtractDir2}, {chunks, ChunkSize}]),
+ {ok, <<>>} = file:read_file(filename:join(ExtractDir2, "empty")),
+ {ok, LargeData} = file:read_file(filename:join(ExtractDir2, "large")),
+ {ok, BoundaryData} = file:read_file(filename:join(ExtractDir2, "boundary")),
+ {ok, SmallData} = file:read_file(filename:join(ExtractDir2, "small")),
+
+ %% Extract from binary with {chunks, N} (binary input, disk output).
+ {ok, TarBin} = file:read_file(TarFile),
+ ExtractDir3 = filename:join(Dir, "extract_binary"),
+ ok = file:make_dir(ExtractDir3),
+ ok = erl_tar:extract({binary, TarBin}, [{cwd, ExtractDir3}, {chunks, ChunkSize}]),
+ {ok, <<>>} = file:read_file(filename:join(ExtractDir3, "empty")),
+ {ok, LargeData} = file:read_file(filename:join(ExtractDir3, "large")),
+ {ok, BoundaryData} = file:read_file(filename:join(ExtractDir3, "boundary")),
+ {ok, SmallData} = file:read_file(filename:join(ExtractDir3, "small")),
+
+ %% Verify that memory extraction still works (not affected by streaming).
+ {ok, MemFiles} = erl_tar:extract(TarFile, [memory]),
+ MemMap = maps:from_list(MemFiles),
+ <<>> = maps:get("empty", MemMap),
+ LargeData = maps:get("large", MemMap),
+ BoundaryData = maps:get("boundary", MemMap),
+ SmallData = maps:get("small", MemMap),
+
+ ok.
+
%% Delete the given list of files.
delete_files([]) -> ok;
delete_files([Item|Rest]) ->
--
2.51.0