File 0121-stdlib-Handle-Unicode-filenames-in-the-zip-module.patch of Package erlang

From 0e423ac950b2f87f12828180470c6b4620cdef90 Mon Sep 17 00:00:00 2001
From: Hans Bolinder <hasse@erlang.org>
Date: Thu, 12 Sep 2019 08:54:54 +0200
Subject: [PATCH 2/2] stdlib: Handle Unicode filenames in the zip module

 Details:

- when searching for a filename, the names are normalized ('nfc');
- character codes greater than 127 are encoded;
- heuristics are used both for the .ZIP file comment and for file
  comments.
---
 lib/stdlib/src/zip.erl                        | 118 +++++++++++++++++------
 lib/stdlib/test/zip_SUITE.erl                 | 132 +++++++++++++++++++++++++-
 lib/stdlib/test/zip_SUITE_data/zip-latin1.zip | Bin 0 -> 115 bytes
 3 files changed, 220 insertions(+), 30 deletions(-)
 create mode 100644 lib/stdlib/test/zip_SUITE_data/zip-latin1.zip

diff --git a/lib/stdlib/src/zip.erl b/lib/stdlib/src/zip.erl
index 345e046e4c..688467d992 100644
--- a/lib/stdlib/src/zip.erl
+++ b/lib/stdlib/src/zip.erl
@@ -53,6 +53,10 @@
 %% for debugging, to turn off catch
 -define(CATCH, catch).
 
+%% Debug.
+-define(SHOW_GP_BIT_11(B, F), ok).
+%%-define(SHOW_GP_BIT_11(B, F), io:format("F = ~.16#, B = ~lp\n", [F, B])).
+
 %% option sets
 -record(unzip_opts, {
 	  output,      % output object (fun)
@@ -138,6 +142,8 @@
 -define(PKWARE_RESERVED, 11).
 -define(BZIP2_COMPRESSED, 12).
 
+-define(GP_BIT_11, 16#800). % Filename and file comment UTF-8 encoded.
+
 %% zip-file records
 -define(LOCAL_FILE_MAGIC,16#04034b50).
 -define(LOCAL_FILE_HEADER_SZ,(4+2+2+2+2+2+4+4+4+2+2)).
@@ -281,8 +287,11 @@ do_openzip_get(_, _) ->
     throw(einval).
 
 file_name_search(Name,Files) ->
-    case lists:dropwhile(fun({ZipFile,_}) -> ZipFile#zip_file.name =/= Name end,
-			 Files) of
+    Fun = fun({ZipFile,_}) ->
+                  not string:equal(ZipFile#zip_file.name, Name,
+                                   _IgnoreCase = false, _Norm = nfc)
+          end,
+    case lists:dropwhile(Fun, Files) of
 	[ZFile|_] -> ZFile;
 	[] -> false
     end.
@@ -621,9 +630,11 @@ get_zip_opt([Unknown | _Rest], _Opts) ->
 %% feedback funs
 silent(_) -> ok.
 
-verbose_unzip(FN) -> io:format("extracting: ~tp\n", [FN]).
+verbose_unzip(FN) ->
+    io:format("extracting: ~ts\n", [io_lib:write_string(FN)]).
 
-verbose_zip(FN) -> io:format("adding: ~tp\n", [FN]).
+verbose_zip(FN) ->
+    io:format("adding: ~ts\n", [io_lib:write_string(FN)]).
 
 %% file filter funs
 all(_) -> true.
@@ -654,7 +665,10 @@ get_zip_options(Files, Options) ->
 		     compress = all,
 		     uncompress = Suffixes
 		    },
-    get_zip_opt(Options, Opts).
+    Opts1 = #zip_opts{comment = Comment} = get_zip_opt(Options, Opts),
+    %% UTF-8 encode characters in the interval from 127 to 255.
+    {Comment1, _} = encode_string(Comment),
+    Opts1#zip_opts{comment = Comment1}.
 
 get_unzip_options(F, Options) ->
     Opts = #unzip_opts{file_filter = fun all/1,
@@ -849,16 +863,18 @@ put_z_files([F | Rest], Z, Out0, Pos0,
 	    regular -> FileInfo#file_info.size;
 	    directory -> 0
 	end,
-    FileName = get_filename(F, Type),
+    FileName0 = get_filename(F, Type),
+    %% UTF-8 encode characters in the interval from 127 to 255.
+    {FileName, GPFlag} = encode_string(FileName0),
     CompMethod = get_comp_method(FileName, UncompSize, Opts, Type),
-    LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName),
+    LH = local_file_header_from_info_method_name(FileInfo, UncompSize, CompMethod, FileName, GPFlag),
     BLH = local_file_header_to_bin(LH),
     B = [<<?LOCAL_FILE_MAGIC:32/little>>, BLH],
     Out1 = Output({write, B}, Out0),
     Out2 = Output({write, FileName}, Out1),
     {Out3, CompSize, CRC} = put_z_file(CompMethod, UncompSize, Out2, F1,
 				       0, Input, Output, OpO, Z, Type),
-    FB(FileName),
+    FB(FileName0),
     Patch = <<CRC:32/little, CompSize:32/little>>,
     Out4 = Output({pwrite, Pos0 + ?LOCAL_FILE_HEADER_CRC32_OFFSET, Patch}, Out3),
     Out5 = Output({seek, eof, 0}, Out4),
@@ -1102,10 +1118,10 @@ eocd_to_bin(#eocd{disk_num = DiskNum,
 %% put together a local file header
 local_file_header_from_info_method_name(#file_info{mtime = MTime},
 					UncompSize,
-					CompMethod, Name) ->
+					CompMethod, Name, GPFlag) ->
     {ModDate, ModTime} = dos_date_time_from_datetime(MTime),
     #local_file_header{version_needed = 20,
-		       gp_flag = 0,
+		       gp_flag = GPFlag,
 		       comp_method = CompMethod,
 		       last_mod_time = ModTime,
 		       last_mod_date = ModDate,
@@ -1269,7 +1285,9 @@ get_central_dir(In0, RawIterator, Input) ->
     In2 = Input({seek, bof, EOCD#eocd.offset}, In1),
     N = EOCD#eocd.entries,
     Acc0 = [],
-    Out0 = RawIterator(EOCD, "", binary_to_list(BComment), <<>>, Acc0),
+    %% There is no encoding flag for the archive comment.
+    Comment = heuristic_to_string(BComment),
+    Out0 = RawIterator(EOCD, "", Comment, <<>>, Acc0),
     get_cd_loop(N, In2, RawIterator, Input, Out0).
 
 get_cd_loop(0, In, _RawIterator, _Input, Acc) ->
@@ -1285,20 +1303,32 @@ get_cd_loop(N, In0, RawIterator, Input, Acc0) ->
     ExtraLen = CD#cd_file_header.extra_field_length,
     CommentLen = CD#cd_file_header.file_comment_length,
     ToRead = FileNameLen + ExtraLen + CommentLen,
+    GPFlag = CD#cd_file_header.gp_flag,
     {B2, In2} = Input({read, ToRead}, In1),
     {FileName, Comment, BExtra} =
-	get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen),
+	get_name_extra_comment(B2, FileNameLen, ExtraLen, CommentLen, GPFlag),
     Acc1 = RawIterator(CD, FileName, Comment, BExtra, Acc0),
     get_cd_loop(N-1, In2, RawIterator, Input, Acc1).
 
-get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen) ->
-    case B of
-	<<BFileName:FileNameLen/binary,
-	 BExtra:ExtraLen/binary,
-	 BComment:CommentLen/binary>> ->
-	    {binary_to_list(BFileName), binary_to_list(BComment), BExtra};
-	_ ->
-	    throw(bad_central_directory)
+get_name_extra_comment(B, FileNameLen, ExtraLen, CommentLen, GPFlag) ->
+    try
+        <<BFileName:FileNameLen/binary,
+          BExtra:ExtraLen/binary,
+          BComment:CommentLen/binary>> = B,
+        {binary_to_chars(BFileName, GPFlag),
+         %% Appendix D says: "If general purpose bit 11 is unset, the
+         %% file name and comment should conform to the original ZIP
+         %% character encoding." However, it seems that at least Linux
+         %% zip(1) encodes the comment without setting bit 11 if the
+         %% filename is 7-bit ASCII. If bit 11 is set,
+         %% binary_to_chars/1 could (should?) be called (it can fail),
+         %% but the choice is to employ heuristics in this case too
+         %% (it does not fail).
+         heuristic_to_string(BComment),
+         BExtra}
+    catch
+        _:_ ->
+            throw(bad_central_directory)
     end.
 
 %% get end record, containing the offset to the central directory
@@ -1427,7 +1457,8 @@ get_z_file(In0, Z, Input, Output, OpO, FB,
 					     LH#local_file_header.crc32}
 			       end,
 	    {BFileN, In3} = Input({read, FileNameLen + ExtraLen}, In1),
-	    {FileName, _} = get_file_name_extra(FileNameLen, ExtraLen, BFileN),
+	    {FileName, _} =
+                get_file_name_extra(FileNameLen, ExtraLen, BFileN, GPFlag),
 	    ReadAndWrite =
 		case check_valid_location(CWD, FileName) of
 		    {true,FileName1} ->
@@ -1487,12 +1518,13 @@ check_dir_level([".." | Parts], Level) ->
 check_dir_level([_Dir | Parts], Level) ->
     check_dir_level(Parts, Level+1).
 
-get_file_name_extra(FileNameLen, ExtraLen, B) ->
-    case B of
-	<<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> ->
-	    {binary_to_list(BFileName), BExtra};
-	_ ->
-	    throw(bad_file_header)
+get_file_name_extra(FileNameLen, ExtraLen, B, GPFlag) ->
+    try
+        <<BFileName:FileNameLen/binary, BExtra:ExtraLen/binary>> = B,
+        {binary_to_chars(BFileName, GPFlag), BExtra}
+    catch
+        _:_ ->
+            throw(bad_file_header)
     end.
 
 %% get compressed or stored data
@@ -1596,6 +1628,38 @@ skip_bin(B, Pos) when is_binary(B) ->
 	_ -> <<>>
     end.
 
+binary_to_chars(B, GPFlag) ->
+    ?SHOW_GP_BIT_11(B, GPFlag band ?GP_BIT_11),
+    case GPFlag band ?GP_BIT_11 of
+        0 ->
+            binary_to_list(B);
+        ?GP_BIT_11 ->
+            case unicode:characters_to_list(B) of
+                List when is_list(List) ->
+                    List
+            end
+    end.
+
+heuristic_to_string(B) when is_binary(B) ->
+    case unicode:characters_to_binary(B) of
+	B ->
+            unicode:characters_to_list(B);
+	_ ->
+            binary_to_list(B)
+    end.
+
+encode_string(String) ->
+    case lists:any(fun(C) -> C > 127 end, String) of
+        true ->
+            case unicode:characters_to_binary(String) of
+                B when is_binary(B) ->
+                    {binary_to_list(B), ?GP_BIT_11};
+                _ ->
+                    throw({bad_unicode, String})
+            end;
+        false ->
+            {String, 0}
+    end.
 
 %% ZIP header manipulations
 eocd_and_comment_from_bin(<<DiskNum:16/little,
diff --git a/lib/stdlib/test/zip_SUITE.erl b/lib/stdlib/test/zip_SUITE.erl
index 081bffa7cb..6272798c17 100644
--- a/lib/stdlib/test/zip_SUITE.erl
+++ b/lib/stdlib/test/zip_SUITE.erl
@@ -1,7 +1,7 @@
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 2006-2018. All Rights Reserved.
+%% Copyright Ericsson AB 2006-2019. All Rights Reserved.
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@
          openzip_api/1, zip_api/1, open_leak/1, unzip_jar/1,
 	 unzip_traversal_exploit/1,
          compress_control/1,
-	 foldl/1,fd_leak/1]).
+	 foldl/1,fd_leak/1,unicode/1]).
 
 -include_lib("common_test/include/ct.hrl").
 -include_lib("kernel/include/file.hrl").
@@ -40,7 +40,7 @@ all() ->
      unzip_to_binary, zip_to_binary, unzip_options,
      zip_options, list_dir_options, aliases, openzip_api,
      zip_api, open_leak, unzip_jar, compress_control, foldl,
-     unzip_traversal_exploit,fd_leak].
+     unzip_traversal_exploit,fd_leak,unicode].
 
 groups() -> 
     [].
@@ -913,3 +913,129 @@ do_fd_leak(Bad, N) ->
             io:format("Bad error after ~p attempts\n", [N]),
             erlang:raise(C, R, Stk)
     end.
+
+unicode(Config) ->
+    case file:native_name_encoding() of
+        latin1 ->
+            {comment, "Native name encoding is Latin-1; skipping all tests"};
+        utf8 ->
+            DataDir = proplists:get_value(data_dir, Config),
+            ok = file:set_cwd(proplists:get_value(priv_dir, Config)),
+            test_file_comment(DataDir),
+            test_archive_comment(DataDir),
+            test_bad_comment(DataDir),
+            test_latin1_archive(DataDir),
+            case has_zip() of
+                false ->
+                    {comment, "No zip program found; skipping some tests"};
+                true ->
+                    case zip_is_unicode_aware() of
+                        true ->
+                            ok;
+                        false ->
+                            {comment, "Old zip program; skipping some tests"}
+                    end
+            end
+    end.
+
+test_file_comment(DataDir) ->
+    Archive = filename:join(DataDir, "zip_file_comment.zip"),
+    Comments = ["a", [246], [1024]],
+    FileNames = [[C] ++ ".txt" || C <- [$a, 246, 1024]],
+    [begin
+         test_zip_file(FileName, Comment, Archive),
+         test_file_comment(FileName, Comment, Archive)
+     end ||
+        Comment <- Comments, FileName <- FileNames],
+    ok.
+
+test_zip_file(FileName, Comment, Archive) ->
+    _ = file:delete(Archive),
+    io:format("*** zip:zip(). Testing FileName ~ts, Comment ~ts\n",
+              [FileName, Comment]),
+    ok = file:write_file(FileName, ["anything"]),
+    {ok, Archive} =
+        zip:zip(Archive, [FileName], [verbose, {comment, Comment}]),
+    zip_check(Archive, Comment, FileName, "").
+
+test_file_comment(FileName, Comment, Archive) ->
+    case test_zip1() of
+        false ->
+            ok;
+        true ->
+            _ = file:delete(Archive),
+            io:format("*** zip(1). Testing FileName ~ts, Comment ~ts\n",
+                      [FileName, Comment]),
+            ok = file:write_file(FileName, ["anything"]),
+            R = os:cmd("echo " ++ Comment ++ "| zip -c " ++
+                           Archive ++ " " ++ FileName),
+            io:format("os:cmd/1 returns ~lp\n", [R]),
+            zip_check(Archive, "", FileName, Comment)
+    end.
+
+test_archive_comment(DataDir) ->
+    Archive = filename:join(DataDir, "zip_archive_comment.zip"),
+    Chars = [$a, 246, 1024],
+    [test_archive_comment(Char, Archive) || Char <- Chars],
+    ok.
+
+test_archive_comment(Char, Archive) ->
+    case test_zip1() of
+        false ->
+            ok;
+        true ->
+            _ = file:delete(Archive),
+            FileName = "a.txt",
+            Comment = [Char],
+            io:format("*** Testing archive Comment ~ts\n", [Comment]),
+            ok = file:write_file(FileName, ["anything"]),
+
+            {ok, _} =
+                zip:zip(Archive, [FileName], [verbose, {comment, Comment}]),
+            Res = os:cmd("zip -z " ++ Archive),
+            io:format("os:cmd/1 returns ~lp\n", [Res]),
+            true = lists:member(Char, Res),
+
+            os:cmd("echo " ++ Comment ++ "| zip -z "++
+                       Archive ++ " " ++ FileName),
+            zip_check(Archive, Comment, FileName, "")
+    end.
+
+test_zip1() ->
+    has_zip() andalso zip_is_unicode_aware().
+
+has_zip() ->
+    os:find_executable("zip") =/= false.
+
+zip_is_unicode_aware() ->
+    S = os:cmd("zip -v | grep 'UNICODE_SUPPORT'"),
+    string:find(S, "UNICODE_SUPPORT") =/= nomatch.
+
+zip_check(Archive, ArchiveComment, FileName, FileNameComment) ->
+    {ok, CommentAndFiles} = zip:table(Archive),
+    io:format("zip:table/1 returns\n  ~lp\n", [CommentAndFiles]),
+    io:format("checking archive comment ~lp\n", [ArchiveComment]),
+    [_] = [C || #zip_comment{comment = C} <- CommentAndFiles,
+                C =:= ArchiveComment],
+    io:format("checking filename ~lp\n", [FileName]),
+    io:format("and filename comment ~lp\n", [FileNameComment]),
+    [_] = [F || #zip_file{name = F, comment = C} <- CommentAndFiles,
+                F =:= FileName, C =:= FileNameComment],
+    {ok, FileList} = zip:unzip(Archive, [verbose]),
+    io:format("zip:unzip/2 returns\n  ~lp\n", [FileList]),
+    true = lists:member(FileName, FileList),
+    ok.
+
+test_bad_comment(DataDir) ->
+    Archive = filename:join(DataDir, "zip_bad_comment.zip"),
+    FileName = "a.txt",
+    file:write_file(FileName, ["something"]),
+    Comment = [9999999],
+    {error,{bad_unicode,Comment}} =
+        zip:zip(Archive, [FileName], [verbose, {comment, Comment}]).
+
+test_latin1_archive(DataDir) ->
+    Archive = filename:join(DataDir, "zip-latin1.zip"),
+    FileName = [246] ++ ".txt",
+    ArchiveComment = [246],
+    zip_check(Archive, ArchiveComment, FileName, "").
diff --git a/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip b/lib/stdlib/test/zip_SUITE_data/zip-latin1.zip
new file mode 100644
index 0000000000000000000000000000000000000000..d54c783653d5f45b075c0c2e599e498e77cf8089
GIT binary patch
literal 115
zcmWIWW@Zs#0D&l9UH_@_-{!Fa*&xgc#NYHvDoWDw^OF*b0=yZSL>O=@1*%2@V2uIZ
RtRRCJ7=h3jNNa-G-vF035%B;3

literal 0
HcmV?d00001

-- 
2.16.4

openSUSE Build Service is sponsored by