File 3581-stdlib-Improve-API-and-documentation-of-uri_string.patch of Package erlang

From 8efe45a03e615be3a8f3c3b81f5026e170cd893d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= <peterdmv@erlang.org>
Date: Wed, 30 Sep 2020 16:02:48 +0200
Subject: [PATCH] stdlib: Improve API and documentation of uri_string

---
 lib/stdlib/doc/src/Makefile                   |   2 +-
 lib/stdlib/doc/src/part.xml                   |   1 +
 lib/stdlib/doc/src/uri_string.xml             |  49 +++
 lib/stdlib/doc/src/uri_string_usage.xml       | 370 ++++++++++++++++++
 lib/stdlib/src/uri_string.erl                 | 121 +++++-
 .../property_test/uri_string_recompose.erl    |   9 +-
 lib/stdlib/test/uri_string_SUITE.erl          | 151 ++++---
 7 files changed, 642 insertions(+), 61 deletions(-)
 create mode 100644 lib/stdlib/doc/src/uri_string_usage.xml

diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile
index 1092ce3ffa..4b22e35e3b 100644
--- a/lib/stdlib/doc/src/Makefile
+++ b/lib/stdlib/doc/src/Makefile
@@ -101,7 +101,7 @@ XML_REF6_FILES = stdlib_app.xml
 
 XML_PART_FILES = part.xml
 XML_CHAPTER_FILES = introduction.xml io_protocol.xml unicode_usage.xml \
-	notes.xml assert_hrl.xml
+	uri_string_usage.xml notes.xml assert_hrl.xml
 
 BOOK_FILES = book.xml
 
diff --git a/lib/stdlib/doc/src/part.xml b/lib/stdlib/doc/src/part.xml
index 93c47405bf..b6a2f16b57 100644
--- a/lib/stdlib/doc/src/part.xml
+++ b/lib/stdlib/doc/src/part.xml
@@ -37,5 +37,6 @@
   <xi:include href="introduction.xml"/>
   <xi:include href="io_protocol.xml"/>
   <xi:include href="unicode_usage.xml"/>
+  <xi:include href="uri_string_usage.xml"/>
 </part>
 
diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml
index a792decbff..dea8e60979 100644
--- a/lib/stdlib/doc/src/uri_string.xml
+++ b/lib/stdlib/doc/src/uri_string.xml
@@ -84,6 +84,9 @@
       <item>Dissecting form-urlencoded query strings into a list of key-value pairs<br></br>
       <seealso marker="#dissect_query/1"><c>dissect_query/1</c></seealso>
       </item>
+      <item>Decoding percent-encoded triplets<br></br>
+      <seealso marker="#percent_decode/1"><c>percent_decode/1</c></seealso>
+      </item>
     </list>
     <p>There are four different encodings present during the handling of URIs:</p>
     <list type="bulleted">
@@ -149,6 +152,21 @@
 
   <funcs>
 
+    <func>
+      <name name="allowed_characters" arity="0" since="OTP 23.2"/>
+      <fsummary>Print allowed characters in URI components.</fsummary>
+      <desc>
+	<p>This is a utility function meant to be used in the shell for printing
+	the allowed characters in each
+	major URI component, and also in the most important character sets.
+	Please note that this function does not replace the ABNF rules defined by
+	the standards; these character sets are derived directly from those
+	aforementioned rules. For more information see the
+	<seealso marker="uri_string_usage#percent_encoding">Uniform Resource
+	Identifiers</seealso> chapter in stdlib's Users Guide.</p>
+      </desc>
+    </func>
+
     <func>
       <name name="compose_query" arity="1" since="OTP 21.0"/>
       <fsummary>Compose urlencoded query string.</fsummary>
@@ -308,6 +326,37 @@
       </desc>
     </func>
 
+    <func>
+      <name name="percent_decode" arity="1" since="OTP 23.2"/>
+      <fsummary>Decode percent-encoded triplets in the input.</fsummary>
+      <desc>
+	<p>Decodes all percent-encoded triplets in the input, which can be either a
+	<c>uri_string()</c> or a <c>uri_map()</c>. Note that this function performs
+	raw decoding and it shall be used on already parsed URI components. Applying
+	this function directly on a standard URI can effectively change it.</p>
+	<p>If the input encoding is not UTF-8, an error tuple is returned.</p>
+        <p><em>Example:</em></p>
+        <pre>
+1> <input>uri_string:percent_decode(#{host => "localhost-%C3%B6rebro",path => [],</input>
+1> <input>scheme => "http"}).</input>
+#{host => "localhost-örebro",path => [],scheme => "http"}
+2> <![CDATA[uri_string:percent_decode(<<"%C3%B6rebro">>).]]>
+<![CDATA[<<"örebro"/utf8>>]]>
+	</pre>
+	<warning><p>
+	Using <c>uri_string:percent_decode/1</c> directly on a URI is not safe. This
+	example shows that after each consecutive application of the function
+	the resulting URI will be changed. None of these URIs refer to the same
+        resource.</p>
+	<pre>
+<![CDATA[3> uri_string:percent_decode(<<"http://local%252Fhost/path">>).
+<<"http://local%2Fhost/path">>
+4> uri_string:percent_decode(<<"http://local%2Fhost/path">>).
+<<"http://local/host/path">>]]>
+        </pre></warning>
+      </desc>
+    </func>
+
     <func>
       <name name="recompose" arity="1" since="OTP 21.0"/>
       <fsummary>Recompose URI.</fsummary>
diff --git a/lib/stdlib/doc/src/uri_string_usage.xml b/lib/stdlib/doc/src/uri_string_usage.xml
new file mode 100644
index 0000000000..72851096b7
--- /dev/null
+++ b/lib/stdlib/doc/src/uri_string_usage.xml
@@ -0,0 +1,370 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE chapter SYSTEM "chapter.dtd">
+
+<chapter>
+  <header>
+    <copyright>
+      <year>2020</year>
+      <year>2020</year>
+      <holder>Ericsson AB. All Rights Reserved.</holder>
+    </copyright>
+    <legalnotice>
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+
+    </legalnotice>
+
+    <title>Uniform Resource Identifiers</title>
+    <prepared>Péter Dimitrov</prepared>
+    <responsible></responsible>
+    <docno></docno>
+    <approved></approved>
+    <checked></checked>
+    <date>2020-09-30</date>
+    <rev>PA1</rev>
+    <file>uri_string_usage.xml</file>
+  </header>
+  <section>
+    <title>Basics</title>
+    <p>At the time of writing this document, in October 2020, there are
+    two major standards concerning Uniform Resource Identifiers and
+    Uniform Resource Locators:</p>
+    <list type="bulleted">
+      <item><p>
+	<url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986 - Uniform Resource
+      Identifier (URI): Generic Syntax</url></p></item>
+      <item><p>
+	<url href="https://url.spec.whatwg.org/">WHAT WG URL - Living standard</url>
+      </p></item>
+    </list>
+    <p>
+    The former is a classical standard with a proper formal syntax, using the
+    so-called <url href="https://www.ietf.org/rfc/rfc2234.txt">Augmented Backus-Naur Form
+    (ABNF)</url> for describing
+    the grammar, while the latter is a living document describing the current practice,
+    that is, how a majority of Web browsers work with URIs. WHAT WG URL is Web focused
+    and it has no formal grammar but a plain English description of the algorithms
+    that should be followed.</p>
+    <p>What is the difference between them, if any? They provide an overlapping
+    definition for resource identifiers and they are not compatible.
+    The <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso> module implements
+    <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url> and the term URI will
+    be used throughout this document. A URI is an identifier, a string of characters
+    that identifies a particular resource.</p>
+    <p>
+    For a more complete problem
+    statement regarding the URIs check the
+    <url href="https://tools.ietf.org/html/draft-ruby-url-problem-01">URL Problem
+    Statement and Directions</url>.</p>
+  </section>
+
+  <section>
+    <title>What is a URI?</title>
+    <p>Let's start with what it is not. It is not the text that you type in the address
+    bar in your Web browser. Web browsers do all possible heuristics to convert the
+    input into a valid URI that could be sent over the network.</p>
+    <p>A URI is an identifier consisting of a sequence of characters matching the syntax
+    rule named <c>URI</c> in
+    <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url>.
+    </p>
+    <p>It is crucial to clarify that a <i>character</i> is a symbol that is displayed on
+    a terminal or written to paper and should not be confused with its internal
+    representation.</p>
+    <p>A URI, more specifically, is a sequence of characters from a
+    subset of the US ASCII character set. The generic URI syntax consists of a
+    hierarchical sequence of components referred to as the scheme, authority,
+    path, query, and fragment. There is a formal description for
+    each of these components in
+    <url href="https://www.ietf.org/rfc/rfc2234.txt">ABNF</url> notation in
+    <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url>:</p>
+    <pre>
+    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+    hier-part   = "//" authority path-abempty
+                   / path-absolute
+                   / path-rootless
+                   / path-empty
+    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    authority   = [ userinfo "@" ] host [ ":" port ]
+    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
+
+    reserved    = gen-delims / sub-delims
+    gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    sub-delims  = "!" / "$" / "&amp;" / "'" / "(" / ")"
+                / "*" / "+" / "," / ";" / "="
+
+    unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    </pre>
+  </section>
+
+  <section>
+    <title>The uri_string module</title>
+    <p>As producing and consuming standard URIs can get quite complex, Erlang/OTP
+    provides
+    a module, <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso>, to handle all the most difficult operations such as parsing,
+    recomposing, normalizing and resolving URIs against a base URI.
+    </p>
+    <p>The API functions in <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso>
+    work on two basic data types
+    <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso> and
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>.
+    <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso> represents a
+    standard URI, while
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso> is a wider datatype,
+    that can represent URI components using
+    <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> characters.
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>
+    is a convenient choice for enabling
+    operations such as producing standard compliant URIs out of components that have
+    special or <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+    characters. It is easier to explain this by an example.
+    </p>
+    <p>Let's say that we would like to create the following URI and send it over the
+    network: <c>http://cities/örebro?foo bar</c>. This is not a valid URI as it contains
+    characters that are not allowed in a URI such as "ö" and the space. We can verify
+    this by parsing the URI:
+  </p>
+  <pre>
+  1> uri_string:parse("http://cities/örebro?foo bar").
+  {error,invalid_uri,":"}
+  </pre>
+  <p>The URI parser tries all possible combinations to interpret the input and fails
+  at the last attempt when it encounters the colon character <c>":"</c>. Note that
+  the initial fault occurs when the parser attempts to interpret the character
+  <c>"ö"</c> and after a failure backtracks to the point where it has another
+  possible parsing alternative.</p>
+  <p>The proper way to solve this problem is to use
+  <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+  with a <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso> as input:</p>
+  <pre>
+  2> uri_string:recompose(#{scheme => "http", host => "cities", path => "/örebro",
+  query => "foo bar"}).
+  "http://cities/%C3%B6rebro?foo%20bar"
+  </pre>
+  <p>The result is a valid URI where all the special characters are encoded as defined
+  by the standard. Applying
+  <seealso marker="uri_string#parse/1"><c>uri_string:parse/1</c></seealso> and
+  <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+  on the URI returns the original input:
+  </p>
+  <pre>
+  3> uri_string:percent_decode(uri_string:parse("http://cities/%C3%B6rebro?foo%20bar")).
+  #{host => "cities",path => "/örebro",query => "foo bar",
+  scheme => "http"}
+  </pre>
+  <p>This symmetric property is heavily used in our property test suite.
+  </p>
+  </section>
+
+  <section>
+    <title>Percent-encoding</title>
+    <p>As you have seen in the previous section, a standard URI can only contain a strict
+    subset of the US ASCII character set, moreover the allowed set of characters is not
+    the same in the different URI components. Percent-encoding is a mechanism to
+    represent a data octet in a component when that octet's corresponding character
+    is outside of
+    the allowed set or is being used as a delimiter. This is what you see when <c>"ö"</c>
+    is encoded as <c>%C3%B6</c> and <c>space</c> as <c>%20</c>.
+    Most of the API functions are
+    expecting UTF-8 encoding when handling percent-encoded triplets. The UTF-8 encoding
+    of the <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+    character <c>"ö"</c> is two octets: <c>0xC3 0xB6</c>.
+    The character <c>space</c> is in the first 128 characters of
+    <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> and it is encoded
+    using a single octet <c>0x20</c>.</p>
+    <note><p><seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+    is backward compatible with ASCII, the encoding of the first 128
+    characters is the same binary value as in ASCII.
+    </p></note>
+    <p><marker id="percent_encoding"></marker>
+    It is a major source of confusion exactly which characters will be
+    percent-encoded. In order to make it easier to answer this question the library
+    provides a utility function,
+    <seealso marker="uri_string#allowed_characters/0"><c>uri_string:allowed_characters/0
+    </c></seealso>,
+    that lists the allowed set of characters in each major
+    URI component, and also in the most important standard character sets.
+    </p>
+    <pre>
+    1> uri_string:allowed_characters().
+    <![CDATA[[{scheme,
+     "+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
+    {userinfo,
+     "!$%&'()*+,-.0123456789:;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {host,
+     "!$&'()*+,-.0123456789:;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {ipv4,".0123456789"},
+    {ipv6,".0123456789:ABCDEFabcdef"},
+    {regname,
+     "!$%&'()*+,-.0123456789;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {path,
+     "!$%&'()*+,-./0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {query,
+     "!$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {fragment,
+     "!$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+    {reserved,"!#$&'()*+,/:;=?@[]"},
+    {unreserved,
+     "-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"}] ]]>
+    </pre>
+    <p>If a URI component has a character that is not allowed, it will be
+    percent-encoded when the URI is produced:
+    </p>
+    <pre>
+    2> uri_string:recompose(#{scheme => "https", host => "local#host", path => ""}).
+    "https://local%23host"
+    </pre>
+    <p>Consuming a URI containing percent-encoded triplets can take many steps. The
+    following example shows how to handle an input URI that is not normalized and
+    contains multiple percent-encoded triplets.
+    First, the input <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso>
+    is to be parsed into a <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>.
+    The parsing only splits the URI into its components without doing any decoding:
+    </p>
+    <pre>
+    3> uri_string:parse("http://%6C%6Fcal%23host/%F6re%26bro%20").
+    #{host => "%6C%6Fcal%23host",path => "/%F6re%26bro%20",
+      scheme => "http"}
+    </pre>
+    <p>The input is a valid URI but how can you decode those
+    percent-encoded octets? You can try to normalize the input with
+    <seealso marker="uri_string#normalize/1"><c>uri_string:normalize/1</c></seealso>. The
+    normalize operation decodes those
+    percent-encoded triplets that correspond to a character in the unreserved set.
+    Normalization is a safe, idempotent operation that converts a URI into its
+    canonical form:</p>
+    <pre>
+    4> uri_string:normalize("http://%6C%6Fcal%23host/%F6re%26bro%20").
+    "http://local%23host/%F6re%26bro%20"
+    5> uri_string:normalize("http://%6C%6Fcal%23host/%F6re%26bro%20", [return_map]).
+    #{host => "local%23host",path => "/%F6re%26bro%20",
+      scheme => "http"}
+    </pre>
+    <p>There are still a few percent-encoded triplets left in the output. At this point,
+    when the URI is already parsed, it is safe to apply application specific decoding on
+    the remaining character triplets. Erlang/OTP provides a function,
+    <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+    for raw percent decoding
+    that you can use on the host and path components, or on the whole map:
+    </p>
+    <pre>
+    6> uri_string:percent_decode("local%23host").
+    "local#host"
+    7> uri_string:percent_decode("/%F6re%26bro%20").
+    <![CDATA[{error,invalid_utf8,<<"/öre&bro ">>}]]>
+    8> uri_string:percent_decode(#{host => "local%23host",path => "/%F6re%26bro%20",
+    scheme => "http"}).
+    <![CDATA[{error,{invalid,{path,{invalid_utf8,<<"/öre&bro ">>}}}}]]>
+    </pre>
+    <p>The <c>host</c> was successfully decoded but the path contains at least one
+    character with
+    non-UTF-8 encoding. In order to be able to decode this, you have to make assumptions
+    about the encoding used in these triplets. The most obvious choice is
+    <i>latin-1</i>, so you can try
+    <seealso marker="uri_string#transcode/2"><c>uri_string:transcode/2</c></seealso>, to
+    transcode the path to UTF-8 and run the percent-decode operation on the
+    transcoded string:
+    </p>
+    <pre>
+    9> uri_string:transcode("/%F6re%26bro%20", [{in_encoding, latin1}]).
+    "/%C3%B6re%26bro%20"
+    10> uri_string:percent_decode("/%C3%B6re%26bro%20").
+    <![CDATA["/öre&bro "]]>
+    </pre>
+    <p>It is important to emphasize that it is not safe to apply
+    <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+    directly on an input URI:
+    </p>
+    <pre>
+    11> uri_string:percent_decode("http://%6C%6Fcal%23host/%C3%B6re%26bro%20").
+    <![CDATA["http://local#host/öre&bro "
+    12> uri_string:parse("http://local#host/öre&bro ").]]>
+    {error,invalid_uri,":"}
+    </pre>
+    <note><p>Percent-encoding is implemented in
+    <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+    and it happens when converting a
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>
+    into a <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso>.
+    There is no equivalent to a raw percent-encoding function as percent-encoding
+    shall be applied on the component level using different sets of allowed characters.
+    Applying percent-encoding directly on an input URI would not be safe just as in
+    the case of
+    <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>,
+    the output could be an invalid URI.
+    </p>
+    </note>
+  </section>
+
+  <section>
+    <title>Normalization</title>
+    <p>Normalization is the operation of converting the input URI into a <i>canonical</i>
+    form and keeping the reference to the same underlying resource. The most common
+    application of normalization is determining whether two URIs are equivalent
+    without accessing their referenced resources.</p>
+    <p>Normalization has 6 distinct steps. First the input URI is parsed into an
+    intermediate form that can handle
+    <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> characters.
+    This datatype is the
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>, that can hold the
+    components of the URI in map elements of type
+    <seealso marker="unicode#type-chardata"><c>unicode:chardata()</c></seealso>.
+    After having the intermediate form, a sequence of
+    normalization algorithms are applied to the individual URI components:</p>
+    <taglist>
+      <tag>Case normalization</tag>
+      <item>
+	<p>Converts the <c>scheme</c> and <c>host</c> components
+	to lower case as they are not case sensitive.</p>
+      </item>
+      <tag>Percent-encoding normalization</tag>
+      <item>
+	<p>Decodes percent-encoded triplets that
+	correspond to characters in the unreserved set.</p>
+      </item>
+      <tag>Scheme-based normalization</tag>
+      <item>
+	<p>Applying rules for the schemes http, https,
+	ftp, ssh, sftp and tftp.</p>
+      </item>
+      <tag>Path segment normalization</tag>
+      <item>
+	<p>Converts the path into a canonical form.</p>
+      </item>
+    </taglist>
+    <p>After these steps, the intermediate data structure, an
+    <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>,
+    is fully normalized. The last step is applying
+    <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+    that converts the intermediate structure into a valid canonical URI string.</p>
+    <p>Notice the order:
+    <seealso marker="uri_string#normalize/2"><c>uri_string:normalize(URIMap, [return_map])</c></seealso>, which we
+    used many times in this user guide, is a shortcut in the normalization process
+    returning the intermediate data structure, and allowing us to inspect and apply
+    further decoding on the remaining percent-encoded triplets.</p>
+    <pre>
+    13> uri_string:normalize("hTTp://LocalHost:80/%c3%B6rebro/a/../b").
+    "http://localhost/%C3%B6rebro/b"
+    14> uri_string:normalize("hTTp://LocalHost:80/%c3%B6rebro/a/../b", [return_map]).
+    #{host => "localhost",path => "/%C3%B6rebro/b",
+      scheme => "http"}
+    </pre>
+  </section>
+
+ <section>
+   <title>Special considerations</title>
+   <p>The current URI implementation provides support for producing and consuming
+   standard URIs. The API is not meant to be directly exposed in a Web
+   browser's address bar where users can basically enter free text. Application
+   designers shall implement proper heuristics to map the input into a parsable URI.</p>
+ </section>
+
+</chapter>
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index 0b84a8a91d..bb6c9e542a 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -226,10 +226,21 @@
 %%-------------------------------------------------------------------------
 %% External API
 %%-------------------------------------------------------------------------
--export([compose_query/1, compose_query/2,
-         dissect_query/1, normalize/1, normalize/2, parse/1,
-         recompose/1, resolve/2, resolve/3, transcode/2]).
--export_type([error/0, uri_map/0, uri_string/0]).
+-export([allowed_characters/0,
+         compose_query/1,
+         compose_query/2,
+         dissect_query/1,
+         normalize/1,
+         normalize/2,
+         percent_decode/1,
+         parse/1,
+         recompose/1,
+         resolve/2,
+         resolve/3,
+         transcode/2]).
+-export_type([error/0,
+              uri_map/0,
+              uri_string/0]).
 
 
 %%-------------------------------------------------------------------------
@@ -286,7 +297,7 @@
     port => non_neg_integer() | undefined,
     query => unicode:chardata(),
     scheme => unicode:chardata(),
-    userinfo => unicode:chardata()} | #{}.
+    userinfo => unicode:chardata()}.
 
 
 %%-------------------------------------------------------------------------
@@ -452,6 +463,61 @@ transcode(URIString, Options) when is_list(URIString) ->
     end.
 
 
+%%-------------------------------------------------------------------------
+%% Misc: allowed_characters/0 (is_* checks applied to 0..127), percent_decode/1
+%%-------------------------------------------------------------------------
+-spec allowed_characters() -> [{atom(), list()}].
+allowed_characters() ->
+    Input = lists:seq(0,127),
+    Scheme = lists:filter(fun is_scheme/1, Input),
+    UserInfo = lists:filter(fun is_userinfo/1, Input),
+    Host = lists:filter(fun is_host/1, Input),
+    IPv4 = lists:filter(fun is_ipv4/1, Input),
+    IPv6 = lists:filter(fun is_ipv6/1, Input),
+    RegName = lists:filter(fun is_reg_name/1, Input),
+    Path = lists:filter(fun is_path/1, Input),
+    Query = lists:filter(fun is_query/1, Input),
+    Fragment = lists:filter(fun is_fragment/1, Input),
+    Reserved = lists:filter(fun is_reserved/1, Input),
+    Unreserved = lists:filter(fun is_unreserved/1, Input),
+    [{scheme, Scheme},
+     {userinfo, UserInfo},
+     {host, Host},
+     {ipv4, IPv4},
+     {ipv6, IPv6},
+     {regname,RegName},
+     {path,Path},
+     {query, Query},
+     {fragment,Fragment},
+     {reserved, Reserved},
+     {unreserved, Unreserved}].
+
+%% Raw-decodes the percent-encoded triplets of a uri_string() (failing with
+%% error()) or of the userinfo, host, path, query and fragment map values.
+-spec percent_decode(URI) -> Result when
+      URI :: uri_string() | uri_map(),
+      Result :: uri_string() | uri_map() | error() |
+                {error, {invalid, {atom(), {term(), term()}}}}.
+percent_decode(URIMap) when is_map(URIMap)->
+    Fun = fun (K,V) when K =:= userinfo; K =:= host; K =:= path;
+                         K =:= query; K =:= fragment ->
+                  case raw_decode(V) of
+                      {error, Reason, Input} ->
+                          throw({error, {invalid, {K, {Reason, Input}}}});
+                      Else ->
+                          Else
+                  end;
+              %% Every other key (port, scheme) keeps its value untouched.
+              (_,V) ->
+                  V
+          end,
+    %% The throw aborts maps:map/2 at the first component that fails.
+    try maps:map(Fun, URIMap)
+    catch throw:Return -> Return
+    end;
+percent_decode(URI) when is_list(URI) orelse is_binary(URI) ->
+    raw_decode(URI).
+
 %%-------------------------------------------------------------------------
 %% Functions for working with the query part of a URI as a list
 %% of key/value pairs.
@@ -1421,8 +1487,15 @@ decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
     case is_hex_digit(C0) andalso is_hex_digit(C1) of
         true ->
             B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
-            case is_reserved(B) of
-                true ->
+            %% [2.4] When a URI is dereferenced, the components and subcomponents
+            %% significant to the scheme-specific dereferencing process (if any)
+            %% must be parsed and separated before the percent-encoded octets within
+            %% those components can be safely decoded, as otherwise the data may be
+            %% mistaken for component delimiters.  The only exception is for
+            %% percent-encoded octets corresponding to characters in the unreserved
+            %% set, which can be decoded at any time.
+            case is_unreserved(B) of
+                false ->
                     %% [2.2] Characters in the reserved set are protected from
                     %% normalization.
                     %% [2.1] For consistency, URI producers and normalizers should
@@ -1431,7 +1504,7 @@ decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
                     H0 = hex_to_upper(C0),
                     H1 = hex_to_upper(C1),
                     decode(Cs, <<Acc/binary,$%,H0,H1>>);
-                false ->
+                true ->
                     decode(Cs, <<Acc/binary, B>>)
             end;
         false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>})
@@ -1441,6 +1514,32 @@ decode(<<C,Cs/binary>>, Acc) ->
 decode(<<>>, Acc) ->
     check_utf8(Acc).
 
+%% Decodes every percent-encoded triplet in Cs, whatever octet it denotes.
+%% Errors thrown while decoding are returned as {error, Atom, RestData} for
+%% both list and binary input instead of escaping to the caller as a throw.
+-spec raw_decode(list()|binary()) -> list() | binary() | error().
+raw_decode(Cs) ->
+    try raw_decode(Cs, <<>>)
+    catch throw:{error, Atom, RestData} -> {error, Atom, RestData}
+    end.
+%% Worker; a list is converted to a binary, decoded and converted back.
+raw_decode(L, Acc) when is_list(L) ->
+    B0 = unicode:characters_to_binary(L),
+    B1 = raw_decode(B0, Acc),
+    unicode:characters_to_list(B1);
+raw_decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
+    case is_hex_digit(C0) andalso is_hex_digit(C1) of
+        true ->
+            B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
+            raw_decode(Cs, <<Acc/binary, B>>);
+        false ->
+            throw({error,invalid_percent_encoding,<<$%,C0,C1>>})
+    end;
+raw_decode(<<C,Cs/binary>>, Acc) ->
+    raw_decode(Cs, <<Acc/binary, C>>);
+raw_decode(<<>>, Acc) ->
+    check_utf8(Acc).
+
 %% Returns Cs if it is utf8 encoded.
 check_utf8(Cs) ->
     case unicode:characters_to_list(Cs) of
@@ -1897,7 +1996,7 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) ->
     OutBinary = convert_to_binary(B, InEncoding, OutEncoding),
     PctEncUtf8 = percent_encode_segment(OutBinary),
     Out = convert_to_list(PctEncUtf8, utf8),
-    lists:reverse(Acc) ++ Out.
+    lists:reverse(Acc, Out).
 
 
 %% Convert to binary
@@ -1932,7 +2031,7 @@ flatten_list(L, InEnc) ->
 %%
 flatten_list([H|T], InEnc, Acc) when is_binary(H) ->
     L = convert_to_list(H, InEnc),
-    flatten_list(T, InEnc, lists:reverse(L) ++ Acc);
+    flatten_list(T, InEnc, lists:reverse(L, Acc));
 flatten_list([H|T], InEnc, Acc) when is_list(H) ->
     flatten_list(H ++ T, InEnc, Acc);
 flatten_list([H|T], InEnc, Acc) ->
@@ -1952,7 +2051,7 @@ percent_encode_segment(Segment) ->
 %%-------------------------------------------------------------------------
 
 %% Returns separator to be used between key-value pairs
-get_separator(L) when length(L) =:= 0 ->
+get_separator([]) ->
     <<>>;
 get_separator(_L) ->
     <<"&">>.
diff --git a/lib/stdlib/test/property_test/uri_string_recompose.erl b/lib/stdlib/test/property_test/uri_string_recompose.erl
index 39fadf23c2..3c0dae0f8b 100644
--- a/lib/stdlib/test/property_test/uri_string_recompose.erl
+++ b/lib/stdlib/test/property_test/uri_string_recompose.erl
@@ -85,9 +85,12 @@ prop_recompose() ->
 
 prop_normalize() ->
     ?FORALL(Map, map(),
-            uri_string:normalize(Map, [return_map]) =:=
-                uri_string:normalize(uri_string:parse(uri_string:recompose(Map)),
-                                     [return_map])).
+            uri_string:percent_decode(
+              uri_string:normalize(Map, [return_map])) =:=
+                uri_string:percent_decode(
+                  uri_string:normalize(
+                    uri_string:parse(uri_string:recompose(Map)),
+                                     [return_map]))).
 
 %% Stats
 prop_map_key_length_collect() ->
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
index 51f6aac7ad..a326f56b2a 100644
--- a/lib/stdlib/test/uri_string_SUITE.erl
+++ b/lib/stdlib/test/uri_string_SUITE.erl
@@ -1049,14 +1049,20 @@ normalize_map(_Config) ->
 
 normalize_return_map(_Config) ->
     #{scheme := "http",path := "/a/g",host := "localhost-örebro"} =
-        uri_string:normalize("http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g",
-                                   [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            "http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g",
+            [return_map])),
     #{scheme := <<"http">>,path := <<"/a/g">>, host := <<"localhost-örebro"/utf8>>} =
-        uri_string:normalize(<<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>,
-                                   [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>,
+            [return_map])),
     #{scheme := <<"https">>,path := <<"/">>, host := <<"localhost">>} =
-        uri_string:normalize(#{scheme => <<"https">>,port => 443,path => <<>>,
-                               host => <<"localhost">>}, [return_map]).
+        uri_string:percent_decode(
+          uri_string:normalize(
+            #{scheme => <<"https">>,port => 443,path => <<>>,
+              host => <<"localhost">>}, [return_map])).
 
 normalize_negative(_Config) ->
     {error,invalid_uri,":"} =
@@ -1067,64 +1073,103 @@ normalize_negative(_Config) ->
         uri_string:normalize("http://[192.168.0.1]", [return_map]),
     {error,invalid_uri,":"} =
         uri_string:normalize(<<"http://[192.168.0.1]">>, [return_map]),
-    {error,invalid_utf8,<<0,0,0,246>>} = uri_string:normalize("//%00%00%00%F6").
+    {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+        uri_string:percent_decode(uri_string:normalize("//%00%00%00%F6")).
 
 normalize_binary_pct_encoded_userinfo(_Config) ->
     #{scheme := <<"user">>, path := <<"合@気道"/utf8>>} =
-        uri_string:normalize(<<"user:%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"user:%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
     #{path := <<"合気道@"/utf8>>} =
-        uri_string:normalize(<<"%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
     #{path := <<"/合気道@"/utf8>>} =
-        uri_string:normalize(<<"/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
     #{path := <<"合@気道"/utf8>>} =
-        uri_string:normalize(<<"%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
     #{userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} =
-        uri_string:normalize(<<"//%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"//%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
     #{userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} =
-        uri_string:normalize(<<"//%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"//%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map])),
     #{scheme := <<"foo">>, path := <<"/合気道@"/utf8>>} =
-        uri_string:normalize(<<"foo:/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"foo:/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
     #{scheme := <<"foo">>, userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} =
-        uri_string:normalize(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
     #{scheme := <<"foo">>, userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} =
-        uri_string:normalize(<<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map])),
     {error,invalid_uri,"@"} =
-        uri_string:normalize(<<"//%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]),
+        uri_string:normalize(
+          <<"//%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]),
     {error,invalid_uri,":"} =
-        uri_string:normalize(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]).
+        uri_string:normalize(
+          <<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]).
 
 normalize_binary_pct_encoded_query(_Config) ->
     #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>,
       query := <<"name=合気道"/utf8>>} =
-        uri_string:normalize(<<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>,
+            [return_map])),
     #{host := <<"example.com">>, path := <<"/">>, query := <<"name=合気道"/utf8>>} =
-        uri_string:normalize(<<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]).
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])).
 
 normalize_binary_pct_encoded_fragment(_Config) ->
     #{scheme := <<"foo">>, host := <<"example.com">>, fragment := <<"合気道"/utf8>>} =
-        uri_string:normalize(<<"foo://example.com#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"foo://example.com#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])),
     #{host := <<"example.com">>, path := <<"/">>, fragment := <<"合気道"/utf8>>} =
-        uri_string:normalize(<<"//example.com/#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]).
+        uri_string:percent_decode(
+          uri_string:normalize(
+            <<"//example.com/#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])).
 
 normalize_pct_encoded_userinfo(_Config) ->
     #{scheme := "user", path := "合@気道"} =
-        uri_string:normalize("user:%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("user:%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
     #{path := "合気道@"} =
-        uri_string:normalize("%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
     #{path := "/合気道@"} =
-        uri_string:normalize("/%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("/%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
     #{path := "合@気道"} =
-        uri_string:normalize("%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
     #{userinfo := "合", host := "気道"} =
-        uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
     #{userinfo := "合:気", host := "道"} =
-        uri_string:normalize("//%E5%90%88:%E6%B0%97@%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("//%E5%90%88:%E6%B0%97@%E9%81%93", [return_map])),
     #{scheme := "foo", path := "/合気道@"} =
-        uri_string:normalize("foo:/%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("foo:/%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
     #{scheme := "foo", userinfo := "合", host := "気道"} =
-        uri_string:normalize("foo://%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("foo://%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
     #{scheme := "foo", userinfo := "合:気", host := "道"} =
-        uri_string:normalize("foo://%E5%90%88:%E6%B0%97@%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize("foo://%E5%90%88:%E6%B0%97@%E9%81%93", [return_map])),
     {error,invalid_uri,"@"} =
         uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93@", [return_map]),
     {error,invalid_uri,":"} =
@@ -1133,25 +1178,37 @@ normalize_pct_encoded_userinfo(_Config) ->
 normalize_pct_encoded_query(_Config) ->
     #{scheme := "foo", host := "example.com", path := "/",
       query := "name=合気道"} =
-        uri_string:normalize("foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            "foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map])),
     #{host := "example.com", path := "/", query := "name=合気道"} =
-        uri_string:normalize("//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map]).
+        uri_string:percent_decode(
+          uri_string:normalize(
+            "//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map])).
 
 normalize_pct_encoded_fragment(_Config) ->
     #{scheme := "foo", host := "example.com", fragment := "合気道"} =
-        uri_string:normalize("foo://example.com#%E5%90%88%E6%B0%97%E9%81%93", [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(
+            "foo://example.com#%E5%90%88%E6%B0%97%E9%81%93", [return_map])),
     #{host := "example.com", path := "/", fragment := "合気道"} =
-        uri_string:normalize("//example.com/#%E5%90%88%E6%B0%97%E9%81%93", [return_map]).
+        uri_string:percent_decode(
+          uri_string:normalize(
+            "//example.com/#%E5%90%88%E6%B0%97%E9%81%93", [return_map])).
 
 normalize_pct_encoded_negative(_Config) ->
-    {error,invalid_utf8,<<0,0,0,246>>} =
-        uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [return_map]),
-    {error,invalid_utf8,<<0,0,0,246>>} =
-        uri_string:normalize(#{host => "%00%00%00%F6",path => []}, []),
-    {error,invalid_utf8,<<0,0,0,246>>} =
-        uri_string:normalize("//%00%00%00%F6", [return_map]),
-    {error,invalid_utf8,<<0,0,0,246>>} =
-        uri_string:normalize("//%00%00%00%F6", []).
+    {error,{invalid,{host,{invalid_utf8,<<0,0,0,246>>}}}} =
+        uri_string:percent_decode(
+          uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [return_map])),
+    {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+        uri_string:percent_decode(
+          uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [])),
+    {error,{invalid,{host,{invalid_utf8,<<0,0,0,246>>}}}} =
+        uri_string:percent_decode(
+          uri_string:normalize("//%00%00%00%F6", [return_map])),
+    {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+        uri_string:percent_decode(
+          uri_string:normalize("//%00%00%00%F6", [])).
 
 interop_query_utf8(_Config) ->
     Q = uri_string:compose_query([{"foo bar","1"}, {"合", "2"}]),
@@ -1216,8 +1273,8 @@ regression_normalize(_Config) ->
     "foo://%C3%B6" =
         uri_string:normalize("FOo://%C3%B6"),
     #{host := "ö",path := [],scheme := "foo"} =
-        uri_string:normalize("FOo://%C3%B6", [return_map]),
-
+        uri_string:percent_decode(
+          uri_string:normalize("FOo://%C3%B6", [return_map])),
 
     "foo://bar" =
         uri_string:normalize(#{host => "Bar",path => [],scheme => "FOo"}),
@@ -1242,7 +1299,9 @@ regression_normalize(_Config) ->
     "foo://%C3%B6" =
         uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"}),
     #{host := "ö",path := [],scheme := "foo"} =
-        uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"}, [return_map]),
+        uri_string:percent_decode(
+          uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"},
+                               [return_map])),
 
     "foo://%C3%B6" =
         uri_string:normalize(#{host => "ö",path => [],scheme => "FOo"}),
-- 
2.26.2
Places

File 3581-stdlib-Improve-API-and-documentation-of-uri_string.patch of Package erlang

Places