File 3581-stdlib-Improve-API-and-documentation-of-uri_string.patch of Package erlang
From 8efe45a03e615be3a8f3c3b81f5026e170cd893d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= <peterdmv@erlang.org>
Date: Wed, 30 Sep 2020 16:02:48 +0200
Subject: [PATCH] stdlib: Improve API and documentation of uri_string
---
lib/stdlib/doc/src/Makefile | 2 +-
lib/stdlib/doc/src/part.xml | 1 +
lib/stdlib/doc/src/uri_string.xml | 49 +++
lib/stdlib/doc/src/uri_string_usage.xml | 370 ++++++++++++++++++
lib/stdlib/src/uri_string.erl | 121 +++++-
.../property_test/uri_string_recompose.erl | 9 +-
lib/stdlib/test/uri_string_SUITE.erl | 151 ++++---
7 files changed, 642 insertions(+), 61 deletions(-)
create mode 100644 lib/stdlib/doc/src/uri_string_usage.xml
diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile
index 1092ce3ffa..4b22e35e3b 100644
--- a/lib/stdlib/doc/src/Makefile
+++ b/lib/stdlib/doc/src/Makefile
@@ -101,7 +101,7 @@ XML_REF6_FILES = stdlib_app.xml
XML_PART_FILES = part.xml
XML_CHAPTER_FILES = introduction.xml io_protocol.xml unicode_usage.xml \
- notes.xml assert_hrl.xml
+ uri_string_usage.xml notes.xml assert_hrl.xml
BOOK_FILES = book.xml
diff --git a/lib/stdlib/doc/src/part.xml b/lib/stdlib/doc/src/part.xml
index 93c47405bf..b6a2f16b57 100644
--- a/lib/stdlib/doc/src/part.xml
+++ b/lib/stdlib/doc/src/part.xml
@@ -37,5 +37,6 @@
<xi:include href="introduction.xml"/>
<xi:include href="io_protocol.xml"/>
<xi:include href="unicode_usage.xml"/>
+ <xi:include href="uri_string_usage.xml"/>
</part>
diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml
index a792decbff..dea8e60979 100644
--- a/lib/stdlib/doc/src/uri_string.xml
+++ b/lib/stdlib/doc/src/uri_string.xml
@@ -84,6 +84,9 @@
<item>Dissecting form-urlencoded query strings into a list of key-value pairs<br></br>
<seealso marker="#dissect_query/1"><c>dissect_query/1</c></seealso>
</item>
+ <item>Decoding percent-encoded triplets<br></br>
+ <seealso marker="#percent_decode/1"><c>percent_decode/1</c></seealso>
+ </item>
</list>
<p>There are four different encodings present during the handling of URIs:</p>
<list type="bulleted">
@@ -149,6 +152,21 @@
<funcs>
+ <func>
+ <name name="allowed_characters" arity="0" since="OTP 23.2"/>
+ <fsummary>Print allowed characters in URI components.</fsummary>
+ <desc>
+ <p>This is a utility function meant to be used in the shell for printing
+ the allowed characters in each
+ major URI component, and also in the most important character sets.
+ Please note that this function does not replace the ABNF rules defined by
+ the standards; these character sets are derived directly from those
+ aforementioned rules. For more information see the
+ <seealso marker="uri_string_usage#percent_encoding">Uniform Resource
+ Identifiers</seealso> chapter in stdlib's Users Guide.</p>
+ </desc>
+ </func>
+
<func>
<name name="compose_query" arity="1" since="OTP 21.0"/>
<fsummary>Compose urlencoded query string.</fsummary>
@@ -308,6 +326,37 @@
</desc>
</func>
+ <func>
+ <name name="percent_decode" arity="1" since="OTP 23.2"/>
+ <fsummary>Decode percent-encoded triplets in the input.</fsummary>
+ <desc>
+ <p>Decodes all percent-encoded triplets in the input that can be both a
+ <c>uri_string()</c> and a <c>uri_map()</c>. Note, that this function performs
+ raw decoding and it shall be used on already parsed URI components. Applying
+ this function directly on a standard URI can effectively change it.</p>
+ <p>If the input encoding is not UTF-8, an error tuple is returned.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>uri_string:percent_decode(#{host => "localhost-%C3%B6rebro",path => [],</input>
+1> <input>scheme => "http"}).</input>
+#{host => "localhost-örebro",path => [],scheme => "http"}
+2> <![CDATA[uri_string:percent_decode(<<"%C3%B6rebro">>).]]>
+<![CDATA[<<"örebro"/utf8>>]]>
+ </pre>
+ <warning><p>
+ Using <c>uri_string:percent_decode/1</c> directly on a URI is not safe. This
+ example shows, that after each consecutive application of the function
+ the resulting URI will be changed. None of these URIs refer to the same
+ resource.</p>
+ <pre>
+<![CDATA[3> uri_string:percent_decode(<<"http://local%252Fhost/path">>).
+<<"http://local%2Fhost/path">>
+4> uri_string:percent_decode(<<"http://local%2Fhost/path">>).
+<<"http://local/host/path">>]]>
+ </pre></warning>
+ </desc>
+ </func>
+
<func>
<name name="recompose" arity="1" since="OTP 21.0"/>
<fsummary>Recompose URI.</fsummary>
diff --git a/lib/stdlib/doc/src/uri_string_usage.xml b/lib/stdlib/doc/src/uri_string_usage.xml
new file mode 100644
index 0000000000..72851096b7
--- /dev/null
+++ b/lib/stdlib/doc/src/uri_string_usage.xml
@@ -0,0 +1,370 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE chapter SYSTEM "chapter.dtd">
+
+<chapter>
+ <header>
+ <copyright>
+ <year>2020</year>
+ <year>2020</year>
+ <holder>Ericsson AB. All Rights Reserved.</holder>
+ </copyright>
+ <legalnotice>
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ </legalnotice>
+
+ <title>Uniform Resource Identifiers</title>
+ <prepared>Péter Dimitrov</prepared>
+ <responsible></responsible>
+ <docno></docno>
+ <approved></approved>
+ <checked></checked>
+ <date>2020-09-30</date>
+ <rev>PA1</rev>
+ <file>uri_string_usage.xml</file>
+ </header>
+ <section>
+ <title>Basics</title>
+ <p>At the time of writing this document, in October 2020, there are
+ two major standards concerning Uniform Resource Identifiers and
+ Uniform Resource Locators:</p>
+ <list type="bulleted">
+ <item><p>
+ <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986 - Uniform Resource
+ Identifier (URI): Generic Syntax</url></p></item>
+ <item><p>
+ <url href="https://url.spec.whatwg.org/">WHAT WG URL - Living standard</url>
+ </p></item>
+ </list>
+ <p>
+ The former is a classical standard with a proper formal syntax, using the so
+ called <url href="https://www.ietf.org/rfc/rfc2234.txt">Augmented Backus-Naur Form
+ (ABNF)</url> for describing
+ the grammar, while the latter is a living document describing the current practice,
+ that is, how a majority of Web browsers work with URIs. WHAT WG URL is Web focused
+ and it has no formal grammar but a plain English description of the algorithms
+ that should be followed.</p>
+ <p>What is the difference between them, if any? They provide an overlapping
+ definition for resource identifiers and they are not compatible.
+ The <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso> module implements
+ <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url> and the term URI will
+ be used throughout this document. A URI is an identifier, a string of characters
+ that identifies a particular resource.</p>
+ <p>
+ For a more complete problem
+ statement regarding the URIs check the
+ <url href="https://tools.ietf.org/html/draft-ruby-url-problem-01">URL Problem
+ Statement and Directions</url>.</p>
+ </section>
+
+ <section>
+ <title>What is a URI?</title>
+ <p>Let's start with what it is not. It is not the text that you type in the address
+ bar in your Web browser. Web browsers do all possible heuristics to convert the
+ input into a valid URI that could be sent over the network.</p>
+ <p>A URI is an identifier consisting of a sequence of characters matching the syntax
+ rule named <c>URI</c> in
+ <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url>.
+ </p>
+ <p>It is crucial to clarify that a <i>character</i> is a symbol that is displayed on
+ a terminal or written to paper and should not be confused with its internal
+ representation.</p>
+ <p>A URI more specifically, is a sequence of characters from a
+ subset of the US ASCII character set. The generic URI syntax consists of a
+ hierarchical sequence of components referred to as the scheme, authority,
+ path, query, and fragment. There is a formal description for
+ each of these components in
+ <url href="https://www.ietf.org/rfc/rfc2234.txt">ABNF</url> notation in
+ <url href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986</url>:</p>
+ <pre>
+ URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+ hier-part = "//" authority path-abempty
+ / path-absolute
+ / path-rootless
+ / path-empty
+ scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ authority = [ userinfo "@" ] host [ ":" port ]
+ userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+
+ reserved = gen-delims / sub-delims
+ gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
+ / "*" / "+" / "," / ";" / "="
+
+ unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ </pre>
+ </section>
+
+ <section>
+ <title>The uri_string module</title>
+ <p>As producing and consuming standard URIs can get quite complex, Erlang/OTP
+ provides
+ a module, <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso>, to handle all the most difficult operations such as parsing,
+ recomposing, normalizing and resolving URIs against a base URI.
+ </p>
+ <p>The API functions in <seealso marker="stdlib:uri_string"><c>uri_string</c></seealso>
+ work on two basic data types
+ <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso> and
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>.
+ <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso> represents a
+ standard URI, while
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso> is a wider datatype,
+ that can represent URI components using
+ <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> characters.
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>
+ is a convenient choice for enabling
+ operations such as producing standard compliant URIs out of components that have
+ special or <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+ characters. It is easier to explain this by an example.
+ </p>
+ <p>Let's say that we would like to create the following URI and send it over the
+ network: <c>http://cities/örebro?foo bar</c>. This is not a valid URI as it contains
+ characters that are not allowed in a URI such as "ö" and the space. We can verify
+ this by parsing the URI:
+ </p>
+ <pre>
+ 1> uri_string:parse("http://cities/örebro?foo bar").
+ {error,invalid_uri,":"}
+ </pre>
+ <p>The URI parser tries all possible combinations to interpret the input and fails
+ at the last attempt when it encounters the colon character <c>":"</c>. Note, that
+ the initial fault occurs when the parser attempts to interpret the character
+ <c>"ö"</c> and after a failure back-tracks to the point where it has another
+ possible parsing alternative.</p>
+ <p>The proper way to solve this problem is to use
+ <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+ with a <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso> as input:</p>
+ <pre>
+ 2> uri_string:recompose(#{scheme => "http", host => "cities", path => "/örebro",
+ query => "foo bar"}).
+ "http://cities/%C3%B6rebro?foo%20bar"
+ </pre>
+ <p>The result is a valid URI where all the special characters are encoded as defined
+ by the standard. Applying
+ <seealso marker="uri_string#parse/1"><c>uri_string:parse/1</c></seealso> and
+ <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+ on the URI returns the original input:
+ </p>
+ <pre>
+ 3> uri_string:percent_decode(uri_string:parse("http://cities/%C3%B6rebro?foo%20bar")).
+ #{host => "cities",path => "/örebro",query => "foo bar",
+ scheme => "http"}
+ </pre>
+ <p>This symmetric property is heavily used in our property test suite.
+ </p>
+ </section>
+
+ <section>
+ <title>Percent-encoding</title>
+ <p>As you have seen in the previous chapter, a standard URI can only contain a strict
+ subset of the US ASCII character set, moreover the allowed set of characters is not
+ the same in the different URI components. Percent-encoding is a mechanism to
+ represent a data octet in a component when that octet's corresponding character
+ is outside of
+ the allowed set or is being used as a delimiter. This is what you see when <c>"ö"</c>
+ is encoded as <c>%C3%B6</c> and <c>space</c> as <c>%20</c>.
+ Most of the API functions are
+ expecting UTF-8 encoding when handling percent-encoded triplets. The UTF-8 encoding
+ of the <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+ character <c>"ö"</c> is two octets: <c>0xC3 0xB6</c>.
+ The character <c>space</c> is in the first 128 characters of
+ <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> and it is encoded
+ using a single octet <c>0x20</c>.</p>
+ <note><p><seealso marker="unicode_usage#what-unicode-is">Unicode</seealso>
+ is backward compatible with ASCII, the encoding of the first 128
+ characters is the same binary value as in ASCII.
+ </p></note>
+ <p><marker id="percent_encoding"></marker>
+ It is a major source of confusion exactly which characters will be
+ percent-encoded. In order to make it easier to answer this question the library
+ provides a utility function,
+ <seealso marker="uri_string#allowed_characters/0"><c>uri_string:allowed_characters/0
+ </c></seealso>,
+ that lists the allowed set of characters in each major
+ URI component, and also in the most important standard character sets.
+ </p>
+ <pre>
+ 1> uri_string:allowed_characters().
+ <![CDATA[[{scheme,
+ "+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
+ {userinfo,
+ "!$%&'()*+,-.0123456789:;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {host,
+ "!$&'()*+,-.0123456789:;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {ipv4,".0123456789"},
+ {ipv6,".0123456789:ABCDEFabcdef"},
+ {regname,
+ "!$%&'()*+,-.0123456789;=ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {path,
+ "!$%&'()*+,-./0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {query,
+ "!$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {fragment,
+ "!$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"},
+ {reserved,"!#$&'()*+,/:;=?@[]"},
+ {unreserved,
+ "-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"}] ]]>
+ </pre>
+ <p>If a URI component has a character that is not allowed, it will be
+ percent-encoded when the URI is produced:
+ </p>
+ <pre>
+ 2> uri_string:recompose(#{scheme => "https", host => "local#host", path => ""}).
+ "https://local%23host"
+ </pre>
+ <p>Consuming a URI containing percent-encoded triplets can take many steps. The
+ following example shows how to handle an input URI that is not normalized and
+ contains multiple percent-encoded triplets.
+ First, the input <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso>
+ is to be parsed into a <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>.
+ The parsing only splits the URI into its components without doing any decoding:
+ </p>
+ <pre>
+ 3> uri_string:parse("http://%6C%6Fcal%23host/%F6re%26bro%20").
+ #{host => "%6C%6Fcal%23host",path => "/%F6re%26bro%20",
+ scheme => "http"}
+ </pre>
+ <p>The input is a valid URI but how can you decode those
+ percent-encoded octets? You can try to normalize the input with
+ <seealso marker="uri_string#normalize/1"><c>uri_string:normalize/1</c></seealso>. The
+ normalize operation decodes those
+ percent-encoded triplets that correspond to a character in the unreserved set.
+ Normalization is a safe, idempotent operation that converts a URI into its
+ canonical form:</p>
+ <pre>
+ 4> uri_string:normalize("http://%6C%6Fcal%23host/%F6re%26bro%20").
+ "http://local%23host/%F6re%26bro%20"
+ 5> uri_string:normalize("http://%6C%6Fcal%23host/%F6re%26bro%20", [return_map]).
+ #{host => "local%23host",path => "/%F6re%26bro%20",
+ scheme => "http"}
+ </pre>
+ <p>There are still a few percent-encoded triplets left in the output. At this point,
+ when the URI is already parsed, it is safe to apply application specific decoding on
+ the remaining character triplets. Erlang/OTP provides a function,
+ <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+ for raw percent decoding
+ that you can use on the host and path components, or on the whole map:
+ </p>
+ <pre>
+ 6> uri_string:percent_decode("local%23host").
+ "local#host"
+ 7> uri_string:percent_decode("/%F6re%26bro%20").
+ <![CDATA[{error,invalid_utf8,<<"/öre&bro ">>}]]>
+ 8> uri_string:percent_decode(#{host => "local%23host",path => "/%F6re%26bro%20",
+ scheme => "http"}).
+ <![CDATA[{error,{invalid,{path,{invalid_utf8,<<"/öre&bro ">>}}}}]]>
+ </pre>
+ <p>The <c>host</c> was successfully decoded but the path contains at least one
+ character with
+ non-UTF-8 encoding. In order to be able to decode this, you have to make assumptions
+ about the encoding used in these triplets. The most obvious choice is
+ <i>latin-1</i>, so you can try
+ <seealso marker="uri_string#transcode/2"><c>uri_string:transcode/2</c></seealso>, to
+ transcode the path to UTF-8 and run the percent-decode operation on the
+ transcoded string:
+ </p>
+ <pre>
+ 9> uri_string:transcode("/%F6re%26bro%20", [{in_encoding, latin1}]).
+ "/%C3%B6re%26bro%20"
+ 10> uri_string:percent_decode("/%C3%B6re%26bro%20").
+ <![CDATA["/öre&bro "]]>
+ </pre>
+ <p>It is important to emphasize that it is not safe to apply
+ <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>
+ directly on an input URI:
+ </p>
+ <pre>
+ 11> uri_string:percent_decode("http://%6C%6Fcal%23host/%C3%B6re%26bro%20").
+ <![CDATA["http://local#host/öre&bro "
+ 12> uri_string:parse("http://local#host/öre&bro ").]]>
+ {error,invalid_uri,":"}
+ </pre>
+ <note><p>Percent-encoding is implemented in
+ <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+ and it happens when converting a
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>
+ into a <seealso marker="uri_string#type-uri_string"><c>uri_string()</c></seealso>.
+ There is no equivalent to a raw percent-encoding function as percent-encoding
+ shall be applied on the component level using different sets of allowed characters.
+ Applying percent-encoding directly on an input URI would not be safe just as in
+ the case of
+ <seealso marker="uri_string#percent_decode/1"><c>uri_string:percent_decode/1</c></seealso>,
+ the output could be an invalid URI.
+ </p>
+ </note>
+ </section>
+
+ <section>
+ <title>Normalization</title>
+ <p>Normalization is the operation of converting the input URI into a <i>canonical</i>
+ form and keeping the reference to the same underlying resource. The most common
+ application of normalization is determining whether two URIs are equivalent
+ without accessing their referenced resources.</p>
+ <p>Normalization has 6 distinct steps. First the input URI is parsed into an
+ intermediate form that can handle
+ <seealso marker="unicode_usage#what-unicode-is">Unicode</seealso> characters.
+ This datatype is the
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>, that can hold the
+ components of the URI in map elements of type
+ <seealso marker="unicode#type-chardata"><c>unicode:chardata()</c></seealso>.
+ After having the intermediate form, a sequence of
+ normalization algorithms is applied to the individual URI components:</p>
+ <taglist>
+ <tag>Case normalization</tag>
+ <item>
+ <p>Converts the <c>scheme</c> and <c>host</c> components
+ to lower case as they are not case sensitive.</p>
+ </item>
+ <tag>Percent-encoding normalization</tag>
+ <item>
+ <p>Decodes percent-encoded triplets that
+ correspond to characters in the unreserved set.</p>
+ </item>
+ <tag>Scheme-based normalization</tag>
+ <item>
+ <p>Applying rules for the schemes http, https,
+ ftp, ssh, sftp and tftp.</p>
+ </item>
+ <tag>Path segment normalization</tag>
+ <item>
+ <p>Converts the path into a canonical form.</p>
+ </item>
+ </taglist>
+ <p>After these steps, the intermediate data structure, an
+ <seealso marker="uri_string#type-uri_map"><c>uri_map()</c></seealso>,
+ is fully normalized. The last step is applying
+ <seealso marker="uri_string#recompose/1"><c>uri_string:recompose/1</c></seealso>
+ that converts the intermediate structure into a valid canonical URI string.</p>
+ <p>Notice the order, the
+ <seealso marker="uri_string#normalize/2"><c>uri_string:normalize(URIMap, [return_map])</c></seealso> that we
+ used many times in this user guide is a shortcut in the normalization process
+ returning the intermediate data structure, and allowing us to inspect and apply
+ further decoding on the remaining percent-encoded triplets.</p>
+ <pre>
+ 13> uri_string:normalize("hTTp://LocalHost:80/%c3%B6rebro/a/../b").
+ "http://localhost/%C3%B6rebro/b"
+ 14> uri_string:normalize("hTTp://LocalHost:80/%c3%B6rebro/a/../b", [return_map]).
+ #{host => "localhost",path => "/%C3%B6rebro/b",
+ scheme => "http"}
+ </pre>
+ </section>
+
+ <section>
+ <title>Special considerations</title>
+ <p>The current URI implementation provides support for producing and consuming
+ standard URIs. The API is not meant to be directly exposed in a Web
+ browser's address bar where users can basically enter free text. Application
+ designers shall implement proper heuristics to map the input into a parsable URI.</p>
+ </section>
+
+</chapter>
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index 0b84a8a91d..bb6c9e542a 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -226,10 +226,21 @@
%%-------------------------------------------------------------------------
%% External API
%%-------------------------------------------------------------------------
--export([compose_query/1, compose_query/2,
- dissect_query/1, normalize/1, normalize/2, parse/1,
- recompose/1, resolve/2, resolve/3, transcode/2]).
--export_type([error/0, uri_map/0, uri_string/0]).
+-export([allowed_characters/0,
+ compose_query/1,
+ compose_query/2,
+ dissect_query/1,
+ normalize/1,
+ normalize/2,
+ percent_decode/1,
+ parse/1,
+ recompose/1,
+ resolve/2,
+ resolve/3,
+ transcode/2]).
+-export_type([error/0,
+ uri_map/0,
+ uri_string/0]).
%%-------------------------------------------------------------------------
@@ -286,7 +297,7 @@
port => non_neg_integer() | undefined,
query => unicode:chardata(),
scheme => unicode:chardata(),
- userinfo => unicode:chardata()} | #{}.
+ userinfo => unicode:chardata()}.
%%-------------------------------------------------------------------------
@@ -452,6 +463,61 @@ transcode(URIString, Options) when is_list(URIString) ->
end.
+%%-------------------------------------------------------------------------
+%% Misc: character-set listing (allowed_characters/0) and raw percent-decoding
+%%-------------------------------------------------------------------------
+-spec allowed_characters() -> [{atom(), list()}].
+allowed_characters() ->  % Returns one {Name, Chars} pair per URI component / character set.
+ Input = lists:seq(0,127),  % Candidate characters: code points 0..127.
+ Scheme = lists:filter(fun is_scheme/1, Input),  % Each list keeps the ascending order of Input.
+ UserInfo = lists:filter(fun is_userinfo/1, Input),
+ Host = lists:filter(fun is_host/1, Input),
+ IPv4 = lists:filter(fun is_ipv4/1, Input),
+ IPv6 = lists:filter(fun is_ipv6/1, Input),
+ RegName = lists:filter(fun is_reg_name/1, Input),
+ Path = lists:filter(fun is_path/1, Input),
+ Query = lists:filter(fun is_query/1, Input),
+ Fragment = lists:filter(fun is_fragment/1, Input),
+ Reserved = lists:filter(fun is_reserved/1, Input),
+ Unreserved = lists:filter(fun is_unreserved/1, Input),
+ [{scheme, Scheme},  % The result order is fixed: components first, then the two generic sets.
+ {userinfo, UserInfo},
+ {host, Host},
+ {ipv4, IPv4},
+ {ipv6, IPv6},
+ {regname,RegName},
+ {path,Path},
+ {query, Query},
+ {fragment,Fragment},
+ {reserved, Reserved},
+ {unreserved, Unreserved}].
+
+-spec percent_decode(URI) -> Result when
+ URI :: uri_string() | uri_map(),
+ Result :: uri_string() |
+ uri_map() |
+ {error, {invalid, {atom(), {term(), term()}}}}.
+percent_decode(URIMap) when is_map(URIMap)->  % Map input: decode each textual component on its own.
+ Fun = fun (K,V) when K =:= userinfo; K =:= host; K =:= path;
+ K =:= query; K =:= fragment ->
+ case raw_decode(V) of
+ {error, Reason, Input} ->  % Error tuple returned by raw_decode/1: tag it with the failing key.
+ throw({error, {invalid, {K, {Reason, Input}}}});  % Non-local exit out of maps:map/2.
+ Else ->
+ Else
+ end;
+ %% Any other key (e.g. port and scheme) is returned unchanged
+ (_,V) ->
+ V
+ end,
+ try maps:map(Fun, URIMap)
+ catch throw:Return ->  % Returns the tagged error thrown above. NOTE(review): a binary value
+ Return  % seems to throw from raw_decode/2 directly and arrive here untagged -- confirm.
+ end;
+percent_decode(URI) when is_list(URI) orelse
+ is_binary(URI) ->
+ raw_decode(URI).  % NOTE(review): the spec above omits this clause's {error, Reason, Input} result.
+
%%-------------------------------------------------------------------------
%% Functions for working with the query part of a URI as a list
%% of key/value pairs.
@@ -1421,8 +1487,15 @@ decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
case is_hex_digit(C0) andalso is_hex_digit(C1) of
true ->
B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
- case is_reserved(B) of
- true ->
+ %% [2.4] When a URI is dereferenced, the components and subcomponents
+ %% significant to the scheme-specific dereferencing process (if any)
+ %% must be parsed and separated before the percent-encoded octets within
+ %% those components can be safely decoded, as otherwise the data may be
+ %% mistaken for component delimiters. The only exception is for
+ %% percent-encoded octets corresponding to characters in the unreserved
+ %% set, which can be decoded at any time.
+ case is_unreserved(B) of
+ false ->
%% [2.2] Characters in the reserved set are protected from
%% normalization.
%% [2.1] For consistency, URI producers and normalizers should
@@ -1431,7 +1504,7 @@ decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
H0 = hex_to_upper(C0),
H1 = hex_to_upper(C1),
decode(Cs, <<Acc/binary,$%,H0,H1>>);
- false ->
+ true ->
decode(Cs, <<Acc/binary, B>>)
end;
false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>})
@@ -1441,6 +1514,32 @@ decode(<<C,Cs/binary>>, Acc) ->
decode(<<>>, Acc) ->
check_utf8(Acc).
+-spec raw_decode(list()|binary()) -> list() | binary() | error().
+raw_decode(Cs) ->
+ raw_decode(Cs, <<>>).  % Start with an empty output accumulator.
+%% raw_decode/2: decodes every percent-encoded triplet; Acc collects the output bytes.
+raw_decode(L, Acc) when is_list(L) ->  % List input: decode as a binary, then convert back.
+ try
+ B0 = unicode:characters_to_binary(L),
+ B1 = raw_decode(B0, Acc),
+ unicode:characters_to_list(B1)
+ catch
+ throw:{error, Atom, RestData} ->  % Thrown errors become the return value (list clause only).
+ {error, Atom, RestData}
+ end;
+raw_decode(<<$%,C0,C1,Cs/binary>>, Acc) ->
+ case is_hex_digit(C0) andalso is_hex_digit(C1) of
+ true ->
+ B = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
+ raw_decode(Cs, <<Acc/binary, B>>);  % Append the decoded byte unconditionally.
+ false ->
+ throw({error,invalid_percent_encoding,<<$%,C0,C1>>})  % Not caught here for binary input.
+ end;
+raw_decode(<<C,Cs/binary>>, Acc) ->
+ raw_decode(Cs, <<Acc/binary, C>>);  % Any other byte is copied through as is.
+raw_decode(<<>>, Acc) ->
+ check_utf8(Acc).  % End of input: the accumulated bytes are checked by check_utf8/1.
+
%% Returns Cs if it is utf8 encoded.
check_utf8(Cs) ->
case unicode:characters_to_list(Cs) of
@@ -1897,7 +1996,7 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) ->
OutBinary = convert_to_binary(B, InEncoding, OutEncoding),
PctEncUtf8 = percent_encode_segment(OutBinary),
Out = convert_to_list(PctEncUtf8, utf8),
- lists:reverse(Acc) ++ Out.
+ lists:reverse(Acc, Out).
%% Convert to binary
@@ -1932,7 +2031,7 @@ flatten_list(L, InEnc) ->
%%
flatten_list([H|T], InEnc, Acc) when is_binary(H) ->
L = convert_to_list(H, InEnc),
- flatten_list(T, InEnc, lists:reverse(L) ++ Acc);
+ flatten_list(T, InEnc, lists:reverse(L, Acc));
flatten_list([H|T], InEnc, Acc) when is_list(H) ->
flatten_list(H ++ T, InEnc, Acc);
flatten_list([H|T], InEnc, Acc) ->
@@ -1952,7 +2051,7 @@ percent_encode_segment(Segment) ->
%%-------------------------------------------------------------------------
%% Returns separator to be used between key-value pairs
-get_separator(L) when length(L) =:= 0 ->
+get_separator([]) ->
<<>>;
get_separator(_L) ->
<<"&">>.
diff --git a/lib/stdlib/test/property_test/uri_string_recompose.erl b/lib/stdlib/test/property_test/uri_string_recompose.erl
index 39fadf23c2..3c0dae0f8b 100644
--- a/lib/stdlib/test/property_test/uri_string_recompose.erl
+++ b/lib/stdlib/test/property_test/uri_string_recompose.erl
@@ -85,9 +85,12 @@ prop_recompose() ->
prop_normalize() ->
?FORALL(Map, map(),
- uri_string:normalize(Map, [return_map]) =:=
- uri_string:normalize(uri_string:parse(uri_string:recompose(Map)),
- [return_map])).
+ uri_string:percent_decode(
+ uri_string:normalize(Map, [return_map])) =:=
+ uri_string:percent_decode(
+ uri_string:normalize(
+ uri_string:parse(uri_string:recompose(Map)),
+ [return_map]))).
%% Stats
prop_map_key_length_collect() ->
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
index 51f6aac7ad..a326f56b2a 100644
--- a/lib/stdlib/test/uri_string_SUITE.erl
+++ b/lib/stdlib/test/uri_string_SUITE.erl
@@ -1049,14 +1049,20 @@ normalize_map(_Config) ->
normalize_return_map(_Config) ->
#{scheme := "http",path := "/a/g",host := "localhost-örebro"} =
- uri_string:normalize("http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g",
- [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ "http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g",
+ [return_map])),
#{scheme := <<"http">>,path := <<"/a/g">>, host := <<"localhost-örebro"/utf8>>} =
- uri_string:normalize(<<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>,
- [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>,
+ [return_map])),
#{scheme := <<"https">>,path := <<"/">>, host := <<"localhost">>} =
- uri_string:normalize(#{scheme => <<"https">>,port => 443,path => <<>>,
- host => <<"localhost">>}, [return_map]).
+ uri_string:percent_decode(
+ uri_string:normalize(
+ #{scheme => <<"https">>,port => 443,path => <<>>,
+ host => <<"localhost">>}, [return_map])).
normalize_negative(_Config) ->
{error,invalid_uri,":"} =
@@ -1067,64 +1073,103 @@ normalize_negative(_Config) ->
uri_string:normalize("http://[192.168.0.1]", [return_map]),
{error,invalid_uri,":"} =
uri_string:normalize(<<"http://[192.168.0.1]">>, [return_map]),
- {error,invalid_utf8,<<0,0,0,246>>} = uri_string:normalize("//%00%00%00%F6").
+ {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+ uri_string:percent_decode(uri_string:normalize("//%00%00%00%F6")).
normalize_binary_pct_encoded_userinfo(_Config) ->
#{scheme := <<"user">>, path := <<"合@気道"/utf8>>} =
- uri_string:normalize(<<"user:%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"user:%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
#{path := <<"合気道@"/utf8>>} =
- uri_string:normalize(<<"%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
#{path := <<"/合気道@"/utf8>>} =
- uri_string:normalize(<<"/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
#{path := <<"合@気道"/utf8>>} =
- uri_string:normalize(<<"%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
#{userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} =
- uri_string:normalize(<<"//%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"//%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
#{userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} =
- uri_string:normalize(<<"//%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"//%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map])),
#{scheme := <<"foo">>, path := <<"/合気道@"/utf8>>} =
- uri_string:normalize(<<"foo:/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"foo:/%E5%90%88%E6%B0%97%E9%81%93@">>, [return_map])),
#{scheme := <<"foo">>, userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} =
- uri_string:normalize(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>, [return_map])),
#{scheme := <<"foo">>, userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} =
- uri_string:normalize(<<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>, [return_map])),
{error,invalid_uri,"@"} =
- uri_string:normalize(<<"//%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]),
+ uri_string:normalize(
+ <<"//%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]),
{error,invalid_uri,":"} =
- uri_string:normalize(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]).
+ uri_string:normalize(
+ <<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>, [return_map]).
normalize_binary_pct_encoded_query(_Config) ->
#{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>,
query := <<"name=合気道"/utf8>>} =
- uri_string:normalize(<<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>,
+ [return_map])),
#{host := <<"example.com">>, path := <<"/">>, query := <<"name=合気道"/utf8>>} =
- uri_string:normalize(<<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]).
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])).
normalize_binary_pct_encoded_fragment(_Config) ->
#{scheme := <<"foo">>, host := <<"example.com">>, fragment := <<"合気道"/utf8>>} =
- uri_string:normalize(<<"foo://example.com#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"foo://example.com#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])),
#{host := <<"example.com">>, path := <<"/">>, fragment := <<"合気道"/utf8>>} =
- uri_string:normalize(<<"//example.com/#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map]).
+ uri_string:percent_decode(
+ uri_string:normalize(
+ <<"//example.com/#%E5%90%88%E6%B0%97%E9%81%93">>, [return_map])).
normalize_pct_encoded_userinfo(_Config) ->
#{scheme := "user", path := "合@気道"} =
- uri_string:normalize("user:%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("user:%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
#{path := "合気道@"} =
- uri_string:normalize("%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
#{path := "/合気道@"} =
- uri_string:normalize("/%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("/%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
#{path := "合@気道"} =
- uri_string:normalize("%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
#{userinfo := "合", host := "気道"} =
- uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
#{userinfo := "合:気", host := "道"} =
- uri_string:normalize("//%E5%90%88:%E6%B0%97@%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("//%E5%90%88:%E6%B0%97@%E9%81%93", [return_map])),
#{scheme := "foo", path := "/合気道@"} =
- uri_string:normalize("foo:/%E5%90%88%E6%B0%97%E9%81%93@", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("foo:/%E5%90%88%E6%B0%97%E9%81%93@", [return_map])),
#{scheme := "foo", userinfo := "合", host := "気道"} =
- uri_string:normalize("foo://%E5%90%88@%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("foo://%E5%90%88@%E6%B0%97%E9%81%93", [return_map])),
#{scheme := "foo", userinfo := "合:気", host := "道"} =
- uri_string:normalize("foo://%E5%90%88:%E6%B0%97@%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize("foo://%E5%90%88:%E6%B0%97@%E9%81%93", [return_map])),
{error,invalid_uri,"@"} =
uri_string:normalize("//%E5%90%88@%E6%B0%97%E9%81%93@", [return_map]),
{error,invalid_uri,":"} =
@@ -1133,25 +1178,37 @@ normalize_pct_encoded_userinfo(_Config) ->
normalize_pct_encoded_query(_Config) ->
#{scheme := "foo", host := "example.com", path := "/",
query := "name=合気道"} =
- uri_string:normalize("foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ "foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map])),
#{host := "example.com", path := "/", query := "name=合気道"} =
- uri_string:normalize("//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map]).
+ uri_string:percent_decode(
+ uri_string:normalize(
+ "//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93", [return_map])).
normalize_pct_encoded_fragment(_Config) ->
#{scheme := "foo", host := "example.com", fragment := "合気道"} =
- uri_string:normalize("foo://example.com#%E5%90%88%E6%B0%97%E9%81%93", [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(
+ "foo://example.com#%E5%90%88%E6%B0%97%E9%81%93", [return_map])),
#{host := "example.com", path := "/", fragment := "合気道"} =
- uri_string:normalize("//example.com/#%E5%90%88%E6%B0%97%E9%81%93", [return_map]).
+ uri_string:percent_decode(
+ uri_string:normalize(
+ "//example.com/#%E5%90%88%E6%B0%97%E9%81%93", [return_map])).
normalize_pct_encoded_negative(_Config) ->
- {error,invalid_utf8,<<0,0,0,246>>} =
- uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [return_map]),
- {error,invalid_utf8,<<0,0,0,246>>} =
- uri_string:normalize(#{host => "%00%00%00%F6",path => []}, []),
- {error,invalid_utf8,<<0,0,0,246>>} =
- uri_string:normalize("//%00%00%00%F6", [return_map]),
- {error,invalid_utf8,<<0,0,0,246>>} =
- uri_string:normalize("//%00%00%00%F6", []).
+ {error,{invalid,{host,{invalid_utf8,<<0,0,0,246>>}}}} =
+ uri_string:percent_decode(
+ uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [return_map])),
+ {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+ uri_string:percent_decode(
+ uri_string:normalize(#{host => "%00%00%00%F6",path => []}, [])),
+ {error,{invalid,{host,{invalid_utf8,<<0,0,0,246>>}}}} =
+ uri_string:percent_decode(
+ uri_string:normalize("//%00%00%00%F6", [return_map])),
+ {error,invalid_utf8,<<47,47,0,0,0,246>>} =
+ uri_string:percent_decode(
+ uri_string:normalize("//%00%00%00%F6", [])).
interop_query_utf8(_Config) ->
Q = uri_string:compose_query([{"foo bar","1"}, {"合", "2"}]),
@@ -1216,8 +1273,8 @@ regression_normalize(_Config) ->
"foo://%C3%B6" =
uri_string:normalize("FOo://%C3%B6"),
#{host := "ö",path := [],scheme := "foo"} =
- uri_string:normalize("FOo://%C3%B6", [return_map]),
-
+ uri_string:percent_decode(
+ uri_string:normalize("FOo://%C3%B6", [return_map])),
"foo://bar" =
uri_string:normalize(#{host => "Bar",path => [],scheme => "FOo"}),
@@ -1242,7 +1299,9 @@ regression_normalize(_Config) ->
"foo://%C3%B6" =
uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"}),
#{host := "ö",path := [],scheme := "foo"} =
- uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"}, [return_map]),
+ uri_string:percent_decode(
+ uri_string:normalize(#{host => "%C3%B6",path => [],scheme => "FOo"},
+ [return_map])),
"foo://%C3%B6" =
uri_string:normalize(#{host => "ö",path => [],scheme => "FOo"}),
--
2.26.2