File 0001-id3-always-use-little-endian-order-for-utf-16-with-BOM.patch of Package python3-mutagen
From c27b957e242c24b5f247a51fd1807e332fe7ef24 Mon Sep 17 00:00:00 2001
From: Christoph Reiter <reiter.christoph@gmail.com>
Date: Wed, 25 Jan 2017 13:05:45 +0100
Subject: [PATCH] id3: always use little-endian order for utf-16 with BOM. See
#289
While both variants are valid for ID3, this makes tags the same
on all platforms and reduces the risk that tests break on big endian
machines.
---
mutagen/_util.py | 32 ++++++++++++++++++++++++++++++++
mutagen/id3/_specs.py | 10 +++++++---
tests/test__id3specs.py | 9 +--------
tests/test__util.py | 29 ++++++++++++++++++++++++++++-
4 files changed, 68 insertions(+), 12 deletions(-)
diff --git a/mutagen/_util.py b/mutagen/_util.py
index 499729e..2a28995 100644
--- a/mutagen/_util.py
+++ b/mutagen/_util.py
@@ -888,6 +888,38 @@ def dict_match(d, key, default=None):
return default
+def encode_endian(text, encoding, errors="strict", le=True):
+ """Like text.encode(encoding) but always returns little endian/big endian
+ BOMs instead of the system one.
+
+ Args:
+ text (text)
+ encoding (str)
+ errors (str)
+ le (boolean): if little endian
+ Returns:
+ bytes
+ Raises:
+ UnicodeEncodeError
+ LookupError
+ """
+
+ encoding = codecs.lookup(encoding).name
+
+ if encoding == "utf-16":
+ if le:
+ return codecs.BOM_UTF16_LE + text.encode("utf-16-le", errors)
+ else:
+ return codecs.BOM_UTF16_BE + text.encode("utf-16-be", errors)
+ elif encoding == "utf-32":
+ if le:
+ return codecs.BOM_UTF32_LE + text.encode("utf-32-le", errors)
+ else:
+ return codecs.BOM_UTF32_BE + text.encode("utf-32-be", errors)
+ else:
+ return text.encode(encoding, errors)
+
+
def decode_terminated(data, encoding, strict=True):
"""Returns the decoded data until the first NULL terminator
and all data after it.
diff --git a/mutagen/id3/_specs.py b/mutagen/id3/_specs.py
index 9454596..9b5dce7 100644
--- a/mutagen/id3/_specs.py
+++ b/mutagen/id3/_specs.py
@@ -12,7 +12,8 @@
from .._compat import text_type, chr_, PY3, swap_to_string, string_types, \
xrange
-from .._util import total_ordering, decode_terminated, enum, izip, flags, cdata
+from .._util import total_ordering, decode_terminated, enum, izip, flags, \
+ cdata, encode_endian
from ._util import BitPaddedInt, is_valid_frame_id
@@ -487,7 +488,7 @@ def read(self, header, frame, data):
def write(self, config, frame, value):
enc, term = self._encodings[frame.encoding]
try:
- return value.encode(enc) + term
+ return encode_endian(value, enc, le=True) + term
except UnicodeEncodeError as e:
raise SpecError(e)
@@ -815,7 +816,10 @@ def write(self, config, frame, value):
data = []
encoding, term = self._encodings[frame.encoding]
for text, time in value:
- text = text.encode(encoding) + term
+ try:
+ text = encode_endian(text, encoding, le=True) + term
+ except UnicodeEncodeError as e:
+ raise SpecError(e)
data.append(text + struct.pack(">I", time))
return b"".join(data)
diff --git a/tests/test__id3specs.py b/tests/test__id3specs.py
index 5ca00cd..c14083a 100644
--- a/tests/test__id3specs.py
+++ b/tests/test__id3specs.py
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-
-import sys
-
from tests import TestCase
from mutagen._compat import PY3
@@ -27,12 +25,7 @@ def test_write(self):
self.assertEqual(
s.read(None, f, s.write(None, f, values)), (values, b""))
data = s.write(None, f, [(u"A", 100)])
- if sys.byteorder == 'little':
- self.assertEquals(
- data, b"\xff\xfeA\x00\x00\x00\x00\x00\x00d")
- else:
- self.assertEquals(
- data, b"\xfe\xff\x00A\x00\x00\x00\x00\x00d")
+ self.assertEquals(data, b"\xff\xfeA\x00\x00\x00\x00\x00\x00d")
# utf-16be
f.encoding = 2
diff --git a/tests/test__util.py b/tests/test__util.py
index f2e3196..76d1095 100644
--- a/tests/test__util.py
+++ b/tests/test__util.py
@@ -3,7 +3,7 @@
from mutagen._util import DictMixin, cdata, insert_bytes, delete_bytes, \
decode_terminated, dict_match, enum, get_size, BitReader, BitReaderError, \
resize_bytes, seek_end, mmap_move, verify_fileobj, fileobj_name, \
- read_full, flags, resize_file, fallback_move
+ read_full, flags, resize_file, fallback_move, encode_endian
from mutagen._compat import text_type, itervalues, iterkeys, iteritems, PY2, \
cBytesIO, xrange, BytesIO
from tests import TestCase, get_temp_empty
@@ -742,6 +742,33 @@ def test_get_size(self):
self.assertEqual(f.tell(), 1)
+class Tencode_endian(TestCase):
+
+ def test_other(self):
+ assert encode_endian(u"\xe4", "latin-1") == b"\xe4"
+ assert encode_endian(u"\xe4", "utf-8") == b"\xc3\xa4"
+ with self.assertRaises(LookupError):
+ encode_endian(u"", "nopenope")
+ with self.assertRaises(UnicodeEncodeError):
+ assert encode_endian(u"\u2714", "latin-1")
+ assert encode_endian(u"\u2714", "latin-1", "replace") == b"?"
+
+ def test_utf_16(self):
+ assert encode_endian(u"\xe4", "utf-16", le=True) == b"\xff\xfe\xe4\x00"
+ assert encode_endian(u"\xe4", "utf-16-le") == b"\xe4\x00"
+ assert encode_endian(
+ u"\xe4", "utf-16", le=False) == b"\xfe\xff\x00\xe4"
+ assert encode_endian(u"\xe4", "utf-16-be") == b"\x00\xe4"
+
+ def test_utf_32(self):
+ assert encode_endian(u"\xe4", "utf-32", le=True) == \
+ b"\xff\xfe\x00\x00\xe4\x00\x00\x00"
+ assert encode_endian(u"\xe4", "utf-32-le") == b"\xe4\x00\x00\x00"
+ assert encode_endian(
+ u"\xe4", "utf-32", le=False) == b"\x00\x00\xfe\xff\x00\x00\x00\xe4"
+ assert encode_endian(u"\xe4", "utf-32-be") == b"\x00\x00\x00\xe4"
+
+
class Tdecode_terminated(TestCase):
def test_all(self):