File ntfs-3g-utf8-fallback.patch of Package ntfs-3g
- bk@suse.de:
#
# NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
# for now]) for path names, but the Unicode code points need to be
# converted before a path can be accessed under NTFS. For 7-bit ASCII/ANSI,
# glibc does this even without a locale in a hard-coded fashion; that
# appears to be easy because the low 7-bit ASCII range appears to be
# available in all charsets. However, it does not convert anything else if
# there was some error with the locale setup or if none was set up, as
# when mount is called during early boot, where we (by policy) do
# not use locales (and they may not be available if /usr is not yet mounted).
# This patch fixes the resulting issues for systems which use
# UTF-8; for others, specifying the locale in fstab gives them
# the encoding which they want.
#
# If no locale is defined or there was a problem with setting one
# up, and whenever nl_langinfo(CODESET) returns a string starting with
# "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
# the bug where NTFS-3G does not show any path names which include
# international characters (and also fails to create them) as a result.
#
# Author: Bernhard Kaindl <bk@suse.de>
#
--- include/ntfs-3g/unistr.h
+++ include/ntfs-3g/unistr.h
@@ -26,6 +26,8 @@
#include "types.h"
#include "layout.h"
+extern int use_utf8;	/* nonzero: ntfs_ucstombs()/ntfs_mbstoucs() convert names as UTF-8 */
+
extern BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
const ntfschar *upcase, const u32 upcase_size);
--- libntfs-3g/unistr.c
+++ libntfs-3g/unistr.c
@@ -47,6 +47,8 @@
#include "logging.h"
#include "misc.h"
+int use_utf8;	/* nonzero: ntfs_ucstombs()/ntfs_mbstoucs() convert names as UTF-8 */
+
/*
* IMPORTANT
* =========
@@ -373,6 +375,85 @@ int ntfs_file_values_compare(const FILE_
err_val, ic, upcase, upcase_len);
}
+/*
+ * ucs2_to_utf8_size - number of bytes needed to store a UCS-2LE string as UTF-8
+ * @ins:	input UCS-2LE string; reading stops at the first null character
+ * @ins_len:	maximum number of 16-bit characters to read from @ins
+ * @outs_len:	size of the destination buffer in bytes
+ * Returns the UTF-8 length in bytes without the terminating null, or -1 if
+ * that length plus the terminating null does not fit into @outs_len bytes.
+ * TODO: Extend this with a function to support UTF-16LE.
+ */
+static int ucs2_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
+{
+	int i;
+	int count = 0;
+
+	for (i = 0; i < ins_len && ins[i]; i++) {
+		unsigned short c = le16_to_cpu(ins[i]);
+		/* 1 byte below U+0080, 2 bytes below U+0800, 3 bytes otherwise. */
+		count += (c < 0x80) ? 1 : (c & 0xf800) ? 3 : 2;
+		if (count >= outs_len)	/* keep one byte for the null */
+			return -1;
+	}
+	return count;
+}
+
+/**
+ * ntfs_ucs_to_utf8 - convert a little endian UCS-2 string to an UTF-8 string
+ * @ins:	input Unicode string buffer
+ * @ins_len:	length of input string in Unicode characters
+ * @outs:	on return contains the (allocated) output UTF-8 string
+ * @outs_len:	length of output buffer in bytes (ignored if *@outs is NULL)
+ * If *@outs is NULL, a buffer of exactly the required size is allocated with
+ * ntfs_malloc() and stored in *@outs, and the result is limited to PATH_MAX
+ * bytes. Conversion stops at the first null character in @ins.
+ * Returns the number of bytes written, not counting the terminating null, or
+ * -1 on error with errno set to ENAMETOOLONG if the result does not fit.
+ * TODO: Replace this with a function which converts from UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
+ */
+int ntfs_ucs_to_utf8(const ntfschar *ins, const int ins_len, char **outs, int outs_len)
+{
+	char *t, *end;
+	int i, size;
+
+	if (!*outs)
+		outs_len = PATH_MAX;
+	size = ucs2_to_utf8_size(ins, ins_len, outs_len);
+	/* A caller-supplied buffer must also have room for the null. */
+	if (size < 0 || (*outs && size >= outs_len))
+		goto toolong;
+	if (!*outs) {
+		outs_len = size + 1;
+		*outs = ntfs_malloc(outs_len);
+		if (!*outs)
+			return -1;	/* errno as left by ntfs_malloc() */
+	}
+	t = *outs;
+	end = t + outs_len;
+
+	for (i = 0; i < ins_len && ins[i]; i++) {
+		unsigned short c = le16_to_cpu(ins[i]);
+		/* Bytes needed for c; one more must remain for the null. */
+		int n = (c < 0x80) ? 1 : (c & 0xf800) ? 3 : 2;
+		if (end - t <= n)
+			goto toolong;
+		if (n == 3) {
+			*t++ = 0xe0 | (c >> 12);
+			*t++ = 0x80 | ((c >> 6) & 0x3f);
+		} else if (n == 2) {
+			*t++ = 0xc0 | (c >> 6);
+		}
+		*t++ = (n == 1) ? c : (0x80 | (c & 0x3f));
+	}
+	*t = '\0';
+	return t - *outs;
+toolong:
+	errno = ENAMETOOLONG;
+	return -1;
+}
+
/**
* ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
* @ins: input Unicode string buffer
@@ -397,6 +478,8 @@ int ntfs_file_values_compare(const FILE_
* sequence according to the current locale.
* ENAMETOOLONG Destination buffer is too small for input string.
* ENOMEM Not enough memory to allocate destination buffer.
+ * TODO: Replace this with a function which converts from UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
*/
int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
int outs_len)
@@ -419,12 +502,15 @@ int ntfs_ucstombs(const ntfschar *ins, c
errno = ENAMETOOLONG;
return -1;
}
+ if (use_utf8)
+ return ntfs_ucs_to_utf8(ins, ins_len, outs, outs_len);
if (!mbs) {
mbs_len = (ins_len + 1) * MB_CUR_MAX;
mbs = ntfs_malloc(mbs_len);
if (!mbs)
return -1;
}
+
#ifdef HAVE_MBSINIT
memset(&mbstate, 0, sizeof(mbstate));
#else
@@ -487,6 +573,107 @@ err_out:
return -1;
}
+/*
+ * utf8_to_ucs2_size - count the 16-bit elements needed for a UTF-8 string
+ * @s:	null terminated UTF-8 string
+ * Returns the number of elements without the terminating null, or -1 if a
+ * sequence starts with a byte >= 0xF0 or PATH_MAX or more elements are needed.
+ * TODO: Extend this with a function to support UTF-16LE.
+ */
+static int utf8_to_ucs2_size(const char *s)
+{
+	const unsigned char *p = (const unsigned char *)s;
+	size_t units = 0;
+
+	while (*p) {
+		unsigned int lead = *p++;
+		int skip = (lead >= 0xE0) ? 2 : (lead >= 0xC0) ? 1 : 0;
+		if (++units >= PATH_MAX || lead >= 0xF0)
+			return -1;
+		while (skip-- > 0 && *p)	/* never step over the terminating null */
+			p++;
+	}
+	return units;
+}
+/*
+ * utf8toucs2 - decode one UTF-8 sequence into a cpu-endian UCS-2 character
+ * @wc:	receives the decoded character (0 at the end of the string)
+ * @s:	null terminated UTF-8 input; decoding starts at its first byte
+ * Returns the number of bytes consumed (0 for the terminating null), or -1
+ * for anything that is not the shortest-form encoding of a BMP character:
+ * stray continuation bytes, overlong or truncated sequences, surrogates,
+ * U+FFFE/U+FFFF and 4-byte sequences. @wc is only written on success.
+ * TODO: Replace this with a function which converts to UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
+ */
+static int utf8toucs2(wchar_t *wc, const char *s)
+{
+	const unsigned char *u = (const unsigned char *)s;
+	unsigned int byte = u[0];
+	unsigned int c;
+
+	if (byte < 0x80) {	/* ASCII; the terminating null consumes nothing */
+		*wc = (wchar_t) byte;
+		return byte ? 1 : 0;
+	}
+	/* 0x80..0xBF: stray continuation, 0xC0/0xC1: overlong, >= 0xF0: not UCS-2 */
+	if (byte < 0xC2 || byte >= 0xF0 || (u[1] & 0xC0) != 0x80)
+		return -1;
+	if (byte < 0xE0) {
+		*wc = (wchar_t) (((byte & 0x1F) << 6) | (u[1] & 0x3F));
+		return 2;
+	}
+	/* u[1] is a continuation byte, hence not null, so u[2] may be read. */
+	if ((u[2] & 0xC0) != 0x80)
+		return -1;
+	c = ((byte & 0x0F) << 12) | ((u[1] & 0x3F) << 6) | (u[2] & 0x3F);
+	/* Reject overlong forms (< U+0800), surrogates and U+FFFE/U+FFFF. */
+	if (c < 0x800 || (c >= 0xD800 && c <= 0xDFFF) || c >= 0xFFFE)
+		return -1;
+	*wc = (wchar_t) c;
+	return 3;
+}
+
+/**
+ * ntfs_utf8_to_ucs - convert a UTF-8 string to a UCS-2LE Unicode string
+ * @ins:	input UTF-8 string, null terminated
+ * @outs:	on return contains the (allocated) output Unicode string
+ * If *@outs is NULL a buffer is allocated with ntfs_malloc() (and left in
+ * *@outs even on failure); otherwise the caller's buffer is used and must be
+ * large enough, since this function is not told its size.
+ * Returns the number of Unicode characters stored, not counting the terminating
+ * null, or -1 on error; errno is EILSEQ if @ins is rejected (invalid or too long).
+ * TODO: Replace this with a function which converts to UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
+ */
+int ntfs_utf8_to_ucs(const char *ins, ntfschar **outs)
+{
+	const char *t = ins;
+	wchar_t wc;
+	ntfschar *outpos;
+	int m, shorts = utf8_to_ucs2_size(ins);
+
+	if (shorts < 0)
+		goto fail;
+	if (!*outs) {
+		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
+		if (!*outs)
+			return -1;	/* errno as left by ntfs_malloc() */
+	}
+	outpos = *outs;
+	while ((m = utf8toucs2(&wc, t)) > 0) {
+		*outpos++ = cpu_to_le16(wc);
+		t += m;
+	}
+	if (m < 0)
+		goto fail;
+	*outpos = cpu_to_le16(wc);	/* wc is 0 here: store the terminating null */
+	return outpos - *outs;
+fail:
+	errno = EILSEQ;
+	return -1;
+}
+
/**
* ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
* @ins: input multibyte string buffer
@@ -509,6 +696,8 @@ err_out:
* string according to the current locale.
* ENAMETOOLONG Destination buffer is too small for input string.
* ENOMEM Not enough memory to allocate destination buffer.
+ * TODO: Replace this with a function which converts to UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
*/
int ntfs_mbstoucs(const char *ins, ntfschar **outs)
{
@@ -524,6 +713,8 @@ int ntfs_mbstoucs(const char *ins, ntfsc
errno = EINVAL;
return -1;
}
+ if (use_utf8)
+ return ntfs_utf8_to_ucs(ins, outs);
/* Determine the size of the multi-byte string in bytes. */
ins_size = strlen(ins);
--- src/ntfs-3g.c
+++ src/ntfs-3g.c
@@ -69,6 +69,7 @@
#include <getopt.h>
#include <syslog.h>
#include <sys/wait.h>
+#include <langinfo.h>
#ifdef HAVE_SETXATTR
#include <sys/xattr.h>
@@ -2224,6 +2225,15 @@ static void setup_logging(char *parsed_o
ntfs_log_info("Mount options: %s\n", parsed_options);
}
+void check_codeset(void) {	/* fall back to built-in UTF-8 if the codeset is unusable */
+	const char *codeset = nl_langinfo(CODESET);
+	if (!codeset || !*codeset || !strncmp(codeset, "ANSI", 4)) {
+		ntfs_log_info("Locale invalid or has ANSI codeset: "
+			      "Using UTF-8 for international characters.\n");
+		use_utf8 = 1;
+	}
+}
+
int main(int argc, char *argv[])
{
char *parsed_options = NULL;
@@ -2260,6 +2270,8 @@ int main(int argc, char *argv[])
err = NTFS_VOLUME_SYNTAX_ERROR;
goto err_out;
}
+
+ check_codeset();
#if defined(linux) || defined(__uClinux__)
fstype = get_fuse_fstype();