File ntfs-3g-utf8-fallback.patch of Package ntfs-3g

- bk@suse.de:
#
# NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
# for now]) for path names, but the Unicode code points need to be
# converted before a path can be accessed under NTFS. For 7-bit ASCII/ANSI,
# glibc does this even without a locale, in a hard-coded fashion; that
# is easy because the low 7-bit ASCII range appears to be
# available in all charsets. However, it does not convert anything else if
# there was some error with the locale setup or if none was set up, as
# when mount is called during early boot, where we (by policy) do
# not use locales (and they may not be available if /usr is not yet mounted).
# This patch fixes the resulting issues for systems which use
# UTF-8; for others, specifying the locale in fstab brings them
# the encoding which they want.
#
# If no locale is defined, or there was a problem with setting one
# up, or nl_langinfo(CODESET) returns a string starting with
# "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
# the bug where NTFS-3G does not show any path names which include
# international characters (and also fails to create them) as a result.
#
# Author: Bernhard Kaindl <bk@suse.de>
#
--- include/ntfs-3g/unistr.h
+++ include/ntfs-3g/unistr.h
@@ -26,6 +26,8 @@
 #include "types.h"
 #include "layout.h"
 
+extern int use_utf8;
+
 extern BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
 		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
 		const ntfschar *upcase, const u32 upcase_size);
--- libntfs-3g/unistr.c
+++ libntfs-3g/unistr.c
@@ -47,6 +47,8 @@
 #include "logging.h"
 #include "misc.h"
 
+int use_utf8;
+
 /*
  * IMPORTANT
  * =========
@@ -373,6 +375,85 @@ int ntfs_file_values_compare(const FILE_
 			err_val, ic, upcase, upcase_len);
 }
 
+/* Return the number of bytes needed to store the given UCS-2LE string as
+ * UTF-8 (without the terminating null), or -1 if that exceeds outs_len.
+ * At most ins_len units are examined; a zero unit ends the input early.
+ * TODO: Extend this with a function to support UTF-16LE.
+*/
+static int ucs2_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
+{
+	int i;
+	int count = 0;
+
+	for (i = 0; i < ins_len && ins[i]; i++) {
+		unsigned short c = le16_to_cpu(ins[i]);
+		if (c < 0x80)
+			count++;
+		else
+			count += (c & 0xf800) ? 3 : 2;	/* 3 bytes from 0x800 up */
+		if (count > outs_len)
+			goto fail;
+	}
+	return count;
+fail:
+	return -1;
+}
+
+/**
+ * ntfs_ucs_to_utf8 - convert a little endian UCS-2 string to a UTF-8 string
+ * @ins:	input Unicode string buffer
+ * @ins_len:	length of input string in Unicode characters
+ * @outs:	in: caller's buffer or NULL to allocate; out: the output string
+ * @outs_len:	size of *outs in bytes (ignored when *outs is NULL)
+ * Returns bytes written (without NUL) or -1; errno = ENAMETOOLONG if too long.
+ * TODO: Convert from UTF-16LE (surrogate pairs); NTFS uses UTF-16LE, not UCS-2.
+ */
+int ntfs_ucs_to_utf8(const ntfschar *ins, const int ins_len, char **outs, int outs_len)
+{
+	char *t, *end;
+	int i, size;
+
+	if (!*outs)
+		outs_len = PATH_MAX;
+
+	size = ucs2_to_utf8_size(ins, ins_len, outs_len);
+	/* A caller-supplied buffer must also hold the terminating NUL. */
+	if (size < 0 || (*outs && size >= outs_len))
+		goto toolong;
+	if (!*outs) {
+		*outs = ntfs_malloc((outs_len = size + 1));
+		if (!*outs)
+			return -1;	/* do not write through a NULL buffer */
+	}
+	t = *outs;
+	end = t + outs_len;
+
+	for (i = 0; i < ins_len && ins[i]; i++) {
+		unsigned short c = le16_to_cpu(ins[i]);
+		/* Bytes needed for this character plus the final NUL. */
+		int need = (c < 0x80 ? 1 : (c & 0xf800) ? 3 : 2) + 1;
+
+		if (end - t < need)
+			goto toolong;
+		if (c < 0x80) {
+			*t++ = c;
+			continue;
+		}
+		if (c & 0xf800) {
+			*t++ = 0xe0 | (c >> 12);
+			*t++ = 0x80 | ((c >> 6) & 0x3f);
+		} else {
+			*t++ = 0xc0 | ((c >> 6) & 0x3f);
+		}
+		*t++ = 0x80 | (c & 0x3f);
+	}
+	*t = '\0';
+	return t - *outs;
+toolong:
+	errno = ENAMETOOLONG;
+	return -1;
+}
+
 /**
  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
  * @ins:	input Unicode string buffer
@@ -397,6 +478,8 @@ int ntfs_file_values_compare(const FILE_
  *			sequence according to the current locale.
  *	ENAMETOOLONG	Destination buffer is too small for input string.
  *	ENOMEM		Not enough memory to allocate destination buffer.
+ * TODO: Replace this with a function which converts from UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
  */
 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
 		int outs_len)
@@ -419,12 +502,15 @@ int ntfs_ucstombs(const ntfschar *ins, c
 		errno = ENAMETOOLONG;
 		return -1;
 	}
+	if (use_utf8)
+		return ntfs_ucs_to_utf8(ins, ins_len, outs, outs_len);
 	if (!mbs) {
 		mbs_len = (ins_len + 1) * MB_CUR_MAX;
 		mbs = ntfs_malloc(mbs_len);
 		if (!mbs)
 			return -1;
 	}
+
 #ifdef HAVE_MBSINIT
 	memset(&mbstate, 0, sizeof(mbstate));
 #else
@@ -487,6 +573,107 @@ err_out:
 	return -1;
 }
 
+/* Return the amount of 16-bit elements in UTF-16LE needed (without
+ * the terminating null) to store the given UTF-8 string, and -1 if it
+ * does not fit into PATH_MAX or contains a lead byte of 0xF0 or above.
+ * TODO: Extend this with a function to support UTF-16LE.
+*/
+static int utf8_to_ucs2_size(const char *s)
+{
+    unsigned int byte;
+    size_t count = 0;
+
+    while ((byte = *((unsigned char *)s++))) {
+	    if (++count >= PATH_MAX || byte >= 0xF0)
+		goto fail;
+	    if (!*s) break;
+	    if (byte >= 0xC0) s++;
+	    if (!*s) break;
+	    if (byte >= 0xE0) s++;
+    }
+    return count;
+fail:
+    return -1;
+}
+/*
+ * utf8toucs2 - decode one UTF-8 sequence at @s into a cpu-endian UCS-2 value
+ * Returns the bytes consumed (0 at the terminating NUL) or -1 if malformed.
+ * TODO: Convert to UTF-16LE (surrogate pairs); NTFS uses UTF-16LE, not UCS-2.
+ */
+static int utf8toucs2(wchar_t *wc, const char *s)
+{
+	const unsigned char *u = (const unsigned char *)s;
+	unsigned int byte = u[0];
+
+	if (byte == 0) {
+		*wc = (wchar_t) 0;
+		return 0;
+	}
+	if (byte < 0xC0) {
+		/* ASCII; stray continuation bytes are passed through as-is. */
+		*wc = (wchar_t) byte;
+		return 1;
+	}
+	/*
+	 * A NUL is never a continuation byte and the tests short-circuit,
+	 * so nothing beyond the string terminator is ever read.
+	 */
+	if (byte < 0xE0) {
+		if ((u[1] & 0xC0) != 0x80)
+			return -1;
+		*wc = (wchar_t) (((byte & 0x1F) << 6) | (u[1] & 0x3F));
+		return 2;
+	}
+	if (byte >= 0xF0 || (u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80)
+		return -1;
+	*wc = (wchar_t) (((byte & 0x0F) << 12)
+			| ((u[1] & 0x3F) << 6) | (u[2] & 0x3F));
+	/* UCS-2 cannot represent surrogates; U+FFFE/U+FFFF are noncharacters. */
+	if ((*wc >= 0xD800 && *wc <= 0xDFFF) || *wc == 0xFFFE || *wc == 0xFFFF)
+		return -1;
+	return 3;
+}
+
+/**
+ * ntfs_utf8_to_ucs - convert a UTF-8 string to a UCS-2LE Unicode string
+ * @ins:	input NUL-terminated UTF-8 string
+ * @outs:	in: caller's buffer or NULL to allocate; out: the output string
+ * Returns the length in Unicode characters (without the terminator) or -1.
+ * errno is EILSEQ if @ins is rejected by the UTF-8 decoder or is too long.
+ * A buffer allocated here is left in *outs even when the conversion fails.
+ * TODO: Convert to UTF-16LE (surrogate pairs); NTFS uses UTF-16LE, not UCS-2.
+ */
+int ntfs_utf8_to_ucs(const char *ins, ntfschar **outs)
+{
+	const char *t = ins;
+	wchar_t wc;
+	ntfschar *outpos;
+	int shorts = utf8_to_ucs2_size(ins);
+
+	if (shorts < 0)
+		goto ilseq;
+	if (!*outs) {
+		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
+		if (!*outs)
+			return -1;	/* do not write through a NULL buffer */
+	}
+	outpos = *outs;
+	for (;;) {
+		int m = utf8toucs2(&wc, t);
+		if (m < 0)
+			goto ilseq;
+		/* The terminating 0 is stored like any other unit. */
+		*outpos++ = cpu_to_le16(wc);
+		if (m == 0)
+			break;
+		t += m;
+	}
+	return --outpos - *outs;
+ilseq:
+	errno = EILSEQ;
+	return -1;
+}
+
 /**
  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
  * @ins:	input multibyte string buffer
@@ -509,6 +696,8 @@ err_out:
  *			string according to the current locale.
  *	ENAMETOOLONG	Destination buffer is too small for input string.
  *	ENOMEM		Not enough memory to allocate destination buffer.
+ * TODO: Replace this with a function which converts to UTF-16LE because
+ * NTFS uses UTF-16LE. UTF-16 supports more rare/unusual characters than UCS-2
  */
 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
 {
@@ -524,6 +713,8 @@ int ntfs_mbstoucs(const char *ins, ntfsc
 		errno = EINVAL;
 		return -1;
 	}
+	if (use_utf8)
+		return ntfs_utf8_to_ucs(ins, outs);
 	
 	/* Determine the size of the multi-byte string in bytes. */
 	ins_size = strlen(ins);
--- src/ntfs-3g.c
+++ src/ntfs-3g.c
@@ -69,6 +69,7 @@
 #include <getopt.h>
 #include <syslog.h>
 #include <sys/wait.h>
+#include <langinfo.h>
 
 #ifdef HAVE_SETXATTR
 #include <sys/xattr.h>
@@ -2224,6 +2225,15 @@ static void setup_logging(char *parsed_o
 	ntfs_log_info("Mount options: %s\n", parsed_options);
 }
 
+void check_codeset(void) {
+	const char *codeset = nl_langinfo(CODESET);
+	if (codeset && *codeset && strncmp(codeset, "ANSI", 4))
+		return;	/* named, non-ANSI codeset: keep locale conversion */
+	ntfs_log_info("Locale invalid or has ANSI codeset: "
+			"Using UTF-8 for international characters.\n");
+	use_utf8 = 1;	/* switches unistr.c to its built-in UTF-8 code */
+}
+		
 int main(int argc, char *argv[])
 {
 	char *parsed_options = NULL;
@@ -2260,6 +2270,8 @@ int main(int argc, char *argv[])
 		err = NTFS_VOLUME_SYNTAX_ERROR;
 		goto err_out;
 	}
+
+	check_codeset();
 	
 #if defined(linux) || defined(__uClinux__)
 	fstype = get_fuse_fstype();
openSUSE Build Service is sponsored by