File pcre-utf8.patch of Package grep.35702
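* src/pcresearch.c (Pexecute): Do not pass PCRE_NO_UTF8_CHECK to
pcre_exec, as this leads to undefined behavior when the input is
not valid UTF-8.
(Pcompile): Use using_utf8 () rather than nl_langinfo (CODESET) to
decide whether to set PCRE_UTF8.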
* tests/pcre-infloop, tests/pcre-invalid-utf8-input:
Exit status is now 2, not 1, when grep -P is given invalid UTF-8
data in a UTF-8 locale.
* src/dfa.c (using_utf8): Remove "static inline" so that the function
has external linkage and can be called from src/pcresearch.c.
* src/dfa.h (using_utf8): Declare it.
Index: grep-2.16/src/dfa.c
===================================================================
--- grep-2.16.orig/src/dfa.c
+++ grep-2.16/src/dfa.c
@@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c)
/* UTF-8 encoding allows some optimizations that we can't otherwise
assume in a multibyte encoding. */
-static inline int
+int
using_utf8 (void)
{
static int utf8 = -1;
Index: grep-2.16/src/dfa.h
===================================================================
--- grep-2.16.orig/src/dfa.h
+++ grep-2.16/src/dfa.h
@@ -99,3 +99,5 @@ extern void dfawarn (const char *);
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
+
+extern int using_utf8 (void);
Index: grep-2.16/src/pcresearch.c
===================================================================
--- grep-2.16.orig/src/pcresearch.c
+++ grep-2.16/src/pcresearch.c
@@ -20,14 +20,12 @@
#include <config.h>
#include "search.h"
+#include "dfa.h"
#if HAVE_PCRE_H
# include <pcre.h>
#elif HAVE_PCRE_PCRE_H
# include <pcre/pcre.h>
#endif
-#if HAVE_LANGINFO_CODESET
-# include <langinfo.h>
-#endif
#if HAVE_LIBPCRE
/* Compiled internal form of a Perl regular expression. */
@@ -54,21 +52,14 @@ Pcompile (char const *pattern, size_t si
int e;
char const *ep;
char *re = xnmalloc (4, size + 7);
- int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+ int flags = (PCRE_MULTILINE
+ | (match_icase ? PCRE_CASELESS : 0)
+ | (using_utf8 () ? PCRE_UTF8 : 0));
char const *patlim = pattern + size;
char *n = re;
char const *p;
char const *pnul;
-# if defined HAVE_LANGINFO_CODESET
- if (STREQ (nl_langinfo (CODESET), "UTF-8"))
- {
- /* Enable PCRE's UTF-8 matching. Note also the use of
- PCRE_NO_UTF8_CHECK when calling pcre_extra, below. */
- flags |= PCRE_UTF8;
- }
-# endif
-
/* FIXME: Remove these restrictions. */
if (memchr (pattern, '\n', size))
error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
@@ -158,10 +149,6 @@ Pexecute (char const *buf, size_t size,
e == PCRE_ERROR_NOMATCH && line_next < buf + size;
start_ofs -= line_next - line_buf)
{
- /* Disable the check that would make an invalid byte
- seqence *in the input* trigger a failure. */
- int options = PCRE_NO_UTF8_CHECK;
-
line_buf = line_next;
line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
if (line_end == NULL)
@@ -176,7 +163,7 @@ Pexecute (char const *buf, size_t size,
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
- start_ofs < 0 ? 0 : start_ofs, options,
+ start_ofs < 0 ? 0 : start_ofs, 0,
sub, sizeof sub / sizeof *sub);
}
Index: grep-2.16/tests/pcre-invalid-utf8-input
===================================================================
--- grep-2.16.orig/tests/pcre-invalid-utf8-input
+++ grep-2.16/tests/pcre-invalid-utf8-input
@@ -15,8 +15,7 @@ fail=0
printf 'j\202\nj\n' > in || framework_failure_
-LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1
-compare in out || fail=1
-compare /dev/null err || fail=1
+LC_ALL=en_US.UTF-8 grep -P j in
+test $? -eq 2 || fail=1
Exit $fail