File pcre-invalid-utf8.patch of Package grep.35702
From 16fc7fa0e0f273fa81e382b4303617d707364902 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Tue, 9 Sep 2014 12:41:54 -0700
Subject: [PATCH] grep: -P now treats invalid UTF-8 input as non-matching
Problem reported by Santiago Vila in: http://bugs.gnu.org/18266
* NEWS: Mention this.
* src/pcresearch.c (Pexecute): Treat UTF-8 encoding errors
as non-matching data, instead of exiting 'grep'.
* tests/pcre-infloop: grep now exits with status 1, not 2.
* tests/pcre-invalid-utf8-input: grep now exits with status 0, not 2.
---
NEWS | 3 ++
src/pcresearch.c | 70 ++++++++++++++---------------------
tests/pcre-infloop | 2 +-
tests/pcre-invalid-utf8-input | 2 +-
4 files changed, 33 insertions(+), 44 deletions(-)
Index: grep-2.16/src/pcresearch.c
===================================================================
--- grep-2.16.orig/src/pcresearch.c
+++ grep-2.16/src/pcresearch.c
@@ -137,34 +137,41 @@ Pexecute (char const *buf, size_t size,
#else
/* This array must have at least two elements; everything after that
is just for performance improvement in pcre_exec. */
- int sub[300];
+ enum { nsub = 300 };
+ int sub[nsub];
- const char *line_buf, *line_end, *line_next;
+ char const *p = start_ptr ? start_ptr : buf;
+ int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+ char const *line_start = buf;
int e = PCRE_ERROR_NOMATCH;
- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+ char const *line_end;
/* PCRE can't limit the matching to single lines, therefore we have to
match each line in the buffer separately. */
- for (line_next = buf;
- e == PCRE_ERROR_NOMATCH && line_next < buf + size;
- start_ofs -= line_next - line_buf)
+ for (; p < buf + size; p = line_start = line_end + 1)
{
- line_buf = line_next;
- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
- if (line_end == NULL)
- line_next = line_end = buf + size;
- else
- line_next = line_end + 1;
+ line_end = memchr (p, eolbyte, buf + size - p);
- if (start_ptr && start_ptr >= line_end)
- continue;
-
- if (INT_MAX < line_end - line_buf)
+ if (INT_MAX < line_end - p)
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
- e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
- start_ofs < 0 ? 0 : start_ofs, 0,
- sub, sizeof sub / sizeof *sub);
+ /* Treat encoding-error bytes as data that cannot match. */
+ for (;;)
+ {
+ e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
+ if (e != PCRE_ERROR_BADUTF8)
+ break;
+ e = pcre_exec (cre, extra, p, sub[0], 0,
+ options | PCRE_NO_UTF8_CHECK, sub, nsub);
+ if (e != PCRE_ERROR_NOMATCH)
+ break;
+ p += sub[0] + 1;
+ options = PCRE_NOTBOL;
+ }
+
+ if (e != PCRE_ERROR_NOMATCH)
+ break;
+ options = 0;
}
if (e <= 0)
@@ -181,10 +188,6 @@ Pexecute (char const *buf, size_t size,
error (EXIT_TROUBLE, 0,
_("exceeded PCRE's backtracking limit"));
- case PCRE_ERROR_BADUTF8:
- error (EXIT_TROUBLE, 0,
- _("invalid UTF-8 byte sequence in input"));
-
default:
/* For now, we lump all remaining PCRE failures into this basket.
If anyone cares to provide sample grep usage that can trigger
@@ -198,25 +201,8 @@ Pexecute (char const *buf, size_t size,
}
else
{
- /* Narrow down to the line we've found. */
- char const *beg = line_buf + sub[0];
- char const *end = line_buf + sub[1];
- char const *buflim = buf + size;
- char eol = eolbyte;
- if (!start_ptr)
- {
- /* FIXME: The case when '\n' is not found indicates a bug:
- Since grep is line oriented, the match should never contain
- a newline, so there _must_ be a newline following.
- */
- if (!(end = memchr (end, eol, buflim - end)))
- end = buflim;
- else
- end++;
- while (buf < beg && beg[-1] != eol)
- --beg;
- }
-
+ char const *beg = start_ptr ? p + sub[0] : line_start;
+ char const *end = start_ptr ? p + sub[1] : line_end + 1;
*match_size = end - beg;
return beg - buf;
}
Index: grep-2.16/tests/pcre-invalid-utf8-input
===================================================================
--- grep-2.16.orig/tests/pcre-invalid-utf8-input
+++ grep-2.16/tests/pcre-invalid-utf8-input
@@ -16,6 +16,6 @@ fail=0
printf 'j\202\nj\n' > in || framework_failure_
LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+test $? -eq 0 || fail=1
Exit $fail