Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP5:GA
grep.15962
ignore-case-multibyte.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File ignore-case-multibyte.patch of Package grep.15962
From 97318f5e59a1ef6feb8a378434a00932a3fc1e0b Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyering@fb.com> Date: Sun, 24 Nov 2013 18:49:31 -0800 Subject: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales These days, nearly everyone uses a multibyte locale, and grep is often used with the --ignore-case (-i) option, but that option imposes a very high cost in order to handle some unusual cases in just a few multibyte locales. This change gets most of the performance of using LC_ALL=C without eliminating the ability to search for multibyte strings. With the following example, I see an 11x speed-up with a 2.3GHz i7: Generate a 10M-line file, with each line consisting of 40 'j's: yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k Time searching it for the simple/noexistent string "foobar", first with this patch (best-of-5 trials): LC_ALL=en_US.UTF-8 env time src/grep -i foobar k 1.10 real 1.03 user 0.07 sys Back out that commit (temporarily), recompile, and rerun the experiment: git log -1 -p|patch -R -p1; make LC_ALL=en_US.UTF-8 env time src/grep -i foobar k 12.50 real 12.41 user 0.08 sys The trick is to realize that for some search strings, it is easy to convert to an equivalent one that is handled much more efficiently. E.g., convert this command: grep -i foobar k to this: grep '[fF][oO][oO][bB][aA][rR]' k That allows the matcher to search in buffer mode, rather than having to extract/case-convert/search each line separately. Currently, we perform this conversion only when search strings contain neither '\' nor '['. See the comments for more detail. * src/main.c (trivial_case_ignore): New function. (main): When possible, transform the regexp so we can drop the -i. diff --git a/src/main.c b/src/main.c index 44090be..bfd0982 100644 --- a/src/main.c +++ b/src/main.c @@ -27,6 +27,7 @@ #include <fcntl.h> #include <inttypes.h> #include <stdio.h> +#include <assert.h> #include "system.h" #include "argmatch.h" @@ -1644,13 +1645,14 @@ if any error occurs and -q is not given, the exit status is 2.\n")); exit (status); } +static char const *matcher; + /* If M is NULL, initialize the matcher to the default. Otherwise set the matcher to M if available. Exit in case of conflicts or if M is not available. */ static void setmatcher (char const *m) { - static char const *matcher; unsigned int i; if (!m) @@ -1865,6 +1867,84 @@ parse_grep_colors (void) return; } +#define MBRTOWC(pwc, s, n, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \ + mbrtowc ((pwc), (s), (n), (ps))) + +#define WCRTOMB(s, wc, ps) \ + (MB_CUR_MAX == 1 ? \ + (*(s) = wctob ((wint_t) (wc)), 1) : \ + wcrtomb ((s), (wc), (ps))) + +/* If the newline-separated regular expressions, KEYS (with length, LEN + and no trailing NUL byte), are amenable to transformation into + otherwise equivalent case-ignoring ones, perform the transformation, + put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN, + and return true. Otherwise, return false. */ +static bool +trivial_case_ignore (size_t len, char const *keys, + size_t *new_len, char **new_keys) +{ + /* FIXME: consider removing the following restriction: + Reject if KEYS contain ASCII '\\' or '['. */ + if (memchr (keys, '\\', len) || memchr (keys, '[', len)) + return false; + + /* Worst case is that each byte B of KEYS is ASCII alphabetic and each + other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B + maps to [BC], which requires MB_CUR_MAX + 3 bytes. */ + *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1); + char *p = *new_keys; + + mbstate_t mb_state; + memset (&mb_state, 0, sizeof mb_state); + while (len) + { + wchar_t wc; + int n = MBRTOWC (&wc, keys, len, &mb_state); + + /* For an invalid, incomplete or L'\0', skip this optimization. */ + if (n <= 0) + { + skip_case_ignore_optimization: + free (*new_keys); + return false; + } + + char const *orig = keys; + keys += n; + len -= n; + + if (!iswalpha (wc)) + { + memcpy (p, orig, n); + p += n; + } + else + { + *p++ = '['; + memcpy (p, orig, n); + p += n; + + wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc); + char buf[MB_CUR_MAX]; + int n2 = WCRTOMB (buf, wc2, &mb_state); + if (n2 <= 0) + goto skip_case_ignore_optimization; + assert (n2 <= MB_CUR_MAX); + memcpy (p, buf, n2); + p += n2; + + *p++ = ']'; + } + } + + *new_len = p - *new_keys; + + return true; +} + int main (int argc, char **argv) { @@ -2263,6 +2343,35 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); + /* As currently implemented, case-insensitive matching is expensive in + multi-byte locales because of a few outlier locales in which some + characters change size when converted to upper or lower case. To + accommodate those, we revert to searching the input one line at a + time, rather than using the much more efficient buffer search. + However, if we have a regular expression, /foo/i, we can convert + it to an equivalent case-insensitive /[fF][oO][oO]/, and thus + avoid the expensive read-and-process-a-line-at-a-time requirement. + Optimize-away the "-i" option, when possible, converting each + candidate alpha, C, in the regexp to [Cc]. */ + if (match_icase) + { + size_t new_keycc; + char *new_keys; + /* It is not possible with -F, not useful with -P (pcre) and there is no + point when there is no regexp. It also depends on which constructs + appear in the regexp. See trivial_case_ignore for those details. */ + if (keycc + && ! (matcher + && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre"))) + && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys)) + { + match_icase = 0; + free (keys); + keys = new_keys; + keycc = new_keycc; + } + } + compile (keys, keycc); free (keys);
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor