File file-5.41-cache-regexps.patch of Package file

From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dirk@dmllr.de>
Date: Fri, 11 Mar 2022 23:51:55 +0100
Subject: [PATCH] Cache compiled regexps between magic matches

regcomp() is relatively expensive compared to regexec() for matching,
so it helps to only compile once and then reuse the compiled version
for future matches of the same magic.

When doing the equivalent of `find | xargs file`, this provides a massive
speedup, between a factor of 2 and 4 depending on how heavily the magic
relies on regexps.

The memory overhead is moderate (~200 KB) and regexps are compiled
lazily, so it doesn't add significant overhead to single-match use cases.
---
 src/apprentice.c | 26 +++++++++++++++++++----
 src/file.h       | 40 ++++++++++++++++++-----------------
 src/softmagic.c  | 54 +++++++++++++++++++++++++++---------------------
 3 files changed, 73 insertions(+), 47 deletions(-)

Index: file-5.41/src/apprentice.c
===================================================================
--- file-5.41.orig/src/apprentice.c
+++ file-5.41/src/apprentice.c
@@ -427,7 +427,14 @@ add_mlist(struct mlist *mlp, struct magi
 	ml->map = idx == 0 ? map : NULL;
 	ml->magic = map->magic[idx];
 	ml->nmagic = map->nmagic[idx];
-
+	ml->magic_rxcomp = NULL;
+	if (ml->nmagic) {
+		ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*)));
+		if (ml->magic_rxcomp == NULL) {
+			free(ml);
+			return -1;
+		}
+	}
 	mlp->prev->next = ml;
 	ml->prev = mlp->prev;
 	ml->next = mlp;
@@ -612,8 +619,19 @@ mlist_free_all(struct magic_set *ms)
 private void
 mlist_free_one(struct mlist *ml)
 {
+	size_t i;
+
 	if (ml->map)
 		apprentice_unmap(CAST(struct magic_map *, ml->map));
+
+	for (i = 0; i < ml->nmagic; ++i) {
+		if (ml->magic_rxcomp[i]) {
+			file_regfree(ml->magic_rxcomp[i]);
+			free(ml->magic_rxcomp[i]);
+		}
+	}
+	free(ml->magic_rxcomp);
+	ml->magic_rxcomp = NULL;
 	free(ml);
 }
 
@@ -3489,16 +3507,17 @@ file_magicfind(struct magic_set *ms, con
 
 	for (ml = mlist->next; ml != mlist; ml = ml->next) {
 		struct magic *ma = ml->magic;
-		uint32_t nma = ml->nmagic;
-		for (i = 0; i < nma; i++) {
+		for (i = 0; i < ml->nmagic; i++) {
 			if (ma[i].type != FILE_NAME)
 				continue;
 			if (strcmp(ma[i].value.s, name) == 0) {
 				v->magic = &ma[i];
-				for (j = i + 1; j < nma; j++)
+				for (j = i + 1; j < ml->nmagic; j++)
 				    if (ma[j].cont_level == 0)
 					    break;
 				v->nmagic = j - i;
+				/* cache slice must start at i, like v->magic */
+				v->magic_rxcomp = &ml->magic_rxcomp[i];
 				return 0;
 			}
 		}
Index: file-5.41/src/file.h
===================================================================
--- file-5.41.orig/src/file.h
+++ file-5.41/src/file.h
@@ -88,6 +88,10 @@
 /* Do this here and now, because struct stat gets re-defined on solaris */
 #include <sys/stat.h>
 #include <stdarg.h>
+#include <locale.h>
+#if defined(HAVE_XLOCALE_H)
+#include <xlocale.h>
+#endif
 
 #define ENABLE_CONDITIONALS
 
@@ -167,6 +171,19 @@
 #define FILE_COMPILE	2
 #define FILE_LIST	3
 
+typedef struct {
+	const char *pat;
+#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
+#define USE_C_LOCALE
+	locale_t old_lc_ctype;
+	locale_t c_lc_ctype;
+#else
+	char *old_lc_ctype;
+#endif
+	int rc;
+	regex_t rx;
+} file_regex_t;
+
 struct buffer {
 	int fd;
 	struct stat st;
@@ -397,9 +414,10 @@ struct magic {
 
 /* list of magic entries */
 struct mlist {
-	struct magic *magic;		/* array of magic entries */
-	uint32_t nmagic;		/* number of entries in array */
-	void *map;			/* internal resources used by entry */
+	struct magic *magic;			/* array of magic entries */
+	file_regex_t **magic_rxcomp;	/* array of compiled regexps */
+	size_t nmagic;					/* number of entries in array */
+	void *map;				/* internal resources used by entry */
 	struct mlist *next, *prev;
 };
 
@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer
 protected void buffer_fini(struct buffer *);
 protected int buffer_fill(const struct buffer *);
 
-#include <locale.h>
-#if defined(HAVE_XLOCALE_H)
-#include <xlocale.h>
-#endif
 
-typedef struct {
-	const char *pat;
-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
-#define USE_C_LOCALE
-	locale_t old_lc_ctype;
-	locale_t c_lc_ctype;
-#else
-	char *old_lc_ctype;
-#endif
-	int rc;
-	regex_t rx;
-} file_regex_t;
 
 protected int file_regcomp(file_regex_t *, const char *, int);
 protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *,
Index: file-5.41/src/softmagic.c
===================================================================
--- file-5.41.orig/src/softmagic.c
+++ file-5.41/src/softmagic.c
@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3
 #include <time.h>
 #include "der.h"
 
-private int match(struct magic_set *, struct magic *, uint32_t,
+private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t,
     const struct buffer *, size_t, int, int, int, uint16_t *,
     uint16_t *, int *, int *, int *, int *);
 private int mget(struct magic_set *, struct magic *, const struct buffer *,
@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str
     uint16_t *, int *, int *, int *, int *);
 private int msetoffset(struct magic_set *, struct magic *, struct buffer *,
     const struct buffer *, size_t, unsigned int);
-private int magiccheck(struct magic_set *, struct magic *);
+private int magiccheck(struct magic_set *, struct magic *, file_regex_t **);
 private int32_t mprint(struct magic_set *, struct magic *);
 private int moffset(struct magic_set *, struct magic *, const struct buffer *,
     int32_t *);
@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con
 	}
 
 	for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next)
-		if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode,
+		if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode,
 		    text, 0, indir_count, name_count,
 		    &printed_something, &need_separator, NULL, NULL)) != 0)
 			return rv;
@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons
  *	so that higher-level continuations are processed.
  */
 private int
-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
+match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic,
     const struct buffer *b, size_t offset, int mode, int text,
     int flip, uint16_t *indir_count, uint16_t *name_count,
     int *printed_something, int *need_separator, int *returnval,
@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic
 	for (magindex = 0; magindex < nmagic; magindex++) {
 		int flush = 0;
 		struct magic *m = &magic[magindex];
+		file_regex_t** m_rxcomp = &magic_rxcomp[magindex];
 
 		if (m->type != FILE_NAME)
 		if ((IS_STRING(m->type) &&
@@ -257,7 +258,7 @@ flush:
 				*returnval = 1;
 			}
 
-			switch (magiccheck(ms, m)) {
+			switch (magiccheck(ms, m, m_rxcomp)) {
 			case -1:
 				return -1;
 			case 0:
@@ -318,6 +319,7 @@ flush:
 		while (magindex + 1 < nmagic &&
 		    magic[magindex + 1].cont_level != 0) {
 			m = &magic[++magindex];
+			m_rxcomp = &magic_rxcomp[magindex];
 			ms->line = m->lineno; /* for messages */
 
 			if (cont_level < m->cont_level)
@@ -371,7 +373,7 @@ flush:
 				break;
 			}
 
-			switch (flush ? 1 : magiccheck(ms, m)) {
+			switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) {
 			case -1:
 				return -1;
 			case 0:
@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi
 
 			if (m->str_flags & STRING_TRIM)
 				str = file_strtrim(str);
-					
+
 			if (file_printf(ms, F(ms, desc, "%s"),
 			    file_printable(ms, sbuf, sizeof(sbuf), str,
 				sizeof(p->s) - (str - p->s))) == -1)
@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi
 			return -1;
 		}
 		scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp;
-					
+
 		rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms,
 		    sbuf, sizeof(sbuf), scp, ms->search.rm_len));
 		free(cp);
@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic
 		for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0];
 		    mlp = mlp->next)
 		{
-			if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0,
+			if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0,
 			    BINTEST, text, 0, indir_count, name_count,
 			    printed_something, need_separator, NULL,
 			    NULL)) != 0)
@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic
 		nfound_match = 0;
 		(*name_count)++;
 		eoffset = ms->eoffset;
-		rv = match(ms, ml.magic, ml.nmagic, b, offset + o,
+		rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o,
 		    mode, text, flip, indir_count, name_count,
 		    printed_something, need_separator, returnval,
 		    &nfound_match);
@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char
 }
 
 private int
-magiccheck(struct magic_set *ms, struct magic *m)
+magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache)
 {
 	uint64_t l = m->value.q;
 	uint64_t v;
@@ -2182,27 +2184,40 @@ magiccheck(struct magic_set *ms, struct
 	}
 	case FILE_REGEX: {
 		int rc;
-		file_regex_t rx;
+		file_regex_t *rx = *m_cache;
 		const char *search;
 
 		if (ms->search.s == NULL)
 			return 0;
 
+		if (rx == NULL) {
+			rx = CAST(file_regex_t *, malloc(sizeof(*rx)));
+			if (rx == NULL) {
+				file_error(ms, errno,
+				    "can't allocate %" SIZE_T_FORMAT "u bytes",
+				    sizeof(*rx));
+				return -1;
+			}
+			rc = file_regcomp(rx, m->value.s,
+				REG_EXTENDED|REG_NEWLINE|
+				((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
+			if (rc) {
+				file_regerror(rx, rc, ms);
+				file_regfree(rx);
+				free(rx);
+				return -1;
+			}
+			/* only cache a regex that compiled successfully */
+			*m_cache = rx;
+		}
 		l = 0;
-		rc = file_regcomp(&rx, m->value.s,
-		    REG_EXTENDED|REG_NEWLINE|
-		    ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
-		if (rc) {
-			file_regerror(&rx, rc, ms);
-			v = CAST(uint64_t, -1);
-		} else {
+		{
 			regmatch_t pmatch;
 			size_t slen = ms->search.s_len;
 			char *copy;
 			if (slen != 0) {
 			    copy = CAST(char *, malloc(slen));
 			    if (copy == NULL)  {
-				file_regfree(&rx);
 				file_error(ms, errno,
 				    "can't allocate %" SIZE_T_FORMAT "u bytes",
 				    slen);
@@ -2215,14 +2230,14 @@ magiccheck(struct magic_set *ms, struct
 			    search = CCAST(char *, "");
 			    copy = NULL;
 			}
-			rc = file_regexec(&rx, RCAST(const char *, search),
+			rc = file_regexec(rx, RCAST(const char *, search),
 			    1, &pmatch, 0);
 			free(copy);
 			switch (rc) {
 			case 0:
 				ms->search.s += CAST(int, pmatch.rm_so);
 				ms->search.offset += CAST(size_t, pmatch.rm_so);
-				ms->search.rm_len = CAST(size_t, 
+				ms->search.rm_len = CAST(size_t,
 				    pmatch.rm_eo - pmatch.rm_so);
 				v = 0;
 				break;
@@ -2232,12 +2247,11 @@ magiccheck(struct magic_set *ms, struct
 				break;
 
 			default:
-				file_regerror(&rx, rc, ms);
+				file_regerror(rx, rc, ms);
 				v = CAST(uint64_t, -1);
 				break;
 			}
 		}
-		file_regfree(&rx);
 		if (v == CAST(uint64_t, -1))
 			return -1;
 		break;
openSUSE Build Service is sponsored by