File perl-regcomp-uni-semantics.diff of Package perl.6851
--- ./regcomp.c.orig 2018-03-12 10:12:01.394695035 +0000
+++ ./regcomp.c 2018-03-12 15:10:48.943906636 +0000
@@ -259,6 +259,14 @@ typedef struct RExC_state_t {
} \
} STMT_END
+#define REQUIRE_UNI_RULES(flagp, restart_retval) \
+ STMT_START { \
+ if (DEPENDS_SEMANTICS) { \
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
+ RExC_uni_semantics = 1; \
+ } \
+ } STMT_END
+
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
#define namedclass_to_classnum(class) ((int) ((class) / 2))
@@ -5727,7 +5735,7 @@ Perl_re_op_compile(pTHX_ SV ** const pat
if (initial_charset == REGEX_LOCALE_CHARSET) {
RExC_contains_locale = 1;
}
- else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ else if (initial_charset == REGEX_DEPENDS_CHARSET && (RExC_utf8 ||RExC_uni_semantics)) {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
@@ -9473,6 +9481,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I
if (paren) {
/* restore original flags, but keep (?p) */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
+ if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+ }
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
@@ -9943,7 +9954,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pREx
return ret;
}
- RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
endchar = RExC_parse + strcspn(RExC_parse, ".}");
@@ -11633,7 +11644,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *
if (LOC) {
vFAIL("(?[...]) not valid in locale");
}
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, FALSE);
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
@@ -12483,7 +12494,7 @@ parseit:
named */
/* \p means they want Unicode semantics */
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, FALSE);
}
break;
case 'n': value = '\n'; break;
@@ -13000,7 +13011,7 @@ parseit:
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
if (value > 255) {
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, FALSE);
}
/* Ready to process either the single value, or the completed range.