diff options
-rw-r--r-- | ext/re/re.pm | 4 | ||||
-rw-r--r-- | op.c | 7 | ||||
-rw-r--r-- | op_reg_common.h | 48 | ||||
-rw-r--r-- | pod/perldelta.pod | 9 | ||||
-rw-r--r-- | pp.c | 10 | ||||
-rw-r--r-- | regcomp.c | 66 | ||||
-rw-r--r-- | regexp.h | 28 | ||||
-rw-r--r-- | regnodes.h | 4 | ||||
-rw-r--r-- | universal.c | 16 |
9 files changed, 143 insertions, 49 deletions
diff --git a/ext/re/re.pm b/ext/re/re.pm index 665947e8b7..7a489950f4 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -25,9 +25,9 @@ my %reflags = ( x => 1 << ($PMMOD_SHIFT + 3), p => 1 << ($PMMOD_SHIFT + 4), # special cases: - l => 1 << ($PMMOD_SHIFT + 5), - u => 1 << ($PMMOD_SHIFT + 6), d => 0, + l => 1, + u => 2, ); sub setcolor { @@ -3778,10 +3778,10 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags) if (PL_hints & HINT_RE_TAINT) pmop->op_pmflags |= PMf_RETAINT; if (PL_hints & HINT_LOCALE) { - pmop->op_pmflags |= PMf_LOCALE; + set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET); } else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) { - pmop->op_pmflags |= RXf_PMf_UNICODE; + set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET); } if (PL_hints & HINT_RE_FLAGS) { SV *reflags = Perl_refcounted_he_fetch_pvn(aTHX_ @@ -3792,8 +3792,7 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags) PL_compiling.cop_hints_hash, STR_WITH_LEN("reflags_charset"), 0, 0 ); if (reflags && SvOK(reflags)) { - pmop->op_pmflags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE); - pmop->op_pmflags |= SvIV(reflags); + set_regex_charset(&(pmop->op_pmflags), SvIV(reflags)); } } diff --git a/op_reg_common.h b/op_reg_common.h index 238d7bdb0e..c6d846dd69 100644 --- a/op_reg_common.h +++ b/op_reg_common.h @@ -32,8 +32,47 @@ #define RXf_PMf_FOLD (1 << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */ #define RXf_PMf_EXTENDED (1 << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */ #define RXf_PMf_KEEPCOPY (1 << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /p */ -#define RXf_PMf_LOCALE (1 << (RXf_PMf_STD_PMMOD_SHIFT+5)) -#define RXf_PMf_UNICODE (1 << (RXf_PMf_STD_PMMOD_SHIFT+6)) + +/* The character set for the regex is stored in a field of more than one bit + * using an enum, for reasons of compactness and to ensure that the options are + * mutually exclusive */ +typedef enum { + REGEX_DEPENDS_CHARSET = 0, + REGEX_LOCALE_CHARSET, + REGEX_UNICODE_CHARSET +} regex_charset; + +#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5) 
+#define RXf_PMf_CHARSET (3 << (_RXf_PMf_CHARSET_SHIFT)) /* 2 bits */ + +/* embed.pl doesn't yet know how to handle static inline functions, so + manually decorate them here with gcc-style attributes. +*/ +PERL_STATIC_INLINE void +set_regex_charset(U32 * const flags, const regex_charset cs) + __attribute__nonnull__(1); + +PERL_STATIC_INLINE void +set_regex_charset(U32 * const flags, const regex_charset cs) +{ + /* Sets the character set portion of 'flags' to 'cs', which is a member of + * the above enum */ + + *flags &= ~RXf_PMf_CHARSET; + *flags |= (cs << _RXf_PMf_CHARSET_SHIFT); +} + +PERL_STATIC_INLINE regex_charset +get_regex_charset(const U32 flags) + __attribute__warn_unused_result__; + +PERL_STATIC_INLINE regex_charset +get_regex_charset(const U32 flags) +{ + /* Returns the enum corresponding to the character set in 'flags' */ + + return (flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT; +} /* Next available bit after the above. Name begins with '_' so won't be * exported by B */ @@ -41,7 +80,7 @@ /* Mask of the above bits. These need to be transferred from op_pmflags to * re->extflags during compilation */ -#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE) +#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY) /* These copies need to be numerical or defsubs_h.PL won't know about them. 
*/ #define PMf_MULTILINE 1<<0 @@ -49,9 +88,8 @@ #define PMf_FOLD 1<<2 #define PMf_EXTENDED 1<<3 #define PMf_KEEPCOPY 1<<4 -#define PMf_LOCALE 1<<5 -#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_LOCALE != RXf_PMf_LOCALE +#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY # error RXf_PMf defines are wrong #endif diff --git a/pod/perldelta.pod b/pod/perldelta.pod index ea633cb6e1..3034972fe7 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -435,6 +435,15 @@ fundamentally broken model of how the Unicode non-character code points should be handled, which is now described in L<perlunicode/Non-character code points>. See also L</Selected Bug Fixes>. +=item * + +Certain shared flags in the C<pmop.op_pmflags> and C<regexp.extflags> +structures have been removed. These are: C<RXf_PMf_LOCALE>, +C<RXf_PMf_UNICODE>, and C<PMf_LOCALE>. Instead, the character set is now encoded in a two-bit field, with +three static in-line functions for accessing the information: +C<get_regex_charset()>, C<set_regex_charset()>, and C<get_regex_charset_name()>, +which are defined in the places where the original flags were. 
+ =back =head1 Selected Bug Fixes @@ -5870,7 +5870,7 @@ PP(pp_split) DIE(aTHX_ "panic: pp_split"); rx = PM_GETRE(pm); - TAINT_IF((RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) && + TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET && (RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE))); RX_MATCH_UTF8_set(rx, do_utf8); @@ -5916,7 +5916,7 @@ PP(pp_split) while (*s == ' ' || is_utf8_space((U8*)s)) s += UTF8SKIP(s); } - else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) { + else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) { while (isSPACE_LC(*s)) s++; } @@ -5946,7 +5946,8 @@ PP(pp_split) else m += t; } - } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) { + } + else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) { while (m < strend && !isSPACE_LC(*m)) ++m; } else { @@ -5978,7 +5979,8 @@ PP(pp_split) if (do_utf8) { while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) )) s += UTF8SKIP(s); - } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) { + } + else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) { while (s < strend && isSPACE_LC(*s)) ++s; } else { @@ -369,8 +369,9 @@ static const scan_data_t zero_scan_data = #define SCF_SEEN_ACCEPT 0x8000 #define UTF cBOOL(RExC_utf8) -#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE) -#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE) +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) + #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) #define OOB_UNICODE 12345678 @@ -4479,8 +4480,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* Set to use unicode semantics if the pattern is in utf8 and has the * 'dual' charset specified, as it means unicode when utf8 */ pm_flags = orig_pm_flags; - if (RExC_utf8 && ! 
(pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) { - pm_flags |= RXf_PMf_UNICODE; + + if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) { + set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); } RExC_precomp = exp; @@ -4566,7 +4568,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) r->extflags = pm_flags; { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE)); + bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ @@ -4587,7 +4589,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) * covered by the caret */ const STRLEN wraplen = plen + has_p + has_runon + has_default /* If needs a caret */ - + has_charset /* If needs a character set specifier */ + + /* If needs a character set specifier */ + + ((has_charset) ? 
MAX_CHARSET_NAME_LENGTH : 0) + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); @@ -4601,11 +4605,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) *p++= DEFAULT_PAT_MOD; } if (has_charset) { - if (r->extflags & RXf_PMf_LOCALE) { - *p++ = LOCALE_PAT_MOD; - } else { - *p++ = UNICODE_PAT_MOD; - } + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; } if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ @@ -6300,7 +6303,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) STD_PMMOD_FLAGS_CLEAR(&RExC_flags); if (RExC_utf8) { /* But the default for a utf8 pattern is unicode semantics */ - RExC_flags |= RXf_PMf_UNICODE; + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } goto parse_flags; default: @@ -6310,6 +6313,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; bool has_charset_modifier = 0; + regex_charset cs = REGEX_DEPENDS_CHARSET; while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -6321,16 +6325,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (has_charset_modifier || flagsp == &negflags) { goto fail_modifiers; } - posflags |= RXf_PMf_LOCALE; - negflags |= RXf_PMf_UNICODE; + cs = REGEX_LOCALE_CHARSET; has_charset_modifier = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier || flagsp == &negflags) { goto fail_modifiers; } - posflags |= RXf_PMf_UNICODE; - negflags |= RXf_PMf_LOCALE; + cs = REGEX_UNICODE_CHARSET; has_charset_modifier = 1; break; case DUAL_PAT_MOD: @@ -6344,13 +6346,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* The dual charset means unicode semantics if the * pattern (or target, not known until runtime) are * utf8 */ - if (RExC_utf8) { - posflags |= RXf_PMf_UNICODE; - negflags |= RXf_PMf_LOCALE; - } - else { - negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); - } + cs = 
(RExC_utf8) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; has_charset_modifier = 1; break; case ONCE_PAT_MOD: /* 'o' */ @@ -6411,9 +6409,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); if (paren != ':') { oregflags |= posflags; oregflags &= ~negflags; + set_regex_charset(&oregflags, cs); } nextchar(pRExC_state); if (paren != ':') { @@ -9508,14 +9508,34 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { int bit; int set=0; + regex_charset cs; for (bit=0; bit<32; bit++) { if (flags & (1<<bit)) { + if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ + continue; + } if (!set++ && lead) PerlIO_printf(Perl_debug_log, "%s",lead); PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); } } + if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { + if (!set++ && lead) { + PerlIO_printf(Perl_debug_log, "%s",lead); + } + switch (cs) { + case REGEX_UNICODE_CHARSET: + PerlIO_printf(Perl_debug_log, "UNICODE"); + break; + case REGEX_LOCALE_CHARSET: + PerlIO_printf(Perl_debug_log, "LOCALE"); + break; + default: + PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET"); + break; + } + } if (lead) { if (set) PerlIO_printf(Perl_debug_log, "\n"); @@ -235,7 +235,7 @@ and check for NULL. /* Note, includes locale, unicode */ #define STD_PMMOD_FLAGS_CLEAR(pmfl) \ - *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE) + *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET) /* chars and strings used as regex pattern modifiers * Singular is a 'c'har, plural is a "string" @@ -293,6 +293,32 @@ and check for NULL. 
* unshared area without affecting binary compatibility */ #define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2) +/* embed.pl doesn't yet know how to handle static inline functions, so + manually decorate them here with gcc-style attributes. +*/ +PERL_STATIC_INLINE const char * +get_regex_charset_name(const U32 flags, STRLEN* const lenp) + __attribute__warn_unused_result__; + +#define MAX_CHARSET_NAME_LENGTH 1 + +PERL_STATIC_INLINE const char * +get_regex_charset_name(const U32 flags, STRLEN* const lenp) +{ + /* Returns a string that corresponds to the name of the regex character set + * given by 'flags', and *lenp is set the length of that string, which + * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ + + *lenp = 1; + switch (get_regex_charset(flags)) { + case REGEX_DEPENDS_CHARSET: return DUAL_PAT_MODS; + case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; + case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; + } + + return "?"; /* Unknown */ +} + /* Anchor and GPOS related stuff */ #define RXf_ANCH_BOL (1<<(RXf_BASE_SHIFT+0)) #define RXf_ANCH_MBOL (1<<(RXf_BASE_SHIFT+1)) diff --git a/regnodes.h b/regnodes.h index 296d383152..f0725ba525 100644 --- a/regnodes.h +++ b/regnodes.h @@ -651,8 +651,8 @@ EXTCONST char * const PL_reg_extflags_name[] = { "FOLD", /* 0x00000004 */ "EXTENDED", /* 0x00000008 */ "KEEPCOPY", /* 0x00000010 */ - "LOCALE", /* 0x00000020 */ - "UNICODE", /* 0x00000040 */ + "CHARSET", /* 0x00000060 */ + "CHARSET", /* 0x00000060 */ "UNUSED_BIT_7", /* 0x00000080 */ "UNUSED_BIT_8", /* 0x00000100 */ "ANCH_BOL", /* 0x00000200 */ diff --git a/universal.c b/universal.c index 08f9ab8f51..96a92cf994 100644 --- a/universal.c +++ b/universal.c @@ -1155,8 +1155,7 @@ XS(XS_re_regexp_pattern) if ( GIMME_V == G_ARRAY ) { STRLEN left = 0; - char reflags[sizeof(INT_PAT_MODS) + 1]; /* The +1 is for the charset - modifier */ + char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH]; const char *fptr; char ch; U16 match_flags; @@ -1164,14 +1163,15 @@ 
XS(XS_re_regexp_pattern) /* we are in list context so stringify the modifiers that apply. We ignore "negative - modifiers" in this scenario. + modifiers" in this scenario, and the default character set */ - if (RX_EXTFLAGS(re) & RXf_PMf_LOCALE) { - reflags[left++] = LOCALE_PAT_MOD; - } - else if (RX_EXTFLAGS(re) & RXf_PMf_UNICODE) { - reflags[left++] = UNICODE_PAT_MOD; + if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) { + STRLEN len; + const char* const name = get_regex_charset_name(RX_EXTFLAGS(re), + &len); + Copy(name, reflags + left, len, char); + left += len; } fptr = INT_PAT_MODS; match_flags = (U16)((RX_EXTFLAGS(re) & PMf_COMPILETIME) |