summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/re/re.pm4
-rw-r--r--op.c7
-rw-r--r--op_reg_common.h48
-rw-r--r--pod/perldelta.pod9
-rw-r--r--pp.c10
-rw-r--r--regcomp.c66
-rw-r--r--regexp.h28
-rw-r--r--regnodes.h4
-rw-r--r--universal.c16
9 files changed, 143 insertions, 49 deletions
diff --git a/ext/re/re.pm b/ext/re/re.pm
index 665947e8b7..7a489950f4 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -25,9 +25,9 @@ my %reflags = (
x => 1 << ($PMMOD_SHIFT + 3),
p => 1 << ($PMMOD_SHIFT + 4),
# special cases:
- l => 1 << ($PMMOD_SHIFT + 5),
- u => 1 << ($PMMOD_SHIFT + 6),
d => 0,
+ l => 1,
+ u => 2,
);
sub setcolor {
diff --git a/op.c b/op.c
index 371c1e64ac..fbf3d71dda 100644
--- a/op.c
+++ b/op.c
@@ -3778,10 +3778,10 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
if (PL_hints & HINT_RE_TAINT)
pmop->op_pmflags |= PMf_RETAINT;
if (PL_hints & HINT_LOCALE) {
- pmop->op_pmflags |= PMf_LOCALE;
+ set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET);
}
else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) {
- pmop->op_pmflags |= RXf_PMf_UNICODE;
+ set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET);
}
if (PL_hints & HINT_RE_FLAGS) {
SV *reflags = Perl_refcounted_he_fetch_pvn(aTHX_
@@ -3792,8 +3792,7 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
PL_compiling.cop_hints_hash, STR_WITH_LEN("reflags_charset"), 0, 0
);
if (reflags && SvOK(reflags)) {
- pmop->op_pmflags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
- pmop->op_pmflags |= SvIV(reflags);
+ set_regex_charset(&(pmop->op_pmflags), SvIV(reflags));
}
}
diff --git a/op_reg_common.h b/op_reg_common.h
index 238d7bdb0e..c6d846dd69 100644
--- a/op_reg_common.h
+++ b/op_reg_common.h
@@ -32,8 +32,47 @@
#define RXf_PMf_FOLD (1 << (RXf_PMf_STD_PMMOD_SHIFT+2)) /* /i */
#define RXf_PMf_EXTENDED (1 << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */
#define RXf_PMf_KEEPCOPY (1 << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /p */
-#define RXf_PMf_LOCALE (1 << (RXf_PMf_STD_PMMOD_SHIFT+5))
-#define RXf_PMf_UNICODE (1 << (RXf_PMf_STD_PMMOD_SHIFT+6))
+
+/* The character set for the regex is stored in a field of more than one bit
+ * using an enum, for reasons of compactness and to ensure that the options are
+ * mutually exclusive */
+typedef enum {
+ REGEX_DEPENDS_CHARSET = 0,
+ REGEX_LOCALE_CHARSET,
+ REGEX_UNICODE_CHARSET
+} regex_charset;
+
+#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
+#define RXf_PMf_CHARSET (3 << (_RXf_PMf_CHARSET_SHIFT)) /* 2 bits */
+
+/* embed.pl doesn't yet know how to handle static inline functions, so
+ manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+ __attribute__nonnull__(1);
+
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+{
+ /* Sets the character set portion of 'flags' to 'cs', which is a member of
+ * the above enum */
+
+ *flags &= ~RXf_PMf_CHARSET;
+ *flags |= (cs << _RXf_PMf_CHARSET_SHIFT);
+}
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+ __attribute__warn_unused_result__;
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+{
+ /* Returns the enum corresponding to the character set in 'flags' */
+
+ return (flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT;
+}
/* Next available bit after the above. Name begins with '_' so won't be
* exported by B */
@@ -41,7 +80,7 @@
/* Mask of the above bits. These need to be transferred from op_pmflags to
* re->extflags during compilation */
-#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE)
+#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
/* These copies need to be numerical or defsubs_h.PL won't know about them. */
#define PMf_MULTILINE 1<<0
@@ -49,9 +88,8 @@
#define PMf_FOLD 1<<2
#define PMf_EXTENDED 1<<3
#define PMf_KEEPCOPY 1<<4
-#define PMf_LOCALE 1<<5
-#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_LOCALE != RXf_PMf_LOCALE
+#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY
# error RXf_PMf defines are wrong
#endif
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index ea633cb6e1..3034972fe7 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -435,6 +435,15 @@ fundamentally broken model of how the Unicode non-character code points
should be handled, which is now described in
L<perlunicode/Non-character code points>. See also L</Selected Bug Fixes>.
+=item *
+
+Certain shared flags in the C<pmop.op_pmflags> and C<regexp.extflags>
+structures have been removed. These are: C<Rxf_Pmf_LOCALE>,
+C<Rxf_Pmf_UNICODE>, and C<PMf_LOCALE>. Instead there are encodes and
+three static in-line functions for accessing the information:
+C<get_regex_charset()>, C<set_regex_charset()>, and C<get_regex_charset_name()>,
+which are defined in the places where the orginal flags were.
+
=back
=head1 Selected Bug Fixes
diff --git a/pp.c b/pp.c
index df28740929..026eea11f0 100644
--- a/pp.c
+++ b/pp.c
@@ -5870,7 +5870,7 @@ PP(pp_split)
DIE(aTHX_ "panic: pp_split");
rx = PM_GETRE(pm);
- TAINT_IF((RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) &&
+ TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET &&
(RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE)));
RX_MATCH_UTF8_set(rx, do_utf8);
@@ -5916,7 +5916,7 @@ PP(pp_split)
while (*s == ' ' || is_utf8_space((U8*)s))
s += UTF8SKIP(s);
}
- else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (isSPACE_LC(*s))
s++;
}
@@ -5946,7 +5946,8 @@ PP(pp_split)
else
m += t;
}
- } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ }
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (m < strend && !isSPACE_LC(*m))
++m;
} else {
@@ -5978,7 +5979,8 @@ PP(pp_split)
if (do_utf8) {
while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
s += UTF8SKIP(s);
- } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+ }
+ else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
while (s < strend && isSPACE_LC(*s))
++s;
} else {
diff --git a/regcomp.c b/regcomp.c
index 108430ae1d..c57256d390 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -369,8 +369,9 @@ static const scan_data_t zero_scan_data =
#define SCF_SEEN_ACCEPT 0x8000
#define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
#define OOB_UNICODE 12345678
@@ -4479,8 +4480,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'dual' charset specified, as it means unicode when utf8 */
pm_flags = orig_pm_flags;
- if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
- pm_flags |= RXf_PMf_UNICODE;
+
+ if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
}
RExC_precomp = exp;
@@ -4566,7 +4568,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
r->extflags = pm_flags;
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+ bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
/* The caret is output if there are any defaults: if not all the STD
* flags are set, or if no character set specifier is needed */
@@ -4587,7 +4589,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
* covered by the caret */
const STRLEN wraplen = plen + has_p + has_runon
+ has_default /* If needs a caret */
- + has_charset /* If needs a character set specifier */
+
+ /* If needs a character set specifier */
+ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
@@ -4601,11 +4605,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
*p++= DEFAULT_PAT_MOD;
}
if (has_charset) {
- if (r->extflags & RXf_PMf_LOCALE) {
- *p++ = LOCALE_PAT_MOD;
- } else {
- *p++ = UNICODE_PAT_MOD;
- }
+ STRLEN len;
+ const char* const name = get_regex_charset_name(r->extflags, &len);
+ Copy(name, p, len, char);
+ p += len;
}
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
@@ -6300,7 +6303,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
if (RExC_utf8) { /* But the default for a utf8 pattern is
unicode semantics */
- RExC_flags |= RXf_PMf_UNICODE;
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
goto parse_flags;
default:
@@ -6310,6 +6313,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
bool has_charset_modifier = 0;
+ regex_charset cs = REGEX_DEPENDS_CHARSET;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
@@ -6321,16 +6325,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_LOCALE;
- negflags |= RXf_PMf_UNICODE;
+ cs = REGEX_LOCALE_CHARSET;
has_charset_modifier = 1;
break;
case UNICODE_PAT_MOD:
if (has_charset_modifier || flagsp == &negflags) {
goto fail_modifiers;
}
- posflags |= RXf_PMf_UNICODE;
- negflags |= RXf_PMf_LOCALE;
+ cs = REGEX_UNICODE_CHARSET;
has_charset_modifier = 1;
break;
case DUAL_PAT_MOD:
@@ -6344,13 +6346,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* The dual charset means unicode semantics if the
* pattern (or target, not known until runtime) are
* utf8 */
- if (RExC_utf8) {
- posflags |= RXf_PMf_UNICODE;
- negflags |= RXf_PMf_LOCALE;
- }
- else {
- negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
- }
+ cs = (RExC_utf8)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
has_charset_modifier = 1;
break;
case ONCE_PAT_MOD: /* 'o' */
@@ -6411,9 +6409,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
case ')':
RExC_flags |= posflags;
RExC_flags &= ~negflags;
+ set_regex_charset(&RExC_flags, cs);
if (paren != ':') {
oregflags |= posflags;
oregflags &= ~negflags;
+ set_regex_charset(&oregflags, cs);
}
nextchar(pRExC_state);
if (paren != ':') {
@@ -9508,14 +9508,34 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
{
int bit;
int set=0;
+ regex_charset cs;
for (bit=0; bit<32; bit++) {
if (flags & (1<<bit)) {
+ if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
+ continue;
+ }
if (!set++ && lead)
PerlIO_printf(Perl_debug_log, "%s",lead);
PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
}
}
+ if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+ if (!set++ && lead) {
+ PerlIO_printf(Perl_debug_log, "%s",lead);
+ }
+ switch (cs) {
+ case REGEX_UNICODE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "UNICODE");
+ break;
+ case REGEX_LOCALE_CHARSET:
+ PerlIO_printf(Perl_debug_log, "LOCALE");
+ break;
+ default:
+ PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+ break;
+ }
+ }
if (lead) {
if (set)
PerlIO_printf(Perl_debug_log, "\n");
diff --git a/regexp.h b/regexp.h
index a695389501..b6dec7193f 100644
--- a/regexp.h
+++ b/regexp.h
@@ -235,7 +235,7 @@ and check for NULL.
/* Note, includes locale, unicode */
#define STD_PMMOD_FLAGS_CLEAR(pmfl) \
- *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE)
+ *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
/* chars and strings used as regex pattern modifiers
* Singular is a 'c'har, plural is a "string"
@@ -293,6 +293,32 @@ and check for NULL.
* unshared area without affecting binary compatibility */
#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2)
+/* embed.pl doesn't yet know how to handle static inline functions, so
+ manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+ __attribute__warn_unused_result__;
+
+#define MAX_CHARSET_NAME_LENGTH 1
+
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+{
+ /* Returns a string that corresponds to the name of the regex character set
+ * given by 'flags', and *lenp is set the length of that string, which
+ * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
+
+ *lenp = 1;
+ switch (get_regex_charset(flags)) {
+ case REGEX_DEPENDS_CHARSET: return DUAL_PAT_MODS;
+ case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
+ case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+ }
+
+ return "?"; /* Unknown */
+}
+
/* Anchor and GPOS related stuff */
#define RXf_ANCH_BOL (1<<(RXf_BASE_SHIFT+0))
#define RXf_ANCH_MBOL (1<<(RXf_BASE_SHIFT+1))
diff --git a/regnodes.h b/regnodes.h
index 296d383152..f0725ba525 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -651,8 +651,8 @@ EXTCONST char * const PL_reg_extflags_name[] = {
"FOLD", /* 0x00000004 */
"EXTENDED", /* 0x00000008 */
"KEEPCOPY", /* 0x00000010 */
- "LOCALE", /* 0x00000020 */
- "UNICODE", /* 0x00000040 */
+ "CHARSET", /* 0x00000060 */
+ "CHARSET", /* 0x00000060 */
"UNUSED_BIT_7", /* 0x00000080 */
"UNUSED_BIT_8", /* 0x00000100 */
"ANCH_BOL", /* 0x00000200 */
diff --git a/universal.c b/universal.c
index 08f9ab8f51..96a92cf994 100644
--- a/universal.c
+++ b/universal.c
@@ -1155,8 +1155,7 @@ XS(XS_re_regexp_pattern)
if ( GIMME_V == G_ARRAY ) {
STRLEN left = 0;
- char reflags[sizeof(INT_PAT_MODS) + 1]; /* The +1 is for the charset
- modifier */
+ char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
const char *fptr;
char ch;
U16 match_flags;
@@ -1164,14 +1163,15 @@ XS(XS_re_regexp_pattern)
/*
we are in list context so stringify
the modifiers that apply. We ignore "negative
- modifiers" in this scenario.
+ modifiers" in this scenario, and the default character set
*/
- if (RX_EXTFLAGS(re) & RXf_PMf_LOCALE) {
- reflags[left++] = LOCALE_PAT_MOD;
- }
- else if (RX_EXTFLAGS(re) & RXf_PMf_UNICODE) {
- reflags[left++] = UNICODE_PAT_MOD;
+ if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
+ STRLEN len;
+ const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
+ &len);
+ Copy(name, reflags + left, len, char);
+ left += len;
}
fptr = INT_PAT_MODS;
match_flags = (U16)((RX_EXTFLAGS(re) & PMf_COMPILETIME)