Use multi-bit field for regex character set

The /d, /l, and /u regex modifiers are mutually exclusive. This patch changes the field that stores the character set to use more than one bit with an enum determining which one. This data structure more closely follows the semantics of their being mutually exclusive, and conserves bits as well, and is better expandable. A small API is added to set and query the bit field. This patch is not .xs source backwards compatible. A handful of cpan programs are affected.
author: Karl Williamson <public@khwilliamson.com> 2010-12-26 10:31:16 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-01-16 16:36:43 -0700
commit: a62b1201c068dc7b099bcb7182e188c4d2fbf34c (patch)
tree: 6c067a6e4adc8f2333b749fa3592c2812e711b95
parent: 5458d9a05ef8545ccbb8a58e670fbede60d10480 (diff)
download: perl-a62b1201c068dc7b099bcb7182e188c4d2fbf34c.tar.gz
9 files changed, 143 insertions, 49 deletions
diff --git a/ext/re/re.pm b/ext/re/re.pm
index 665947e8b7..7a489950f4 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -25,9 +25,9 @@ my %reflags = (
     x => 1 << ($PMMOD_SHIFT + 3),
     p => 1 << ($PMMOD_SHIFT + 4),
 # special cases:
-    l => 1 << ($PMMOD_SHIFT + 5),
-    u => 1 << ($PMMOD_SHIFT + 6),
     d => 0,
+    l => 1,
+    u => 2,
 );
 
 sub setcolor {
diff --git a/op.c b/op.c
index 371c1e64ac..fbf3d71dda 100644
--- a/op.c
+++ b/op.c
@@ -3778,10 +3778,10 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
     if (PL_hints & HINT_RE_TAINT)
 	pmop->op_pmflags |= PMf_RETAINT;
     if (PL_hints & HINT_LOCALE) {
-	pmop->op_pmflags |= PMf_LOCALE;
+	set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET);
     }
     else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) {
-        pmop->op_pmflags |= RXf_PMf_UNICODE;
+	set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET);
     }
     if (PL_hints & HINT_RE_FLAGS) {
         SV *reflags = Perl_refcounted_he_fetch_pvn(aTHX_
@@ -3792,8 +3792,7 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
          PL_compiling.cop_hints_hash, STR_WITH_LEN("reflags_charset"), 0, 0
         );
         if (reflags && SvOK(reflags)) {
-            pmop->op_pmflags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
-            pmop->op_pmflags |= SvIV(reflags);
+            set_regex_charset(&(pmop->op_pmflags), SvIV(reflags));
         }
     }
 
diff --git a/op_reg_common.h b/op_reg_common.h
index 238d7bdb0e..c6d846dd69 100644
--- a/op_reg_common.h
+++ b/op_reg_common.h
@@ -32,8 +32,47 @@
 #define RXf_PMf_FOLD           (1 << (RXf_PMf_STD_PMMOD_SHIFT+2))    /* /i */
 #define RXf_PMf_EXTENDED       (1 << (RXf_PMf_STD_PMMOD_SHIFT+3))    /* /x */
 #define RXf_PMf_KEEPCOPY       (1 << (RXf_PMf_STD_PMMOD_SHIFT+4))    /* /p */
-#define RXf_PMf_LOCALE         (1 << (RXf_PMf_STD_PMMOD_SHIFT+5))
-#define RXf_PMf_UNICODE        (1 << (RXf_PMf_STD_PMMOD_SHIFT+6))
+
+/* The character set for the regex is stored in a field of more than one bit
+ * using an enum, for reasons of compactness and to ensure that the options are
+ * mutually exclusive */
+typedef enum {
+    REGEX_DEPENDS_CHARSET = 0,
+    REGEX_LOCALE_CHARSET,
+    REGEX_UNICODE_CHARSET
+} regex_charset;
+
+#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
+#define RXf_PMf_CHARSET (3 << (_RXf_PMf_CHARSET_SHIFT)) /* 2 bits */
+
+/* embed.pl doesn't yet know how to handle static inline functions, so
+   manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+    __attribute__nonnull__(1);
+
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+{
+    /* Sets the character set portion of 'flags' to 'cs', which is a member of
+     * the above enum */
+
+    *flags &= ~RXf_PMf_CHARSET;
+    *flags |= (cs << _RXf_PMf_CHARSET_SHIFT);
+}
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+    __attribute__warn_unused_result__;
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+{
+    /* Returns the enum corresponding to the character set in 'flags' */
+
+    return (flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT;
+}
 
 /* Next available bit after the above.  Name begins with '_' so won't be
  * exported by B */
@@ -41,7 +80,7 @@
 
 /* Mask of the above bits.  These need to be transferred from op_pmflags to
  * re->extflags during compilation */
-#define RXf_PMf_COMPILETIME    (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE)
+#define RXf_PMf_COMPILETIME    (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
 
 /* These copies need to be numerical or defsubs_h.PL won't know about them. */
 #define PMf_MULTILINE    1<<0
@@ -49,9 +88,8 @@
 #define PMf_FOLD         1<<2
 #define PMf_EXTENDED     1<<3
 #define PMf_KEEPCOPY     1<<4
-#define PMf_LOCALE       1<<5
 
-#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_LOCALE != RXf_PMf_LOCALE
+#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY
 #   error RXf_PMf defines are wrong
 #endif
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index ea633cb6e1..3034972fe7 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -435,6 +435,15 @@ fundamentally broken model of how the Unicode non-character code points
 should be handled, which is now described in
 L<perlunicode/Non-character code points>.  See also L</Selected Bug Fixes>.
 
+=item *
+
+Certain shared flags in the C<pmop.op_pmflags> and C<regexp.extflags>
+structures have been removed.  These are: C<Rxf_Pmf_LOCALE>,
+C<Rxf_Pmf_UNICODE>, and C<PMf_LOCALE>.  Instead there are encodes and
+three static in-line functions for accessing the information:
+C<get_regex_charset()>, C<set_regex_charset()>, and C<get_regex_charset_name()>,
+which are defined in the places where the orginal flags were.
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/pp.c b/pp.c
index df28740929..026eea11f0 100644
--- a/pp.c
+++ b/pp.c
@@ -5870,7 +5870,7 @@ PP(pp_split)
 	DIE(aTHX_ "panic: pp_split");
     rx = PM_GETRE(pm);
 
-    TAINT_IF((RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) &&
+    TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET &&
 	     (RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE)));
 
     RX_MATCH_UTF8_set(rx, do_utf8);
@@ -5916,7 +5916,7 @@ PP(pp_split)
 	    while (*s == ' ' || is_utf8_space((U8*)s))
 		s += UTF8SKIP(s);
 	}
-	else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	    while (isSPACE_LC(*s))
 		s++;
 	}
@@ -5946,7 +5946,8 @@ PP(pp_split)
 		    else
 			m += t;
 		}
-            } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	    }
+	    else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	        while (m < strend && !isSPACE_LC(*m))
 		    ++m;
             } else {
@@ -5978,7 +5979,8 @@ PP(pp_split)
 	    if (do_utf8) {
 		while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
 	            s +=  UTF8SKIP(s);
-            } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	    }
+	    else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	        while (s < strend && isSPACE_LC(*s))
 		    ++s;
             } else {
diff --git a/regcomp.c b/regcomp.c
index 108430ae1d..c57256d390 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -369,8 +369,9 @@ static const scan_data_t zero_scan_data =
 #define SCF_SEEN_ACCEPT         0x8000 
 
 #define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+
 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
 #define OOB_UNICODE		12345678
@@ -4479,8 +4480,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     /* Set to use unicode semantics if the pattern is in utf8 and has the
      * 'dual' charset specified, as it means unicode when utf8  */
     pm_flags = orig_pm_flags;
-    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
-	pm_flags |= RXf_PMf_UNICODE;
+
+    if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
     }
 
     RExC_precomp = exp;
@@ -4566,7 +4568,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+        bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
 
         /* The caret is output if there are any defaults: if not all the STD
          * flags are set, or if no character set specifier is needed */
@@ -4587,7 +4589,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
          * covered by the caret */
 	const STRLEN wraplen = plen + has_p + has_runon
             + has_default       /* If needs a caret */
-            + has_charset       /* If needs a character set specifier */
+
+		/* If needs a character set specifier */
+	    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
@@ -4601,11 +4605,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
             *p++= DEFAULT_PAT_MOD;
         }
         if (has_charset) {
-            if (r->extflags & RXf_PMf_LOCALE) {
-                *p++ = LOCALE_PAT_MOD;
-            } else {
-                *p++ = UNICODE_PAT_MOD;
-            }
+	    STRLEN len;
+	    const char* const name = get_regex_charset_name(r->extflags, &len);
+	    Copy(name, p, len, char);
+	    p += len;
         }
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
@@ -6300,7 +6303,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
 		if (RExC_utf8) {    /* But the default for a utf8 pattern is
 				       unicode semantics */
-		    RExC_flags |= RXf_PMf_UNICODE;
+		    set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
 		}
                 goto parse_flags;
 	    default:
@@ -6310,6 +6313,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 U32 posflags = 0, negflags = 0;
 	        U32 *flagsp = &posflags;
                 bool has_charset_modifier = 0;
+		regex_charset cs = REGEX_DEPENDS_CHARSET;
 
 		while (*RExC_parse) {
 		    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6321,16 +6325,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         if (has_charset_modifier || flagsp == &negflags) {
                             goto fail_modifiers;
                         }
-                        posflags |= RXf_PMf_LOCALE;
-                        negflags |= RXf_PMf_UNICODE;
+			cs = REGEX_LOCALE_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case UNICODE_PAT_MOD:
                         if (has_charset_modifier || flagsp == &negflags) {
                             goto fail_modifiers;
                         }
-                        posflags |= RXf_PMf_UNICODE;
-                        negflags |= RXf_PMf_LOCALE;
+			cs = REGEX_UNICODE_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case DUAL_PAT_MOD:
@@ -6344,13 +6346,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 			/* The dual charset means unicode semantics if the
 			 * pattern (or target, not known until runtime) are
 			 * utf8 */
-			if (RExC_utf8) {
-			    posflags |= RXf_PMf_UNICODE;
-			    negflags |= RXf_PMf_LOCALE;
-			}
-			else {
-			    negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
-			}
+			cs = (RExC_utf8)
+			     ? REGEX_UNICODE_CHARSET
+			     : REGEX_DEPENDS_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case ONCE_PAT_MOD: /* 'o' */
@@ -6411,9 +6409,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     case ')':
                         RExC_flags |= posflags;
                         RExC_flags &= ~negflags;
+			set_regex_charset(&RExC_flags, cs);
                         if (paren != ':') {
                             oregflags |= posflags;
                             oregflags &= ~negflags;
+			    set_regex_charset(&oregflags, cs);
                         }
                         nextchar(pRExC_state);
 		        if (paren != ':') {
@@ -9508,14 +9508,34 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
 {
     int bit;
     int set=0;
+    regex_charset cs;
 
     for (bit=0; bit<32; bit++) {
         if (flags & (1<<bit)) {
+	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
+		continue;
+	    }
             if (!set++ && lead) 
                 PerlIO_printf(Perl_debug_log, "%s",lead);
             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
         }	        
     }	   
+    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+            if (!set++ && lead) {
+                PerlIO_printf(Perl_debug_log, "%s",lead);
+            }
+            switch (cs) {
+                case REGEX_UNICODE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "UNICODE");
+                    break;
+                case REGEX_LOCALE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "LOCALE");
+                    break;
+                default:
+                    PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+                    break;
+            }
+    }
     if (lead)  {
         if (set) 
             PerlIO_printf(Perl_debug_log, "\n");
diff --git a/regexp.h b/regexp.h
index a695389501..b6dec7193f 100644
--- a/regexp.h
+++ b/regexp.h
@@ -235,7 +235,7 @@ and check for NULL.
 
 /* Note, includes locale, unicode */
 #define STD_PMMOD_FLAGS_CLEAR(pmfl)                        \
-    *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE)
+    *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
 
 /* chars and strings used as regex pattern modifiers
  * Singular is a 'c'har, plural is a "string"
@@ -293,6 +293,32 @@ and check for NULL.
  * unshared area without affecting binary compatibility */
 #define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2)
 
+/* embed.pl doesn't yet know how to handle static inline functions, so
+   manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+    __attribute__warn_unused_result__;
+
+#define MAX_CHARSET_NAME_LENGTH 1
+
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+{
+    /* Returns a string that corresponds to the name of the regex character set
+     * given by 'flags', and *lenp is set the length of that string, which
+     * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
+
+    *lenp = 1;
+    switch (get_regex_charset(flags)) {
+        case REGEX_DEPENDS_CHARSET: return DUAL_PAT_MODS;
+        case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
+        case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+    }
+
+    return "?";	    /* Unknown */
+}
+
 /* Anchor and GPOS related stuff */
 #define RXf_ANCH_BOL    	(1<<(RXf_BASE_SHIFT+0))
 #define RXf_ANCH_MBOL   	(1<<(RXf_BASE_SHIFT+1))
diff --git a/regnodes.h b/regnodes.h
index 296d383152..f0725ba525 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -651,8 +651,8 @@ EXTCONST char * const PL_reg_extflags_name[] = {
 	"FOLD",             /* 0x00000004 */
 	"EXTENDED",         /* 0x00000008 */
 	"KEEPCOPY",         /* 0x00000010 */
-	"LOCALE",           /* 0x00000020 */
-	"UNICODE",          /* 0x00000040 */
+	"CHARSET",          /* 0x00000060 */
+	"CHARSET",          /* 0x00000060 */
 	"UNUSED_BIT_7",     /* 0x00000080 */
 	"UNUSED_BIT_8",     /* 0x00000100 */
 	"ANCH_BOL",         /* 0x00000200 */
diff --git a/universal.c b/universal.c
index 08f9ab8f51..96a92cf994 100644
--- a/universal.c
+++ b/universal.c
@@ -1155,8 +1155,7 @@ XS(XS_re_regexp_pattern)
 
         if ( GIMME_V == G_ARRAY ) {
 	    STRLEN left = 0;
-	    char reflags[sizeof(INT_PAT_MODS) + 1]; /* The +1 is for the charset
-						        modifier */
+	    char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
             const char *fptr;
             char ch;
             U16 match_flags;
@@ -1164,14 +1163,15 @@ XS(XS_re_regexp_pattern)
             /*
                we are in list context so stringify
                the modifiers that apply. We ignore "negative
-               modifiers" in this scenario.
+               modifiers" in this scenario, and the default character set
             */
 
-            if (RX_EXTFLAGS(re) & RXf_PMf_LOCALE) {
-		reflags[left++] = LOCALE_PAT_MOD;
-	    }
-	    else if (RX_EXTFLAGS(re) & RXf_PMf_UNICODE) {
-		reflags[left++] = UNICODE_PAT_MOD;
+	    if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
+		STRLEN len;
+		const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
+								&len);
+		Copy(name, reflags + left, len, char);
+		left += len;
 	    }
             fptr = INT_PAT_MODS;
             match_flags = (U16)((RX_EXTFLAGS(re) & PMf_COMPILETIME)
author	Karl Williamson <public@khwilliamson.com>	2010-12-26 10:31:16 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-01-16 16:36:43 -0700
commit	a62b1201c068dc7b099bcb7182e188c4d2fbf34c (patch)
tree	6c067a6e4adc8f2333b749fa3592c2812e711b95
parent	5458d9a05ef8545ccbb8a58e670fbede60d10480 (diff)
download	perl-a62b1201c068dc7b099bcb7182e188c4d2fbf34c.tar.gz