9 files changed, 143 insertions, 49 deletions
diff --git a/ext/re/re.pm b/ext/re/re.pm
index 665947e8b7..7a489950f4 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -25,9 +25,9 @@ my %reflags = (
     x => 1 << ($PMMOD_SHIFT + 3),
     p => 1 << ($PMMOD_SHIFT + 4),
 # special cases:
-    l => 1 << ($PMMOD_SHIFT + 5),
-    u => 1 << ($PMMOD_SHIFT + 6),
     d => 0,
+    l => 1,
+    u => 2,
 );
 
 sub setcolor {
diff --git a/op.c b/op.c
index 371c1e64ac..fbf3d71dda 100644
--- a/op.c
+++ b/op.c
@@ -3778,10 +3778,10 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
     if (PL_hints & HINT_RE_TAINT)
 	pmop->op_pmflags |= PMf_RETAINT;
     if (PL_hints & HINT_LOCALE) {
-	pmop->op_pmflags |= PMf_LOCALE;
+	set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET);
     }
     else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) {
-        pmop->op_pmflags |= RXf_PMf_UNICODE;
+	set_regex_charset(&(pmop->op_pmflags), REGEX_UNICODE_CHARSET);
     }
     if (PL_hints & HINT_RE_FLAGS) {
         SV *reflags = Perl_refcounted_he_fetch_pvn(aTHX_
@@ -3792,8 +3792,7 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
          PL_compiling.cop_hints_hash, STR_WITH_LEN("reflags_charset"), 0, 0
         );
         if (reflags && SvOK(reflags)) {
-            pmop->op_pmflags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
-            pmop->op_pmflags |= SvIV(reflags);
+            set_regex_charset(&(pmop->op_pmflags), SvIV(reflags));
         }
     }
 
diff --git a/op_reg_common.h b/op_reg_common.h
index 238d7bdb0e..c6d846dd69 100644
--- a/op_reg_common.h
+++ b/op_reg_common.h
@@ -32,8 +32,47 @@
 #define RXf_PMf_FOLD           (1 << (RXf_PMf_STD_PMMOD_SHIFT+2))    /* /i */
 #define RXf_PMf_EXTENDED       (1 << (RXf_PMf_STD_PMMOD_SHIFT+3))    /* /x */
 #define RXf_PMf_KEEPCOPY       (1 << (RXf_PMf_STD_PMMOD_SHIFT+4))    /* /p */
-#define RXf_PMf_LOCALE         (1 << (RXf_PMf_STD_PMMOD_SHIFT+5))
-#define RXf_PMf_UNICODE        (1 << (RXf_PMf_STD_PMMOD_SHIFT+6))
+
+/* The character set for the regex is stored in a field of more than one bit
+ * using an enum, for reasons of compactness and to ensure that the options are
+ * mutually exclusive */
+typedef enum {
+    REGEX_DEPENDS_CHARSET = 0,
+    REGEX_LOCALE_CHARSET,
+    REGEX_UNICODE_CHARSET
+} regex_charset;
+
+#define _RXf_PMf_CHARSET_SHIFT ((RXf_PMf_STD_PMMOD_SHIFT)+5)
+#define RXf_PMf_CHARSET (3 << (_RXf_PMf_CHARSET_SHIFT)) /* 2 bits */
+
+/* embed.pl doesn't yet know how to handle static inline functions, so
+   manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+    __attribute__nonnull__(1);
+
+PERL_STATIC_INLINE void
+set_regex_charset(U32 * const flags, const regex_charset cs)
+{
+    /* Sets the character set portion of 'flags' to 'cs', which is a member of
+     * the above enum */
+
+    *flags &= ~RXf_PMf_CHARSET;
+    *flags |= (cs << _RXf_PMf_CHARSET_SHIFT);
+}
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+    __attribute__warn_unused_result__;
+
+PERL_STATIC_INLINE regex_charset
+get_regex_charset(const U32 flags)
+{
+    /* Returns the enum corresponding to the character set in 'flags' */
+
+    return (flags & RXf_PMf_CHARSET) >> _RXf_PMf_CHARSET_SHIFT;
+}
 
 /* Next available bit after the above.  Name begins with '_' so won't be
  * exported by B */
@@ -41,7 +80,7 @@
 
 /* Mask of the above bits.  These need to be transferred from op_pmflags to
  * re->extflags during compilation */
-#define RXf_PMf_COMPILETIME    (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE)
+#define RXf_PMf_COMPILETIME    (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_CHARSET|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
 
 /* These copies need to be numerical or defsubs_h.PL won't know about them. */
 #define PMf_MULTILINE    1<<0
@@ -49,9 +88,8 @@
 #define PMf_FOLD         1<<2
 #define PMf_EXTENDED     1<<3
 #define PMf_KEEPCOPY     1<<4
-#define PMf_LOCALE       1<<5
 
-#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY || PMf_LOCALE != RXf_PMf_LOCALE
+#if PMf_MULTILINE != RXf_PMf_MULTILINE || PMf_SINGLELINE != RXf_PMf_SINGLELINE || PMf_FOLD != RXf_PMf_FOLD || PMf_EXTENDED != RXf_PMf_EXTENDED || PMf_KEEPCOPY != RXf_PMf_KEEPCOPY
 #   error RXf_PMf defines are wrong
 #endif
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index ea633cb6e1..3034972fe7 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -435,6 +435,15 @@ fundamentally broken model of how the Unicode non-character code points
 should be handled, which is now described in
 L<perlunicode/Non-character code points>.  See also L</Selected Bug Fixes>.
 
+=item *
+
+Certain shared flags in the C<pmop.op_pmflags> and C<regexp.extflags>
+structures have been removed.  These are: C<Rxf_Pmf_LOCALE>,
+C<Rxf_Pmf_UNICODE>, and C<PMf_LOCALE>.  Instead there are encodes and
+three static in-line functions for accessing the information:
+C<get_regex_charset()>, C<set_regex_charset()>, and C<get_regex_charset_name()>,
+which are defined in the places where the orginal flags were.
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/pp.c b/pp.c
index df28740929..026eea11f0 100644
--- a/pp.c
+++ b/pp.c
@@ -5870,7 +5870,7 @@ PP(pp_split)
 	DIE(aTHX_ "panic: pp_split");
     rx = PM_GETRE(pm);
 
-    TAINT_IF((RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) &&
+    TAINT_IF(get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET &&
 	     (RX_EXTFLAGS(rx) & (RXf_WHITE | RXf_SKIPWHITE)));
 
     RX_MATCH_UTF8_set(rx, do_utf8);
@@ -5916,7 +5916,7 @@ PP(pp_split)
 	    while (*s == ' ' || is_utf8_space((U8*)s))
 		s += UTF8SKIP(s);
 	}
-	else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	    while (isSPACE_LC(*s))
 		s++;
 	}
@@ -5946,7 +5946,8 @@ PP(pp_split)
 		    else
 			m += t;
 		}
-            } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	    }
+	    else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	        while (m < strend && !isSPACE_LC(*m))
 		    ++m;
             } else {
@@ -5978,7 +5979,8 @@ PP(pp_split)
 	    if (do_utf8) {
 		while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
 	            s +=  UTF8SKIP(s);
-            } else if (RX_EXTFLAGS(rx) & RXf_PMf_LOCALE) {
+	    }
+	    else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
 	        while (s < strend && isSPACE_LC(*s))
 		    ++s;
             } else {
diff --git a/regcomp.c b/regcomp.c
index 108430ae1d..c57256d390 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -369,8 +369,9 @@ static const scan_data_t zero_scan_data =
 #define SCF_SEEN_ACCEPT         0x8000 
 
 #define UTF cBOOL(RExC_utf8)
-#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
-#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
+#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+
 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 
 #define OOB_UNICODE		12345678
@@ -4479,8 +4480,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     /* Set to use unicode semantics if the pattern is in utf8 and has the
      * 'dual' charset specified, as it means unicode when utf8  */
     pm_flags = orig_pm_flags;
-    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
-	pm_flags |= RXf_PMf_UNICODE;
+
+    if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+	set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
     }
 
     RExC_precomp = exp;
@@ -4566,7 +4568,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     r->extflags = pm_flags;
     {
         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-        bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
+        bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
 
         /* The caret is output if there are any defaults: if not all the STD
          * flags are set, or if no character set specifier is needed */
@@ -4587,7 +4589,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
          * covered by the caret */
 	const STRLEN wraplen = plen + has_p + has_runon
             + has_default       /* If needs a caret */
-            + has_charset       /* If needs a character set specifier */
+
+		/* If needs a character set specifier */
+	    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
             + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
@@ -4601,11 +4605,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
             *p++= DEFAULT_PAT_MOD;
         }
         if (has_charset) {
-            if (r->extflags & RXf_PMf_LOCALE) {
-                *p++ = LOCALE_PAT_MOD;
-            } else {
-                *p++ = UNICODE_PAT_MOD;
-            }
+	    STRLEN len;
+	    const char* const name = get_regex_charset_name(r->extflags, &len);
+	    Copy(name, p, len, char);
+	    p += len;
         }
         if (has_p)
             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
@@ -6300,7 +6303,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
 		if (RExC_utf8) {    /* But the default for a utf8 pattern is
 				       unicode semantics */
-		    RExC_flags |= RXf_PMf_UNICODE;
+		    set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
 		}
                 goto parse_flags;
 	    default:
@@ -6310,6 +6313,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 U32 posflags = 0, negflags = 0;
 	        U32 *flagsp = &posflags;
                 bool has_charset_modifier = 0;
+		regex_charset cs = REGEX_DEPENDS_CHARSET;
 
 		while (*RExC_parse) {
 		    /* && strchr("iogcmsx", *RExC_parse) */
@@ -6321,16 +6325,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         if (has_charset_modifier || flagsp == &negflags) {
                             goto fail_modifiers;
                         }
-                        posflags |= RXf_PMf_LOCALE;
-                        negflags |= RXf_PMf_UNICODE;
+			cs = REGEX_LOCALE_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case UNICODE_PAT_MOD:
                         if (has_charset_modifier || flagsp == &negflags) {
                             goto fail_modifiers;
                         }
-                        posflags |= RXf_PMf_UNICODE;
-                        negflags |= RXf_PMf_LOCALE;
+			cs = REGEX_UNICODE_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case DUAL_PAT_MOD:
@@ -6344,13 +6346,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 			/* The dual charset means unicode semantics if the
 			 * pattern (or target, not known until runtime) are
 			 * utf8 */
-			if (RExC_utf8) {
-			    posflags |= RXf_PMf_UNICODE;
-			    negflags |= RXf_PMf_LOCALE;
-			}
-			else {
-			    negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
-			}
+			cs = (RExC_utf8)
+			     ? REGEX_UNICODE_CHARSET
+			     : REGEX_DEPENDS_CHARSET;
                         has_charset_modifier = 1;
                         break;
                     case ONCE_PAT_MOD: /* 'o' */
@@ -6411,9 +6409,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     case ')':
                         RExC_flags |= posflags;
                         RExC_flags &= ~negflags;
+			set_regex_charset(&RExC_flags, cs);
                         if (paren != ':') {
                             oregflags |= posflags;
                             oregflags &= ~negflags;
+			    set_regex_charset(&oregflags, cs);
                         }
                         nextchar(pRExC_state);
 		        if (paren != ':') {
@@ -9508,14 +9508,34 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
 {
     int bit;
     int set=0;
+    regex_charset cs;
 
     for (bit=0; bit<32; bit++) {
         if (flags & (1<<bit)) {
+	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
+		continue;
+	    }
             if (!set++ && lead) 
                 PerlIO_printf(Perl_debug_log, "%s",lead);
             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
         }	        
     }	   
+    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+            if (!set++ && lead) {
+                PerlIO_printf(Perl_debug_log, "%s",lead);
+            }
+            switch (cs) {
+                case REGEX_UNICODE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "UNICODE");
+                    break;
+                case REGEX_LOCALE_CHARSET:
+                    PerlIO_printf(Perl_debug_log, "LOCALE");
+                    break;
+                default:
+                    PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
+                    break;
+            }
+    }
     if (lead)  {
         if (set) 
             PerlIO_printf(Perl_debug_log, "\n");
diff --git a/regexp.h b/regexp.h
index a695389501..b6dec7193f 100644
--- a/regexp.h
+++ b/regexp.h
@@ -235,7 +235,7 @@ and check for NULL.
 
 /* Note, includes locale, unicode */
 #define STD_PMMOD_FLAGS_CLEAR(pmfl)                        \
-    *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE)
+    *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
 
 /* chars and strings used as regex pattern modifiers
  * Singular is a 'c'har, plural is a "string"
@@ -293,6 +293,32 @@ and check for NULL.
  * unshared area without affecting binary compatibility */
 #define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2)
 
+/* embed.pl doesn't yet know how to handle static inline functions, so
+   manually decorate them here with gcc-style attributes.
+*/
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+    __attribute__warn_unused_result__;
+
+#define MAX_CHARSET_NAME_LENGTH 1
+
+PERL_STATIC_INLINE const char *
+get_regex_charset_name(const U32 flags, STRLEN* const lenp)
+{
+    /* Returns a string that corresponds to the name of the regex character set
+     * given by 'flags', and *lenp is set the length of that string, which
+     * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
+
+    *lenp = 1;
+    switch (get_regex_charset(flags)) {
+        case REGEX_DEPENDS_CHARSET: return DUAL_PAT_MODS;
+        case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
+        case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
+    }
+
+    return "?";	    /* Unknown */
+}
+
 /* Anchor and GPOS related stuff */
 #define RXf_ANCH_BOL    	(1<<(RXf_BASE_SHIFT+0))
 #define RXf_ANCH_MBOL   	(1<<(RXf_BASE_SHIFT+1))
diff --git a/regnodes.h b/regnodes.h
index 296d383152..f0725ba525 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -651,8 +651,8 @@ EXTCONST char * const PL_reg_extflags_name[] = {
 	"FOLD",             /* 0x00000004 */
 	"EXTENDED",         /* 0x00000008 */
 	"KEEPCOPY",         /* 0x00000010 */
-	"LOCALE",           /* 0x00000020 */
-	"UNICODE",          /* 0x00000040 */
+	"CHARSET",          /* 0x00000060 */
+	"CHARSET",          /* 0x00000060 */
 	"UNUSED_BIT_7",     /* 0x00000080 */
 	"UNUSED_BIT_8",     /* 0x00000100 */
 	"ANCH_BOL",         /* 0x00000200 */
diff --git a/universal.c b/universal.c
index 08f9ab8f51..96a92cf994 100644
--- a/universal.c
+++ b/universal.c
@@ -1155,8 +1155,7 @@ XS(XS_re_regexp_pattern)
 
         if ( GIMME_V == G_ARRAY ) {
 	    STRLEN left = 0;
-	    char reflags[sizeof(INT_PAT_MODS) + 1]; /* The +1 is for the charset
-						        modifier */
+	    char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
             const char *fptr;
             char ch;
             U16 match_flags;
@@ -1164,14 +1163,15 @@ XS(XS_re_regexp_pattern)
             /*
                we are in list context so stringify
                the modifiers that apply. We ignore "negative
-               modifiers" in this scenario.
+               modifiers" in this scenario, and the default character set
             */
 
-            if (RX_EXTFLAGS(re) & RXf_PMf_LOCALE) {
-		reflags[left++] = LOCALE_PAT_MOD;
-	    }
-	    else if (RX_EXTFLAGS(re) & RXf_PMf_UNICODE) {
-		reflags[left++] = UNICODE_PAT_MOD;
+	    if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
+		STRLEN len;
+		const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
+								&len);
+		Copy(name, reflags + left, len, char);
+		left += len;
 	    }
             fptr = INT_PAT_MODS;
             match_flags = (U16)((RX_EXTFLAGS(re) & PMf_COMPILETIME)