6 files changed, 56 insertions, 23 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index fee1d6900e..15b716b630 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -613,7 +613,9 @@ will be lost.
                   posixl
 
  ANYOFH           sv 1       Like ANYOF, but only has "High" matches,
-                             none in the bitmap;
+                             none in the bitmap; the flags field
+                             contains the lowest matchable UTF-8 start
+                             byte
  ANYOFHb          sv 1       Like ANYOFH, but all matches share the same
                              UTF-8 start byte, given in the flags field
 
diff --git a/regcomp.c b/regcomp.c
index e54bc1976b..00460392ce 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1582,7 +1582,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     unsigned int i;
     const U32 n = ARG(node);
     bool new_node_has_latin1 = FALSE;
-    const U8 flags = OP(node) == ANYOFHb ? 0 : ANYOF_FLAGS(node);
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHb))
+                      ? 0
+                      : ANYOF_FLAGS(node);
 
     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 
@@ -1722,7 +1724,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * another SSC or a regular ANYOF class.  Can create false positives. */
 
     SV* anded_cp_list;
-    U8  and_with_flags = (OP(and_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(and_with);
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHb)
+                          ? 0
+                          : ANYOF_FLAGS(and_with);
     U8  anded_flags;
 
     PERL_ARGS_ASSERT_SSC_AND;
@@ -1906,7 +1910,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
     SV* ored_cp_list;
     U8 ored_flags;
-    U8  or_with_flags = (OP(or_with) == ANYOFHb) ? 0 : ANYOF_FLAGS(or_with);
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHb)
+                         ? 0
+                         : ANYOF_FLAGS(or_with);
 
     PERL_ARGS_ASSERT_SSC_OR;
 
@@ -19024,25 +19030,37 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             && ! upper_latin1_only_utf8_matches
             &&   anyof_flags == 0)
         {
+            U8 low_utf8[UTF8_MAXBYTES+1];
             UV highest_cp = invlist_highest(cp_list);
 
-            /* If the lowest and highest code point in the class have the same
-             * UTF-8 first byte, then all do, and we can store that byte for
-             * regexec.c to use so that it can more quickly scan the target
-             * string for potential matches for this class.  We co-opt the
-             * flags field for this, and make the node ANYOFb.  We do accept
-             * here very large code points (for future use), but don't do
-             * this optimization for them, as it would cause other
-             * complications */
             op = ANYOFH;
+
+            /* Currently the maximum allowed code point by the system is
+             * IV_MAX.  Higher ones are reserved for future internal use.  This
+             * particular regnode can be used for higher ones, but we can't
+             * calculate the code point of those.  IV_MAX suffices though, as
+             * it will be a large first byte */
+            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+            /* We store the lowest possible first byte of the UTF-8
+             * representation, using the flags field.  This allows for quick
+             * ruling out of some inputs without having to convert from UTF-8
+             * to code point.  For EBCDIC, this has to be I8. */
+            anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+            /* If the lowest and highest code point in the class have the same
+             * UTF-8 first byte, then all have that byte, and we can get an
+             * exact first byte instead of a minimum.  We signal this with a
+             * different regnode */
             if (highest_cp <= IV_MAX) {
-                U8 low_utf8[UTF8_MAXBYTES+1];
                 U8 high_utf8[UTF8_MAXBYTES+1];
 
-                (void) uvchr_to_utf8(low_utf8, start[0]);
-                (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+                (void) uvchr_to_utf8(high_utf8, highest_cp);
 
                 if (low_utf8[0] == high_utf8[0]) {
+
+                    /* No need to convert to I8 for EBCDIC as this is an exact
+                     * match */
                     anyof_flags = low_utf8[0];
                     op = ANYOFHb;
                 }
@@ -20348,7 +20366,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* 2: embedded, otherwise 1 */
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
     else if (k == ANYOF) {
-	const U8 flags = (OP(o) == ANYOFHb) ? 0 : ANYOF_FLAGS(o);
+	const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHb)
+                          ? 0
+                          : ANYOF_FLAGS(o);
         bool do_sep = FALSE;    /* Do we need to separate various components of
                                    the output? */
         /* Set if there is still an unresolved user-defined property */
@@ -20502,8 +20522,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And finally the matching, closing ']' */
 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
-        if (OP(o) == ANYOFHb) {
-            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+        if (inRANGE(OP(o), ANYOFH, ANYOFHb)) {
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x", FLAGS(o));
+            if (OP(o) == ANYOFH) {
+                /* Not strictly true for 32-bit or EBCDIC, but good
+                 * enough */
+                Perl_sv_catpvf(aTHX_ sv, "..\\xff");
+            }
+            Perl_sv_catpvf(aTHX_ sv, ")");
         }
 
 
diff --git a/regcomp.sym b/regcomp.sym
index 4b8885d0b3..9e2c6d3aea 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -64,7 +64,7 @@ ANYOFL      ANYOF,      sv charclass S    ; Like ANYOF, but /l is in effect
 ANYOFPOSIXL ANYOF,      sv charclass_posixl S    ; Like ANYOFL, but matches [[:posix:]] classes
 
 # Must be sequential
-ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap;
+ANYOFH      ANYOF,      sv 1 S    ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte
 ANYOFHb     ANYOF,      sv 1 S    ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field
 
 ANYOFM      ANYOFM      byte 1 S  ; Like ANYOF, but matches an invariant byte as determined by the mask and arg
diff --git a/regexec.c b/regexec.c
index 6eec9c2400..a205696d87 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2179,7 +2179,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case ANYOFH:
         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
             REXEC_FBC_CLASS_SCAN(TRUE,
-                      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+                  (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
         }
         break;
 
@@ -6805,6 +6806,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         case ANYOFH:
             if (   ! utf8_target
                 ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                    utf8_target))
             {
@@ -9592,6 +9594,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
             while (  hardcount < max
                    && scan < this_eol
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
             {
                 scan += UTF8SKIP(scan);
@@ -9859,7 +9862,9 @@ STATIC bool
 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
 {
     dVAR;
-    const char flags = (OP(n) == ANYOFHb) ? 0 : ANYOF_FLAGS(n);
+    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHb))
+                        ? 0
+                        : ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
 
diff --git a/regnodes.h b/regnodes.h
index 487b6c2dee..5e39b5035b 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -33,7 +33,7 @@
 #define	ANYOFD                	19	/* 0x13 Like ANYOF, but /d is in effect */
 #define	ANYOFL                	20	/* 0x14 Like ANYOF, but /l is in effect */
 #define	ANYOFPOSIXL           	21	/* 0x15 Like ANYOFL, but matches [[:posix:]] classes */
-#define	ANYOFH                	22	/* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; */
+#define	ANYOFH                	22	/* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte */
 #define	ANYOFHb               	23	/* 0x17 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field */
 #define	ANYOFM                	24	/* 0x18 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
 #define	NANYOFM               	25	/* 0x19 complement of ANYOFM */
diff --git a/t/re/anyof.t b/t/re/anyof.t
index aafcdb090c..b7656d68a5 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -874,7 +874,7 @@ while (defined (my $test = shift @tests)) {
         my $result = get_compiled($test);
         if ($expected =~ / ^ ANYOFH /x) {
             like($result, qr/ ^ \Q$expected\E (?:\Q (First UTF-8 byte=\x\E
-                              [[:xdigit:]]{2}\) )? $ /x, $test_name);
+                              [[:xdigit:]]{2} )? /x, $test_name);
         }
         else {
             is($result, $expected,