summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2019-09-19 16:03:04 -0600
committer Karl Williamson <khw@cpan.org> 2019-11-17 21:20:07 -0700
commit 13fcf6522466471a1b1c5fc2d760dd5367fd8940 (patch)
tree f6272af8bf0e7308ab5792219978ab12d75d78d3
parent d913538e4f136a14760fb7c73de064901acfc25b (diff)
download perl-13fcf6522466471a1b1c5fc2d760dd5367fd8940.tar.gz
Add ANYOFR regnode
This matches a single range of code points. It is both faster and smaller than other ANYOF-type nodes, requiring, after set-up, a single subtraction and conditional branch. The vast majority of Unicode properties match a single range (though most of the properties likely to be used in real world applications have more than a single range). But things like [ij] are a single range, and those are quite commonly encountered. This new regnode matches them more efficiently than a bitmap would, and doesn't require the space for one either. The flags field is used to store the minimum matchable start byte for UTF-8 strings, and is ignored for non-UTF-8 targets. This, like ANYOFH nodes which have a similar mechanism, allows for quick weeding out of many possible matches without having to convert the UTF-8 to its corresponding code point. This regnode packs two values into its 32 bit argument: the lower 20 bits hold the minimum code point the node matches (the base), and the upper 12 bits hold the maximum delta from that base, i.e. the range's highest code point minus its lowest. If either value doesn't fit in its field, the class simply won't compile to this regnode, instead going to one of the ANYOFH flavors. ANYOFR is sufficient to match all of Unicode except for the final (private use) 65K plane.
-rw-r--r-- pod/perldebguts.pod 5
-rw-r--r-- regcomp.c 95
-rw-r--r-- regcomp.h 6
-rw-r--r-- regcomp.sym 1
-rw-r--r-- regexec.c 65
-rw-r--r-- regnodes.h 341
-rw-r--r-- t/re/anyof.t 74
7 files changed, 367 insertions, 220 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index ebaee86f13..b7b3e54013 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -621,6 +621,11 @@ will be lost.
ANYOFHr sv 1 Like ANYOFH, but the flags field contains
packed bounds for all matchable UTF-8 start
bytes.
+ ANYOFR packed 1 Matches any character in the range given by
+ its packed args: upper 12 bits is the max
+ delta from the base lower 20; the flags
+ field contains the lowest matchable UTF-8
+ start byte
ANYOFM byte 1 Like ANYOF, but matches an invariant byte
as determined by the mask and arg
diff --git a/regcomp.c b/regcomp.c
index b25511b24c..9531c90407 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5889,6 +5889,25 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
break;
}
+ case ANYOFR:
+ {
+ SV* cp_list = NULL;
+
+ cp_list = _add_range_to_invlist(cp_list,
+ ANYOFRbase(scan),
+ ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class, cp_list, invert);
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class, cp_list, invert);
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NPOSIXL:
invert = 1;
/* FALLTHROUGH */
@@ -18482,6 +18501,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
Size_t partial_cp_count = 0;
UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
UV end[MAX_FOLD_FROMS+1] = { 0 };
+ bool single_range = FALSE;
if (cp_list) { /* Count the code points in enough ranges that we would
see all the ones possible in any fold in this version
@@ -18495,6 +18515,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
partial_cp_count += end[i] - start[i] + 1;
}
+ if (i == 1) {
+ single_range = TRUE;
+ }
invlist_iterfinish(cp_list);
}
@@ -19213,6 +19236,36 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
SvREFCNT_dec(intersection);
}
+ /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+ * both in size and speed. Currently, a 20 bit range base (smallest
+ * code point in the range), and a 12 bit maximum delta are packed into
+ * a 32 bit word. This allows for using it on all of the Unicode code
+ * points except for the highest plane, which is only for private use
+ * code points. khw doubts that a bigger delta is likely in real world
+ * applications */
+ if ( single_range
+ && ! has_runtime_dependency
+ && anyof_flags == 0
+ && start[0] < (1 << ANYOFR_BASE_BITS)
+ && end[0] - start[0]
+ < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+ * CHARBITS - ANYOFR_BASE_BITS))))
+
+ {
+ U8 low_utf8[UTF8_MAXBYTES+1];
+
+ ret = reganode(pRExC_state, ANYOFR,
+ (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+ /* Place the lowest UTF-8 start byte in the flags field, so as to
+ * allow efficient ruling out at run time of many possible inputs.
+ * */
+ (void) uvchr_to_utf8(low_utf8, start[0]);
+ ANYOF_FLAGS(REGNODE_p(ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+ goto not_anyof;
+ }
+
/* If didn't find an optimization and there is no need for a bitmap,
* optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
@@ -20597,10 +20650,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
else if (k == LOGICAL)
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
- else if (k == ANYOF) {
- const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
- ? 0
- : ANYOF_FLAGS(o);
+ else if (k == ANYOF || k == ANYOFR) {
+ U8 flags;
char * bitmap;
U32 arg;
bool do_sep = FALSE; /* Do we need to separate various components of
@@ -20619,11 +20670,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
bool inverted;
- if (inRANGE(OP(o), ANYOFH, ANYOFHb)) {
+ if (inRANGE(OP(o), ANYOFH, ANYOFR)) {
+ flags = 0;
bitmap = NULL;
arg = 0;
}
else {
+ flags = ANYOF_FLAGS(o);
bitmap = ANYOF_BITMAP(o);
arg = ARG(o);
}
@@ -20641,15 +20694,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
/* If there is stuff outside the bitmap, get it */
if (arg != ANYOF_ONLY_HAS_BITMAP) {
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ if (inRANGE(OP(o), ANYOFR, ANYOFR)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ ANYOFRbase(o),
+ ANYOFRbase(o) + ANYOFRdelta(o));
+ }
+ else {
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
&unresolved,
&only_utf8_locale_invlist,
&nonbitmap_invlist);
+ }
+
/* The non-bitmap data may contain stuff that could fit in the
* bitmap. This could come from a user-defined property being
* finally resolved when this call was done; or much more likely
* because there are matches that require UTF-8 to be valid, and so
- * aren't in the bitmap. This is teased apart later */
+ * aren't in the bitmap (or ANYOFR). This is teased apart later */
_invlist_intersection(nonbitmap_invlist,
PL_InBitmap,
&bitmap_range_not_in_bitmap);
@@ -20669,7 +20730,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ /* ANYOFH by definition doesn't have anything that will fit inside the
+ * bitmap; ANYOFR may or may not. */
+ if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+ && ( ! inRANGE(OP(o), ANYOFR, ANYOFR)
+ || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+ {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
bitmap,
@@ -20682,7 +20748,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
* better display if there
* are things that haven't
* been resolved */
- unresolved != NULL);
+ unresolved != NULL
+ || inRANGE(OP(o), ANYOFR, ANYOFR));
SvREFCNT_dec(bitmap_range_not_in_bitmap);
/* If there are user-defined properties which haven't been defined
@@ -20768,15 +20835,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ if (inRANGE(OP(o), ANYOFH, ANYOFR)) {
U8 lowest = (OP(o) != ANYOFHr)
? FLAGS(o)
: LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
- U8 highest = (OP(o) == ANYOFHb)
- ? lowest
- : OP(o) == ANYOFH
+ U8 highest = (OP(o) == ANYOFHr)
+ ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+ : (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
- : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ : lowest;
Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
if (lowest != highest) {
Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
diff --git a/regcomp.h b/regcomp.h
index 2dddc5fd8d..53e81dba37 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -384,6 +384,10 @@ struct regnode_ssc {
((struct regnode_string *)(p))->str_len = (v); \
} STMT_END
+#define ANYOFR_BASE_BITS 20
+#define ANYOFRbase(p) (ARG(p) & ((1 << ANYOFR_BASE_BITS) - 1))
+#define ANYOFRdelta(p) (ARG(p) >> ANYOFR_BASE_BITS)
+
#undef NODE_ALIGN
#undef ARG_LOC
#undef NEXTOPER
@@ -1178,7 +1182,7 @@ typedef enum {
WB_BOUND
} bound_type;
-/* This unpacks the FLAGS field of ANYOFHx nodes. The value it contains
+/* This unpacks the FLAGS field of ANYOF[HR]x nodes. The value it contains
* gives the strict lower bound for the UTF-8 start byte of any code point
* matchable by the node, and a loose upper bound as well.
*
diff --git a/regcomp.sym b/regcomp.sym
index 4ea160e6db..b664fc8f07 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -82,6 +82,7 @@ ANYOFPOSIXL ANYOF, sv charclass_posixl S ; Like ANYOFL, but matches [[:p
ANYOFH ANYOF, sv 1 S ; Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte
ANYOFHb ANYOF, sv 1 S ; Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field
ANYOFHr ANYOF, sv 1 S ; Like ANYOFH, but the flags field contains packed bounds for all matchable UTF-8 start bytes.
+ANYOFR ANYOFR, packed 1 S ; Matches any character in the range given by its packed args: upper 12 bits is the max delta from the base lower 20; the flags field contains the lowest matchable UTF-8 start byte
ANYOFM ANYOFM byte 1 S ; Like ANYOF, but matches an invariant byte as determined by the mask and arg
NANYOFM ANYOFM byte 1 S ; complement of ANYOFM
diff --git a/regexec.c b/regexec.c
index 9b8f74165c..8265af4dd4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2205,6 +2205,21 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
break;
+ case ANYOFR:
+ if (utf8_target) {
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+ (U8 *) strend,
+ NULL),
+ ANYOFRbase(c), ANYOFRdelta(c))));
+ }
+ else {
+ REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ }
+ break;
+
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
/* FALLTHROUGH */
@@ -6874,6 +6889,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
goto increment_locinput;
break;
+ case ANYOFR:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ if (utf8_target) {
+ if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
+ || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+ (U8 *) reginfo->strend,
+ NULL),
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ else {
+ if (! withinCOUNT((U8) *locinput,
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
@@ -9703,6 +9743,31 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
break;
+ case ANYOFR:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < this_eol
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+ (U8 *) this_eol,
+ NULL),
+ ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while ( hardcount < max
+ && scan < this_eol
+ && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan++;
+ hardcount++;
+ }
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL:
diff --git a/regnodes.h b/regnodes.h
index cee7b75f7f..b7bf210f27 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 105
-#define REGMATCH_STATE_MAX 145
+#define REGNODE_MAX 106
+#define REGMATCH_STATE_MAX 146
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -36,87 +36,88 @@
#define ANYOFH 22 /* 0x16 Like ANYOF, but only has "High" matches, none in the bitmap; the flags field contains the lowest matchable UTF-8 start byte */
#define ANYOFHb 23 /* 0x17 Like ANYOFH, but all matches share the same UTF-8 start byte, given in the flags field */
#define ANYOFHr 24 /* 0x18 Like ANYOFH, but the flags field contains packed bounds for all matchable UTF-8 start bytes. */
-#define ANYOFM 25 /* 0x19 Like ANYOF, but matches an invariant byte as determined by the mask and arg */
-#define NANYOFM 26 /* 0x1a complement of ANYOFM */
-#define POSIXD 27 /* 0x1b Some [[:class:]] under /d; the FLAGS field gives which one */
-#define POSIXL 28 /* 0x1c Some [[:class:]] under /l; the FLAGS field gives which one */
-#define POSIXU 29 /* 0x1d Some [[:class:]] under /u; the FLAGS field gives which one */
-#define POSIXA 30 /* 0x1e Some [[:class:]] under /a; the FLAGS field gives which one */
-#define NPOSIXD 31 /* 0x1f complement of POSIXD, [[:^class:]] */
-#define NPOSIXL 32 /* 0x20 complement of POSIXL, [[:^class:]] */
-#define NPOSIXU 33 /* 0x21 complement of POSIXU, [[:^class:]] */
-#define NPOSIXA 34 /* 0x22 complement of POSIXA, [[:^class:]] */
-#define CLUMP 35 /* 0x23 Match any extended grapheme cluster sequence */
-#define BRANCH 36 /* 0x24 Match this alternative, or the next... */
-#define EXACT 37 /* 0x25 Match this string (flags field is the length). */
-#define LEXACT 38 /* 0x26 Match this long string (preceded by length; flags unused). */
-#define EXACTL 39 /* 0x27 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for) */
-#define EXACTF 40 /* 0x28 Like EXACT, but match using /id rules; (string not UTF-8, ASCII folded; non-ASCII not) */
-#define EXACTFL 41 /* 0x29 Like EXACT, but match using /il rules; (string not likely to be folded) */
-#define EXACTFU 42 /* 0x2a Like EXACT, but match using /iu rules; (string folded) */
-#define EXACTFAA 43 /* 0x2b Like EXACT, but match using /iaa rules; (string folded except in non-UTF8 patterns: MICRO, SHARP S; folded length <= unfolded) */
-#define EXACTFUP 44 /* 0x2c Like EXACT, but match using /iu rules; (string not UTF-8, folded except MICRO, SHARP S: hence Problematic) */
-#define EXACTFLU8 45 /* 0x2d Like EXACTFU, but use /il, UTF-8, (string is folded, and everything in it is above 255 */
-#define EXACTFAA_NO_TRIE 46 /* 0x2e Like EXACT, but match using /iaa rules (string not UTF-8, not guaranteed to be folded, not currently trie-able) */
-#define EXACT_REQ8 47 /* 0x2f Like EXACT, but only UTF-8 encoded targets can match */
-#define LEXACT_REQ8 48 /* 0x30 Like LEXACT, but only UTF-8 encoded targets can match */
-#define EXACTFU_REQ8 49 /* 0x31 Like EXACTFU, but only UTF-8 encoded targets can match */
-#define EXACTFU_S_EDGE 50 /* 0x32 /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only) */
-#define NOTHING 51 /* 0x33 Match empty string. */
-#define TAIL 52 /* 0x34 Match empty string. Can jump here from outside. */
-#define STAR 53 /* 0x35 Match this (simple) thing 0 or more times. */
-#define PLUS 54 /* 0x36 Match this (simple) thing 1 or more times. */
-#define CURLY 55 /* 0x37 Match this simple thing {n,m} times. */
-#define CURLYN 56 /* 0x38 Capture next-after-this simple thing */
-#define CURLYM 57 /* 0x39 Capture this medium-complex thing {n,m} times. */
-#define CURLYX 58 /* 0x3a Match this complex thing {n,m} times. */
-#define WHILEM 59 /* 0x3b Do curly processing and see if rest matches. */
-#define OPEN 60 /* 0x3c Mark this point in input as start of #n. */
-#define CLOSE 61 /* 0x3d Close corresponding OPEN of #n. */
-#define SROPEN 62 /* 0x3e Same as OPEN, but for script run */
-#define SRCLOSE 63 /* 0x3f Close preceding SROPEN */
-#define REF 64 /* 0x40 Match some already matched string */
-#define REFF 65 /* 0x41 Match already matched string, using /di rules. */
-#define REFFL 66 /* 0x42 Match already matched string, using /li rules. */
-#define REFFU 67 /* 0x43 Match already matched string, usng /ui. */
-#define REFFA 68 /* 0x44 Match already matched string, using /aai rules. */
-#define REFN 69 /* 0x45 Match some already matched string */
-#define REFFN 70 /* 0x46 Match already matched string, using /di rules. */
-#define REFFLN 71 /* 0x47 Match already matched string, using /li rules. */
-#define REFFUN 72 /* 0x48 Match already matched string, using /ui rules. */
-#define REFFAN 73 /* 0x49 Match already matched string, using /aai rules. */
-#define LONGJMP 74 /* 0x4a Jump far away. */
-#define BRANCHJ 75 /* 0x4b BRANCH with long offset. */
-#define IFMATCH 76 /* 0x4c Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
-#define UNLESSM 77 /* 0x4d Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
-#define SUSPEND 78 /* 0x4e "Independent" sub-RE. */
-#define IFTHEN 79 /* 0x4f Switch, should be preceded by switcher. */
-#define GROUPP 80 /* 0x50 Whether the group matched. */
-#define EVAL 81 /* 0x51 Execute some Perl code. */
-#define MINMOD 82 /* 0x52 Next operator is not greedy. */
-#define LOGICAL 83 /* 0x53 Next opcode should set the flag only. */
-#define RENUM 84 /* 0x54 Group with independently numbered parens. */
-#define TRIE 85 /* 0x55 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define TRIEC 86 /* 0x56 Same as TRIE, but with embedded charclass data */
-#define AHOCORASICK 87 /* 0x57 Aho Corasick stclass. flags==type */
-#define AHOCORASICKC 88 /* 0x58 Same as AHOCORASICK, but with embedded charclass data */
-#define GOSUB 89 /* 0x59 recurse to paren arg1 at (signed) ofs arg2 */
-#define GROUPPN 90 /* 0x5a Whether the group matched. */
-#define INSUBP 91 /* 0x5b Whether we are in a specific recurse. */
-#define DEFINEP 92 /* 0x5c Never execute directly. */
-#define ENDLIKE 93 /* 0x5d Used only for the type field of verbs */
-#define OPFAIL 94 /* 0x5e Same as (?!), but with verb arg */
-#define ACCEPT 95 /* 0x5f Accepts the current matched string, with verbar */
-#define VERB 96 /* 0x60 Used only for the type field of verbs */
-#define PRUNE 97 /* 0x61 Pattern fails at this startpoint if no-backtracking through this */
-#define MARKPOINT 98 /* 0x62 Push the current location for rollback by cut. */
-#define SKIP 99 /* 0x63 On failure skip forward (to the mark) before retrying */
-#define COMMIT 100 /* 0x64 Pattern fails outright if backtracking through this */
-#define CUTGROUP 101 /* 0x65 On failure go to the next alternation in the group */
-#define KEEPS 102 /* 0x66 $& begins here. */
-#define LNBREAK 103 /* 0x67 generic newline pattern */
-#define OPTIMIZED 104 /* 0x68 Placeholder for dump. */
-#define PSEUDO 105 /* 0x69 Pseudo opcode for internal use. */
+#define ANYOFR 25 /* 0x19 Matches any character in the range given by its packed args: upper 12 bits is the max delta from the base lower 20; the flags field contains the lowest matchable UTF-8 start byte */
+#define ANYOFM 26 /* 0x1a Like ANYOF, but matches an invariant byte as determined by the mask and arg */
+#define NANYOFM 27 /* 0x1b complement of ANYOFM */
+#define POSIXD 28 /* 0x1c Some [[:class:]] under /d; the FLAGS field gives which one */
+#define POSIXL 29 /* 0x1d Some [[:class:]] under /l; the FLAGS field gives which one */
+#define POSIXU 30 /* 0x1e Some [[:class:]] under /u; the FLAGS field gives which one */
+#define POSIXA 31 /* 0x1f Some [[:class:]] under /a; the FLAGS field gives which one */
+#define NPOSIXD 32 /* 0x20 complement of POSIXD, [[:^class:]] */
+#define NPOSIXL 33 /* 0x21 complement of POSIXL, [[:^class:]] */
+#define NPOSIXU 34 /* 0x22 complement of POSIXU, [[:^class:]] */
+#define NPOSIXA 35 /* 0x23 complement of POSIXA, [[:^class:]] */
+#define CLUMP 36 /* 0x24 Match any extended grapheme cluster sequence */
+#define BRANCH 37 /* 0x25 Match this alternative, or the next... */
+#define EXACT 38 /* 0x26 Match this string (flags field is the length). */
+#define LEXACT 39 /* 0x27 Match this long string (preceded by length; flags unused). */
+#define EXACTL 40 /* 0x28 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for) */
+#define EXACTF 41 /* 0x29 Like EXACT, but match using /id rules; (string not UTF-8, ASCII folded; non-ASCII not) */
+#define EXACTFL 42 /* 0x2a Like EXACT, but match using /il rules; (string not likely to be folded) */
+#define EXACTFU 43 /* 0x2b Like EXACT, but match using /iu rules; (string folded) */
+#define EXACTFAA 44 /* 0x2c Like EXACT, but match using /iaa rules; (string folded except in non-UTF8 patterns: MICRO, SHARP S; folded length <= unfolded) */
+#define EXACTFUP 45 /* 0x2d Like EXACT, but match using /iu rules; (string not UTF-8, folded except MICRO, SHARP S: hence Problematic) */
+#define EXACTFLU8 46 /* 0x2e Like EXACTFU, but use /il, UTF-8, (string is folded, and everything in it is above 255 */
+#define EXACTFAA_NO_TRIE 47 /* 0x2f Like EXACT, but match using /iaa rules (string not UTF-8, not guaranteed to be folded, not currently trie-able) */
+#define EXACT_REQ8 48 /* 0x30 Like EXACT, but only UTF-8 encoded targets can match */
+#define LEXACT_REQ8 49 /* 0x31 Like LEXACT, but only UTF-8 encoded targets can match */
+#define EXACTFU_REQ8 50 /* 0x32 Like EXACTFU, but only UTF-8 encoded targets can match */
+#define EXACTFU_S_EDGE 51 /* 0x33 /di rules, but nothing in it precludes /ui, except begins and/or ends with [Ss]; (string not UTF-8; compile-time only) */
+#define NOTHING 52 /* 0x34 Match empty string. */
+#define TAIL 53 /* 0x35 Match empty string. Can jump here from outside. */
+#define STAR 54 /* 0x36 Match this (simple) thing 0 or more times. */
+#define PLUS 55 /* 0x37 Match this (simple) thing 1 or more times. */
+#define CURLY 56 /* 0x38 Match this simple thing {n,m} times. */
+#define CURLYN 57 /* 0x39 Capture next-after-this simple thing */
+#define CURLYM 58 /* 0x3a Capture this medium-complex thing {n,m} times. */
+#define CURLYX 59 /* 0x3b Match this complex thing {n,m} times. */
+#define WHILEM 60 /* 0x3c Do curly processing and see if rest matches. */
+#define OPEN 61 /* 0x3d Mark this point in input as start of #n. */
+#define CLOSE 62 /* 0x3e Close corresponding OPEN of #n. */
+#define SROPEN 63 /* 0x3f Same as OPEN, but for script run */
+#define SRCLOSE 64 /* 0x40 Close preceding SROPEN */
+#define REF 65 /* 0x41 Match some already matched string */
+#define REFF 66 /* 0x42 Match already matched string, using /di rules. */
+#define REFFL 67 /* 0x43 Match already matched string, using /li rules. */
+#define REFFU 68 /* 0x44 Match already matched string, usng /ui. */
+#define REFFA 69 /* 0x45 Match already matched string, using /aai rules. */
+#define REFN 70 /* 0x46 Match some already matched string */
+#define REFFN 71 /* 0x47 Match already matched string, using /di rules. */
+#define REFFLN 72 /* 0x48 Match already matched string, using /li rules. */
+#define REFFUN 73 /* 0x49 Match already matched string, using /ui rules. */
+#define REFFAN 74 /* 0x4a Match already matched string, using /aai rules. */
+#define LONGJMP 75 /* 0x4b Jump far away. */
+#define BRANCHJ 76 /* 0x4c BRANCH with long offset. */
+#define IFMATCH 77 /* 0x4d Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
+#define UNLESSM 78 /* 0x4e Fails if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current */
+#define SUSPEND 79 /* 0x4f "Independent" sub-RE. */
+#define IFTHEN 80 /* 0x50 Switch, should be preceded by switcher. */
+#define GROUPP 81 /* 0x51 Whether the group matched. */
+#define EVAL 82 /* 0x52 Execute some Perl code. */
+#define MINMOD 83 /* 0x53 Next operator is not greedy. */
+#define LOGICAL 84 /* 0x54 Next opcode should set the flag only. */
+#define RENUM 85 /* 0x55 Group with independently numbered parens. */
+#define TRIE 86 /* 0x56 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define TRIEC 87 /* 0x57 Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 88 /* 0x58 Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 89 /* 0x59 Same as AHOCORASICK, but with embedded charclass data */
+#define GOSUB 90 /* 0x5a recurse to paren arg1 at (signed) ofs arg2 */
+#define GROUPPN 91 /* 0x5b Whether the group matched. */
+#define INSUBP 92 /* 0x5c Whether we are in a specific recurse. */
+#define DEFINEP 93 /* 0x5d Never execute directly. */
+#define ENDLIKE 94 /* 0x5e Used only for the type field of verbs */
+#define OPFAIL 95 /* 0x5f Same as (?!), but with verb arg */
+#define ACCEPT 96 /* 0x60 Accepts the current matched string, with verbar */
+#define VERB 97 /* 0x61 Used only for the type field of verbs */
+#define PRUNE 98 /* 0x62 Pattern fails at this startpoint if no-backtracking through this */
+#define MARKPOINT 99 /* 0x63 Push the current location for rollback by cut. */
+#define SKIP 100 /* 0x64 On failure skip forward (to the mark) before retrying */
+#define COMMIT 101 /* 0x65 Pattern fails outright if backtracking through this */
+#define CUTGROUP 102 /* 0x66 On failure go to the next alternation in the group */
+#define KEEPS 103 /* 0x67 $& begins here. */
+#define LNBREAK 104 /* 0x68 generic newline pattern */
+#define OPTIMIZED 105 /* 0x69 Placeholder for dump. */
+#define PSEUDO 106 /* 0x6a Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -190,6 +191,7 @@ EXTCONST U8 PL_regkind[] = {
ANYOF, /* ANYOFH */
ANYOF, /* ANYOFHb */
ANYOF, /* ANYOFHr */
+ ANYOFR, /* ANYOFR */
ANYOFM, /* ANYOFM */
ANYOFM, /* NANYOFM */
POSIXD, /* POSIXD */
@@ -345,6 +347,7 @@ static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_1), /* ANYOFH */
EXTRA_SIZE(struct regnode_1), /* ANYOFHb */
EXTRA_SIZE(struct regnode_1), /* ANYOFHr */
+ EXTRA_SIZE(struct regnode_1), /* ANYOFR */
EXTRA_SIZE(struct regnode_1), /* ANYOFM */
EXTRA_SIZE(struct regnode_1), /* NANYOFM */
0, /* POSIXD */
@@ -456,6 +459,7 @@ static const char reg_off_by_arg[] = {
0, /* ANYOFH */
0, /* ANYOFHb */
0, /* ANYOFHr */
+ 0, /* ANYOFR */
0, /* ANYOFM */
0, /* NANYOFM */
0, /* POSIXD */
@@ -573,87 +577,88 @@ EXTCONST char * const PL_reg_name[] = {
"ANYOFH", /* 0x16 */
"ANYOFHb", /* 0x17 */
"ANYOFHr", /* 0x18 */
- "ANYOFM", /* 0x19 */
- "NANYOFM", /* 0x1a */
- "POSIXD", /* 0x1b */
- "POSIXL", /* 0x1c */
- "POSIXU", /* 0x1d */
- "POSIXA", /* 0x1e */
- "NPOSIXD", /* 0x1f */
- "NPOSIXL", /* 0x20 */
- "NPOSIXU", /* 0x21 */
- "NPOSIXA", /* 0x22 */
- "CLUMP", /* 0x23 */
- "BRANCH", /* 0x24 */
- "EXACT", /* 0x25 */
- "LEXACT", /* 0x26 */
- "EXACTL", /* 0x27 */
- "EXACTF", /* 0x28 */
- "EXACTFL", /* 0x29 */
- "EXACTFU", /* 0x2a */
- "EXACTFAA", /* 0x2b */
- "EXACTFUP", /* 0x2c */
- "EXACTFLU8", /* 0x2d */
- "EXACTFAA_NO_TRIE", /* 0x2e */
- "EXACT_REQ8", /* 0x2f */
- "LEXACT_REQ8", /* 0x30 */
- "EXACTFU_REQ8", /* 0x31 */
- "EXACTFU_S_EDGE", /* 0x32 */
- "NOTHING", /* 0x33 */
- "TAIL", /* 0x34 */
- "STAR", /* 0x35 */
- "PLUS", /* 0x36 */
- "CURLY", /* 0x37 */
- "CURLYN", /* 0x38 */
- "CURLYM", /* 0x39 */
- "CURLYX", /* 0x3a */
- "WHILEM", /* 0x3b */
- "OPEN", /* 0x3c */
- "CLOSE", /* 0x3d */
- "SROPEN", /* 0x3e */
- "SRCLOSE", /* 0x3f */
- "REF", /* 0x40 */
- "REFF", /* 0x41 */
- "REFFL", /* 0x42 */
- "REFFU", /* 0x43 */
- "REFFA", /* 0x44 */
- "REFN", /* 0x45 */
- "REFFN", /* 0x46 */
- "REFFLN", /* 0x47 */
- "REFFUN", /* 0x48 */
- "REFFAN", /* 0x49 */
- "LONGJMP", /* 0x4a */
- "BRANCHJ", /* 0x4b */
- "IFMATCH", /* 0x4c */
- "UNLESSM", /* 0x4d */
- "SUSPEND", /* 0x4e */
- "IFTHEN", /* 0x4f */
- "GROUPP", /* 0x50 */
- "EVAL", /* 0x51 */
- "MINMOD", /* 0x52 */
- "LOGICAL", /* 0x53 */
- "RENUM", /* 0x54 */
- "TRIE", /* 0x55 */
- "TRIEC", /* 0x56 */
- "AHOCORASICK", /* 0x57 */
- "AHOCORASICKC", /* 0x58 */
- "GOSUB", /* 0x59 */
- "GROUPPN", /* 0x5a */
- "INSUBP", /* 0x5b */
- "DEFINEP", /* 0x5c */
- "ENDLIKE", /* 0x5d */
- "OPFAIL", /* 0x5e */
- "ACCEPT", /* 0x5f */
- "VERB", /* 0x60 */
- "PRUNE", /* 0x61 */
- "MARKPOINT", /* 0x62 */
- "SKIP", /* 0x63 */
- "COMMIT", /* 0x64 */
- "CUTGROUP", /* 0x65 */
- "KEEPS", /* 0x66 */
- "LNBREAK", /* 0x67 */
- "OPTIMIZED", /* 0x68 */
- "PSEUDO", /* 0x69 */
+ "ANYOFR", /* 0x19 */
+ "ANYOFM", /* 0x1a */
+ "NANYOFM", /* 0x1b */
+ "POSIXD", /* 0x1c */
+ "POSIXL", /* 0x1d */
+ "POSIXU", /* 0x1e */
+ "POSIXA", /* 0x1f */
+ "NPOSIXD", /* 0x20 */
+ "NPOSIXL", /* 0x21 */
+ "NPOSIXU", /* 0x22 */
+ "NPOSIXA", /* 0x23 */
+ "CLUMP", /* 0x24 */
+ "BRANCH", /* 0x25 */
+ "EXACT", /* 0x26 */
+ "LEXACT", /* 0x27 */
+ "EXACTL", /* 0x28 */
+ "EXACTF", /* 0x29 */
+ "EXACTFL", /* 0x2a */
+ "EXACTFU", /* 0x2b */
+ "EXACTFAA", /* 0x2c */
+ "EXACTFUP", /* 0x2d */
+ "EXACTFLU8", /* 0x2e */
+ "EXACTFAA_NO_TRIE", /* 0x2f */
+ "EXACT_REQ8", /* 0x30 */
+ "LEXACT_REQ8", /* 0x31 */
+ "EXACTFU_REQ8", /* 0x32 */
+ "EXACTFU_S_EDGE", /* 0x33 */
+ "NOTHING", /* 0x34 */
+ "TAIL", /* 0x35 */
+ "STAR", /* 0x36 */
+ "PLUS", /* 0x37 */
+ "CURLY", /* 0x38 */
+ "CURLYN", /* 0x39 */
+ "CURLYM", /* 0x3a */
+ "CURLYX", /* 0x3b */
+ "WHILEM", /* 0x3c */
+ "OPEN", /* 0x3d */
+ "CLOSE", /* 0x3e */
+ "SROPEN", /* 0x3f */
+ "SRCLOSE", /* 0x40 */
+ "REF", /* 0x41 */
+ "REFF", /* 0x42 */
+ "REFFL", /* 0x43 */
+ "REFFU", /* 0x44 */
+ "REFFA", /* 0x45 */
+ "REFN", /* 0x46 */
+ "REFFN", /* 0x47 */
+ "REFFLN", /* 0x48 */
+ "REFFUN", /* 0x49 */
+ "REFFAN", /* 0x4a */
+ "LONGJMP", /* 0x4b */
+ "BRANCHJ", /* 0x4c */
+ "IFMATCH", /* 0x4d */
+ "UNLESSM", /* 0x4e */
+ "SUSPEND", /* 0x4f */
+ "IFTHEN", /* 0x50 */
+ "GROUPP", /* 0x51 */
+ "EVAL", /* 0x52 */
+ "MINMOD", /* 0x53 */
+ "LOGICAL", /* 0x54 */
+ "RENUM", /* 0x55 */
+ "TRIE", /* 0x56 */
+ "TRIEC", /* 0x57 */
+ "AHOCORASICK", /* 0x58 */
+ "AHOCORASICKC", /* 0x59 */
+ "GOSUB", /* 0x5a */
+ "GROUPPN", /* 0x5b */
+ "INSUBP", /* 0x5c */
+ "DEFINEP", /* 0x5d */
+ "ENDLIKE", /* 0x5e */
+ "OPFAIL", /* 0x5f */
+ "ACCEPT", /* 0x60 */
+ "VERB", /* 0x61 */
+ "PRUNE", /* 0x62 */
+ "MARKPOINT", /* 0x63 */
+ "SKIP", /* 0x64 */
+ "COMMIT", /* 0x65 */
+ "CUTGROUP", /* 0x66 */
+ "KEEPS", /* 0x67 */
+ "LNBREAK", /* 0x68 */
+ "OPTIMIZED", /* 0x69 */
+ "PSEUDO", /* 0x6a */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -788,7 +793,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
EXTCONST U8 PL_varies_bitmask[];
#else
EXTCONST U8 PL_varies_bitmask[] = {
- 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0xE0, 0x0F, 0xFF, 0xCB, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0xC0, 0x1F, 0xFE, 0x97, 0x01, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
@@ -801,8 +806,8 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__;
#else
EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, ANYOFPOSIXL, ANYOFH, ANYOFHb,
- ANYOFHr, ANYOFM, NANYOFM, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
- NPOSIXL, NPOSIXU, NPOSIXA,
+ ANYOFHr, ANYOFR, ANYOFM, NANYOFM, POSIXD, POSIXL, POSIXU, POSIXA,
+ NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA,
0
};
#endif /* DOINIT */
@@ -811,7 +816,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
EXTCONST U8 PL_simple_bitmask[];
#else
EXTCONST U8 PL_simple_bitmask[] = {
- 0x00, 0x00, 0xFF, 0xFF, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0xFF, 0xFF, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
diff --git a/t/re/anyof.t b/t/re/anyof.t
index 90d77ac70a..2fa92cc0d9 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -139,7 +139,7 @@ my @tests = (
'[_[:^blank:]]' => 'NPOSIXD[:blank:]',
'[\xA0[:^blank:]]' => 'ANYOF[^\t ][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]',
'(?d:[_[:^blank:]])' => 'NPOSIXD[:blank:]',
- '[\x{07}-\x{0B}]' => 'ANYOF[\a\b\t\n\x0B]',
+ '[\x{07}-\x{0B}]' => 'ANYOFR[\a\b\t\n\x0B]',
'(?l)[\x{2029}]' => 'EXACTL <\x{2029}>',
'(?l)(?[\x{2029}])' => 'ANYOFL{utf8-locale-reqd}[2029]', # regex sets requires utf8 locale for /l
'(?il)[\x{212A}]' => 'EXACTFL <\\x{212a}>',
@@ -487,19 +487,19 @@ my @tests = (
'[\x{102}\x{104}]' => 'ANYOFHb[0102 0104]',
'[\x{102}-\x{104}{INFTY}]' => 'ANYOFH[0102-0104 INFTY-INFTY]',
'[\x{102}-\x{104}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 HIGHEST_CP]',
- '[\x{102}-\x{104}\x{101}]' => 'ANYOFHb[0101-0104]',
+ '[\x{102}-\x{104}\x{101}]' => 'ANYOFR[0101-0104]',
'[\x{102}-\x{104}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
'[\x{102}-\x{104}\x{101}-{HIGHEST_CP}]' => 'ANYOFH[0101-HIGHEST_CP]',
- '[\x{102}-\x{104}\x{102}]' => 'ANYOFHb[0102-0104]',
+ '[\x{102}-\x{104}\x{102}]' => 'ANYOFR[0102-0104]',
'[\x{102}-\x{104}\x{102}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{102}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
- '[\x{102}-\x{104}\x{103}]' => 'ANYOFHb[0102-0104]',
+ '[\x{102}-\x{104}\x{103}]' => 'ANYOFR[0102-0104]',
'[\x{102}-\x{104}\x{103}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{103}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
- '[\x{102}-\x{104}\x{104}]' => 'ANYOFHb[0102-0104]',
+ '[\x{102}-\x{104}\x{104}]' => 'ANYOFR[0102-0104]',
'[\x{102}-\x{104}\x{104}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{104}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
- '[\x{102}-\x{104}\x{105}]' => 'ANYOFHb[0102-0105]',
+ '[\x{102}-\x{104}\x{105}]' => 'ANYOFR[0102-0105]',
'[\x{102}-\x{104}\x{105}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{105}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
'[\x{102}-\x{104}\x{106}]' => 'ANYOFHb[0102-0104 0106]',
@@ -515,11 +515,11 @@ my @tests = (
'[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{104}]' => 'ANYOFHb[0101-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{105}]' => 'ANYOFHb[0101-0105 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{106}]' => 'ANYOFHb[0101-0106 0108-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{107}]' => 'ANYOFHb[0101-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{108}]' => 'ANYOFHb[0101-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{109}]' => 'ANYOFHb[0101-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{10A}]' => 'ANYOFHb[0101-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{10B}]' => 'ANYOFHb[0101-010B]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{107}]' => 'ANYOFR[0101-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{108}]' => 'ANYOFR[0101-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{109}]' => 'ANYOFR[0101-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{10A}]' => 'ANYOFR[0101-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-\x{10B}]' => 'ANYOFR[0101-010B]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}]' => 'ANYOFHb[0102-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
@@ -528,45 +528,45 @@ my @tests = (
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{104}]' => 'ANYOFHb[0102-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{105}]' => 'ANYOFHb[0102-0105 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{106}]' => 'ANYOFHb[0102-0106 0108-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{107}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{108}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{109}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10A}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10B}]' => 'ANYOFHb[0102-010B]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10C}]' => 'ANYOFHb[0102-010C]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{107}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{108}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{109}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10A}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10B}]' => 'ANYOFR[0102-010B]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{102}-\x{10C}]' => 'ANYOFR[0102-010C]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}]' => 'ANYOFHb[0102-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{104}]' => 'ANYOFHb[0102-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{105}]' => 'ANYOFHb[0102-0105 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{106}]' => 'ANYOFHb[0102-0106 0108-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{107}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{108}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{109}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10A}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10B}]' => 'ANYOFHb[0102-010B]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10C}]' => 'ANYOFHb[0102-010C]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{107}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{108}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{109}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10A}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10B}]' => 'ANYOFR[0102-010B]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{103}-\x{10C}]' => 'ANYOFR[0102-010C]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{104}]' => 'ANYOFHb[0102-0104 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{105}]' => 'ANYOFHb[0102-0105 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{106}]' => 'ANYOFHb[0102-0106 0108-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{107}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{108}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{109}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10A}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10B}]' => 'ANYOFHb[0102-010B]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10C}]' => 'ANYOFHb[0102-010C]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{107}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{108}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{109}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10A}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10B}]' => 'ANYOFR[0102-010B]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{104}-\x{10C}]' => 'ANYOFR[0102-010C]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{105}]' => 'ANYOFHb[0102-0105 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-{INFTY}]' => 'ANYOFH[0102-INFTY]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-{HIGHEST_CP}]' => 'ANYOFH[0102-HIGHEST_CP]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{106}]' => 'ANYOFHb[0102-0106 0108-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{107}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{108}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{109}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10A}]' => 'ANYOFHb[0102-010A]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10B}]' => 'ANYOFHb[0102-010B]',
- '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10C}]' => 'ANYOFHb[0102-010C]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{107}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{108}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{109}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10A}]' => 'ANYOFR[0102-010A]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10B}]' => 'ANYOFR[0102-010B]',
+ '[\x{102}-\x{104}\x{108}-\x{10A}\x{105}-\x{10C}]' => 'ANYOFR[0102-010C]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{106}]' => 'ANYOFHb[0102-0104 0106 0108-010A]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{106}-{INFTY}]' => 'ANYOFH[0102-0104 0106-INFTY]',
'[\x{102}-\x{104}\x{108}-\x{10A}\x{106}-{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0106-HIGHEST_CP]',
@@ -583,7 +583,7 @@ my @tests = (
'[\x{102}-\x{104}\x{108}-\x{10A}\x{10B}]' => 'ANYOFHb[0102-0104 0108-010B]',
'[\x{103}\x{102}]' => 'EXACTFU_REQ8 <\x{103}>',
'[\x{104}\x{102}]' => 'ANYOFHb[0102 0104]',
- '[\x{104}\x{102}\x{103}]' => 'ANYOFHb[0102-0104]',
+ '[\x{104}\x{102}\x{103}]' => 'ANYOFR[0102-0104]',
'[\x{106}-{INFTY}\x{104}]' => 'ANYOFH[0104 0106-INFTY]',
'[\x{106}-{INFTY}\x{104}-{INFTY}]' => 'ANYOFH[0104-INFTY]',
'[\x{106}-{INFTY}\x{104}-{HIGHEST_CP}]' => 'ANYOFH[0104-INFTY]',
@@ -886,7 +886,7 @@ while (defined (my $test = shift @tests)) {
. " $expected";
my $result = get_compiled($test);
- if ($expected =~ / ^ ANYOFH /x) {
+ if ($expected =~ / ^ ANYOF[HR] /x) {
like($result, qr/ ^ \Q$expected\E (?:\Q (First UTF-8 byte=\x\E
[[:xdigit:]]{2} )? /x, $test_name);
}