summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2014-01-29 20:42:33 -0700
committerKarl Williamson <public@khwilliamson.com>2014-02-15 21:55:32 -0700
commit34fdef848b1687b91892ba55e9e0c3430e0770f6 (patch)
tree0c42f2e84076a6040b8b29fe47ad88c92228640b
parent56feebade29d8842a38364ccb13c5ff09284d0d7 (diff)
downloadperl-34fdef848b1687b91892ba55e9e0c3430e0770f6.tar.gz
Free up bit for regex ANYOF nodes
This commit frees up a bit by using an extra regnode to pass the information to the regex engine instead of the flag. I originally thought that if this was needed, it should be the ANYOF_ABOVE_LATIN1_ALL bit, as that might speed some things up. But if we need to do this again by adding another node to get another bit, we want one that is mutually exclusive of the first one we did, for otherwise we start having to make 3 nodes instead of two to get the combinations: (1, 0), (0, 1), (1, 1). This combinatorial problem is avoided by using bits that are mutually exclusive, which the ABOVE_LATIN1_ALL isn't; but the one freed by this commit, ANYOF_NON_UTF8_NON_ASCII_ALL, is only set under /d matching, and there are other bits that are set only under /l, so if we need to do this again, we should use one of those. I wrote this code when I thought I really needed a bit. Since then, however, I have figured out a better way to get the bit needed now. But I don't want to lose this code to posterity, so this commit is being made long enough to get the commit number, then it will be reverted, adding comments referring to the commit number, so that it can easily be reconstructed when necessary.
-rw-r--r--pod/perldebguts.pod327
-rw-r--r--regcomp.c7
-rw-r--r--regcomp.h26
-rw-r--r--regcomp.sym1
-rw-r--r--regexec.c32
-rw-r--r--regnodes.h305
-rw-r--r--t/porting/known_pod_issues.dat2
7 files changed, 392 insertions, 308 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index 526124e96b..60538df378 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -556,56 +556,72 @@ will be lost.
# Exit points
- END no End of program.
- SUCCEED no Return from a subroutine, basically.
+ END no End of program.
+ SUCCEED no Return from a subroutine,
+ basically.
# Anchors:
- BOL no Match "" at beginning of line.
- MBOL no Same, assuming multiline.
- SBOL no Same, assuming singleline.
- EOS no Match "" at end of string.
- EOL no Match "" at end of line.
- MEOL no Same, assuming multiline.
- SEOL no Same, assuming singleline.
- BOUND no Match "" at any word boundary using native
- charset semantics for non-utf8
- BOUNDL no Match "" at any locale word boundary
- BOUNDU no Match "" at any word boundary using Unicode
- semantics
- BOUNDA no Match "" at any word boundary using ASCII
- semantics
- NBOUND no Match "" at any word non-boundary using
- native charset semantics for non-utf8
- NBOUNDL no Match "" at any locale word non-boundary
- NBOUNDU no Match "" at any word non-boundary using
- Unicode semantics
- NBOUNDA no Match "" at any word non-boundary using
- ASCII semantics
- GPOS no Matches where last m//g left off.
+ BOL no Match "" at beginning of line.
+ MBOL no Same, assuming multiline.
+ SBOL no Same, assuming singleline.
+ EOS no Match "" at end of string.
+ EOL no Match "" at end of line.
+ MEOL no Same, assuming multiline.
+ SEOL no Same, assuming singleline.
+ BOUND no Match "" at any word boundary
+ using native charset semantics
+ for non-utf8
+ BOUNDL no Match "" at any locale word
+ boundary
+ BOUNDU no Match "" at any word boundary
+ using Unicode semantics
+ BOUNDA no Match "" at any word boundary
+ using ASCII semantics
+ NBOUND no Match "" at any word non-
+ boundary using native charset
+ semantics for non-utf8
+ NBOUNDL no Match "" at any locale word non-
+ boundary
+ NBOUNDU no Match "" at any word non-
+ boundary using Unicode
+ semantics
+ NBOUNDA no Match "" at any word non-
+ boundary using ASCII semantics
+ GPOS no Matches where last m//g left
+ off.
# [Special] alternatives:
- REG_ANY no Match any one character (except newline).
- SANY no Match any one character.
- CANY no Match any one byte.
- ANYOF sv Match character in (or not in) this class,
- single char match only
-
- POSIXD none Some [[:class:]] under /d; the FLAGS field
- gives which one
- POSIXL none Some [[:class:]] under /l; the FLAGS field
- gives which one
- POSIXU none Some [[:class:]] under /u; the FLAGS field
- gives which one
- POSIXA none Some [[:class:]] under /a; the FLAGS field
- gives which one
- NPOSIXD none complement of POSIXD, [[:^class:]]
- NPOSIXL none complement of POSIXL, [[:^class:]]
- NPOSIXU none complement of POSIXU, [[:^class:]]
- NPOSIXA none complement of POSIXA, [[:^class:]]
-
- CLUMP no Match any extended grapheme cluster sequence
+ REG_ANY no Match any one character (except
+ newline).
+ SANY no Match any one character.
+ CANY no Match any one byte.
+ ANYOF sv Match character in (or not in)
+ this class, single char match
+ only
+ ANYOF_NON_UTF8_NON_ASCII_ALL sv like ANYOF, also matches any
+ U+80 - U+FF when not in UTF-8
+
+ POSIXD none Some [[:class:]] under /d; the
+ FLAGS field gives which one
+ POSIXL none Some [[:class:]] under /l; the
+ FLAGS field gives which one
+ POSIXU none Some [[:class:]] under /u; the
+ FLAGS field gives which one
+ POSIXA none Some [[:class:]] under /a; the
+ FLAGS field gives which one
+ NPOSIXD none complement of POSIXD,
+ [[:^class:]]
+ NPOSIXL none complement of POSIXL,
+ [[:^class:]]
+ NPOSIXU none complement of POSIXU,
+ [[:^class:]]
+ NPOSIXA none complement of POSIXA,
+ [[:^class:]]
+
+ CLUMP no Match any extended grapheme
+ cluster sequence
# Alternation
@@ -618,40 +634,49 @@ will be lost.
# pointer of each individual branch points; each branch
# starts with the operand node of a BRANCH node.
#
- BRANCH node Match this alternative, or the next...
+ BRANCH node Match this alternative, or the
+ next...
# Back pointer
# BACK Normal "next" pointers all implicitly point forward;
# BACK exists to make loop structures possible.
# not used
- BACK no Match "", "next" ptr points backward.
+ BACK no Match "", "next" ptr points
+ backward.
# Literals
- EXACT str Match this string (preceded by length).
- EXACTF str Match this non-UTF-8 string (not guaranteed
- to be folded) using /id rules (w/len).
- EXACTFL str Match this string (not guaranteed to be
- folded) using /il rules (w/len).
- EXACTFU str Match this string (folded iff in UTF-8,
- length in folding doesn't change if not in
- UTF-8) using /iu rules (w/len).
- EXACTFA str Match this string (not guaranteed to be
- folded) using /iaa rules (w/len).
- EXACTFU_SS str Match this string (folded iff in UTF-8,
- length in folding may change even if not in
- UTF-8) using /iu rules (w/len).
- EXACTFA_NO_TRIE str Match this string (which is not trie-able;
- not guaranteed to be folded) using /iaa
- rules (w/len).
+ EXACT str Match this string (preceded by
+ length).
+ EXACTF str Match this non-UTF-8 string
+ (not guaranteed to be folded)
+ using /id rules (w/len).
+ EXACTFL str Match this string (not
+ guaranteed to be folded) using
+ /il rules (w/len).
+ EXACTFU str Match this string (folded iff
+ in UTF-8, length in folding
+ doesn't change if not in UTF-8)
+ using /iu rules (w/len).
+ EXACTFA str Match this string (not
+ guaranteed to be folded) using
+ /iaa rules (w/len).
+ EXACTFU_SS str Match this string (folded iff
+ in UTF-8, length in folding may
+ change even if not in UTF-8)
+ using /iu rules (w/len).
+ EXACTFA_NO_TRIE str Match this string (which is not
+ trie-able; not guaranteed to be
+ folded) using /iaa rules
+ (w/len).
# Do nothing types
- NOTHING no Match empty string.
+ NOTHING no Match empty string.
# A variant of above which delimits a group, thus stops optimizations
- TAIL no Match empty string. Can jump here from
- outside.
+ TAIL no Match empty string. Can jump
+ here from outside.
# Loops
@@ -660,68 +685,89 @@ will be lost.
# (one character per match) are implemented with STAR
# and PLUS for speed and to minimize recursive plunges.
#
- STAR node Match this (simple) thing 0 or more times.
- PLUS node Match this (simple) thing 1 or more times.
-
- CURLY sv 2 Match this simple thing {n,m} times.
- CURLYN no 2 Capture next-after-this simple thing
- CURLYM no 2 Capture this medium-complex thing {n,m}
- times.
- CURLYX sv 2 Match this complex thing {n,m} times.
+ STAR node Match this (simple) thing 0 or
+ more times.
+ PLUS node Match this (simple) thing 1 or
+ more times.
+
+ CURLY sv 2 Match this simple thing {n,m}
+ times.
+ CURLYN no 2 Capture next-after-this simple
+ thing
+ CURLYM no 2 Capture this medium-complex
+ thing {n,m} times.
+ CURLYX sv 2 Match this complex thing {n,m}
+ times.
# This terminator creates a loop structure for CURLYX
- WHILEM no Do curly processing and see if rest matches.
+ WHILEM no Do curly processing and see if
+ rest matches.
# Buffer related
# OPEN,CLOSE,GROUPP ...are numbered at compile time.
- OPEN num 1 Mark this point in input as start of #n.
- CLOSE num 1 Analogous to OPEN.
-
- REF num 1 Match some already matched string
- REFF num 1 Match already matched string, folded using
- native charset semantics for non-utf8
- REFFL num 1 Match already matched string, folded in loc.
- REFFU num 1 Match already matched string, folded using
- unicode semantics for non-utf8
- REFFA num 1 Match already matched string, folded using
- unicode semantics for non-utf8, no mixing
- ASCII, non-ASCII
+ OPEN num 1 Mark this point in input as
+ start of #n.
+ CLOSE num 1 Analogous to OPEN.
+
+ REF num 1 Match some already matched
+ string
+ REFF num 1 Match already matched string,
+ folded using native charset
+ semantics for non-utf8
+ REFFL num 1 Match already matched string,
+ folded in loc.
+ REFFU num 1 Match already matched string,
+ folded using unicode semantics
+ for non-utf8
+ REFFA num 1 Match already matched string,
+ folded using unicode semantics
+ for non-utf8, no mixing ASCII,
+ non-ASCII
# Named references. Code in regcomp.c assumes that these all are after
# the numbered references
- NREF no-sv 1 Match some already matched string
- NREFF no-sv 1 Match already matched string, folded using
- native charset semantics for non-utf8
- NREFFL no-sv 1 Match already matched string, folded in loc.
- NREFFU num 1 Match already matched string, folded using
- unicode semantics for non-utf8
- NREFFA num 1 Match already matched string, folded using
- unicode semantics for non-utf8, no mixing
- ASCII, non-ASCII
-
- IFMATCH off 1 2 Succeeds if the following matches.
- UNLESSM off 1 2 Fails if the following matches.
- SUSPEND off 1 1 "Independent" sub-RE.
- IFTHEN off 1 1 Switch, should be preceded by switcher.
- GROUPP num 1 Whether the group matched.
+ NREF no-sv 1 Match some already matched
+ string
+ NREFF no-sv 1 Match already matched string,
+ folded using native charset
+ semantics for non-utf8
+ NREFFL no-sv 1 Match already matched string,
+ folded in loc.
+ NREFFU num 1 Match already matched string,
+ folded using unicode semantics
+ for non-utf8
+ NREFFA num 1 Match already matched string,
+ folded using unicode semantics
+ for non-utf8, no mixing ASCII,
+ non-ASCII
+
+ IFMATCH off 1 2 Succeeds if the following
+ matches.
+ UNLESSM off 1 2 Fails if the following matches.
+ SUSPEND off 1 1 "Independent" sub-RE.
+ IFTHEN off 1 1 Switch, should be preceded by
+ switcher.
+ GROUPP num 1 Whether the group matched.
# Support for long RE
- LONGJMP off 1 1 Jump far away.
- BRANCHJ off 1 1 BRANCH with long offset.
+ LONGJMP off 1 1 Jump far away.
+ BRANCHJ off 1 1 BRANCH with long offset.
# The heavy worker
- EVAL evl 1 Execute some Perl code.
+ EVAL evl 1 Execute some Perl code.
# Modifiers
- MINMOD no Next operator is not greedy.
- LOGICAL no Next opcode should set the flag only.
+ MINMOD no Next operator is not greedy.
+ LOGICAL no Next opcode should set the flag
+ only.
# This is not used yet
- RENUM off 1 1 Group with independently numbered parens.
+ RENUM off 1 1 Group with independently
+ numbered parens.
# Trie Related
@@ -729,60 +775,67 @@ will be lost.
# have inline charclass data (ascii only), the 'C' store it in the
# structure.
- TRIE trie 1 Match many EXACT(F[ALU]?)? at once.
- flags==type
- TRIEC trie Same as TRIE, but with embedded charclass
- charclass data
+ TRIE trie 1 Match many EXACT(F[ALU]?)? at
+ once. flags==type
+ TRIEC trie Same as TRIE, but with embedded
+ charclass charclass data
- AHOCORASICK trie 1 Aho Corasick stclass. flags==type
- AHOCORASICKC trie Same as AHOCORASICK, but with embedded
- charclass charclass data
+ AHOCORASICK trie 1 Aho Corasick stclass.
+ flags==type
+ AHOCORASICKC trie Same as AHOCORASICK, but with
+ charclass embedded charclass data
# Regex Subroutines
- GOSUB num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
- GOSTART no recurse to start of pattern
+ GOSUB num/ofs 2L recurse to paren arg1 at
+ (signed) ofs arg2
+ GOSTART no recurse to start of pattern
# Special conditionals
- NGROUPP no-sv 1 Whether the group matched.
- INSUBP num 1 Whether we are in a specific recurse.
- DEFINEP none 1 Never execute directly.
+ NGROUPP no-sv 1 Whether the group matched.
+ INSUBP num 1 Whether we are in a specific
+ recurse.
+ DEFINEP none 1 Never execute directly.
# Backtracking Verbs
- ENDLIKE none Used only for the type field of verbs
- OPFAIL none Same as (?!)
- ACCEPT parno 1 Accepts the current matched string.
+ ENDLIKE none Used only for the type field of
+ verbs
+ OPFAIL none Same as (?!)
+ ACCEPT parno 1 Accepts the current matched
+ string.
# Verbs With Arguments
- VERB no-sv 1 Used only for the type field of verbs
- PRUNE no-sv 1 Pattern fails at this startpoint if no-
- backtracking through this
- MARKPOINT no-sv 1 Push the current location for rollback by
- cut.
- SKIP no-sv 1 On failure skip forward (to the mark) before
- retrying
- COMMIT no-sv 1 Pattern fails outright if backtracking
- through this
- CUTGROUP no-sv 1 On failure go to the next alternation in the
- group
+ VERB no-sv 1 Used only for the type field of
+ verbs
+ PRUNE no-sv 1 Pattern fails at this
+ startpoint if no-backtracking
+ through this
+ MARKPOINT no-sv 1 Push the current location for
+ rollback by cut.
+ SKIP no-sv 1 On failure skip forward (to the
+ mark) before retrying
+ COMMIT no-sv 1 Pattern fails outright if
+ backtracking through this
+ CUTGROUP no-sv 1 On failure go to the next
+ alternation in the group
# Control what to keep in $&.
- KEEPS no $& begins here.
+ KEEPS no $& begins here.
# New charclass like patterns
- LNBREAK none generic newline pattern
+ LNBREAK none generic newline pattern
# SPECIAL REGOPS
# This is not really a node, but an optimized away piece of a "long"
# node. To simplify debugging output, we mark it as if it were a node
- OPTIMIZED off Placeholder for dump.
+ OPTIMIZED off Placeholder for dump.
# Special opcode with the property that no opcode in a compiled program
# will ever be of this type. Thus it can be used as a flag value that
# no other opcode has been seen. END is used similarly, in that an END
# node can't be optimized. So END implies "unoptimizable" and PSEUDO
# means "not seen anything to optimize yet".
- PSEUDO off Pseudo opcode for internal use.
+ PSEUDO off Pseudo opcode for internal use.
=for regcomp.pl end
diff --git a/regcomp.c b/regcomp.c
index a82171a9b2..96bf77528e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1068,7 +1068,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
/* If this can match all upper Latin1 code points, have to add them
* as well */
- if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
+ if (OP(node) == ANYOF_NON_UTF8_NON_ASCII_ALL) {
_invlist_union(invlist, PL_UpperLatin1, &invlist);
}
@@ -4840,6 +4840,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
}
break;
+ case ANYOF_NON_UTF8_NON_ASCII_ALL:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
@@ -14471,7 +14472,7 @@ parseit:
if (DEPENDS_SEMANTICS) {
/* Under /d, everything in the upper half of the Latin1 range
* matches these complements */
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
+ OP(ret) = ANYOF_NON_UTF8_NON_ASCII_ALL;
}
else if (AT_LEAST_ASCII_RESTRICTED) {
/* Under /a and /aa, everything above ASCII matches these
@@ -15657,7 +15658,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
sv_catpvs(sv, "^");
}
- if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
+ if (OP(o) == ANYOF_NON_UTF8_NON_ASCII_ALL) {
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
diff --git a/regcomp.h b/regcomp.h
index 84aa85d0f2..69bd852e31 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -358,17 +358,15 @@ struct regnode_ssc {
* ANYOF_NONBITMAP_NON_UTF8 bit is also set. */
#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY)
-/* Flags for node->flags of ANYOF. These are in short supply, with none
- * currently available. If more are needed, the ANYOF_LOCALE and
- * ANYOF_POSIXL bits could be shared, making a space penalty for all locale
- * nodes. Also, the ABOVE_LATIN1_ALL bit could be freed up by resorting to
- * creating a swash containing everything above 255. This introduces a
- * performance penalty. Better would be to split it off into a separate node,
- * which actually would improve performance a bit by allowing regexec.c to test
- * for a UTF-8 character being above 255 without having to call a function nor
- * calculate its code point value. Several flags are not used in synthetic
- * start class (SSC) nodes, so could be shared should new flags be needed for
- * SSCs. */
+/* Flags for node->flags of ANYOF. These are in short supply, with one
+ * currently available. If more are needed, the ABOVE_LATIN1_ALL bit could be
+ * freed up by resorting to creating a swash containing everything above 255.
+ * This introduces a performance penalty. An option that wouldn't slow things
+ * down would be to split one of the LOC flags out into a separate node, like
+ * what has been done with ANYOF_NON_UTF8_NON_ASCII_ALL. One of these is only
+ * for /l nodes; the other only for /d, so there are no combinatorial issues.
+ * Several flags are not used in synthetic start class (SSC) nodes, so could be
+ * shared should new flags be needed for SSCs, like ANYOF_EMPTY_STRING now. */
/* regexec.c is expecting this to be in the low bit */
#define ANYOF_INVERT 0x01
@@ -406,11 +404,7 @@ struct regnode_ssc {
#define ANYOF_ABOVE_LATIN1_ALL 0x40
#define ANYOF_UNICODE_ALL ANYOF_ABOVE_LATIN1_ALL
-/* Match all Latin1 characters that aren't ASCII when the target string is not
- * in utf8. */
-#define ANYOF_NON_UTF8_NON_ASCII_ALL 0x80
-
-#define ANYOF_FLAGS_ALL (0xff)
+#define ANYOF_FLAGS_ALL (0x7F)
#define ANYOF_LOCALE_FLAGS (ANYOF_LOCALE \
|ANYOF_LOC_FOLD \
diff --git a/regcomp.sym b/regcomp.sym
index a1981862cc..4764d0e138 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -55,6 +55,7 @@ REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).
SANY REG_ANY, no 0 S ; Match any one character.
CANY REG_ANY, no 0 S ; Match any one byte.
ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only
+ANYOF_NON_UTF8_NON_ASCII_ALL ANYOF, sv 0 S ; like ANYOF, also matches any U+80 - U+FF when not in UTF-8
# Order of the below is important. See ordering comment above.
POSIXD POSIXD, none 0 S ; Some [[:class:]] under /d; the FLAGS field gives which one
diff --git a/regexec.c b/regexec.c
index a2928ce0db..235a3fb914 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1619,6 +1619,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
/* We know what class it must start with. */
switch (OP(c)) {
+ case ANYOF_NON_UTF8_NON_ASCII_ALL:
+ if (! utf8_target && ! ANYOF_FLAGS(c)) {
+ REXEC_FBC_CLASS_SCAN(! isASCII((U8) *s)
+ || REGINCLASS(prog, c, (U8*)s));
+ break;
+ }
+
+ /* FALL THROUGH */
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
@@ -4576,6 +4584,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
sayNO;
break;
+ case ANYOF_NON_UTF8_NON_ASCII_ALL:
+ if (! NEXTCHR_IS_EOS && ! utf8_target && ! ANYOF_FLAGS(scan)) {
+ if ((isASCII((U8)(*locinput))
+ && ! REGINCLASS(rex, scan, (U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput++;
+ break;
+ }
+ /* FALLTHROUGH */
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
@@ -7203,6 +7222,17 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
break;
}
+ case ANYOF_NON_UTF8_NON_ASCII_ALL:
+ if (! utf8_target && ! ANYOF_FLAGS(p)) {
+ while (scan < loceol
+ && (! isASCII((U8) *scan)
+ || REGINCLASS(prog, p, (U8*)scan)))
+ {
+ scan++;
+ }
+ break;
+ }
+ /* FALLTHROUGH */
case ANYOF:
if (utf8_target) {
while (hardcount < max
@@ -7646,7 +7676,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
if (c < 256) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
+ else if (OP(n) == ANYOF_NON_UTF8_NON_ASCII_ALL
&& ! utf8_target
&& ! isASCII(c))
{
diff --git a/regnodes.h b/regnodes.h
index f9d4fc05dd..f9de6c39a9 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 93
-#define REGMATCH_STATE_MAX 133
+#define REGNODE_MAX 94
+#define REGMATCH_STATE_MAX 134
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -31,78 +31,79 @@
#define SANY 19 /* 0x13 Match any one character. */
#define CANY 20 /* 0x14 Match any one byte. */
#define ANYOF 21 /* 0x15 Match character in (or not in) this class, single char match only */
-#define POSIXD 22 /* 0x16 Some [[:class:]] under /d; the FLAGS field gives which one */
-#define POSIXL 23 /* 0x17 Some [[:class:]] under /l; the FLAGS field gives which one */
-#define POSIXU 24 /* 0x18 Some [[:class:]] under /u; the FLAGS field gives which one */
-#define POSIXA 25 /* 0x19 Some [[:class:]] under /a; the FLAGS field gives which one */
-#define NPOSIXD 26 /* 0x1a complement of POSIXD, [[:^class:]] */
-#define NPOSIXL 27 /* 0x1b complement of POSIXL, [[:^class:]] */
-#define NPOSIXU 28 /* 0x1c complement of POSIXU, [[:^class:]] */
-#define NPOSIXA 29 /* 0x1d complement of POSIXA, [[:^class:]] */
-#define CLUMP 30 /* 0x1e Match any extended grapheme cluster sequence */
-#define BRANCH 31 /* 0x1f Match this alternative, or the next... */
-#define BACK 32 /* 0x20 Match "", "next" ptr points backward. */
-#define EXACT 33 /* 0x21 Match this string (preceded by length). */
-#define EXACTF 34 /* 0x22 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
-#define EXACTFL 35 /* 0x23 Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define EXACTFU 36 /* 0x24 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFA 37 /* 0x25 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define EXACTFU_SS 38 /* 0x26 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFA_NO_TRIE 39 /* 0x27 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */
-#define NOTHING 40 /* 0x28 Match empty string. */
-#define TAIL 41 /* 0x29 Match empty string. Can jump here from outside. */
-#define STAR 42 /* 0x2a Match this (simple) thing 0 or more times. */
-#define PLUS 43 /* 0x2b Match this (simple) thing 1 or more times. */
-#define CURLY 44 /* 0x2c Match this simple thing {n,m} times. */
-#define CURLYN 45 /* 0x2d Capture next-after-this simple thing */
-#define CURLYM 46 /* 0x2e Capture this medium-complex thing {n,m} times. */
-#define CURLYX 47 /* 0x2f Match this complex thing {n,m} times. */
-#define WHILEM 48 /* 0x30 Do curly processing and see if rest matches. */
-#define OPEN 49 /* 0x31 Mark this point in input as start of #n. */
-#define CLOSE 50 /* 0x32 Analogous to OPEN. */
-#define REF 51 /* 0x33 Match some already matched string */
-#define REFF 52 /* 0x34 Match already matched string, folded using native charset semantics for non-utf8 */
-#define REFFL 53 /* 0x35 Match already matched string, folded in loc. */
-#define REFFU 54 /* 0x36 Match already matched string, folded using unicode semantics for non-utf8 */
-#define REFFA 55 /* 0x37 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define NREF 56 /* 0x38 Match some already matched string */
-#define NREFF 57 /* 0x39 Match already matched string, folded using native charset semantics for non-utf8 */
-#define NREFFL 58 /* 0x3a Match already matched string, folded in loc. */
-#define NREFFU 59 /* 0x3b Match already matched string, folded using unicode semantics for non-utf8 */
-#define NREFFA 60 /* 0x3c Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define IFMATCH 61 /* 0x3d Succeeds if the following matches. */
-#define UNLESSM 62 /* 0x3e Fails if the following matches. */
-#define SUSPEND 63 /* 0x3f "Independent" sub-RE. */
-#define IFTHEN 64 /* 0x40 Switch, should be preceded by switcher. */
-#define GROUPP 65 /* 0x41 Whether the group matched. */
-#define LONGJMP 66 /* 0x42 Jump far away. */
-#define BRANCHJ 67 /* 0x43 BRANCH with long offset. */
-#define EVAL 68 /* 0x44 Execute some Perl code. */
-#define MINMOD 69 /* 0x45 Next operator is not greedy. */
-#define LOGICAL 70 /* 0x46 Next opcode should set the flag only. */
-#define RENUM 71 /* 0x47 Group with independently numbered parens. */
-#define TRIE 72 /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define TRIEC 73 /* 0x49 Same as TRIE, but with embedded charclass data */
-#define AHOCORASICK 74 /* 0x4a Aho Corasick stclass. flags==type */
-#define AHOCORASICKC 75 /* 0x4b Same as AHOCORASICK, but with embedded charclass data */
-#define GOSUB 76 /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */
-#define GOSTART 77 /* 0x4d recurse to start of pattern */
-#define NGROUPP 78 /* 0x4e Whether the group matched. */
-#define INSUBP 79 /* 0x4f Whether we are in a specific recurse. */
-#define DEFINEP 80 /* 0x50 Never execute directly. */
-#define ENDLIKE 81 /* 0x51 Used only for the type field of verbs */
-#define OPFAIL 82 /* 0x52 Same as (?!) */
-#define ACCEPT 83 /* 0x53 Accepts the current matched string. */
-#define VERB 84 /* 0x54 Used only for the type field of verbs */
-#define PRUNE 85 /* 0x55 Pattern fails at this startpoint if no-backtracking through this */
-#define MARKPOINT 86 /* 0x56 Push the current location for rollback by cut. */
-#define SKIP 87 /* 0x57 On failure skip forward (to the mark) before retrying */
-#define COMMIT 88 /* 0x58 Pattern fails outright if backtracking through this */
-#define CUTGROUP 89 /* 0x59 On failure go to the next alternation in the group */
-#define KEEPS 90 /* 0x5a $& begins here. */
-#define LNBREAK 91 /* 0x5b generic newline pattern */
-#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */
-#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. */
+#define ANYOF_NON_UTF8_NON_ASCII_ALL 22 /* 0x16 like ANYOF, also matches any U+80 - U+FF when not in UTF-8 */
+#define POSIXD 23 /* 0x17 Some [[:class:]] under /d; the FLAGS field gives which one */
+#define POSIXL 24 /* 0x18 Some [[:class:]] under /l; the FLAGS field gives which one */
+#define POSIXU 25 /* 0x19 Some [[:class:]] under /u; the FLAGS field gives which one */
+#define POSIXA 26 /* 0x1a Some [[:class:]] under /a; the FLAGS field gives which one */
+#define NPOSIXD 27 /* 0x1b complement of POSIXD, [[:^class:]] */
+#define NPOSIXL 28 /* 0x1c complement of POSIXL, [[:^class:]] */
+#define NPOSIXU 29 /* 0x1d complement of POSIXU, [[:^class:]] */
+#define NPOSIXA 30 /* 0x1e complement of POSIXA, [[:^class:]] */
+#define CLUMP 31 /* 0x1f Match any extended grapheme cluster sequence */
+#define BRANCH 32 /* 0x20 Match this alternative, or the next... */
+#define BACK 33 /* 0x21 Match "", "next" ptr points backward. */
+#define EXACT 34 /* 0x22 Match this string (preceded by length). */
+#define EXACTF 35 /* 0x23 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
+#define EXACTFL 36 /* 0x24 Match this string (not guaranteed to be folded) using /il rules (w/len). */
+#define EXACTFU 37 /* 0x25 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFA 38 /* 0x26 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define EXACTFU_SS 39 /* 0x27 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFA_NO_TRIE 40 /* 0x28 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */
+#define NOTHING 41 /* 0x29 Match empty string. */
+#define TAIL 42 /* 0x2a Match empty string. Can jump here from outside. */
+#define STAR 43 /* 0x2b Match this (simple) thing 0 or more times. */
+#define PLUS 44 /* 0x2c Match this (simple) thing 1 or more times. */
+#define CURLY 45 /* 0x2d Match this simple thing {n,m} times. */
+#define CURLYN 46 /* 0x2e Capture next-after-this simple thing */
+#define CURLYM 47 /* 0x2f Capture this medium-complex thing {n,m} times. */
+#define CURLYX 48 /* 0x30 Match this complex thing {n,m} times. */
+#define WHILEM 49 /* 0x31 Do curly processing and see if rest matches. */
+#define OPEN 50 /* 0x32 Mark this point in input as start of #n. */
+#define CLOSE 51 /* 0x33 Analogous to OPEN. */
+#define REF 52 /* 0x34 Match some already matched string */
+#define REFF 53 /* 0x35 Match already matched string, folded using native charset semantics for non-utf8 */
+#define REFFL 54 /* 0x36 Match already matched string, folded in loc. */
+#define REFFU 55 /* 0x37 Match already matched string, folded using unicode semantics for non-utf8 */
+#define REFFA 56 /* 0x38 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define NREF 57 /* 0x39 Match some already matched string */
+#define NREFF 58 /* 0x3a Match already matched string, folded using native charset semantics for non-utf8 */
+#define NREFFL 59 /* 0x3b Match already matched string, folded in loc. */
+#define NREFFU 60 /* 0x3c Match already matched string, folded using unicode semantics for non-utf8 */
+#define NREFFA 61 /* 0x3d Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define IFMATCH 62 /* 0x3e Succeeds if the following matches. */
+#define UNLESSM 63 /* 0x3f Fails if the following matches. */
+#define SUSPEND 64 /* 0x40 "Independent" sub-RE. */
+#define IFTHEN 65 /* 0x41 Switch, should be preceded by switcher. */
+#define GROUPP 66 /* 0x42 Whether the group matched. */
+#define LONGJMP 67 /* 0x43 Jump far away. */
+#define BRANCHJ 68 /* 0x44 BRANCH with long offset. */
+#define EVAL 69 /* 0x45 Execute some Perl code. */
+#define MINMOD 70 /* 0x46 Next operator is not greedy. */
+#define LOGICAL 71 /* 0x47 Next opcode should set the flag only. */
+#define RENUM 72 /* 0x48 Group with independently numbered parens. */
+#define TRIE 73 /* 0x49 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define TRIEC 74 /* 0x4a Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 75 /* 0x4b Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 76 /* 0x4c Same as AHOCORASICK, but with embedded charclass data */
+#define GOSUB 77 /* 0x4d recurse to paren arg1 at (signed) ofs arg2 */
+#define GOSTART 78 /* 0x4e recurse to start of pattern */
+#define NGROUPP 79 /* 0x4f Whether the group matched. */
+#define INSUBP 80 /* 0x50 Whether we are in a specific recurse. */
+#define DEFINEP 81 /* 0x51 Never execute directly. */
+#define ENDLIKE 82 /* 0x52 Used only for the type field of verbs */
+#define OPFAIL 83 /* 0x53 Same as (?!) */
+#define ACCEPT 84 /* 0x54 Accepts the current matched string. */
+#define VERB 85 /* 0x55 Used only for the type field of verbs */
+#define PRUNE 86 /* 0x56 Pattern fails at this startpoint if no-backtracking through this */
+#define MARKPOINT 87 /* 0x57 Push the current location for rollback by cut. */
+#define SKIP 88 /* 0x58 On failure skip forward (to the mark) before retrying */
+#define COMMIT 89 /* 0x59 Pattern fails outright if backtracking through this */
+#define CUTGROUP 90 /* 0x5a On failure go to the next alternation in the group */
+#define KEEPS 91 /* 0x5b $& begins here. */
+#define LNBREAK 92 /* 0x5c generic newline pattern */
+#define OPTIMIZED 93 /* 0x5d Placeholder for dump. */
+#define PSEUDO 94 /* 0x5e Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -173,6 +174,7 @@ EXTCONST U8 PL_regkind[] = {
REG_ANY, /* SANY */
REG_ANY, /* CANY */
ANYOF, /* ANYOF */
+ ANYOF, /* ANYOF_NON_UTF8_NON_ASCII_ALL */
POSIXD, /* POSIXD */
POSIXD, /* POSIXL */
POSIXD, /* POSIXU */
@@ -315,6 +317,7 @@ static const U8 regarglen[] = {
0, /* SANY */
0, /* CANY */
0, /* ANYOF */
+ 0, /* ANYOF_NON_UTF8_NON_ASCII_ALL */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -414,6 +417,7 @@ static const char reg_off_by_arg[] = {
0, /* SANY */
0, /* CANY */
0, /* ANYOF */
+ 0, /* ANYOF_NON_UTF8_NON_ASCII_ALL */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -518,78 +522,79 @@ EXTCONST char * const PL_reg_name[] = {
"SANY", /* 0x13 */
"CANY", /* 0x14 */
"ANYOF", /* 0x15 */
- "POSIXD", /* 0x16 */
- "POSIXL", /* 0x17 */
- "POSIXU", /* 0x18 */
- "POSIXA", /* 0x19 */
- "NPOSIXD", /* 0x1a */
- "NPOSIXL", /* 0x1b */
- "NPOSIXU", /* 0x1c */
- "NPOSIXA", /* 0x1d */
- "CLUMP", /* 0x1e */
- "BRANCH", /* 0x1f */
- "BACK", /* 0x20 */
- "EXACT", /* 0x21 */
- "EXACTF", /* 0x22 */
- "EXACTFL", /* 0x23 */
- "EXACTFU", /* 0x24 */
- "EXACTFA", /* 0x25 */
- "EXACTFU_SS", /* 0x26 */
- "EXACTFA_NO_TRIE", /* 0x27 */
- "NOTHING", /* 0x28 */
- "TAIL", /* 0x29 */
- "STAR", /* 0x2a */
- "PLUS", /* 0x2b */
- "CURLY", /* 0x2c */
- "CURLYN", /* 0x2d */
- "CURLYM", /* 0x2e */
- "CURLYX", /* 0x2f */
- "WHILEM", /* 0x30 */
- "OPEN", /* 0x31 */
- "CLOSE", /* 0x32 */
- "REF", /* 0x33 */
- "REFF", /* 0x34 */
- "REFFL", /* 0x35 */
- "REFFU", /* 0x36 */
- "REFFA", /* 0x37 */
- "NREF", /* 0x38 */
- "NREFF", /* 0x39 */
- "NREFFL", /* 0x3a */
- "NREFFU", /* 0x3b */
- "NREFFA", /* 0x3c */
- "IFMATCH", /* 0x3d */
- "UNLESSM", /* 0x3e */
- "SUSPEND", /* 0x3f */
- "IFTHEN", /* 0x40 */
- "GROUPP", /* 0x41 */
- "LONGJMP", /* 0x42 */
- "BRANCHJ", /* 0x43 */
- "EVAL", /* 0x44 */
- "MINMOD", /* 0x45 */
- "LOGICAL", /* 0x46 */
- "RENUM", /* 0x47 */
- "TRIE", /* 0x48 */
- "TRIEC", /* 0x49 */
- "AHOCORASICK", /* 0x4a */
- "AHOCORASICKC", /* 0x4b */
- "GOSUB", /* 0x4c */
- "GOSTART", /* 0x4d */
- "NGROUPP", /* 0x4e */
- "INSUBP", /* 0x4f */
- "DEFINEP", /* 0x50 */
- "ENDLIKE", /* 0x51 */
- "OPFAIL", /* 0x52 */
- "ACCEPT", /* 0x53 */
- "VERB", /* 0x54 */
- "PRUNE", /* 0x55 */
- "MARKPOINT", /* 0x56 */
- "SKIP", /* 0x57 */
- "COMMIT", /* 0x58 */
- "CUTGROUP", /* 0x59 */
- "KEEPS", /* 0x5a */
- "LNBREAK", /* 0x5b */
- "OPTIMIZED", /* 0x5c */
- "PSEUDO", /* 0x5d */
+ "ANYOF_NON_UTF8_NON_ASCII_ALL", /* 0x16 */
+ "POSIXD", /* 0x17 */
+ "POSIXL", /* 0x18 */
+ "POSIXU", /* 0x19 */
+ "POSIXA", /* 0x1a */
+ "NPOSIXD", /* 0x1b */
+ "NPOSIXL", /* 0x1c */
+ "NPOSIXU", /* 0x1d */
+ "NPOSIXA", /* 0x1e */
+ "CLUMP", /* 0x1f */
+ "BRANCH", /* 0x20 */
+ "BACK", /* 0x21 */
+ "EXACT", /* 0x22 */
+ "EXACTF", /* 0x23 */
+ "EXACTFL", /* 0x24 */
+ "EXACTFU", /* 0x25 */
+ "EXACTFA", /* 0x26 */
+ "EXACTFU_SS", /* 0x27 */
+ "EXACTFA_NO_TRIE", /* 0x28 */
+ "NOTHING", /* 0x29 */
+ "TAIL", /* 0x2a */
+ "STAR", /* 0x2b */
+ "PLUS", /* 0x2c */
+ "CURLY", /* 0x2d */
+ "CURLYN", /* 0x2e */
+ "CURLYM", /* 0x2f */
+ "CURLYX", /* 0x30 */
+ "WHILEM", /* 0x31 */
+ "OPEN", /* 0x32 */
+ "CLOSE", /* 0x33 */
+ "REF", /* 0x34 */
+ "REFF", /* 0x35 */
+ "REFFL", /* 0x36 */
+ "REFFU", /* 0x37 */
+ "REFFA", /* 0x38 */
+ "NREF", /* 0x39 */
+ "NREFF", /* 0x3a */
+ "NREFFL", /* 0x3b */
+ "NREFFU", /* 0x3c */
+ "NREFFA", /* 0x3d */
+ "IFMATCH", /* 0x3e */
+ "UNLESSM", /* 0x3f */
+ "SUSPEND", /* 0x40 */
+ "IFTHEN", /* 0x41 */
+ "GROUPP", /* 0x42 */
+ "LONGJMP", /* 0x43 */
+ "BRANCHJ", /* 0x44 */
+ "EVAL", /* 0x45 */
+ "MINMOD", /* 0x46 */
+ "LOGICAL", /* 0x47 */
+ "RENUM", /* 0x48 */
+ "TRIE", /* 0x49 */
+ "TRIEC", /* 0x4a */
+ "AHOCORASICK", /* 0x4b */
+ "AHOCORASICKC", /* 0x4c */
+ "GOSUB", /* 0x4d */
+ "GOSTART", /* 0x4e */
+ "NGROUPP", /* 0x4f */
+ "INSUBP", /* 0x50 */
+ "DEFINEP", /* 0x51 */
+ "ENDLIKE", /* 0x52 */
+ "OPFAIL", /* 0x53 */
+ "ACCEPT", /* 0x54 */
+ "VERB", /* 0x55 */
+ "PRUNE", /* 0x56 */
+ "MARKPOINT", /* 0x57 */
+ "SKIP", /* 0x58 */
+ "COMMIT", /* 0x59 */
+ "CUTGROUP", /* 0x5a */
+ "KEEPS", /* 0x5b */
+ "LNBREAK", /* 0x5c */
+ "OPTIMIZED", /* 0x5d */
+ "PSEUDO", /* 0x5e */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -717,7 +722,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
EXTCONST U8 PL_varies_bitmask[];
#else
EXTCONST U8 PL_varies_bitmask[] = {
- 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0x00, 0x80, 0x03, 0xF8, 0xF3, 0x3F, 0x13, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
@@ -729,8 +734,8 @@ EXTCONST U8 PL_varies_bitmask[] = {
EXTCONST U8 PL_simple[] __attribute__deprecated__;
#else
EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
- REG_ANY, SANY, CANY, ANYOF, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
- NPOSIXL, NPOSIXU, NPOSIXA,
+ REG_ANY, SANY, CANY, ANYOF, ANYOF_NON_UTF8_NON_ASCII_ALL, POSIXD,
+ POSIXL, POSIXU, POSIXA, NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA,
0
};
#endif /* DOINIT */
@@ -739,7 +744,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
EXTCONST U8 PL_simple_bitmask[];
#else
EXTCONST U8 PL_simple_bitmask[] = {
- 0x00, 0x00, 0xFC, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0xFC, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat
index c31cb382a6..75af59af0c 100644
--- a/t/porting/known_pod_issues.dat
+++ b/t/porting/known_pod_issues.dat
@@ -231,7 +231,7 @@ pod/perlbook.pod Verbatim line length including indents exceeds 79 by 1
pod/perlcall.pod Verbatim line length including indents exceeds 79 by 2
pod/perlce.pod Verbatim line length including indents exceeds 79 by 3
pod/perlcygwin.pod Verbatim line length including indents exceeds 79 by 25
-pod/perldebguts.pod Verbatim line length including indents exceeds 79 by 34
+pod/perldebguts.pod Verbatim line length including indents exceeds 79 by 35
pod/perldebtut.pod Verbatim line length including indents exceeds 79 by 22
pod/perldebug.pod Verbatim line length including indents exceeds 79 by 3
pod/perldelta.pod Apparent broken link 1