summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- embed.fnc 1
-rw-r--r-- embed.h 1
-rw-r--r-- handy.h 2
-rw-r--r-- proto.h 6
-rw-r--r-- regcomp.c 496
-rw-r--r-- regcomp.sym 54
-rw-r--r-- regexec.c 1200
-rw-r--r-- regnodes.h 443
8 files changed, 849 insertions, 1354 deletions
diff --git a/embed.fnc b/embed.fnc
index 5af5c97109..2a5b2b30fc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2028,6 +2028,7 @@ Es |U8 |regtail_study |NN struct RExC_state_t *pRExC_state \
#if defined(PERL_IN_REGEXEC_C)
ERs |bool |isFOO_lc |const U8 classnum|const U8 character
+ERs |bool |isFOO_utf8_lc |const U8 classnum|NN const U8* character
ERs |I32 |regmatch |NN regmatch_info *reginfo|NN char *startpos|NN regnode *prog
ERs |I32 |regrepeat |NN const regexp *prog|NN char **startposp|NN const regnode *p|I32 max|int depth
ERs |I32 |regtry |NN regmatch_info *reginfo|NN char **startposp
diff --git a/embed.h b/embed.h
index c1ca676374..786892d328 100644
--- a/embed.h
+++ b/embed.h
@@ -972,6 +972,7 @@
#define core_regclass_swash(a,b,c,d) S_core_regclass_swash(aTHX_ a,b,c,d)
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
#define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b)
+#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b)
#define reg_check_named_buff_matched(a,b) S_reg_check_named_buff_matched(aTHX_ a,b)
#define regcppop(a,b) S_regcppop(aTHX_ a,b)
#define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c)
diff --git a/handy.h b/handy.h
index aaeda4adb4..223324adc5 100644
--- a/handy.h
+++ b/handy.h
@@ -803,7 +803,7 @@ typedef enum {
#define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC
#define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1)
-#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C)
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
|| _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
|| _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8
diff --git a/proto.h b/proto.h
index d47e5de925..70b2dd4074 100644
--- a/proto.h
+++ b/proto.h
@@ -6799,6 +6799,12 @@ STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, cons
STATIC bool S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
__attribute__warn_unused_result__;
+STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
+ assert(character)
+
STATIC I32 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1)
diff --git a/regcomp.c b/regcomp.c
index c5bc8f413d..59e47106a2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2950,34 +2950,6 @@ typedef struct scan_frame {
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
-#define CASE_SYNST_FNC(nAmE) \
-case nAmE: \
- if (flags & SCF_DO_STCLASS_AND) { \
- for (value = 0; value < 256; value++) \
- if (!is_ ## nAmE ## _cp(value)) \
- ANYOF_BITMAP_CLEAR(data->start_class, value); \
- } \
- else { \
- for (value = 0; value < 256; value++) \
- if (is_ ## nAmE ## _cp(value)) \
- ANYOF_BITMAP_SET(data->start_class, value); \
- } \
- break; \
-case N ## nAmE: \
- if (flags & SCF_DO_STCLASS_AND) { \
- for (value = 0; value < 256; value++) \
- if (is_ ## nAmE ## _cp(value)) \
- ANYOF_BITMAP_CLEAR(data->start_class, value); \
- } \
- else { \
- for (value = 0; value < 256; value++) \
- if (!is_ ## nAmE ## _cp(value)) \
- ANYOF_BITMAP_SET(data->start_class, value); \
- } \
- break
-
-
-
STATIC I32
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 *minlenp, I32 *deltap,
@@ -4147,11 +4119,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
}
min++;
if (flags & SCF_DO_STCLASS) {
+ int loop_max = 256;
data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
/* Some of the logic below assumes that switching
locale on will only add false positives. */
switch (PL_regkind[OP(scan)]) {
+ U8 classnum;
+
case SANY:
default:
do_default:
@@ -4178,200 +4153,75 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
cl_or(pRExC_state, data->start_class,
(struct regnode_charclass_class*)scan);
break;
- case ALNUM:
+ case POSIXA:
+ loop_max = 128;
+ case POSIXL:
+ case POSIXD:
+ case POSIXU:
+ classnum = FLAGS(scan);
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
- if (OP(scan) == ALNUMU) {
- for (value = 0; value < 256; value++) {
- if (!isWORDCHAR_L1(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (!isALNUM(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
+ ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
+ for (value = 0; value < loop_max; value++) {
+ if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
}
}
}
}
else {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
+ if (data->start_class->flags & ANYOF_LOCALE) {
+ ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
+ }
+ else {
/* Even if under locale, set the bits for non-locale
* in case it isn't a true locale-node. This will
* create false positives if it truly is locale */
- if (OP(scan) == ALNUMU) {
- for (value = 0; value < 256; value++) {
- if (isWORDCHAR_L1(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (isALNUM(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ for (value = 0; value < loop_max; value++) {
+ if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
}
}
+ }
}
break;
- case NALNUM:
+ case NPOSIXA:
+ loop_max = 128;
+ case NPOSIXL:
+ case NPOSIXU:
+ case NPOSIXD:
+ classnum = FLAGS(scan);
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
- if (OP(scan) == NALNUMU) {
- for (value = 0; value < 256; value++) {
- if (isWORDCHAR_L1(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
+ ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
+ for (value = 0; value < loop_max; value++) {
+ if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
}
- } else {
- for (value = 0; value < 256; value++) {
- if (isALNUM(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- }
+ }
}
}
else {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
+ if (data->start_class->flags & ANYOF_LOCALE) {
+ ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
+ }
+ else {
/* Even if under locale, set the bits for non-locale in
* case it isn't a true locale-node. This will create
* false positives if it truly is locale */
- if (OP(scan) == NALNUMU) {
- for (value = 0; value < 256; value++) {
- if (! isWORDCHAR_L1(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (! isALNUM(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- }
- }
- break;
- case SPACE:
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- if (OP(scan) == SPACEU) {
- for (value = 0; value < 256; value++) {
- if (!isSPACE_L1(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (!isSPACE(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
+ for (value = 0; value < loop_max; value++) {
+ if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+ ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
}
- }
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE) {
- ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
}
- if (OP(scan) == SPACEU) {
- for (value = 0; value < 256; value++) {
- if (isSPACE_L1(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (isSPACE(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- }
- }
- break;
- case NSPACE:
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- if (OP(scan) == NSPACEU) {
- for (value = 0; value < 256; value++) {
- if (isSPACE_L1(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (isSPACE(value)) {
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- }
- }
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- if (OP(scan) == NSPACEU) {
- for (value = 0; value < 256; value++) {
- if (!isSPACE_L1(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
+ if (PL_regkind[OP(scan)] == NPOSIXD) {
+ data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
}
- else {
- for (value = 0; value < 256; value++) {
- if (!isSPACE(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
}
}
break;
- case DIGIT:
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
- for (value = 0; value < 256; value++)
- if (!isDIGIT(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
- for (value = 0; value < 256; value++)
- if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- break;
- case NDIGIT:
- if (flags & SCF_DO_STCLASS_AND) {
- if (!(data->start_class->flags & ANYOF_LOCALE))
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
- for (value = 0; value < 256; value++)
- if (isDIGIT(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
- }
- else {
- if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
- for (value = 0; value < 256; value++)
- if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- break;
- CASE_SYNST_FNC(VERTWS);
- CASE_SYNST_FNC(HORIZWS);
-
}
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
@@ -6440,7 +6290,7 @@ reStudy:
r->extflags |= RXf_NULL;
else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
r->extflags |= RXf_START_ONLY;
- else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+ else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
&& OP(regnext(first)) == END)
r->extflags |= RXf_WHITE;
}
@@ -9553,6 +9403,16 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = reg_node(pRExC_state, OPFAIL);
return ret;
}
+ else if (max == 0) {
+ if (SIZE_ONLY) {
+ RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
+ }
+ else {
+ RExC_emit = orig_emit;
+ }
+ ret = reg_node(pRExC_state, NOTHING);
+ return ret;
+ }
do_curly:
if ((flags&SIMPLE)) {
@@ -10120,9 +9980,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
I32 flags;
char *parse_start = RExC_parse;
U8 op;
+ *flagp = WORST; /* Tentatively. */
+
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("atom");
- *flagp = WORST; /* Tentatively. */
+ int invert = 0;
PERL_ARGS_ASSERT_REGATOM;
@@ -10218,6 +10080,7 @@ tryagain:
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
+ U8 arg;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
@@ -10258,22 +10121,14 @@ tryagain:
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
goto finish_meta_pat;
- case 'w':
- op = ALNUM + get_regex_charset(RExC_flags);
- if (op > ALNUMA) { /* /aa is same as /a */
- op = ALNUMA;
- }
- ret = reg_node(pRExC_state, op);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
+
case 'W':
- op = NALNUM + get_regex_charset(RExC_flags);
- if (op > NALNUMA) { /* /aa is same as /a */
- op = NALNUMA;
- }
- ret = reg_node(pRExC_state, op);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
+ invert = 1;
+ /* FALLTHROUGH */
+ case 'w':
+ arg = ANYOF_WORDCHAR;
+ goto join_posix;
+
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
@@ -10296,60 +10151,60 @@ tryagain:
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
goto finish_meta_pat;
+
+ case 'S':
+ invert = 1;
+ /* FALLTHROUGH */
case 's':
- op = SPACE + get_regex_charset(RExC_flags);
- if (op > SPACEA) { /* /aa is same as /a */
- op = SPACEA;
+ arg = ANYOF_SPACE;
+
+ join_posix:
+
+ op = POSIXD + get_regex_charset(RExC_flags);
+ if (op > POSIXA) { /* /aa is same as /a */
+ op = POSIXA;
}
- ret = reg_node(pRExC_state, op);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
- case 'S':
- op = NSPACE + get_regex_charset(RExC_flags);
- if (op > NSPACEA) { /* /aa is same as /a */
- op = NSPACEA;
+
+ join_posix_op_known:
+
+ if (invert) {
+ op += NPOSIXD - POSIXD;
}
ret = reg_node(pRExC_state, op);
+ if (! SIZE_ONLY) {
+ FLAGS(ret) = namedclass_to_classnum(arg);
+ }
+
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'D':
- op = NDIGIT;
- goto join_D_and_d;
+ invert = 1;
+ /* FALLTHROUGH */
case 'd':
- op = DIGIT;
- join_D_and_d:
- {
- U8 offset = get_regex_charset(RExC_flags);
- if (offset == REGEX_UNICODE_CHARSET) {
- offset = REGEX_DEPENDS_CHARSET;
- }
- else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
- offset = REGEX_ASCII_RESTRICTED_CHARSET;
- }
- op += offset;
- }
- ret = reg_node(pRExC_state, op);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
+ arg = ANYOF_DIGIT;
+ goto join_posix;
+
case 'R':
ret = reg_node(pRExC_state, LNBREAK);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
- case 'h':
- ret = reg_node(pRExC_state, HORIZWS);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
+
case 'H':
- ret = reg_node(pRExC_state, NHORIZWS);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
- case 'v':
- ret = reg_node(pRExC_state, VERTWS);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
+ invert = 1;
+ /* FALLTHROUGH */
+ case 'h':
+ arg = ANYOF_BLANK;
+ op = POSIXU;
+ goto join_posix_op_known;
+
case 'V':
- ret = reg_node(pRExC_state, NVERTWS);
- *flagp |= HASWIDTH|SIMPLE;
+ invert = 1;
+ /* FALLTHROUGH */
+ case 'v':
+ arg = ANYOF_VERTWS;
+ op = POSIXU;
+ goto join_posix_op_known;
+
finish_meta_pat:
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
@@ -12314,101 +12169,69 @@ parseit:
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
[:digit:] or \p{foo} */
- /* Certain named classes have equivalents that can appear outside a
- * character class, e.g. \w, \H. We use these instead of a
- * character class. */
+ /* All named classes are mapped into POSIXish nodes, with each node's
+ * FLAGS field giving which class it is */
switch ((I32)namedclass) {
- U8 offset;
-
- /* The first group is for node types that depend on the charset
- * modifier to the regex. We first calculate the base node
- * type, and if it should be inverted */
-
- case ANYOF_NWORDCHAR:
- invert = ! invert;
- /* FALLTHROUGH */
- case ANYOF_WORDCHAR:
- op = ALNUM;
- goto join_charset_classes;
-
- case ANYOF_NSPACE:
- invert = ! invert;
- /* FALLTHROUGH */
- case ANYOF_SPACE:
- op = SPACE;
- goto join_charset_classes;
-
- case ANYOF_NDIGIT:
- invert = ! invert;
- /* FALLTHROUGH */
- case ANYOF_DIGIT:
- op = DIGIT;
-
- join_charset_classes:
-
- /* Now that we have the base node type, we take advantage
- * of the enum ordering of the charset modifiers to get the
- * exact node type, For example the base SPACE also has
- * SPACEL, SPACEU, and SPACEA */
-
- offset = get_regex_charset(RExC_flags);
-
- /* /aa is the same as /a for these */
- if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
- offset = REGEX_ASCII_RESTRICTED_CHARSET;
- }
- else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
- offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
- }
-
- op += offset;
-
- /* The number of varieties of each of these is the same,
- * hence, so is the delta between the normal and
- * complemented nodes */
- if (invert) {
- op += NALNUM - ALNUM;
- }
- *flagp |= HASWIDTH|SIMPLE;
+ case ANYOF_UNIPROP:
break;
- /* The second group doesn't depend of the charset modifiers.
- * We just have normal and complemented */
+ /* These don't depend on the charset modifiers. They always
+ * match under /u rules */
case ANYOF_NHORIZWS:
- invert = ! invert;
- /* FALLTHROUGH */
case ANYOF_HORIZWS:
- is_horizws:
- op = (invert) ? NHORIZWS : HORIZWS;
- *flagp |= HASWIDTH|SIMPLE;
- break;
+ namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
+ /* FALLTHROUGH */
case ANYOF_NVERTWS:
- invert = ! invert;
- /* FALLTHROUGH */
case ANYOF_VERTWS:
- op = (invert) ? NVERTWS : VERTWS;
- *flagp |= HASWIDTH|SIMPLE;
- break;
-
- case ANYOF_UNIPROP:
- break;
-
- case ANYOF_NBLANK:
- invert = ! invert;
- /* FALLTHROUGH */
- case ANYOF_BLANK:
- if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
- goto is_horizws;
+ op = POSIXU;
+ goto join_posix;
+
+ /* The actual POSIXish node for all the rest depends on the
+ * charset modifier. The ones in the first set depend only on
+ * ASCII or, if available on this platform, locale */
+ case ANYOF_ASCII:
+ case ANYOF_NASCII:
+#ifdef HAS_ISASCII
+ op = (LOC) ? POSIXL : POSIXA;
+#else
+ op = POSIXA;
+#endif
+ goto join_posix;
+
+ case ANYOF_LOWER:
+ case ANYOF_NLOWER:
+ case ANYOF_UPPER:
+ case ANYOF_NUPPER:
+ /* under /a could be alpha */
+ if (FOLD) {
+ if (ASCII_RESTRICTED) {
+ namedclass = ANYOF_ALPHA + (namedclass % 2);
+ }
+ else if (! LOC) {
+ break;
+ }
}
/* FALLTHROUGH */
+
+ /* The rest have more possibilities depending on the charset. We
+ * take advantage of the enum ordering of the charset modifiers to
+ * get the exact node type. */
default:
- /* A generic posix class. All the /a ones can be handled
- * by the POSIXA opcode. And all are closed under folding
- * in the ASCII range, so FOLD doesn't matter */
- if (AT_LEAST_ASCII_RESTRICTED
- || (! LOC && namedclass == ANYOF_ASCII))
+ op = POSIXD + get_regex_charset(RExC_flags);
+ if (op > POSIXA) { /* /aa is same as /a */
+ op = POSIXA;
+ }
+#ifndef HAS_ISBLANK
+ if (op == POSIXL
+ && (namedclass == ANYOF_BLANK
+ || namedclass == ANYOF_NBLANK))
{
+ op = POSIXA;
+ }
+#endif
+
+ join_posix:
/* The odd numbered ones are the complements of the
* next-lower even number one */
if (namedclass % 2 == 1) {
@@ -12416,8 +12239,6 @@ parseit:
namedclass--;
}
arg = namedclass_to_classnum(namedclass);
- op = (invert) ? NPOSIXA : POSIXA;
- }
break;
}
}
@@ -12442,8 +12263,8 @@ parseit:
else if (! LOC) { /* locale could vary these */
if (prevvalue == '0') {
if (value == '9') {
- op = (invert) ? NDIGITA : DIGITA;
- *flagp |= HASWIDTH|SIMPLE;
+ arg = _CC_DIGIT;
+ op = POSIXA;
}
}
}
@@ -12469,6 +12290,11 @@ parseit:
}
else {
RExC_emit = (regnode *)orig_emit;
+ if (PL_regkind[op] == POSIXD) {
+ if (invert) {
+ op += NPOSIXD - POSIXD;
+ }
+ }
}
ret = reg_node(pRExC_state, op);
diff --git a/regcomp.sym b/regcomp.sym
index eb8ba46238..2a49d20379 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -36,8 +36,7 @@ SEOL EOL, no ; Same, assuming singleline.
# modifiers have to ordered thusly: /d, /l, /u, /a, /aa. This is because code
# in regcomp.c uses the enum value of the modifier as an offset from the /d
# version. The complements must come after the non-complements.
-# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as
-# EXACTF.
+# BOUND, POSIX and their complements are affected, as well as EXACTF.
BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8
BOUNDL BOUND, no ; Match "" at any locale word boundary
BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics
@@ -56,44 +55,16 @@ SANY REG_ANY, no 0 S ; Match any one character.
CANY REG_ANY, no 0 S ; Match any one byte.
ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only
-# Order (within each group) of the below is important. See ordering comment
-# above. The PLACEHOLDERn ones are wasting a value. Right now, we have plenty
-# to spare, but these would be obvious candidates if ever we ran out of node
-# types in a U8.
-ALNUM ALNUM, no 0 S ; Match any alphanumeric character using native charset semantics for non-utf8
-ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale
-ALNUMU ALNUM, no 0 S ; Match any alphanumeric char using Unicode semantics
-ALNUMA ALNUM, no 0 S ; Match [A-Za-z_0-9]
-NALNUM NALNUM, no 0 S ; Match any non-alphanumeric character using native charset semantics for non-utf8
-NALNUML NALNUM, no 0 S ; Match any non-alphanumeric char in locale
-NALNUMU NALNUM, no 0 S ; Match any non-alphanumeric char using Unicode semantics
-NALNUMA NALNUM, no 0 S ; Match [^A-Za-z_0-9]
-SPACE SPACE, no 0 S ; Match any whitespace character using native charset semantics for non-utf8
-SPACEL SPACE, no 0 S ; Match any whitespace char in locale
-SPACEU SPACE, no 0 S ; Match any whitespace char using Unicode semantics
-SPACEA SPACE, no 0 S ; Match [ \t\n\f\r]
-NSPACE NSPACE, no 0 S ; Match any non-whitespace character using native charset semantics for non-utf8
-NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale
-NSPACEU NSPACE, no 0 S ; Match any non-whitespace char using Unicode semantics
-NSPACEA NSPACE, no 0 S ; Match [^ \t\n\f\r]
-DIGIT DIGIT, no 0 S ; Match any numeric character using native charset semantics for non-utf8
-DIGITL DIGIT, no 0 S ; Match any numeric character in locale
-PLACEHOLDER1 NOTHING, no ; placeholder for missing DIGITU
-DIGITA DIGIT, no 0 S ; Match [0-9]
-NDIGIT NDIGIT, no 0 S ; Match any non-numeric character using native charset semantics for non-utf8
-NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale
-PLACEHOLDER2 NOTHING, no ; placeholder for missing NDIGITU
-NDIGITA NDIGIT, no 0 S ; Match [^0-9]
-
-POSIXD POSIXD, none 0 S ; currently unused except as a placeholder
-POSIXL POSIXD, none 0 S ; currently unused except as a placeholder
-POSIXU POSIXD, none 0 S ; currently unused except as a placeholder
+# Order of the below is important. See ordering comment above.
+POSIXD POSIXD, none 0 S ; Some [[:class:]] under /d; the FLAGS field gives which one
+POSIXL POSIXD, none 0 S ; Some [[:class:]] under /l; the FLAGS field gives which one
+POSIXU POSIXD, none 0 S ; Some [[:class:]] under /u; the FLAGS field gives which one
POSIXA POSIXD, none 0 S ; Some [[:class:]] under /a; the FLAGS field gives which one
-NPOSIXD NPOSIXD, none 0 S ; currently unused except as a placeholder
-NPOSIXL NPOSIXD, none 0 S ; currently unused except as a placeholder
-NPOSIXU NPOSIXD, none 0 S ; currently unused except as a placeholder
+NPOSIXD NPOSIXD, none 0 S ; complement of POSIXD, [[:^class:]]
+NPOSIXL NPOSIXD, none 0 S ; complement of POSIXL, [[:^class:]]
+NPOSIXU NPOSIXD, none 0 S ; complement of POSIXU, [[:^class:]]
NPOSIXA NPOSIXD, none 0 S ; complement of POSIXA, [[:^class:]]
-# End of order is important (within groups)
+# End of order is important
CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence
@@ -237,13 +208,6 @@ KEEPS KEEPS, no ; $& begins here.
#*New charclass like patterns
LNBREAK LNBREAK, none ; generic newline pattern
-# regcomp.c expects the node number of the complement to be one greater than
-# the non-complement
-VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6)
-NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6)
-HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6)
-NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6)
-
# NEW STUFF SOMEWHERE ABOVE THIS LINE
################################################################################
diff --git a/regexec.c b/regexec.c
index 7d03f09344..31a25fbded 100644
--- a/regexec.c
+++ b/regexec.c
@@ -174,101 +174,6 @@ static const char* const non_utf8_target_but_utf8_required
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
-/* The actual code for CCC_TRY, which uses several variables from the routine
- * it's callable from. It is designed to be the bulk of a case statement.
- * FUNC is the macro or function to call on non-utf8 targets that indicate if
- * nextchr matches the class.
- * UTF8_TEST is the whole test string to use for utf8 targets
- * LOAD is what to use to test, and if not present to load in the swash for the
- * class
- * POS_OR_NEG is either empty or ! to complement the results of FUNC or
- * UTF8_TEST test.
- * The logic is: Fail if we're at the end-of-string; otherwise if the target is
- * utf8 and a variant, load the swash if necessary and test using the utf8
- * test. Advance to the next character if test is ok, otherwise fail; If not
- * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
- * fails, or advance to the next character */
-
-#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
- if (NEXTCHR_IS_EOS) { \
- sayNO; \
- } \
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
- LOAD_UTF8_CHARCLASS(CLASS, STR); \
- if (POS_OR_NEG (UTF8_TEST)) { \
- sayNO; \
- } \
- } \
- else if (POS_OR_NEG (FUNC(nextchr))) { \
- sayNO; \
- } \
- goto increment_locinput;
-
-/* Handle the non-locale cases for a character class and its complement. It
- * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
- * This is because that code fails when the test succeeds, so we want to have
- * the test fail so that the code succeeds. The swash is stored in a
- * predictable PL_ place */
-#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
- CLASS, STR) \
- case NAME: \
- _CCC_TRY_CODE( !, FUNC, \
- cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
- (U8*)locinput, TRUE)), \
- CLASS, STR) \
- case NNAME: \
- _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
- cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
- (U8*)locinput, TRUE)), \
- CLASS, STR)
-/* Generate the case statements for both locale and non-locale character
- * classes in regmatch for classes that don't have special unicode semantics.
- * Locales don't use an immediate swash, but an intermediary special locale
- * function that is called on the pointer to the current place in the input
- * string. That function will resolve to needing the same swash. One might
- * think that because we don't know what the locale will match, we shouldn't
- * check with the swash loading function that it loaded properly; ie, that we
- * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
- * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
- * irrelevant here */
-#define CCC_TRY(NAME, NNAME, FUNC, \
- NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
- NAMEA, NNAMEA, FUNCA, \
- CLASS, STR) \
- case NAMEL: \
- PL_reg_flags |= RF_tainted; \
- _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
- case NNAMEL: \
- PL_reg_flags |= RF_tainted; \
- _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
- CLASS, STR) \
- case NAMEA: \
- if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
- sayNO; \
- } \
- /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
- locinput++; \
- break; \
- case NNAMEA: \
- if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
- sayNO; \
- } \
- goto increment_locinput; \
- /* Generate the non-locale cases */ \
- _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
-
-/* This is like CCC_TRY, but has an extra set of parameters for generating case
- * statements to handle separate Unicode semantics nodes */
-#define CCC_TRY_U(NAME, NNAME, FUNC, \
- NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
- NAMEU, NNAMEU, FUNCU, \
- NAMEA, NNAMEA, FUNCA, \
- CLASS, STR) \
- CCC_TRY(NAME, NNAME, FUNC, \
- NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
- NAMEA, NNAMEA, FUNCA, \
- CLASS, STR) \
- _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -549,6 +454,56 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
return FALSE;
}
+STATIC bool
+S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+{
+ /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
+ * 'character' is a member of the POSIX character class given by 'classnum'
+ * that should be equivalent to a value in the typedef
+ * '_char_class_number'.
+ *
+ * This just calls isFOO_lc on the code point for the character if it is in
+ * the range 0-255. Outside that range, all characters use Unicode
+ * rules, ignoring any locale. So use the Unicode function if this class
+ * requires a swash, and use the Unicode macro otherwise. */
+
+ PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
+
+ if (UTF8_IS_INVARIANT(*character)) {
+ return isFOO_lc(classnum, *character);
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
+ return isFOO_lc(classnum,
+ TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
+ }
+
+ if (classnum < _FIRST_NON_SWASH_CC) {
+
+ /* Initialize the swash unless done already */
+ if (! PL_utf8_swash_ptrs[classnum]) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
+ swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
+ }
+
+ return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
+ }
+
+ switch ((_char_class_number) classnum) {
+ case _CC_ENUM_SPACE:
+ case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
+
+ case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
+ case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
+ case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
+ default: return 0; /* Things like CNTRL are always
+ below 256 */
+ }
+
+ assert(0); /* NOTREACHED */
+ return FALSE;
+}
+
/*
* pregexec and friends
*/
@@ -1498,13 +1453,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
const U8 *fold_array; /* array for folding ords < 256 */
STRLEN ln;
STRLEN lnc;
- STRLEN uskip;
U8 c1;
U8 c2;
char *e;
I32 tmp = 1; /* Scratch variable? */
const bool utf8_target = PL_reg_match_utf8;
UV utf8_fold_flags = 0;
+ bool to_complement = FALSE; /* Invert the result? Taking the xor of this
+ with a result inverts that result, as 0^1 =
+ 1 and 1^1 = 0 */
+ _char_class_number classnum;
+
RXi_GET_DECL(prog,progi);
PERL_ARGS_ASSERT_FIND_BYCLASS;
@@ -1710,182 +1669,155 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
isALNUM_uni(tmp),
cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
break;
- case ALNUML:
- REXEC_FBC_CSCAN_TAINT(
- isALNUM_LC_utf8((U8*)s),
- isALNUM_LC(*s)
- );
- break;
- case ALNUMU:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_ALNUM(),
- swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
- isWORDCHAR_L1((U8) *s)
- );
- break;
- case ALNUM:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_ALNUM(),
- swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
- isWORDCHAR((U8) *s)
- );
- break;
- case ALNUMA:
- /* Don't need to worry about utf8, as it can match only a single
- * byte invariant character */
- REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
- break;
- case NALNUMU:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_ALNUM(),
- !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
- ! isWORDCHAR_L1((U8) *s)
- );
- break;
- case NALNUM:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_ALNUM(),
- !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
- ! isALNUM(*s)
- );
- break;
- case NALNUMA:
- REXEC_FBC_CSCAN(
- !isWORDCHAR_A(*s),
- !isWORDCHAR_A(*s)
- );
- break;
- case NALNUML:
- REXEC_FBC_CSCAN_TAINT(
- !isALNUM_LC_utf8((U8*)s),
- !isALNUM_LC(*s)
- );
- break;
- case SPACEU:
- REXEC_FBC_CSCAN(
- is_XPERLSPACE_utf8(s),
- isSPACE_L1((U8) *s)
- );
- break;
- case SPACE:
- REXEC_FBC_CSCAN(
- is_XPERLSPACE_utf8(s),
- isSPACE((U8) *s)
- );
- break;
- case SPACEA:
- /* Don't need to worry about utf8, as it can match only a single
- * byte invariant character */
- REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
- break;
- case SPACEL:
- REXEC_FBC_CSCAN_TAINT(
- isSPACE_LC_utf8((U8*)s),
- isSPACE_LC(*s)
- );
- break;
- case NSPACEU:
- REXEC_FBC_CSCAN(
- ! is_XPERLSPACE_utf8(s),
- ! isSPACE_L1((U8) *s)
- );
- break;
- case NSPACE:
- REXEC_FBC_CSCAN(
- ! is_XPERLSPACE_utf8(s),
- ! isSPACE((U8) *s)
- );
- break;
- case NSPACEA:
- REXEC_FBC_CSCAN(
- !isSPACE_A(*s),
- !isSPACE_A(*s)
- );
- break;
- case NSPACEL:
- REXEC_FBC_CSCAN_TAINT(
- !isSPACE_LC_utf8((U8*)s),
- !isSPACE_LC(*s)
- );
- break;
- case DIGIT:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_DIGIT(),
- swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
- isDIGIT(*s)
- );
- break;
- case DIGITA:
- /* Don't need to worry about utf8, as it can match only a single
- * byte invariant character */
- REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
- break;
- case DIGITL:
- REXEC_FBC_CSCAN_TAINT(
- isDIGIT_LC_utf8((U8*)s),
- isDIGIT_LC(*s)
- );
- break;
- case NDIGIT:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_DIGIT(),
- !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
- !isDIGIT(*s)
- );
- break;
- case NDIGITA:
- REXEC_FBC_CSCAN(
- !isDIGIT_A(*s),
- !isDIGIT_A(*s)
- );
- break;
- case NDIGITL:
- REXEC_FBC_CSCAN_TAINT(
- !isDIGIT_LC_utf8((U8*)s),
- !isDIGIT_LC(*s)
- );
- break;
case LNBREAK:
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
is_LNBREAK_latin1_safe(s, strend)
);
break;
- case VERTWS:
- REXEC_FBC_CSCAN(
- is_VERTWS_utf8_safe(s, strend),
- is_VERTWS_latin1_safe(s, strend)
- );
- break;
- case NVERTWS:
- REXEC_FBC_CSCAN(
- !is_VERTWS_utf8_safe(s, strend),
- !is_VERTWS_latin1_safe(s, strend)
- );
- break;
- case HORIZWS:
- REXEC_FBC_CSCAN(
- is_HORIZWS_utf8_safe(s, strend),
- is_HORIZWS_latin1_safe(s, strend)
- );
- break;
- case NHORIZWS:
- REXEC_FBC_CSCAN(
- !is_HORIZWS_utf8_safe(s, strend),
- !is_HORIZWS_latin1_safe(s, strend)
- );
+
+ /* The argument to all the POSIX node types is the class number to pass to
+ * _generic_isCC() to build a mask for searching in PL_charclass[] */
+
+ case NPOSIXL:
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXL:
+ PL_reg_flags |= RF_tainted;
+ REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
+ to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
+
+ case NPOSIXD:
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXD:
+ if (utf8_target) {
+ goto posix_utf8;
+ }
+ goto posixa;
+
+ case NPOSIXA:
+ if (utf8_target) {
+ /* The complement of something that matches only ASCII matches all
+ * UTF-8 variant code points, plus everything in ASCII that isn't
+ * in the class */
+ REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
+ || ! _generic_isCC_A(*s, FLAGS(c)));
+ break;
+ }
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
case POSIXA:
+ posixa:
/* Don't need to worry about utf8, as it can match only a single
- * byte invariant character. The flag in this node type is the
- * class number to pass to _generic_isCC() to build a mask for
- * searching in PL_charclass[] */
- REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+ * byte invariant character. */
+ REXEC_FBC_CLASS_SCAN(
+ to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
break;
- case NPOSIXA:
- REXEC_FBC_CSCAN(
- !_generic_isCC_A(*s, FLAGS(c)),
- !_generic_isCC_A(*s, FLAGS(c))
- );
+
+ case NPOSIXU:
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXU:
+ if (! utf8_target) {
+ REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
+ FLAGS(c))));
+ }
+ else {
+
+ posix_utf8:
+ classnum = (_char_class_number) FLAGS(c);
+ if (classnum < _FIRST_NON_SWASH_CC) {
+ while (s < strend) {
+
+ /* We avoid loading in the swash as long as possible, but
+ * should we have to, we jump to a separate loop. This
+ * extra 'if' statement is what keeps this code from being
+ * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
+ if (UTF8_IS_ABOVE_LATIN1(*s)) {
+ goto found_above_latin1;
+ }
+ if ((UTF8_IS_INVARIANT(*s)
+ && to_complement ^ cBOOL(_generic_isCC((U8) *s,
+ classnum)))
+ || (UTF8_IS_DOWNGRADEABLE_START(*s)
+ && to_complement ^ cBOOL(
+ _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
+ classnum))))
+ {
+ if (tmp && (!reginfo || regtry(reginfo, &s)))
+ goto got_it;
+ else {
+ tmp = doevery;
+ }
+ }
+ else {
+ tmp = 1;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else switch (classnum) { /* These classes are implemented as
+ macros */
+ case _CC_ENUM_SPACE: /* XXX would require separate code if we
+ revert the change of \v matching this */
+ /* FALL THROUGH */
+
+ case _CC_ENUM_PSXSPC:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isSPACE_utf8(s)));
+ break;
+
+ case _CC_ENUM_BLANK:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isBLANK_utf8(s)));
+ break;
+
+ case _CC_ENUM_XDIGIT:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isXDIGIT_utf8(s)));
+ break;
+
+ case _CC_ENUM_VERTSPACE:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isVERTWS_utf8(s)));
+ break;
+
+ case _CC_ENUM_CNTRL:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isCNTRL_utf8(s)));
+ break;
+
+ default:
+ Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
+ assert(0); /* NOTREACHED */
+ }
+ }
+ break;
+
+ found_above_latin1: /* Here we have to load a swash to get the result
+ for the current code point */
+ if (! PL_utf8_swash_ptrs[classnum]) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ PL_utf8_swash_ptrs[classnum] =
+ _core_swash_init("utf8", swash_property_names[classnum],
+ &PL_sv_undef, 1, 0, NULL, &flags);
+ }
+
+ /* This is a copy of the loop above for swash classes, though using the
+ * FBC macro instead of being expanded out. Since we've loaded the
+ * swash, we don't have to check for that each time through the loop */
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(_generic_utf8(
+ classnum,
+ s,
+ swash_fetch(PL_utf8_swash_ptrs[classnum],
+ (U8 *) s, TRUE))));
break;
case AHOCORASICKC:
@@ -3636,6 +3568,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
CHECKPOINT runops_cp; /* savestack position before executing EVAL */
U32 maxopenparen = 0; /* max '(' index seen so far */
+ int to_complement; /* Invert the result? */
+ _char_class_number classnum;
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
@@ -3697,6 +3631,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
state_num = OP(scan);
reenter_switch:
+ to_complement = 0;
SET_nextchr;
assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
@@ -4362,100 +4297,184 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
break;
- /* Special char classes: \d, \w etc.
- * The defines start on line 166 or so */
- CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
- ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
- ALNUMU, NALNUMU, isWORDCHAR_L1,
- ALNUMA, NALNUMA, isWORDCHAR_A,
- alnum, "a");
+ /* The argument (FLAGS) to all the POSIX node types is the class number
+ * */
- case SPACEL:
- PL_reg_flags |= RF_tainted;
- if (NEXTCHR_IS_EOS) {
+ case NPOSIXL: /* \W or [:^punct:] etc. under /l */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXL: /* \w or [:punct:] etc. under /l */
+ if (NEXTCHR_IS_EOS)
sayNO;
- }
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
- if (! isSPACE_LC_utf8((U8 *) locinput)) {
- sayNO;
- }
- }
- else if (! isSPACE_LC((U8) nextchr)) {
- sayNO;
- }
- goto increment_locinput;
- case NSPACEL:
+ /* The locale hasn't influenced the outcome before this, so defer
+ * tainting until now */
PL_reg_flags |= RF_tainted;
- if (NEXTCHR_IS_EOS) {
- sayNO;
- }
- if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
- if (isSPACE_LC_utf8((U8 *) locinput)) {
+
+ /* Use isFOO_lc() for characters within Latin1. (Note that
+ * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+ * it wouldn't be invariant) */
+ if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), nextchr)))) {
sayNO;
}
}
- else if (isSPACE_LC(nextchr)) {
+ else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
+ TWO_BYTE_UTF8_TO_UNI(nextchr,
+ *(locinput + 1))))))
+ {
sayNO;
+ }
}
- goto increment_locinput;
-
- case SPACE:
- if (utf8_target) {
- goto utf8_space;
+ else { /* Here, must be an above Latin-1 code point */
+ goto utf8_posix_not_eos;
}
- /* FALL THROUGH */
- case SPACEA:
- if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
- sayNO;
- }
- /* Matched a utf8-invariant, so don't have to worry about utf8 */
- locinput++;
+
+ /* XXX Assumes utf8, but the '! utf8_target' case above also gets here */
+ locinput += UTF8SKIP(locinput);
break;
- case NSPACE:
+ case NPOSIXD: /* \W or [:^punct:] etc. under /d */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXD: /* \w or [:punct:] etc. under /d */
if (utf8_target) {
- goto utf8_nspace;
- }
- /* FALL THROUGH */
- case NSPACEA:
- if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
- sayNO;
+ goto utf8_posix;
}
- goto increment_locinput;
+ goto posixa;
+
+ case NPOSIXA: /* \W or [:^punct:] etc. under /a */
- case SPACEU:
- utf8_space:
- if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
+ if (NEXTCHR_IS_EOS) {
sayNO;
}
- goto increment_locinput;
- case NSPACEU:
- utf8_nspace:
- if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
- sayNO;
+ /* All UTF-8 variants match */
+ if (! UTF8_IS_INVARIANT(nextchr)) {
+ goto increment_locinput;
}
- goto increment_locinput;
- CCC_TRY(DIGIT, NDIGIT, isDIGIT,
- DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
- DIGITA, NDIGITA, isDIGIT_A,
- digit, "0");
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXA: /* \w or [:punct:] etc. under /a */
+
+ posixa:
+ /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
+ * UTF-8, and also from NPOSIXA even in UTF-8 when the current
+ * character is a single byte */
- case POSIXA: /* /[[:ascii:]]/ etc */
- if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+ if (NEXTCHR_IS_EOS
+ || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
+ FLAGS(scan)))))
+ {
sayNO;
}
- /* Matched a utf8-invariant, so don't have to worry about utf8 */
+
+ /* Here we are either not in utf8, or we matched a utf8-invariant,
+ * so the next char is the next byte */
locinput++;
break;
- case NPOSIXA: /* /[^[:ascii:]]/ etc */
- if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
+ case NPOSIXU: /* \W or [:^punct:] etc. under /u */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXU: /* \w or [:punct:] etc. under /u */
+ utf8_posix:
+ if (NEXTCHR_IS_EOS) {
sayNO;
}
- goto increment_locinput;
+ utf8_posix_not_eos:
+
+ /* Use _generic_isCC() for characters within Latin1. (Note that
+ * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+ * it wouldn't be invariant) */
+ if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
+ FLAGS(scan)))))
+ {
+ sayNO;
+ }
+ locinput++;
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+ if (! (to_complement
+ ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
+ *(locinput + 1)),
+ FLAGS(scan)))))
+ {
+ sayNO;
+ }
+ locinput += 2;
+ }
+ else { /* Handle above Latin-1 code points */
+ classnum = (_char_class_number) FLAGS(scan);
+ if (classnum < _FIRST_NON_SWASH_CC) {
+
+ /* Here, uses a swash to find such code points. Load it if
+ * not done already */
+ if (! PL_utf8_swash_ptrs[classnum]) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ PL_utf8_swash_ptrs[classnum]
+ = _core_swash_init("utf8",
+ swash_property_names[classnum],
+ &PL_sv_undef, 1, 0, NULL, &flags);
+ }
+ if (! (to_complement
+ ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
+ (U8 *) locinput, TRUE))))
+ {
+ sayNO;
+ }
+ }
+ else { /* Here, uses macros to find above Latin-1 code points */
+ switch (classnum) {
+ case _CC_ENUM_SPACE: /* XXX would require separate
+ code if we revert the change
+ of \v matching this */
+ case _CC_ENUM_PSXSPC:
+ if (! (to_complement
+ ^ cBOOL(is_XPERLSPACE_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_BLANK:
+ if (! (to_complement
+ ^ cBOOL(is_HORIZWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_XDIGIT:
+ if (! (to_complement
+ ^ cBOOL(is_XDIGIT_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_VERTSPACE:
+ if (! (to_complement
+ ^ cBOOL(is_VERTWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ default: /* The rest, e.g. [:cntrl:], can't match
+ above Latin1 */
+ if (! to_complement) {
+ sayNO;
+ }
+ break;
+ }
+ }
+ locinput += UTF8SKIP(locinput);
+ }
+ break;
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
@@ -6417,29 +6436,6 @@ NULL
sayNO;
break;
-#define CASE_CLASS(nAmE) \
- case nAmE: \
- if (NEXTCHR_IS_EOS) \
- sayNO; \
- if ((n=is_##nAmE(locinput,utf8_target))) { \
- locinput += n; \
- } else \
- sayNO; \
- break; \
- case N##nAmE: \
- if (NEXTCHR_IS_EOS) \
- sayNO; \
- if ((n=is_##nAmE(locinput,utf8_target))) { \
- sayNO; \
- } else { \
- locinput += UTF8SKIP(locinput); \
- } \
- break
-
- CASE_CLASS(VERTWS); /* \v \V */
- CASE_CLASS(HORIZWS); /* \h \H */
-#undef CASE_CLASS
-
default:
PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
PTR2UV(scan), OP(scan));
@@ -6665,7 +6661,9 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
char *loceol = PL_regeol; /* local version */
I32 hardcount = 0; /* How many matches so far */
bool utf8_target = PL_reg_match_utf8;
+ int to_complement = 0; /* Invert the result? */
UV utf8_flags;
+ _char_class_number classnum;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
@@ -6887,79 +6885,38 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
scan++;
}
break;
- case ALNUMU:
- if (utf8_target) {
- utf8_wordchar:
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (hardcount < max && scan < loceol &&
- swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
- scan++;
- }
- }
- break;
- case ALNUM:
- if (utf8_target)
- goto utf8_wordchar;
- while (scan < loceol && isALNUM((U8) *scan)) {
- scan++;
- }
- break;
- case ALNUMA:
- if (utf8_target && scan + max < loceol) {
- /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
- * since here, to match, 1 char == 1 byte */
- loceol = scan + max;
- }
- while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
- scan++;
- }
- break;
- case ALNUML:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- isALNUM_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && isALNUM_LC(*scan))
- scan++;
- }
- break;
- case NALNUMU:
- if (utf8_target) {
+ /* The argument (FLAGS) to all the POSIX node types is the class number */
- utf8_Nwordchar:
+ case NPOSIXL:
+ to_complement = 1;
+ /* FALLTHROUGH */
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (hardcount < max && scan < loceol &&
- ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+ case POSIXL:
+ PL_reg_flags |= RF_tainted;
+ if (! utf8_target) {
+ while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
+ *scan)))
{
- scan += UTF8SKIP(scan);
+ scan++;
+ }
+ } else {
+ while (hardcount < max && scan < loceol
+ && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
+ (U8 *) scan)))
+ {
+ scan += UTF8SKIP(scan);
hardcount++;
}
- } else {
- while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
- scan++;
- }
- }
- break;
- case NALNUM:
- if (utf8_target)
- goto utf8_Nwordchar;
- while (scan < loceol && ! isALNUM((U8) *scan)) {
- scan++;
}
break;
+ case POSIXD:
+ if (utf8_target) {
+ goto utf8_posix;
+ }
+ /* FALLTHROUGH */
+
case POSIXA:
if (utf8_target && scan + max < loceol) {
@@ -6972,232 +6929,170 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
scan++;
}
break;
- case NPOSIXA:
- if (utf8_target) {
- while (scan < loceol && hardcount < max
- && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- }
- else {
- while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
- scan++;
- }
- }
- break;
- case NALNUMA:
- if (utf8_target) {
- while (scan < loceol && hardcount < max
- && ! isWORDCHAR_A((U8) *scan))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- }
- else {
- while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
- scan++;
- }
- }
- break;
- case NALNUML:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- !isALNUM_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !isALNUM_LC(*scan))
- scan++;
- }
- break;
- case SPACEU:
- if (utf8_target) {
- utf8_space:
+ case NPOSIXD:
+ if (utf8_target) {
+ to_complement = 1;
+ goto utf8_posix;
+ }
+ /* FALL THROUGH */
- while (hardcount < max && scan < loceol
- && is_XPERLSPACE_utf8((U8*)scan))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
- }
- else {
- while (scan < loceol && isSPACE_L1((U8) *scan)) {
+ case NPOSIXA:
+ if (! utf8_target) {
+ while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
scan++;
}
- break;
- }
- case SPACE:
- if (utf8_target)
- goto utf8_space;
-
- while (scan < loceol && isSPACE((U8) *scan)) {
- scan++;
- }
- break;
- case SPACEA:
- if (utf8_target && scan + max < loceol) {
-
- /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
- * since here, to match, 1 char == 1 byte */
- loceol = scan + max;
}
- while (scan < loceol && isSPACE_A((U8) *scan)) {
- scan++;
- }
- break;
- case SPACEL:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- isSPACE_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && isSPACE_LC(*scan))
- scan++;
- }
- break;
- case NSPACEU:
- if (utf8_target) {
-
- utf8_Nspace:
+ else {
+ /* The complement of something that matches only ASCII matches all
+ * UTF-8 variant code points, plus everything in ASCII that isn't
+ * in the class. */
while (hardcount < max && scan < loceol
- && ! is_XPERLSPACE_utf8((U8*)scan))
+ && (! UTF8_IS_INVARIANT(*scan)
+ || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
{
- scan += UTF8SKIP(scan);
+ scan += UTF8SKIP(scan);
hardcount++;
}
- break;
- }
- else {
- while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
- scan++;
- }
- }
- break;
- case NSPACE:
- if (utf8_target)
- goto utf8_Nspace;
+ }
+ break;
- while (scan < loceol && ! isSPACE((U8) *scan)) {
- scan++;
- }
- break;
- case NSPACEA:
- if (utf8_target) {
- while (hardcount < max && scan < loceol
- && ! isSPACE_A((U8) *scan))
+ case NPOSIXU:
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXU:
+ if (! utf8_target) {
+ while (scan < loceol && to_complement
+ ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
{
- scan += UTF8SKIP(scan);
- hardcount++;
- }
+ scan++;
+ }
}
else {
- while (scan < loceol && ! isSPACE_A((U8) *scan)) {
- scan++;
- }
- }
- break;
- case NSPACEL:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- !isSPACE_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !isSPACE_LC(*scan))
- scan++;
- }
- break;
- case DIGIT:
- if (utf8_target) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (hardcount < max && scan < loceol &&
- swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && isDIGIT(*scan))
- scan++;
+ utf8_posix:
+ classnum = (_char_class_number) FLAGS(p);
+ if (classnum < _FIRST_NON_SWASH_CC) {
+
+ /* Here, a swash is needed for above-Latin1 code points.
+ * Process as many Latin1 code points as possible using the
+ * built-in rules. Go to another loop to finish processing upon
+ * encountering the first above-Latin1 code point. We could do
+ * that in this loop as well, but the other way saves having to
+ * test if the swash has been loaded every time through the
+ * loop: extra space to save a test. */
+ while (hardcount < max && scan < loceol) {
+ if (UTF8_IS_INVARIANT(*scan)) {
+ if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
+ classnum))))
+ {
+ break;
+ }
+ scan++;
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
+ if (! (to_complement
+ ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
+ *(scan + 1)),
+ classnum))))
+ {
+ break;
+ }
+ scan += 2;
+ }
+ else {
+ goto found_above_latin1;
+ }
+
+ hardcount++;
+ }
+ }
+ else {
+ /* For these character classes, the knowledge of how to handle
+ * every code point is compiled in to Perl via a macro. This
+ * code is written for making the loops as tight as possible.
+ * It could be refactored to save space instead */
+ switch (classnum) {
+ case _CC_ENUM_SPACE: /* XXX would require separate code
+ if we revert the change of \v
+ matching this */
+ /* FALL THROUGH */
+ case _CC_ENUM_PSXSPC:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_BLANK:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_XDIGIT:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_VERTSPACE:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_CNTRL:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
+ }
+ }
}
- break;
- case DIGITA:
- if (utf8_target && scan + max < loceol) {
+ break;
- /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
- * since here, to match, 1 char == 1 byte */
- loceol = scan + max;
+ found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
+
+ /* Load the swash if not already present */
+ if (! PL_utf8_swash_ptrs[classnum]) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ PL_utf8_swash_ptrs[classnum] = _core_swash_init(
+ "utf8", swash_property_names[classnum],
+ &PL_sv_undef, 1, 0, NULL, &flags);
}
- while (scan < loceol && isDIGIT_A((U8) *scan)) {
- scan++;
- }
- break;
- case DIGITL:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- isDIGIT_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && isDIGIT_LC(*scan))
- scan++;
- }
- break;
- case NDIGIT:
- if (utf8_target) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (hardcount < max && scan < loceol &&
- !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !isDIGIT(*scan))
- scan++;
- }
- break;
- case NDIGITA:
- if (utf8_target) {
- while (hardcount < max && scan < loceol
- && ! isDIGIT_A((U8) *scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- }
- else {
- while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
- scan++;
- }
- }
- break;
- case NDIGITL:
- PL_reg_flags |= RF_tainted;
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- !isDIGIT_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !isDIGIT_LC(*scan))
- scan++;
- }
- break;
+
+ while (hardcount < max && scan < loceol
+ && to_complement ^ cBOOL(_generic_utf8(
+ classnum,
+ scan,
+ swash_fetch(PL_utf8_swash_ptrs[classnum],
+ (U8 *) scan,
+ TRUE))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+
case LNBREAK:
if (utf8_target) {
while (hardcount < max && scan < loceol &&
@@ -7216,61 +7111,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
}
}
break;
- case HORIZWS:
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- (c=is_HORIZWS_utf8_safe(scan, loceol)))
- {
- scan += c;
- hardcount++;
- }
- } else {
- while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
- scan++;
- }
- break;
- case NHORIZWS:
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- !is_HORIZWS_utf8_safe(scan, loceol))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
- scan++;
-
- }
- break;
- case VERTWS:
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- (c=is_VERTWS_utf8_safe(scan, loceol)))
- {
- scan += c;
- hardcount++;
- }
- } else {
- while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
- scan++;
-
- }
- break;
- case NVERTWS:
- if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- !is_VERTWS_utf8_safe(scan, loceol))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
- scan++;
-
- }
- break;
case BOUND:
case BOUNDA:
diff --git a/regnodes.h b/regnodes.h
index 2024d156bb..e1fdad1fb9 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 121
-#define REGMATCH_STATE_MAX 161
+#define REGNODE_MAX 93
+#define REGMATCH_STATE_MAX 133
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -31,106 +31,78 @@
#define SANY 19 /* 0x13 Match any one character. */
#define CANY 20 /* 0x14 Match any one byte. */
#define ANYOF 21 /* 0x15 Match character in (or not in) this class, single char match only */
-#define ALNUM 22 /* 0x16 Match any alphanumeric character using native charset semantics for non-utf8 */
-#define ALNUML 23 /* 0x17 Match any alphanumeric char in locale */
-#define ALNUMU 24 /* 0x18 Match any alphanumeric char using Unicode semantics */
-#define ALNUMA 25 /* 0x19 Match [A-Za-z_0-9] */
-#define NALNUM 26 /* 0x1a Match any non-alphanumeric character using native charset semantics for non-utf8 */
-#define NALNUML 27 /* 0x1b Match any non-alphanumeric char in locale */
-#define NALNUMU 28 /* 0x1c Match any non-alphanumeric char using Unicode semantics */
-#define NALNUMA 29 /* 0x1d Match [^A-Za-z_0-9] */
-#define SPACE 30 /* 0x1e Match any whitespace character using native charset semantics for non-utf8 */
-#define SPACEL 31 /* 0x1f Match any whitespace char in locale */
-#define SPACEU 32 /* 0x20 Match any whitespace char using Unicode semantics */
-#define SPACEA 33 /* 0x21 Match [ \t\n\f\r] */
-#define NSPACE 34 /* 0x22 Match any non-whitespace character using native charset semantics for non-utf8 */
-#define NSPACEL 35 /* 0x23 Match any non-whitespace char in locale */
-#define NSPACEU 36 /* 0x24 Match any non-whitespace char using Unicode semantics */
-#define NSPACEA 37 /* 0x25 Match [^ \t\n\f\r] */
-#define DIGIT 38 /* 0x26 Match any numeric character using native charset semantics for non-utf8 */
-#define DIGITL 39 /* 0x27 Match any numeric character in locale */
-#define PLACEHOLDER1 40 /* 0x28 placeholder for missing DIGITU */
-#define DIGITA 41 /* 0x29 Match [0-9] */
-#define NDIGIT 42 /* 0x2a Match any non-numeric character using native charset semantics for non-utf8 */
-#define NDIGITL 43 /* 0x2b Match any non-numeric character in locale */
-#define PLACEHOLDER2 44 /* 0x2c placeholder for missing NDIGITU */
-#define NDIGITA 45 /* 0x2d Match [^0-9] */
-#define POSIXD 46 /* 0x2e currently unused except as a placeholder */
-#define POSIXL 47 /* 0x2f currently unused except as a placeholder */
-#define POSIXU 48 /* 0x30 currently unused except as a placeholder */
-#define POSIXA 49 /* 0x31 Some [[:class:]] under /a; the FLAGS field gives which one */
-#define NPOSIXD 50 /* 0x32 currently unused except as a placeholder */
-#define NPOSIXL 51 /* 0x33 currently unused except as a placeholder */
-#define NPOSIXU 52 /* 0x34 currently unused except as a placeholder */
-#define NPOSIXA 53 /* 0x35 complement of POSIXA, [[:^class:]] */
-#define CLUMP 54 /* 0x36 Match any extended grapheme cluster sequence */
-#define BRANCH 55 /* 0x37 Match this alternative, or the next... */
-#define BACK 56 /* 0x38 Match "", "next" ptr points backward. */
-#define EXACT 57 /* 0x39 Match this string (preceded by length). */
-#define EXACTF 58 /* 0x3a Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
-#define EXACTFL 59 /* 0x3b Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define EXACTFU 60 /* 0x3c Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFA 61 /* 0x3d Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define EXACTFU_SS 62 /* 0x3e Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFU_TRICKYFOLD 63 /* 0x3f Match this folded UTF-8 string using /iu rules */
-#define NOTHING 64 /* 0x40 Match empty string. */
-#define TAIL 65 /* 0x41 Match empty string. Can jump here from outside. */
-#define STAR 66 /* 0x42 Match this (simple) thing 0 or more times. */
-#define PLUS 67 /* 0x43 Match this (simple) thing 1 or more times. */
-#define CURLY 68 /* 0x44 Match this simple thing {n,m} times. */
-#define CURLYN 69 /* 0x45 Capture next-after-this simple thing */
-#define CURLYM 70 /* 0x46 Capture this medium-complex thing {n,m} times. */
-#define CURLYX 71 /* 0x47 Match this complex thing {n,m} times. */
-#define WHILEM 72 /* 0x48 Do curly processing and see if rest matches. */
-#define OPEN 73 /* 0x49 Mark this point in input as start of */
-#define CLOSE 74 /* 0x4a Analogous to OPEN. */
-#define REF 75 /* 0x4b Match some already matched string */
-#define REFF 76 /* 0x4c Match already matched string, folded using native charset semantics for non-utf8 */
-#define REFFL 77 /* 0x4d Match already matched string, folded in loc. */
-#define REFFU 78 /* 0x4e Match already matched string, folded using unicode semantics for non-utf8 */
-#define REFFA 79 /* 0x4f Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define NREF 80 /* 0x50 Match some already matched string */
-#define NREFF 81 /* 0x51 Match already matched string, folded using native charset semantics for non-utf8 */
-#define NREFFL 82 /* 0x52 Match already matched string, folded in loc. */
-#define NREFFU 83 /* 0x53 Match already matched string, folded using unicode semantics for non-utf8 */
-#define NREFFA 84 /* 0x54 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define IFMATCH 85 /* 0x55 Succeeds if the following matches. */
-#define UNLESSM 86 /* 0x56 Fails if the following matches. */
-#define SUSPEND 87 /* 0x57 "Independent" sub-RE. */
-#define IFTHEN 88 /* 0x58 Switch, should be preceded by switcher . */
-#define GROUPP 89 /* 0x59 Whether the group matched. */
-#define LONGJMP 90 /* 0x5a Jump far away. */
-#define BRANCHJ 91 /* 0x5b BRANCH with long offset. */
-#define EVAL 92 /* 0x5c Execute some Perl code. */
-#define MINMOD 93 /* 0x5d Next operator is not greedy. */
-#define LOGICAL 94 /* 0x5e Next opcode should set the flag only. */
-#define RENUM 95 /* 0x5f Group with independently numbered parens. */
-#define TRIE 96 /* 0x60 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define TRIEC 97 /* 0x61 Same as TRIE, but with embedded charclass data */
-#define AHOCORASICK 98 /* 0x62 Aho Corasick stclass. flags==type */
-#define AHOCORASICKC 99 /* 0x63 Same as AHOCORASICK, but with embedded charclass data */
-#define GOSUB 100 /* 0x64 recurse to paren arg1 at (signed) ofs arg2 */
-#define GOSTART 101 /* 0x65 recurse to start of pattern */
-#define NGROUPP 102 /* 0x66 Whether the group matched. */
-#define INSUBP 103 /* 0x67 Whether we are in a specific recurse. */
-#define DEFINEP 104 /* 0x68 Never execute directly. */
-#define ENDLIKE 105 /* 0x69 Used only for the type field of verbs */
-#define OPFAIL 106 /* 0x6a Same as (?!) */
-#define ACCEPT 107 /* 0x6b Accepts the current matched string. */
-#define VERB 108 /* 0x6c Used only for the type field of verbs */
-#define PRUNE 109 /* 0x6d Pattern fails at this startpoint if no-backtracking through this */
-#define MARKPOINT 110 /* 0x6e Push the current location for rollback by cut. */
-#define SKIP 111 /* 0x6f On failure skip forward (to the mark) before retrying */
-#define COMMIT 112 /* 0x70 Pattern fails outright if backtracking through this */
-#define CUTGROUP 113 /* 0x71 On failure go to the next alternation in the group */
-#define KEEPS 114 /* 0x72 $& begins here. */
-#define LNBREAK 115 /* 0x73 generic newline pattern */
-#define VERTWS 116 /* 0x74 vertical whitespace (Perl 6) */
-#define NVERTWS 117 /* 0x75 not vertical whitespace (Perl 6) */
-#define HORIZWS 118 /* 0x76 horizontal whitespace (Perl 6) */
-#define NHORIZWS 119 /* 0x77 not horizontal whitespace (Perl 6) */
-#define OPTIMIZED 120 /* 0x78 Placeholder for dump. */
-#define PSEUDO 121 /* 0x79 Pseudo opcode for internal use. */
+#define POSIXD 22 /* 0x16 Some [[:class:]] under /d; the FLAGS field gives which one */
+#define POSIXL 23 /* 0x17 Some [[:class:]] under /l; the FLAGS field gives which one */
+#define POSIXU 24 /* 0x18 Some [[:class:]] under /u; the FLAGS field gives which one */
+#define POSIXA 25 /* 0x19 Some [[:class:]] under /a; the FLAGS field gives which one */
+#define NPOSIXD 26 /* 0x1a complement of POSIXD, [[:^class:]] */
+#define NPOSIXL 27 /* 0x1b complement of POSIXL, [[:^class:]] */
+#define NPOSIXU 28 /* 0x1c complement of POSIXU, [[:^class:]] */
+#define NPOSIXA 29 /* 0x1d complement of POSIXA, [[:^class:]] */
+#define CLUMP 30 /* 0x1e Match any extended grapheme cluster sequence */
+#define BRANCH 31 /* 0x1f Match this alternative, or the next... */
+#define BACK 32 /* 0x20 Match "", "next" ptr points backward. */
+#define EXACT 33 /* 0x21 Match this string (preceded by length). */
+#define EXACTF 34 /* 0x22 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
+#define EXACTFL 35 /* 0x23 Match this string (not guaranteed to be folded) using /il rules (w/len). */
+#define EXACTFU 36 /* 0x24 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFA 37 /* 0x25 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define EXACTFU_SS 38 /* 0x26 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFU_TRICKYFOLD 39 /* 0x27 Match this folded UTF-8 string using /iu rules */
+#define NOTHING 40 /* 0x28 Match empty string. */
+#define TAIL 41 /* 0x29 Match empty string. Can jump here from outside. */
+#define STAR 42 /* 0x2a Match this (simple) thing 0 or more times. */
+#define PLUS 43 /* 0x2b Match this (simple) thing 1 or more times. */
+#define CURLY 44 /* 0x2c Match this simple thing {n,m} times. */
+#define CURLYN 45 /* 0x2d Capture next-after-this simple thing */
+#define CURLYM 46 /* 0x2e Capture this medium-complex thing {n,m} times. */
+#define CURLYX 47 /* 0x2f Match this complex thing {n,m} times. */
+#define WHILEM 48 /* 0x30 Do curly processing and see if rest matches. */
+#define OPEN 49 /* 0x31 Mark this point in input as start of */
+#define CLOSE 50 /* 0x32 Analogous to OPEN. */
+#define REF 51 /* 0x33 Match some already matched string */
+#define REFF 52 /* 0x34 Match already matched string, folded using native charset semantics for non-utf8 */
+#define REFFL 53 /* 0x35 Match already matched string, folded in loc. */
+#define REFFU 54 /* 0x36 Match already matched string, folded using unicode semantics for non-utf8 */
+#define REFFA 55 /* 0x37 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define NREF 56 /* 0x38 Match some already matched string */
+#define NREFF 57 /* 0x39 Match already matched string, folded using native charset semantics for non-utf8 */
+#define NREFFL 58 /* 0x3a Match already matched string, folded in loc. */
+#define NREFFU 59 /* 0x3b Match already matched string, folded using unicode semantics for non-utf8 */
+#define NREFFA 60 /* 0x3c Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define IFMATCH 61 /* 0x3d Succeeds if the following matches. */
+#define UNLESSM 62 /* 0x3e Fails if the following matches. */
+#define SUSPEND 63 /* 0x3f "Independent" sub-RE. */
+#define IFTHEN 64 /* 0x40 Switch, should be preceded by switcher . */
+#define GROUPP 65 /* 0x41 Whether the group matched. */
+#define LONGJMP 66 /* 0x42 Jump far away. */
+#define BRANCHJ 67 /* 0x43 BRANCH with long offset. */
+#define EVAL 68 /* 0x44 Execute some Perl code. */
+#define MINMOD 69 /* 0x45 Next operator is not greedy. */
+#define LOGICAL 70 /* 0x46 Next opcode should set the flag only. */
+#define RENUM 71 /* 0x47 Group with independently numbered parens. */
+#define TRIE 72 /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define TRIEC 73 /* 0x49 Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 74 /* 0x4a Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 75 /* 0x4b Same as AHOCORASICK, but with embedded charclass data */
+#define GOSUB 76 /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */
+#define GOSTART 77 /* 0x4d recurse to start of pattern */
+#define NGROUPP 78 /* 0x4e Whether the group matched. */
+#define INSUBP 79 /* 0x4f Whether we are in a specific recurse. */
+#define DEFINEP 80 /* 0x50 Never execute directly. */
+#define ENDLIKE 81 /* 0x51 Used only for the type field of verbs */
+#define OPFAIL 82 /* 0x52 Same as (?!) */
+#define ACCEPT 83 /* 0x53 Accepts the current matched string. */
+#define VERB 84 /* 0x54 Used only for the type field of verbs */
+#define PRUNE 85 /* 0x55 Pattern fails at this startpoint if no-backtracking through this */
+#define MARKPOINT 86 /* 0x56 Push the current location for rollback by cut. */
+#define SKIP 87 /* 0x57 On failure skip forward (to the mark) before retrying */
+#define COMMIT 88 /* 0x58 Pattern fails outright if backtracking through this */
+#define CUTGROUP 89 /* 0x59 On failure go to the next alternation in the group */
+#define KEEPS 90 /* 0x5a $& begins here. */
+#define LNBREAK 91 /* 0x5b generic newline pattern */
+#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */
+#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -201,30 +173,6 @@ EXTCONST U8 PL_regkind[] = {
REG_ANY, /* SANY */
REG_ANY, /* CANY */
ANYOF, /* ANYOF */
- ALNUM, /* ALNUM */
- ALNUM, /* ALNUML */
- ALNUM, /* ALNUMU */
- ALNUM, /* ALNUMA */
- NALNUM, /* NALNUM */
- NALNUM, /* NALNUML */
- NALNUM, /* NALNUMU */
- NALNUM, /* NALNUMA */
- SPACE, /* SPACE */
- SPACE, /* SPACEL */
- SPACE, /* SPACEU */
- SPACE, /* SPACEA */
- NSPACE, /* NSPACE */
- NSPACE, /* NSPACEL */
- NSPACE, /* NSPACEU */
- NSPACE, /* NSPACEA */
- DIGIT, /* DIGIT */
- DIGIT, /* DIGITL */
- NOTHING, /* PLACEHOLDER1 */
- DIGIT, /* DIGITA */
- NDIGIT, /* NDIGIT */
- NDIGIT, /* NDIGITL */
- NOTHING, /* PLACEHOLDER2 */
- NDIGIT, /* NDIGITA */
POSIXD, /* POSIXD */
POSIXD, /* POSIXL */
POSIXD, /* POSIXU */
@@ -295,10 +243,6 @@ EXTCONST U8 PL_regkind[] = {
VERB, /* CUTGROUP */
KEEPS, /* KEEPS */
LNBREAK, /* LNBREAK */
- VERTWS, /* VERTWS */
- NVERTWS, /* NVERTWS */
- HORIZWS, /* HORIZWS */
- NHORIZWS, /* NHORIZWS */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
@@ -371,30 +315,6 @@ static const U8 regarglen[] = {
0, /* SANY */
0, /* CANY */
0, /* ANYOF */
- 0, /* ALNUM */
- 0, /* ALNUML */
- 0, /* ALNUMU */
- 0, /* ALNUMA */
- 0, /* NALNUM */
- 0, /* NALNUML */
- 0, /* NALNUMU */
- 0, /* NALNUMA */
- 0, /* SPACE */
- 0, /* SPACEL */
- 0, /* SPACEU */
- 0, /* SPACEA */
- 0, /* NSPACE */
- 0, /* NSPACEL */
- 0, /* NSPACEU */
- 0, /* NSPACEA */
- 0, /* DIGIT */
- 0, /* DIGITL */
- 0, /* PLACEHOLDER1 */
- 0, /* DIGITA */
- 0, /* NDIGIT */
- 0, /* NDIGITL */
- 0, /* PLACEHOLDER2 */
- 0, /* NDIGITA */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -465,10 +385,6 @@ static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_1), /* CUTGROUP */
0, /* KEEPS */
0, /* LNBREAK */
- 0, /* VERTWS */
- 0, /* NVERTWS */
- 0, /* HORIZWS */
- 0, /* NHORIZWS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -498,30 +414,6 @@ static const char reg_off_by_arg[] = {
0, /* SANY */
0, /* CANY */
0, /* ANYOF */
- 0, /* ALNUM */
- 0, /* ALNUML */
- 0, /* ALNUMU */
- 0, /* ALNUMA */
- 0, /* NALNUM */
- 0, /* NALNUML */
- 0, /* NALNUMU */
- 0, /* NALNUMA */
- 0, /* SPACE */
- 0, /* SPACEL */
- 0, /* SPACEU */
- 0, /* SPACEA */
- 0, /* NSPACE */
- 0, /* NSPACEL */
- 0, /* NSPACEU */
- 0, /* NSPACEA */
- 0, /* DIGIT */
- 0, /* DIGITL */
- 0, /* PLACEHOLDER1 */
- 0, /* DIGITA */
- 0, /* NDIGIT */
- 0, /* NDIGITL */
- 0, /* PLACEHOLDER2 */
- 0, /* NDIGITA */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -592,10 +484,6 @@ static const char reg_off_by_arg[] = {
0, /* CUTGROUP */
0, /* KEEPS */
0, /* LNBREAK */
- 0, /* VERTWS */
- 0, /* NVERTWS */
- 0, /* HORIZWS */
- 0, /* NHORIZWS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -630,106 +518,78 @@ EXTCONST char * const PL_reg_name[] = {
"SANY", /* 0x13 */
"CANY", /* 0x14 */
"ANYOF", /* 0x15 */
- "ALNUM", /* 0x16 */
- "ALNUML", /* 0x17 */
- "ALNUMU", /* 0x18 */
- "ALNUMA", /* 0x19 */
- "NALNUM", /* 0x1a */
- "NALNUML", /* 0x1b */
- "NALNUMU", /* 0x1c */
- "NALNUMA", /* 0x1d */
- "SPACE", /* 0x1e */
- "SPACEL", /* 0x1f */
- "SPACEU", /* 0x20 */
- "SPACEA", /* 0x21 */
- "NSPACE", /* 0x22 */
- "NSPACEL", /* 0x23 */
- "NSPACEU", /* 0x24 */
- "NSPACEA", /* 0x25 */
- "DIGIT", /* 0x26 */
- "DIGITL", /* 0x27 */
- "PLACEHOLDER1", /* 0x28 */
- "DIGITA", /* 0x29 */
- "NDIGIT", /* 0x2a */
- "NDIGITL", /* 0x2b */
- "PLACEHOLDER2", /* 0x2c */
- "NDIGITA", /* 0x2d */
- "POSIXD", /* 0x2e */
- "POSIXL", /* 0x2f */
- "POSIXU", /* 0x30 */
- "POSIXA", /* 0x31 */
- "NPOSIXD", /* 0x32 */
- "NPOSIXL", /* 0x33 */
- "NPOSIXU", /* 0x34 */
- "NPOSIXA", /* 0x35 */
- "CLUMP", /* 0x36 */
- "BRANCH", /* 0x37 */
- "BACK", /* 0x38 */
- "EXACT", /* 0x39 */
- "EXACTF", /* 0x3a */
- "EXACTFL", /* 0x3b */
- "EXACTFU", /* 0x3c */
- "EXACTFA", /* 0x3d */
- "EXACTFU_SS", /* 0x3e */
- "EXACTFU_TRICKYFOLD", /* 0x3f */
- "NOTHING", /* 0x40 */
- "TAIL", /* 0x41 */
- "STAR", /* 0x42 */
- "PLUS", /* 0x43 */
- "CURLY", /* 0x44 */
- "CURLYN", /* 0x45 */
- "CURLYM", /* 0x46 */
- "CURLYX", /* 0x47 */
- "WHILEM", /* 0x48 */
- "OPEN", /* 0x49 */
- "CLOSE", /* 0x4a */
- "REF", /* 0x4b */
- "REFF", /* 0x4c */
- "REFFL", /* 0x4d */
- "REFFU", /* 0x4e */
- "REFFA", /* 0x4f */
- "NREF", /* 0x50 */
- "NREFF", /* 0x51 */
- "NREFFL", /* 0x52 */
- "NREFFU", /* 0x53 */
- "NREFFA", /* 0x54 */
- "IFMATCH", /* 0x55 */
- "UNLESSM", /* 0x56 */
- "SUSPEND", /* 0x57 */
- "IFTHEN", /* 0x58 */
- "GROUPP", /* 0x59 */
- "LONGJMP", /* 0x5a */
- "BRANCHJ", /* 0x5b */
- "EVAL", /* 0x5c */
- "MINMOD", /* 0x5d */
- "LOGICAL", /* 0x5e */
- "RENUM", /* 0x5f */
- "TRIE", /* 0x60 */
- "TRIEC", /* 0x61 */
- "AHOCORASICK", /* 0x62 */
- "AHOCORASICKC", /* 0x63 */
- "GOSUB", /* 0x64 */
- "GOSTART", /* 0x65 */
- "NGROUPP", /* 0x66 */
- "INSUBP", /* 0x67 */
- "DEFINEP", /* 0x68 */
- "ENDLIKE", /* 0x69 */
- "OPFAIL", /* 0x6a */
- "ACCEPT", /* 0x6b */
- "VERB", /* 0x6c */
- "PRUNE", /* 0x6d */
- "MARKPOINT", /* 0x6e */
- "SKIP", /* 0x6f */
- "COMMIT", /* 0x70 */
- "CUTGROUP", /* 0x71 */
- "KEEPS", /* 0x72 */
- "LNBREAK", /* 0x73 */
- "VERTWS", /* 0x74 */
- "NVERTWS", /* 0x75 */
- "HORIZWS", /* 0x76 */
- "NHORIZWS", /* 0x77 */
- "OPTIMIZED", /* 0x78 */
- "PSEUDO", /* 0x79 */
+ "POSIXD", /* 0x16 */
+ "POSIXL", /* 0x17 */
+ "POSIXU", /* 0x18 */
+ "POSIXA", /* 0x19 */
+ "NPOSIXD", /* 0x1a */
+ "NPOSIXL", /* 0x1b */
+ "NPOSIXU", /* 0x1c */
+ "NPOSIXA", /* 0x1d */
+ "CLUMP", /* 0x1e */
+ "BRANCH", /* 0x1f */
+ "BACK", /* 0x20 */
+ "EXACT", /* 0x21 */
+ "EXACTF", /* 0x22 */
+ "EXACTFL", /* 0x23 */
+ "EXACTFU", /* 0x24 */
+ "EXACTFA", /* 0x25 */
+ "EXACTFU_SS", /* 0x26 */
+ "EXACTFU_TRICKYFOLD", /* 0x27 */
+ "NOTHING", /* 0x28 */
+ "TAIL", /* 0x29 */
+ "STAR", /* 0x2a */
+ "PLUS", /* 0x2b */
+ "CURLY", /* 0x2c */
+ "CURLYN", /* 0x2d */
+ "CURLYM", /* 0x2e */
+ "CURLYX", /* 0x2f */
+ "WHILEM", /* 0x30 */
+ "OPEN", /* 0x31 */
+ "CLOSE", /* 0x32 */
+ "REF", /* 0x33 */
+ "REFF", /* 0x34 */
+ "REFFL", /* 0x35 */
+ "REFFU", /* 0x36 */
+ "REFFA", /* 0x37 */
+ "NREF", /* 0x38 */
+ "NREFF", /* 0x39 */
+ "NREFFL", /* 0x3a */
+ "NREFFU", /* 0x3b */
+ "NREFFA", /* 0x3c */
+ "IFMATCH", /* 0x3d */
+ "UNLESSM", /* 0x3e */
+ "SUSPEND", /* 0x3f */
+ "IFTHEN", /* 0x40 */
+ "GROUPP", /* 0x41 */
+ "LONGJMP", /* 0x42 */
+ "BRANCHJ", /* 0x43 */
+ "EVAL", /* 0x44 */
+ "MINMOD", /* 0x45 */
+ "LOGICAL", /* 0x46 */
+ "RENUM", /* 0x47 */
+ "TRIE", /* 0x48 */
+ "TRIEC", /* 0x49 */
+ "AHOCORASICK", /* 0x4a */
+ "AHOCORASICKC", /* 0x4b */
+ "GOSUB", /* 0x4c */
+ "GOSTART", /* 0x4d */
+ "NGROUPP", /* 0x4e */
+ "INSUBP", /* 0x4f */
+ "DEFINEP", /* 0x50 */
+ "ENDLIKE", /* 0x51 */
+ "OPFAIL", /* 0x52 */
+ "ACCEPT", /* 0x53 */
+ "VERB", /* 0x54 */
+ "PRUNE", /* 0x55 */
+ "MARKPOINT", /* 0x56 */
+ "SKIP", /* 0x57 */
+ "COMMIT", /* 0x58 */
+ "CUTGROUP", /* 0x59 */
+ "KEEPS", /* 0x5a */
+ "LNBREAK", /* 0x5b */
+ "OPTIMIZED", /* 0x5c */
+ "PSEUDO", /* 0x5d */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -834,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
EXTCONST U8 PL_varies_bitmask[];
#else
EXTCONST U8 PL_varies_bitmask[] = {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
@@ -846,11 +706,8 @@ EXTCONST U8 PL_varies_bitmask[] = {
EXTCONST U8 PL_simple[] __attribute__deprecated__;
#else
EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
- REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, ALNUMU, ALNUMA, NALNUM,
- NALNUML, NALNUMU, NALNUMA, SPACE, SPACEL, SPACEU, SPACEA, NSPACE,
- NSPACEL, NSPACEU, NSPACEA, DIGIT, DIGITL, DIGITA, NDIGIT, NDIGITL,
- NDIGITA, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD, NPOSIXL, NPOSIXU,
- NPOSIXA, VERTWS, NVERTWS, HORIZWS, NHORIZWS,
+ REG_ANY, SANY, CANY, ANYOF, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
+ NPOSIXL, NPOSIXU, NPOSIXA,
0
};
#endif /* DOINIT */
@@ -859,7 +716,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
EXTCONST U8 PL_simple_bitmask[];
#else
EXTCONST U8 PL_simple_bitmask[] = {
- 0x00, 0x00, 0xFC, 0xFF, 0xFF, 0xEE, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00
+ 0x00, 0x00, 0xFC, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif /* DOINIT */