diff options
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | handy.h | 2 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | regcomp.c | 496 | ||||
-rw-r--r-- | regcomp.sym | 54 | ||||
-rw-r--r-- | regexec.c | 1200 | ||||
-rw-r--r-- | regnodes.h | 443 |
8 files changed, 849 insertions, 1354 deletions
@@ -2028,6 +2028,7 @@ Es |U8 |regtail_study |NN struct RExC_state_t *pRExC_state \ #if defined(PERL_IN_REGEXEC_C) ERs |bool |isFOO_lc |const U8 classnum|const U8 character +ERs |bool |isFOO_utf8_lc |const U8 classnum|NN const U8* character ERs |I32 |regmatch |NN regmatch_info *reginfo|NN char *startpos|NN regnode *prog ERs |I32 |regrepeat |NN const regexp *prog|NN char **startposp|NN const regnode *p|I32 max|int depth ERs |I32 |regtry |NN regmatch_info *reginfo|NN char **startposp @@ -972,6 +972,7 @@ #define core_regclass_swash(a,b,c,d) S_core_regclass_swash(aTHX_ a,b,c,d) #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b) +#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b) #define reg_check_named_buff_matched(a,b) S_reg_check_named_buff_matched(aTHX_ a,b) #define regcppop(a,b) S_regcppop(aTHX_ a,b) #define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c) @@ -803,7 +803,7 @@ typedef enum { #define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC #define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1) -#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 @@ -6799,6 +6799,12 @@ STATIC char* S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, cons STATIC bool S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) __attribute__warn_unused_result__; +STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \ + assert(character) + STATIC I32 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1) @@ -2950,34 +2950,6 @@ typedef struct 
scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) -#define CASE_SYNST_FNC(nAmE) \ -case nAmE: \ - if (flags & SCF_DO_STCLASS_AND) { \ - for (value = 0; value < 256; value++) \ - if (!is_ ## nAmE ## _cp(value)) \ - ANYOF_BITMAP_CLEAR(data->start_class, value); \ - } \ - else { \ - for (value = 0; value < 256; value++) \ - if (is_ ## nAmE ## _cp(value)) \ - ANYOF_BITMAP_SET(data->start_class, value); \ - } \ - break; \ -case N ## nAmE: \ - if (flags & SCF_DO_STCLASS_AND) { \ - for (value = 0; value < 256; value++) \ - if (is_ ## nAmE ## _cp(value)) \ - ANYOF_BITMAP_CLEAR(data->start_class, value); \ - } \ - else { \ - for (value = 0; value < 256; value++) \ - if (!is_ ## nAmE ## _cp(value)) \ - ANYOF_BITMAP_SET(data->start_class, value); \ - } \ - break - - - STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -4147,11 +4119,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } min++; if (flags & SCF_DO_STCLASS) { + int loop_max = 256; data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ /* Some of the logic below assumes that switching locale on will only add false positives. 
*/ switch (PL_regkind[OP(scan)]) { + U8 classnum; + case SANY: default: do_default: @@ -4178,200 +4153,75 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_or(pRExC_state, data->start_class, (struct regnode_charclass_class*)scan); break; - case ALNUM: + case POSIXA: + loop_max = 128; + case POSIXL: + case POSIXD: + case POSIXU: + classnum = FLAGS(scan); if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR); - if (OP(scan) == ALNUMU) { - for (value = 0; value < 256; value++) { - if (!isWORDCHAR_L1(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (!isALNUM(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } + ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); + for (value = 0; value < loop_max; value++) { + if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); } } } } else { - if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR); + if (data->start_class->flags & ANYOF_LOCALE) { + ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); + } + else { /* Even if under locale, set the bits for non-locale * in case it isn't a true locale-node. 
This will * create false positives if it truly is locale */ - if (OP(scan) == ALNUMU) { - for (value = 0; value < 256; value++) { - if (isWORDCHAR_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (isALNUM(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } + for (value = 0; value < loop_max; value++) { + if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { + ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); } } + } } break; - case NALNUM: + case NPOSIXA: + loop_max = 128; + case NPOSIXL: + case NPOSIXU: + case NPOSIXD: + classnum = FLAGS(scan); if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR); - if (OP(scan) == NALNUMU) { - for (value = 0; value < 256; value++) { - if (isWORDCHAR_L1(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } + ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); + for (value = 0; value < loop_max; value++) { + if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { + ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); } - } else { - for (value = 0; value < 256; value++) { - if (isALNUM(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - } + } } } else { - if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR); + if (data->start_class->flags & ANYOF_LOCALE) { + ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); + } + else { /* Even if under locale, set the bits for non-locale in * case it isn't a true locale-node. This will create * false positives if it truly is locale */ - if (OP(scan) == NALNUMU) { - for (value = 0; value < 256; value++) { - if (! isWORDCHAR_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (! 
isALNUM(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } - } - break; - case SPACE: - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); - if (OP(scan) == SPACEU) { - for (value = 0; value < 256; value++) { - if (!isSPACE_L1(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (!isSPACE(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } + for (value = 0; value < loop_max; value++) { + if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { + ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); } - if (OP(scan) == SPACEU) { - for (value = 0; value < 256; value++) { - if (isSPACE_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (isSPACE(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } - } - break; - case NSPACE: - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); - if (OP(scan) == NSPACEU) { - for (value = 0; value < 256; value++) { - if (isSPACE_L1(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (isSPACE(value)) { - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - if (OP(scan) == NSPACEU) { - for (value = 0; value < 256; value++) { - if (!isSPACE_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } + if (PL_regkind[OP(scan)] == NPOSIXD) { + data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; } - else { - for (value = 0; value < 256; value++) { - if 
(!isSPACE(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } } } break; - case DIGIT: - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); - for (value = 0; value < 256; value++) - if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } - break; - case NDIGIT: - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); - for (value = 0; value < 256; value++) - if (isDIGIT(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - else { - if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } - break; - CASE_SYNST_FNC(VERTWS); - CASE_SYNST_FNC(HORIZWS); - } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -6440,7 +6290,7 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE + else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; } @@ -9553,6 +9403,16 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } + else if (max == 0) { + if (SIZE_ONLY) { + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, NOTHING); + return ret; + } do_curly: if ((flags&SIMPLE)) { @@ -10120,9 +9980,11 @@ S_regatom(pTHX_ 
RExC_state_t *pRExC_state, I32 *flagp, U32 depth) I32 flags; char *parse_start = RExC_parse; U8 op; + int invert = 0; + GET_RE_DEBUG_FLAGS_DECL; DEBUG_PARSE("atom"); - *flagp = WORST; /* Tentatively. */ + *flagp = WORST; /* Tentatively. */ PERL_ARGS_ASSERT_REGATOM; @@ -10218,6 +10080,7 @@ tryagain: literal text handling code. */ switch ((U8)*++RExC_parse) { + U8 arg; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -10258,22 +10121,14 @@ tryagain: ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; goto finish_meta_pat; - case 'w': - op = ALNUM + get_regex_charset(RExC_flags); - if (op > ALNUMA) { /* /aa is same as /a */ - op = ALNUMA; - } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + case 'W': - op = NALNUM + get_regex_charset(RExC_flags); - if (op > NALNUMA) { /* /aa is same as /a */ - op = NALNUMA; - } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + invert = 1; + /* FALLTHROUGH */ + case 'w': + arg = ANYOF_WORDCHAR; + goto join_posix; + case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; @@ -10296,60 +10151,60 @@ tryagain: FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; + + case 'S': + invert = 1; + /* FALLTHROUGH */ case 's': - op = SPACE + get_regex_charset(RExC_flags); - if (op > SPACEA) { /* /aa is same as /a */ - op = SPACEA; + arg = ANYOF_SPACE; + + join_posix: + + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; - case 'S': - op = NSPACE + get_regex_charset(RExC_flags); - if (op > NSPACEA) { /* /aa is same as /a */ - op = NSPACEA; + + join_posix_op_known: + + if (invert) { + op += NPOSIXD - POSIXD; } ret = reg_node(pRExC_state, op); + if (! 
SIZE_ONLY) { + FLAGS(ret) = namedclass_to_classnum(arg); + } + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'D': - op = NDIGIT; - goto join_D_and_d; + invert = 1; + /* FALLTHROUGH */ case 'd': - op = DIGIT; - join_D_and_d: - { - U8 offset = get_regex_charset(RExC_flags); - if (offset == REGEX_UNICODE_CHARSET) { - offset = REGEX_DEPENDS_CHARSET; - } - else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { - offset = REGEX_ASCII_RESTRICTED_CHARSET; - } - op += offset; - } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + arg = ANYOF_DIGIT; + goto join_posix; + case 'R': ret = reg_node(pRExC_state, LNBREAK); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; - case 'h': - ret = reg_node(pRExC_state, HORIZWS); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + case 'H': - ret = reg_node(pRExC_state, NHORIZWS); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; - case 'v': - ret = reg_node(pRExC_state, VERTWS); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + invert = 1; + /* FALLTHROUGH */ + case 'h': + arg = ANYOF_BLANK; + op = POSIXU; + goto join_posix_op_known; + case 'V': - ret = reg_node(pRExC_state, NVERTWS); - *flagp |= HASWIDTH|SIMPLE; + invert = 1; + /* FALLTHROUGH */ + case 'v': + arg = ANYOF_VERTWS; + op = POSIXU; + goto join_posix_op_known; + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -12314,101 +12169,69 @@ parseit: if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or [:digit:] or \p{foo} */ - /* Certain named classes have equivalents that can appear outside a - * character class, e.g. \w, \H. We use these instead of a - * character class. */ + /* All named classes are mapped into POSIXish nodes, with its FLAG + * argument giving which class it is */ switch ((I32)namedclass) { - U8 offset; - - /* The first group is for node types that depend on the charset - * modifier to the regex. 
We first calculate the base node - * type, and if it should be inverted */ - - case ANYOF_NWORDCHAR: - invert = ! invert; - /* FALLTHROUGH */ - case ANYOF_WORDCHAR: - op = ALNUM; - goto join_charset_classes; - - case ANYOF_NSPACE: - invert = ! invert; - /* FALLTHROUGH */ - case ANYOF_SPACE: - op = SPACE; - goto join_charset_classes; - - case ANYOF_NDIGIT: - invert = ! invert; - /* FALLTHROUGH */ - case ANYOF_DIGIT: - op = DIGIT; - - join_charset_classes: - - /* Now that we have the base node type, we take advantage - * of the enum ordering of the charset modifiers to get the - * exact node type, For example the base SPACE also has - * SPACEL, SPACEU, and SPACEA */ - - offset = get_regex_charset(RExC_flags); - - /* /aa is the same as /a for these */ - if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { - offset = REGEX_ASCII_RESTRICTED_CHARSET; - } - else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) { - offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */ - } - - op += offset; - - /* The number of varieties of each of these is the same, - * hence, so is the delta between the normal and - * complemented nodes */ - if (invert) { - op += NALNUM - ALNUM; - } - *flagp |= HASWIDTH|SIMPLE; + case ANYOF_UNIPROP: break; - /* The second group doesn't depend of the charset modifiers. - * We just have normal and complemented */ + /* These don't depend on the charset modifiers. They always + * match under /u rules */ case ANYOF_NHORIZWS: - invert = ! invert; - /* FALLTHROUGH */ case ANYOF_HORIZWS: - is_horizws: - op = (invert) ? NHORIZWS : HORIZWS; - *flagp |= HASWIDTH|SIMPLE; - break; + namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; + /* FALLTHROUGH */ case ANYOF_NVERTWS: - invert = ! invert; - /* FALLTHROUGH */ case ANYOF_VERTWS: - op = (invert) ? NVERTWS : VERTWS; - *flagp |= HASWIDTH|SIMPLE; - break; - - case ANYOF_UNIPROP: - break; - - case ANYOF_NBLANK: - invert = ! invert; - /* FALLTHROUGH */ - case ANYOF_BLANK: - if (AT_LEAST_UNI_SEMANTICS && ! 
AT_LEAST_ASCII_RESTRICTED) { - goto is_horizws; + op = POSIXU; + goto join_posix; + + /* The actual POSIXish node for all the rest depends on the + * charset modifier. The ones in the first set depend only on + * ASCII or, if available on this platform, locale */ + case ANYOF_ASCII: + case ANYOF_NASCII: +#ifdef HAS_ISASCII + op = (LOC) ? POSIXL : POSIXA; +#else + op = POSIXA; +#endif + goto join_posix; + + case ANYOF_LOWER: + case ANYOF_NLOWER: + case ANYOF_UPPER: + case ANYOF_NUPPER: + /* under /a could be alpha */ + if (FOLD) { + if (ASCII_RESTRICTED) { + namedclass = ANYOF_ALPHA + (namedclass % 2); + } + else if (! LOC) { + break; + } } /* FALLTHROUGH */ + + /* The rest have more possibilities depending on the charset. We + * take advantage of the enum ordering of the charset modifiers to + * get the exact node type, */ default: - /* A generic posix class. All the /a ones can be handled - * by the POSIXA opcode. And all are closed under folding - * in the ASCII range, so FOLD doesn't matter */ - if (AT_LEAST_ASCII_RESTRICTED - || (! LOC && namedclass == ANYOF_ASCII)) + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } +#ifndef HAS_ISBLANK + if (op == POSIXL + && (namedclass == ANYOF_BLANK + || namedclass == ANYOF_NBLANK)) { + op = POSIXA; + } +#endif + + join_posix: /* The odd numbered ones are the complements of the * next-lower even number one */ if (namedclass % 2 == 1) { @@ -12416,8 +12239,6 @@ parseit: namedclass--; } arg = namedclass_to_classnum(namedclass); - op = (invert) ? NPOSIXA : POSIXA; - } break; } } @@ -12442,8 +12263,8 @@ parseit: else if (! LOC) { /* locale could vary these */ if (prevvalue == '0') { if (value == '9') { - op = (invert) ? 
NDIGITA : DIGITA; - *flagp |= HASWIDTH|SIMPLE; + arg = _CC_DIGIT; + op = POSIXA; } } } @@ -12469,6 +12290,11 @@ parseit: } else { RExC_emit = (regnode *)orig_emit; + if (PL_regkind[op] == POSIXD) { + if (invert) { + op += NPOSIXD - POSIXD; + } + } } ret = reg_node(pRExC_state, op); diff --git a/regcomp.sym b/regcomp.sym index eb8ba46238..2a49d20379 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -36,8 +36,7 @@ SEOL EOL, no ; Same, assuming singleline. # modifiers have to ordered thusly: /d, /l, /u, /a, /aa. This is because code # in regcomp.c uses the enum value of the modifier as an offset from the /d # version. The complements must come after the non-complements. -# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as -# EXACTF. +# BOUND, POSIX and their complements are affected, as well as EXACTF. BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8 BOUNDL BOUND, no ; Match "" at any locale word boundary BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics @@ -56,44 +55,16 @@ SANY REG_ANY, no 0 S ; Match any one character. CANY REG_ANY, no 0 S ; Match any one byte. ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class, single char match only -# Order (within each group) of the below is important. See ordering comment -# above. The PLACEHOLDERn ones are wasting a value. Right now, we have plenty -# to spare, but these would be obvious candidates if ever we ran out of node -# types in a U8. 
-ALNUM ALNUM, no 0 S ; Match any alphanumeric character using native charset semantics for non-utf8 -ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale -ALNUMU ALNUM, no 0 S ; Match any alphanumeric char using Unicode semantics -ALNUMA ALNUM, no 0 S ; Match [A-Za-z_0-9] -NALNUM NALNUM, no 0 S ; Match any non-alphanumeric character using native charset semantics for non-utf8 -NALNUML NALNUM, no 0 S ; Match any non-alphanumeric char in locale -NALNUMU NALNUM, no 0 S ; Match any non-alphanumeric char using Unicode semantics -NALNUMA NALNUM, no 0 S ; Match [^A-Za-z_0-9] -SPACE SPACE, no 0 S ; Match any whitespace character using native charset semantics for non-utf8 -SPACEL SPACE, no 0 S ; Match any whitespace char in locale -SPACEU SPACE, no 0 S ; Match any whitespace char using Unicode semantics -SPACEA SPACE, no 0 S ; Match [ \t\n\f\r] -NSPACE NSPACE, no 0 S ; Match any non-whitespace character using native charset semantics for non-utf8 -NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale -NSPACEU NSPACE, no 0 S ; Match any non-whitespace char using Unicode semantics -NSPACEA NSPACE, no 0 S ; Match [^ \t\n\f\r] -DIGIT DIGIT, no 0 S ; Match any numeric character using native charset semantics for non-utf8 -DIGITL DIGIT, no 0 S ; Match any numeric character in locale -PLACEHOLDER1 NOTHING, no ; placeholder for missing DIGITU -DIGITA DIGIT, no 0 S ; Match [0-9] -NDIGIT NDIGIT, no 0 S ; Match any non-numeric character using native charset semantics for non-utf8 -NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale -PLACEHOLDER2 NOTHING, no ; placeholder for missing NDIGITU -NDIGITA NDIGIT, no 0 S ; Match [^0-9] - -POSIXD POSIXD, none 0 S ; currently unused except as a placeholder -POSIXL POSIXD, none 0 S ; currently unused except as a placeholder -POSIXU POSIXD, none 0 S ; currently unused except as a placeholder +# Order of the below is important. See ordering comment above. 
+POSIXD POSIXD, none 0 S ; Some [[:class:]] under /d; the FLAGS field gives which one +POSIXL POSIXD, none 0 S ; Some [[:class:]] under /l; the FLAGS field gives which one +POSIXU POSIXD, none 0 S ; Some [[:class:]] under /u; the FLAGS field gives which one POSIXA POSIXD, none 0 S ; Some [[:class:]] under /a; the FLAGS field gives which one -NPOSIXD NPOSIXD, none 0 S ; currently unused except as a placeholder -NPOSIXL NPOSIXD, none 0 S ; currently unused except as a placeholder -NPOSIXU NPOSIXD, none 0 S ; currently unused except as a placeholder +NPOSIXD NPOSIXD, none 0 S ; complement of POSIXD, [[:^class:]] +NPOSIXL NPOSIXD, none 0 S ; complement of POSIXL, [[:^class:]] +NPOSIXU NPOSIXD, none 0 S ; complement of POSIXU, [[:^class:]] NPOSIXA NPOSIXD, none 0 S ; complement of POSIXA, [[:^class:]] -# End of order is important (within groups) +# End of order is important CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence @@ -237,13 +208,6 @@ KEEPS KEEPS, no ; $& begins here. #*New charclass like patterns LNBREAK LNBREAK, none ; generic newline pattern -# regcomp.c expects the node number of the complement to be one greater than -# the non-complement -VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6) -NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6) -HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6) -NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6) - # NEW STUFF SOMEWHERE ABOVE THIS LINE ################################################################################ @@ -174,101 +174,6 @@ static const char* const non_utf8_target_but_utf8_required #define PLACEHOLDER /* Something for the preprocessor to grab onto */ -/* The actual code for CCC_TRY, which uses several variables from the routine - * it's callable from. It is designed to be the bulk of a case statement. - * FUNC is the macro or function to call on non-utf8 targets that indicate if - * nextchr matches the class. 
- * UTF8_TEST is the whole test string to use for utf8 targets - * LOAD is what to use to test, and if not present to load in the swash for the - * class - * POS_OR_NEG is either empty or ! to complement the results of FUNC or - * UTF8_TEST test. - * The logic is: Fail if we're at the end-of-string; otherwise if the target is - * utf8 and a variant, load the swash if necessary and test using the utf8 - * test. Advance to the next character if test is ok, otherwise fail; If not - * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it - * fails, or advance to the next character */ - -#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \ - if (NEXTCHR_IS_EOS) { \ - sayNO; \ - } \ - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ - LOAD_UTF8_CHARCLASS(CLASS, STR); \ - if (POS_OR_NEG (UTF8_TEST)) { \ - sayNO; \ - } \ - } \ - else if (POS_OR_NEG (FUNC(nextchr))) { \ - sayNO; \ - } \ - goto increment_locinput; - -/* Handle the non-locale cases for a character class and its complement. It - * calls _CCC_TRY_CODE with a ! to complement the test for the character class. - * This is because that code fails when the test succeeds, so we want to have - * the test fail so that the code succeeds. The swash is stored in a - * predictable PL_ place */ -#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \ - CLASS, STR) \ - case NAME: \ - _CCC_TRY_CODE( !, FUNC, \ - cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ - (U8*)locinput, TRUE)), \ - CLASS, STR) \ - case NNAME: \ - _CCC_TRY_CODE( PLACEHOLDER , FUNC, \ - cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ - (U8*)locinput, TRUE)), \ - CLASS, STR) -/* Generate the case statements for both locale and non-locale character - * classes in regmatch for classes that don't have special unicode semantics. - * Locales don't use an immediate swash, but an intermediary special locale - * function that is called on the pointer to the current place in the input - * string. That function will resolve to needing the same swash. 
One might - * think that because we don't know what the locale will match, we shouldn't - * check with the swash loading function that it loaded properly; ie, that we - * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the - * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is - * irrelevant here */ -#define CCC_TRY(NAME, NNAME, FUNC, \ - NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ - NAMEA, NNAMEA, FUNCA, \ - CLASS, STR) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \ - case NNAMEL: \ - PL_reg_flags |= RF_tainted; \ - _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \ - CLASS, STR) \ - case NAMEA: \ - if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \ - sayNO; \ - } \ - /* Matched a utf8-invariant, so don't have to worry about utf8 */ \ - locinput++; \ - break; \ - case NNAMEA: \ - if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \ - sayNO; \ - } \ - goto increment_locinput; \ - /* Generate the non-locale cases */ \ - _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) - -/* This is like CCC_TRY, but has an extra set of parameters for generating case - * statements to handle separate Unicode semantics nodes */ -#define CCC_TRY_U(NAME, NNAME, FUNC, \ - NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ - NAMEU, NNAMEU, FUNCU, \ - NAMEA, NNAMEA, FUNCA, \ - CLASS, STR) \ - CCC_TRY(NAME, NNAME, FUNC, \ - NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ - NAMEA, NNAMEA, FUNCA, \ - CLASS, STR) \ - _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR) /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ @@ -549,6 +454,56 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) return FALSE; } +STATIC bool +S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) +{ + /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded + * 'character' is a member of the Posix character class given by 'classnum' + * that should be equivalent to a value in the typedef + * 
'_char_class_number'. + * + * This just calls isFOO_lc on the code point for the character if it is in + * the range 0-255. Outside that range, all characters use Unicode + * rules, ignoring any locale. So use the Unicode function if this class + * requires a swash, and use the Unicode macro otherwise. */ + + PERL_ARGS_ASSERT_ISFOO_UTF8_LC; + + if (UTF8_IS_INVARIANT(*character)) { + return isFOO_lc(classnum, *character); + } + else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { + return isFOO_lc(classnum, + TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + } + + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Initialize the swash unless done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", + swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + } + + return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE); + } + + switch ((_char_class_number) classnum) { + case _CC_ENUM_SPACE: + case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); + + case _CC_ENUM_BLANK: return is_HORIZWS_high(character); + case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); + case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); + default: return 0; /* Things like CNTRL are always + below 256 */ + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + /* * pregexec and friends */ @@ -1498,13 +1453,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const U8 *fold_array; /* array for folding ords < 256 */ STRLEN ln; STRLEN lnc; - STRLEN uskip; U8 c1; U8 c2; char *e; I32 tmp = 1; /* Scratch variable? */ const bool utf8_target = PL_reg_match_utf8; UV utf8_fold_flags = 0; + bool to_complement = FALSE; /* Invert the result? 
Taking the xor of this + with a result inverts that result, as 0^1 = + 1 and 1^1 = 0 */ + _char_class_number classnum; + RXi_GET_DECL(prog,progi); PERL_ARGS_ASSERT_FIND_BYCLASS; @@ -1710,182 +1669,155 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, isALNUM_uni(tmp), cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); break; - case ALNUML: - REXEC_FBC_CSCAN_TAINT( - isALNUM_LC_utf8((U8*)s), - isALNUM_LC(*s) - ); - break; - case ALNUMU: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), - isWORDCHAR_L1((U8) *s) - ); - break; - case ALNUM: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), - isWORDCHAR((U8) *s) - ); - break; - case ALNUMA: - /* Don't need to worry about utf8, as it can match only a single - * byte invariant character */ - REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s)); - break; - case NALNUMU: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), - ! isWORDCHAR_L1((U8) *s) - ); - break; - case NALNUM: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_ALNUM(), - !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target), - ! isALNUM(*s) - ); - break; - case NALNUMA: - REXEC_FBC_CSCAN( - !isWORDCHAR_A(*s), - !isWORDCHAR_A(*s) - ); - break; - case NALNUML: - REXEC_FBC_CSCAN_TAINT( - !isALNUM_LC_utf8((U8*)s), - !isALNUM_LC(*s) - ); - break; - case SPACEU: - REXEC_FBC_CSCAN( - is_XPERLSPACE_utf8(s), - isSPACE_L1((U8) *s) - ); - break; - case SPACE: - REXEC_FBC_CSCAN( - is_XPERLSPACE_utf8(s), - isSPACE((U8) *s) - ); - break; - case SPACEA: - /* Don't need to worry about utf8, as it can match only a single - * byte invariant character */ - REXEC_FBC_CLASS_SCAN( isSPACE_A(*s)); - break; - case SPACEL: - REXEC_FBC_CSCAN_TAINT( - isSPACE_LC_utf8((U8*)s), - isSPACE_LC(*s) - ); - break; - case NSPACEU: - REXEC_FBC_CSCAN( - ! is_XPERLSPACE_utf8(s), - ! 
isSPACE_L1((U8) *s) - ); - break; - case NSPACE: - REXEC_FBC_CSCAN( - ! is_XPERLSPACE_utf8(s), - ! isSPACE((U8) *s) - ); - break; - case NSPACEA: - REXEC_FBC_CSCAN( - !isSPACE_A(*s), - !isSPACE_A(*s) - ); - break; - case NSPACEL: - REXEC_FBC_CSCAN_TAINT( - !isSPACE_LC_utf8((U8*)s), - !isSPACE_LC(*s) - ); - break; - case DIGIT: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_DIGIT(), - swash_fetch(PL_utf8_digit,(U8*)s, utf8_target), - isDIGIT(*s) - ); - break; - case DIGITA: - /* Don't need to worry about utf8, as it can match only a single - * byte invariant character */ - REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s)); - break; - case DIGITL: - REXEC_FBC_CSCAN_TAINT( - isDIGIT_LC_utf8((U8*)s), - isDIGIT_LC(*s) - ); - break; - case NDIGIT: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_DIGIT(), - !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target), - !isDIGIT(*s) - ); - break; - case NDIGITA: - REXEC_FBC_CSCAN( - !isDIGIT_A(*s), - !isDIGIT_A(*s) - ); - break; - case NDIGITL: - REXEC_FBC_CSCAN_TAINT( - !isDIGIT_LC_utf8((U8*)s), - !isDIGIT_LC(*s) - ); - break; case LNBREAK: REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), is_LNBREAK_latin1_safe(s, strend) ); break; - case VERTWS: - REXEC_FBC_CSCAN( - is_VERTWS_utf8_safe(s, strend), - is_VERTWS_latin1_safe(s, strend) - ); - break; - case NVERTWS: - REXEC_FBC_CSCAN( - !is_VERTWS_utf8_safe(s, strend), - !is_VERTWS_latin1_safe(s, strend) - ); - break; - case HORIZWS: - REXEC_FBC_CSCAN( - is_HORIZWS_utf8_safe(s, strend), - is_HORIZWS_latin1_safe(s, strend) - ); - break; - case NHORIZWS: - REXEC_FBC_CSCAN( - !is_HORIZWS_utf8_safe(s, strend), - !is_HORIZWS_latin1_safe(s, strend) - ); + + /* The argument to all the POSIX node types is the class number to pass to + * _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + PL_reg_flags |= RF_tainted; + REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), + to_complement ^ 
cBOOL(isFOO_lc(FLAGS(c), *s))); break; + + case NPOSIXD: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: + if (utf8_target) { + goto posix_utf8; + } + goto posixa; + + case NPOSIXA: + if (utf8_target) { + /* The complement of something that matches only ASCII matches all + * UTF-8 variant code points, plus everything in ASCII that isn't + * in the class */ + REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + } + + to_complement = 1; + /* FALLTHROUGH */ + case POSIXA: + posixa: /* Don't need to worry about utf8, as it can match only a single - * byte invariant character. The flag in this node type is the - * class number to pass to _generic_isCC() to build a mask for - * searching in PL_charclass[] */ - REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c))); + * byte invariant character. */ + REXEC_FBC_CLASS_SCAN( + to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); break; - case NPOSIXA: - REXEC_FBC_CSCAN( - !_generic_isCC_A(*s, FLAGS(c)), - !_generic_isCC_A(*s, FLAGS(c)) - ); + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + FLAGS(c)))); + } + else { + + posix_utf8: + classnum = (_char_class_number) FLAGS(c); + if (classnum < _FIRST_NON_SWASH_CC) { + while (s < strend) { + + /* We avoid loading in the swash as long as possible, but + * should we have to, we jump to a separate loop. 
This + * extra 'if' statement is what keeps this code from being + * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + if (UTF8_IS_ABOVE_LATIN1(*s)) { + goto found_above_latin1; + } + if ((UTF8_IS_INVARIANT(*s) + && to_complement ^ cBOOL(_generic_isCC((U8) *s, + classnum))) + || (UTF8_IS_DOWNGRADEABLE_START(*s) + && to_complement ^ cBOOL( + _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + classnum)))) + { + if (tmp && (!reginfo || regtry(reginfo, &s))) + goto got_it; + else { + tmp = doevery; + } + } + else { + tmp = 1; + } + s += UTF8SKIP(s); + } + } + else switch (classnum) { /* These classes are implemented as + macros */ + case _CC_ENUM_SPACE: /* XXX would require separate code if we + revert the change of \v matching this */ + /* FALL THROUGH */ + + case _CC_ENUM_PSXSPC: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isSPACE_utf8(s))); + break; + + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isBLANK_utf8(s))); + break; + + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8(s))); + break; + + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8(s))); + break; + + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isCNTRL_utf8(s))); + break; + + default: + Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); + assert(0); /* NOTREACHED */ + } + } + break; + + found_above_latin1: /* Here we have to load a swash to get the result + for the current code point */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", swash_property_names[classnum], + &PL_sv_undef, 1, 0, NULL, &flags); + } + + /* This is a copy of the loop above for swash classes, though using the + * FBC macro instead of being expanded out. 
Since we've loaded the + * swash, we don't have to check for that each time through the loop */ + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(_generic_utf8( + classnum, + s, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) s, TRUE)))); break; case AHOCORASICKC: @@ -3636,6 +3568,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ CHECKPOINT runops_cp; /* savestack position before executing EVAL */ U32 maxopenparen = 0; /* max '(' index seen so far */ + int to_complement; /* Invert the result? */ + _char_class_number classnum; #ifdef DEBUGGING GET_RE_DEBUG_FLAGS_DECL; @@ -3697,6 +3631,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) state_num = OP(scan); reenter_switch: + to_complement = 0; SET_nextchr; assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); @@ -4362,100 +4297,184 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; - /* Special char classes: \d, \w etc. - * The defines start on line 166 or so */ - CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR, - ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8, - ALNUMU, NALNUMU, isWORDCHAR_L1, - ALNUMA, NALNUMA, isWORDCHAR_A, - alnum, "a"); + /* The argument (FLAGS) to all the POSIX node types is the class number + * */ - case SPACEL: - PL_reg_flags |= RF_tainted; - if (NEXTCHR_IS_EOS) { + case NPOSIXL: /* \W or [:^punct:] etc. under /l */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: /* \w or [:punct:] etc. under /l */ + if (NEXTCHR_IS_EOS) sayNO; - } - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { - if (! isSPACE_LC_utf8((U8 *) locinput)) { - sayNO; - } - } - else if (! 
isSPACE_LC((U8) nextchr)) { - sayNO; - } - goto increment_locinput; - case NSPACEL: + /* The locale hasn't influenced the outcome before this, so defer + * tainting until now */ PL_reg_flags |= RF_tainted; - if (NEXTCHR_IS_EOS) { - sayNO; - } - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { - if (isSPACE_LC_utf8((U8 *) locinput)) { + + /* Use isFOO_lc() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), nextchr)))) { sayNO; } } - else if (isSPACE_LC(nextchr)) { + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), + TWO_BYTE_UTF8_TO_UNI(nextchr, + *(locinput + 1)))))) + { sayNO; + } } - goto increment_locinput; - - case SPACE: - if (utf8_target) { - goto utf8_space; + else { /* Here, must be an above Latin-1 code point */ + goto utf8_posix_not_eos; } - /* FALL THROUGH */ - case SPACEA: - if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) { - sayNO; - } - /* Matched a utf8-invariant, so don't have to worry about utf8 */ - locinput++; + + /* Here, must be utf8 */ + locinput += UTF8SKIP(locinput); break; - case NSPACE: + case NPOSIXD: /* \W or [:^punct:] etc. under /d */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: /* \w or [:punct:] etc. under /d */ if (utf8_target) { - goto utf8_nspace; - } - /* FALL THROUGH */ - case NSPACEA: - if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) { - sayNO; + goto utf8_posix; } - goto increment_locinput; + goto posixa; + + case NPOSIXA: /* \W or [:^punct:] etc. under /a */ - case SPACEU: - utf8_space: - if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) { + if (NEXTCHR_IS_EOS) { sayNO; } - goto increment_locinput; - case NSPACEU: - utf8_nspace: - if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) { - sayNO; + /* All UTF-8 variants match */ + if (! 
UTF8_IS_INVARIANT(nextchr)) { + goto increment_locinput; } - goto increment_locinput; - CCC_TRY(DIGIT, NDIGIT, isDIGIT, - DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8, - DIGITA, NDIGITA, isDIGIT_A, - digit, "0"); + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: /* \w or [:punct:] etc. under /a */ + + posixa: + /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in + * UTF-8, and also from NPOSIXA even in UTF-8 when the current + * character is a single byte */ - case POSIXA: /* /[[:ascii:]]/ etc */ - if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) { + if (NEXTCHR_IS_EOS + || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) + { sayNO; } - /* Matched a utf8-invariant, so don't have to worry about utf8 */ + + /* Here we are either not in utf8, or we matched a utf8-invariant, + * so the next char is the next byte */ locinput++; break; - case NPOSIXA: /* /[^[:ascii:]]/ etc */ - if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) { + case NPOSIXU: /* \W or [:^punct:] etc. under /u */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: /* \w or [:punct:] etc. under /u */ + utf8_posix: + if (NEXTCHR_IS_EOS) { sayNO; } - goto increment_locinput; + utf8_posix_not_eos: + + /* Use _generic_isCC() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + FLAGS(scan))))) + { + sayNO; + } + locinput++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + *(locinput + 1)), + FLAGS(scan))))) + { + sayNO; + } + locinput += 2; + } + else { /* Handle above Latin-1 code points */ + classnum = (_char_class_number) FLAGS(scan); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, uses a swash to find such code points. Load it if + * not done already */ + if (! 
PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] + = _core_swash_init("utf8", + swash_property_names[classnum], + &PL_sv_undef, 1, 0, NULL, &flags); + } + if (! (to_complement + ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) locinput, TRUE)))) + { + sayNO; + } + } + else { /* Here, uses macros to find above Latin-1 code points */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate + code if we revert the change + of \v matching this */ + case _CC_ENUM_PSXSPC: + if (! (to_complement + ^ cBOOL(is_XPERLSPACE_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_BLANK: + if (! (to_complement + ^ cBOOL(is_HORIZWS_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_XDIGIT: + if (! (to_complement + ^ cBOOL(is_XDIGIT_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_VERTSPACE: + if (! (to_complement + ^ cBOOL(is_VERTWS_high(locinput)))) + { + sayNO; + } + break; + default: /* The rest, e.g. [:cntrl:], can't match + above Latin1 */ + if (! to_complement) { + sayNO; + } + break; + } + } + locinput += UTF8SKIP(locinput); + } + break; case CLUMP: /* Match \X: logical Unicode character. 
This is defined as a Unicode extended Grapheme Cluster */ @@ -6417,29 +6436,6 @@ NULL sayNO; break; -#define CASE_CLASS(nAmE) \ - case nAmE: \ - if (NEXTCHR_IS_EOS) \ - sayNO; \ - if ((n=is_##nAmE(locinput,utf8_target))) { \ - locinput += n; \ - } else \ - sayNO; \ - break; \ - case N##nAmE: \ - if (NEXTCHR_IS_EOS) \ - sayNO; \ - if ((n=is_##nAmE(locinput,utf8_target))) { \ - sayNO; \ - } else { \ - locinput += UTF8SKIP(locinput); \ - } \ - break - - CASE_CLASS(VERTWS); /* \v \V */ - CASE_CLASS(HORIZWS); /* \h \H */ -#undef CASE_CLASS - default: PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", PTR2UV(scan), OP(scan)); @@ -6665,7 +6661,9 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma char *loceol = PL_regeol; /* local version */ I32 hardcount = 0; /* How many matches so far */ bool utf8_target = PL_reg_match_utf8; + int to_complement = 0; /* Invert the result? */ UV utf8_flags; + _char_class_number classnum; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -6887,79 +6885,38 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan++; } break; - case ALNUMU: - if (utf8_target) { - utf8_wordchar: - LOAD_UTF8_CHARCLASS_ALNUM(); - while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) - { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { - scan++; - } - } - break; - case ALNUM: - if (utf8_target) - goto utf8_wordchar; - while (scan < loceol && isALNUM((U8) *scan)) { - scan++; - } - break; - case ALNUMA: - if (utf8_target && scan + max < loceol) { - /* We didn't adjust <loceol> because is UTF-8, but ok to do so, - * since here, to match, 1 char == 1 byte */ - loceol = scan + max; - } - while (scan < loceol && isWORDCHAR_A((U8) *scan)) { - scan++; - } - break; - case ALNUML: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - isALNUM_LC_utf8((U8*)scan)) { - 
scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && isALNUM_LC(*scan)) - scan++; - } - break; - case NALNUMU: - if (utf8_target) { + /* The argument (FLAGS) to all the POSIX node types is the class number */ - utf8_Nwordchar: + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ - LOAD_UTF8_CHARCLASS_ALNUM(); - while (hardcount < max && scan < loceol && - ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + case POSIXL: + PL_reg_flags |= RF_tainted; + if (! utf8_target) { + while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), + *scan))) { - scan += UTF8SKIP(scan); + scan++; + } + } else { + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), + (U8 *) scan))) + { + scan += UTF8SKIP(scan); hardcount++; } - } else { - while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { - scan++; - } - } - break; - case NALNUM: - if (utf8_target) - goto utf8_Nwordchar; - while (scan < loceol && ! isALNUM((U8) *scan)) { - scan++; } break; + case POSIXD: + if (utf8_target) { + goto utf8_posix; + } + /* FALLTHROUGH */ + case POSIXA: if (utf8_target && scan + max < loceol) { @@ -6972,232 +6929,170 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan++; } break; - case NPOSIXA: - if (utf8_target) { - while (scan < loceol && hardcount < max - && ! _generic_isCC_A((U8) *scan, FLAGS(p))) - { - scan += UTF8SKIP(scan); - hardcount++; - } - } - else { - while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { - scan++; - } - } - break; - case NALNUMA: - if (utf8_target) { - while (scan < loceol && hardcount < max - && ! isWORDCHAR_A((U8) *scan)) - { - scan += UTF8SKIP(scan); - hardcount++; - } - } - else { - while (scan < loceol && ! 
isWORDCHAR_A((U8) *scan)) { - scan++; - } - } - break; - case NALNUML: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - !isALNUM_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !isALNUM_LC(*scan)) - scan++; - } - break; - case SPACEU: - if (utf8_target) { - utf8_space: + case NPOSIXD: + if (utf8_target) { + to_complement = 1; + goto utf8_posix; + } + /* FALL THROUGH */ - while (hardcount < max && scan < loceol - && is_XPERLSPACE_utf8((U8*)scan)) - { - scan += UTF8SKIP(scan); - hardcount++; - } - break; - } - else { - while (scan < loceol && isSPACE_L1((U8) *scan)) { + case NPOSIXA: + if (! utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { scan++; } - break; - } - case SPACE: - if (utf8_target) - goto utf8_space; - - while (scan < loceol && isSPACE((U8) *scan)) { - scan++; - } - break; - case SPACEA: - if (utf8_target && scan + max < loceol) { - - /* We didn't adjust <loceol> because is UTF-8, but ok to do so, - * since here, to match, 1 char == 1 byte */ - loceol = scan + max; } - while (scan < loceol && isSPACE_A((U8) *scan)) { - scan++; - } - break; - case SPACEL: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - isSPACE_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && isSPACE_LC(*scan)) - scan++; - } - break; - case NSPACEU: - if (utf8_target) { - - utf8_Nspace: + else { + /* The complement of something that matches only ASCII matches all + * UTF-8 variant code points, plus everything in ASCII that isn't + * in the class. */ while (hardcount < max && scan < loceol - && ! is_XPERLSPACE_utf8((U8*)scan)) + && (! UTF8_IS_INVARIANT(*scan) + || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) { - scan += UTF8SKIP(scan); + scan += UTF8SKIP(scan); hardcount++; } - break; - } - else { - while (scan < loceol && ! 
isSPACE_L1((U8) *scan)) { - scan++; - } - } - break; - case NSPACE: - if (utf8_target) - goto utf8_Nspace; + } + break; - while (scan < loceol && ! isSPACE((U8) *scan)) { - scan++; - } - break; - case NSPACEA: - if (utf8_target) { - while (hardcount < max && scan < loceol - && ! isSPACE_A((U8) *scan)) + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + while (scan < loceol && to_complement + ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) { - scan += UTF8SKIP(scan); - hardcount++; - } + scan++; + } } else { - while (scan < loceol && ! isSPACE_A((U8) *scan)) { - scan++; - } - } - break; - case NSPACEL: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - !isSPACE_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !isSPACE_LC(*scan)) - scan++; - } - break; - case DIGIT: - if (utf8_target) { - LOAD_UTF8_CHARCLASS_DIGIT(); - while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && isDIGIT(*scan)) - scan++; + utf8_posix: + classnum = (_char_class_number) FLAGS(p); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, a swash is needed for above-Latin1 code points. + * Process as many Latin1 code points using the built-in rules. + * Go to another loop to finish processing upon encountering + * the first above-Latin1 code point. We could do that in this loop + * as well, but the other way saves having to test if the swash + * has been loaded every time through the loop: extra space to + * save a test. */ + while (hardcount < max && scan < loceol) { + if (UTF8_IS_INVARIANT(*scan)) { + if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, + classnum)))) + { + break; + } + scan++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { + if (! 
(to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, + *(scan + 1)), + classnum)))) + { + break; + } + scan += 2; + } + else { + goto found_above_latin1; + } + + hardcount++; + } + } + else { + /* For these character classes, the knowledge of how to handle + * every code point is compiled in to Perl via a macro. This + * code is written for making the loops as tight as possible. + * It could be refactored to save space instead */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate code + if we revert the change of \v + matching this */ + /* FALL THROUGH */ + case _CC_ENUM_PSXSPC: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_BLANK: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_XDIGIT: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_VERTSPACE: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_CNTRL: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + default: + Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); + } + } } - break; - case DIGITA: - if (utf8_target && scan + max < loceol) { + break; - /* We didn't adjust <loceol> because is UTF-8, but ok to do so, - * since here, to match, 1 char == 1 byte */ - loceol = scan + max; + found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ + + /* Load the swash if not already present */ + if (! 
PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init( + "utf8", swash_property_names[classnum], + &PL_sv_undef, 1, 0, NULL, &flags); } - while (scan < loceol && isDIGIT_A((U8) *scan)) { - scan++; - } - break; - case DIGITL: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - isDIGIT_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && isDIGIT_LC(*scan)) - scan++; - } - break; - case NDIGIT: - if (utf8_target) { - LOAD_UTF8_CHARCLASS_DIGIT(); - while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !isDIGIT(*scan)) - scan++; - } - break; - case NDIGITA: - if (utf8_target) { - while (hardcount < max && scan < loceol - && ! isDIGIT_A((U8) *scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } - else { - while (scan < loceol && ! 
isDIGIT_A((U8) *scan)) { - scan++; - } - } - break; - case NDIGITL: - PL_reg_flags |= RF_tainted; - if (utf8_target) { - while (hardcount < max && scan < loceol && - !isDIGIT_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !isDIGIT_LC(*scan)) - scan++; - } - break; + + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(_generic_utf8( + classnum, + scan, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) scan, + TRUE)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case LNBREAK: if (utf8_target) { while (hardcount < max && scan < loceol && @@ -7216,61 +7111,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } } break; - case HORIZWS: - if (utf8_target) { - while (hardcount < max && scan < loceol && - (c=is_HORIZWS_utf8_safe(scan, loceol))) - { - scan += c; - hardcount++; - } - } else { - while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol)) - scan++; - } - break; - case NHORIZWS: - if (utf8_target) { - while (hardcount < max && scan < loceol && - !is_HORIZWS_utf8_safe(scan, loceol)) - { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol)) - scan++; - - } - break; - case VERTWS: - if (utf8_target) { - while (hardcount < max && scan < loceol && - (c=is_VERTWS_utf8_safe(scan, loceol))) - { - scan += c; - hardcount++; - } - } else { - while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol)) - scan++; - - } - break; - case NVERTWS: - if (utf8_target) { - while (hardcount < max && scan < loceol && - !is_VERTWS_utf8_safe(scan, loceol)) - { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol)) - scan++; - - } - break; case BOUND: case BOUNDA: diff --git a/regnodes.h b/regnodes.h index 2024d156bb..e1fdad1fb9 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define 
REGNODE_MAX 121 -#define REGMATCH_STATE_MAX 161 +#define REGNODE_MAX 93 +#define REGMATCH_STATE_MAX 133 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -31,106 +31,78 @@ #define SANY 19 /* 0x13 Match any one character. */ #define CANY 20 /* 0x14 Match any one byte. */ #define ANYOF 21 /* 0x15 Match character in (or not in) this class, single char match only */ -#define ALNUM 22 /* 0x16 Match any alphanumeric character using native charset semantics for non-utf8 */ -#define ALNUML 23 /* 0x17 Match any alphanumeric char in locale */ -#define ALNUMU 24 /* 0x18 Match any alphanumeric char using Unicode semantics */ -#define ALNUMA 25 /* 0x19 Match [A-Za-z_0-9] */ -#define NALNUM 26 /* 0x1a Match any non-alphanumeric character using native charset semantics for non-utf8 */ -#define NALNUML 27 /* 0x1b Match any non-alphanumeric char in locale */ -#define NALNUMU 28 /* 0x1c Match any non-alphanumeric char using Unicode semantics */ -#define NALNUMA 29 /* 0x1d Match [^A-Za-z_0-9] */ -#define SPACE 30 /* 0x1e Match any whitespace character using native charset semantics for non-utf8 */ -#define SPACEL 31 /* 0x1f Match any whitespace char in locale */ -#define SPACEU 32 /* 0x20 Match any whitespace char using Unicode semantics */ -#define SPACEA 33 /* 0x21 Match [ \t\n\f\r] */ -#define NSPACE 34 /* 0x22 Match any non-whitespace character using native charset semantics for non-utf8 */ -#define NSPACEL 35 /* 0x23 Match any non-whitespace char in locale */ -#define NSPACEU 36 /* 0x24 Match any non-whitespace char using Unicode semantics */ -#define NSPACEA 37 /* 0x25 Match [^ \t\n\f\r] */ -#define DIGIT 38 /* 0x26 Match any numeric character using native charset semantics for non-utf8 */ -#define DIGITL 39 /* 0x27 Match any numeric character in locale */ -#define PLACEHOLDER1 40 /* 0x28 placeholder for missing DIGITU */ -#define DIGITA 41 /* 0x29 Match [0-9] */ -#define NDIGIT 42 /* 0x2a Match any non-numeric 
character using native charset semantics for non-utf8 */ -#define NDIGITL 43 /* 0x2b Match any non-numeric character in locale */ -#define PLACEHOLDER2 44 /* 0x2c placeholder for missing NDIGITU */ -#define NDIGITA 45 /* 0x2d Match [^0-9] */ -#define POSIXD 46 /* 0x2e currently unused except as a placeholder */ -#define POSIXL 47 /* 0x2f currently unused except as a placeholder */ -#define POSIXU 48 /* 0x30 currently unused except as a placeholder */ -#define POSIXA 49 /* 0x31 Some [[:class:]] under /a; the FLAGS field gives which one */ -#define NPOSIXD 50 /* 0x32 currently unused except as a placeholder */ -#define NPOSIXL 51 /* 0x33 currently unused except as a placeholder */ -#define NPOSIXU 52 /* 0x34 currently unused except as a placeholder */ -#define NPOSIXA 53 /* 0x35 complement of POSIXA, [[:^class:]] */ -#define CLUMP 54 /* 0x36 Match any extended grapheme cluster sequence */ -#define BRANCH 55 /* 0x37 Match this alternative, or the next... */ -#define BACK 56 /* 0x38 Match "", "next" ptr points backward. */ -#define EXACT 57 /* 0x39 Match this string (preceded by length). */ -#define EXACTF 58 /* 0x3a Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ -#define EXACTFL 59 /* 0x3b Match this string (not guaranteed to be folded) using /il rules (w/len). */ -#define EXACTFU 60 /* 0x3c Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFA 61 /* 0x3d Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ -#define EXACTFU_SS 62 /* 0x3e Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFU_TRICKYFOLD 63 /* 0x3f Match this folded UTF-8 string using /iu rules */ -#define NOTHING 64 /* 0x40 Match empty string. */ -#define TAIL 65 /* 0x41 Match empty string. Can jump here from outside. 
*/ -#define STAR 66 /* 0x42 Match this (simple) thing 0 or more times. */ -#define PLUS 67 /* 0x43 Match this (simple) thing 1 or more times. */ -#define CURLY 68 /* 0x44 Match this simple thing {n,m} times. */ -#define CURLYN 69 /* 0x45 Capture next-after-this simple thing */ -#define CURLYM 70 /* 0x46 Capture this medium-complex thing {n,m} times. */ -#define CURLYX 71 /* 0x47 Match this complex thing {n,m} times. */ -#define WHILEM 72 /* 0x48 Do curly processing and see if rest matches. */ -#define OPEN 73 /* 0x49 Mark this point in input as start of */ -#define CLOSE 74 /* 0x4a Analogous to OPEN. */ -#define REF 75 /* 0x4b Match some already matched string */ -#define REFF 76 /* 0x4c Match already matched string, folded using native charset semantics for non-utf8 */ -#define REFFL 77 /* 0x4d Match already matched string, folded in loc. */ -#define REFFU 78 /* 0x4e Match already matched string, folded using unicode semantics for non-utf8 */ -#define REFFA 79 /* 0x4f Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define NREF 80 /* 0x50 Match some already matched string */ -#define NREFF 81 /* 0x51 Match already matched string, folded using native charset semantics for non-utf8 */ -#define NREFFL 82 /* 0x52 Match already matched string, folded in loc. */ -#define NREFFU 83 /* 0x53 Match already matched string, folded using unicode semantics for non-utf8 */ -#define NREFFA 84 /* 0x54 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define IFMATCH 85 /* 0x55 Succeeds if the following matches. */ -#define UNLESSM 86 /* 0x56 Fails if the following matches. */ -#define SUSPEND 87 /* 0x57 "Independent" sub-RE. */ -#define IFTHEN 88 /* 0x58 Switch, should be preceded by switcher . */ -#define GROUPP 89 /* 0x59 Whether the group matched. */ -#define LONGJMP 90 /* 0x5a Jump far away. */ -#define BRANCHJ 91 /* 0x5b BRANCH with long offset. 
*/ -#define EVAL 92 /* 0x5c Execute some Perl code. */ -#define MINMOD 93 /* 0x5d Next operator is not greedy. */ -#define LOGICAL 94 /* 0x5e Next opcode should set the flag only. */ -#define RENUM 95 /* 0x5f Group with independently numbered parens. */ -#define TRIE 96 /* 0x60 Match many EXACT(F[ALU]?)? at once. flags==type */ -#define TRIEC 97 /* 0x61 Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 98 /* 0x62 Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 99 /* 0x63 Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 100 /* 0x64 recurse to paren arg1 at (signed) ofs arg2 */ -#define GOSTART 101 /* 0x65 recurse to start of pattern */ -#define NGROUPP 102 /* 0x66 Whether the group matched. */ -#define INSUBP 103 /* 0x67 Whether we are in a specific recurse. */ -#define DEFINEP 104 /* 0x68 Never execute directly. */ -#define ENDLIKE 105 /* 0x69 Used only for the type field of verbs */ -#define OPFAIL 106 /* 0x6a Same as (?!) */ -#define ACCEPT 107 /* 0x6b Accepts the current matched string. */ -#define VERB 108 /* 0x6c Used only for the type field of verbs */ -#define PRUNE 109 /* 0x6d Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 110 /* 0x6e Push the current location for rollback by cut. */ -#define SKIP 111 /* 0x6f On failure skip forward (to the mark) before retrying */ -#define COMMIT 112 /* 0x70 Pattern fails outright if backtracking through this */ -#define CUTGROUP 113 /* 0x71 On failure go to the next alternation in the group */ -#define KEEPS 114 /* 0x72 $& begins here. */ -#define LNBREAK 115 /* 0x73 generic newline pattern */ -#define VERTWS 116 /* 0x74 vertical whitespace (Perl 6) */ -#define NVERTWS 117 /* 0x75 not vertical whitespace (Perl 6) */ -#define HORIZWS 118 /* 0x76 horizontal whitespace (Perl 6) */ -#define NHORIZWS 119 /* 0x77 not horizontal whitespace (Perl 6) */ -#define OPTIMIZED 120 /* 0x78 Placeholder for dump. 
*/ -#define PSEUDO 121 /* 0x79 Pseudo opcode for internal use. */ +#define POSIXD 22 /* 0x16 Some [[:class:]] under /d; the FLAGS field gives which one */ +#define POSIXL 23 /* 0x17 Some [[:class:]] under /l; the FLAGS field gives which one */ +#define POSIXU 24 /* 0x18 Some [[:class:]] under /u; the FLAGS field gives which one */ +#define POSIXA 25 /* 0x19 Some [[:class:]] under /a; the FLAGS field gives which one */ +#define NPOSIXD 26 /* 0x1a complement of POSIXD, [[:^class:]] */ +#define NPOSIXL 27 /* 0x1b complement of POSIXL, [[:^class:]] */ +#define NPOSIXU 28 /* 0x1c complement of POSIXU, [[:^class:]] */ +#define NPOSIXA 29 /* 0x1d complement of POSIXA, [[:^class:]] */ +#define CLUMP 30 /* 0x1e Match any extended grapheme cluster sequence */ +#define BRANCH 31 /* 0x1f Match this alternative, or the next... */ +#define BACK 32 /* 0x20 Match "", "next" ptr points backward. */ +#define EXACT 33 /* 0x21 Match this string (preceded by length). */ +#define EXACTF 34 /* 0x22 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ +#define EXACTFL 35 /* 0x23 Match this string (not guaranteed to be folded) using /il rules (w/len). */ +#define EXACTFU 36 /* 0x24 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFA 37 /* 0x25 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ +#define EXACTFU_SS 38 /* 0x26 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFU_TRICKYFOLD 39 /* 0x27 Match this folded UTF-8 string using /iu rules */ +#define NOTHING 40 /* 0x28 Match empty string. */ +#define TAIL 41 /* 0x29 Match empty string. Can jump here from outside. */ +#define STAR 42 /* 0x2a Match this (simple) thing 0 or more times. */ +#define PLUS 43 /* 0x2b Match this (simple) thing 1 or more times. 
*/ +#define CURLY 44 /* 0x2c Match this simple thing {n,m} times. */ +#define CURLYN 45 /* 0x2d Capture next-after-this simple thing */ +#define CURLYM 46 /* 0x2e Capture this medium-complex thing {n,m} times. */ +#define CURLYX 47 /* 0x2f Match this complex thing {n,m} times. */ +#define WHILEM 48 /* 0x30 Do curly processing and see if rest matches. */ +#define OPEN 49 /* 0x31 Mark this point in input as start of */ +#define CLOSE 50 /* 0x32 Analogous to OPEN. */ +#define REF 51 /* 0x33 Match some already matched string */ +#define REFF 52 /* 0x34 Match already matched string, folded using native charset semantics for non-utf8 */ +#define REFFL 53 /* 0x35 Match already matched string, folded in loc. */ +#define REFFU 54 /* 0x36 Match already matched string, folded using unicode semantics for non-utf8 */ +#define REFFA 55 /* 0x37 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define NREF 56 /* 0x38 Match some already matched string */ +#define NREFF 57 /* 0x39 Match already matched string, folded using native charset semantics for non-utf8 */ +#define NREFFL 58 /* 0x3a Match already matched string, folded in loc. */ +#define NREFFU 59 /* 0x3b Match already matched string, folded using unicode semantics for non-utf8 */ +#define NREFFA 60 /* 0x3c Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define IFMATCH 61 /* 0x3d Succeeds if the following matches. */ +#define UNLESSM 62 /* 0x3e Fails if the following matches. */ +#define SUSPEND 63 /* 0x3f "Independent" sub-RE. */ +#define IFTHEN 64 /* 0x40 Switch, should be preceded by switcher . */ +#define GROUPP 65 /* 0x41 Whether the group matched. */ +#define LONGJMP 66 /* 0x42 Jump far away. */ +#define BRANCHJ 67 /* 0x43 BRANCH with long offset. */ +#define EVAL 68 /* 0x44 Execute some Perl code. */ +#define MINMOD 69 /* 0x45 Next operator is not greedy. 
*/ +#define LOGICAL 70 /* 0x46 Next opcode should set the flag only. */ +#define RENUM 71 /* 0x47 Group with independently numbered parens. */ +#define TRIE 72 /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */ +#define TRIEC 73 /* 0x49 Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 74 /* 0x4a Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 75 /* 0x4b Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 76 /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */ +#define GOSTART 77 /* 0x4d recurse to start of pattern */ +#define NGROUPP 78 /* 0x4e Whether the group matched. */ +#define INSUBP 79 /* 0x4f Whether we are in a specific recurse. */ +#define DEFINEP 80 /* 0x50 Never execute directly. */ +#define ENDLIKE 81 /* 0x51 Used only for the type field of verbs */ +#define OPFAIL 82 /* 0x52 Same as (?!) */ +#define ACCEPT 83 /* 0x53 Accepts the current matched string. */ +#define VERB 84 /* 0x54 Used only for the type field of verbs */ +#define PRUNE 85 /* 0x55 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 86 /* 0x56 Push the current location for rollback by cut. */ +#define SKIP 87 /* 0x57 On failure skip forward (to the mark) before retrying */ +#define COMMIT 88 /* 0x58 Pattern fails outright if backtracking through this */ +#define CUTGROUP 89 /* 0x59 On failure go to the next alternation in the group */ +#define KEEPS 90 /* 0x5a $& begins here. */ +#define LNBREAK 91 /* 0x5b generic newline pattern */ +#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */ +#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. 
*/ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -201,30 +173,6 @@ EXTCONST U8 PL_regkind[] = { REG_ANY, /* SANY */ REG_ANY, /* CANY */ ANYOF, /* ANYOF */ - ALNUM, /* ALNUM */ - ALNUM, /* ALNUML */ - ALNUM, /* ALNUMU */ - ALNUM, /* ALNUMA */ - NALNUM, /* NALNUM */ - NALNUM, /* NALNUML */ - NALNUM, /* NALNUMU */ - NALNUM, /* NALNUMA */ - SPACE, /* SPACE */ - SPACE, /* SPACEL */ - SPACE, /* SPACEU */ - SPACE, /* SPACEA */ - NSPACE, /* NSPACE */ - NSPACE, /* NSPACEL */ - NSPACE, /* NSPACEU */ - NSPACE, /* NSPACEA */ - DIGIT, /* DIGIT */ - DIGIT, /* DIGITL */ - NOTHING, /* PLACEHOLDER1 */ - DIGIT, /* DIGITA */ - NDIGIT, /* NDIGIT */ - NDIGIT, /* NDIGITL */ - NOTHING, /* PLACEHOLDER2 */ - NDIGIT, /* NDIGITA */ POSIXD, /* POSIXD */ POSIXD, /* POSIXL */ POSIXD, /* POSIXU */ @@ -295,10 +243,6 @@ EXTCONST U8 PL_regkind[] = { VERB, /* CUTGROUP */ KEEPS, /* KEEPS */ LNBREAK, /* LNBREAK */ - VERTWS, /* VERTWS */ - NVERTWS, /* NVERTWS */ - HORIZWS, /* HORIZWS */ - NHORIZWS, /* NHORIZWS */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -371,30 +315,6 @@ static const U8 regarglen[] = { 0, /* SANY */ 0, /* CANY */ 0, /* ANYOF */ - 0, /* ALNUM */ - 0, /* ALNUML */ - 0, /* ALNUMU */ - 0, /* ALNUMA */ - 0, /* NALNUM */ - 0, /* NALNUML */ - 0, /* NALNUMU */ - 0, /* NALNUMA */ - 0, /* SPACE */ - 0, /* SPACEL */ - 0, /* SPACEU */ - 0, /* SPACEA */ - 0, /* NSPACE */ - 0, /* NSPACEL */ - 0, /* NSPACEU */ - 0, /* NSPACEA */ - 0, /* DIGIT */ - 0, /* DIGITL */ - 0, /* PLACEHOLDER1 */ - 0, /* DIGITA */ - 0, /* NDIGIT */ - 0, /* NDIGITL */ - 0, /* PLACEHOLDER2 */ - 0, /* NDIGITA */ 0, /* POSIXD */ 0, /* POSIXL */ 0, /* POSIXU */ @@ -465,10 +385,6 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* CUTGROUP */ 0, /* KEEPS */ 0, /* LNBREAK */ - 0, /* VERTWS */ - 0, /* NVERTWS */ - 0, /* HORIZWS */ - 0, /* NHORIZWS */ 0, /* 
OPTIMIZED */ 0, /* PSEUDO */ }; @@ -498,30 +414,6 @@ static const char reg_off_by_arg[] = { 0, /* SANY */ 0, /* CANY */ 0, /* ANYOF */ - 0, /* ALNUM */ - 0, /* ALNUML */ - 0, /* ALNUMU */ - 0, /* ALNUMA */ - 0, /* NALNUM */ - 0, /* NALNUML */ - 0, /* NALNUMU */ - 0, /* NALNUMA */ - 0, /* SPACE */ - 0, /* SPACEL */ - 0, /* SPACEU */ - 0, /* SPACEA */ - 0, /* NSPACE */ - 0, /* NSPACEL */ - 0, /* NSPACEU */ - 0, /* NSPACEA */ - 0, /* DIGIT */ - 0, /* DIGITL */ - 0, /* PLACEHOLDER1 */ - 0, /* DIGITA */ - 0, /* NDIGIT */ - 0, /* NDIGITL */ - 0, /* PLACEHOLDER2 */ - 0, /* NDIGITA */ 0, /* POSIXD */ 0, /* POSIXL */ 0, /* POSIXU */ @@ -592,10 +484,6 @@ static const char reg_off_by_arg[] = { 0, /* CUTGROUP */ 0, /* KEEPS */ 0, /* LNBREAK */ - 0, /* VERTWS */ - 0, /* NVERTWS */ - 0, /* HORIZWS */ - 0, /* NHORIZWS */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -630,106 +518,78 @@ EXTCONST char * const PL_reg_name[] = { "SANY", /* 0x13 */ "CANY", /* 0x14 */ "ANYOF", /* 0x15 */ - "ALNUM", /* 0x16 */ - "ALNUML", /* 0x17 */ - "ALNUMU", /* 0x18 */ - "ALNUMA", /* 0x19 */ - "NALNUM", /* 0x1a */ - "NALNUML", /* 0x1b */ - "NALNUMU", /* 0x1c */ - "NALNUMA", /* 0x1d */ - "SPACE", /* 0x1e */ - "SPACEL", /* 0x1f */ - "SPACEU", /* 0x20 */ - "SPACEA", /* 0x21 */ - "NSPACE", /* 0x22 */ - "NSPACEL", /* 0x23 */ - "NSPACEU", /* 0x24 */ - "NSPACEA", /* 0x25 */ - "DIGIT", /* 0x26 */ - "DIGITL", /* 0x27 */ - "PLACEHOLDER1", /* 0x28 */ - "DIGITA", /* 0x29 */ - "NDIGIT", /* 0x2a */ - "NDIGITL", /* 0x2b */ - "PLACEHOLDER2", /* 0x2c */ - "NDIGITA", /* 0x2d */ - "POSIXD", /* 0x2e */ - "POSIXL", /* 0x2f */ - "POSIXU", /* 0x30 */ - "POSIXA", /* 0x31 */ - "NPOSIXD", /* 0x32 */ - "NPOSIXL", /* 0x33 */ - "NPOSIXU", /* 0x34 */ - "NPOSIXA", /* 0x35 */ - "CLUMP", /* 0x36 */ - "BRANCH", /* 0x37 */ - "BACK", /* 0x38 */ - "EXACT", /* 0x39 */ - "EXACTF", /* 0x3a */ - "EXACTFL", /* 0x3b */ - "EXACTFU", /* 0x3c */ - "EXACTFA", /* 0x3d */ - "EXACTFU_SS", /* 0x3e */ - "EXACTFU_TRICKYFOLD", /* 0x3f */ - "NOTHING", /* 0x40 
*/ - "TAIL", /* 0x41 */ - "STAR", /* 0x42 */ - "PLUS", /* 0x43 */ - "CURLY", /* 0x44 */ - "CURLYN", /* 0x45 */ - "CURLYM", /* 0x46 */ - "CURLYX", /* 0x47 */ - "WHILEM", /* 0x48 */ - "OPEN", /* 0x49 */ - "CLOSE", /* 0x4a */ - "REF", /* 0x4b */ - "REFF", /* 0x4c */ - "REFFL", /* 0x4d */ - "REFFU", /* 0x4e */ - "REFFA", /* 0x4f */ - "NREF", /* 0x50 */ - "NREFF", /* 0x51 */ - "NREFFL", /* 0x52 */ - "NREFFU", /* 0x53 */ - "NREFFA", /* 0x54 */ - "IFMATCH", /* 0x55 */ - "UNLESSM", /* 0x56 */ - "SUSPEND", /* 0x57 */ - "IFTHEN", /* 0x58 */ - "GROUPP", /* 0x59 */ - "LONGJMP", /* 0x5a */ - "BRANCHJ", /* 0x5b */ - "EVAL", /* 0x5c */ - "MINMOD", /* 0x5d */ - "LOGICAL", /* 0x5e */ - "RENUM", /* 0x5f */ - "TRIE", /* 0x60 */ - "TRIEC", /* 0x61 */ - "AHOCORASICK", /* 0x62 */ - "AHOCORASICKC", /* 0x63 */ - "GOSUB", /* 0x64 */ - "GOSTART", /* 0x65 */ - "NGROUPP", /* 0x66 */ - "INSUBP", /* 0x67 */ - "DEFINEP", /* 0x68 */ - "ENDLIKE", /* 0x69 */ - "OPFAIL", /* 0x6a */ - "ACCEPT", /* 0x6b */ - "VERB", /* 0x6c */ - "PRUNE", /* 0x6d */ - "MARKPOINT", /* 0x6e */ - "SKIP", /* 0x6f */ - "COMMIT", /* 0x70 */ - "CUTGROUP", /* 0x71 */ - "KEEPS", /* 0x72 */ - "LNBREAK", /* 0x73 */ - "VERTWS", /* 0x74 */ - "NVERTWS", /* 0x75 */ - "HORIZWS", /* 0x76 */ - "NHORIZWS", /* 0x77 */ - "OPTIMIZED", /* 0x78 */ - "PSEUDO", /* 0x79 */ + "POSIXD", /* 0x16 */ + "POSIXL", /* 0x17 */ + "POSIXU", /* 0x18 */ + "POSIXA", /* 0x19 */ + "NPOSIXD", /* 0x1a */ + "NPOSIXL", /* 0x1b */ + "NPOSIXU", /* 0x1c */ + "NPOSIXA", /* 0x1d */ + "CLUMP", /* 0x1e */ + "BRANCH", /* 0x1f */ + "BACK", /* 0x20 */ + "EXACT", /* 0x21 */ + "EXACTF", /* 0x22 */ + "EXACTFL", /* 0x23 */ + "EXACTFU", /* 0x24 */ + "EXACTFA", /* 0x25 */ + "EXACTFU_SS", /* 0x26 */ + "EXACTFU_TRICKYFOLD", /* 0x27 */ + "NOTHING", /* 0x28 */ + "TAIL", /* 0x29 */ + "STAR", /* 0x2a */ + "PLUS", /* 0x2b */ + "CURLY", /* 0x2c */ + "CURLYN", /* 0x2d */ + "CURLYM", /* 0x2e */ + "CURLYX", /* 0x2f */ + "WHILEM", /* 0x30 */ + "OPEN", /* 0x31 */ + "CLOSE", /* 0x32 */ + "REF", 
/* 0x33 */ + "REFF", /* 0x34 */ + "REFFL", /* 0x35 */ + "REFFU", /* 0x36 */ + "REFFA", /* 0x37 */ + "NREF", /* 0x38 */ + "NREFF", /* 0x39 */ + "NREFFL", /* 0x3a */ + "NREFFU", /* 0x3b */ + "NREFFA", /* 0x3c */ + "IFMATCH", /* 0x3d */ + "UNLESSM", /* 0x3e */ + "SUSPEND", /* 0x3f */ + "IFTHEN", /* 0x40 */ + "GROUPP", /* 0x41 */ + "LONGJMP", /* 0x42 */ + "BRANCHJ", /* 0x43 */ + "EVAL", /* 0x44 */ + "MINMOD", /* 0x45 */ + "LOGICAL", /* 0x46 */ + "RENUM", /* 0x47 */ + "TRIE", /* 0x48 */ + "TRIEC", /* 0x49 */ + "AHOCORASICK", /* 0x4a */ + "AHOCORASICKC", /* 0x4b */ + "GOSUB", /* 0x4c */ + "GOSTART", /* 0x4d */ + "NGROUPP", /* 0x4e */ + "INSUBP", /* 0x4f */ + "DEFINEP", /* 0x50 */ + "ENDLIKE", /* 0x51 */ + "OPFAIL", /* 0x52 */ + "ACCEPT", /* 0x53 */ + "VERB", /* 0x54 */ + "PRUNE", /* 0x55 */ + "MARKPOINT", /* 0x56 */ + "SKIP", /* 0x57 */ + "COMMIT", /* 0x58 */ + "CUTGROUP", /* 0x59 */ + "KEEPS", /* 0x5a */ + "LNBREAK", /* 0x5b */ + "OPTIMIZED", /* 0x5c */ + "PSEUDO", /* 0x5d */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -834,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -846,11 +706,8 @@ EXTCONST U8 PL_varies_bitmask[] = { EXTCONST U8 PL_simple[] __attribute__deprecated__; #else EXTCONST U8 PL_simple[] __attribute__deprecated__ = { - REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, ALNUMU, ALNUMA, NALNUM, - NALNUML, NALNUMU, NALNUMA, SPACE, SPACEL, SPACEU, SPACEA, NSPACE, - NSPACEL, NSPACEU, NSPACEA, DIGIT, DIGITL, DIGITA, NDIGIT, NDIGITL, - NDIGITA, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD, NPOSIXL, NPOSIXU, - NPOSIXA, VERTWS, NVERTWS, HORIZWS, NHORIZWS, + REG_ANY, SANY, CANY, ANYOF, POSIXD, POSIXL, POSIXU, POSIXA, 
NPOSIXD, + NPOSIXL, NPOSIXU, NPOSIXA, 0 }; #endif /* DOINIT */ @@ -859,7 +716,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0x00, 0xFC, 0xFF, 0xFF, 0xEE, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00 + 0x00, 0x00, 0xFC, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ |