diff options
-rw-r--r-- | regcomp.c | 40 | ||||
-rw-r--r-- | regexec.c | 52 | ||||
-rw-r--r-- | t/re/re_tests | 4 |
3 files changed, 95 insertions, 1 deletion
Summary of this patch (text outside the hunks is ignored by patch tools):
  regcomp.c  - for a single-element bracketed class that is a generic POSIX
               class, emit a POSIXA/NPOSIXA node when
               AT_LEAST_ASCII_RESTRICTED holds (or for ANYOF_ASCII when not
               under LOC), storing the class number in the node's FLAGS;
               teach Perl_regprop to print nodes of kind POSIXD.
  regexec.c  - match POSIXA/NPOSIXA in S_find_byclass, S_regmatch and
               S_regrepeat using _generic_isCC_A() with the node's FLAGS.
  re_tests   - add tests that /a and /aa do not affect \p{Any}.
Review note: the bounds check in the Perl_regprop hunk uses '>=' so that an
index equal to the element count of anyofs[] is rejected instead of being
used to read one element past the end of the array.

diff --git a/regcomp.c b/regcomp.c
--- a/regcomp.c
+++ b/regcomp.c
@@ -11124,6 +11124,10 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
  * changed since initialization, then there is a run-time definition. */
 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
 
+/* This converts the named class defined in regcomp.h to its equivalent class
+ * number defined in handy.h. */
+#define namedclass_to_classnum(class) ((class) / 2)
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or perhaps will be optimized into an EXACTish node
@@ -11865,6 +11869,7 @@ parseit:
      * Check if this is the case for this class */
     if (element_count == 1) {
         U8 op = END;
+        U8 arg = 0;
 
         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
                                               [:digit:] or \p{foo} */
@@ -11942,7 +11947,26 @@ parseit:
                     op = (invert) ? NVERTWS : VERTWS;
                     break;
 
+                case ANYOF_MAX:
+                    break;
 
+                default:
+                    /* A generic posix class. All the /a ones can be handled
+                     * by the POSIXA opcode. And all are closed under folding
+                     * in the ASCII range, so FOLD doesn't matter */
+                    if (AT_LEAST_ASCII_RESTRICTED
+                        || (! LOC && namedclass == ANYOF_ASCII))
+                    {
+                        /* The odd numbered ones are the complements of the
+                         * next-lower even number one */
+                        if (namedclass % 2 == 1) {
+                            invert = ! invert;
+                            namedclass--;
+                        }
+                        arg = namedclass_to_classnum(namedclass);
+                        op = (invert) ? NPOSIXA : POSIXA;
+                    }
+                    break;
             }
         }
         else if (value == prevvalue) {
@@ -11994,7 +12018,12 @@ parseit:
 
         ret = reg_node(pRExC_state, op);
 
-        if (PL_regkind[op] == EXACT) {
+        if (PL_regkind[op] == POSIXD) {
+            if (! SIZE_ONLY) {
+                FLAGS(ret) = arg;
+            }
+        }
+        else if (PL_regkind[op] == EXACT) {
             alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
         }
 
@@ -13543,6 +13572,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
     }
+    else if (k == POSIXD) {
+        U8 index = FLAGS(o) * 2;
+        if (index >= (sizeof(anyofs) / sizeof(anyofs[0]))) { /* valid indices are 0 .. count-1 */
+            Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+        }
+        else {
+            sv_catpv(sv, anyofs[index]);
+        }
+    }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
 #else
diff --git a/regexec.c b/regexec.c
--- a/regexec.c
+++ b/regexec.c
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 !is_HORIZWS_latin1(s)
         );
         break;
+    case POSIXA:
+        /* Don't need to worry about utf8, as it can match only a single
+         * byte invariant character. The flag in this node type is the
+         * class number to pass to _generic_isCC_A() to build a mask for
+         * searching in PL_charclass[] */
+        REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+        break;
+    case NPOSIXA:
+        REXEC_FBC_CSCAN(
+            !_generic_isCC_A(*s, FLAGS(c)),
+            !_generic_isCC_A(*s, FLAGS(c))
+        );
+        break;
+
     case AHOCORASICKC:
     case AHOCORASICK:
         {
@@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 DIGITA, NDIGITA, isDIGIT_A,
                 digit, "0");
 
+        case POSIXA:
+            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            /* Matched a utf8-invariant, so don't have to worry about utf8 */
+            nextchr = UCHARAT(++locinput);
+            break;
+        case NPOSIXA:
+            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            if (utf8_target) {
+                locinput += PL_utf8skip[nextchr];
+                nextchr = UCHARAT(locinput);
+            }
+            else {
+                nextchr = UCHARAT(++locinput);
+            }
+            break;
+
         case CLUMP: /* Match \X: logical Unicode character. This is defined as
                        a Unicode extended Grapheme Cluster */
             /* From http://www.unicode.org/reports/tr29 (5.2 version). An
@@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+
+    case POSIXA:
+        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+            scan++;
+        }
+        break;
+    case NPOSIXA:
+        if (utf8_target) {
+            while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+                scan += UTF8SKIP(scan);
+            }
+        }
+        else {
+            while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+                scan++;
+            }
+        }
+        break;
     case NALNUMA:
         if (utf8_target) {
             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
diff --git a/t/re/re_tests b/t/re/re_tests
index 46332b4d62..3d281555c9 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1692,4 +1692,8 @@ ab[c\\\](??{"x"})]{3}d	ab\\](d	y	-	-
 [^\n]+	\nb	y	$&	b
 [^\n]+	a\n	y	$&	a
 
+# /a has no effect on properties
+(?a:\p{Any})	\x{100}	y	$&	\x{100}
+(?aa:\p{Any})	\x{100}	y	$&	\x{100}
+
 # vim: softtabstop=0 noexpandtab