summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- regcomp.c 40
-rw-r--r-- regexec.c 52
-rw-r--r-- t/re/re_tests 4
3 files changed, 95 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 5a87e9cc61..8f4884499e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -11124,6 +11124,10 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
* changed since initialization, then there is a run-time definition. */
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
+/* Converts a named class (regcomp.h) to its equivalent class number (handy.h).
+ * An even-numbered class and its complement (the next odd number) share one. */
+#define namedclass_to_classnum(class) ((class) / 2)
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or perhaps will be optimized into an EXACTish node
@@ -11865,6 +11869,7 @@ parseit:
* Check if this is the case for this class */
if (element_count == 1) {
U8 op = END;
+ U8 arg = 0;
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
[:digit:] or \p{foo} */
@@ -11942,7 +11947,26 @@ parseit:
op = (invert) ? NVERTWS : VERTWS;
break;
+ case ANYOF_MAX: /* NOTE(review): presumably the namedclass used for \p{...} properties -- confirm */
+ break;
+ default:
+ /* A generic posix class. All the /a ones can be handled
+ * by the POSIXA opcode. And all are closed under folding
+ * in the ASCII range, so FOLD doesn't matter */
+ if (AT_LEAST_ASCII_RESTRICTED
+ || (! LOC && namedclass == ANYOF_ASCII))
+ {
+ /* The odd numbered ones are the complements of the next-lower even
+ * numbered ones: fold that into 'invert' and use the even one */
+ if (namedclass % 2 == 1) {
+ invert = ! invert;
+ namedclass--;
+ }
+ arg = namedclass_to_classnum(namedclass); /* class number, later stored in the node's flags */
+ op = (invert) ? NPOSIXA : POSIXA;
+ }
+ break; /* if the test above failed, op and arg are left as they were */
}
}
else if (value == prevvalue) {
@@ -11994,7 +12018,12 @@ parseit:
ret = reg_node(pRExC_state, op);
- if (PL_regkind[op] == EXACT) {
+ if (PL_regkind[op] == POSIXD) { /* the POSIXA/NPOSIXA ops chosen above carry their class in the flags */
+ if (! SIZE_ONLY) { /* NOTE(review): presumably the node must not be written during the sizing pass -- confirm */
+ FLAGS(ret) = arg; /* record the class number for the matcher to read back */
+ }
+ }
+ else if (PL_regkind[op] == EXACT) {
alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
}
@@ -13543,6 +13572,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
+ else if (k == POSIXD) { /* node whose flags hold a class number: append that class's name */
+ U8 index = FLAGS(o) * 2; /* undo namedclass_to_classnum(): class number * 2 is the anyofs[] slot */
+ if (index >= (sizeof(anyofs) / sizeof(anyofs[0]))) { /* valid slots are 0 .. count-1, so index == count is out of bounds too */
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
+ else {
+ sv_catpv(sv, anyofs[index]);
+ }
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
#else
diff --git a/regexec.c b/regexec.c
index af64a69f31..dca278b406 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
!is_HORIZWS_latin1(s)
);
break;
+ case POSIXA:
+ /* Don't need to worry about utf8, as it can match only a single
+ * byte invariant character. The flag in this node type is the
+ * class number to pass to _generic_isCC_A() to build a mask for
+ * searching in PL_charclass[] */
+ REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+ break;
+ case NPOSIXA: /* complement of POSIXA: the same class test, negated */
+ REXEC_FBC_CSCAN(
+ !_generic_isCC_A(*s, FLAGS(c)), /* both arguments are the identical negated test */
+ !_generic_isCC_A(*s, FLAGS(c))
+ );
+ break;
+
case AHOCORASICKC:
case AHOCORASICK:
{
@@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
DIGITA, NDIGITA, isDIGIT_A,
digit, "0");
+ case POSIXA: /* match one character that is in the class numbered FLAGS(scan) */
+ if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+ sayNO; /* at end of input, or the character is not in the class */
+ }
+ /* Matched a utf8-invariant, so don't have to worry about utf8 */
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPOSIXA: /* match one character that is NOT in that class */
+ if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+ sayNO; /* at end of input, or the character is in the class */
+ }
+ if (utf8_target) {
+ locinput += PL_utf8skip[nextchr]; /* a non-member may be multi-byte: advance by the length its first byte implies */
+ nextchr = UCHARAT(locinput);
+ }
+ else {
+ nextchr = UCHARAT(++locinput);
+ }
+ break;
+
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
/* From http://www.unicode.org/reports/tr29 (5.2 version). An
@@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
scan++;
}
break;
+
+ case POSIXA: /* advance scan over characters in the class numbered FLAGS(p) */
+ while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan++; /* one byte per match, with no separate utf8 path for this op */
+ }
+ break;
+ case NPOSIXA: /* advance scan over characters NOT in that class */
+ if (utf8_target) {
+ while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan += UTF8SKIP(scan); /* step over the whole, possibly multi-byte, character */
+ }
+ }
+ else {
+ while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan++;
+ }
+ }
+ break;
case NALNUMA:
if (utf8_target) {
while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
diff --git a/t/re/re_tests b/t/re/re_tests
index 46332b4d62..3d281555c9 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1692,4 +1692,8 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - -
[^\n]+ \nb y $& b
[^\n]+ a\n y $& a
+# /a has no effect on properties
+(?a:\p{Any}) \x{100} y $& \x{100}
+(?aa:\p{Any}) \x{100} y $& \x{100}
+
# vim: softtabstop=0 noexpandtab