summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c673
1 files changed, 477 insertions, 196 deletions
diff --git a/regcomp.c b/regcomp.c
index 59fe5a7d9f..3569b3bbf1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -163,6 +163,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define LOC (PL_regflags & PMf_LOCALE)
#define FOLD (PL_regflags & PMf_FOLD)
+#define OOB_CHAR8 1234
+#define OOB_UTF8 123456
+
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
@@ -2093,12 +2096,17 @@ S_regwhite(pTHX_ char *p, char *e)
return p;
}
-/* parse POSIX character classes like [[:foo:]] */
-STATIC char*
+/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
+ Character classes ([:foo:]) can also be negated ([:^foo:]).
+ Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
+ Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
+ but trigger warnings because they are currently unimplemented. */
+STATIC I32
S_regpposixcc(pTHX_ I32 value)
{
dTHR;
char *posixcc = 0;
+ I32 namedclass = -1;
if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
@@ -2114,26 +2122,120 @@ S_regpposixcc(pTHX_ I32 value)
/* Grandfather lone [:, [=, [. */
PL_regcomp_parse = s;
else {
- PL_regcomp_parse++; /* skip over the c */
- if (*PL_regcomp_parse == ']') {
- /* Not Implemented Yet.
- * (POSIX Extended Character Classes, that is)
- * The text between e.g. [: and :] would start
- * at s + 1 and stop at regcomp_parse - 2. */
- if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+ char* t = PL_regcomp_parse++; /* skip over the c */
+
+ if (*PL_regcomp_parse == ']') {
+ PL_regcomp_parse++; /* skip over the ending ] */
+ posixcc = s + 1;
+ if (*s == ':') {
+ I32 complement = *posixcc == '^' ? *posixcc++ : 0;
+ I32 skip = 5; /* the most common skip */
+
+ switch (*posixcc) {
+ case 'a':
+ if (strnEQ(posixcc, "alnum", 5))
+ namedclass =
+ complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
+ else if (strnEQ(posixcc, "alpha", 5))
+ namedclass =
+ complement ? ANYOF_NALPHA : ANYOF_ALPHA;
+ else if (strnEQ(posixcc, "ascii", 5))
+ namedclass =
+ complement ? ANYOF_NASCII : ANYOF_ASCII;
+ break;
+ case 'c':
+ if (strnEQ(posixcc, "cntrl", 5))
+ namedclass =
+ complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
+ break;
+ case 'd':
+ if (strnEQ(posixcc, "digit", 5))
+ namedclass =
+ complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
+ break;
+ case 'g':
+ if (strnEQ(posixcc, "graph", 5))
+ namedclass =
+ complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
+ break;
+ case 'l':
+ if (strnEQ(posixcc, "lower", 5))
+ namedclass =
+ complement ? ANYOF_NLOWER : ANYOF_LOWER;
+ break;
+ case 'p':
+ if (strnEQ(posixcc, "print", 5))
+ namedclass =
+ complement ? ANYOF_NPRINT : ANYOF_PRINT;
+ else if (strnEQ(posixcc, "punct", 5))
+ namedclass =
+ complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
+ break;
+ case 's':
+ if (strnEQ(posixcc, "space", 5))
+ namedclass =
+ complement ? ANYOF_NSPACE : ANYOF_SPACE;
+ case 'u':
+ if (strnEQ(posixcc, "upper", 5))
+ namedclass =
+ complement ? ANYOF_NUPPER : ANYOF_UPPER;
+ break;
+ case 'w': /* this is not POSIX, this is the Perl \w */
+ if (strnEQ(posixcc, "word", 4)) {
+ namedclass =
+ complement ? ANYOF_NALNUM : ANYOF_ALNUM;
+ skip = 4;
+ }
+ break;
+ case 'x':
+ if (strnEQ(posixcc, "xdigit", 6)) {
+ namedclass =
+ complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
+ skip = 6;
+ }
+ break;
+ }
+ if ((namedclass == -1 ||
+ !(posixcc + skip + 2 < PL_regxend &&
+ (posixcc[skip] == ':' &&
+ posixcc[skip + 1] == ']'))))
+ Perl_croak(aTHX_ "Character class [:%.*s:] unknown",
+ t - s - 1, s + 1);
+ } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+ /* [[=foo=]] and [[.foo.]] are still future. */
Perl_warner(aTHX_ WARN_UNSAFE,
- "Character class syntax [%c %c] is reserved for future extensions", c, c);
- PL_regcomp_parse++; /* skip over the ending ] */
- posixcc = s + 1;
- }
- else {
- /* maternal grandfather */
+ "Character class syntax [%c %c] is reserved for future extensions", c, c);
+ } else {
+ /* Maternal grandfather:
+ * "[:" ending in ":" but not in ":]" */
PL_regcomp_parse = s;
}
}
}
- return posixcc;
+ return namedclass;
+}
+
+STATIC void
+S_checkposixcc(pTHX)
+{
+ if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY &&
+ (*PL_regcomp_parse == ':' ||
+ *PL_regcomp_parse == '=' ||
+ *PL_regcomp_parse == '.')) {
+ char *s = PL_regcomp_parse;
+ char c = *s++;
+
+ while(*s && isALNUM(*s))
+ s++;
+ if (*s && c == *s && s[1] == ']') {
+ Perl_warner(aTHX_ WARN_UNSAFE,
+ "Character class syntax [%c %c] belongs inside character classes", c, c);
+ if (c == '=' || c == '.')
+ Perl_warner(aTHX_ WARN_UNSAFE,
+ "Character class syntax [%c %c] is reserved for future extensions", c, c);
+ }
+ }
}
STATIC regnode *
@@ -2142,142 +2244,319 @@ S_regclass(pTHX)
dTHR;
register char *opnd, *s;
register I32 value;
- register I32 lastvalue = 1234;
+ register I32 lastvalue = OOB_CHAR8;
register I32 range = 0;
register regnode *ret;
register I32 def;
I32 numlen;
+ I32 namedclass;
s = opnd = (char *) OPERAND(PL_regcode);
ret = reg_node(ANYOF);
- for (value = 0; value < 33; value++)
+ for (value = 0; value < ANYOF_SIZE; value++)
regc(0, s++);
if (*PL_regcomp_parse == '^') { /* Complement of range. */
PL_regnaughty++;
PL_regcomp_parse++;
if (!SIZE_ONLY)
- *opnd |= ANYOF_INVERT;
+ ANYOF_FLAGS(opnd) |= ANYOF_INVERT;
}
if (!SIZE_ONLY) {
PL_regcode += ANY_SKIP;
if (FOLD)
- *opnd |= ANYOF_FOLD;
+ ANYOF_FLAGS(opnd) |= ANYOF_FOLD;
if (LOC)
- *opnd |= ANYOF_LOCALE;
+ ANYOF_FLAGS(opnd) |= ANYOF_LOCALE;
}
else {
PL_regsize += ANY_SKIP;
}
+
+ checkposixcc();
+
if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
+ namedclass = -1;
value = UCHARAT(PL_regcomp_parse++);
if (value == '[')
- (void)regpposixcc(value); /* ignore the return value for now */
+ namedclass = regpposixcc(value);
else if (value == '\\') {
value = UCHARAT(PL_regcomp_parse++);
switch (value) {
- case 'w':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_ALNUML;
- else {
- for (value = 0; value < 256; value++)
- if (isALNUM(value))
- ANYOF_SET(opnd, value);
- }
+ case 'w': namedclass = ANYOF_ALNUM; break;
+ case 'W': namedclass = ANYOF_NALNUM; break;
+ case 's': namedclass = ANYOF_SPACE; break;
+ case 'S': namedclass = ANYOF_NSPACE; break;
+ case 'd': namedclass = ANYOF_DIGIT; break;
+ case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'n': value = '\n'; break;
+ case 'r': value = '\r'; break;
+ case 't': value = '\t'; break;
+ case 'f': value = '\f'; break;
+ case 'b': value = '\b'; break;
+ case 'e': value = '\033'; break;
+ case 'a': value = '\007'; break;
+ case 'x':
+ value = scan_hex(PL_regcomp_parse, 2, &numlen);
+ PL_regcomp_parse += numlen;
+ break;
+ case 'c':
+ value = UCHARAT(PL_regcomp_parse++);
+ value = toCTRL(value);
+ break;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ value = scan_oct(--PL_regcomp_parse, 3, &numlen);
+ PL_regcomp_parse += numlen;
+ break;
+ }
+ }
+ if (!SIZE_ONLY && namedclass > -1) {
+ switch (namedclass) {
+ case ANYOF_ALNUM:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALNUM);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALNUM(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'W':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_NALNUML;
- else {
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_NALNUM:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALNUM);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALNUM(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 's':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_SPACEL;
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'S':
- if (!SIZE_ONLY) {
- if (LOC)
- *opnd |= ANYOF_NSPACEL;
- else {
- for (value = 0; value < 256; value++)
- if (!isSPACE(value))
- ANYOF_SET(opnd, value);
- }
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'd':
- if (!SIZE_ONLY) {
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_DIGIT);
+ else {
for (value = '0'; value <= '9'; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'D':
- if (!SIZE_ONLY) {
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NDIGIT);
+ else {
for (value = 0; value < '0'; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
for (value = '9' + 1; value < 256; value++)
- ANYOF_SET(opnd, value);
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NALNUMC:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALNUMC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALNUMC(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_ALNUMC:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALNUMC);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALNUMC(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_ALPHA:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ALPHA);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isALPHA(value))
+ ANYOF_BITMAP_SET(opnd, value);
}
- lastvalue = 1234;
- continue;
- case 'n':
- value = '\n';
break;
- case 'r':
- value = '\r';
+ case ANYOF_NALPHA:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NALPHA);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isALPHA(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 't':
- value = '\t';
+ case ANYOF_ASCII:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_ASCII);
+ else {
+ for (value = 0; value < 128; value++)
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'f':
- value = '\f';
+ case ANYOF_NASCII:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NASCII);
+ else {
+ for (value = 128; value < 256; value++)
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'b':
- value = '\b';
+ case ANYOF_CNTRL:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_CNTRL);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isCNTRL(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ lastvalue = OOB_CHAR8;
break;
- case 'e':
- value = '\033';
+ case ANYOF_NCNTRL:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NCNTRL);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isCNTRL(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'a':
- value = '\007';
+ case ANYOF_GRAPH:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_GRAPH);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isGRAPH(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'x':
- value = scan_hex(PL_regcomp_parse, 2, &numlen);
- PL_regcomp_parse += numlen;
+ case ANYOF_NGRAPH:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NGRAPH);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isGRAPH(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case 'c':
- value = UCHARAT(PL_regcomp_parse++);
- value = toCTRL(value);
+ case ANYOF_LOWER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_LOWER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isLOWER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- value = scan_oct(--PL_regcomp_parse, 3, &numlen);
- PL_regcomp_parse += numlen;
+ case ANYOF_NLOWER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NLOWER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isLOWER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_PRINT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_PRINT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isPRINT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NPRINT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NPRINT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isPRINT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_PUNCT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_PUNCT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isPUNCT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NPUNCT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NPUNCT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isPUNCT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_UPPER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_UPPER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isUPPER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NUPPER:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NUPPER);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isUPPER(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_XDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_XDIGIT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isXDIGIT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ case ANYOF_NXDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(opnd, ANYOF_NXDIGIT);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isXDIGIT(value))
+ ANYOF_BITMAP_SET(opnd, value);
+ }
+ break;
+ default:
+ FAIL("invalid [::] class in regexp");
break;
}
+ if (LOC)
+ ANYOF_FLAGS(opnd) |= ANYOF_CLASS;
+ lastvalue = OOB_CHAR8;
}
+ else
if (range) {
if (lastvalue > value)
FAIL("invalid [] range in regexp");
@@ -2301,35 +2580,36 @@ S_regclass(pTHX)
if (isLOWER(lastvalue)) {
for (i = lastvalue; i <= value; i++)
if (isLOWER(i))
- ANYOF_SET(opnd, i);
+ ANYOF_BITMAP_SET(opnd, i);
} else {
for (i = lastvalue; i <= value; i++)
if (isUPPER(i))
- ANYOF_SET(opnd, i);
+ ANYOF_BITMAP_SET(opnd, i);
}
}
else
#endif
for ( ; lastvalue <= value; lastvalue++)
- ANYOF_SET(opnd, lastvalue);
+ ANYOF_BITMAP_SET(opnd, lastvalue);
}
lastvalue = value;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
- if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ if (!SIZE_ONLY &&
+ (ANYOF_FLAGS(opnd) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
for (value = 0; value < 256; ++value) {
- if (ANYOF_TEST(opnd, value)) {
+ if (ANYOF_BITMAP_TEST(opnd, value)) {
I32 cf = PL_fold[value];
- ANYOF_SET(opnd, cf);
+ ANYOF_BITMAP_SET(opnd, cf);
}
}
- *opnd &= ~ANYOF_FOLD;
+ ANYOF_FLAGS(opnd) &= ~ANYOF_FOLD;
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) {
- for (value = 0; value < 32; ++value)
- opnd[1 + value] ^= 0xFF;
- *opnd = 0;
+ if (!SIZE_ONLY && (ANYOF_FLAGS(opnd) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+ opnd[ANYOF_BITMAP_OFFSET + value] ^= ANYOF_FLAGS_ALL;
+ ANYOF_FLAGS(opnd) = 0;
}
return ret;
}
@@ -2337,16 +2617,17 @@ S_regclass(pTHX)
STATIC regnode *
S_regclassutf8(pTHX)
{
+ dTHR;
register char *opnd, *e;
register U32 value;
- register U32 lastvalue = 123456;
+ register U32 lastvalue = OOB_UTF8;
register I32 range = 0;
register regnode *ret;
I32 numlen;
I32 n;
SV *listsv;
U8 flags = 0;
- dTHR;
+ I32 namedclass;
if (*PL_regcomp_parse == '^') { /* Complement of range. */
PL_regnaughty++;
@@ -2362,75 +2643,29 @@ S_regclassutf8(pTHX)
listsv = newSVpvn("# comment\n",10);
}
+ checkposixcc();
+
if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
+ namedclass = -1;
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
if (value == '[')
- (void)regpposixcc(value); /* ignore the return value for now */
+ namedclass = regpposixcc(value);
else if (value == '\\') {
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
switch (value) {
- case 'w':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_ALNUML;
-
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
- }
- lastvalue = 123456;
- continue;
- case 'W':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_NALNUML;
-
- Perl_sv_catpvf(aTHX_ listsv,
- "-utf8::IsAlpha\n-utf8::IsDigit\n0000\t%04x\n%04x\tffff\n",
- '_' - 1,
- '_' + 1);
- }
- lastvalue = 123456;
- continue;
- case 's':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_SPACEL;
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
- if (!PL_utf8_space)
- is_utf8_space((U8*)" ");
- }
- lastvalue = 123456;
- continue;
- case 'S':
- if (!SIZE_ONLY) {
- if (LOC)
- flags |= ANYOF_NSPACEL;
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::IsSpace\n");
- if (!PL_utf8_space)
- is_utf8_space((U8*)" ");
- }
- lastvalue = 123456;
- continue;
- case 'd':
- if (!SIZE_ONLY) {
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
- }
- lastvalue = 123456;
- continue;
- case 'D':
- if (!SIZE_ONLY) {
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::IsDigit\n");
- }
- lastvalue = 123456;
- continue;
+ case 'w': namedclass = ANYOF_ALNUM; break;
+ case 'W': namedclass = ANYOF_NALNUM; break;
+ case 's': namedclass = ANYOF_SPACE; break;
+ case 'S': namedclass = ANYOF_NSPACE; break;
+ case 'd': namedclass = ANYOF_DIGIT; break;
+ case 'D': namedclass = ANYOF_NDIGIT; break;
case 'p':
case 'P':
if (*PL_regcomp_parse == '{') {
@@ -2445,41 +2680,30 @@ S_regclassutf8(pTHX)
}
if (!SIZE_ONLY) {
if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::%.*s\n", n, PL_regcomp_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", n, PL_regcomp_parse);
else
Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", n, PL_regcomp_parse);
+ "!utf8::%.*s\n", n, PL_regcomp_parse);
}
PL_regcomp_parse = e + 1;
- lastvalue = 123456;
+ lastvalue = OOB_UTF8;
continue;
- case 'n':
- value = '\n';
- break;
- case 'r':
- value = '\r';
- break;
- case 't':
- value = '\t';
- break;
- case 'f':
- value = '\f';
- break;
- case 'b':
- value = '\b';
- break;
- case 'e':
- value = '\033';
- break;
- case 'a':
- value = '\007';
- break;
+ case 'n': value = '\n'; break;
+ case 'r': value = '\r'; break;
+ case 't': value = '\t'; break;
+ case 'f': value = '\f'; break;
+ case 'b': value = '\b'; break;
+ case 'e': value = '\033'; break;
+ case 'a': value = '\007'; break;
case 'x':
if (*PL_regcomp_parse == '{') {
e = strchr(PL_regcomp_parse++, '}');
if (!e)
FAIL("Missing right brace on \\x{}");
- value = scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen);
+ value = scan_hex(PL_regcomp_parse,
+ e - PL_regcomp_parse,
+ &numlen);
PL_regcomp_parse = e + 1;
}
else {
@@ -2498,7 +2722,64 @@ S_regclassutf8(pTHX)
break;
}
}
- if (range) {
+ if (!SIZE_ONLY && namedclass > -1) {
+ switch (namedclass) {
+ case ANYOF_ALNUM:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
+ case ANYOF_NALNUM:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
+ case ANYOF_ALNUMC:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
+ case ANYOF_NALNUMC:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
+ case ANYOF_ALPHA:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
+ case ANYOF_NALPHA:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
+ case ANYOF_ASCII:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
+ case ANYOF_NASCII:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
+ case ANYOF_CNTRL:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
+ case ANYOF_NCNTRL:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
+ case ANYOF_GRAPH:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
+ case ANYOF_NGRAPH:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
+ case ANYOF_DIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
+ case ANYOF_NDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
+ case ANYOF_LOWER:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
+ case ANYOF_NLOWER:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
+ case ANYOF_PRINT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
+ case ANYOF_NPRINT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
+ case ANYOF_PUNCT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
+ case ANYOF_NPUNCT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
+ case ANYOF_SPACE:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
+ case ANYOF_NSPACE:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
+ case ANYOF_UPPER:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
+ case ANYOF_NUPPER:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
+ case ANYOF_XDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
+ case ANYOF_NXDIGIT:
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
+ }
+ }
+ else
+ if (range) {
if (lastvalue > value)
FAIL("invalid [] range in regexp");
if (!SIZE_ONLY)