summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-10-06 14:57:38 -0600
committerKarl Williamson <public@khwilliamson.com>2012-10-09 11:16:05 -0600
commitb40a2c17551b484a78122be98db5dc06bb4614d5 (patch)
tree52a59ff7f0c32df215790cd516e29dd5ea2f2aaf /regexec.c
parent51e68360f9d9caf2a439cb6a3ca38dcf1ad0e35d (diff)
downloadperl-b40a2c17551b484a78122be98db5dc06bb4614d5.tar.gz
regex: Allow any single char to be SIMPLE
This commit relaxes the previous requirement that an EXACTish node must contain a single Latin1-range character in order to be considered SIMPLE. Now it allows any single character, not just Latin1. This allows above-Unicode characters to be in optimizations like STAR or CURLY, instead of having to match with the more complex CURLYM; and it brings EXACTish nodes in alignment with other SIMPLE nodes, such as those matching \w or the dot metacharacter, which all along have supported any code point being SIMPLE
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c45
1 files changed, 26 insertions, 19 deletions
diff --git a/regexec.c b/regexec.c
index f2833ac901..05a96acbd0 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5507,21 +5507,21 @@ NULL
} \
}
- case STAR: /* /A*B/ where A is width 1 */
+ case STAR: /* /A*B/ where A is width 1 char */
ST.paren = 0;
ST.min = 0;
ST.max = REG_INFTY;
scan = NEXTOPER(scan);
goto repeat;
- case PLUS: /* /A+B/ where A is width 1 */
+ case PLUS: /* /A+B/ where A is width 1 char */
ST.paren = 0;
ST.min = 1;
ST.max = REG_INFTY;
scan = NEXTOPER(scan);
goto repeat;
- case CURLYN: /* /(A){m,n}B/ where A is width 1 */
+ case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
ST.paren = scan->flags; /* Which paren to set */
ST.lastparen = rex->lastparen;
ST.lastcloseparen = rex->lastcloseparen;
@@ -5537,7 +5537,7 @@ NULL
scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
goto repeat;
- case CURLY: /* /A{m,n}B/ where A is width 1 */
+ case CURLY: /* /A{m,n}B/ where A is width 1 char */
ST.paren = 0;
ST.min = ARG1(scan); /* min to match */
ST.max = ARG2(scan); /* max to match */
@@ -6344,25 +6344,30 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
scan = loceol;
break;
case EXACT:
- /* To get here, EXACTish nodes must have *byte* length == 1. That
- * means they match only characters in the string that can be expressed
- * as a single byte. For non-utf8 strings, that means a simple match.
- * For utf8 strings, the character matched must be an invariant, or
- * downgradable to a single byte. The pattern's utf8ness is
- * irrelevant, as since it's a single byte, it either isn't utf8, or if
- * it is, it's an invariant */
-
c = (U8)*STRING(p);
- assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
if (! utf8_target || UNI_IS_INVARIANT(c)) {
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
}
+ else if (UTF_PATTERN) {
+ STRLEN scan_char_len;
+
+ loceol = PL_regeol;
+
+ while (hardcount < max
+ && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
+ && scan_char_len <= STR_LEN(p)
+ && memEQ(scan, STRING(p), scan_char_len))
+ {
+ scan += scan_char_len;
+ hardcount++;
+ }
+ }
else {
- /* Here, the string is utf8, and the pattern char is different
+ /* Here, the string is utf8, the pattern isn't, but <c> is different
* in utf8 than not, so can't compare them directly. Outside the
* loop, find the two utf8 bytes that represent c, and then
* look for those in sequence in the utf8 string */
@@ -6398,17 +6403,19 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
case EXACTFU:
utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
- /* The comments for the EXACT case above apply as well to these fold
- * ones */
-
do_exactf:
c = (U8)*STRING(p);
- if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
+ if (utf8_target
+ || OP(p) == EXACTFU_SS
+ || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
+ {
+ /* Use full Unicode fold matching */
char *tmpeol = loceol;
+ STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
while (hardcount < max
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
- STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
+ STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
{
scan = tmpeol;
tmpeol = loceol;