regex: Allow any single char to be SIMPLE

This commit relaxes the previous requirement that an EXACTish node must contain a single Latin1-range character in order to be considered SIMPLE. Now it allows any single character, not just Latin1. This allows above-Latin1 characters to be in optimizations like STAR or CURLY, instead of having to match with the more complex CURLYM; and it brings EXACTish nodes in alignment with other SIMPLE nodes, such as those matching \w or the dot metacharacter, which all along have supported any code point being SIMPLE.
author: Karl Williamson <public@khwilliamson.com> 2012-10-06 14:57:38 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-10-09 11:16:05 -0600
commit: b40a2c17551b484a78122be98db5dc06bb4614d5 (patch)
tree: 52a59ff7f0c32df215790cd516e29dd5ea2f2aaf /regexec.c
parent: 51e68360f9d9caf2a439cb6a3ca38dcf1ad0e35d (diff)
download: perl-b40a2c17551b484a78122be98db5dc06bb4614d5.tar.gz
1 files changed, 26 insertions, 19 deletions
diff --git a/regexec.c b/regexec.c
index f2833ac901..05a96acbd0 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5507,21 +5507,21 @@ NULL
 	} \
     }
 
-	case STAR:		/*  /A*B/ where A is width 1 */
+        case STAR:		/*  /A*B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = 0;
 	    ST.max = REG_INFTY;
 	    scan = NEXTOPER(scan);
 	    goto repeat;
 
-	case PLUS:		/*  /A+B/ where A is width 1 */
+        case PLUS:		/*  /A+B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = 1;
 	    ST.max = REG_INFTY;
 	    scan = NEXTOPER(scan);
 	    goto repeat;
 
-	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 */
+	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
 	    ST.paren = scan->flags;	/* Which paren to set */
 	    ST.lastparen      = rex->lastparen;
 	    ST.lastcloseparen = rex->lastcloseparen;
@@ -5537,7 +5537,7 @@ NULL
             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
 	    goto repeat;
 
-	case CURLY:		/*  /A{m,n}B/ where A is width 1 */
+	case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = ARG1(scan);  /* min to match */
 	    ST.max = ARG2(scan);  /* max to match */
@@ -6344,25 +6344,30 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	scan = loceol;
 	break;
     case EXACT:
-	/* To get here, EXACTish nodes must have *byte* length == 1.  That
-	 * means they match only characters in the string that can be expressed
-	 * as a single byte.  For non-utf8 strings, that means a simple match.
-	 * For utf8 strings, the character matched must be an invariant, or
-	 * downgradable to a single byte.  The pattern's utf8ness is
-	 * irrelevant, as since it's a single byte, it either isn't utf8, or if
-	 * it is, it's an invariant */
-
 	c = (U8)*STRING(p);
-	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
 
 	if (! utf8_target || UNI_IS_INVARIANT(c)) {
 	    while (scan < loceol && UCHARAT(scan) == c) {
 		scan++;
 	    }
 	}
+	else if (UTF_PATTERN) {
+            STRLEN scan_char_len;
+
+	    loceol = PL_regeol;
+
+	    while (hardcount < max
+                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
+                   && scan_char_len <= STR_LEN(p)
+                   && memEQ(scan, STRING(p), scan_char_len))
+            {
+		scan += scan_char_len;
+		hardcount++;
+	    }
+        }
 	else {
 
-	    /* Here, the string is utf8, and the pattern char is different
+	    /* Here, the string is utf8, the pattern isn't, but <c> is different
 	     * in utf8 than not, so can't compare them directly.  Outside the
 	     * loop, find the two utf8 bytes that represent c, and then
 	     * look for those in sequence in the utf8 string */
@@ -6398,17 +6403,19 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case EXACTFU:
 	utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
-	/* The comments for the EXACT case above apply as well to these fold
-	 * ones */
-
     do_exactf:
 	c = (U8)*STRING(p);
 
-	if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
+	if (utf8_target
+            || OP(p) == EXACTFU_SS
+            || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
+        {
+            /* Use full Unicode fold matching */
 	    char *tmpeol = loceol;
+            STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
 	    while (hardcount < max
 		    && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
-				   STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
+                       STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
 	    {
 		scan = tmpeol;
 		tmpeol = loceol;
author	Karl Williamson <public@khwilliamson.com>	2012-10-06 14:57:38 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-10-09 11:16:05 -0600
commit	b40a2c17551b484a78122be98db5dc06bb4614d5 (patch)
tree	52a59ff7f0c32df215790cd516e29dd5ea2f2aaf /regexec.c
parent	51e68360f9d9caf2a439cb6a3ca38dcf1ad0e35d (diff)
download	perl-b40a2c17551b484a78122be98db5dc06bb4614d5.tar.gz