Don't bother doing POSIX charclass parsing if it

possibly cannot be so. Prepares way for charclass syntax like [[abc]||[def]] (or just [[abc][def]]) for union, [[\w]&&[$a]] for intersection, and [[a-z]&&[^def]] for subtraction. Currently /[[a]/ (or /[a[]/) parses as a character class containing two characters, "[" and "a", this may have to be broken for the syntax described above, otherwise we would have to scan the whole pattern to find out whether the square brackets match pairwise. Luckily, the special case of "[" doesn't seem to be documented (as opposed to "]" and "-"), so we may have better story for breaking it... One can always use \[ if one wants a literal "[", so there. p4raw-id: //depot/perl@12835
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-03 18:34:08 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-11-03 18:34:08 +0000
commit: 9a86a77bd2e47579153b5c82d594bdb3778f11fc (patch)
tree: 0b3c3d773618327063adf546a2badaf3605e5473 /regcomp.c
parent: beeb77fc17e11913205723efb9351ff771305f35 (diff)
download: perl-9a86a77bd2e47579153b5c82d594bdb3778f11fc.tar.gz
1 files changed, 22 insertions, 18 deletions
diff --git a/regcomp.c b/regcomp.c
index 9c30566dbb..0a63f2233a 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3180,6 +3180,11 @@ S_regwhite(pTHX_ char *p, char *e)
    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
    but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c)   ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
 STATIC I32
 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 {
@@ -3188,13 +3193,11 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 
     if (value == '[' && RExC_parse + 1 < RExC_end &&
 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
-	(*RExC_parse == ':' ||
-	 *RExC_parse == '=' ||
-	 *RExC_parse == '.')) {
-	char  c = *RExC_parse;
+	POSIXCC(UCHARAT(RExC_parse))) {
+	char  c = UCHARAT(RExC_parse);
 	char* s = RExC_parse++;
 	
-	while (RExC_parse < RExC_end && *RExC_parse != c)
+	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
 	    RExC_parse++;
 	if (RExC_parse == RExC_end)
 	    /* Grandfather lone [:, [=, [. */
@@ -3202,7 +3205,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 	else {
 	    char* t = RExC_parse++; /* skip over the c */
 
-  	    if (*RExC_parse == ']') {
+  	    if (UCHARAT(RExC_parse) == ']') {
   		RExC_parse++; /* skip over the ending ] */
   		posixcc = s + 1;
 		if (*s == ':') {
@@ -3291,7 +3294,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 
 		    /* adjust RExC_parse so the warning shows after
 		       the class closes */
-		    while (*RExC_parse && *RExC_parse != ']')
+		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
 			RExC_parse++;
 		    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
 		}
@@ -3310,9 +3313,7 @@ STATIC void
 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 {
     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
-	(*RExC_parse == ':' ||
-	 *RExC_parse == '=' ||
-	 *RExC_parse == '.')) {
+	POSIXCC(UCHARAT(RExC_parse))) {
 	char *s = RExC_parse;
  	char  c = *s++;
 
@@ -3322,11 +3323,10 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 	    vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
 
 	    /* [[=foo=]] and [[.foo.]] are still future. */
-	    if (c == '=' || c == '.')
-	    {
+	    if (POSIXCC_NOTYET(c)) {
 		/* adjust RExC_parse so the error shows after
 		   the class closes */
-		while (*RExC_parse && *RExC_parse++ != ']')
+		while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
 		    ;
 		Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
 	    }
@@ -3338,6 +3338,7 @@ STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state)
 {
     register UV value;
+    register UV nextvalue;
     register IV prevvalue = OOB_UNICODE;
     register IV range = 0;
     register regnode *ret;
@@ -3355,7 +3356,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     if (!SIZE_ONLY)
 	ANYOF_FLAGS(ret) = 0;
 
-    if (*RExC_parse == '^') {	/* Complement of range. */
+    if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
 	RExC_naughty++;
 	RExC_parse++;
 	if (!SIZE_ONLY)
@@ -3374,13 +3375,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	listsv = newSVpvn("# comment\n", 10);
     }
 
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+    nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
 	checkposixcc(pRExC_state);
 
-    if (*RExC_parse == ']' || *RExC_parse == '-')
+    if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
 	goto charclassloop;		/* allow 1st char to be ] or - */
 
-    while (RExC_parse < RExC_end && *RExC_parse != ']') {
+    while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
 
     charclassloop:
 
@@ -3396,7 +3399,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 	}
 	else
 	    value = UCHARAT(RExC_parse++);
-	if (value == '[')
+	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+	if (value == '[' && POSIXCC(nextvalue))
 	    namedclass = regpposixcc(pRExC_state, value);
 	else if (value == '\\') {
 	    if (UTF) {
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-11-03 18:34:08 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-11-03 18:34:08 +0000
commit	9a86a77bd2e47579153b5c82d594bdb3778f11fc (patch)
tree	0b3c3d773618327063adf546a2badaf3605e5473 /regcomp.c
parent	beeb77fc17e11913205723efb9351ff771305f35 (diff)
download	perl-9a86a77bd2e47579153b5c82d594bdb3778f11fc.tar.gz