summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-11-03 18:34:08 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-11-03 18:34:08 +0000
commit9a86a77bd2e47579153b5c82d594bdb3778f11fc (patch)
tree0b3c3d773618327063adf546a2badaf3605e5473 /regcomp.c
parentbeeb77fc17e11913205723efb9351ff771305f35 (diff)
downloadperl-9a86a77bd2e47579153b5c82d594bdb3778f11fc.tar.gz
Don't bother doing POSIX charclass parsing if it
cannot possibly be one. Prepares the way for charclass syntax like [[abc]||[def]] (or just [[abc][def]]) for union, [[\w]&&[$a]] for intersection, and [[a-z]&&[^def]] for subtraction. Currently /[[a]/ (or /[a[]/) parses as a character class containing two characters, "[" and "a"; this may have to be broken for the syntax described above, since otherwise we would have to scan the whole pattern to find out whether the square brackets match pairwise. Luckily, the special case of "[" doesn't seem to be documented (as opposed to "]" and "-"), so we may have a better story for breaking it... One can always use \[ if one wants a literal "[", so there. p4raw-id: //depot/perl@12835
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c40
1 files changed, 22 insertions, 18 deletions
diff --git a/regcomp.c b/regcomp.c
index 9c30566dbb..0a63f2233a 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3180,6 +3180,11 @@ S_regwhite(pTHX_ char *p, char *e)
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c) ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
@@ -3188,13 +3193,11 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
- char c = *RExC_parse;
+ POSIXCC(UCHARAT(RExC_parse))) {
+ char c = UCHARAT(RExC_parse);
char* s = RExC_parse++;
- while (RExC_parse < RExC_end && *RExC_parse != c)
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
@@ -3202,7 +3205,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
else {
char* t = RExC_parse++; /* skip over the c */
- if (*RExC_parse == ']') {
+ if (UCHARAT(RExC_parse) == ']') {
RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
@@ -3291,7 +3294,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
/* adjust RExC_parse so the warning shows after
the class closes */
- while (*RExC_parse && *RExC_parse != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
@@ -3310,9 +3313,7 @@ STATIC void
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
+ POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
@@ -3322,11 +3323,10 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
- if (c == '=' || c == '.')
- {
+ if (POSIXCC_NOTYET(c)) {
/* adjust RExC_parse so the error shows after
the class closes */
- while (*RExC_parse && *RExC_parse++ != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
@@ -3338,6 +3338,7 @@ STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
+ register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
@@ -3355,7 +3356,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) = 0;
- if (*RExC_parse == '^') { /* Complement of range. */
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
RExC_parse++;
if (!SIZE_ONLY)
@@ -3374,13 +3375,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
listsv = newSVpvn("# comment\n", 10);
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+ if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
- if (*RExC_parse == ']' || *RExC_parse == '-')
+ if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
goto charclassloop; /* allow 1st char to be ] or - */
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
charclassloop:
@@ -3396,7 +3399,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
else
value = UCHARAT(RExC_parse++);
- if (value == '[')
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (value == '[' && POSIXCC(nextvalue))
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {