summary refs log tree commit diff
path: root/src/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex.c')
-rw-r--r-- src/regex.c 645
1 files changed, 305 insertions, 340 deletions
diff --git a/src/regex.c b/src/regex.c
index 164eb4612ae..1917a8480ae 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -50,6 +50,7 @@
#include <config.h>
#include <stddef.h>
+#include <stdlib.h>
#ifdef emacs
/* We need this for `regex.h', and perhaps for the Emacs include files. */
@@ -215,7 +216,7 @@ xmalloc (size_t size)
void *val = malloc (size);
if (!val && size)
{
- write (2, "virtual memory exhausted\n", 25);
+ write (STDERR_FILENO, "virtual memory exhausted\n", 25);
exit (1);
}
return val;
@@ -233,7 +234,7 @@ xrealloc (void *block, size_t size)
val = realloc (block, size);
if (!val && size)
{
- write (2, "virtual memory exhausted\n", 25);
+ write (STDERR_FILENO, "virtual memory exhausted\n", 25);
exit (1);
}
return val;
@@ -324,7 +325,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z') \
|| ((c) >= '0' && (c) <= '9')) \
- : (alphabeticp (c) || decimalnump (c)))
+ : alphanumericp (c))
# define ISALPHA(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
@@ -443,25 +444,12 @@ init_syntax_once (void)
#else /* not REGEX_MALLOC */
-/* Emacs already defines alloca, sometimes. */
-# ifndef alloca
-
-/* Make alloca work the best possible way. */
-# ifdef __GNUC__
-# define alloca __builtin_alloca
-# else /* not __GNUC__ */
-# ifdef HAVE_ALLOCA_H
-# include <alloca.h>
-# endif /* HAVE_ALLOCA_H */
-# endif /* not __GNUC__ */
-
-# endif /* not alloca */
-
# ifdef emacs
# define REGEX_USE_SAFE_ALLOCA USE_SAFE_ALLOCA
# define REGEX_SAFE_FREE() SAFE_FREE ()
# define REGEX_ALLOCATE SAFE_ALLOCA
# else
+# include <alloca.h>
# define REGEX_ALLOCATE alloca
# endif
@@ -513,8 +501,6 @@ init_syntax_once (void)
#define BYTEWIDTH 8 /* In bits. */
-#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
-
#ifndef emacs
# undef max
# undef min
@@ -669,9 +655,7 @@ typedef enum
notsyntaxspec
#ifdef emacs
- ,before_dot, /* Succeeds if before point. */
- at_dot, /* Succeeds if at point. */
- after_dot, /* Succeeds if after point. */
+ , at_dot, /* Succeeds if at point. */
/* Matches any character whose category-set contains the specified
category. The operator is followed by a byte which contains a
@@ -783,44 +767,6 @@ extract_number_and_incr (re_char **source)
and end. */
#define CHARSET_RANGE_TABLE_END(range_table, count) \
((range_table) + (count) * 2 * 3)
-
-/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
- COUNT is number of ranges in RANGE_TABLE. */
-#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
- do \
- { \
- re_wchar_t range_start, range_end; \
- re_char *rtp; \
- re_char *range_table_end \
- = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
- \
- for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
- { \
- EXTRACT_CHARACTER (range_start, rtp); \
- EXTRACT_CHARACTER (range_end, rtp + 3); \
- \
- if (range_start <= (c) && (c) <= range_end) \
- { \
- (not) = !(not); \
- break; \
- } \
- } \
- } \
- while (0)
-
-/* Test if C is in range table of CHARSET. The flag NOT is negated if
- C is listed in it. */
-#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
- do \
- { \
- /* Number of ranges in range table. */ \
- int count; \
- re_char *range_table = CHARSET_RANGE_TABLE (charset); \
- \
- EXTRACT_NUMBER_AND_INCR (count, range_table); \
- CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
- } \
- while (0)
/* If DEBUG is defined, Regex prints many voluminous messages about what
it is doing (if the variable `debug' is nonzero). If linked with the
@@ -1091,18 +1037,10 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
break;
# ifdef emacs
- case before_dot:
- fprintf (stderr, "/before_dot");
- break;
-
case at_dot:
fprintf (stderr, "/at_dot");
break;
- case after_dot:
- fprintf (stderr, "/after_dot");
- break;
-
case categoryspec:
fprintf (stderr, "/categoryspec");
mcnt = *p++;
@@ -1156,7 +1094,9 @@ print_compiled_pattern (struct re_pattern_buffer *bufp)
printf ("no_sub: %d\t", bufp->no_sub);
printf ("not_bol: %d\t", bufp->not_bol);
printf ("not_eol: %d\t", bufp->not_eol);
+#ifndef emacs
printf ("syntax: %lx\n", bufp->syntax);
+#endif
fflush (stdout);
/* Perhaps we should print the translate table? */
}
@@ -1197,13 +1137,8 @@ print_double_string (re_char *where, re_char *string1, ssize_t size1,
#endif /* not DEBUG */
-/* Use this to suppress gcc's `...may be used before initialized' warnings. */
-#ifdef lint
-# define IF_LINT(Code) Code
-#else
-# define IF_LINT(Code) /* empty */
-#endif
-
+#ifndef emacs
+
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
@@ -1229,15 +1164,7 @@ re_set_syntax (reg_syntax_t syntax)
}
WEAK_ALIAS (__re_set_syntax, re_set_syntax)
-/* Regexp to use to replace spaces, or NULL meaning don't. */
-static const_re_char *whitespace_regexp;
-
-void
-re_set_whitespace_regexp (const char *regexp)
-{
- whitespace_regexp = (const_re_char *) regexp;
-}
-WEAK_ALIAS (__re_set_syntax, re_set_syntax)
+#endif
/* This table gives an error message for each of the error codes listed
in regex.h. Obviously the order here has to be same as there.
@@ -1619,7 +1546,12 @@ do { \
/* Subroutine declarations and macros for regex_compile. */
static reg_errcode_t regex_compile (re_char *pattern, size_t size,
+#ifdef emacs
+ bool posix_backtracking,
+ const char *whitespace_regexp,
+#else
reg_syntax_t syntax,
+#endif
struct re_pattern_buffer *bufp);
static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
@@ -2014,29 +1946,96 @@ struct range_table_work_area
#if ! WIDE_CHAR_SUPPORT
-/* Map a string to the char class it names (if any). */
+/* Parse a character class, i.e. a string such as "[:name:]". *strp
+ points to the string to be parsed and limit is the length, in bytes, of
+ that string.
+
+ If *strp points to a string that begins with "[:name:]", where name is
+ a non-empty sequence of lower case letters, *strp will be advanced past the
+ closing square bracket and the RECC_* constant which maps to the name will be
+ returned. If name is not a valid character class name, zero (i.e. RECC_ERROR)
+ is returned.
+
+ Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned.
+
+ The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
+ */
re_wctype_t
-re_wctype (const_re_char *str)
+re_wctype_parse (const unsigned char **strp, unsigned limit)
{
- const char *string = (const char *) str;
- if (STREQ (string, "alnum")) return RECC_ALNUM;
- else if (STREQ (string, "alpha")) return RECC_ALPHA;
- else if (STREQ (string, "word")) return RECC_WORD;
- else if (STREQ (string, "ascii")) return RECC_ASCII;
- else if (STREQ (string, "nonascii")) return RECC_NONASCII;
- else if (STREQ (string, "graph")) return RECC_GRAPH;
- else if (STREQ (string, "lower")) return RECC_LOWER;
- else if (STREQ (string, "print")) return RECC_PRINT;
- else if (STREQ (string, "punct")) return RECC_PUNCT;
- else if (STREQ (string, "space")) return RECC_SPACE;
- else if (STREQ (string, "upper")) return RECC_UPPER;
- else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
- else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
- else if (STREQ (string, "digit")) return RECC_DIGIT;
- else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
- else if (STREQ (string, "cntrl")) return RECC_CNTRL;
- else if (STREQ (string, "blank")) return RECC_BLANK;
- else return 0;
+ const char *beg = (const char *)*strp, *it;
+
+ if (limit < 4 || beg[0] != '[' || beg[1] != ':')
+ return -1;
+
+ beg += 2; /* skip opening ‘[:’ */
+ limit -= 3; /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */
+ for (it = beg; it[0] != ':' || it[1] != ']'; ++it)
+ if (!--limit)
+ return -1;
+
+ *strp = (const unsigned char *)(it + 2);
+
+ /* Sort tests in the length=five case by frequency of the classes to minimize
+ the number of times we fail the comparison. The frequencies of character class
+ names used in Emacs sources as of 2016-07-27:
+
+ $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
+ sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
+ 213 [:alnum:]
+ 104 [:alpha:]
+ 62 [:space:]
+ 39 [:digit:]
+ 36 [:blank:]
+ 26 [:word:]
+ 26 [:upper:]
+ 21 [:lower:]
+ 10 [:xdigit:]
+ 10 [:punct:]
+ 10 [:ascii:]
+ 4 [:nonascii:]
+ 4 [:graph:]
+ 2 [:print:]
+ 2 [:cntrl:]
+ 1 [:ff:]
+
+ If you update this list, consider also updating the chain of or’ed conditions
+ in the execute_charset function.
+ */
+
+ switch (it - beg) {
+ case 4:
+ if (!memcmp (beg, "word", 4)) return RECC_WORD;
+ break;
+ case 5:
+ if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM;
+ if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA;
+ if (!memcmp (beg, "space", 5)) return RECC_SPACE;
+ if (!memcmp (beg, "digit", 5)) return RECC_DIGIT;
+ if (!memcmp (beg, "blank", 5)) return RECC_BLANK;
+ if (!memcmp (beg, "upper", 5)) return RECC_UPPER;
+ if (!memcmp (beg, "lower", 5)) return RECC_LOWER;
+ if (!memcmp (beg, "punct", 5)) return RECC_PUNCT;
+ if (!memcmp (beg, "ascii", 5)) return RECC_ASCII;
+ if (!memcmp (beg, "graph", 5)) return RECC_GRAPH;
+ if (!memcmp (beg, "print", 5)) return RECC_PRINT;
+ if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL;
+ break;
+ case 6:
+ if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT;
+ break;
+ case 7:
+ if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE;
+ break;
+ case 8:
+ if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII;
+ break;
+ case 9:
+ if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE;
+ break;
+ }
+
+ return RECC_ERROR;
}
/* True if CH is in the char class CC. */
@@ -2382,6 +2381,9 @@ static boolean group_in_compile_stack (compile_stack_type compile_stack,
/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
Returns one of error codes defined in `regex.h', or zero for success.
+ If WHITESPACE_REGEXP is given (only #ifdef emacs), it is used instead of
+ a space character in PATTERN.
+
Assumes the `allocated' (and perhaps `buffer') and `translate'
fields are set in BUFP on entry.
@@ -2414,7 +2416,15 @@ do { \
} while (0)
static reg_errcode_t
-regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
+regex_compile (const_re_char *pattern, size_t size,
+#ifdef emacs
+# define syntax RE_SYNTAX_EMACS
+ bool posix_backtracking,
+ const char *whitespace_regexp,
+#else
+ reg_syntax_t syntax,
+# define posix_backtracking (!(syntax & RE_NO_POSIX_BACKTRACKING))
+#endif
struct re_pattern_buffer *bufp)
{
/* We fetch characters from PATTERN here. */
@@ -2467,14 +2477,16 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
/* If the object matched can contain multibyte characters. */
const boolean multibyte = RE_MULTIBYTE_P (bufp);
+#ifdef emacs
/* Nonzero if we have pushed down into a subpattern. */
int in_subpattern = 0;
/* These hold the values of p, pattern, and pend from the main
pattern when we have pushed into a subpattern. */
- re_char *main_p IF_LINT (= NULL);
- re_char *main_pattern IF_LINT (= NULL);
- re_char *main_pend IF_LINT (= NULL);
+ re_char *main_p;
+ re_char *main_pattern;
+ re_char *main_pend;
+#endif
#ifdef DEBUG
debug++;
@@ -2501,7 +2513,9 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
range_table_work.allocated = 0;
/* Initialize the pattern buffer. */
+#ifndef emacs
bufp->syntax = syntax;
+#endif
bufp->fastmap_accurate = 0;
bufp->not_bol = bufp->not_eol = 0;
bufp->used_syntax = 0;
@@ -2543,6 +2557,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
{
if (p == pend)
{
+#ifdef emacs
/* If this is the end of an included regexp,
pop back to the main regexp and try again. */
if (in_subpattern)
@@ -2553,6 +2568,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
pend = main_pend;
continue;
}
+#endif
/* If this is the end of the main regexp, we are done. */
break;
}
@@ -2561,6 +2577,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
switch (c)
{
+#ifdef emacs
case ' ':
{
re_char *p1 = p;
@@ -2589,10 +2606,11 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
main_p = p1;
main_pend = pend;
main_pattern = pattern;
- p = pattern = whitespace_regexp;
- pend = p + strlen ((const char *) p);
+ p = pattern = (re_char *) whitespace_regexp;
+ pend = p + strlen (whitespace_regexp);
break;
}
+#endif
case '^':
{
@@ -2821,10 +2839,69 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
{
boolean escaped_char = false;
const unsigned char *p2 = p;
+ re_wctype_t cc;
re_wchar_t ch;
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+ /* See if we're at the beginning of a possible character
+ class. */
+ if (syntax & RE_CHAR_CLASSES &&
+ (cc = re_wctype_parse(&p, pend - p)) != -1)
+ {
+ if (cc == 0)
+ FREE_STACK_RETURN (REG_ECTYPE);
+
+ if (p == pend)
+ FREE_STACK_RETURN (REG_EBRACK);
+
+#ifndef emacs
+ for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
+ if (re_iswctype (btowc (ch), cc))
+ {
+ c = TRANSLATE (ch);
+ if (c < (1 << BYTEWIDTH))
+ SET_LIST_BIT (c);
+ }
+#else /* emacs */
+ /* Most character classes in a multibyte match just set
+ a flag. Exceptions are is_blank, is_digit, is_cntrl, and
+ is_xdigit, since they can only match ASCII characters.
+ We don't need to handle them for multibyte. */
+
+ /* Setup the gl_state object to its buffer-defined value.
+ This hardcodes the buffer-global syntax-table for ASCII
+ chars, while the other chars will obey syntax-table
+ properties. It's not ideal, but it's the way it's been
+ done until now. */
+ SETUP_BUFFER_SYNTAX_TABLE ();
+
+ for (c = 0; c < 0x80; ++c)
+ if (re_iswctype (c, cc))
+ {
+ SET_LIST_BIT (c);
+ c1 = TRANSLATE (c);
+ if (c1 == c)
+ continue;
+ if (ASCII_CHAR_P (c1))
+ SET_LIST_BIT (c1);
+ else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
+ SET_LIST_BIT (c1);
+ }
+ SET_RANGE_TABLE_WORK_AREA_BIT
+ (range_table_work, re_wctype_to_bit (cc));
+#endif /* emacs */
+ /* In most cases the matching rule for char classes only
+ uses the syntax table for multibyte chars, so that the
+ content of the syntax-table is not hardcoded in the
+ range_table. SPACE and WORD are the two exceptions. */
+ if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
+ bufp->used_syntax = 1;
+
+ /* Repeat the loop. */
+ continue;
+ }
+
/* Don't translate yet. The range TRANSLATE(X..Y) cannot
always be determined from TRANSLATE(X) and TRANSLATE(Y)
So the translation is done later in a loop. Example:
@@ -2848,119 +2925,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
break;
}
- /* See if we're at the beginning of a possible character
- class. */
-
- if (!escaped_char &&
- syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
- {
- /* Leave room for the null. */
- unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
- const unsigned char *class_beg;
-
- PATFETCH (c);
- c1 = 0;
- class_beg = p;
-
- /* If pattern is `[[:'. */
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
- for (;;)
- {
- PATFETCH (c);
- if ((c == ':' && *p == ']') || p == pend)
- break;
- if (c1 < CHAR_CLASS_MAX_LENGTH)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- /* If isn't a word bracketed by `[:' and `:]':
- undo the ending character, the letters, and
- leave the leading `:' and `[' (but set bits for
- them). */
- if (c == ':' && *p == ']')
- {
- re_wctype_t cc = re_wctype (str);
-
- if (cc == 0)
- FREE_STACK_RETURN (REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
-
- if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
-
-#ifndef emacs
- for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
- if (re_iswctype (btowc (ch), cc))
- {
- c = TRANSLATE (ch);
- if (c < (1 << BYTEWIDTH))
- SET_LIST_BIT (c);
- }
-#else /* emacs */
- /* Most character classes in a multibyte match
- just set a flag. Exceptions are is_blank,
- is_digit, is_cntrl, and is_xdigit, since
- they can only match ASCII characters. We
- don't need to handle them for multibyte.
- They are distinguished by a negative wctype. */
-
- /* Setup the gl_state object to its buffer-defined
- value. This hardcodes the buffer-global
- syntax-table for ASCII chars, while the other chars
- will obey syntax-table properties. It's not ideal,
- but it's the way it's been done until now. */
- SETUP_BUFFER_SYNTAX_TABLE ();
-
- for (ch = 0; ch < 256; ++ch)
- {
- c = RE_CHAR_TO_MULTIBYTE (ch);
- if (! CHAR_BYTE8_P (c)
- && re_iswctype (c, cc))
- {
- SET_LIST_BIT (ch);
- c1 = TRANSLATE (c);
- if (c1 == c)
- continue;
- if (ASCII_CHAR_P (c1))
- SET_LIST_BIT (c1);
- else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
- SET_LIST_BIT (c1);
- }
- }
- SET_RANGE_TABLE_WORK_AREA_BIT
- (range_table_work, re_wctype_to_bit (cc));
-#endif /* emacs */
- /* In most cases the matching rule for char classes
- only uses the syntax table for multibyte chars,
- so that the content of the syntax-table is not
- hardcoded in the range_table. SPACE and WORD are
- the two exceptions. */
- if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
- bufp->used_syntax = 1;
-
- /* Repeat the loop. */
- continue;
- }
- else
- {
- /* Go back to right after the "[:". */
- p = class_beg;
- SET_LIST_BIT ('[');
-
- /* Because the `:' may start the range, we
- can't simply set bit and repeat the loop.
- Instead, just set it to C and handle below. */
- c = ':';
- }
- }
-
if (p < pend && p[0] == '-' && p[1] != ']')
{
@@ -3467,8 +3431,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
goto normal_char;
#ifdef emacs
- /* There is no way to specify the before_dot and after_dot
- operators. rms says this is ok. --karl */
case '=':
laststart = b;
BUF_PUSH (at_dot);
@@ -3675,7 +3637,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
/* If we don't want backtracking, force success
the first time we reach the end of the compiled pattern. */
- if (syntax & RE_NO_POSIX_BACKTRACKING)
+ if (!posix_backtracking)
BUF_PUSH (succeed);
/* We have succeeded; set the length of the buffer. */
@@ -3710,6 +3672,12 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
#endif /* not MATCH_MAY_ALLOCATE */
FREE_STACK_RETURN (REG_NOERROR);
+
+#ifdef emacs
+# undef syntax
+#else
+# undef posix_backtracking
+#endif
} /* regex_compile */
/* Subroutines for `regex_compile'. */
@@ -4045,9 +4013,7 @@ analyze_first (const_re_char *p, const_re_char *pend, char *fastmap,
/* All cases after this match the empty string. These end with
`continue'. */
- case before_dot:
case at_dot:
- case after_dot:
#endif /* !emacs */
case no_op:
case begline:
@@ -4668,6 +4634,73 @@ skip_noops (const_re_char *p, const_re_char *pend)
return p;
}
+/* Test if C matches a charset op. *PP points to the charset or charset_not
+ opcode. When the function finishes, *PP will be advanced past that opcode.
+ C is the character to test (possibly after translations) and CORIG is the
+ original character (i.e. without any translations). UNIBYTE denotes whether
+ C is a unibyte or a multibyte character. */
+static bool
+execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
+{
+ re_char *p = *pp, *rtp = NULL;
+ bool not = (re_opcode_t) *p == charset_not;
+
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p))
+ {
+ int count;
+ rtp = CHARSET_RANGE_TABLE (p);
+ EXTRACT_NUMBER_AND_INCR (count, rtp);
+ *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
+ }
+ else
+ *pp += 2 + CHARSET_BITMAP_SIZE (p);
+
+ if (unibyte && c < (1 << BYTEWIDTH))
+ { /* Lookup bitmap. */
+ /* Cast to `unsigned' instead of `unsigned char' in
+ case the bit list is a full 32 bytes long. */
+ if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
+ && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
+ return !not;
+ }
+#ifdef emacs
+ else if (rtp)
+ {
+ int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+ re_wchar_t range_start, range_end;
+
+ /* Sort tests by the most commonly used classes with some adjustment to which
+ tests are easiest to perform. Take a look at the comment in re_wctype_parse
+ for the table with frequencies of character class names. */
+
+ if ((class_bits & BIT_MULTIBYTE) ||
+ (class_bits & BIT_ALNUM && ISALNUM (c)) ||
+ (class_bits & BIT_ALPHA && ISALPHA (c)) ||
+ (class_bits & BIT_SPACE && ISSPACE (c)) ||
+ (class_bits & BIT_WORD && ISWORD (c)) ||
+ ((class_bits & BIT_UPPER) &&
+ (ISUPPER (c) || (corig != c &&
+ c == downcase (corig) && ISLOWER (c)))) ||
+ ((class_bits & BIT_LOWER) &&
+ (ISLOWER (c) || (corig != c &&
+ c == upcase (corig) && ISUPPER(c)))) ||
+ (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
+ (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
+ (class_bits & BIT_PRINT && ISPRINT (c)))
+ return !not;
+
+ for (p = *pp; rtp < p; rtp += 2 * 3)
+ {
+ EXTRACT_CHARACTER (range_start, rtp);
+ EXTRACT_CHARACTER (range_end, rtp + 3);
+ if (range_start <= c && c <= range_end)
+ return !not;
+ }
+ }
+#endif /* emacs */
+ return not;
+}
+
/* Non-zero if "p1 matches something" implies "p2 fails". */
static int
mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
@@ -4725,22 +4758,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
else if ((re_opcode_t) *p1 == charset
|| (re_opcode_t) *p1 == charset_not)
{
- int not = (re_opcode_t) *p1 == charset_not;
-
- /* Test if C is listed in charset (or charset_not)
- at `p1'. */
- if (! multibyte || IS_REAL_ASCII (c))
- {
- if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
- && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
- }
- else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
- CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
-
- /* `not' is equal to 1 if c would match, which means
- that we can't change to pop_failure_jump. */
- if (!not)
+ if (!execute_charset (&p1, c, c, !multibyte || IS_REAL_ASCII (c)))
{
DEBUG_PRINT (" No match => fast loop.\n");
return 1;
@@ -5140,8 +5158,6 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
if (p == pend)
{
- ptrdiff_t dcnt;
-
/* End of pattern means we might have succeeded. */
DEBUG_PRINT ("end of pattern ... ");
@@ -5149,19 +5165,22 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
longest match, try backtracking. */
if (d != end_match_2)
{
- /* 1 if this match ends in the same string (string1 or string2)
- as the best previous match. */
- boolean same_str_p = (FIRST_STRING_P (match_end)
- == FIRST_STRING_P (d));
- /* 1 if this match is the best seen so far. */
- boolean best_match_p;
-
- /* AIX compiler got confused when this was combined
- with the previous declaration. */
- if (same_str_p)
- best_match_p = d > match_end;
- else
- best_match_p = !FIRST_STRING_P (d);
+ /* True if this match is the best seen so far. */
+ bool best_match_p;
+
+ {
+ /* True if this match ends in the same string (string1
+ or string2) as the best previous match. */
+ bool same_str_p = (FIRST_STRING_P (match_end)
+ == FIRST_STRING_P (d));
+
+ /* AIX compiler got confused when this was combined
+ with the previous declaration. */
+ if (same_str_p)
+ best_match_p = d > match_end;
+ else
+ best_match_p = !FIRST_STRING_P (d);
+ }
DEBUG_PRINT ("backtracking.\n");
@@ -5290,7 +5309,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
nfailure_points_pushed - nfailure_points_popped);
DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
- dcnt = POINTER_TO_OFFSET (d) - pos;
+ ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos;
DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
@@ -5421,6 +5440,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
{
int buf_charlen;
re_wchar_t buf_ch;
+ reg_syntax_t syntax;
DEBUG_PRINT ("EXECUTING anychar.\n");
@@ -5429,10 +5449,14 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
target_multibyte);
buf_ch = TRANSLATE (buf_ch);
- if ((!(bufp->syntax & RE_DOT_NEWLINE)
- && buf_ch == '\n')
- || ((bufp->syntax & RE_DOT_NOT_NULL)
- && buf_ch == '\000'))
+#ifdef emacs
+ syntax = RE_SYNTAX_EMACS;
+#else
+ syntax = bufp->syntax;
+#endif
+
+ if ((!(syntax & RE_DOT_NEWLINE) && buf_ch == '\n')
+ || ((syntax & RE_DOT_NOT_NULL) && buf_ch == '\000'))
goto fail;
DEBUG_PRINT (" Matched \"%d\".\n", *d);
@@ -5445,32 +5469,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
case charset_not:
{
register unsigned int c, corig;
- boolean not = (re_opcode_t) *(p - 1) == charset_not;
int len;
- /* Start of actual range_table, or end of bitmap if there is no
- range table. */
- re_char *range_table IF_LINT (= NULL);
-
- /* Nonzero if there is a range table. */
- int range_table_exists;
-
- /* Number of ranges of range table. This is not included
- in the initial byte-length of the command. */
- int count = 0;
-
/* Whether matching against a unibyte character. */
boolean unibyte_char = false;
- DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
-
- range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
-
- if (range_table_exists)
- {
- range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
- EXTRACT_NUMBER_AND_INCR (count, range_table);
- }
+ DEBUG_PRINT ("EXECUTING charset%s.\n",
+ (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
PREFETCH ();
corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
@@ -5504,47 +5509,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
unibyte_char = true;
}
- if (unibyte_char && c < (1 << BYTEWIDTH))
- { /* Lookup bitmap. */
- /* Cast to `unsigned' instead of `unsigned char' in
- case the bit list is a full 32 bytes long. */
- if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
- && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
- }
-#ifdef emacs
- else if (range_table_exists)
- {
- int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
-
- if ( (class_bits & BIT_LOWER
- && (ISLOWER (c)
- || (corig != c
- && c == upcase (corig) && ISUPPER(c))))
- | (class_bits & BIT_MULTIBYTE)
- | (class_bits & BIT_PUNCT && ISPUNCT (c))
- | (class_bits & BIT_SPACE && ISSPACE (c))
- | (class_bits & BIT_UPPER
- && (ISUPPER (c)
- || (corig != c
- && c == downcase (corig) && ISLOWER (c))))
- | (class_bits & BIT_WORD && ISWORD (c))
- | (class_bits & BIT_ALPHA && ISALPHA (c))
- | (class_bits & BIT_ALNUM && ISALNUM (c))
- | (class_bits & BIT_GRAPH && ISGRAPH (c))
- | (class_bits & BIT_PRINT && ISPRINT (c)))
- not = !not;
- else
- CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
- }
-#endif /* emacs */
-
- if (range_table_exists)
- p = CHARSET_RANGE_TABLE_END (range_table, count);
- else
- p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
-
- if (!not) goto fail;
+ p -= 1;
+ if (!execute_charset (&p, c, corig, unibyte_char))
+ goto fail;
d += len;
}
@@ -6179,24 +6146,12 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
break;
#ifdef emacs
- case before_dot:
- DEBUG_PRINT ("EXECUTING before_dot.\n");
- if (PTR_BYTE_POS (d) >= PT_BYTE)
- goto fail;
- break;
-
case at_dot:
DEBUG_PRINT ("EXECUTING at_dot.\n");
if (PTR_BYTE_POS (d) != PT_BYTE)
goto fail;
break;
- case after_dot:
- DEBUG_PRINT ("EXECUTING after_dot.\n");
- if (PTR_BYTE_POS (d) <= PT_BYTE)
- goto fail;
- break;
-
case categoryspec:
case notcategoryspec:
{
@@ -6328,6 +6283,9 @@ bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len,
const char *
re_compile_pattern (const char *pattern, size_t length,
+#ifdef emacs
+ bool posix_backtracking, const char *whitespace_regexp,
+#endif
struct re_pattern_buffer *bufp)
{
reg_errcode_t ret;
@@ -6341,7 +6299,14 @@ re_compile_pattern (const char *pattern, size_t length,
setting no_sub. */
bufp->no_sub = 0;
- ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
+ ret = regex_compile ((re_char*) pattern, length,
+#ifdef emacs
+ posix_backtracking,
+ whitespace_regexp,
+#else
+ re_syntax_options,
+#endif
+ bufp);
if (!ret)
return NULL;