diff options
Diffstat (limited to 'src/regex.c')
-rw-r--r-- | src/regex.c | 645 |
1 file changed, 305 insertions, 340 deletions
diff --git a/src/regex.c b/src/regex.c index 164eb4612ae..1917a8480ae 100644 --- a/src/regex.c +++ b/src/regex.c @@ -50,6 +50,7 @@ #include <config.h> #include <stddef.h> +#include <stdlib.h> #ifdef emacs /* We need this for `regex.h', and perhaps for the Emacs include files. */ @@ -215,7 +216,7 @@ xmalloc (size_t size) void *val = malloc (size); if (!val && size) { - write (2, "virtual memory exhausted\n", 25); + write (STDERR_FILENO, "virtual memory exhausted\n", 25); exit (1); } return val; @@ -233,7 +234,7 @@ xrealloc (void *block, size_t size) val = realloc (block, size); if (!val && size) { - write (2, "virtual memory exhausted\n", 25); + write (STDERR_FILENO, "virtual memory exhausted\n", 25); exit (1); } return val; @@ -324,7 +325,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; ? (((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z') \ || ((c) >= '0' && (c) <= '9')) \ - : (alphabeticp (c) || decimalnump (c))) + : alphanumericp (c)) # define ISALPHA(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ @@ -443,25 +444,12 @@ init_syntax_once (void) #else /* not REGEX_MALLOC */ -/* Emacs already defines alloca, sometimes. */ -# ifndef alloca - -/* Make alloca work the best possible way. */ -# ifdef __GNUC__ -# define alloca __builtin_alloca -# else /* not __GNUC__ */ -# ifdef HAVE_ALLOCA_H -# include <alloca.h> -# endif /* HAVE_ALLOCA_H */ -# endif /* not __GNUC__ */ - -# endif /* not alloca */ - # ifdef emacs # define REGEX_USE_SAFE_ALLOCA USE_SAFE_ALLOCA # define REGEX_SAFE_FREE() SAFE_FREE () # define REGEX_ALLOCATE SAFE_ALLOCA # else +# include <alloca.h> # define REGEX_ALLOCATE alloca # endif @@ -513,8 +501,6 @@ init_syntax_once (void) #define BYTEWIDTH 8 /* In bits. */ -#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) - #ifndef emacs # undef max # undef min @@ -669,9 +655,7 @@ typedef enum notsyntaxspec #ifdef emacs - ,before_dot, /* Succeeds if before point. */ - at_dot, /* Succeeds if at point. 
*/ - after_dot, /* Succeeds if after point. */ + , at_dot, /* Succeeds if at point. */ /* Matches any character whose category-set contains the specified category. The operator is followed by a byte which contains a @@ -783,44 +767,6 @@ extract_number_and_incr (re_char **source) and end. */ #define CHARSET_RANGE_TABLE_END(range_table, count) \ ((range_table) + (count) * 2 * 3) - -/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in. - COUNT is number of ranges in RANGE_TABLE. */ -#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ - do \ - { \ - re_wchar_t range_start, range_end; \ - re_char *rtp; \ - re_char *range_table_end \ - = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ - \ - for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \ - { \ - EXTRACT_CHARACTER (range_start, rtp); \ - EXTRACT_CHARACTER (range_end, rtp + 3); \ - \ - if (range_start <= (c) && (c) <= range_end) \ - { \ - (not) = !(not); \ - break; \ - } \ - } \ - } \ - while (0) - -/* Test if C is in range table of CHARSET. The flag NOT is negated if - C is listed in it. */ -#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \ - do \ - { \ - /* Number of ranges in range table. */ \ - int count; \ - re_char *range_table = CHARSET_RANGE_TABLE (charset); \ - \ - EXTRACT_NUMBER_AND_INCR (count, range_table); \ - CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ - } \ - while (0) /* If DEBUG is defined, Regex prints many voluminous messages about what it is doing (if the variable `debug' is nonzero). 
If linked with the @@ -1091,18 +1037,10 @@ print_partial_compiled_pattern (re_char *start, re_char *end) break; # ifdef emacs - case before_dot: - fprintf (stderr, "/before_dot"); - break; - case at_dot: fprintf (stderr, "/at_dot"); break; - case after_dot: - fprintf (stderr, "/after_dot"); - break; - case categoryspec: fprintf (stderr, "/categoryspec"); mcnt = *p++; @@ -1156,7 +1094,9 @@ print_compiled_pattern (struct re_pattern_buffer *bufp) printf ("no_sub: %d\t", bufp->no_sub); printf ("not_bol: %d\t", bufp->not_bol); printf ("not_eol: %d\t", bufp->not_eol); +#ifndef emacs printf ("syntax: %lx\n", bufp->syntax); +#endif fflush (stdout); /* Perhaps we should print the translate table? */ } @@ -1197,13 +1137,8 @@ print_double_string (re_char *where, re_char *string1, ssize_t size1, #endif /* not DEBUG */ -/* Use this to suppress gcc's `...may be used before initialized' warnings. */ -#ifdef lint -# define IF_LINT(Code) Code -#else -# define IF_LINT(Code) /* empty */ -#endif - +#ifndef emacs + /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ @@ -1229,15 +1164,7 @@ re_set_syntax (reg_syntax_t syntax) } WEAK_ALIAS (__re_set_syntax, re_set_syntax) -/* Regexp to use to replace spaces, or NULL meaning don't. */ -static const_re_char *whitespace_regexp; - -void -re_set_whitespace_regexp (const char *regexp) -{ - whitespace_regexp = (const_re_char *) regexp; -} -WEAK_ALIAS (__re_set_syntax, re_set_syntax) +#endif /* This table gives an error message for each of the error codes listed in regex.h. Obviously the order here has to be same as there. @@ -1619,7 +1546,12 @@ do { \ /* Subroutine declarations and macros for regex_compile. 
*/ static reg_errcode_t regex_compile (re_char *pattern, size_t size, +#ifdef emacs + bool posix_backtracking, + const char *whitespace_regexp, +#else reg_syntax_t syntax, +#endif struct re_pattern_buffer *bufp); static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); @@ -2014,29 +1946,96 @@ struct range_table_work_area #if ! WIDE_CHAR_SUPPORT -/* Map a string to the char class it names (if any). */ +/* Parse a character class, i.e. a string such as "[:name:]". *strp + points to the string to be parsed and limit is the length, in bytes, of + that string. + + If *strp points to a string that begins with "[:name:]", where name is + a non-empty sequence of lower case letters, *strp will be advanced past the + closing square bracket and the RECC_* constant which maps to the name will be + returned. If name is not a valid character class name, zero (i.e. RECC_ERROR) + is returned. + + Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned. + + The function can be used on ASCII and multibyte (UTF-8-encoded) strings.
+ */ re_wctype_t -re_wctype (const_re_char *str) +re_wctype_parse (const unsigned char **strp, unsigned limit) { - const char *string = (const char *) str; - if (STREQ (string, "alnum")) return RECC_ALNUM; - else if (STREQ (string, "alpha")) return RECC_ALPHA; - else if (STREQ (string, "word")) return RECC_WORD; - else if (STREQ (string, "ascii")) return RECC_ASCII; - else if (STREQ (string, "nonascii")) return RECC_NONASCII; - else if (STREQ (string, "graph")) return RECC_GRAPH; - else if (STREQ (string, "lower")) return RECC_LOWER; - else if (STREQ (string, "print")) return RECC_PRINT; - else if (STREQ (string, "punct")) return RECC_PUNCT; - else if (STREQ (string, "space")) return RECC_SPACE; - else if (STREQ (string, "upper")) return RECC_UPPER; - else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; - else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; - else if (STREQ (string, "digit")) return RECC_DIGIT; - else if (STREQ (string, "xdigit")) return RECC_XDIGIT; - else if (STREQ (string, "cntrl")) return RECC_CNTRL; - else if (STREQ (string, "blank")) return RECC_BLANK; - else return 0; + const char *beg = (const char *)*strp, *it; + + if (limit < 4 || beg[0] != '[' || beg[1] != ':') + return -1; + + beg += 2; /* skip opening ‘[:’ */ + limit -= 3; /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */ + for (it = beg; it[0] != ':' || it[1] != ']'; ++it) + if (!--limit) + return -1; + + *strp = (const unsigned char *)(it + 2); + + /* Sort tests in the length=five case by the frequency of the classes, to minimize + the number of times we fail the comparison.
The frequencies of character class + names used in Emacs sources as of 2016-07-27: + + $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | + sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr + 213 [:alnum:] + 104 [:alpha:] + 62 [:space:] + 39 [:digit:] + 36 [:blank:] + 26 [:word:] + 26 [:upper:] + 21 [:lower:] + 10 [:xdigit:] + 10 [:punct:] + 10 [:ascii:] + 4 [:nonascii:] + 4 [:graph:] + 2 [:print:] + 2 [:cntrl:] + 1 [:ff:] + + If you update this list, consider also updating chain of or’ed conditions + in execute_charset function. + */ + + switch (it - beg) { + case 4: + if (!memcmp (beg, "word", 4)) return RECC_WORD; + break; + case 5: + if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM; + if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA; + if (!memcmp (beg, "space", 5)) return RECC_SPACE; + if (!memcmp (beg, "digit", 5)) return RECC_DIGIT; + if (!memcmp (beg, "blank", 5)) return RECC_BLANK; + if (!memcmp (beg, "upper", 5)) return RECC_UPPER; + if (!memcmp (beg, "lower", 5)) return RECC_LOWER; + if (!memcmp (beg, "punct", 5)) return RECC_PUNCT; + if (!memcmp (beg, "ascii", 5)) return RECC_ASCII; + if (!memcmp (beg, "graph", 5)) return RECC_GRAPH; + if (!memcmp (beg, "print", 5)) return RECC_PRINT; + if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL; + break; + case 6: + if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT; + break; + case 7: + if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE; + break; + case 8: + if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII; + break; + case 9: + if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE; + break; + } + + return RECC_ERROR; } /* True if CH is in the char class CC. */ @@ -2382,6 +2381,9 @@ static boolean group_in_compile_stack (compile_stack_type compile_stack, /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. Returns one of error codes defined in `regex.h', or zero for success. 
+ If WHITESPACE_REGEXP is given (only #ifdef emacs), it is used instead of + a space character in PATTERN. + Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. @@ -2414,7 +2416,15 @@ do { \ } while (0) static reg_errcode_t -regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, +regex_compile (const_re_char *pattern, size_t size, +#ifdef emacs +# define syntax RE_SYNTAX_EMACS + bool posix_backtracking, + const char *whitespace_regexp, +#else + reg_syntax_t syntax, +# define posix_backtracking (!(syntax & RE_NO_POSIX_BACKTRACKING)) +#endif struct re_pattern_buffer *bufp) { /* We fetch characters from PATTERN here. */ @@ -2467,14 +2477,16 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, /* If the object matched can contain multibyte characters. */ const boolean multibyte = RE_MULTIBYTE_P (bufp); +#ifdef emacs /* Nonzero if we have pushed down into a subpattern. */ int in_subpattern = 0; /* These hold the values of p, pattern, and pend from the main pattern when we have pushed into a subpattern. */ - re_char *main_p IF_LINT (= NULL); - re_char *main_pattern IF_LINT (= NULL); - re_char *main_pend IF_LINT (= NULL); + re_char *main_p; + re_char *main_pattern; + re_char *main_pend; +#endif #ifdef DEBUG debug++; @@ -2501,7 +2513,9 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, range_table_work.allocated = 0; /* Initialize the pattern buffer. */ +#ifndef emacs bufp->syntax = syntax; +#endif bufp->fastmap_accurate = 0; bufp->not_bol = bufp->not_eol = 0; bufp->used_syntax = 0; @@ -2543,6 +2557,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, { if (p == pend) { +#ifdef emacs /* If this is the end of an included regexp, pop back to the main regexp and try again. 
*/ if (in_subpattern) @@ -2553,6 +2568,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, pend = main_pend; continue; } +#endif /* If this is the end of the main regexp, we are done. */ break; } @@ -2561,6 +2577,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, switch (c) { +#ifdef emacs case ' ': { re_char *p1 = p; @@ -2589,10 +2606,11 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, main_p = p1; main_pend = pend; main_pattern = pattern; - p = pattern = whitespace_regexp; - pend = p + strlen ((const char *) p); + p = pattern = (re_char *) whitespace_regexp; + pend = p + strlen (whitespace_regexp); break; } +#endif case '^': { @@ -2821,10 +2839,69 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, { boolean escaped_char = false; const unsigned char *p2 = p; + re_wctype_t cc; re_wchar_t ch; if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* See if we're at the beginning of a possible character + class. */ + if (syntax & RE_CHAR_CLASSES && + (cc = re_wctype_parse(&p, pend - p)) != -1) + { + if (cc == 0) + FREE_STACK_RETURN (REG_ECTYPE); + + if (p == pend) + FREE_STACK_RETURN (REG_EBRACK); + +#ifndef emacs + for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) + if (re_iswctype (btowc (ch), cc)) + { + c = TRANSLATE (ch); + if (c < (1 << BYTEWIDTH)) + SET_LIST_BIT (c); + } +#else /* emacs */ + /* Most character classes in a multibyte match just set + a flag. Exceptions are is_blank, is_digit, is_cntrl, and + is_xdigit, since they can only match ASCII characters. + We don't need to handle them for multibyte. */ + + /* Setup the gl_state object to its buffer-defined value. + This hardcodes the buffer-global syntax-table for ASCII + chars, while the other chars will obey syntax-table + properties. It's not ideal, but it's the way it's been + done until now. 
*/ + SETUP_BUFFER_SYNTAX_TABLE (); + + for (c = 0; c < 0x80; ++c) + if (re_iswctype (c, cc)) + { + SET_LIST_BIT (c); + c1 = TRANSLATE (c); + if (c1 == c) + continue; + if (ASCII_CHAR_P (c1)) + SET_LIST_BIT (c1); + else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) + SET_LIST_BIT (c1); + } + SET_RANGE_TABLE_WORK_AREA_BIT + (range_table_work, re_wctype_to_bit (cc)); +#endif /* emacs */ + /* In most cases the matching rule for char classes only + uses the syntax table for multibyte chars, so that the + content of the syntax-table is not hardcoded in the + range_table. SPACE and WORD are the two exceptions. */ + if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) + bufp->used_syntax = 1; + + /* Repeat the loop. */ + continue; + } + /* Don't translate yet. The range TRANSLATE(X..Y) cannot always be determined from TRANSLATE(X) and TRANSLATE(Y) So the translation is done later in a loop. Example: @@ -2848,119 +2925,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, break; } - /* See if we're at the beginning of a possible character - class. */ - - if (!escaped_char && - syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') - { - /* Leave room for the null. */ - unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; - const unsigned char *class_beg; - - PATFETCH (c); - c1 = 0; - class_beg = p; - - /* If pattern is `[[:'. */ - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); - - for (;;) - { - PATFETCH (c); - if ((c == ':' && *p == ']') || p == pend) - break; - if (c1 < CHAR_CLASS_MAX_LENGTH) - str[c1++] = c; - else - /* This is in any case an invalid class name. */ - str[0] = '\0'; - } - str[c1] = '\0'; - - /* If isn't a word bracketed by `[:' and `:]': - undo the ending character, the letters, and - leave the leading `:' and `[' (but set bits for - them). */ - if (c == ':' && *p == ']') - { - re_wctype_t cc = re_wctype (str); - - if (cc == 0) - FREE_STACK_RETURN (REG_ECTYPE); - - /* Throw away the ] at the end of the character - class. 
*/ - PATFETCH (c); - - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); - -#ifndef emacs - for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) - if (re_iswctype (btowc (ch), cc)) - { - c = TRANSLATE (ch); - if (c < (1 << BYTEWIDTH)) - SET_LIST_BIT (c); - } -#else /* emacs */ - /* Most character classes in a multibyte match - just set a flag. Exceptions are is_blank, - is_digit, is_cntrl, and is_xdigit, since - they can only match ASCII characters. We - don't need to handle them for multibyte. - They are distinguished by a negative wctype. */ - - /* Setup the gl_state object to its buffer-defined - value. This hardcodes the buffer-global - syntax-table for ASCII chars, while the other chars - will obey syntax-table properties. It's not ideal, - but it's the way it's been done until now. */ - SETUP_BUFFER_SYNTAX_TABLE (); - - for (ch = 0; ch < 256; ++ch) - { - c = RE_CHAR_TO_MULTIBYTE (ch); - if (! CHAR_BYTE8_P (c) - && re_iswctype (c, cc)) - { - SET_LIST_BIT (ch); - c1 = TRANSLATE (c); - if (c1 == c) - continue; - if (ASCII_CHAR_P (c1)) - SET_LIST_BIT (c1); - else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) - SET_LIST_BIT (c1); - } - } - SET_RANGE_TABLE_WORK_AREA_BIT - (range_table_work, re_wctype_to_bit (cc)); -#endif /* emacs */ - /* In most cases the matching rule for char classes - only uses the syntax table for multibyte chars, - so that the content of the syntax-table is not - hardcoded in the range_table. SPACE and WORD are - the two exceptions. */ - if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) - bufp->used_syntax = 1; - - /* Repeat the loop. */ - continue; - } - else - { - /* Go back to right after the "[:". */ - p = class_beg; - SET_LIST_BIT ('['); - - /* Because the `:' may start the range, we - can't simply set bit and repeat the loop. - Instead, just set it to C and handle below. 
*/ - c = ':'; - } - } - if (p < pend && p[0] == '-' && p[1] != ']') { @@ -3467,8 +3431,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, goto normal_char; #ifdef emacs - /* There is no way to specify the before_dot and after_dot - operators. rms says this is ok. --karl */ case '=': laststart = b; BUF_PUSH (at_dot); @@ -3675,7 +3637,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, /* If we don't want backtracking, force success the first time we reach the end of the compiled pattern. */ - if (syntax & RE_NO_POSIX_BACKTRACKING) + if (!posix_backtracking) BUF_PUSH (succeed); /* We have succeeded; set the length of the buffer. */ @@ -3710,6 +3672,12 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, #endif /* not MATCH_MAY_ALLOCATE */ FREE_STACK_RETURN (REG_NOERROR); + +#ifdef emacs +# undef syntax +#else +# undef posix_backtracking +#endif } /* regex_compile */ /* Subroutines for `regex_compile'. */ @@ -4045,9 +4013,7 @@ analyze_first (const_re_char *p, const_re_char *pend, char *fastmap, /* All cases after this match the empty string. These end with `continue'. */ - case before_dot: case at_dot: - case after_dot: #endif /* !emacs */ case no_op: case begline: @@ -4668,6 +4634,73 @@ skip_noops (const_re_char *p, const_re_char *pend) return p; } +/* Test if C matches charset op. *PP points to the charset or charset_not + opcode. When the function finishes, *PP will be advanced past that opcode. + C is character to test (possibly after translations) and CORIG is original + character (i.e. without any translations). UNIBYTE denotes whether c is + unibyte or multibyte character. 
*/ +static bool +execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) +{ + re_char *p = *pp, *rtp = NULL; + bool not = (re_opcode_t) *p == charset_not; + + if (CHARSET_RANGE_TABLE_EXISTS_P (p)) + { + int count; + rtp = CHARSET_RANGE_TABLE (p); + EXTRACT_NUMBER_AND_INCR (count, rtp); + *pp = CHARSET_RANGE_TABLE_END ((rtp), (count)); + } + else + *pp += 2 + CHARSET_BITMAP_SIZE (p); + + if (unibyte && c < (1 << BYTEWIDTH)) + { /* Lookup bitmap. */ + /* Cast to `unsigned' instead of `unsigned char' in + case the bit list is a full 32 bytes long. */ + if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH) + && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + return !not; + } +#ifdef emacs + else if (rtp) + { + int class_bits = CHARSET_RANGE_TABLE_BITS (p); + re_wchar_t range_start, range_end; + + /* Sort tests by the most commonly used classes with some adjustment to which + tests are easiest to perform. Take a look at comment in re_wctype_parse + for table with frequencies of character class names. */ + + if ((class_bits & BIT_MULTIBYTE) || + (class_bits & BIT_ALNUM && ISALNUM (c)) || + (class_bits & BIT_ALPHA && ISALPHA (c)) || + (class_bits & BIT_SPACE && ISSPACE (c)) || + (class_bits & BIT_WORD && ISWORD (c)) || + ((class_bits & BIT_UPPER) && + (ISUPPER (c) || (corig != c && + c == downcase (corig) && ISLOWER (c)))) || + ((class_bits & BIT_LOWER) && + (ISLOWER (c) || (corig != c && + c == upcase (corig) && ISUPPER(c)))) || + (class_bits & BIT_PUNCT && ISPUNCT (c)) || + (class_bits & BIT_GRAPH && ISGRAPH (c)) || + (class_bits & BIT_PRINT && ISPRINT (c))) + return !not; + + for (p = *pp; rtp < p; rtp += 2 * 3) + { + EXTRACT_CHARACTER (range_start, rtp); + EXTRACT_CHARACTER (range_end, rtp + 3); + if (range_start <= c && c <= range_end) + return !not; + } + } +#endif /* emacs */ + return not; +} + /* Non-zero if "p1 matches something" implies "p2 fails". 
*/ static int mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, @@ -4725,22 +4758,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1, else if ((re_opcode_t) *p1 == charset || (re_opcode_t) *p1 == charset_not) { - int not = (re_opcode_t) *p1 == charset_not; - - /* Test if C is listed in charset (or charset_not) - at `p1'. */ - if (! multibyte || IS_REAL_ASCII (c)) - { - if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH - && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) - not = !not; - } - else if (CHARSET_RANGE_TABLE_EXISTS_P (p1)) - CHARSET_LOOKUP_RANGE_TABLE (not, c, p1); - - /* `not' is equal to 1 if c would match, which means - that we can't change to pop_failure_jump. */ - if (!not) + if (!execute_charset (&p1, c, c, !multibyte || IS_REAL_ASCII (c))) { DEBUG_PRINT (" No match => fast loop.\n"); return 1; @@ -5140,8 +5158,6 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, if (p == pend) { - ptrdiff_t dcnt; - /* End of pattern means we might have succeeded. */ DEBUG_PRINT ("end of pattern ... "); @@ -5149,19 +5165,22 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, longest match, try backtracking. */ if (d != end_match_2) { - /* 1 if this match ends in the same string (string1 or string2) - as the best previous match. */ - boolean same_str_p = (FIRST_STRING_P (match_end) - == FIRST_STRING_P (d)); - /* 1 if this match is the best seen so far. */ - boolean best_match_p; - - /* AIX compiler got confused when this was combined - with the previous declaration. */ - if (same_str_p) - best_match_p = d > match_end; - else - best_match_p = !FIRST_STRING_P (d); + /* True if this match is the best seen so far. */ + bool best_match_p; + + { + /* True if this match ends in the same string (string1 + or string2) as the best previous match. 
*/ + bool same_str_p = (FIRST_STRING_P (match_end) + == FIRST_STRING_P (d)); + + /* AIX compiler got confused when this was combined + with the previous declaration. */ + if (same_str_p) + best_match_p = d > match_end; + else + best_match_p = !FIRST_STRING_P (d); + } DEBUG_PRINT ("backtracking.\n"); @@ -5290,7 +5309,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, nfailure_points_pushed - nfailure_points_popped); DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed); - dcnt = POINTER_TO_OFFSET (d) - pos; + ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos; DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt); @@ -5421,6 +5440,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, { int buf_charlen; re_wchar_t buf_ch; + reg_syntax_t syntax; DEBUG_PRINT ("EXECUTING anychar.\n"); @@ -5429,10 +5449,14 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, target_multibyte); buf_ch = TRANSLATE (buf_ch); - if ((!(bufp->syntax & RE_DOT_NEWLINE) - && buf_ch == '\n') - || ((bufp->syntax & RE_DOT_NOT_NULL) - && buf_ch == '\000')) +#ifdef emacs + syntax = RE_SYNTAX_EMACS; +#else + syntax = bufp->syntax; +#endif + + if ((!(syntax & RE_DOT_NEWLINE) && buf_ch == '\n') + || ((syntax & RE_DOT_NOT_NULL) && buf_ch == '\000')) goto fail; DEBUG_PRINT (" Matched \"%d\".\n", *d); @@ -5445,32 +5469,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, case charset_not: { register unsigned int c, corig; - boolean not = (re_opcode_t) *(p - 1) == charset_not; int len; - /* Start of actual range_table, or end of bitmap if there is no - range table. */ - re_char *range_table IF_LINT (= NULL); - - /* Nonzero if there is a range table. */ - int range_table_exists; - - /* Number of ranges of range table. This is not included - in the initial byte-length of the command. */ - int count = 0; - /* Whether matching against a unibyte character. 
*/ boolean unibyte_char = false; - DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : ""); - - range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]); - - if (range_table_exists) - { - range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */ - EXTRACT_NUMBER_AND_INCR (count, range_table); - } + DEBUG_PRINT ("EXECUTING charset%s.\n", + (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); PREFETCH (); corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); @@ -5504,47 +5509,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, unibyte_char = true; } - if (unibyte_char && c < (1 << BYTEWIDTH)) - { /* Lookup bitmap. */ - /* Cast to `unsigned' instead of `unsigned char' in - case the bit list is a full 32 bytes long. */ - if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH) - && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) - not = !not; - } -#ifdef emacs - else if (range_table_exists) - { - int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); - - if ( (class_bits & BIT_LOWER - && (ISLOWER (c) - || (corig != c - && c == upcase (corig) && ISUPPER(c)))) - | (class_bits & BIT_MULTIBYTE) - | (class_bits & BIT_PUNCT && ISPUNCT (c)) - | (class_bits & BIT_SPACE && ISSPACE (c)) - | (class_bits & BIT_UPPER - && (ISUPPER (c) - || (corig != c - && c == downcase (corig) && ISLOWER (c)))) - | (class_bits & BIT_WORD && ISWORD (c)) - | (class_bits & BIT_ALPHA && ISALPHA (c)) - | (class_bits & BIT_ALNUM && ISALNUM (c)) - | (class_bits & BIT_GRAPH && ISGRAPH (c)) - | (class_bits & BIT_PRINT && ISPRINT (c))) - not = !not; - else - CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); - } -#endif /* emacs */ - - if (range_table_exists) - p = CHARSET_RANGE_TABLE_END (range_table, count); - else - p += CHARSET_BITMAP_SIZE (&p[-1]) + 1; - - if (!not) goto fail; + p -= 1; + if (!execute_charset (&p, c, corig, unibyte_char)) + goto fail; d += len; } @@ -6179,24 +6146,12 @@ re_match_2_internal (struct re_pattern_buffer 
*bufp, const_re_char *string1, break; #ifdef emacs - case before_dot: - DEBUG_PRINT ("EXECUTING before_dot.\n"); - if (PTR_BYTE_POS (d) >= PT_BYTE) - goto fail; - break; - case at_dot: DEBUG_PRINT ("EXECUTING at_dot.\n"); if (PTR_BYTE_POS (d) != PT_BYTE) goto fail; break; - case after_dot: - DEBUG_PRINT ("EXECUTING after_dot.\n"); - if (PTR_BYTE_POS (d) <= PT_BYTE) - goto fail; - break; - case categoryspec: case notcategoryspec: { @@ -6328,6 +6283,9 @@ bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len, const char * re_compile_pattern (const char *pattern, size_t length, +#ifdef emacs + bool posix_backtracking, const char *whitespace_regexp, +#endif struct re_pattern_buffer *bufp) { reg_errcode_t ret; @@ -6341,7 +6299,14 @@ re_compile_pattern (const char *pattern, size_t length, setting no_sub. */ bufp->no_sub = 0; - ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp); + ret = regex_compile ((re_char*) pattern, length, +#ifdef emacs + posix_backtracking, + whitespace_regexp, +#else + re_syntax_options, +#endif + bufp); if (!ret) return NULL; |