diff options
Diffstat (limited to 'gettext-tools/libgrep/m-regex.c')
-rw-r--r-- | gettext-tools/libgrep/m-regex.c | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/gettext-tools/libgrep/m-regex.c b/gettext-tools/libgrep/m-regex.c new file mode 100644 index 0000000..5444978 --- /dev/null +++ b/gettext-tools/libgrep/m-regex.c @@ -0,0 +1,272 @@ +/* Pattern Matchers for Regular Expressions. + Copyright (C) 1992, 1998, 2000, 2005-2006, 2010 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +/* Specification. */ +#include "libgrep.h" + +#include <ctype.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <regex.h> + +#include "error.h" +#include "exitfail.h" +#include "xalloc.h" + +#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) +# define IN_CTYPE_DOMAIN(c) 1 +#else +# define IN_CTYPE_DOMAIN(c) isascii(c) +#endif +#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C)) +#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') + +struct patterns +{ + /* Regex compiled regexp. */ + struct re_pattern_buffer regexbuf; + struct re_registers regs; /* This is here on account of a BRAIN-DEAD + Q@#%!# library interface in regex.c. */ +}; + +struct compiled_regex { + bool match_words; + bool match_lines; + char eolbyte; + + /* The Regex compiled patterns. */ + struct patterns *patterns; + size_t pcount; +}; + +static void * +compile (const char *pattern, size_t pattern_size, + bool match_icase, bool match_words, bool match_lines, char eolbyte, + reg_syntax_t syntax) +{ + struct compiled_regex *cregex; + + cregex = XMALLOC (struct compiled_regex); + memset (cregex, '\0', sizeof (struct compiled_regex)); + cregex->match_words = match_words; + cregex->match_lines = match_lines; + cregex->eolbyte = eolbyte; + cregex->patterns = NULL; + cregex->pcount = 0; + + re_set_syntax (syntax); + + /* For GNU regex compiler we have to pass the patterns separately to detect + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" + GNU regex should have raised a syntax error. The same for backref, where + the backref should have been local to each pattern. */ + { + const char *sep; + size_t total = pattern_size; + const char *motif = pattern; + + do + { + size_t len; + const char *err; + + sep = (const char *) memchr (motif, '\n', total); + if (sep) + { + len = sep - motif; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns)); + memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns)); + + if ((err = re_compile_pattern (motif, len, + &cregex->patterns[cregex->pcount].regexbuf)) != NULL) + error (exit_failure, 0, "%s", err); + cregex->pcount++; + + motif = sep; + } + while (sep && total != 0); + } + + return cregex; +} + +static void * +Gcompile (const char *pattern, size_t pattern_size, + bool match_icase, bool match_words, bool match_lines, char eolbyte) +{ + return compile (pattern, pattern_size, + match_icase, match_words, match_lines, eolbyte, + RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); +} + +static void * +Ecompile (const char *pattern, size_t pattern_size, + bool match_icase, bool match_words, bool match_lines, char eolbyte) +{ + return compile (pattern, pattern_size, + match_icase, match_words, match_lines, eolbyte, + RE_SYNTAX_POSIX_EGREP); +} + +static void * +AWKcompile (const char *pattern, size_t pattern_size, + bool match_icase, bool match_words, bool match_lines, char eolbyte) +{ + return compile (pattern, pattern_size, + match_icase, match_words, match_lines, eolbyte, + RE_SYNTAX_AWK); +} + +static size_t +EGexecute (const void *compiled_pattern, + const char *buf, size_t buf_size, + size_t *match_size, bool exact) +{ + struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; + char eol = cregex->eolbyte; + register const char *buflim = buf + buf_size; + register const char *beg; + register const char *end; + + for (beg = buf; beg < buflim; beg = end) + { + size_t i; + + end = (const char *) memchr (beg, eol, buflim - beg); + if (end == NULL) + end = buflim; + /* Here, either end < buflim && *end == eol, or end == buflim. */ + + for (i = 0; i < cregex->pcount; i++) + { + int start, len; + + cregex->patterns[i].regexbuf.not_eol = 0; + if (0 <= (start = re_search (&cregex->patterns[i].regexbuf, beg, + end - beg, 0, + end - beg, &cregex->patterns[i].regs))) + { + len = cregex->patterns[i].regs.end[0] - start; + if (exact) + { + *match_size = len; + return start; + } + if (cregex->match_lines) + { + if (len == end - beg) /* implies start == 0 */ + goto success; + } + else if (cregex->match_words) + { + /* If -w, check if the match aligns with word boundaries. + We do this iteratively because: + (a) the line may contain more than one occurence of the + pattern, and + (b) Several alternatives in the pattern might be valid at + a given point, and we may need to consider a shorter + one to find a word boundary. */ + while (start >= 0) + { + if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1])) + && (start + len == end - beg + || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len]))) + goto success; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + cregex->patterns[i].regexbuf.not_eol = 1; + len = re_match (&cregex->patterns[i].regexbuf, beg, + start + len, start, + &cregex->patterns[i].regs); + } + if (len <= 0) + { + /* Try looking further on. */ + if (start == end - beg) + break; + ++start; + cregex->patterns[i].regexbuf.not_eol = 0; + start = re_search (&cregex->patterns[i].regexbuf, beg, + end - beg, + start, end - beg - start, + &cregex->patterns[i].regs); + len = cregex->patterns[i].regs.end[0] - start; + } + } + } + else + goto success; + } + } + + if (end < buflim) + end++; + } + return (size_t) -1; + + success: + *match_size = end - beg; + return beg - buf; +} + +static void +EGfree (void *compiled_pattern) +{ + struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern; + + free (cregex->patterns); + free (cregex); +} + +/* POSIX Basic Regular Expressions */ +matcher_t matcher_grep = + { + Gcompile, + EGexecute, + EGfree + }; + +/* POSIX Extended Regular Expressions */ +matcher_t matcher_egrep = + { + Ecompile, + EGexecute, + EGfree + }; + +/* AWK Regular Expressions */ +matcher_t matcher_awk = + { + AWKcompile, + EGexecute, + EGfree + }; |