summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim Meyering <jim@meyering.net>1992-12-10 19:08:15 +0000
committerJim Meyering <jim@meyering.net>1992-12-10 19:08:15 +0000
commitcd8263cefca26ed1f54316a259a9368baef74542 (patch)
tree4aa333b039cadbbd141712ae6d665a777a83da1e
parenta07ae8bd0769097e10fa627c1f0a5da9db1995de (diff)
downloadgnulib-cd8263cefca26ed1f54316a259a9368baef74542.tar.gz
GNU text utilitiesTEXTUTILS-1_3_6
-rw-r--r--lib/regex.c177
-rw-r--r--lib/regex.h28
2 files changed, 123 insertions, 82 deletions
diff --git a/lib/regex.c b/lib/regex.c
index 71aa4cc87e..a5594be55e 100644
--- a/lib/regex.c
+++ b/lib/regex.c
@@ -29,11 +29,14 @@
/* We need this for `regex.h', and perhaps for the Emacs include files. */
#include <sys/types.h>
+#if defined (HAVE_CONFIG_H) || defined (emacs)
+#include "config.h"
+#endif
+
/* The `emacs' switch turns on certain matching commands
that make sense only in Emacs. */
#ifdef emacs
-#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
@@ -45,11 +48,17 @@
/* We used to test for `BSTRING' here, but only GCC and Emacs define
`BSTRING', as far as I know, and neither of them use this code. */
-#if USG || STDC_HEADERS
+#if HAVE_STRING_H || STDC_HEADERS
#include <string.h>
+#ifndef bcmp
#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
+#endif
+#ifndef bcopy
#define bcopy(s, d, n) memcpy ((d), (s), (n))
+#endif
+#ifndef bzero
#define bzero(s, n) memset ((s), 0, (n))
+#endif
#else
#include <strings.h>
#endif
@@ -135,12 +144,8 @@ init_syntax_once ()
(Per Bothner suggested the basic approach.) */
#undef SIGN_EXTEND_CHAR
#if __STDC__
-#ifndef VMS
#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
-#else /* On VMS, VAXC doesn't recognize `signed' before `char' */
-#define SIGN_EXTEND_CHAR(c) ((char) (c))
-#endif /* VMS */
-#else
+#else /* not __STDC__ */
/* As in Harbison and Steele. */
#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
#endif
@@ -447,6 +452,7 @@ static int debug = 0;
#define DEBUG_PRINT1(x) if (debug) printf (x)
#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
if (debug) print_partial_compiled_pattern (s, e)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
@@ -760,6 +766,7 @@ print_double_string (where, string1, size1, string2, size2)
#define DEBUG_PRINT1(x)
#define DEBUG_PRINT2(x1, x2)
#define DEBUG_PRINT3(x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
@@ -1025,9 +1032,9 @@ typedef struct
`buffer' is the compiled pattern;
`syntax' is set to SYNTAX;
`used' is set to the length of the compiled pattern;
- `fastmap_accurate' is set to zero;
- `re_nsub' is set to the number of groups in PATTERN;
- `not_bol' and `not_eol' are set to zero.
+ `fastmap_accurate' is zero;
+ `re_nsub' is the number of subexpressions in PATTERN;
+ `not_bol' and `not_eol' are zero;
The `fastmap' and `newline_anchor' fields are neither
examined nor set. */
@@ -1676,10 +1683,10 @@ regex_compile (pattern, size, syntax, bufp)
| v | v
a | b | c
- If we are at `b,' then fixup_alt_jump right now points to a
- three-byte space after `a.' We'll put in the jump, set
- fixup_alt_jump to right after `b,' and leave behind three
- bytes which we'll fill in when we get to after `c.' */
+ If we are at `b', then fixup_alt_jump right now points to a
+ three-byte space after `a'. We'll put in the jump, set
+ fixup_alt_jump to right after `b', and leave behind three
+ bytes which we'll fill in when we get to after `c'. */
if (fixup_alt_jump)
STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
@@ -2320,6 +2327,7 @@ typedef struct
int this_reg; \
\
DEBUG_STATEMENT (failure_id++); \
+ DEBUG_STATEMENT (nfailure_points_pushed++); \
DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
@@ -2473,6 +2481,8 @@ typedef struct
regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
} \
+ \
+ DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
@@ -2860,15 +2870,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
else if (endpos > total_size)
range = total_size - startpos;
- /* Update the fastmap now if not correct already. */
- if (fastmap && !bufp->fastmap_accurate)
- if (re_compile_fastmap (bufp) == -2)
- return -2;
-
/* If the search isn't to be a backwards one, don't waste time in a
- long search for a pattern that says it is anchored. */
- if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
- && range > 0)
+ search for a pattern that must be anchored. */
+ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
{
if (startpos > 0)
return -1;
@@ -2876,6 +2880,12 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
range = 1;
}
+ /* Update the fastmap now if not correct already. */
+ if (fastmap && !bufp->fastmap_accurate)
+ if (re_compile_fastmap (bufp) == -2)
+ return -2;
+
+ /* Loop through the string, looking for a place to start matching. */
for (;;)
{
/* If a fastmap is supplied, skip quickly over characters that
@@ -2913,7 +2923,7 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
? string2[startpos - size1]
: string1[startpos]);
- if (!fastmap[TRANSLATE (c)])
+ if (!fastmap[(unsigned char) TRANSLATE (c)])
goto advance;
}
}
@@ -2987,12 +2997,9 @@ typedef union
#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
-/* Call this when have matched something; it sets `matched' flags for the
- registers corresponding to the group of which we currently are inside.
- Also records whether this group ever matched something. We only care
- about this information at `stop_memory', and then only about the
- previous time through the loop (if the group is starred or whatever).
- So it is ok to clear all the nonactive registers here. */
+/* Call this when have matched a real character; it sets `matched' flags
+ for the subexpressions which we are currently inside. Also records
+ that those subexprs have matched. */
#define SET_REGS_MATCHED() \
do \
{ \
@@ -3037,24 +3044,24 @@ typedef union
/* Test if at very beginning or at very end of the virtual concatenation
of `string1' and `string2'. If only one string, it's `string2'. */
-#define AT_STRINGS_BEG() (d == (size1 ? string1 : string2) || !size2)
-#define AT_STRINGS_END() (d == end2)
+#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
+#define AT_STRINGS_END(d) ((d) == end2)
/* Test if D points to a character which is word-constituent. We have
two special cases to check for: if past the end of string1, look at
the first character in string2; and if before the beginning of
- string2, look at the last character in string1.
-
- Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG (). */
-#define LETTER_P(d) \
+ string2, look at the last character in string1. */
+#define WORDCHAR_P(d) \
(SYNTAX ((d) == end1 ? *string2 \
- : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == Sword)
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
+ == Sword)
/* Test if the character before D and the one at D differ with respect
to being word-constituent. */
#define AT_WORD_BOUNDARY(d) \
- (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d - 1) != LETTER_P (d))
+ (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
+ || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
/* Free everything we malloc. */
@@ -3161,6 +3168,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
fail_stack_type fail_stack;
#ifdef DEBUG
static unsigned failure_id = 0;
+ unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
#endif
/* We fill all the registers internally, independent of what we
@@ -3254,8 +3262,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
else
{
/* We must initialize all our variables to NULL, so that
- `FREE_VARIABLES' doesn't try to free them. Too bad this isn't
- Lisp, so we could have a list of variables. As it is, */
+ `FREE_VARIABLES' doesn't try to free them. */
regstart = regend = old_regstart = old_regend = best_regstart
= best_regend = reg_dummy = NULL;
reg_info = reg_info_dummy = (register_info_type *) NULL;
@@ -3339,8 +3346,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if (p == pend)
{ /* End of pattern means we might have succeeded. */
- DEBUG_PRINT1 ("End of pattern: ");
- /* If not end of string, try backtracking. Otherwise done. */
+ DEBUG_PRINT1 ("end of pattern ... ");
+
+ /* If we haven't matched the entire string, and we want the
+ longest match, try backtracking. */
if (d != end_match_2)
{
DEBUG_PRINT1 ("backtracking.\n");
@@ -3378,6 +3387,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
For example, the pattern `x.*y.*z' against the
strings `x-' and `y-z-', if the two strings are
not consecutive in memory. */
+ DEBUG_PRINT1 ("Restoring best registers.\n");
+
d = match_end;
dend = ((d >= string1 && d <= end1)
? end_match_1 : end_match_2);
@@ -3390,7 +3401,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
}
} /* d != end_match_2 */
- DEBUG_PRINT1 ("\nAccepting match.\n");
+ DEBUG_PRINT1 ("Accepting match.\n");
/* If caller wants register contents data back, do it. */
if (regs && !bufp->no_sub)
@@ -3456,7 +3467,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
} /* regs && !bufp->no_sub */
FREE_VARIABLES ();
- DEBUG_PRINT2 ("%d registers pushed.\n", num_regs_pushed);
+ DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
+ nfailure_points_pushed, nfailure_points_popped,
+ nfailure_points_pushed - nfailure_points_popped);
+ DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
? string1
@@ -3658,7 +3672,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* If just failed to match something this time around with a
group that's operated on by a repetition operator, try to
- force exit from the ``loop,'' and restore the register
+ force exit from the ``loop'', and restore the register
information for this group that we had before trying this
last match. */
if ((!MATCHED_SOMETHING (reg_info[*p])
@@ -3802,7 +3816,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case begline:
DEBUG_PRINT1 ("EXECUTING begline.\n");
- if (AT_STRINGS_BEG ())
+ if (AT_STRINGS_BEG (d))
{
if (!bufp->not_bol) break;
}
@@ -3818,7 +3832,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case endline:
DEBUG_PRINT1 ("EXECUTING endline.\n");
- if (AT_STRINGS_END ())
+ if (AT_STRINGS_END (d))
{
if (!bufp->not_eol) break;
}
@@ -3835,7 +3849,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Match at the very beginning of the data. */
case begbuf:
DEBUG_PRINT1 ("EXECUTING begbuf.\n");
- if (AT_STRINGS_BEG ())
+ if (AT_STRINGS_BEG (d))
break;
goto fail;
@@ -3843,7 +3857,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Match at the very end of the data. */
case endbuf:
DEBUG_PRINT1 ("EXECUTING endbuf.\n");
- if (AT_STRINGS_END ())
+ if (AT_STRINGS_END (d))
break;
goto fail;
@@ -3897,7 +3911,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
the original * applied to a group), save the information
for that group and all inner ones, so that if we fail back
to this point, the group's information will be correct.
- For example, in \(a*\)*\1, we only need the preceding group,
+ For example, in \(a*\)*\1, we need the preceding group,
and in \(\(a*\)b*\)\2, we need the inner group. */
/* We can't use `p' to check ahead because we push
@@ -3927,8 +3941,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
break;
- /* A smart repeat ends with a maybe_pop_jump.
- We change it either to a pop_failure_jump or a jump. */
+ /* A smart repeat ends with `maybe_pop_jump'.
+ We change it to either `pop_failure_jump' or `jump'. */
case maybe_pop_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
@@ -3956,10 +3970,21 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* If we're at the end of the pattern, we can change. */
if (p2 == pend)
- {
+ { /* But if we're also at the end of the string, we might
+ as well skip changing anything. For example, in `a+'
+ against `a', we'll have already matched the `a', and
+ I don't see the the point of changing the opcode,
+ popping the failure point, finding out it fails, and
+ then going into our endgame. */
+ if (d == dend)
+ {
+ p = pend;
+ DEBUG_PRINT1 (" End of pattern & string => done.\n");
+ continue;
+ }
+
p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1
- (" End of pattern: change to `pop_failure_jump'.\n");
+ DEBUG_PRINT1 (" End of pattern => pop_failure_jump.\n");
}
else if ((re_opcode_t) *p2 == exactn
@@ -3973,7 +3998,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
to the `maybe_finalize_jump' of this case. Examine what
follows. */
if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
- p[-3] = (unsigned char) pop_failure_jump;
+ {
+ p[-3] = (unsigned char) pop_failure_jump;
+ DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
+ c, p1[5]);
+ }
+
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
@@ -3988,9 +4018,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if (!not)
{
p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1
- (" No match: change to `pop_failure_jump'.\n");
-
+ DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
}
@@ -3999,6 +4027,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
p[-1] = (unsigned char) jump;
+ DEBUG_PRINT1 (" Match => jump.\n");
goto unconditional_jump;
}
/* Note fall through. */
@@ -4060,7 +4089,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* At the end of an alternative, we need to push a dummy failure
- point in case we are followed by a pop_failure_jump', because
+ point in case we are followed by a `pop_failure_jump', because
we don't want the failure point for the alternative to be
popped. For example, matching `(a|ab)*' against `aab'
requires that we match the `ab' alternative. */
@@ -4137,14 +4166,14 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case wordbeg:
DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
- if (LETTER_P (d) && (AT_STRINGS_BEG () || !LETTER_P (d - 1)))
+ if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
break;
goto fail;
case wordend:
DEBUG_PRINT1 ("EXECUTING wordend.\n");
- if (!AT_STRINGS_BEG () && LETTER_P (d - 1)
- && (!LETTER_P (d) || AT_STRINGS_END ()))
+ if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
+ && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
break;
goto fail;
@@ -4181,11 +4210,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
goto matchsyntax;
case wordchar:
- DEBUG_PRINT1 ("EXECUTING wordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
mcnt = (int) Sword;
matchsyntax:
PREFETCH ();
- if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
+ if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
+ goto fail;
SET_REGS_MATCHED ();
break;
@@ -4195,11 +4225,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
goto matchnotsyntax;
case notwordchar:
- DEBUG_PRINT1 ("EXECUTING notwordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
mcnt = (int) Sword;
- matchnotsyntax: /* We goto here from notsyntaxspec. */
+ matchnotsyntax:
PREFETCH ();
- if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
+ if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
+ goto fail;
SET_REGS_MATCHED ();
break;
@@ -4207,17 +4238,19 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case wordchar:
DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
PREFETCH ();
- if (!LETTER_P (d))
+ if (!WORDCHAR_P (d))
goto fail;
SET_REGS_MATCHED ();
+ d++;
break;
case notwordchar:
DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
PREFETCH ();
- if (LETTER_P (d))
+ if (WORDCHAR_P (d))
goto fail;
SET_REGS_MATCHED ();
+ d++;
break;
#endif /* not emacs */
@@ -4684,10 +4717,12 @@ regcomp (preg, pattern, cflags)
{
reg_errcode_t ret;
unsigned syntax
- = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
+ = (cflags & REG_EXTENDED) ?
+ RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
/* regex_compile will allocate the space for the compiled pattern. */
preg->buffer = 0;
+ preg->allocated = 0;
/* Don't bother to use a fastmap when searching. This simplifies the
REG_NEWLINE case: if we used a fastmap, we'd have to put all the
@@ -4812,7 +4847,7 @@ regexec (preg, string, nmatch, pmatch, eflags)
/* Returns a message corresponding to an error code, ERRCODE, returned
- from either regcomp or regexec. */
+ from either regcomp or regexec. We don't use PREG here. */
size_t
regerror (errcode, preg, errbuf, errbuf_size)
diff --git a/lib/regex.h b/lib/regex.h
index ef8e9a3669..e38853eaf6 100644
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -20,12 +20,15 @@
#ifndef __REGEXP_LIBRARY_H__
#define __REGEXP_LIBRARY_H__
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+ <regex.h>. */
+
#ifdef VMS
-/* POSIX says that size_t should be in stddef.h. */
+/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
+ should be there. */
#include <stddef.h>
#endif
-/* POSIX says that <sys/types.h> must be included before <regex.h>. */
/* The following bits are used to determine the regexp syntax we
recognize. The set/not-set meanings are chosen so that Emacs syntax
@@ -162,6 +165,9 @@ extern reg_syntax_t re_syntax_options;
#define RE_SYNTAX_POSIX_EGREP \
(RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
+/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
+#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
+
#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
/* Syntax bits common to both basic and extended POSIX regex syntax. */
@@ -316,12 +322,12 @@ struct re_pattern_buffer
#define REGS_FIXED 2
unsigned regs_allocated : 2;
- /* Set to zero when regex_compile compiles a pattern; set to one
- by re_compile_fastmap when it updates the fastmap, if any. */
+ /* Set to zero when `regex_compile' compiles a pattern; set to one
+ by `re_compile_fastmap' if it updates the fastmap. */
unsigned fastmap_accurate : 1;
- /* If set, regexec reports only success or failure and does not
- return anything in pmatch. */
+ /* If set, `re_match_2' does not return information about
+ subexpressions. */
unsigned no_sub : 1;
/* If set, a beginning-of-line anchor doesn't match at the
@@ -383,17 +389,17 @@ typedef struct
unfortunately clutters up the declarations a bit, but I think it's
worth it.
- We also have to undo `const' if we are not ANSI and if it hasn't
- previously being taken care of. */
+ We may also have to undo `const' if we are not ANSI -- but if it has
+ already been defined, as by Autoconf's AC_CONST, don't do anything. */
#if __STDC__
#define _RE_ARGS(args) args
-#else
+#else /* not __STDC__ */
#define _RE_ARGS(args) ()
-#ifndef const
+#if !const && !HAVE_CONST
#define const
#endif
-#endif
+#endif /* not __STDC__ */
/* Sets the current default syntax to SYNTAX, and return the old syntax.
You can also simply assign to the `re_syntax_options' variable. */