summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Stallman <rms@gnu.org>1998-04-03 07:33:13 +0000
committerRichard Stallman <rms@gnu.org>1998-04-03 07:33:13 +0000
commit42f97e3f0c576afb94226c8493f3567b44352ca8 (patch)
tree0abce6ed6f0d86b4ddfc12a9ce4dde7f406e185f
parenteb3ea5be51b785e310baa910622f3386f0eb0dae (diff)
downloadgnulib-42f97e3f0c576afb94226c8493f3567b44352ca8.tar.gz
(compile_range): Unused function deleted.
(regex_compile): Special handling for range \177-\377. (regex_compile): Cast args to TRANSLATE to unsigned char. (re_search_2): Fix forward scan handling multibyte. Recognize that nonascii characters are not in the fastmap. Handle fetching multibyte characters for backward scan. (re_match_2_internal): Handle multibyte and translation in exactn and anychar. (bcmp_translate): Handle multibyte chars for translation. (TRANSLATE): Don't cast to unsigned char. (PATFETCH): Use RE_TRANSLATE to translate.
-rw-r--r--regex.c230
1 files changed, 133 insertions, 97 deletions
diff --git a/regex.c b/regex.c
index a997402a15..a26c0f57a6 100644
--- a/regex.c
+++ b/regex.c
@@ -168,7 +168,7 @@ init_syntax_once ()
#define SYNTAX(c) re_syntax_table[c]
-/* Dummy macro for non emacs environments. */
+/* Dummy macros for non-Emacs environments. */
#define BASE_LEADING_CODE_P(c) (0)
#define WORD_BOUNDARY_P(c1, c2) (0)
#define CHAR_HEAD_P(p) (1)
@@ -1539,7 +1539,7 @@ static reg_errcode_t compile_range ();
#define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
- if (translate) c = (unsigned char) translate[c]; \
+ if (translate) c = RE_TRANSLATE (translate, c); \
} while (0)
#endif
@@ -1560,7 +1560,7 @@ static reg_errcode_t compile_range ();
when we use a character as a subscript we must make it unsigned. */
#ifndef TRANSLATE
#define TRANSLATE(d) \
- (translate ? (unsigned char) RE_TRANSLATE (translate, (unsigned char) (d)) : (d))
+ (translate ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
#endif
@@ -2107,9 +2107,10 @@ regex_compile (pattern, size, syntax, bufp)
incremented `p', by the way, to be the character after
the `*'. Do we have to do something analogous here
for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
+ if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
&& zero_times_ok
- && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
+ && p < pend
+ && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
&& !(syntax & RE_DOT_NEWLINE))
{ /* We have .*\n. */
STORE_JUMP (jump, b, laststart);
@@ -2333,7 +2334,18 @@ regex_compile (pattern, size, syntax, bufp)
p += len;
}
- if (!SAME_CHARSET_P (c, c1))
+ if (SINGLE_BYTE_CHAR_P (c)
+ && ! SINGLE_BYTE_CHAR_P (c1))
+ {
+ /* Handle a range such as \177-\377 in multibyte mode.
+ Split that into two ranges,
+ the low one ending at 0237, and the high one
+ starting at ...040. */
+ int c1_base = (c1 & ~0177) | 040;
+ SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
+ c1 = 0237;
+ }
+ else if (!SAME_CHARSET_P (c, c1))
FREE_STACK_RETURN (REG_ERANGE);
}
else
@@ -2359,8 +2371,8 @@ regex_compile (pattern, size, syntax, bufp)
for (this_char = range_start; this_char <= range_end;
this_char++)
SET_LIST_BIT (TRANSLATE (this_char));
+ }
}
- }
else
/* ... into range table. */
SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
@@ -2913,8 +2925,8 @@ regex_compile (pattern, size, syntax, bufp)
/* Here, C may have been translated, so C may not be equal to *P1. */
while (1)
{
- BUF_PUSH (c);
- (*pending_exact)++;
+ BUF_PUSH (c);
+ (*pending_exact)++;
if (++p1 == p)
break;
@@ -3121,64 +3133,6 @@ group_in_compile_stack (compile_stack, regnum)
return false;
}
-
-
-/* Read the ending character of a range (in a bracket expression) from the
- uncompiled pattern *P_PTR (which ends at PEND). We assume the
- starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
- Then we set the translation of all bits between the starting and
- ending characters (inclusive) in the compiled pattern B.
-
- Return an error code.
-
- We use these short variable names so we can use the same macros as
- `regex_compile' itself. */
-
-static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
-{
- unsigned this_char;
-
- const char *p = *p_ptr;
- int range_start, range_end;
-
- if (p == pend)
- return REG_ERANGE;
-
- /* Even though the pattern is a signed `char *', we need to fetch
- with unsigned char *'s; if the high bit of the pattern character
- is set, the range endpoints will be negative if we fetch using a
- signed char *.
-
- We also want to fetch the endpoints without translating them; the
- appropriate translation is done in the bit-setting loop below. */
- /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
- range_start = ((const unsigned char *) p)[-2];
- range_end = ((const unsigned char *) p)[0];
-
- /* Have to increment the pointer into the pattern string, so the
- caller isn't still at the ending character. */
- (*p_ptr)++;
-
- /* If the start is after the end, the range is empty. */
- if (range_start > range_end)
- return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
-
- /* Here we see why `this_char' has to be larger than an `unsigned
- char' -- the range is inclusive, so if `range_end' == 0xff
- (assuming 8-bit characters), we would otherwise go into an infinite
- loop, since all characters <= 0xff. */
- for (this_char = range_start; this_char <= range_end; this_char++)
- {
- SET_LIST_BIT (TRANSLATE (this_char));
- }
-
- return REG_NOERROR;
-}
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -3812,24 +3766,45 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
the first null string. */
if (fastmap && startpos < total_size && !bufp->can_be_null)
{
+ register const char *d;
+ register unsigned int buf_ch;
+
+ d = POS_ADDR_VSTRING (startpos);
+
if (range > 0) /* Searching forwards. */
{
- register const char *d;
register int lim = 0;
int irange = range;
if (startpos < size1 && startpos + range >= size1)
lim = range - (size1 - startpos);
- d = POS_ADDR_VSTRING (startpos);
-
/* Written out as an if-else to avoid testing `translate'
inside the loop. */
if (translate)
- while (range > lim
- && !fastmap[(unsigned char)
- RE_TRANSLATE (translate, (unsigned char) *d++)])
- range--;
+ {
+ if (multibyte)
+ while (range > lim)
+ {
+ int buf_charlen;
+
+ buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+ buf_charlen);
+
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
+ if (buf_ch >= 0400
+ || fastmap[buf_ch])
+ break;
+
+ range -= buf_charlen;
+ d += buf_charlen;
+ }
+ else
+ while (range > lim
+ && !fastmap[(unsigned char)
+ RE_TRANSLATE (translate, (unsigned char) *d++)])
+ range--;
+ }
else
while (range > lim && !fastmap[(unsigned char) *d++])
range--;
@@ -3838,11 +3813,16 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ int room = (size1 == 0 || startpos >= size1
+ ? size2 + size1 - startpos
+ : size1 - startpos);
+
+ buf_ch = STRING_CHAR (d, room);
+ if (translate)
+ buf_ch = RE_TRANSLATE (translate, buf_ch);
- if (!fastmap[(unsigned char) TRANSLATE (c)])
+ if (! (buf_ch >= 0400
+ || fastmap[buf_ch]))
goto advance;
}
}
@@ -4515,14 +4495,36 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
testing `translate' inside the loop. */
if (translate)
{
- do
- {
- PREFETCH ();
- if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++)
- != (unsigned char) *p++)
- goto fail;
- }
- while (--mcnt);
+#ifdef emacs
+ if (multibyte)
+ do
+ {
+ int pat_charlen, buf_charlen;
+ int pat_ch, buf_ch;
+
+ PREFETCH ();
+ pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+
+ if (RE_TRANSLATE (translate, buf_ch)
+ != pat_ch)
+ goto fail;
+
+ p += pat_charlen;
+ d += buf_charlen;
+ mcnt -= pat_charlen;
+ }
+ while (mcnt > 0);
+ else
+#endif /* emacs */
+ do
+ {
+ PREFETCH ();
+ if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d++)
+ != (unsigned char) *p++)
+ goto fail;
+ }
+ while (--mcnt);
}
else
{
@@ -4539,17 +4541,36 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Match any character except possibly a newline or a null. */
case anychar:
- DEBUG_PRINT1 ("EXECUTING anychar.\n");
+ {
+ int buf_charlen;
+ int buf_ch;
- PREFETCH ();
+ DEBUG_PRINT1 ("EXECUTING anychar.\n");
- if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
- || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
- goto fail;
+ PREFETCH ();
- SET_REGS_MATCHED ();
- DEBUG_PRINT2 (" Matched `%d'.\n", *d);
- d += multibyte ? MULTIBYTE_FORM_LENGTH (d, dend - d) : 1;
+#ifdef emacs
+ if (multibyte)
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+ else
+#endif /* emacs */
+ {
+ buf_ch = *d;
+ buf_charlen = 1;
+ }
+
+ buf_ch = TRANSLATE (buf_ch);
+
+ if ((!(bufp->syntax & RE_DOT_NEWLINE)
+ && buf_ch == '\n')
+ || ((bufp->syntax & RE_DOT_NOT_NULL)
+ && buf_ch == '\000'))
+ goto fail;
+
+ SET_REGS_MATCHED ();
+ DEBUG_PRINT2 (" Matched `%d'.\n", *d);
+ d += buf_charlen;
+ }
break;
@@ -5926,12 +5947,27 @@ bcmp_translate (s1, s2, len, translate)
RE_TRANSLATE_TYPE translate;
{
register unsigned char *p1 = s1, *p2 = s2;
- while (len)
+ unsigned char *p1_end = s1 + len;
+ unsigned char *p2_end = s2 + len;
+
+ while (p1 != p1_end && p2 != p2_end)
{
- if (RE_TRANSLATE (translate, *p1++) != RE_TRANSLATE (translate, *p2++))
+ int p1_charlen, p2_charlen;
+ int p1_ch, p2_ch;
+
+ p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
+ p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+
+ if (RE_TRANSLATE (translate, p1_ch)
+ != RE_TRANSLATE (translate, p2_ch))
return 1;
- len--;
+
+ p1 += p1_charlen, p2 += p2_charlen;
}
+
+ if (p1 != p1_end || p2 != p2_end)
+ return 1;
+
return 0;
}