summaryrefslogtreecommitdiff
path: root/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'regex.c')
-rw-r--r--regex.c2692
1 files changed, 2303 insertions, 389 deletions
diff --git a/regex.c b/regex.c
index 1da69e2c..9ea11811 100644
--- a/regex.c
+++ b/regex.c
@@ -2,7 +2,7 @@
version 0.12.
(Implements POSIX draft P1003.2/D11.2, except for some of the
internationalization features.)
- Copyright (C) 1993, 94, 95, 96, 97, 98, 99, 2000 Free Software Foundation, Inc.
+ Copyright (C) 1993, 94, 95, 96, 97, 98, 99, 2000, 2002 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -38,6 +38,12 @@
# endif /* GCC. */
#endif /* Not PARAMS. */
+#ifndef INSIDE_RECURSION
+
+#if !defined __GNUC__
+# define __alignof__(type) sizeof(type)
+#endif
+
#if defined STDC_HEADERS && !defined emacs
# include <stddef.h>
#else
@@ -45,7 +51,21 @@
# include <sys/types.h>
#endif
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte string. */
+# define MBS_SUPPORT
+# if !WIDE_CHAR_SUPPORT
+# define WIDE_CHAR_SUPPORT 1
+# endif
+#else
#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
+#endif
+
+/* Activate this if definition of alignof() for non-GNU C is good.
+#if !defined __GNUC__
+# undef MBS_SUPPORT
+#endif
+*/
/* For platform which support the ISO C amendement 1 functionality we
support user defined character classes. */
@@ -80,9 +100,14 @@
#define btowc __btowc
#endif
+/* Don't use gettext if ENABLE_NLS is not defined */
+#ifdef ENABLE_NLS
/* This is for other GNU distributions with internationalized messages. */
-#if HAVE_LIBINTL_H || defined _LIBC
-# include <libintl.h>
+# if HAVE_LIBINTL_H || defined _LIBC
+# include <libintl.h>
+# else
+# define gettext(msgid) (msgid)
+# endif
#else
# define gettext(msgid) (msgid)
#endif
@@ -165,6 +190,14 @@ char *realloc ();
#endif /* not emacs */
+# if defined _LIBC || HAVE_LIMITS_H
+# include <limits.h>
+# endif
+
+# ifndef MB_LEN_MAX
+# define MB_LEN_MAX 1
+# endif
+
/* Get the interface, including the syntax bits. */
#include <regex.h>
@@ -374,12 +407,46 @@ typedef char boolean;
#define false 0
#define true 1
-static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
+static reg_errcode_t byte_regex_compile _RE_ARGS ((const char *pattern, size_t size,
+ reg_syntax_t syntax,
+ struct re_pattern_buffer *bufp));
+
+static int byte_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
const char *string1, int size1,
const char *string2, int size2,
int pos,
struct re_registers *regs,
int stop));
+static int byte_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
+ const char *string1, int size1,
+ const char *string2, int size2,
+ int startpos, int range,
+ struct re_registers *regs, int stop));
+static int byte_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
+
+#ifdef MBS_SUPPORT
+static reg_errcode_t wcs_regex_compile _RE_ARGS ((const char *pattern, size_t size,
+ reg_syntax_t syntax,
+ struct re_pattern_buffer *bufp));
+
+
+static int wcs_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
+ const char *cstring1, int csize1,
+ const char *cstring2, int csize2,
+ int pos,
+ struct re_registers *regs,
+ int stop,
+ wchar_t *string1, int size1,
+ wchar_t *string2, int size2,
+ int *mbs_offset1, int *mbs_offset2));
+static int wcs_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
+ const char *string1, int size1,
+ const char *string2, int size2,
+ int startpos, int range,
+ struct re_registers *regs, int stop));
+static int wcs_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
+#endif
+
/* These are the command codes that appear in compiled regular
expressions. Some opcodes are followed by argument bytes. A
@@ -396,6 +463,11 @@ typedef enum
/* Followed by one byte giving n, then by n literal bytes. */
exactn,
+# ifdef MBS_SUPPORT
+ /* Same as exactn, but contains binary data. */
+ exactn_bin,
+# endif
+
/* Matches any (more or less) character. */
anychar,
@@ -405,6 +477,13 @@ typedef enum
are ordered low-bit-first. A character is in the set if its
bit is 1. A character too large to have a bit in the map is
automatically not in the set. */
+ /* ifdef MBS_SUPPORT, following element is length of character
+ classes, length of collating symbols, length of equivalence
+ classes, length of character ranges, and length of characters.
+ Next, character class element, collating symbols elements,
+ equivalence class elements, range elements, and character
+ elements follow.
+ See regex_compile function. */
charset,
/* Same parameters as charset, but match any character that is
@@ -454,6 +533,7 @@ typedef enum
/* Followed by two-byte relative address of place to resume at
in case of failure. */
+ /* ifdef WCHAR, the size of address is 1. */
on_failure_jump,
/* Like on_failure_jump, but pushes a placeholder instead of the
@@ -462,6 +542,7 @@ typedef enum
/* Throw away latest failure point and then jump to following
two-byte relative address. */
+ /* ifdef WCHAR, the size of address is 1. */
pop_failure_jump,
/* Change to pop_failure_jump if know won't have to backtrack to
@@ -471,6 +552,7 @@ typedef enum
sure that there is no use backtracking out of repetitions
already matched, then we change it to a pop_failure_jump.
Followed by two-byte address. */
+ /* ifdef WCHAR, the size of address is 1. */
maybe_pop_jump,
/* Jump to following two-byte address, and push a dummy failure
@@ -478,6 +560,7 @@ typedef enum
is made to use it for a failure. A `+' construct makes this
before the first repeat. Also used as an intermediary kind
of jump when compiling an alternative. */
+ /* ifdef WCHAR, the size of address is 1. */
dummy_failure_jump,
/* Push a dummy failure point and continue. Used at the end of
@@ -486,15 +569,18 @@ typedef enum
/* Followed by two-byte relative address and two-byte number n.
After matching N times, jump to the address upon failure. */
+ /* ifdef WCHAR, the size of address is 1. */
succeed_n,
/* Followed by two-byte relative address, and two-byte number n.
Jump to the address N times, then fail. */
+ /* ifdef WCHAR, the size of address is 1. */
jump_n,
/* Set the following two-byte relative address to the
subsequent two-byte number. The address *includes* the two
bytes of number. */
+ /* ifdef WCHAR, the size of address is 1. */
set_number_at,
wordchar, /* Matches any word-constituent character. */
@@ -519,51 +605,131 @@ typedef enum
notsyntaxspec
#endif /* emacs */
} re_opcode_t;
+#endif /* not INSIDE_RECURSION */
+
+#ifdef BYTE
+# define CHAR_T char
+# define UCHAR_T unsigned char
+# define COMPILED_BUFFER_VAR bufp->buffer
+/* Offset address size in compiled pattern. */
+# define OFFSET_ADDRESS_SIZE 2
+#ifdef HAVE_STRINGIZE /* if it supports #, it probably supports ## too */
+# define PREFIX(name) byte_##name
+#else
+# define PREFIX(name) byte_/**/name
+#endif
+# define ARG_PREFIX(name) name
+# define PUT_CHAR(c) putchar (c)
+#else
+#ifdef WCHAR
+# define CHAR_T wchar_t
+# define CHAR_T_SIGN (1 << (sizeof(CHAR_T) * 8 - 1))
+# if defined _AIX
+# define WCHAR_T_NEED_SIGNEXTEND 1
+# endif /* _AIX */
+# define UCHAR_T wchar_t
+# define COMPILED_BUFFER_VAR wc_buffer
+/* Offset address size in compiled pattern. */
+# define OFFSET_ADDRESS_SIZE 1
+# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
+#ifdef HAVE_STRINGIZE /* if it supports #, it probably supports ## too */
+# define PREFIX(name) wcs_##name
+#else
+# define PREFIX(name) wcs_/**/name
+#endif
+# define ARG_PREFIX(name) c##name
+/* Should we use wide stream?? */
+# define PUT_CHAR(c) printf ("%C", c);
+# define TRUE 1
+# define FALSE 0
+#else
+# ifdef MBS_SUPPORT
+# define WCHAR
+# define INSIDE_RECURSION
+# include "regex.c"
+# undef INSIDE_RECURSION
+# endif /* MBS_SUPPORT */
+# define BYTE
+# define INSIDE_RECURSION
+# include "regex.c"
+# undef INSIDE_RECURSION
+#endif /* WCHAR */
+#endif /* BYTE */
+
+#ifdef INSIDE_RECURSION
/* Common operations on the compiled pattern. */
/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
+/* ifdef WCHAR, we store NUMBER in 1 element. */
-#define STORE_NUMBER(destination, number) \
+# ifdef WCHAR
+# define STORE_NUMBER(destination, number) \
+ do { \
+ *(destination) = (UCHAR_T)(number); \
+ } while (0)
+# else /* BYTE */
+# define STORE_NUMBER(destination, number) \
do { \
(destination)[0] = (number) & 0377; \
(destination)[1] = (number) >> 8; \
} while (0)
+# endif /* WCHAR */
/* Same as STORE_NUMBER, except increment DESTINATION to
the byte after where the number is stored. Therefore, DESTINATION
must be an lvalue. */
+/* ifdef WCHAR, we store NUMBER in 1 element. */
#define STORE_NUMBER_AND_INCR(destination, number) \
do { \
STORE_NUMBER (destination, number); \
- (destination) += 2; \
+ (destination) += OFFSET_ADDRESS_SIZE; \
} while (0)
/* Put into DESTINATION a number stored in two contiguous bytes starting
at SOURCE. */
+/* ifdef WCHAR, we store NUMBER in 1 element. */
-#define EXTRACT_NUMBER(destination, source) \
+# ifdef WCHAR
+# ifdef WCHAR_T_NEED_SIGNEXTEND
+# define EXTRACT_NUMBER(destination, source) \
+ (destination) = (*(source) ^ CHAR_T_SIGN) - CHAR_T_SIGN;
+# else
+# define EXTRACT_NUMBER(destination, source) \
+ (destination) = *(source)
+# endif /* WCHAR_T_NEED_SIGNEXTEND */
+# else /* BYTE */
+# define EXTRACT_NUMBER(destination, source) \
do { \
(destination) = *(source) & 0377; \
(destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
} while (0)
+# endif
#ifdef DEBUG
-static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
+static void PREFIX(extract_number) _RE_ARGS ((int *dest, UCHAR_T *source));
static void
-extract_number (dest, source)
+PREFIX(extract_number) (dest, source)
int *dest;
- unsigned char *source;
+ UCHAR_T *source;
{
+# ifdef WCHAR
+# ifdef WCHAR_T_NEED_SIGNEXTEND
+ *dest = (*source ^ CHAR_T_SIGN) - CHAR_T_SIGN;
+# else
+ *dest = *source;
+# endif /* WCHAR_T_NEED_SIGNEXTEND */
+# else /* BYTE */
int temp = SIGN_EXTEND_CHAR (*(source + 1));
*dest = *source & 0377;
*dest += temp << 8;
+# endif
}
# ifndef EXTRACT_MACROS /* To debug the macros. */
# undef EXTRACT_NUMBER
-# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
+# define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
# endif /* not EXTRACT_MACROS */
#endif /* DEBUG */
@@ -574,25 +740,25 @@ extract_number (dest, source)
#define EXTRACT_NUMBER_AND_INCR(destination, source) \
do { \
EXTRACT_NUMBER (destination, source); \
- (source) += 2; \
+ (source) += OFFSET_ADDRESS_SIZE; \
} while (0)
#ifdef DEBUG
-static void extract_number_and_incr _RE_ARGS ((int *destination,
- unsigned char **source));
+static void PREFIX(extract_number_and_incr) _RE_ARGS ((int *destination,
+ UCHAR_T **source));
static void
-extract_number_and_incr (destination, source)
+PREFIX(extract_number_and_incr) (destination, source)
int *destination;
- unsigned char **source;
+ UCHAR_T **source;
{
- extract_number (destination, *source);
- *source += 2;
+ PREFIX(extract_number) (destination, *source);
+ *source += OFFSET_ADDRESS_SIZE;
}
# ifndef EXTRACT_MACROS
# undef EXTRACT_NUMBER_AND_INCR
# define EXTRACT_NUMBER_AND_INCR(dest, src) \
- extract_number_and_incr (&dest, &src)
+ PREFIX(extract_number_and_incr) (&dest, &src)
# endif /* not EXTRACT_MACROS */
#endif /* DEBUG */
@@ -605,6 +771,8 @@ extract_number_and_incr (destination, source)
#ifdef DEBUG
+#ifndef DEFINED_ONCE
+
/* We use standard I/O for debugging. */
# include <stdio.h>
@@ -618,14 +786,17 @@ static int debug;
# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
+#endif /* not DEFINED_ONCE */
+
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
- if (debug) print_partial_compiled_pattern (s, e)
+ if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
- if (debug) print_double_string (w, s1, sz1, s2, sz2)
+ if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
/* Print the fastmap in human-readable form. */
+#ifndef DEFINED_ONCE
void
print_fastmap (fastmap)
char *fastmap;
@@ -653,20 +824,21 @@ print_fastmap (fastmap)
}
putchar ('\n');
}
+#endif /* not DEFINED_ONCE */
/* Print a compiled pattern string in human-readable form, starting at
the START pointer into it and ending just before the pointer END. */
void
-print_partial_compiled_pattern (start, end)
- unsigned char *start;
- unsigned char *end;
+PREFIX(print_partial_compiled_pattern) (start, end)
+ UCHAR_T *start;
+ UCHAR_T *end;
{
int mcnt, mcnt2;
- unsigned char *p1;
- unsigned char *p = start;
- unsigned char *pend = end;
+ UCHAR_T *p1;
+ UCHAR_T *p = start;
+ UCHAR_T *pend = end;
if (start == NULL)
{
@@ -691,10 +863,22 @@ print_partial_compiled_pattern (start, end)
do
{
putchar ('/');
- putchar (*p++);
+ PUT_CHAR (*p++);
+ }
+ while (--mcnt);
+ break;
+
+# ifdef MBS_SUPPORT
+ case exactn_bin:
+ mcnt = *p++;
+ printf ("/exactn_bin/%d", mcnt);
+ do
+ {
+ printf("/%lx", (long int) *p++);
}
while (--mcnt);
break;
+# endif /* MBS_SUPPORT */
case start_memory:
mcnt = *p++;
@@ -717,6 +901,45 @@ print_partial_compiled_pattern (start, end)
case charset:
case charset_not:
{
+# ifdef WCHAR
+ int i, length;
+ wchar_t *workp = p;
+ printf ("/charset [%s",
+ (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
+ p += 5;
+ length = *workp++; /* the length of char_classes */
+ for (i=0 ; i<length ; i++)
+ printf("[:%lx:]", (long int) *p++);
+ length = *workp++; /* the length of collating_symbol */
+ for (i=0 ; i<length ;)
+ {
+ printf("[.");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf(".]");
+ }
+ length = *workp++; /* the length of equivalence_class */
+ for (i=0 ; i<length ;)
+ {
+ printf("[=");
+ while(*p != 0)
+ PUT_CHAR((i++,*p++));
+ i++,p++;
+ printf("=]");
+ }
+ length = *workp++; /* the length of char_range */
+ for (i=0 ; i<length ; i++)
+ {
+ wchar_t range_start = *p++;
+ wchar_t range_end = *p++;
+ printf("%C-%C", range_start, range_end);
+ }
+ length = *workp++; /* the length of char */
+ for (i=0 ; i<length ; i++)
+ printf("%C", *p++);
+ putchar (']');
+# else
register int c, last = -100;
register int in_range = 0;
@@ -754,6 +977,7 @@ print_partial_compiled_pattern (start, end)
putchar (']');
p += 1 + *p;
+# endif /* WCHAR */
}
break;
@@ -766,17 +990,17 @@ print_partial_compiled_pattern (start, end)
break;
case on_failure_jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/on_failure_jump to %d", p + mcnt - start);
break;
case on_failure_keep_string_jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
break;
case dummy_failure_jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/dummy_failure_jump to %d", p + mcnt - start);
break;
@@ -785,43 +1009,43 @@ print_partial_compiled_pattern (start, end)
break;
case maybe_pop_jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/maybe_pop_jump to %d", p + mcnt - start);
break;
case pop_failure_jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/pop_failure_jump to %d", p + mcnt - start);
break;
case jump_past_alt:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/jump_past_alt to %d", p + mcnt - start);
break;
case jump:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
printf ("/jump to %d", p + mcnt - start);
break;
case succeed_n:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
+ PREFIX(extract_number_and_incr) (&mcnt2, &p);
printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
break;
case jump_n:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
+ PREFIX(extract_number_and_incr) (&mcnt2, &p);
printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
break;
case set_number_at:
- extract_number_and_incr (&mcnt, &p);
+ PREFIX(extract_number_and_incr) (&mcnt, &p);
p1 = p + mcnt;
- extract_number_and_incr (&mcnt2, &p);
+ PREFIX(extract_number_and_incr) (&mcnt2, &p);
printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
break;
@@ -894,12 +1118,13 @@ print_partial_compiled_pattern (start, end)
void
-print_compiled_pattern (bufp)
+PREFIX(print_compiled_pattern) (bufp)
struct re_pattern_buffer *bufp;
{
- unsigned char *buffer = bufp->buffer;
+ UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
- print_partial_compiled_pattern (buffer, buffer + bufp->used);
+ PREFIX(print_partial_compiled_pattern) (buffer, buffer
+ + bufp->used / sizeof(UCHAR_T));
printf ("%ld bytes used/%ld bytes allocated.\n",
bufp->used, bufp->allocated);
@@ -922,10 +1147,10 @@ print_compiled_pattern (bufp)
void
-print_double_string (where, string1, size1, string2, size2)
- const char *where;
- const char *string1;
- const char *string2;
+PREFIX(print_double_string) (where, string1, size1, string2, size2)
+ const CHAR_T *where;
+ const CHAR_T *string1;
+ const CHAR_T *string2;
int size1;
int size2;
{
@@ -938,25 +1163,28 @@ print_double_string (where, string1, size1, string2, size2)
if (FIRST_STRING_P (where))
{
for (this_char = where - string1; this_char < size1; this_char++)
- putchar (string1[this_char]);
+ PUT_CHAR (string1[this_char]);
where = string2;
}
for (this_char = where - string2; this_char < size2; this_char++)
- putchar (string2[this_char]);
+ PUT_CHAR (string2[this_char]);
}
}
+# ifndef DEFINED_ONCE
void
printchar (c)
int c;
{
putc (c, stderr);
}
+#endif
#else /* not DEBUG */
+#ifndef DEFINED_ONCE
# undef assert
# define assert(e)
@@ -965,11 +1193,88 @@ printchar (c)
# define DEBUG_PRINT2(x1, x2)
# define DEBUG_PRINT3(x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4)
+#endif /* not DEFINED_ONCE */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
#endif /* not DEBUG */
+# ifdef WCHAR
+/* This convert a multibyte string to a wide character string.
+ And write their correspondances to offset_buffer(see below)
+ and write whether each wchar_t is binary data to is_binary.
+ This assume invalid multibyte sequences as binary data.
+ We assume offset_buffer and is_binary is already allocated
+ enough space. */
+
+static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
+ size_t len, int *offset_buffer,
+ char *is_binary);
+static size_t
+convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
+ CHAR_T *dest;
+ const unsigned char* src;
+ size_t len; /* the length of multibyte string. */
+
+ /* It hold correspondances between src(char string) and
+ dest(wchar_t string) for optimization.
+ e.g. src = "xxxyzz"
+ dest = {'X', 'Y', 'Z'}
+ (each "xxx", "y" and "zz" represent one multibyte character
+ corresponding to 'X', 'Y' and 'Z'.)
+ offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
+ = {0, 3, 4, 6}
+ */
+ int *offset_buffer;
+ char *is_binary;
+{
+ wchar_t *pdest = dest;
+ const unsigned char *psrc = src;
+ size_t wc_count = 0;
+
+ mbstate_t mbs;
+ int i, consumed;
+ size_t mb_remain = len;
+ size_t mb_count = 0;
+
+ /* Initialize the conversion state. */
+ memset (&mbs, 0, sizeof (mbstate_t));
+
+ offset_buffer[0] = 0;
+ for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
+ psrc += consumed)
+ {
+ consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
+
+ if (consumed <= 0)
+ /* failed to convert. maybe src contains binary data.
+ So we consume 1 byte manualy. */
+ {
+ *pdest = *psrc;
+ consumed = 1;
+ is_binary[wc_count] = TRUE;
+ }
+ else
+ is_binary[wc_count] = FALSE;
+ /* In sjis encoding, we use yen sign as escape character in
+ place of reverse solidus. So we convert 0x5c(yen sign in
+ sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
+ solidus in UCS2). */
+ if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
+ *pdest = (wchar_t) *psrc;
+
+ offset_buffer[wc_count + 1] = mb_count += consumed;
+ }
+
+ /* Fill remain of the buffer with sentinel. */
+ for (i = wc_count + 1 ; i <= len ; i++)
+ offset_buffer[i] = mb_count + 1;
+
+ return wc_count;
+}
+# endif /* WCHAR */
+
+#else /* not INSIDE_RECURSION */
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
@@ -1033,6 +1338,9 @@ static const char *re_error_msgid[] =
gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
};
+#endif /* INSIDE_RECURSION */
+
+#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc. */
/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
@@ -1069,8 +1377,10 @@ static const char *re_error_msgid[] =
#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif
+#endif /* not DEFINED_ONCE */
+#ifdef INSIDE_RECURSION
/* Failure stack declarations and macros; both re_compile_fastmap and
re_match_2 use a failure stack. These have to be macros because of
REGEX_ALLOCATE_STACK. */
@@ -1090,59 +1400,93 @@ static const char *re_error_msgid[] =
#ifdef INT_IS_16BIT
-# if defined MATCH_MAY_ALLOCATE
+#ifndef DEFINED_ONCE
+
+# ifdef REGEX_MALLOC
+/*
+ * Set RE_MAX_FAILURES to the largest reasonable value,
+ * to avoid spurious "not enough memory" messages.
+ * The '80' is computed as follows:
+ * regex.c's MAX_FAILURE_ITEMS is at most 20.
+ * regex.c computes 2 * MAX_FAILURE_ITEMS, giving us 40.
+ * re_max_failures is signed, not unsigned, for another factor of 2,
+ * giving us 80.
+ */
+long int re_max_failures = ((unsigned) -1) / 80;
+# else /* ! not REGEX_MALLOC */
+# if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. */
long int re_max_failures = 4000;
-# else
+# else
long int re_max_failures = 2000;
-# endif
+# endif
+# endif /* ! not REGEX_MALLOC */
+#endif /* ! DEFINED_ONCE */
-union fail_stack_elt
+union PREFIX(fail_stack_elt)
{
- unsigned char *pointer;
+ UCHAR_T *pointer;
long int integer;
};
-typedef union fail_stack_elt fail_stack_elt_t;
+typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
typedef struct
{
- fail_stack_elt_t *stack;
+ PREFIX(fail_stack_elt_t) *stack;
unsigned long int size;
unsigned long int avail; /* Offset of next open position. */
-} fail_stack_type;
+} PREFIX(fail_stack_type);
#else /* not INT_IS_16BIT */
-# if defined MATCH_MAY_ALLOCATE
+#ifndef DEFINED_ONCE
+
+# ifdef REGEX_MALLOC
+/*
+ * Set RE_MAX_FAILURES to the largest reasonable value,
+ * to avoid spurious "not enough memory" messages.
+ * The '80' is computed as follows:
+ * regex.c's MAX_FAILURE_ITEMS is at most 20.
+ * regex.c computes 2 * MAX_FAILURE_ITEMS, giving us 40.
+ * re_max_failures is signed, not unsigned, for another factor of 2,
+ * giving us 80.
+ */
+long int re_max_failures = ((unsigned) -1) / 80;
+# else /* ! not REGEX_MALLOC */
+# if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. */
int re_max_failures = 20000;
-# else
+# else
int re_max_failures = 2000;
-# endif
+# endif
+# endif /* ! not REGEX_MALLOC */
+#endif
-union fail_stack_elt
+union PREFIX(fail_stack_elt)
{
- unsigned char *pointer;
+ UCHAR_T *pointer;
int integer;
};
-typedef union fail_stack_elt fail_stack_elt_t;
+typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
typedef struct
{
- fail_stack_elt_t *stack;
+ PREFIX(fail_stack_elt_t) *stack;
unsigned size;
unsigned avail; /* Offset of next open position. */
-} fail_stack_type;
+} PREFIX(fail_stack_type);
#endif /* INT_IS_16BIT */
+#ifndef DEFINED_ONCE
#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
+#endif
/* Define macros to initialize and free the failure stack.
@@ -1151,8 +1495,8 @@ typedef struct
#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK() \
do { \
- fail_stack.stack = (fail_stack_elt_t *) \
- REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
+ fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
+ REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
\
if (fail_stack.stack == NULL) \
return -2; \
@@ -1182,10 +1526,10 @@ typedef struct
#define DOUBLE_FAIL_STACK(fail_stack) \
((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
? 0 \
- : ((fail_stack).stack = (fail_stack_elt_t *) \
+ : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
REGEX_REALLOCATE_STACK ((fail_stack).stack, \
- (fail_stack).size * sizeof (fail_stack_elt_t), \
- ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
+ (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
+ ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
\
(fail_stack).stack == NULL \
? 0 \
@@ -1207,7 +1551,7 @@ typedef struct
Assumes the variable `fail_stack'. Probably should only
be called from within `PUSH_FAILURE_POINT'. */
#define PUSH_FAILURE_POINTER(item) \
- fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
+ fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)
/* This pushes an integer-valued item onto the failure stack.
Assumes the variable `fail_stack'. Probably should only
@@ -1325,6 +1669,7 @@ typedef struct
DEBUG_PUSH (failure_id); \
} while (0)
+#ifndef DEFINED_ONCE
/* This is the number of items that are pushed and popped on the stack
for each register. */
#define NUM_REG_ITEMS 3
@@ -1351,6 +1696,7 @@ typedef struct
/* How many items can still be added to the stack without overflowing it. */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
+#endif /* not DEFINED_ONCE */
/* Pops what PUSH_FAIL_STACK pushes.
@@ -1369,7 +1715,7 @@ typedef struct
{ \
DEBUG_STATEMENT (unsigned failure_id;) \
active_reg_t this_reg; \
- const unsigned char *string_temp; \
+ const UCHAR_T *string_temp; \
\
assert (!FAIL_STACK_EMPTY ()); \
\
@@ -1388,13 +1734,13 @@ typedef struct
saved NULL, thus retaining our current position in the string. */ \
string_temp = POP_FAILURE_POINTER (); \
if (string_temp != NULL) \
- str = (const char *) string_temp; \
+ str = (const CHAR_T *) string_temp; \
\
DEBUG_PRINT2 (" Popping string %p: `", str); \
DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
DEBUG_PRINT1 ("'\n"); \
\
- pat = (unsigned char *) POP_FAILURE_POINTER (); \
+ pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
\
@@ -1414,10 +1760,10 @@ typedef struct
DEBUG_PRINT2 (" info: %p\n", \
reg_info[this_reg].word.pointer); \
\
- regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
\
- regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
+ regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
} \
else \
@@ -1453,7 +1799,7 @@ typedef struct
typedef union
{
- fail_stack_elt_t word;
+ PREFIX(fail_stack_elt_t) word;
struct
{
/* This field is one if this group can match the empty string,
@@ -1464,8 +1810,9 @@ typedef union
unsigned matched_something : 1;
unsigned ever_matched_something : 1;
} bits;
-} register_info_type;
+} PREFIX(register_info_type);
+#ifndef DEFINED_ONCE
#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
#define IS_ACTIVE(R) ((R).bits.is_active)
#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
@@ -1491,51 +1838,71 @@ typedef union
} \
} \
while (0)
+#endif /* not DEFINED_ONCE */
/* Registers are set to a sentinel when they haven't yet matched. */
-static char reg_unset_dummy;
-#define REG_UNSET_VALUE (&reg_unset_dummy)
+static CHAR_T PREFIX(reg_unset_dummy);
+#define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
/* Subroutine declarations and macros for regex_compile. */
-
-static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
- reg_syntax_t syntax,
- struct re_pattern_buffer *bufp));
-static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
-static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
+static void PREFIX(store_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg));
+static void PREFIX(store_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
int arg1, int arg2));
-static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg, unsigned char *end));
-static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
- int arg1, int arg2, unsigned char *end));
-static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
+static void PREFIX(insert_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
+ int arg, UCHAR_T *end));
+static void PREFIX(insert_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
+ int arg1, int arg2, UCHAR_T *end));
+static boolean PREFIX(at_begline_loc_p) _RE_ARGS ((const CHAR_T *pattern, const CHAR_T *p,
reg_syntax_t syntax));
-static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
+static boolean PREFIX(at_endline_loc_p) _RE_ARGS ((const CHAR_T *p, const CHAR_T *pend,
reg_syntax_t syntax));
-static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
- const char *pend,
- char *translate,
- reg_syntax_t syntax,
- unsigned char *b));
+# ifdef WCHAR
+static reg_errcode_t wcs_compile_range _RE_ARGS ((CHAR_T range_start,
+ const CHAR_T **p_ptr,
+ const CHAR_T *pend,
+ char *translate,
+ reg_syntax_t syntax,
+ UCHAR_T *b,
+ CHAR_T *char_set));
+static void insert_space _RE_ARGS ((int num, CHAR_T *loc, CHAR_T *end));
+# else /* BYTE */
+static reg_errcode_t byte_compile_range _RE_ARGS ((unsigned int range_start,
+ const char **p_ptr,
+ const char *pend,
+ char *translate,
+ reg_syntax_t syntax,
+ unsigned char *b));
+# endif /* WCHAR */
/* Fetch the next character in the uncompiled pattern---translating it
if necessary. Also cast from a signed character in the constant
string passed to us by the user to an unsigned char that we can use
as an array index (in, e.g., `translate'). */
+/* ifdef WCHAR, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef PATFETCH
-# define PATFETCH(c) \
+# ifdef WCHAR
+# define PATFETCH(c) \
+ do {if (p == pend) return REG_EEND; \
+ c = (UCHAR_T) *p++; \
+ if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \
+ } while (0)
+# else /* BYTE */
+# define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
if (translate) c = (unsigned char) translate[c]; \
} while (0)
+# endif /* WCHAR */
#endif
/* Fetch the next character in the uncompiled pattern, with no
translation. */
#define PATFETCH_RAW(c) \
do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
+ c = (UCHAR_T) *p++; \
} while (0)
/* Go backwards one character in the pattern. */
@@ -1546,27 +1913,43 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
cast the subscript to translate because some data is declared as
`char *', to avoid warnings when a string constant is passed. But
when we use a character as a subscript we must make it unsigned. */
+/* ifdef WCHAR, we translate only if character <= 0xff,
+ because it is impossible to allocate 4GB array for some encodings
+ which have 4 byte character_set like UCS4. */
#ifndef TRANSLATE
-# define TRANSLATE(d) \
+# ifdef WCHAR
+# define TRANSLATE(d) \
+ ((translate && ((UCHAR_T) (d)) <= 0xff) \
+ ? (char) translate[(unsigned char) (d)] : (d))
+# else /* BYTE */
+# define TRANSLATE(d) \
(translate ? (char) translate[(unsigned char) (d)] : (d))
+# endif /* WCHAR */
#endif
/* Macros for outputting the compiled pattern into `buffer'. */
/* If the buffer isn't allocated when it comes in, use this. */
-#define INIT_BUF_SIZE 32
+# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))
/* Make sure we have at least N more bytes of space in buffer. */
-#define GET_BUFFER_SPACE(n) \
+# ifdef WCHAR
+# define GET_BUFFER_SPACE(n) \
+ while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
+ + (n)*sizeof(CHAR_T)) > bufp->allocated) \
+ EXTEND_BUFFER ()
+# else /* BYTE */
+# define GET_BUFFER_SPACE(n) \
while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
EXTEND_BUFFER ()
+# endif /* WCHAR */
/* Make sure we have one more byte of buffer space and then add C to it. */
#define BUF_PUSH(c) \
do { \
GET_BUFFER_SPACE (1); \
- *b++ = (unsigned char) (c); \
+ *b++ = (UCHAR_T) (c); \
} while (0)
@@ -1574,8 +1957,8 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
#define BUF_PUSH_2(c1, c2) \
do { \
GET_BUFFER_SPACE (2); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
+ *b++ = (UCHAR_T) (c1); \
+ *b++ = (UCHAR_T) (c2); \
} while (0)
@@ -1583,28 +1966,28 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
#define BUF_PUSH_3(c1, c2, c3) \
do { \
GET_BUFFER_SPACE (3); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
- *b++ = (unsigned char) (c3); \
+ *b++ = (UCHAR_T) (c1); \
+ *b++ = (UCHAR_T) (c2); \
+ *b++ = (UCHAR_T) (c3); \
} while (0)
/* Store a jump with opcode OP at LOC to location TO. We store a
relative address offset by the three bytes the jump itself occupies. */
#define STORE_JUMP(op, loc, to) \
- store_op1 (op, loc, (int) ((to) - (loc) - 3))
+ PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
/* Likewise, for a two-argument jump. */
#define STORE_JUMP2(op, loc, to, arg) \
- store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
+ PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP(op, loc, to) \
- insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
+ PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
#define INSERT_JUMP2(op, loc, to, arg) \
- insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
+ PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg, b)
/* This is not an arbitrary limit: the arguments which represent offsets
@@ -1615,6 +1998,7 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
reallocating to 0 bytes. Such thing is not going to work too well.
You have been warned!! */
+#ifndef DEFINED_ONCE
#if defined _MSC_VER && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
The REALLOC define eliminates a flurry of conversion warnings,
@@ -1650,34 +2034,72 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
# define MOVE_BUFFER_POINTER(P) (P) += incr
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
-#define EXTEND_BUFFER() \
- do { \
- unsigned char *old_buffer = bufp->buffer; \
- if (bufp->allocated == MAX_BUF_SIZE) \
+#endif /* not DEFINED_ONCE */
+# ifdef WCHAR
+# define EXTEND_BUFFER() \
+ do { \
+ UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
+ int wchar_count; \
+ if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
return REG_ESIZE; \
bufp->allocated <<= 1; \
if (bufp->allocated > MAX_BUF_SIZE) \
- bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
- if (bufp->buffer == NULL) \
+ bufp->allocated = MAX_BUF_SIZE; \
+ /* How many characters the new buffer can have? */ \
+ wchar_count = bufp->allocated / sizeof(UCHAR_T); \
+ if (wchar_count == 0) wchar_count = 1; \
+ /* Truncate the buffer to CHAR_T align. */ \
+ bufp->allocated = wchar_count * sizeof(UCHAR_T); \
+ RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
+ if (COMPILED_BUFFER_VAR == NULL) \
return REG_ESPACE; \
/* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != bufp->buffer) \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
{ \
- int incr = bufp->buffer - old_buffer; \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
+ MOVE_BUFFER_POINTER (b); \
+ MOVE_BUFFER_POINTER (begalt); \
+ if (fixup_alt_jump) \
+ MOVE_BUFFER_POINTER (fixup_alt_jump); \
+ if (laststart) \
+ MOVE_BUFFER_POINTER (laststart); \
+ if (pending_exact) \
+ MOVE_BUFFER_POINTER (pending_exact); \
+ } \
+ ELSE_EXTEND_BUFFER_HIGH_BOUND \
+ } while (0)
+# else /* BYTE */
+# define EXTEND_BUFFER() \
+ do { \
+ UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
+ if (bufp->allocated == MAX_BUF_SIZE) \
+ return REG_ESIZE; \
+ bufp->allocated <<= 1; \
+ if (bufp->allocated > MAX_BUF_SIZE) \
+ bufp->allocated = MAX_BUF_SIZE; \
+ bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
+ bufp->allocated); \
+ if (COMPILED_BUFFER_VAR == NULL) \
+ return REG_ESPACE; \
+ /* If the buffer moved, move all the pointers into it. */ \
+ if (old_buffer != COMPILED_BUFFER_VAR) \
+ { \
+ int incr = COMPILED_BUFFER_VAR - old_buffer; \
MOVE_BUFFER_POINTER (b); \
MOVE_BUFFER_POINTER (begalt); \
- if (fixup_alt_jump) \
+ if (fixup_alt_jump) \
MOVE_BUFFER_POINTER (fixup_alt_jump); \
- if (laststart) \
+ if (laststart) \
MOVE_BUFFER_POINTER (laststart); \
- if (pending_exact) \
+ if (pending_exact) \
MOVE_BUFFER_POINTER (pending_exact); \
} \
ELSE_EXTEND_BUFFER_HIGH_BOUND \
} while (0)
+# endif /* WCHAR */
-
+#ifndef DEFINED_ONCE
/* Since we have one byte reserved for the register number argument to
{start,stop}_memory, the maximum number of groups we can report
things about is what fits in that byte. */
@@ -1721,12 +2143,15 @@ typedef struct
/* The next available element. */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
+#endif /* not DEFINED_ONCE */
/* Set the bit for character C in a list. */
+#ifndef DEFINED_ONCE
#define SET_LIST_BIT(c) \
(b[((unsigned char) (c)) / BYTEWIDTH] \
|= 1 << (((unsigned char) c) % BYTEWIDTH))
+#endif /* DEFINED_ONCE */
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
@@ -1745,6 +2170,7 @@ typedef struct
} \
}
+#ifndef DEFINED_ONCE
#if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
and the functions from ISO C amendement 1. */
@@ -1772,6 +2198,7 @@ typedef struct
|| STREQ (string, "punct") || STREQ (string, "graph") \
|| STREQ (string, "cntrl") || STREQ (string, "blank"))
#endif
+#endif /* DEFINED_ONCE */
#ifndef MATCH_MAY_ALLOCATE
@@ -1782,25 +2209,28 @@ typedef struct
The register vectors, we adjust in size each time we
compile a regexp, according to the number of registers it needs. */
-static fail_stack_type fail_stack;
+static PREFIX(fail_stack_type) fail_stack;
/* Size with which the following vectors are currently allocated.
That is so we can make them bigger as needed,
but never make them smaller. */
+#ifdef DEFINED_ONCE
static int regs_allocated_size;
static const char ** regstart, ** regend;
static const char ** old_regstart, ** old_regend;
static const char **best_regstart, **best_regend;
-static register_info_type *reg_info;
static const char **reg_dummy;
-static register_info_type *reg_info_dummy;
+#endif /* DEFINED_ONCE */
+
+static PREFIX(register_info_type) *PREFIX(reg_info);
+static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
/* Make the register vectors big enough for NUM_REGS registers,
but don't make them smaller. */
-static
-regex_grow_registers (num_regs)
+static void
+PREFIX(regex_grow_registers) (num_regs)
int num_regs;
{
if (num_regs > regs_allocated_size)
@@ -1811,9 +2241,9 @@ regex_grow_registers (num_regs)
RETALLOC_IF (old_regend, num_regs, const char *);
RETALLOC_IF (best_regstart, num_regs, const char *);
RETALLOC_IF (best_regend, num_regs, const char *);
- RETALLOC_IF (reg_info, num_regs, register_info_type);
+ RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
RETALLOC_IF (reg_dummy, num_regs, const char *);
- RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
+ RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
regs_allocated_size = num_regs;
}
@@ -1821,9 +2251,11 @@ regex_grow_registers (num_regs)
#endif /* not MATCH_MAY_ALLOCATE */
+#ifndef DEFINED_ONCE
static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
compile_stack,
regnum_t regnum));
+#endif /* not DEFINED_ONCE */
/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
Returns one of error codes defined in `regex.h', or zero for success.
@@ -1844,33 +2276,55 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
examined nor set. */
/* Return, freeing storage we allocated. */
-#define FREE_STACK_RETURN(value) \
+# ifdef WCHAR
+# define FREE_STACK_RETURN(value) \
+ return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
+# else
+# define FREE_STACK_RETURN(value) \
return (free (compile_stack.stack), value)
+# endif /* WCHAR */
static reg_errcode_t
-regex_compile (pattern, size, syntax, bufp)
- const char *pattern;
- size_t size;
+PREFIX(regex_compile) (ARG_PREFIX(pattern), ARG_PREFIX(size), syntax, bufp)
+ const char *ARG_PREFIX(pattern);
+ size_t ARG_PREFIX(size);
reg_syntax_t syntax;
struct re_pattern_buffer *bufp;
{
/* We fetch characters from PATTERN here. Even though PATTERN is
`char *' (i.e., signed), we declare these variables as unsigned, so
they can be reliably used as array indices. */
- register unsigned char c, c1;
+ register UCHAR_T c, c1;
+
+#ifdef WCHAR
+ /* A temporary space to keep wchar_t pattern and compiled pattern. */
+ CHAR_T *pattern, *COMPILED_BUFFER_VAR;
+ size_t size;
+ /* offset buffer for optimization. See convert_mbs_to_wc. */
+ int *mbs_offset = NULL;
+ /* It hold whether each wchar_t is binary data or not. */
+ char *is_binary = NULL;
+ /* A flag whether exactn is handling binary data or not. */
+ char is_exactn_bin = FALSE;
+#endif /* WCHAR */
/* A random temporary spot in PATTERN. */
- const char *p1;
+ const CHAR_T *p1;
/* Points to the end of the buffer, where we should append. */
- register unsigned char *b;
+ register UCHAR_T *b;
/* Keeps track of unclosed groups. */
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
- const char *p = pattern;
- const char *pend = pattern + size;
+#ifdef WCHAR
+ const CHAR_T *p;
+ const CHAR_T *pend;
+#else /* BYTE */
+ const CHAR_T *p = pattern;
+ const CHAR_T *pend = pattern + size;
+#endif /* WCHAR */
/* How to translate the characters in the pattern. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -1879,30 +2333,54 @@ regex_compile (pattern, size, syntax, bufp)
command. This makes it possible to tell if a new exact-match
character can be added to that command or if the character requires
a new `exactn' command. */
- unsigned char *pending_exact = 0;
+ UCHAR_T *pending_exact = 0;
/* Address of start of the most recently finished expression.
This tells, e.g., postfix * where to find the start of its
operand. Reset at the beginning of groups and alternatives. */
- unsigned char *laststart = 0;
+ UCHAR_T *laststart = 0;
/* Address of beginning of regexp, or inside of last group. */
- unsigned char *begalt;
+ UCHAR_T *begalt;
/* Place in the uncompiled pattern (i.e., the {) to
which to go back if the interval is invalid. */
- const char *beg_interval;
+ const CHAR_T *beg_interval;
/* Address of the place where a forward jump should go to the end of
the containing expression. Each alternative of an `or' -- except the
last -- ends with a forward jump of this sort. */
- unsigned char *fixup_alt_jump = 0;
+ UCHAR_T *fixup_alt_jump = 0;
/* Counts open-groups as they are encountered. Remembered for the
matching close-group on the compile stack, so the same register
number is put in the stop_memory as the start_memory. */
regnum_t regnum = 0;
+#ifdef WCHAR
+ /* Initialize the wchar_t PATTERN and offset_buffer. */
+ p = pend = pattern = TALLOC(csize + 1, CHAR_T);
+ mbs_offset = TALLOC(csize + 1, int);
+ is_binary = TALLOC(csize + 1, char);
+ if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
+ {
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
+ return REG_ESPACE;
+ }
+ pattern[csize] = L'\0'; /* sentinel */
+ size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
+ pend = p + size;
+ if (size < 0)
+ {
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
+ return REG_BADPAT;
+ }
+#endif
+
#ifdef DEBUG
DEBUG_PRINT1 ("\nCompiling pattern: ");
if (debug)
@@ -1910,7 +2388,7 @@ regex_compile (pattern, size, syntax, bufp)
unsigned debug_count;
for (debug_count = 0; debug_count < size; debug_count++)
- putchar (pattern[debug_count]);
+ PUT_CHAR (pattern[debug_count]);
putchar ('\n');
}
#endif /* DEBUG */
@@ -1918,7 +2396,15 @@ regex_compile (pattern, size, syntax, bufp)
/* Initialize the compile stack. */
compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
if (compile_stack.stack == NULL)
- return REG_ESPACE;
+ {
+#ifdef WCHAR
+ free(pattern);
+ free(mbs_offset);
+ free(is_binary);
+#endif
+ return REG_ESPACE;
+ }
+
compile_stack.size = INIT_COMPILE_STACK_SIZE;
compile_stack.avail = 0;
@@ -1947,18 +2433,34 @@ regex_compile (pattern, size, syntax, bufp)
{ /* If zero allocated, but buffer is non-null, try to realloc
enough space. This loses if buffer's address is bogus, but
that is the user's responsibility. */
- RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
+#ifdef WCHAR
+ /* Free bufp->buffer and allocate an array for wchar_t pattern
+ buffer. */
+ free(bufp->buffer);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
+ UCHAR_T);
+#else
+ RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
+#endif /* WCHAR */
}
else
{ /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
+ COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
+ UCHAR_T);
}
- if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
+ if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
+#ifdef WCHAR
+ bufp->buffer = (char*)COMPILED_BUFFER_VAR;
+#endif /* WCHAR */
bufp->allocated = INIT_BUF_SIZE;
}
+#ifdef WCHAR
+ else
+ COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
+#endif
- begalt = b = bufp->buffer;
+ begalt = b = COMPILED_BUFFER_VAR;
/* Loop through the uncompiled pattern until we're at the end. */
while (p != pend)
@@ -1974,7 +2476,7 @@ regex_compile (pattern, size, syntax, bufp)
/* If context independent, it's an operator. */
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's come before. */
- || at_begline_loc_p (pattern, p, syntax))
+ || PREFIX(at_begline_loc_p) (pattern, p, syntax))
BUF_PUSH (begline);
else
goto normal_char;
@@ -1989,7 +2491,7 @@ regex_compile (pattern, size, syntax, bufp)
/* If context independent, it's an operator. */
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's next. */
- || at_endline_loc_p (p, pend, syntax))
+ || PREFIX(at_endline_loc_p) (p, pend, syntax))
BUF_PUSH (endline);
else
goto normal_char;
@@ -2083,7 +2585,7 @@ regex_compile (pattern, size, syntax, bufp)
assert (p - 1 > pattern);
/* Allocate the space for the jump. */
- GET_BUFFER_SPACE (3);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
/* We know we are not at the first character of the pattern,
because laststart was nonzero. And we've already
@@ -2100,20 +2602,23 @@ regex_compile (pattern, size, syntax, bufp)
}
else
/* Anything else. */
- STORE_JUMP (maybe_pop_jump, b, laststart - 3);
+ STORE_JUMP (maybe_pop_jump, b, laststart -
+ (1 + OFFSET_ADDRESS_SIZE));
/* We've added more stuff to the buffer. */
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* On failure, jump from laststart to b + 3, which will be the
end of the buffer after this jump is inserted. */
- GET_BUFFER_SPACE (3);
+ /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
+ 'b + 3'. */
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
: on_failure_jump,
- laststart, b + 3);
+ laststart, b + 1 + OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
if (!zero_times_ok)
{
@@ -2122,9 +2627,10 @@ regex_compile (pattern, size, syntax, bufp)
`on_failure_jump' instruction of the loop. This
effects a skip over that instruction the first time
we hit that loop. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (dummy_failure_jump, laststart, laststart +
+ 2 + 2 * OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
}
break;
@@ -2139,9 +2645,343 @@ regex_compile (pattern, size, syntax, bufp)
case '[':
{
boolean had_char_class = false;
-
+#ifdef WCHAR
+ CHAR_T range_start = 0xffffffff;
+#else
+ unsigned int range_start = 0xffffffff;
+#endif
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+#ifdef WCHAR
+ /* We assume a charset(_not) structure as a wchar_t array.
+ charset[0] = (re_opcode_t) charset(_not)
+ charset[1] = l (= length of char_classes)
+ charset[2] = m (= length of collating_symbols)
+ charset[3] = n (= length of equivalence_classes)
+ charset[4] = o (= length of char_ranges)
+ charset[5] = p (= length of chars)
+
+ charset[6] = char_class (wctype_t)
+ charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
+ ...
+ charset[l+5] = char_class (wctype_t)
+
+ charset[l+6] = collating_symbol (wchar_t)
+ ...
+ charset[l+m+5] = collating_symbol (wchar_t)
+ ifdef _LIBC we use the index if
+ _NL_COLLATE_SYMB_EXTRAMB instead of
+ wchar_t string.
+
+ charset[l+m+6] = equivalence_classes (wchar_t)
+ ...
+ charset[l+m+n+5] = equivalence_classes (wchar_t)
+ ifdef _LIBC we use the index in
+ _NL_COLLATE_WEIGHT instead of
+ wchar_t string.
+
+ charset[l+m+n+6] = range_start
+ charset[l+m+n+7] = range_end
+ ...
+ charset[l+m+n+2o+4] = range_start
+ charset[l+m+n+2o+5] = range_end
+ ifdef _LIBC we use the value looked up
+ in _NL_COLLATE_COLLSEQ instead of
+ wchar_t character.
+
+ charset[l+m+n+2o+6] = char
+ ...
+ charset[l+m+n+2o+p+5] = char
+
+ */
+
+ /* We need at least 6 spaces: the opcode, the length of
+ char_classes, the length of collating_symbols, the length of
+ equivalence_classes, the length of char_ranges, the length of
+ chars. */
+ GET_BUFFER_SPACE (6);
+
+ /* Save b as laststart. And We use laststart as the pointer
+ to the first element of the charset here.
+ In other words, laststart[i] indicates charset[i]. */
+ laststart = b;
+
+ /* We test `*p == '^' twice, instead of using an if
+ statement, so we only need one BUF_PUSH. */
+ BUF_PUSH (*p == '^' ? charset_not : charset);
+ if (*p == '^')
+ p++;
+
+ /* Push the length of char_classes, the length of
+ collating_symbols, the length of equivalence_classes, the
+ length of char_ranges and the length of chars. */
+ BUF_PUSH_3 (0, 0, 0);
+ BUF_PUSH_2 (0, 0);
+
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ /* charset_not matches newline according to a syntax bit. */
+ if ((re_opcode_t) b[-6] == charset_not
+ && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
+ {
+ BUF_PUSH('\n');
+ laststart[5]++; /* Update the length of characters */
+ }
+
+ /* Read in characters and ranges, setting map bits. */
+ for (;;)
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ PATFETCH (c);
+
+ /* \ might escape characters inside [...] and [^...]. */
+ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
+ {
+ if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
+
+ PATFETCH (c1);
+ BUF_PUSH(c1);
+ laststart[5]++; /* Update the length of chars */
+ range_start = c1;
+ continue;
+ }
+
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+ if (c == ']' && p != p1 + 1)
+ break;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+ if (had_char_class && c == '-' && *p != ']')
+ FREE_STACK_RETURN (REG_ERANGE);
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+ if (c == '-'
+ && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']')
+ {
+ reg_errcode_t ret;
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = wcs_compile_range (range_start, &p, pend, translate,
+ syntax, b, laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+ else if (p[0] == '-' && p[1] != ']')
+ { /* This handles ranges made up of characters only. */
+ reg_errcode_t ret;
+
+ /* Move past the `-'. */
+ PATFETCH (c1);
+ /* Allocate the space for range_start and range_end. */
+ GET_BUFFER_SPACE (2);
+ /* Update the pointer to indicate end of buffer. */
+ b += 2;
+ ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
+ laststart);
+ if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+ { /* Leave room for the null. */
+ char str[CHAR_CLASS_MAX_LENGTH + 1];
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == ':' && *p == ']') || p == pend)
+ break;
+ if (c1 < CHAR_CLASS_MAX_LENGTH)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ /* If isn't a word bracketed by `[:' and `:]':
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but store them as character). */
+ if (c == ':' && *p == ']')
+ {
+ wctype_t wt;
+ unsigned long int alignedp;
+
+ /* Query the character class as wctype_t. */
+ wt = IS_CHAR_CLASS (str);
+ if (wt == 0)
+ FREE_STACK_RETURN (REG_ECTYPE);
+
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH (c);
+
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ /* Allocate the space for character class. */
+ GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
+ /* Update the pointer to indicate end of buffer. */
+ b += CHAR_CLASS_SIZE;
+ /* Move data which follow character classes
+ not to violate the data. */
+ insert_space(CHAR_CLASS_SIZE,
+ laststart + 6 + laststart[1],
+ b - 1);
+ alignedp = ((unsigned long int)(laststart + 6 + laststart[1])
+ + __alignof__(wctype_t) - 1)
+ & ~(unsigned long int)(__alignof__(wctype_t) - 1);
+ /* Store the character class. */
+ *((wctype_t*)alignedp) = wt;
+ /* Update length of char_classes */
+ laststart[1] += CHAR_CLASS_SIZE;
+
+ had_char_class = true;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (':');
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = ':';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
+ || *p == '.'))
+ {
+ CHAR_T str[128]; /* Should be large enough. */
+ CHAR_T delim = *p; /* '=' or '.' */
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[=' or '[[.'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == delim && *p == ']') || p == pend)
+ break;
+ if (c1 < sizeof (str) - 1)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == delim && *p == ']' && str[0] != '\0')
+ {
+ unsigned int i, offset;
+ /* If we have no collation data we use the default
+ collation in which each character is in a class
+ by itself. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+
+ /* If not defined _LIBC, we push the name and
+ `\0' for the sake of matching performance. */
+ int datasize = c1 + 1;
+
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class (or collating symbol). */
+ PATFETCH (c);
+
+ /* Allocate the space for the equivalence class
+ (or collating symbol) (and '\0' if needed). */
+ GET_BUFFER_SPACE(datasize);
+ /* Update the pointer to indicate end of buffer. */
+ b += datasize;
+
+ if (delim == '=')
+ { /* equivalence class */
+ /* Calculate the offset of char_ranges,
+ which is next to equivalence_classes. */
+ offset = laststart[1] + laststart[2]
+ + laststart[3] +6;
+ /* Insert space. */
+ insert_space(datasize, laststart + offset, b - 1);
+
+ /* Write the equivalence_class and \0. */
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* Update the length of equivalence_classes. */
+ laststart[3] += datasize;
+ had_char_class = true;
+ }
+ else /* delim == '.' */
+ { /* collating symbol */
+ /* Calculate the offset of the equivalence_classes,
+ which is next to collating_symbols. */
+ offset = laststart[1] + laststart[2] + 6;
+ /* Insert space and write the collationg_symbol
+ and \0. */
+ insert_space(datasize, laststart + offset, b-1);
+ for (i = 0 ; i < datasize ; i++)
+ laststart[offset + i] = str[i];
+
+ /* In re_match_2_internal if range_start < -1, we
+ assume -range_start is the offset of the
+ collating symbol which is specified as
+ the character of the range start. So we assign
+ -(laststart[1] + laststart[2] + 6) to
+ range_start. */
+ range_start = -(laststart[1] + laststart[2] + 6);
+ /* Update the length of collating_symbol. */
+ laststart[2] += datasize;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ BUF_PUSH ('[');
+ BUF_PUSH (delim);
+ laststart[5] += 2; /* Update the length of characters */
+ range_start = delim;
+ had_char_class = false;
+ }
+ }
+ else
+ {
+ had_char_class = false;
+ BUF_PUSH(c);
+ laststart[5]++; /* Update the length of characters */
+ range_start = c;
+ }
+ }
+
+#else /* BYTE */
/* Ensure that we have enough space to push a charset: the
opcode, the length count, and the bitset; 34 bytes in all. */
GET_BUFFER_SPACE (34);
@@ -2182,6 +3022,7 @@ regex_compile (pattern, size, syntax, bufp)
PATFETCH (c1);
SET_LIST_BIT (c1);
+ range_start = c1;
continue;
}
@@ -2206,8 +3047,10 @@ regex_compile (pattern, size, syntax, bufp)
&& *p != ']')
{
reg_errcode_t ret
- = compile_range (&p, pend, translate, syntax, b);
+ = byte_compile_range (range_start, &p, pend, translate,
+ syntax, b);
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
}
else if (p[0] == '-' && p[1] != ']')
@@ -2217,8 +3060,9 @@ regex_compile (pattern, size, syntax, bufp)
/* Move past the `-'. */
PATFETCH (c1);
- ret = compile_range (&p, pend, translate, syntax, b);
+ ret = byte_compile_range (c, &p, pend, translate, syntax, b);
if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+ range_start = 0xffffffff;
}
/* See if we're at the beginning of a possible character
@@ -2341,6 +3185,119 @@ regex_compile (pattern, size, syntax, bufp)
PATUNFETCH;
SET_LIST_BIT ('[');
SET_LIST_BIT (':');
+ range_start = ':';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
+ {
+ unsigned char str[MB_LEN_MAX + 1];
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[='. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == '=' && *p == ']') || p == pend)
+ break;
+ if (c1 < MB_LEN_MAX)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == '=' && *p == ']' && str[0] != '\0')
+ {
+ /* If we have no collation data we use the default
+ collation in which each character is in a class
+ by itself. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Set the bit for the character. */
+ SET_LIST_BIT (str[0]);
+ }
+ had_char_class = true;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT ('=');
+ range_start = '=';
+ had_char_class = false;
+ }
+ }
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
+ {
+ unsigned char str[128]; /* Should be large enough. */
+
+ PATFETCH (c);
+ c1 = 0;
+
+ /* If pattern is `[[.'. */
+ if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+ for (;;)
+ {
+ PATFETCH (c);
+ if ((c == '.' && *p == ']') || p == pend)
+ break;
+ if (c1 < sizeof (str))
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ if (c == '.' && *p == ']' && str[0] != '\0')
+ {
+ /* If we have no collation data we use the default
+ collation in which each character is the name
+ for its own class which contains only the one
+ character. It also means that ASCII is the
+ character set and therefore we cannot have character
+ with more than one byte in the multibyte
+ representation. */
+ {
+ if (c1 != 1)
+ FREE_STACK_RETURN (REG_ECOLLATE);
+
+ /* Throw away the ] at the end of the equivalence
+ class. */
+ PATFETCH (c);
+
+ /* Set the bit for the character. */
+ SET_LIST_BIT (str[0]);
+ range_start = ((const unsigned char *) str)[0];
+ }
+ had_char_class = false;
+ }
+ else
+ {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT ('[');
+ SET_LIST_BIT ('.');
+ range_start = '.';
had_char_class = false;
}
}
@@ -2348,6 +3305,7 @@ regex_compile (pattern, size, syntax, bufp)
{
had_char_class = false;
SET_LIST_BIT (c);
+ range_start = c;
}
}
@@ -2356,6 +3314,7 @@ regex_compile (pattern, size, syntax, bufp)
while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
b[-1]--;
b += b[-1];
+#endif /* WCHAR */
}
break;
@@ -2426,10 +3385,10 @@ regex_compile (pattern, size, syntax, bufp)
group. They are all relative offsets, so that if the
whole pattern moves because of realloc, they will still
be valid. */
- COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
+ COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.fixup_alt_jump
- = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
- COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
+ = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
+ COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
COMPILE_STACK_TOP.regnum = regnum;
/* We will eventually replace the 0 with the number of
@@ -2438,7 +3397,8 @@ regex_compile (pattern, size, syntax, bufp)
represent in the compiled pattern. */
if (regnum <= MAX_REGNUM)
{
- COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
+ COMPILE_STACK_TOP.inner_group_offset = b
+ - COMPILED_BUFFER_VAR + 2;
BUF_PUSH_3 (start_memory, regnum, 0);
}
@@ -2497,12 +3457,12 @@ regex_compile (pattern, size, syntax, bufp)
regnum_t this_group_regnum;
compile_stack.avail--;
- begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
+ begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
fixup_alt_jump
= COMPILE_STACK_TOP.fixup_alt_jump
- ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
+ ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
: 0;
- laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
+ laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
this_group_regnum = COMPILE_STACK_TOP.regnum;
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
@@ -2513,8 +3473,8 @@ regex_compile (pattern, size, syntax, bufp)
groups were inside this one. */
if (this_group_regnum <= MAX_REGNUM)
{
- unsigned char *inner_group_loc
- = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
+ UCHAR_T *inner_group_loc
+ = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
*inner_group_loc = regnum - this_group_regnum;
BUF_PUSH_3 (stop_memory, this_group_regnum,
@@ -2533,10 +3493,11 @@ regex_compile (pattern, size, syntax, bufp)
/* Insert before the previous alternative a jump which
jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (on_failure_jump, begalt, b + 6);
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (on_failure_jump, begalt,
+ b + 2 + 2 * OFFSET_ADDRESS_SIZE);
pending_exact = 0;
- b += 3;
+ b += 1 + OFFSET_ADDRESS_SIZE;
/* The alternative before this one has a jump after it
which gets executed if it gets matched. Adjust that
@@ -2561,8 +3522,8 @@ regex_compile (pattern, size, syntax, bufp)
to be filled in later either by next alternative or
when know we're at the end of a series of alternatives. */
fixup_alt_jump = b;
- GET_BUFFER_SPACE (3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
laststart = 0;
begalt = b;
@@ -2646,11 +3607,13 @@ regex_compile (pattern, size, syntax, bufp)
/* If the upper bound is zero, don't want to succeed at
all; jump from `laststart' to `b + 3', which will be
the end of the buffer after we insert the jump. */
+ /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
+ instead of 'b + 3'. */
if (upper_bound == 0)
{
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (jump, laststart, b + 3);
- b += 3;
+ GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
+ INSERT_JUMP (jump, laststart, b + 1 + OFFSET_ADDRESS_SIZE);
+ b += 1 + OFFSET_ADDRESS_SIZE;
}
/* Otherwise, we have a nontrivial interval. When
@@ -2665,7 +3628,8 @@ regex_compile (pattern, size, syntax, bufp)
else
{ /* If the upper bound is > 1, we need to insert
more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
+ unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
+ (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
GET_BUFFER_SPACE (nbytes);
@@ -2675,16 +3639,21 @@ regex_compile (pattern, size, syntax, bufp)
because `re_compile_fastmap' needs to know.
Jump to the `jump_n' we might insert below. */
INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
+ b + 1 + 2 * OFFSET_ADDRESS_SIZE +
+ (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE),
lower_bound);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* Code to initialize the lower bound. Insert
before the `succeed_n'. The `5' is the last two
bytes of this `set_number_at', plus 3 bytes of
the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
+ /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
+ is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
+ of the following `succeed_n'. */
+ PREFIX(insert_op2) (set_number_at, laststart,
+ 1 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
if (upper_bound > 1)
{ /* More than one repetition is allowed, so
@@ -2694,9 +3663,10 @@ regex_compile (pattern, size, syntax, bufp)
When we've reached this during matching,
we'll have matched the interval once, so
jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + 5,
+ STORE_JUMP2 (jump_n, b, laststart +
+ 1 + 2 * OFFSET_ADDRESS_SIZE,
upper_bound - 1);
- b += 5;
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
/* The location we want to set is the second
parameter of the `jump_n'; that is `b-2' as
@@ -2712,9 +3682,10 @@ regex_compile (pattern, size, syntax, bufp)
We insert this at the beginning of the loop
so that if we fail during matching, we'll
reinitialize the bounds. */
- insert_op2 (set_number_at, laststart, b - laststart,
- upper_bound - 1, b);
- b += 5;
+ PREFIX(insert_op2) (set_number_at, laststart,
+ b - laststart,
+ upper_bound - 1, b);
+ b += 1 + 2 * OFFSET_ADDRESS_SIZE;
}
}
pending_exact = 0;
@@ -2853,6 +3824,11 @@ regex_compile (pattern, size, syntax, bufp)
normal_char:
/* If no exactn currently being built. */
if (!pending_exact
+#ifdef WCHAR
+ /* If last exactn handle binary(or character) and
+ new exactn handle character(or binary). */
+ || is_exactn_bin != is_binary[p - 1 - pattern]
+#endif /* WCHAR */
/* If last exactn not at current position. */
|| pending_exact + *pending_exact + 1 != b
@@ -2874,7 +3850,16 @@ regex_compile (pattern, size, syntax, bufp)
laststart = b;
+#ifdef WCHAR
+ /* Is this exactn binary data or character? */
+ is_exactn_bin = is_binary[p - 1 - pattern];
+ if (is_exactn_bin)
+ BUF_PUSH_2 (exactn_bin, 0);
+ else
+ BUF_PUSH_2 (exactn, 0);
+#else
BUF_PUSH_2 (exactn, 0);
+#endif /* WCHAR */
pending_exact = b - 1;
}
@@ -2898,16 +3883,25 @@ regex_compile (pattern, size, syntax, bufp)
if (syntax & RE_NO_POSIX_BACKTRACKING)
BUF_PUSH (succeed);
+#ifdef WCHAR
+ free (pattern);
+ free (mbs_offset);
+ free (is_binary);
+#endif
free (compile_stack.stack);
/* We have succeeded; set the length of the buffer. */
+#ifdef WCHAR
+ bufp->used = (unsigned long int) b - (unsigned long int) COMPILED_BUFFER_VAR;
+#else
bufp->used = b - bufp->buffer;
+#endif
#ifdef DEBUG
if (debug)
{
DEBUG_PRINT1 ("\nCompiled pattern: \n");
- print_compiled_pattern (bufp);
+ PREFIX(print_compiled_pattern) (bufp);
}
#endif /* DEBUG */
@@ -2928,27 +3922,27 @@ regex_compile (pattern, size, syntax, bufp)
# ifdef emacs
if (! fail_stack.stack)
fail_stack.stack
- = (fail_stack_elt_t *) xmalloc (fail_stack.size
- * sizeof (fail_stack_elt_t));
+ = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
+ * sizeof (PREFIX(fail_stack_elt_t)));
else
fail_stack.stack
- = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
+ = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
(fail_stack.size
- * sizeof (fail_stack_elt_t)));
+ * sizeof (PREFIX(fail_stack_elt_t))));
# else /* not emacs */
if (! fail_stack.stack)
fail_stack.stack
- = (fail_stack_elt_t *) malloc (fail_stack.size
- * sizeof (fail_stack_elt_t));
+ = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
+ * sizeof (PREFIX(fail_stack_elt_t)));
else
fail_stack.stack
- = (fail_stack_elt_t *) realloc (fail_stack.stack,
+ = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
(fail_stack.size
- * sizeof (fail_stack_elt_t)));
+ * sizeof (PREFIX(fail_stack_elt_t))));
# endif /* not emacs */
}
- regex_grow_registers (num_regs);
+ PREFIX(regex_grow_registers (num_regs));
}
#endif /* not MATCH_MAY_ALLOCATE */
@@ -2958,68 +3952,72 @@ regex_compile (pattern, size, syntax, bufp)
/* Subroutines for `regex_compile'. */
/* Store OP at LOC followed by two-byte integer parameter ARG. */
+/* ifdef WCHAR, integer parameter is 1 wchar_t. */
static void
-store_op1 (op, loc, arg)
+PREFIX(store_op1) (op, loc, arg)
re_opcode_t op;
- unsigned char *loc;
+ UCHAR_T *loc;
int arg;
{
- *loc = (unsigned char) op;
+ *loc = (UCHAR_T) op;
STORE_NUMBER (loc + 1, arg);
}
/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef WCHAR, integer parameter is 1 wchar_t. */
static void
-store_op2 (op, loc, arg1, arg2)
+PREFIX(store_op2) (op, loc, arg1, arg2)
re_opcode_t op;
- unsigned char *loc;
+ UCHAR_T *loc;
int arg1, arg2;
{
- *loc = (unsigned char) op;
+ *loc = (UCHAR_T) op;
STORE_NUMBER (loc + 1, arg1);
- STORE_NUMBER (loc + 3, arg2);
+ STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
}
/* Copy the bytes from LOC to END to open up three bytes of space at LOC
for OP followed by two-byte integer parameter ARG. */
+/* ifdef WCHAR, integer parameter is 1 wchar_t. */
static void
-insert_op1 (op, loc, arg, end)
+PREFIX(insert_op1) (op, loc, arg, end)
re_opcode_t op;
- unsigned char *loc;
+ UCHAR_T *loc;
int arg;
- unsigned char *end;
+ UCHAR_T *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 3;
+ register UCHAR_T *pfrom = end;
+ register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
- store_op1 (op, loc, arg);
+ PREFIX(store_op1) (op, loc, arg);
}
/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
+/* ifdef WCHAR, integer parameter is 1 wchar_t. */
static void
-insert_op2 (op, loc, arg1, arg2, end)
+PREFIX(insert_op2) (op, loc, arg1, arg2, end)
re_opcode_t op;
- unsigned char *loc;
+ UCHAR_T *loc;
int arg1, arg2;
- unsigned char *end;
+ UCHAR_T *end;
{
- register unsigned char *pfrom = end;
- register unsigned char *pto = end + 5;
+ register UCHAR_T *pfrom = end;
+ register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
while (pfrom != loc)
*--pto = *--pfrom;
- store_op2 (op, loc, arg1, arg2);
+ PREFIX(store_op2) (op, loc, arg1, arg2);
}
@@ -3028,11 +4026,11 @@ insert_op2 (op, loc, arg1, arg2, end)
least one character before the ^. */
static boolean
-at_begline_loc_p (pattern, p, syntax)
- const char *pattern, *p;
+PREFIX(at_begline_loc_p) (pattern, p, syntax)
+ const CHAR_T *pattern, *p;
reg_syntax_t syntax;
{
- const char *prev = p - 2;
+ const CHAR_T *prev = p - 2;
boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
return
@@ -3047,13 +4045,13 @@ at_begline_loc_p (pattern, p, syntax)
at least one character after the $, i.e., `P < PEND'. */
static boolean
-at_endline_loc_p (p, pend, syntax)
- const char *p, *pend;
+PREFIX(at_endline_loc_p) (p, pend, syntax)
+ const CHAR_T *p, *pend;
reg_syntax_t syntax;
{
- const char *next = p;
+ const CHAR_T *next = p;
boolean next_backslash = *next == '\\';
- const char *next_next = p + 1 < pend ? p + 1 : 0;
+ const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
return
/* Before a subexpression? */
@@ -3064,6 +4062,7 @@ at_endline_loc_p (p, pend, syntax)
: next_backslash && next_next && *next_next == '|');
}
+#else /* not INSIDE_RECURSION */
/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
false if it's not. */
@@ -3084,7 +4083,63 @@ group_in_compile_stack (compile_stack, regnum)
return false;
}
+#endif /* not INSIDE_RECURSION */
+
+#ifdef INSIDE_RECURSION
+
+#ifdef WCHAR
+/* This insert space, which size is "num", into the pattern at "loc".
+ "end" must point the end of the allocated buffer. */
+static void
+insert_space (num, loc, end)
+ int num;
+ CHAR_T *loc;
+ CHAR_T *end;
+{
+ register CHAR_T *pto = end;
+ register CHAR_T *pfrom = end - num;
+
+ while (pfrom >= loc)
+ *pto-- = *pfrom--;
+}
+#endif /* WCHAR */
+#ifdef WCHAR
+static reg_errcode_t
+wcs_compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
+ char_set)
+ CHAR_T range_start_char;
+ const CHAR_T **p_ptr, *pend;
+ CHAR_T *char_set, *b;
+ RE_TRANSLATE_TYPE translate;
+ reg_syntax_t syntax;
+{
+ const CHAR_T *p = *p_ptr;
+ CHAR_T range_start, range_end;
+ reg_errcode_t ret;
+ if (p == pend)
+ return REG_ERANGE;
+
+ range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
+ range_start_char;
+ range_end = TRANSLATE (p[0]);
+ /* Report an error if the range is empty and the syntax prohibits
+ this. */
+ ret = ((syntax & RE_NO_EMPTY_RANGES)
+ && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
+
+ /* Insert space to the end of the char_ranges. */
+ insert_space(2, b - char_set[5] - 2, b - 1);
+ *(b - char_set[5] - 2) = range_start;
+ *(b - char_set[5] - 1) = range_end;
+ char_set[4]++; /* ranges_index */
+
+ /* Have to increment the pointer into the pattern string, so the
+ caller isn't still at the ending character. */
+ (*p_ptr)++;
+ return ret;
+}
+#else /* BYTE */
/* Read the ending character of a range (in a bracket expression) from the
uncompiled pattern *P_PTR (which ends at PEND). We assume the
starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
@@ -3095,52 +4150,51 @@ group_in_compile_stack (compile_stack, regnum)
We use these short variable names so we can use the same macros as
`regex_compile' itself. */
-
static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
- const char **p_ptr, *pend;
- RE_TRANSLATE_TYPE translate;
- reg_syntax_t syntax;
- unsigned char *b;
+byte_compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
+ unsigned int range_start_char;
+ const char **p_ptr, *pend;
+ RE_TRANSLATE_TYPE translate;
+ reg_syntax_t syntax;
+ unsigned char *b;
{
unsigned this_char;
const char *p = *p_ptr;
- unsigned int range_start, range_end;
+ reg_errcode_t ret;
+ unsigned end_char;
+
if (p == pend)
return REG_ERANGE;
- /* Even though the pattern is a signed `char *', we need to fetch
- with unsigned char *'s; if the high bit of the pattern character
- is set, the range endpoints will be negative if we fetch using a
- signed char *.
-
- We also want to fetch the endpoints without translating them; the
- appropriate translation is done in the bit-setting loop below. */
- /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
- range_start = ((const unsigned char *) p)[-2];
- range_end = ((const unsigned char *) p)[0];
-
/* Have to increment the pointer into the pattern string, so the
caller isn't still at the ending character. */
(*p_ptr)++;
- /* If the start is after the end, the range is empty. */
- if (range_start > range_end)
- return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
+ /* Report an error if the range is empty and the syntax prohibits this. */
+ ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
/* Here we see why `this_char' has to be larger than an `unsigned
- char' -- the range is inclusive, so if `range_end' == 0xff
- (assuming 8-bit characters), we would otherwise go into an infinite
- loop, since all characters <= 0xff. */
- for (this_char = range_start; this_char <= range_end; this_char++)
+ char' -- we would otherwise go into an infinite loop, since all
+ characters <= 0xff. */
+ range_start_char = TRANSLATE (range_start_char);
+ /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
+ and some compilers cast it to int implicitly, so following for_loop
+ may fall to (almost) infinite loop.
+ e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
+ To avoid this, we cast p[0] to unsigned int and truncate it. */
+ end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
+
+ for (this_char = range_start_char; this_char <= end_char; ++this_char)
{
SET_LIST_BIT (TRANSLATE (this_char));
+ ret = REG_NOERROR;
}
- return REG_NOERROR;
+ return ret;
}
+# endif /* WCHAR */
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
@@ -3155,27 +4209,50 @@ compile_range (p_ptr, pend, translate, syntax, b)
Returns 0 if we succeed, -2 if an internal error. */
-int
-re_compile_fastmap (bufp)
+#ifdef WCHAR
+/* local function for re_compile_fastmap.
+ truncate wchar_t character to char. */
+static unsigned char truncate_wchar (CHAR_T c);
+
+static unsigned char
+truncate_wchar (c)
+ CHAR_T c;
+{
+ unsigned char buf[MB_LEN_MAX];
+ int retval = wctomb(buf, c);
+ return retval > 0 ? buf[0] : (unsigned char)c;
+}
+#endif /* WCHAR */
+
+static int
+PREFIX(re_compile_fastmap) (bufp)
struct re_pattern_buffer *bufp;
{
int j, k;
#ifdef MATCH_MAY_ALLOCATE
- fail_stack_type fail_stack;
+ PREFIX(fail_stack_type) fail_stack;
#endif
#ifndef REGEX_MALLOC
char *destination;
#endif
register char *fastmap = bufp->fastmap;
- unsigned char *pattern = bufp->buffer;
- unsigned char *p = pattern;
- register unsigned char *pend = pattern + bufp->used;
+
+#ifdef WCHAR
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
+ register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
+#else /* BYTE */
+ UCHAR_T *pattern = bufp->buffer;
+ register UCHAR_T *pend = pattern + bufp->used;
+#endif /* WCHAR */
+ UCHAR_T *p = pattern;
#ifdef REL_ALLOC
/* This holds the pointer to the failure stack, when
it is allocated relocatably. */
- fail_stack_elt_t *failure_stack_ptr;
+ PREFIX(fail_stack_elt_t) *failure_stack_ptr;
#endif
/* Assume that each path through the pattern can be null until
@@ -3233,11 +4310,32 @@ re_compile_fastmap (bufp)
/* Following are the cases which match a character. These end
with `break'. */
+#ifdef WCHAR
+ case exactn:
+ fastmap[truncate_wchar(p[1])] = 1;
+ break;
+#else /* BYTE */
case exactn:
fastmap[p[1]] = 1;
break;
+#endif /* WCHAR */
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+ fastmap[p[1]] = 1;
+ break;
+#endif
+#ifdef WCHAR
+ /* It is hard to distinguish fastmap from (multi byte) characters
+ which depends on current locale. */
+ case charset:
+ case charset_not:
+ case wordchar:
+ case notwordchar:
+ bufp->can_be_null = 1;
+ goto done;
+#else /* BYTE */
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
@@ -3268,6 +4366,7 @@ re_compile_fastmap (bufp)
if (SYNTAX (j) != Sword)
fastmap[j] = 1;
break;
+#endif /* WCHAR */
case anychar:
@@ -3398,13 +4497,13 @@ re_compile_fastmap (bufp)
case succeed_n:
/* Get to the number of times to succeed. */
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
/* Increment p past the n for when k != 0. */
EXTRACT_NUMBER_AND_INCR (k, p);
if (k == 0)
{
- p -= 4;
+ p -= 2 * OFFSET_ADDRESS_SIZE;
succeed_n_p = true; /* Spaghetti code alert. */
goto handle_on_failure_jump;
}
@@ -3412,7 +4511,7 @@ re_compile_fastmap (bufp)
case set_number_at:
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
continue;
@@ -3443,6 +4542,20 @@ re_compile_fastmap (bufp)
done:
RESET_FAIL_STACK ();
return 0;
+}
+
+#else /* not INSIDE_RECURSION */
+
+int
+re_compile_fastmap (bufp)
+ struct re_pattern_buffer *bufp;
+{
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ return wcs_re_compile_fastmap(bufp);
+ else
+# endif
+ return byte_re_compile_fastmap(bufp);
} /* re_compile_fastmap */
#ifdef _LIBC
weak_alias (__re_compile_fastmap, re_compile_fastmap)
@@ -3537,11 +4650,84 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
struct re_registers *regs;
int stop;
{
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
+ range, regs, stop);
+ else
+# endif
+ return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
+ range, regs, stop);
+} /* re_search_2 */
+#ifdef _LIBC
+weak_alias (__re_search_2, re_search_2)
+#endif
+
+#endif /* not INSIDE_RECURSION */
+
+#ifdef INSIDE_RECURSION
+
+#ifdef MATCH_MAY_ALLOCATE
+# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
+#else
+# define FREE_VAR(var) if (var) free (var); var = NULL
+#endif
+
+#ifdef WCHAR
+# define MAX_ALLOCA_SIZE 2000
+
+# define FREE_WCS_BUFFERS() \
+ do { \
+ if (size1 > MAX_ALLOCA_SIZE) \
+ { \
+ free (wcs_string1); \
+ free (mbs_offset1); \
+ } \
+ else \
+ { \
+ FREE_VAR (wcs_string1); \
+ FREE_VAR (mbs_offset1); \
+ } \
+ if (size2 > MAX_ALLOCA_SIZE) \
+ { \
+ free (wcs_string2); \
+ free (mbs_offset2); \
+ } \
+ else \
+ { \
+ FREE_VAR (wcs_string2); \
+ FREE_VAR (mbs_offset2); \
+ } \
+ } while (0)
+
+#endif
+
+static int
+PREFIX(re_search_2) (bufp, string1, size1, string2, size2, startpos, range,
+ regs, stop)
+ struct re_pattern_buffer *bufp;
+ const char *string1, *string2;
+ int size1, size2;
+ int startpos;
+ int range;
+ struct re_registers *regs;
+ int stop;
+{
int val;
register char *fastmap = bufp->fastmap;
register RE_TRANSLATE_TYPE translate = bufp->translate;
int total_size = size1 + size2;
int endpos = startpos + range;
+#ifdef WCHAR
+ /* We need wchar_t* buffers correspond to cstring1, cstring2. */
+ wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
+ /* We need the size of wchar_t buffers correspond to csize1, csize2. */
+ int wcs_size1 = 0, wcs_size2 = 0;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
+ /* They hold whether each wchar_t is binary data or not. */
+ char *is_binary = NULL;
+#endif /* WCHAR */
/* Check for out-of-range STARTPOS. */
if (startpos < 0 || startpos > total_size)
@@ -3585,6 +4771,80 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
if (re_compile_fastmap (bufp) == -2)
return -2;
+#ifdef WCHAR
+ /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
+ fill them with converted string. */
+ if (size1 != 0)
+ {
+ if (size1 > MAX_ALLOCA_SIZE)
+ {
+ wcs_string1 = TALLOC (size1 + 1, CHAR_T);
+ mbs_offset1 = TALLOC (size1 + 1, int);
+ is_binary = TALLOC (size1 + 1, char);
+ }
+ else
+ {
+ wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
+ mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
+ is_binary = REGEX_TALLOC (size1 + 1, char);
+ }
+ if (!wcs_string1 || !mbs_offset1 || !is_binary)
+ {
+ if (size1 > MAX_ALLOCA_SIZE)
+ {
+ free (wcs_string1);
+ free (mbs_offset1);
+ free (is_binary);
+ }
+ else
+ {
+ FREE_VAR (wcs_string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (is_binary);
+ }
+ return -2;
+ }
+ wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
+ mbs_offset1, is_binary);
+ wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
+ if (size1 > MAX_ALLOCA_SIZE)
+ free (is_binary);
+ else
+ FREE_VAR (is_binary);
+ }
+ if (size2 != 0)
+ {
+ if (size2 > MAX_ALLOCA_SIZE)
+ {
+ wcs_string2 = TALLOC (size2 + 1, CHAR_T);
+ mbs_offset2 = TALLOC (size2 + 1, int);
+ is_binary = TALLOC (size2 + 1, char);
+ }
+ else
+ {
+ wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
+ mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
+ is_binary = REGEX_TALLOC (size2 + 1, char);
+ }
+ if (!wcs_string2 || !mbs_offset2 || !is_binary)
+ {
+ FREE_WCS_BUFFERS ();
+ if (size2 > MAX_ALLOCA_SIZE)
+ free (is_binary);
+ else
+ FREE_VAR (is_binary);
+ return -2;
+ }
+ wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
+ mbs_offset2, is_binary);
+ wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
+ if (size2 > MAX_ALLOCA_SIZE)
+ free (is_binary);
+ else
+ FREE_VAR (is_binary);
+ }
+#endif /* WCHAR */
+
/* Loop through the string, looking for a place to start matching. */
for (;;)
{
@@ -3620,9 +4880,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
}
else /* Searching backwards. */
{
- register char c = (size1 == 0 || startpos >= size1
- ? string2[startpos - size1]
- : string1[startpos]);
+ register CHAR_T c = (size1 == 0 || startpos >= size1
+ ? string2[startpos - size1]
+ : string1[startpos]);
if (!fastmap[(unsigned char) TRANSLATE (c)])
goto advance;
@@ -3632,10 +4892,24 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
/* If can't match the null string, and that's all we have left, fail. */
if (range >= 0 && startpos == total_size && fastmap
&& !bufp->can_be_null)
- return -1;
+ {
+#ifdef WCHAR
+ FREE_WCS_BUFFERS ();
+#endif
+ return -1;
+ }
+
+#ifdef WCHAR
+ val = wcs_re_match_2_internal (bufp, string1, size1, string2,
+ size2, startpos, regs, stop,
+ wcs_string1, wcs_size1,
+ wcs_string2, wcs_size2,
+ mbs_offset1, mbs_offset2);
+#else /* BYTE */
+ val = byte_re_match_2_internal (bufp, string1, size1, string2,
+ size2, startpos, regs, stop);
+#endif /* BYTE */
- val = re_match_2_internal (bufp, string1, size1, string2, size2,
- startpos, regs, stop);
#ifndef REGEX_MALLOC
# ifdef C_ALLOCA
alloca (0);
@@ -3643,10 +4917,20 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
#endif
if (val >= 0)
- return startpos;
+ {
+#ifdef WCHAR
+ FREE_WCS_BUFFERS ();
+#endif
+ return startpos;
+ }
if (val == -2)
- return -2;
+ {
+#ifdef WCHAR
+ FREE_WCS_BUFFERS ();
+#endif
+ return -2;
+ }
advance:
if (!range)
@@ -3662,18 +4946,31 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
startpos--;
}
}
+#ifdef WCHAR
+ FREE_WCS_BUFFERS ();
+#endif
return -1;
+
} /* re_search_2 */
-#ifdef _LIBC
-weak_alias (__re_search_2, re_search_2)
-#endif
-
+
+#ifdef WCHAR
+/* This converts PTR, a pointer into one of the search wchar_t strings
+ `string1' and `string2' into an multibyte string offset from the
+ beginning of that string. We use mbs_offset to optimize.
+ See convert_mbs_to_wcs. */
+# define POINTER_TO_OFFSET(ptr) \
+ (FIRST_STRING_P (ptr) \
+ ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
+ : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
+ + csize1)))
+#else /* BYTE */
/* This converts PTR, a pointer into one of the search strings `string1'
and `string2' into an offset from the beginning of that string. */
#define POINTER_TO_OFFSET(ptr) \
(FIRST_STRING_P (ptr) \
? ((regoff_t) ((ptr) - string1)) \
: ((regoff_t) ((ptr) - string2 + size1)))
+#endif /* WCHAR */
/* Macros for dealing with the split strings in re_match_2. */
@@ -3703,10 +5000,19 @@ weak_alias (__re_search_2, re_search_2)
two special cases to check for: if past the end of string1, look at
the first character in string2; and if before the beginning of
string2, look at the last character in string1. */
+#ifdef WCHAR
+/* Use internationalized API instead of SYNTAX. */
+# define WORDCHAR_P(d) \
+ (iswalnum ((wint_t)((d) == end1 ? *string2 \
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
+ || ((d) == end1 ? *string2 \
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
+#else /* BYTE */
#define WORDCHAR_P(d) \
(SYNTAX ((d) == end1 ? *string2 \
: (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
== Sword)
+#endif /* WCHAR */
/* Disabled due to a compiler bug -- see comment at case wordbound */
#if 0
@@ -3719,7 +5025,7 @@ weak_alias (__re_search_2, re_search_2)
/* Free everything we malloc. */
#ifdef MATCH_MAY_ALLOCATE
-# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
+# ifdef WCHAR
# define FREE_VARIABLES() \
do { \
REGEX_FREE_STACK (fail_stack.stack); \
@@ -3732,9 +5038,44 @@ weak_alias (__re_search_2, re_search_2)
FREE_VAR (reg_info); \
FREE_VAR (reg_dummy); \
FREE_VAR (reg_info_dummy); \
+ if (!cant_free_wcs_buf) \
+ { \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
+ } \
+ } while (0)
+# else /* BYTE */
+# define FREE_VARIABLES() \
+ do { \
+ REGEX_FREE_STACK (fail_stack.stack); \
+ FREE_VAR (regstart); \
+ FREE_VAR (regend); \
+ FREE_VAR (old_regstart); \
+ FREE_VAR (old_regend); \
+ FREE_VAR (best_regstart); \
+ FREE_VAR (best_regend); \
+ FREE_VAR (reg_info); \
+ FREE_VAR (reg_dummy); \
+ FREE_VAR (reg_info_dummy); \
} while (0)
+# endif /* WCHAR */
#else
+# ifdef WCHAR
+# define FREE_VARIABLES() \
+ do { \
+ if (!cant_free_wcs_buf) \
+ { \
+ FREE_VAR (string1); \
+ FREE_VAR (string2); \
+ FREE_VAR (mbs_offset1); \
+ FREE_VAR (mbs_offset2); \
+ } \
+ } while (0)
+# else /* BYTE */
# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
+# endif /* WCHAR */
#endif /* not MATCH_MAY_ALLOCATE */
/* These values must meet several constraints. They must not be valid
@@ -3746,6 +5087,7 @@ weak_alias (__re_search_2, re_search_2)
to actually save any registers when none are active. */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
+#else /* not INSIDE_RECURSION */
/* Matching routines. */
@@ -3759,7 +5101,15 @@ re_match (bufp, string, size, pos, regs)
int size, pos;
struct re_registers *regs;
{
- int result = re_match_2_internal (bufp, NULL, 0, string, size,
+ int result;
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ result = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
+ pos, regs, size,
+ NULL, 0, NULL, 0, NULL, NULL);
+ else
+# endif
+ result = byte_re_match_2_internal (bufp, NULL, 0, string, size,
pos, regs, size);
# ifndef REGEX_MALLOC
# ifdef C_ALLOCA
@@ -3773,17 +5123,21 @@ weak_alias (__re_match, re_match)
# endif
#endif /* not emacs */
-static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
- register_info_type *reg_info));
-static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
- unsigned char *end,
- register_info_type *reg_info));
-static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
- unsigned char *end,
- register_info_type *reg_info));
-static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
+#endif /* not INSIDE_RECURSION */
+
+#ifdef INSIDE_RECURSION
+static boolean PREFIX(group_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
+ UCHAR_T *end,
+ PREFIX(register_info_type) *reg_info));
+static boolean PREFIX(alt_match_null_string_p) _RE_ARGS ((UCHAR_T *p,
+ UCHAR_T *end,
+ PREFIX(register_info_type) *reg_info));
+static boolean PREFIX(common_op_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
+ UCHAR_T *end,
+ PREFIX(register_info_type) *reg_info));
+static int PREFIX(bcmp_translate) _RE_ARGS ((const CHAR_T *s1, const CHAR_T *s2,
int len, char *translate));
+#else /* not INSIDE_RECURSION */
/* re_match_2 matches the compiled pattern in BUFP against the
the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
@@ -3807,8 +5161,16 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
struct re_registers *regs;
int stop;
{
- int result = re_match_2_internal (bufp, string1, size1, string2, size2,
- pos, regs, stop);
+ int result;
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ result = wcs_re_match_2_internal (bufp, string1, size1, string2, size2,
+ pos, regs, stop,
+ NULL, 0, NULL, 0, NULL, NULL);
+ else
+# endif
+ result = byte_re_match_2_internal (bufp, string1, size1, string2, size2,
+ pos, regs, stop);
#ifndef REGEX_MALLOC
# ifdef C_ALLOCA
alloca (0);
@@ -3820,38 +5182,123 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
weak_alias (__re_match_2, re_match_2)
#endif
+#endif /* not INSIDE_RECURSION */
+
+#ifdef INSIDE_RECURSION
+
+#ifdef WCHAR
+static int count_mbs_length PARAMS ((int *, int));
+
+/* This check the substring (from 0, to length) of the multibyte string,
+ to which offset_buffer correspond. And count how many wchar_t_characters
+ the substring occupy. We use offset_buffer to optimization.
+ See convert_mbs_to_wcs. */
+
+static int
+count_mbs_length(offset_buffer, length)
+ int *offset_buffer;
+ int length;
+{
+ int upper, lower;
+
+ /* Check whether the size is valid. */
+ if (length < 0)
+ return -1;
+
+ if (offset_buffer == NULL)
+ return 0;
+
+ /* If there are no multibyte character, offset_buffer[i] == i.
+ Optmize for this case. */
+ if (offset_buffer[length] == length)
+ return length;
+
+ /* Set up upper with length. (because for all i, offset_buffer[i] >= i) */
+ upper = length;
+ lower = 0;
+
+ while (true)
+ {
+ int middle = (lower + upper) / 2;
+ if (middle == lower || middle == upper)
+ break;
+ if (offset_buffer[middle] > length)
+ upper = middle;
+ else if (offset_buffer[middle] < length)
+ lower = middle;
+ else
+ return middle;
+ }
+
+ return -1;
+}
+#endif /* WCHAR */
+
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
+#ifdef WCHAR
+static int
+wcs_re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos,
+ regs, stop, string1, size1, string2, size2,
+ mbs_offset1, mbs_offset2)
+ struct re_pattern_buffer *bufp;
+ const char *cstring1, *cstring2;
+ int csize1, csize2;
+ int pos;
+ struct re_registers *regs;
+ int stop;
+ /* string1 == string2 == NULL means string1/2, size1/2 and
+ mbs_offset1/2 need seting up in this function. */
+ /* We need wchar_t* buffers correspond to cstring1, cstring2. */
+ wchar_t *string1, *string2;
+ /* We need the size of wchar_t buffers correspond to csize1, csize2. */
+ int size1, size2;
+ /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
+ int *mbs_offset1, *mbs_offset2;
+#else /* BYTE */
static int
-re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
+byte_re_match_2_internal (bufp, string1, size1,string2, size2, pos,
+ regs, stop)
struct re_pattern_buffer *bufp;
const char *string1, *string2;
int size1, size2;
int pos;
struct re_registers *regs;
int stop;
+#endif /* BYTE */
{
/* General temporaries. */
int mcnt;
- unsigned char *p1;
+ UCHAR_T *p1;
+#ifdef WCHAR
+ /* They hold whether each wchar_t is binary data or not. */
+ char *is_binary = NULL;
+ /* If true, we can't free string1/2, mbs_offset1/2. */
+ int cant_free_wcs_buf = 1;
+#endif /* WCHAR */
/* Just past the end of the corresponding string. */
- const char *end1, *end2;
+ const CHAR_T *end1, *end2;
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
- const char *end_match_1, *end_match_2;
+ const CHAR_T *end_match_1, *end_match_2;
/* Where we are in the data, and the end of the current string. */
- const char *d, *dend;
+ const CHAR_T *d, *dend;
/* Where we are in the pattern, and the end of the pattern. */
- unsigned char *p = bufp->buffer;
- register unsigned char *pend = p + bufp->used;
+#ifdef WCHAR
+ UCHAR_T *pattern, *p;
+ register UCHAR_T *pend;
+#else /* BYTE */
+ UCHAR_T *p = bufp->buffer;
+ register UCHAR_T *pend = p + bufp->used;
+#endif /* WCHAR */
/* Mark the opcode just after a start_memory, so we can test for an
empty subpattern when we get to the stop_memory. */
- unsigned char *just_past_start_mem = 0;
+ UCHAR_T *just_past_start_mem = 0;
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
@@ -3866,7 +5313,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
a ``dummy''; if a failure happens and the failure point is a dummy,
it gets discarded and the next next one is tried. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
- fail_stack_type fail_stack;
+ PREFIX(fail_stack_type) fail_stack;
#endif
#ifdef DEBUG
static unsigned failure_id;
@@ -3896,7 +5343,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **regstart, **regend;
+ const CHAR_T **regstart, **regend;
#endif
/* If a group that's operated upon by a repetition operator fails to
@@ -3905,7 +5352,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
are when we last see its open-group operator. Similarly for a
register's end. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **old_regstart, **old_regend;
+ const CHAR_T **old_regstart, **old_regend;
#endif
/* The is_active field of reg_info helps us keep track of which (possibly
@@ -3915,7 +5362,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
subexpression. These two fields get reset each time through any
loop their register is in. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
- register_info_type *reg_info;
+ PREFIX(register_info_type) *reg_info;
#endif
/* The following record the register info as found in the above
@@ -3924,7 +5371,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = false;
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **best_regstart, **best_regend;
+ const CHAR_T **best_regstart, **best_regend;
#endif
/* Logically, this is `best_regend[0]'. But we don't want to have to
@@ -3935,15 +5382,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
the end of the best match so far in a separate variable. We
initialize this to NULL so that when we backtrack the first time
and need to test it, it's not garbage. */
- const char *match_end = NULL;
+ const CHAR_T *match_end = NULL;
/* This helps SET_REGS_MATCHED avoid doing redundant work. */
int set_regs_matched_done = 0;
/* Used when we pop values we don't care about. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
- const char **reg_dummy;
- register_info_type *reg_info_dummy;
+ const CHAR_T **reg_dummy;
+ PREFIX(register_info_type) *reg_info_dummy;
#endif
#ifdef DEBUG
@@ -3963,15 +5410,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
array indexing. We should fix this. */
if (bufp->re_nsub)
{
- regstart = REGEX_TALLOC (num_regs, const char *);
- regend = REGEX_TALLOC (num_regs, const char *);
- old_regstart = REGEX_TALLOC (num_regs, const char *);
- old_regend = REGEX_TALLOC (num_regs, const char *);
- best_regstart = REGEX_TALLOC (num_regs, const char *);
- best_regend = REGEX_TALLOC (num_regs, const char *);
- reg_info = REGEX_TALLOC (num_regs, register_info_type);
- reg_dummy = REGEX_TALLOC (num_regs, const char *);
- reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
+ regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
+ regend = REGEX_TALLOC (num_regs, const CHAR_T *);
+ old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
+ old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
+ best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
+ best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
+ reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
+ reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
+ reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
if (!(regstart && regend && old_regstart && old_regend && reg_info
&& best_regstart && best_regend && reg_dummy && reg_info_dummy))
@@ -3986,17 +5433,72 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
`FREE_VARIABLES' doesn't try to free them. */
regstart = regend = old_regstart = old_regend = best_regstart
= best_regend = reg_dummy = NULL;
- reg_info = reg_info_dummy = (register_info_type *) NULL;
+ reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
}
#endif /* MATCH_MAY_ALLOCATE */
/* The starting position is bogus. */
+#ifdef WCHAR
+ if (pos < 0 || pos > csize1 + csize2)
+#else /* BYTE */
if (pos < 0 || pos > size1 + size2)
+#endif
{
FREE_VARIABLES ();
return -1;
}
+#ifdef WCHAR
+ /* Allocate wchar_t array for string1 and string2 and
+ fill them with converted string. */
+ if (string1 == NULL && string2 == NULL)
+ {
+ /* We need seting up buffers here. */
+
+ /* We must free wcs buffers in this function. */
+ cant_free_wcs_buf = 0;
+
+ if (csize1 != 0)
+ {
+ string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
+ mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
+ is_binary = REGEX_TALLOC (csize1 + 1, char);
+ if (!string1 || !mbs_offset1 || !is_binary)
+ {
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (is_binary);
+ return -2;
+ }
+ }
+ if (csize2 != 0)
+ {
+ string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
+ mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
+ is_binary = REGEX_TALLOC (csize2 + 1, char);
+ if (!string2 || !mbs_offset2 || !is_binary)
+ {
+ FREE_VAR (string1);
+ FREE_VAR (mbs_offset1);
+ FREE_VAR (string2);
+ FREE_VAR (mbs_offset2);
+ FREE_VAR (is_binary);
+ return -2;
+ }
+ size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
+ mbs_offset2, is_binary);
+ string2[size2] = L'\0'; /* for a sentinel */
+ FREE_VAR (is_binary);
+ }
+ }
+
+ /* We need to cast pattern to (wchar_t*), because we casted this compiled
+ pattern to (char*) in regex_compile. */
+ p = pattern = (CHAR_T*)bufp->buffer;
+ pend = (CHAR_T*)(bufp->buffer + bufp->used);
+
+#endif /* WCHAR */
+
/* Initialize subexpression text positions to -1 to mark ones that no
start_memory/stop_memory has been seen for. Also initialize the
register information struct. */
@@ -4019,11 +5521,38 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
size2 = size1;
string1 = 0;
size1 = 0;
+#ifdef WCHAR
+ mbs_offset2 = mbs_offset1;
+ csize2 = csize1;
+ mbs_offset1 = NULL;
+ csize1 = 0;
+#endif
}
end1 = string1 + size1;
end2 = string2 + size2;
/* Compute where to stop matching, within the two strings. */
+#ifdef WCHAR
+ if (stop <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, stop);
+ end_match_1 = string1 + mcnt;
+ end_match_2 = string2;
+ }
+ else
+ {
+ if (stop > csize1 + csize2)
+ stop = csize1 + csize2;
+ end_match_1 = end1;
+ mcnt = count_mbs_length(mbs_offset2, stop-csize1);
+ end_match_2 = string2 + mcnt;
+ }
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (stop <= size1)
{
end_match_1 = string1 + stop;
@@ -4034,6 +5563,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
end_match_1 = end1;
end_match_2 = string2 + stop - size1;
}
+#endif /* WCHAR */
/* `p' scans through the pattern as `d' scans through the data.
`dend' is the end of the input string that `d' points within. `d'
@@ -4041,6 +5571,26 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
this happens before fetching; therefore, at the beginning of the
loop, `d' can be pointing at the end of a string, but it cannot
equal `string2'. */
+#ifdef WCHAR
+ if (size1 > 0 && pos <= csize1)
+ {
+ mcnt = count_mbs_length(mbs_offset1, pos);
+ d = string1 + mcnt;
+ dend = end_match_1;
+ }
+ else
+ {
+ mcnt = count_mbs_length(mbs_offset2, pos-csize1);
+ d = string2 + mcnt;
+ dend = end_match_2;
+ }
+
+ if (mcnt < 0)
+ { /* count_mbs_length return error. */
+ FREE_VARIABLES ();
+ return -1;
+ }
+#else
if (size1 > 0 && pos <= size1)
{
d = string1 + pos;
@@ -4051,6 +5601,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
d = string2 + pos - size1;
dend = end_match_2;
}
+#endif /* WCHAR */
DEBUG_PRINT1 ("The compiled pattern is:\n");
DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
@@ -4188,9 +5739,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (regs->num_regs > 0)
{
regs->start[0] = pos;
+#ifdef WCHAR
+ if (MATCHING_IN_FIRST_STRING)
+ regs->end[0] = mbs_offset1 != NULL ?
+ mbs_offset1[d-string1] : 0;
+ else
+ regs->end[0] = csize1 + (mbs_offset2 != NULL ?
+ mbs_offset2[d-string2] : 0);
+#else
regs->end[0] = (MATCHING_IN_FIRST_STRING
? ((regoff_t) (d - string1))
: ((regoff_t) (d - string2 + size1)));
+#endif /* WCHAR */
}
/* Go through the first `min (num_regs, regs->num_regs)'
@@ -4223,9 +5783,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
nfailure_points_pushed - nfailure_points_popped);
DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
+#ifdef WCHAR
+ if (MATCHING_IN_FIRST_STRING)
+ mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
+ else
+ mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
+ csize1;
+ mcnt -= pos;
+#else
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
? string1
: string2 - size1);
+#endif /* WCHAR */
DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
@@ -4250,6 +5819,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
byte in the pattern defines n, and the n bytes after that
are the characters to match. */
case exactn:
+#ifdef MBS_SUPPORT
+ case exactn_bin:
+#endif
mcnt = *p++;
DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
@@ -4260,9 +5832,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if ((unsigned char) translate[(unsigned char) *d++]
- != (unsigned char) *p++)
+#ifdef WCHAR
+ if (*d <= 0xff)
+ {
+ if ((UCHAR_T) translate[(unsigned char) *d++]
+ != (UCHAR_T) *p++)
+ goto fail;
+ }
+ else
+ {
+ if (*d++ != (CHAR_T) *p++)
+ goto fail;
+ }
+#else
+ if ((UCHAR_T) translate[(unsigned char) *d++]
+ != (UCHAR_T) *p++)
goto fail;
+#endif /* WCHAR */
}
while (--mcnt);
}
@@ -4271,7 +5857,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
do
{
PREFETCH ();
- if (*d++ != (char) *p++) goto fail;
+ if (*d++ != (CHAR_T) *p++) goto fail;
}
while (--mcnt);
}
@@ -4298,7 +5884,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case charset:
case charset_not:
{
- register unsigned char c;
+#ifdef WCHAR
+ unsigned int i, char_class_length, coll_symbol_length,
+ equiv_class_length, ranges_length, chars_length, length;
+ CHAR_T *workp, *workp2, *charset_top;
+# define WORK_BUFFER_SIZE 128
+ CHAR_T str_buf[WORK_BUFFER_SIZE];
+#endif /* WCHAR */
+ register UCHAR_T c;
boolean not = (re_opcode_t) *(p - 1) == charset_not;
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
@@ -4306,6 +5899,199 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
PREFETCH ();
c = TRANSLATE (*d); /* The character to match. */
+#ifdef WCHAR
+ charset_top = p - 1;
+ char_class_length = *p++;
+ coll_symbol_length = *p++;
+ equiv_class_length = *p++;
+ ranges_length = *p++;
+ chars_length = *p++;
+ /* p points charset[6], so the address of the next instruction
+ (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
+ where l=length of char_classes, m=length of collating_symbol,
+ n=equivalence_class, o=length of char_range,
+ p'=length of character. */
+ workp = p;
+ /* Update p to indicate the next instruction. */
+ p += char_class_length + coll_symbol_length+ equiv_class_length +
+ 2*ranges_length + chars_length;
+
+ /* match with char_class? */
+ for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
+ {
+ wctype_t wctype;
+ unsigned long int alignedp = ((unsigned long int)workp
+ + __alignof__(wctype_t) - 1)
+ & ~(unsigned long int)(__alignof__(wctype_t) - 1);
+ wctype = *((wctype_t*)alignedp);
+ workp += CHAR_CLASS_SIZE;
+ if (iswctype((wint_t)c, wctype))
+ goto char_set_matched;
+ }
+
+ /* match with collating_symbol? */
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
+ {
+ const CHAR_T *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ continue;
+ }
+
+ /* First, we compare the collating symbol with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+ /* match with equivalence_class? */
+ /* If we can't look up collation data, we use wcscoll
+ instead. */
+ for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
+ {
+ const CHAR_T *backup_d = d, *backup_dend = dend;
+ length = wcslen(workp);
+
+ /* If wcscoll(the collating symbol, whole string) > 0,
+ any substring of the string never match with the
+ collating symbol. */
+ if (wcscoll(workp, d) > 0)
+ {
+ workp += length + 1;
+ break;
+ }
+
+ /* First, we compare the equivalence class with
+ the first character of the string.
+ If it don't match, we add the next character to
+ the compare buffer in turn. */
+ for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
+ {
+ int match;
+ if (d == dend)
+ {
+ if (dend == end_match_2)
+ break;
+ d = string2;
+ dend = end_match_2;
+ }
+
+ /* add next character to the compare buffer. */
+ str_buf[i] = TRANSLATE(*d);
+ str_buf[i+1] = '\0';
+
+ match = wcscoll(workp, str_buf);
+
+ if (match == 0)
+ goto char_set_matched;
+
+ if (match < 0)
+ /* (str_buf > workp) indicate (str_buf + X > workp),
+ because for all X (str_buf + X > str_buf).
+ So we don't need continue this loop. */
+ break;
+
+ /* Otherwise(str_buf < workp),
+ (str_buf+next_character) may equals (workp).
+ So we continue this loop. */
+ }
+ /* not matched */
+ d = backup_d;
+ dend = backup_dend;
+ workp += length + 1;
+ }
+
+ /* match with char_range? */
+ /* We set range_start_char at str_buf[0], range_end_char
+ at str_buf[4], and compared char at str_buf[2]. */
+ str_buf[1] = 0;
+ str_buf[2] = c;
+ str_buf[3] = 0;
+ str_buf[5] = 0;
+ for (; workp < p - chars_length ;)
+ {
+ wchar_t *range_start_char, *range_end_char;
+
+ /* match if (range_start_char <= c <= range_end_char). */
+
+ /* If range_start(or end) < 0, we assume -range_start(end)
+ is the offset of the collating symbol which is specified
+ as the character of the range start(end). */
+
+ /* range_start */
+ if (*workp < 0)
+ range_start_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[0] = *workp++;
+ range_start_char = str_buf;
+ }
+
+ /* range_end */
+ if (*workp < 0)
+ range_end_char = charset_top - (*workp++);
+ else
+ {
+ str_buf[4] = *workp++;
+ range_end_char = str_buf + 4;
+ }
+
+ if (wcscoll(range_start_char, str_buf+2) <= 0 &&
+ wcscoll(str_buf+2, range_end_char) <= 0)
+
+ goto char_set_matched;
+ }
+
+ /* match with char? */
+ for (; workp < p ; workp++)
+ if (c == *workp)
+ goto char_set_matched;
+
+ not = !not;
+
+ char_set_matched:
+ if (not) goto fail;
+#else
/* Cast to `unsigned' instead of `unsigned char' in case the
bit list is a full 32 bytes long. */
if (c < (unsigned) (*p * BYTEWIDTH)
@@ -4315,7 +6101,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
p += 1 + *p;
if (!not) goto fail;
-
+#undef WORK_BUFFER_SIZE
+#endif /* WCHAR */
SET_REGS_MATCHED ();
d++;
break;
@@ -4335,7 +6122,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
REG_MATCH_NULL_STRING_P (reg_info[*p])
- = group_match_null_string_p (&p1, pend, reg_info);
+ = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
/* Save the position in the string where we were the last time
we were at this open-group operator in case the group is
@@ -4410,7 +6197,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
it isn't necessarily one less than now: consider
(a(b)c(d(e)f)g). When group 3 ends, after the f), the
new highest active register is 1. */
- unsigned char r = *p - 1;
+ UCHAR_T r = *p - 1;
while (r > 0 && !IS_ACTIVE (reg_info[r]))
r--;
@@ -4453,7 +6240,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case dummy_failure_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (is_a_jump_n)
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
break;
default:
@@ -4467,7 +6254,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
by forcing a failure after pushing on the stack the
on_failure_jump's jump in the pattern, and d. */
if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
- && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
+ && (re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == start_memory
+ && p1[2 + OFFSET_ADDRESS_SIZE] == *p)
{
/* If this group ever matched anything, then restore
what its registers were before trying this last
@@ -4513,7 +6301,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
followed by the numeric value of <digit> as the register number. */
case duplicate:
{
- register const char *d2, *dend2;
+ register const CHAR_T *d2, *dend2;
int regno = *p++; /* Get which register to match against. */
DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
@@ -4562,8 +6350,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Compare that many; failure if mismatch, else move
past them. */
if (translate
- ? bcmp_translate (d, d2, mcnt, translate)
- : memcmp (d, d2, mcnt))
+ ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
+ : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
goto fail;
d += mcnt, d2 += mcnt;
@@ -4719,7 +6507,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
{
- register unsigned char *p2 = p;
+ register UCHAR_T *p2 = p;
/* Compare the beginning of the repeat with what in the
pattern follows its end. If we can establish that there
@@ -4744,9 +6532,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
&& ((re_opcode_t) *p2 == stop_memory
|| (re_opcode_t) *p2 == start_memory))
p2 += 3;
- else if (p2 + 6 < pend
+ else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
&& (re_opcode_t) *p2 == dummy_failure_jump)
- p2 += 6;
+ p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
else
break;
}
@@ -4762,24 +6550,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Consider what happens when matching ":\(.*\)"
against ":/". I don't really understand this code
yet. */
- p[-3] = (unsigned char) pop_failure_jump;
+ p[-(1 + OFFSET_ADDRESS_SIZE)] = (UCHAR_T) pop_failure_jump;
DEBUG_PRINT1
(" End of pattern: change to `pop_failure_jump'.\n");
}
else if ((re_opcode_t) *p2 == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) *p2 == exactn_bin
+#endif
|| (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
{
- register unsigned char c
- = *p2 == (unsigned char) endline ? '\n' : p2[2];
+ register UCHAR_T c
+ = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
- if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
+ if (((re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == exactn
+#ifdef MBS_SUPPORT
+ || (re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == exactn_bin
+#endif
+ ) && p1[3 + OFFSET_ADDRESS_SIZE] != c)
{
- p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
- c, p1[5]);
+ p[-(1 + OFFSET_ADDRESS_SIZE)] = (UCHAR_T) pop_failure_jump;
+#ifdef WCHAR
+ DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
+ (wint_t) c,
+ (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
+#else
+ DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
+ (char) c,
+ (char) p1[3+OFFSET_ADDRESS_SIZE]);
+#endif
}
+#ifndef WCHAR
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
@@ -4797,7 +6600,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
+#endif /* not WCHAR */
}
+#ifndef WCHAR
else if ((re_opcode_t) *p2 == charset)
{
/* We win if the first character of the loop is not part
@@ -4846,11 +6651,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
}
}
}
+#endif /* not WCHAR */
}
- p -= 2; /* Point at relative address again. */
+ p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
- p[-1] = (unsigned char) jump;
+ p[-1] = (UCHAR_T) jump;
DEBUG_PRINT1 (" Match => jump.\n");
goto unconditional_jump;
}
@@ -4871,8 +6677,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
register from the stack, since lowest will == highest in
`pop_failure_point'. */
active_reg_t dummy_low_reg, dummy_high_reg;
- unsigned char *pdummy;
- const char *sdummy;
+ UCHAR_T *pdummy = NULL;
+ const CHAR_T *sdummy = NULL;
DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
POP_FAILURE_POINT (sdummy, pdummy,
@@ -4937,7 +6743,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Have to succeed matching what follows at least n times.
After that, handle like `on_failure_jump'. */
case succeed_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
assert (mcnt >= 0);
@@ -4945,46 +6751,57 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
if (mcnt > 0)
{
mcnt--;
- p += 2;
+ p += OFFSET_ADDRESS_SIZE;
STORE_NUMBER_AND_INCR (p, mcnt);
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE,
+ mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE,
+ mcnt);
#endif
}
else if (mcnt == 0)
{
#ifdef _LIBC
- DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
+ DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
#else
- DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
+ DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
+ p + OFFSET_ADDRESS_SIZE);
#endif
- p[2] = (unsigned char) no_op;
- p[3] = (unsigned char) no_op;
+
+#ifdef WCHAR
+ p[1] = (UCHAR_T) no_op;
+#else
+ p[2] = (UCHAR_T) no_op;
+ p[3] = (UCHAR_T) no_op;
+#endif /* WCHAR */
goto on_failure;
}
break;
case jump_n:
- EXTRACT_NUMBER (mcnt, p + 2);
+ EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
/* Originally, this is how many times we CAN jump. */
if (mcnt)
{
mcnt--;
- STORE_NUMBER (p + 2, mcnt);
+ STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
#ifdef _LIBC
- DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
+ DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
#else
- DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
+ DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
+ mcnt);
#endif
goto unconditional_jump;
}
/* If don't have to jump any more, skip over the rest of command. */
else
- p += 4;
+ p += 2 * OFFSET_ADDRESS_SIZE;
break;
case set_number_at:
@@ -5215,13 +7032,13 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
We don't handle duplicates properly (yet). */
static boolean
-group_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
- register_info_type *reg_info;
+PREFIX(group_match_null_string_p) (p, end, reg_info)
+ UCHAR_T **p, *end;
+ PREFIX(register_info_type) *reg_info;
{
int mcnt;
/* Point to after the args to the start_memory. */
- unsigned char *p1 = *p + 2;
+ UCHAR_T *p1 = *p + 2;
while (p1 < end)
{
@@ -5259,14 +7076,16 @@ group_match_null_string_p (p, end, reg_info)
with an on_failure_jump (see above) that jumps to right
past a jump_past_alt. */
- while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
+ while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
+ jump_past_alt)
{
/* `mcnt' holds how many bytes long the alternative
is, including the ending `jump_past_alt' and
its number. */
- if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
- reg_info))
+ if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
+ (1 + OFFSET_ADDRESS_SIZE),
+ reg_info))
return false;
/* Move to right after this alternative, including the
@@ -5282,10 +7101,11 @@ group_match_null_string_p (p, end, reg_info)
alternative that starts with an on_failure_jump. */
p1++;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
- if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
+ if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
+ jump_past_alt)
{
/* Get to the beginning of the n-th alternative. */
- p1 -= 3;
+ p1 -= 1 + OFFSET_ADDRESS_SIZE;
break;
}
}
@@ -5293,9 +7113,9 @@ group_match_null_string_p (p, end, reg_info)
/* Deal with the last alternative: go back and get number
of the `jump_past_alt' just before it. `mcnt' contains
the length of the alternative. */
- EXTRACT_NUMBER (mcnt, p1 - 2);
+ EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
- if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
+ if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
return false;
p1 += mcnt; /* Get past the n-th alternative. */
@@ -5310,7 +7130,7 @@ group_match_null_string_p (p, end, reg_info)
default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
+ if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
return false;
}
} /* while p1 < end */
@@ -5324,12 +7144,12 @@ group_match_null_string_p (p, end, reg_info)
byte past the last. The alternative can contain groups. */
static boolean
-alt_match_null_string_p (p, end, reg_info)
- unsigned char *p, *end;
- register_info_type *reg_info;
+PREFIX(alt_match_null_string_p) (p, end, reg_info)
+ UCHAR_T *p, *end;
+ PREFIX(register_info_type) *reg_info;
{
int mcnt;
- unsigned char *p1 = p;
+ UCHAR_T *p1 = p;
while (p1 < end)
{
@@ -5346,7 +7166,7 @@ alt_match_null_string_p (p, end, reg_info)
break;
default:
- if (!common_op_match_null_string_p (&p1, end, reg_info))
+ if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
return false;
}
} /* while p1 < end */
@@ -5361,14 +7181,14 @@ alt_match_null_string_p (p, end, reg_info)
Sets P to one after the op and its arguments, if any. */
static boolean
-common_op_match_null_string_p (p, end, reg_info)
- unsigned char **p, *end;
- register_info_type *reg_info;
+PREFIX(common_op_match_null_string_p) (p, end, reg_info)
+ UCHAR_T **p, *end;
+ PREFIX(register_info_type) *reg_info;
{
int mcnt;
boolean ret;
int reg_no;
- unsigned char *p1 = *p;
+ UCHAR_T *p1 = *p;
switch ((re_opcode_t) *p1++)
{
@@ -5391,7 +7211,7 @@ common_op_match_null_string_p (p, end, reg_info)
case start_memory:
reg_no = *p1;
assert (reg_no > 0 && reg_no <= MAX_REGNUM);
- ret = group_match_null_string_p (&p1, end, reg_info);
+ ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
/* Have to set this here in case we're checking a group which
contains a group and a back reference to it. */
@@ -5414,12 +7234,12 @@ common_op_match_null_string_p (p, end, reg_info)
case succeed_n:
/* Get to the number of times to succeed. */
- p1 += 2;
+ p1 += OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
if (mcnt == 0)
{
- p1 -= 4;
+ p1 -= 2 * OFFSET_ADDRESS_SIZE;
EXTRACT_NUMBER_AND_INCR (mcnt, p1);
p1 += mcnt;
}
@@ -5433,7 +7253,7 @@ common_op_match_null_string_p (p, end, reg_info)
break;
case set_number_at:
- p1 += 4;
+ p1 += 2 * OFFSET_ADDRESS_SIZE;
default:
/* All other opcodes mean we cannot match the empty string. */
@@ -5449,21 +7269,30 @@ common_op_match_null_string_p (p, end, reg_info)
bytes; nonzero otherwise. */
static int
-bcmp_translate (s1, s2, len, translate)
- const char *s1, *s2;
+PREFIX(bcmp_translate) (s1, s2, len, translate)
+ const CHAR_T *s1, *s2;
register int len;
RE_TRANSLATE_TYPE translate;
{
- register const unsigned char *p1 = (const unsigned char *) s1;
- register const unsigned char *p2 = (const unsigned char *) s2;
+ register const UCHAR_T *p1 = (const UCHAR_T *) s1;
+ register const UCHAR_T *p2 = (const UCHAR_T *) s2;
while (len)
{
+#ifdef WCHAR
+ if (((*p1<=0xff)?translate[*p1++]:*p1++)
+ != ((*p2<=0xff)?translate[*p2++]:*p2++))
+ return 1;
+#else /* BYTE */
if (translate[*p1++] != translate[*p2++]) return 1;
+#endif /* WCHAR */
len--;
}
return 0;
}
+
+#else /* not INSIDE_RECURSION */
+
/* Entry points for GNU code. */
/* re_compile_pattern is the GNU regular expression compiler: it
@@ -5495,7 +7324,12 @@ re_compile_pattern (pattern, length, bufp)
/* Match anchors at newline. */
bufp->newline_anchor = 1;
- ret = regex_compile (pattern, length, re_syntax_options, bufp);
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
+ else
+# endif
+ ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
if (!ret)
return NULL;
@@ -5550,7 +7384,12 @@ re_comp (s)
/* Match anchors at newlines. */
re_comp_buf.newline_anchor = 1;
- ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
+ else
+# endif
+ ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
if (!ret)
return NULL;
@@ -5667,7 +7506,12 @@ regcomp (preg, pattern, cflags)
/* POSIX says a null character in the pattern terminates it, so we
can use strlen here in compiling the pattern. */
- ret = regex_compile (pattern, strlen (pattern), syntax, preg);
+# ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1)
+ ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
+ else
+# endif
+ ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
/* POSIX doesn't distinguish between an unmatched open-group and an
unmatched close-group: both are REG_EPAREN. */
@@ -5852,3 +7696,73 @@ weak_alias (__regfree, regfree)
#endif
#endif /* not emacs */
+
+#endif /* not INSIDE_RECURSION */
+
+#undef STORE_NUMBER
+#undef STORE_NUMBER_AND_INCR
+#undef EXTRACT_NUMBER
+#undef EXTRACT_NUMBER_AND_INCR
+
+#undef DEBUG_PRINT_COMPILED_PATTERN
+#undef DEBUG_PRINT_DOUBLE_STRING
+
+#undef INIT_FAIL_STACK
+#undef RESET_FAIL_STACK
+#undef DOUBLE_FAIL_STACK
+#undef PUSH_PATTERN_OP
+#undef PUSH_FAILURE_POINTER
+#undef PUSH_FAILURE_INT
+#undef PUSH_FAILURE_ELT
+#undef POP_FAILURE_POINTER
+#undef POP_FAILURE_INT
+#undef POP_FAILURE_ELT
+#undef DEBUG_PUSH
+#undef DEBUG_POP
+#undef PUSH_FAILURE_POINT
+#undef POP_FAILURE_POINT
+
+#undef REG_UNSET_VALUE
+#undef REG_UNSET
+
+#undef PATFETCH
+#undef PATFETCH_RAW
+#undef PATUNFETCH
+#undef TRANSLATE
+
+#undef INIT_BUF_SIZE
+#undef GET_BUFFER_SPACE
+#undef BUF_PUSH
+#undef BUF_PUSH_2
+#undef BUF_PUSH_3
+#undef STORE_JUMP
+#undef STORE_JUMP2
+#undef INSERT_JUMP
+#undef INSERT_JUMP2
+#undef EXTEND_BUFFER
+#undef GET_UNSIGNED_NUMBER
+#undef FREE_STACK_RETURN
+
+#undef POINTER_TO_OFFSET
+#undef MATCHING_IN_FRST_STRING
+#undef PREFETCH
+#undef AT_STRINGS_BEG
+#undef AT_STRINGS_END
+#undef WORDCHAR_P
+#undef FREE_VAR
+#undef FREE_VARIABLES
+#undef NO_HIGHEST_ACTIVE_REG
+#undef NO_LOWEST_ACTIVE_REG
+
+#undef CHAR_T
+#undef UCHAR_T
+#undef COMPILED_BUFFER_VAR
+#undef OFFSET_ADDRESS_SIZE
+#undef CHAR_CLASS_SIZE
+#undef PREFIX
+#undef ARG_PREFIX
+#undef PUT_CHAR
+#undef BYTE
+#undef WCHAR
+
+#define DEFINED_ONCE