summaryrefslogtreecommitdiff
path: root/posix
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2004-11-18 23:57:34 +0000
committerUlrich Drepper <drepper@redhat.com>2004-11-18 23:57:34 +0000
commitc06a6956a40a878fdbe319e4688196e7c990bf7a (patch)
treefee7b24f6276363e8b1f0e977231b413f9eb2fbf /posix
parent1b1d36792e9d9c4ad9a67ad8bfc1a3be8f2104c1 (diff)
downloadglibc-c06a6956a40a878fdbe319e4688196e7c990bf7a.tar.gz
[BZ #544]
Update. 2004-11-18 Jakub Jelinek <jakub@redhat.com> [BZ #544] * posix/regex.h (RE_NO_SUB): New define. * posix/regex_internal.h (OP_DELETED_SUBEXP): New. (re_dfa_t): Add subexp_map. * posix/regcomp.c (struct subexp_optimize): New type. (optimize_subexps): New routine. (re_compile_internal): Call it. (re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB. (free_dfa_content): Free subexp_map. (calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP nodes. * posix/regexec.c (re_search_internal): If subexp_map is not NULL, duplicate registers as needed. * posix/Makefile: Add rules to build and run tst-regex2. * posix/tst-regex2.c: New test. * posix/rxspencer/tests: Fix last two tests (\0 -> \1). Add some new tests for nested subexpressions.
Diffstat (limited to 'posix')
-rw-r--r--posix/Makefile5
-rw-r--r--posix/regcomp.c105
-rw-r--r--posix/regex.h4
-rw-r--r--posix/regex_internal.h2
-rw-r--r--posix/regexec.c12
-rw-r--r--posix/rxspencer/tests20
-rw-r--r--posix/tst-regex2.c244
7 files changed, 387 insertions, 5 deletions
diff --git a/posix/Makefile b/posix/Makefile
index 744f49f5b3..c89cfc0b41 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -80,7 +80,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
- tst-nice tst-nanosleep \
+ tst-nice tst-nanosleep tst-regex2 \
transbug tst-rxspencer tst-pcre tst-boost \
bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
tst-getaddrinfo2 bug-glob1 bug-glob2
@@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex-ENV = LOCPATH=$(common-objpfx)localedata
+tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
@@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf
ifeq (yes,$(build-shared))
$(objpfx)tst-regex: $(common-objpfx)rt/librt.so
+$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so
else
$(objpfx)tst-regex: $(common-objpfx)rt/librt.a
+$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a
endif
$(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out
diff --git a/posix/regcomp.c b/posix/regcomp.c
index ba7a1cc5d4..dafad9bd0c 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa);
#endif
+struct subexp_optimize
+{
+ re_dfa_t *dfa;
+ re_token_t *nodes;
+ int no_sub, re_nsub;
+};
+static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
+ bin_tree_t *node, int sidx, int depth);
static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp)
/* And GNU code determines whether or not to get register information
by passing null for the REGS argument to re_match, etc., not by
- setting no_sub. */
- bufp->no_sub = 0;
+ setting no_sub, unless RE_NO_SUB is set. */
+ bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
/* Match anchors at newline. */
bufp->newline_anchor = 1;
@@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa)
if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif
+ re_free (dfa->subexp_map);
#ifdef DEBUG
re_free (dfa->re_str);
#endif
@@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax)
optimize_utf8 (dfa);
#endif
+ if (preg->re_nsub > 0)
+ {
+ struct subexp_optimize so;
+
+ so.dfa = dfa;
+ so.nodes = dfa->nodes;
+ so.no_sub = preg->no_sub;
+ so.re_nsub = preg->re_nsub;
+ dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
+ }
+
/* Analyze the tree and collect information which is necessary to
create the dfa. */
err = analyze (dfa);
@@ -1121,6 +1141,82 @@ optimize_utf8 (dfa)
}
#endif
+static bin_tree_t *
+optimize_subexps (so, node, sidx, depth)
+ struct subexp_optimize *so;
+ bin_tree_t *node;
+ int sidx, depth;
+{
+ int idx, new_depth, new_sidx;
+ bin_tree_t *ret;
+ if (node == NULL)
+ return NULL;
+
+ new_depth = 0;
+ new_sidx = sidx;
+ if ((depth & 1) && node->type == CONCAT
+ && node->right && node->right->type == 0
+ && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+ {
+ new_depth = depth + 1;
+ if (new_depth == 2
+ || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+ && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
+ new_sidx = so->nodes[idx].opr.idx;
+ }
+ node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
+ new_depth = (depth & 1) == 0 && node->type == CONCAT
+ && node->left && node->left->type == 0
+ && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
+ ? depth + 1 : 0;
+ node->right = optimize_subexps (so, node->right, sidx, new_depth);
+
+ if (node->type != CONCAT)
+ return node;
+ if ((depth & 1) == 0
+ && node->left
+ && node->left->type == 0
+ && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
+ ret = node->right;
+ else if ((depth & 1)
+ && node->right
+ && node->right->type == 0
+ && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+ ret = node->left;
+ else
+ return node;
+
+ if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+ && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
+ return node;
+
+ if (!so->no_sub)
+ {
+ int i;
+
+ if (depth < 2)
+ return node;
+
+ if (so->dfa->subexp_map == NULL)
+ {
+ so->dfa->subexp_map = re_malloc (int, so->re_nsub);
+ if (so->dfa->subexp_map == NULL)
+ return node;
+
+ for (i = 0; i < so->re_nsub; i++)
+ so->dfa->subexp_map[i] = i;
+ }
+
+ i = so->nodes[idx].opr.idx;
+ assert (sidx < i);
+ so->dfa->subexp_map[i] = sidx;
+ }
+
+ so->nodes[idx].type = OP_DELETED_SUBEXP;
+ ret->parent = node->parent;
+ return ret;
+}
+
/* Analyze the structure tree, and calculate "first", "next", "edest",
"eclosure", and "inveclosure". */
@@ -1525,6 +1621,8 @@ calc_inveclosure (dfa)
int src, idx, dest;
for (src = 0; src < dfa->nodes_len; ++src)
{
+ if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
+ continue;
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
{
dest = dfa->eclosures[src].elems[idx];
@@ -1560,6 +1658,9 @@ calc_eclosure (dfa)
#ifdef DEBUG
assert (dfa->eclosures[node_idx].nelem != -1);
#endif
+ if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
+ continue;
+
/* If we have already calculated, skip it. */
if (dfa->eclosures[node_idx].nelem != 0)
continue;
diff --git a/posix/regex.h b/posix/regex.h
index 4279dbcc5b..b2d9a62fec 100644
--- a/posix/regex.h
+++ b/posix/regex.h
@@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t;
immediately after an alternation or begin-group operator. */
#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
+/* If this bit is set, then no_sub will be set to 1 during
+ re_compile_pattern. */
+#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 14d95a5b84..a778032d77 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -189,6 +189,7 @@ typedef enum
OP_DUP_PLUS = EPSILON_BIT | 4,
OP_DUP_QUESTION = EPSILON_BIT | 5,
ANCHOR = EPSILON_BIT | 6,
+ OP_DELETED_SUBEXP = EPSILON_BIT | 7,
/* Tree type, these are used only by tree. */
CONCAT = 16,
@@ -644,6 +645,7 @@ struct re_dfa_t
int mb_cur_max;
bitset word_char;
reg_syntax_t syntax;
+ int *subexp_map;
#ifdef DEBUG
char* re_str;
#endif
diff --git a/posix/regexec.c b/posix/regexec.c
index a03df2636a..5877adeb55 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -882,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
pmatch[reg_idx].rm_so += match_first;
pmatch[reg_idx].rm_eo += match_first;
}
+
+ if (dfa->subexp_map)
+ for (reg_idx = 0;
+ reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
+ reg_idx++)
+ if (dfa->subexp_map[reg_idx] != reg_idx)
+ {
+ pmatch[reg_idx + 1].rm_so
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
+ pmatch[reg_idx + 1].rm_eo
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
+ }
}
free_return:
diff --git a/posix/rxspencer/tests b/posix/rxspencer/tests
index 30fff15946..a724252d8c 100644
--- a/posix/rxspencer/tests
+++ b/posix/rxspencer/tests
@@ -508,5 +508,21 @@ a*a*a*a*a*a*a* & aaaaaa aaaaaa
(\b){0} - x @x -
\(\b\)\{0,0\} b abc @abc -
a(\b){0}c - ac ac -
-a(.*)b(\0){0}c - abc abc @bc,-
-a(.*)b(\0){0}c - axbc axbc x,-
+a(.*)b(\1){0}c - abc abc @bc,-
+a(.*)b(\1){0}c - axbc axbc x,-
+
+a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb
+a\(\([bc]\)\)\2d b abcdabbd abbd b,b
+a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c
+a(b)(c)d - abcd abcd b,c
+a(((b)))c - abc abc b,b,b
+a(((b|(((c))))))d - abd abd b,b,b,-,-,-
+a(((b*|c|e)))d - abbd abbd bb,bb,bb
+a((b|c)){0,0}d - ad ad -,-
+a((b|c)){0,1}d - abd abd b,b
+a((b|c)){0,2}d - abcd abcd c,c
+a((b+|((c)*)))+d - abd abd b,b,-,-
+a((b+|((c)*)))+d - abcd abcd c,c,c,c
+(((\b))){0} - x @x -,-,-
+a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,-
+a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,-
diff --git a/posix/tst-regex2.c b/posix/tst-regex2.c
new file mode 100644
index 0000000000..a107f73ba4
--- /dev/null
+++ b/posix/tst-regex2.c
@@ -0,0 +1,244 @@
+#include <fcntl.h>
+#include <locale.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifdef _POSIX_CPUTIME
+static clockid_t cl;
+static int use_clock;
+#endif
+
+static int
+do_test (void)
+{
+#ifdef _POSIX_CPUTIME
+ /* See whether we can use the CPU clock. */
+ use_clock = clock_getcpuclockid (0, &cl) == 0;
+#endif
+
+ static const char *pat[] = {
+ ".?.?.?.?.?.?.?Log\\.13",
+ "(.?)(.?)(.?)(.?)(.?)(.?)(.?)Log\\.13",
+ "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
+ "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
+ "((((((((((.?))))))))))Log\\.13" };
+
+ int fd = open ("../ChangeLog.14", O_RDONLY);
+ if (fd < 0)
+ {
+ printf ("Couldn't open ChangeLog.14: %m\n");
+ return 1;
+ }
+
+ struct stat64 st;
+ if (fstat64 (fd, &st) < 0)
+ {
+ printf ("Couldn't fstat ChangeLog.14: %m\n");
+ return 1;
+ }
+
+ char *buf = malloc (st.st_size + 1);
+ if (buf == NULL)
+ {
+ printf ("Couldn't allocate buffer: %m\n");
+ return 1;
+ }
+
+ if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
+ {
+ puts ("Couldn't read ChangeLog.14");
+ return 1;
+ }
+
+ close (fd);
+ buf[st.st_size] = '\0';
+
+ setlocale (LC_ALL, "de_DE.UTF-8");
+
+ char *string = buf;
+ size_t len = st.st_size;
+
+#ifndef WHOLE_FILE_TIMING
+ /* Don't search the whole file normally, it takes too long. */
+ if (len > 500000 + 64)
+ {
+ string += 500000;
+ len -= 500000;
+ }
+#endif
+
+ for (int testno = 0; testno < 4; ++testno)
+ for (int i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
+ {
+ printf ("test %d pattern %d", testno, i);
+
+ regex_t rbuf;
+ struct re_pattern_buffer rpbuf;
+ int err;
+ if (testno < 2)
+ {
+ err = regcomp (&rbuf, pat[i],
+ REG_EXTENDED | (testno ? REG_NOSUB : 0));
+ if (err != 0)
+ {
+ putchar ('\n');
+ char errstr[300];
+ regerror (err, &rbuf, errstr, sizeof (errstr));
+ puts (errstr);
+ return err;
+ }
+ }
+ else
+ {
+ re_set_syntax (RE_SYNTAX_POSIX_EGREP
+ | (testno == 3 ? RE_NO_SUB : 0));
+
+ memset (&rpbuf, 0, sizeof (rpbuf));
+ const char *s = re_compile_pattern (pat[i], strlen (pat[i]),
+ &rpbuf);
+ if (s != NULL)
+ {
+ printf ("\n%s\n", s);
+ return 1;
+ }
+
+ /* Just so that this can be tested with earlier glibc as well. */
+ if (testno == 3)
+ rpbuf.no_sub = 1;
+ }
+
+#ifdef _POSIX_CPUTIME
+ struct timespec start, stop;
+ if (use_clock)
+ use_clock = clock_gettime (cl, &start) == 0;
+#endif
+
+ if (testno < 2)
+ {
+ regmatch_t pmatch[71];
+ err = regexec (&rbuf, string, 71, pmatch, 0);
+ if (err == REG_NOMATCH)
+ {
+ puts ("\nregexec failed");
+ return 1;
+ }
+
+ if (testno == 0)
+ {
+ if (pmatch[0].rm_eo != pmatch[0].rm_so + 13
+ || pmatch[0].rm_eo > len
+ || pmatch[0].rm_so < len - 100
+ || strncmp (string + pmatch[0].rm_so,
+ " ChangeLog.13 for earlier changes",
+ sizeof " ChangeLog.13 for earlier changes" - 1)
+ != 0)
+ {
+ puts ("\nregexec without REG_NOSUB did not find the correct match");
+ return 1;
+ }
+
+ if (i > 0)
+ for (int j = 0, l = 1; j < 7; ++j)
+ for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
+ if (pmatch[l].rm_so != pmatch[0].rm_so + j
+ || pmatch[l].rm_eo != pmatch[l].rm_so + 1)
+ {
+ printf ("\npmatch[%d] incorrect\n", l);
+ return 1;
+ }
+ }
+ }
+ else
+ {
+ struct re_registers regs;
+
+ memset (&regs, 0, sizeof (regs));
+ int match = re_search (&rpbuf, string, len, 0, len,
+ &regs);
+ if (match < 0)
+ {
+ puts ("\nre_search failed");
+ return 1;
+ }
+
+ if (match + 13 > len
+ || match < len - 100
+ || strncmp (string + match,
+ " ChangeLog.13 for earlier changes",
+ sizeof " ChangeLog.13 for earlier changes" - 1)
+ != 0)
+ {
+ puts ("\nre_search did not find the correct match");
+ return 1;
+ }
+
+ if (testno == 2)
+ {
+ if (regs.num_regs != 2 + (i == 0 ? 0 : i == 1 ? 7 : 70))
+ {
+ printf ("\nincorrect num_regs %d\n", regs.num_regs);
+ return 1;
+ }
+
+ if (regs.start[0] != match || regs.end[0] != match + 13)
+ {
+ printf ("\nincorrect regs.{start,end}[0] = { %d, %d}\n",
+ regs.start[0], regs.end[0]);
+ return 1;
+ }
+
+ if (regs.start[regs.num_regs - 1] != -1
+ || regs.end[regs.num_regs - 1] != -1)
+ {
+ puts ("\nincorrect regs.{start,end}[num_regs - 1]");
+ return 1;
+ }
+
+ if (i > 0)
+ for (int j = 0, l = 1; j < 7; ++j)
+ for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
+ if (regs.start[l] != match + j
+ || regs.end[l] != regs.start[l] + 1)
+ {
+ printf ("\nregs.{start,end}[%d] incorrect\n", l);
+ return 1;
+ }
+ }
+ }
+
+#ifdef _POSIX_CPUTIME
+ if (use_clock)
+ use_clock = clock_gettime (cl, &stop) == 0;
+ if (use_clock)
+ {
+ stop.tv_sec -= start.tv_sec;
+ if (stop.tv_nsec < start.tv_nsec)
+ {
+ stop.tv_sec--;
+ stop.tv_nsec += 1000000000 - start.tv_nsec;
+ }
+ else
+ stop.tv_nsec -= start.tv_nsec;
+ printf (": %ld.%09lds\n", (long) stop.tv_sec, (long) stop.tv_nsec);
+ }
+ else
+#endif
+ putchar ('\n');
+
+ if (testno < 2)
+ regfree (&rbuf);
+ else
+ regfree (&rpbuf);
+ }
+
+ return 0;
+}
+
+#define TIMEOUT 20
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"