summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhilip Withnall <pwithnall@endlessos.org>2020-11-14 14:08:30 +0000
committerPhilip Withnall <pwithnall@endlessos.org>2021-06-09 14:39:20 +0100
commitb052620398237ce74b0876a1baded6a5c39412ad (patch)
tree8561ad57e7b78351ebc3e94e0e0997c17121d264
parent601ef3b6be457a6b0c15ab3a341a0e51f1d02ffd (diff)
downloadglib-b052620398237ce74b0876a1baded6a5c39412ad.tar.gz
gregex: Fix return from g_match_info_fetch() for unmatched subpatterns
If there were more subpatterns in the regex than matches (which can happen if one or more of the subpatterns are optional), `g_match_info_fetch()` was erroneously returning `NULL` rather than the empty string. It should only return `NULL` when the `match_num` specifies a subpattern which doesn’t exist in the regex. This is complicated slightly by the fact that when using `g_regex_match_all()`, more matches can be returned than there are subpatterns, due to one or more subpatterns matching multiple times at different offsets in the string. This includes a fix for a unit test which was erroneously checking the broken behaviour. Thanks to Allison Karlitskaya for the minimal reproducer. Signed-off-by: Philip Withnall <pwithnall@endlessos.org> Fixes: #229
-rw-r--r--glib/gregex.c30
-rw-r--r--glib/tests/regex.c5
2 files changed, 25 insertions, 10 deletions
diff --git a/glib/gregex.c b/glib/gregex.c
index 5e6ddfb46..f48138ef9 100644
--- a/glib/gregex.c
+++ b/glib/gregex.c
@@ -35,6 +35,7 @@
#include "gmessages.h"
#include "gstrfuncs.h"
#include "gatomic.h"
+#include "gtestutils.h"
#include "gthread.h"
/**
@@ -206,7 +207,8 @@ struct _GMatchInfo
gint ref_count; /* the ref count (atomic) */
GRegex *regex; /* the regex */
GRegexMatchFlags match_opts; /* options used at match time on the regex */
- gint matches; /* number of matching sub patterns */
+ gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
+ gint n_subpatterns; /* total number of sub patterns in the regex */
gint pos; /* position in the string where last match left off */
gint n_offsets; /* number of offsets */
gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
@@ -574,6 +576,9 @@ match_info_new (const GRegex *regex,
match_info->pos = start_position;
match_info->match_opts = match_options;
+ pcre_fullinfo (regex->pcre_re, regex->extra,
+ PCRE_INFO_CAPTURECOUNT, &match_info->n_subpatterns);
+
if (is_dfa)
{
/* These values should be enough for most cases, if they are not
@@ -584,10 +589,7 @@ match_info_new (const GRegex *regex,
}
else
{
- gint capture_count;
- pcre_fullinfo (regex->pcre_re, regex->extra,
- PCRE_INFO_CAPTURECOUNT, &capture_count);
- match_info->n_offsets = (capture_count + 1) * 3;
+ match_info->n_offsets = (match_info->n_subpatterns + 1) * 3;
}
match_info->offsets = g_new0 (gint, match_info->n_offsets);
@@ -768,6 +770,8 @@ g_match_info_next (GMatchInfo *match_info,
match_info->pos = match_info->offsets[1];
}
+ g_assert (match_info->matches <= match_info->n_subpatterns + 1);
+
/* it's possible to get two identical matches when we are matching
* empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
* the string is "RegExTest" we have:
@@ -1044,16 +1048,21 @@ g_match_info_fetch_pos (const GMatchInfo *match_info,
g_return_val_if_fail (match_info != NULL, FALSE);
g_return_val_if_fail (match_num >= 0, FALSE);
+ /* check whether there was an error */
+ if (match_info->matches < 0)
+ return FALSE;
+
/* make sure the sub expression number they're requesting is less than
- * the total number of sub expressions that were matched. */
- if (match_num >= match_info->matches)
+ * the total number of sub expressions in the regex. When matching all
+ * (g_regex_match_all()), also compare against the number of matches */
+ if (match_num >= MAX (match_info->n_subpatterns + 1, match_info->matches))
return FALSE;
if (start_pos != NULL)
- *start_pos = match_info->offsets[2 * match_num];
+ *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1;
if (end_pos != NULL)
- *end_pos = match_info->offsets[2 * match_num + 1];
+ *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1;
return TRUE;
}
@@ -1989,6 +1998,9 @@ g_regex_match_all_full (const GRegex *regex,
pcre_free (pcre_re);
#endif
+ /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only
+ * holds true for a single match, rather than matching all */
+
/* set info->pos to -1 so that a call to g_match_info_next() fails. */
info->pos = -1;
retval = info->matches >= 0;
diff --git a/glib/tests/regex.c b/glib/tests/regex.c
index 60ab2f9df..6b76f761f 100644
--- a/glib/tests/regex.c
+++ b/glib/tests/regex.c
@@ -2557,13 +2557,16 @@ main (int argc, char *argv[])
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 0, "b", 0, 1);
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 1, "", -1, -1);
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 2, "b", 0, 1);
+ TEST_SUB_PATTERN("(a)?b", "b", 0, 0, "b", 0, 1);
+ TEST_SUB_PATTERN("(a)?b", "b", 0, 1, "", -1, -1);
+ TEST_SUB_PATTERN("(a)?b", "b", 0, 2, NULL, UNTOUCHED, UNTOUCHED);
/* TEST_NAMED_SUB_PATTERN(pattern, string, start_position, sub_name,
* expected_sub, expected_start, expected_end) */
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "ab", 0, "A", "b", 1, 2);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "aab", 1, "A", "b", 2, 3);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "A", "b", 4, 5);
- TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "B", NULL, UNTOUCHED, UNTOUCHED);
+ TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "B", "", -1, -1);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "C", NULL, UNTOUCHED, UNTOUCHED);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "a" EGRAVE "x", 0, "A", EGRAVE, 1, 3);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "a" EGRAVE "x", 0, "B", "x", 3, 4);