diff options
author | Marco Trevisan <mail@3v1n0.net> | 2022-09-12 21:45:57 +0000 |
---|---|---|
committer | Marco Trevisan <mail@3v1n0.net> | 2022-09-12 21:45:57 +0000 |
commit | 0d823aa926286b5518bfca1ff125e3114ba9e2c1 (patch) | |
tree | ee1f00fb40c094bc72d91017c26b7dcf13e489e0 | |
parent | a2a0441189534e26c5ca99b0237f794f594ef213 (diff) | |
parent | 653f8eb0203485c7ffb0eeae81e6e30437d18529 (diff) | |
download | glib-0d823aa926286b5518bfca1ff125e3114ba9e2c1.tar.gz |
Merge branch 'wip/3v1n0/regex-pcre2-flags-fixes' into 'main'
GRegex flags fixes and cleanups
Closes gtksourceview#283, #2741, #2729, #2688 and gtksourceview#278
See merge request GNOME/glib!2878
-rw-r--r-- | glib/gregex.c | 780 | ||||
-rw-r--r-- | glib/tests/regex.c | 359 |
2 files changed, 755 insertions, 384 deletions
diff --git a/glib/gregex.c b/glib/gregex.c index 08c43ef4b..220a1a11a 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -3,6 +3,7 @@ * Copyright (C) 1999, 2000 Scott Wimer * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> + * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> * * SPDX-License-Identifier: LGPL-2.1-or-later * @@ -22,6 +23,7 @@ #include "config.h" +#include <stdint.h> #include <string.h> #define PCRE2_CODE_UNIT_WIDTH 8 @@ -110,62 +112,112 @@ * library written by Philip Hazel. */ -/* Signifies that flags have already been converted from pcre1 to pcre2. The - * value 0x04000000u is also the value of PCRE2_MATCH_INVALID_UTF in pcre2.h, - * but it is not used in gregex, so we can reuse it for this flag. - */ -#define G_REGEX_FLAGS_CONVERTED 0x04000000u +#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \ + PCRE2_NO_UTF_CHECK | \ + PCRE2_ENDANCHORED) + /* Mask of all the possible values for GRegexCompileFlags. 
*/ -#define G_REGEX_COMPILE_MASK (PCRE2_CASELESS | \ - PCRE2_MULTILINE | \ - PCRE2_DOTALL | \ - PCRE2_EXTENDED | \ - PCRE2_ANCHORED | \ - PCRE2_DOLLAR_ENDONLY | \ - PCRE2_UNGREEDY | \ - PCRE2_UTF | \ - PCRE2_NO_AUTO_CAPTURE | \ - PCRE2_FIRSTLINE | \ - PCRE2_DUPNAMES | \ - PCRE2_NEWLINE_CR | \ - PCRE2_NEWLINE_LF | \ - PCRE2_NEWLINE_CRLF | \ - PCRE2_NEWLINE_ANYCRLF | \ - PCRE2_BSR_ANYCRLF | \ - G_REGEX_FLAGS_CONVERTED) - -/* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */ -#define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK) -#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF | \ - G_REGEX_FLAGS_CONVERTED) +#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \ + G_REGEX_CASELESS | \ + G_REGEX_MULTILINE | \ + G_REGEX_DOTALL | \ + G_REGEX_EXTENDED | \ + G_REGEX_ANCHORED | \ + G_REGEX_DOLLAR_ENDONLY | \ + G_REGEX_UNGREEDY | \ + G_REGEX_RAW | \ + G_REGEX_NO_AUTO_CAPTURE | \ + G_REGEX_OPTIMIZE | \ + G_REGEX_FIRSTLINE | \ + G_REGEX_DUPNAMES | \ + G_REGEX_NEWLINE_CR | \ + G_REGEX_NEWLINE_LF | \ + G_REGEX_NEWLINE_CRLF | \ + G_REGEX_NEWLINE_ANYCRLF | \ + G_REGEX_BSR_ANYCRLF) + +#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \ + PCRE2_ALT_BSUX | \ + PCRE2_AUTO_CALLOUT | \ + PCRE2_CASELESS | \ + PCRE2_DOLLAR_ENDONLY | \ + PCRE2_DOTALL | \ + PCRE2_DUPNAMES | \ + PCRE2_EXTENDED | \ + PCRE2_FIRSTLINE | \ + PCRE2_MATCH_UNSET_BACKREF | \ + PCRE2_MULTILINE | \ + PCRE2_NEVER_UCP | \ + PCRE2_NEVER_UTF | \ + PCRE2_NO_AUTO_CAPTURE | \ + PCRE2_NO_AUTO_POSSESS | \ + PCRE2_NO_DOTSTAR_ANCHOR | \ + PCRE2_NO_START_OPTIMIZE | \ + PCRE2_UCP | \ + PCRE2_UNGREEDY | \ + PCRE2_UTF | \ + PCRE2_NEVER_BACKSLASH_C | \ + PCRE2_ALT_CIRCUMFLEX | \ + PCRE2_ALT_VERBNAMES | \ + PCRE2_USE_OFFSET_LIMIT | \ + PCRE2_EXTENDED_MORE | \ + PCRE2_LITERAL | \ + PCRE2_MATCH_INVALID_UTF | \ + G_REGEX_PCRE_GENERIC_MASK) + +#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF) /* Mask of all the possible values for GRegexMatchFlags. 
*/ -#define G_REGEX_MATCH_MASK (PCRE2_ANCHORED | \ - PCRE2_NOTBOL | \ - PCRE2_NOTEOL | \ - PCRE2_NOTEMPTY | \ - PCRE2_NEWLINE_CR | \ - PCRE2_NEWLINE_LF | \ - PCRE2_NEWLINE_CRLF | \ - PCRE2_NEWLINE_ANY | \ - PCRE2_NEWLINE_ANYCRLF | \ - PCRE2_BSR_ANYCRLF | \ - PCRE2_BSR_UNICODE | \ - PCRE2_PARTIAL_SOFT | \ - PCRE2_PARTIAL_HARD | \ - PCRE2_NOTEMPTY_ATSTART | \ - G_REGEX_FLAGS_CONVERTED) - +#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \ + G_REGEX_MATCH_ANCHORED | \ + G_REGEX_MATCH_NOTBOL | \ + G_REGEX_MATCH_NOTEOL | \ + G_REGEX_MATCH_NOTEMPTY | \ + G_REGEX_MATCH_PARTIAL | \ + G_REGEX_MATCH_NEWLINE_CR | \ + G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY | \ + G_REGEX_MATCH_NEWLINE_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANY | \ + G_REGEX_MATCH_PARTIAL_SOFT | \ + G_REGEX_MATCH_PARTIAL_HARD | \ + G_REGEX_MATCH_NOTEMPTY_ATSTART) + +#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\ + PCRE2_NOTEOL |\ + PCRE2_NOTEMPTY |\ + PCRE2_NOTEMPTY_ATSTART |\ + PCRE2_PARTIAL_SOFT |\ + PCRE2_PARTIAL_HARD |\ + PCRE2_NO_JIT |\ + PCRE2_COPY_MATCHED_SUBJECT |\ + G_REGEX_PCRE_GENERIC_MASK) + +/* TODO: Support PCRE2_NEWLINE_NUL */ #define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ PCRE2_NEWLINE_LF | \ PCRE2_NEWLINE_CRLF | \ PCRE2_NEWLINE_ANYCRLF) -#define G_REGEX_MATCH_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ - PCRE2_NEWLINE_LF | \ - PCRE2_NEWLINE_CRLF | \ - PCRE2_NEWLINE_ANYCRLF | \ - PCRE2_NEWLINE_ANY) +/* Some match options are not supported when using JIT as stated in the + * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section: + * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5 + */ +#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \ + PCRE2_ENDANCHORED) + +#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \ + G_REGEX_NEWLINE_LF | \ + G_REGEX_NEWLINE_CRLF | \ + G_REGEX_NEWLINE_ANYCRLF) + +#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \ + 
G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY | \ + G_REGEX_MATCH_NEWLINE_ANYCRLF) /* if the string is in UTF-8 use g_utf8_ functions, else use * use just +/- 1. */ @@ -180,14 +232,14 @@ struct _GMatchInfo { gint ref_count; /* the ref count (atomic) */ GRegex *regex; /* the regex */ - GRegexMatchFlags match_opts; /* options used at match time on the regex */ + uint32_t match_opts; /* pcre match options used at match time on the regex */ gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */ - gint n_subpatterns; /* total number of sub patterns in the regex */ + uint32_t n_subpatterns; /* total number of sub patterns in the regex */ gint pos; /* position in the string where last match left off */ - gint n_offsets; /* number of offsets */ + uint32_t n_offsets; /* number of offsets */ gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */ gint *workspace; /* workspace for pcre2_dfa_match() */ - gint n_workspace; /* number of workspace elements */ + PCRE2_SIZE n_workspace; /* number of workspace elements */ const gchar *string; /* string passed to the match function */ gssize string_len; /* length of string, in bytes */ pcre2_match_context *match_context; @@ -206,10 +258,11 @@ struct _GRegex gint ref_count; /* the ref count for the immutable part (atomic) */ gchar *pattern; /* the pattern */ pcre2_code *pcre_re; /* compiled form of the pattern */ - GRegexCompileFlags compile_opts; /* options used at compile time on the pattern, pcre2 values */ + uint32_t compile_opts; /* options used at compile time on the pattern, pcre2 values */ GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */ - GRegexMatchFlags match_opts; /* options used at match time on the regex */ - gint jit_options; /* options which were enabled for jit compiler */ + uint32_t match_opts; /* pcre2 options used at match 
time on the regex */ + GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */ + uint32_t jit_options; /* options which were enabled for jit compiler */ JITStatus jit_status; /* indicates the status of jit compiler for this compiled regex */ }; @@ -225,197 +278,182 @@ static GList *split_replacement (const gchar *replacement, GError **error); static void free_interpolation_data (InterpolationData *data); -static gint -map_to_pcre2_compile_flags (gint pcre1_flags) +static uint32_t +get_pcre2_compile_options (GRegexCompileFlags compile_flags) { - /* Maps compile flags from pcre1 to pcre2 values - */ - gint pcre2_flags = G_REGEX_FLAGS_CONVERTED; + /* Maps compile flags to pcre2 values */ + uint32_t pcre2_flags = 0; - if (pcre1_flags & G_REGEX_FLAGS_CONVERTED) - return pcre1_flags; - - if (pcre1_flags & G_REGEX_CASELESS) + if (compile_flags & G_REGEX_CASELESS) pcre2_flags |= PCRE2_CASELESS; - if (pcre1_flags & G_REGEX_MULTILINE) + if (compile_flags & G_REGEX_MULTILINE) pcre2_flags |= PCRE2_MULTILINE; - if (pcre1_flags & G_REGEX_DOTALL) + if (compile_flags & G_REGEX_DOTALL) pcre2_flags |= PCRE2_DOTALL; - if (pcre1_flags & G_REGEX_EXTENDED) + if (compile_flags & G_REGEX_EXTENDED) pcre2_flags |= PCRE2_EXTENDED; - if (pcre1_flags & G_REGEX_ANCHORED) + if (compile_flags & G_REGEX_ANCHORED) pcre2_flags |= PCRE2_ANCHORED; - if (pcre1_flags & G_REGEX_DOLLAR_ENDONLY) + if (compile_flags & G_REGEX_DOLLAR_ENDONLY) pcre2_flags |= PCRE2_DOLLAR_ENDONLY; - if (pcre1_flags & G_REGEX_UNGREEDY) + if (compile_flags & G_REGEX_UNGREEDY) pcre2_flags |= PCRE2_UNGREEDY; - if (!(pcre1_flags & G_REGEX_RAW)) + if (!(compile_flags & G_REGEX_RAW)) pcre2_flags |= PCRE2_UTF; - if (pcre1_flags & G_REGEX_NO_AUTO_CAPTURE) + if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; - if (pcre1_flags & G_REGEX_FIRSTLINE) + if (compile_flags & G_REGEX_FIRSTLINE) pcre2_flags |= PCRE2_FIRSTLINE; - if (pcre1_flags & G_REGEX_DUPNAMES) + if 
(compile_flags & G_REGEX_DUPNAMES) pcre2_flags |= PCRE2_DUPNAMES; - if (pcre1_flags & G_REGEX_NEWLINE_CR) - pcre2_flags |= PCRE2_NEWLINE_CR; - if (pcre1_flags & G_REGEX_NEWLINE_LF) - pcre2_flags |= PCRE2_NEWLINE_LF; - /* Check for exact match for a composite flag */ - if ((pcre1_flags & G_REGEX_NEWLINE_CRLF) == G_REGEX_NEWLINE_CRLF) - pcre2_flags |= PCRE2_NEWLINE_CRLF; - /* Check for exact match for a composite flag */ - if ((pcre1_flags & G_REGEX_NEWLINE_ANYCRLF) == G_REGEX_NEWLINE_ANYCRLF) - pcre2_flags |= PCRE2_NEWLINE_ANYCRLF; - if (pcre1_flags & G_REGEX_BSR_ANYCRLF) - pcre2_flags |= PCRE2_BSR_ANYCRLF; - - /* these are not available in pcre2, but we use G_REGEX_OPTIMIZE as a special - * case to request JIT compilation */ - if (pcre1_flags & G_REGEX_OPTIMIZE) - pcre2_flags |= 0; -G_GNUC_BEGIN_IGNORE_DEPRECATIONS - if (pcre1_flags & G_REGEX_JAVASCRIPT_COMPAT) - pcre2_flags |= 0; -G_GNUC_END_IGNORE_DEPRECATIONS - - return pcre2_flags; + + return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; } -static gint -map_to_pcre2_match_flags (gint pcre1_flags) +static uint32_t +get_pcre2_match_options (GRegexMatchFlags match_flags, + GRegexCompileFlags compile_flags) { - /* Maps match flags from pcre1 to pcre2 values - */ - gint pcre2_flags = G_REGEX_FLAGS_CONVERTED; - - if (pcre1_flags & G_REGEX_FLAGS_CONVERTED) - return pcre1_flags; + /* Maps match flags to pcre2 values */ + uint32_t pcre2_flags = 0; - if (pcre1_flags & G_REGEX_MATCH_ANCHORED) + if (match_flags & G_REGEX_MATCH_ANCHORED) pcre2_flags |= PCRE2_ANCHORED; - if (pcre1_flags & G_REGEX_MATCH_NOTBOL) + if (match_flags & G_REGEX_MATCH_NOTBOL) pcre2_flags |= PCRE2_NOTBOL; - if (pcre1_flags & G_REGEX_MATCH_NOTEOL) + if (match_flags & G_REGEX_MATCH_NOTEOL) pcre2_flags |= PCRE2_NOTEOL; - if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY) + if (match_flags & G_REGEX_MATCH_NOTEMPTY) pcre2_flags |= PCRE2_NOTEMPTY; - if (pcre1_flags & G_REGEX_MATCH_NEWLINE_CR) - pcre2_flags |= PCRE2_NEWLINE_CR; - if (pcre1_flags & 
G_REGEX_MATCH_NEWLINE_LF) - pcre2_flags |= PCRE2_NEWLINE_LF; - /* Check for exact match for a composite flag */ - if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_CRLF) == G_REGEX_MATCH_NEWLINE_CRLF) - pcre2_flags |= PCRE2_NEWLINE_CRLF; - if (pcre1_flags & G_REGEX_MATCH_NEWLINE_ANY) - pcre2_flags |= PCRE2_NEWLINE_ANY; - /* Check for exact match for a composite flag */ - if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_ANYCRLF) == G_REGEX_MATCH_NEWLINE_ANYCRLF) - pcre2_flags |= PCRE2_NEWLINE_ANYCRLF; - if (pcre1_flags & G_REGEX_MATCH_BSR_ANYCRLF) - pcre2_flags |= PCRE2_BSR_ANYCRLF; - if (pcre1_flags & G_REGEX_MATCH_BSR_ANY) - pcre2_flags |= PCRE2_BSR_UNICODE; - if (pcre1_flags & G_REGEX_MATCH_PARTIAL_SOFT) + if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) pcre2_flags |= PCRE2_PARTIAL_SOFT; - if (pcre1_flags & G_REGEX_MATCH_PARTIAL_HARD) + if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) pcre2_flags |= PCRE2_PARTIAL_HARD; - if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) + if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; - return pcre2_flags; + if (compile_flags & G_REGEX_RAW) + pcre2_flags |= PCRE2_NO_UTF_CHECK; + + return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; } -static gint -map_to_pcre1_compile_flags (gint pcre2_flags) +static GRegexCompileFlags +g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) { - /* Maps compile flags from pcre2 to pcre1 values - */ - gint pcre1_flags = 0; - - if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED)) - return pcre2_flags; + GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; if (pcre2_flags & PCRE2_CASELESS) - pcre1_flags |= G_REGEX_CASELESS; + compile_flags |= G_REGEX_CASELESS; if (pcre2_flags & PCRE2_MULTILINE) - pcre1_flags |= G_REGEX_MULTILINE; + compile_flags |= G_REGEX_MULTILINE; if (pcre2_flags & PCRE2_DOTALL) - pcre1_flags |= G_REGEX_DOTALL; + compile_flags |= G_REGEX_DOTALL; if (pcre2_flags & PCRE2_EXTENDED) - pcre1_flags |= G_REGEX_EXTENDED; + compile_flags |= G_REGEX_EXTENDED; if (pcre2_flags & 
PCRE2_ANCHORED) - pcre1_flags |= G_REGEX_ANCHORED; + compile_flags |= G_REGEX_ANCHORED; if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) - pcre1_flags |= G_REGEX_DOLLAR_ENDONLY; + compile_flags |= G_REGEX_DOLLAR_ENDONLY; if (pcre2_flags & PCRE2_UNGREEDY) - pcre1_flags |= G_REGEX_UNGREEDY; + compile_flags |= G_REGEX_UNGREEDY; if (!(pcre2_flags & PCRE2_UTF)) - pcre1_flags |= G_REGEX_RAW; + compile_flags |= G_REGEX_RAW; if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) - pcre1_flags |= G_REGEX_NO_AUTO_CAPTURE; + compile_flags |= G_REGEX_NO_AUTO_CAPTURE; if (pcre2_flags & PCRE2_FIRSTLINE) - pcre1_flags |= G_REGEX_FIRSTLINE; + compile_flags |= G_REGEX_FIRSTLINE; if (pcre2_flags & PCRE2_DUPNAMES) - pcre1_flags |= G_REGEX_DUPNAMES; - if (pcre2_flags & PCRE2_NEWLINE_CR) - pcre1_flags |= G_REGEX_NEWLINE_CR; - if (pcre2_flags & PCRE2_NEWLINE_LF) - pcre1_flags |= G_REGEX_NEWLINE_LF; - /* Check for exact match for a composite flag */ - if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF) - pcre1_flags |= G_REGEX_NEWLINE_CRLF; - /* Check for exact match for a composite flag */ - if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF) - pcre1_flags |= G_REGEX_NEWLINE_ANYCRLF; - if (pcre2_flags & PCRE2_BSR_ANYCRLF) - pcre1_flags |= G_REGEX_BSR_ANYCRLF; - - return pcre1_flags; + compile_flags |= G_REGEX_DUPNAMES; + + return compile_flags & G_REGEX_COMPILE_MASK; } -static gint -map_to_pcre1_match_flags (gint pcre2_flags) +static GRegexMatchFlags +g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) { - /* Maps match flags from pcre2 to pcre1 values - */ - gint pcre1_flags = 0; - - if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED)) - return pcre2_flags; + GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; if (pcre2_flags & PCRE2_ANCHORED) - pcre1_flags |= G_REGEX_MATCH_ANCHORED; + match_flags |= G_REGEX_MATCH_ANCHORED; if (pcre2_flags & PCRE2_NOTBOL) - pcre1_flags |= G_REGEX_MATCH_NOTBOL; + match_flags |= G_REGEX_MATCH_NOTBOL; if (pcre2_flags & PCRE2_NOTEOL) - pcre1_flags |= 
G_REGEX_MATCH_NOTEOL; + match_flags |= G_REGEX_MATCH_NOTEOL; if (pcre2_flags & PCRE2_NOTEMPTY) - pcre1_flags |= G_REGEX_MATCH_NOTEMPTY; - if (pcre2_flags & PCRE2_NEWLINE_CR) - pcre1_flags |= G_REGEX_MATCH_NEWLINE_CR; - if (pcre2_flags & PCRE2_NEWLINE_LF) - pcre1_flags |= G_REGEX_MATCH_NEWLINE_LF; - /* Check for exact match for a composite flag */ - if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF) - pcre1_flags |= G_REGEX_MATCH_NEWLINE_CRLF; - if (pcre2_flags & PCRE2_NEWLINE_ANY) - pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANY; - /* Check for exact match for a composite flag */ - if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF) - pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANYCRLF; - if (pcre2_flags & PCRE2_BSR_ANYCRLF) - pcre1_flags |= G_REGEX_MATCH_BSR_ANYCRLF; - if (pcre2_flags & PCRE2_BSR_UNICODE) - pcre1_flags |= G_REGEX_MATCH_BSR_ANY; + match_flags |= G_REGEX_MATCH_NOTEMPTY; if (pcre2_flags & PCRE2_PARTIAL_SOFT) - pcre1_flags |= G_REGEX_MATCH_PARTIAL_SOFT; + match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; if (pcre2_flags & PCRE2_PARTIAL_HARD) - pcre1_flags |= G_REGEX_MATCH_PARTIAL_HARD; + match_flags |= G_REGEX_MATCH_PARTIAL_HARD; if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) - pcre1_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; + match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; + + return (match_flags & G_REGEX_MATCH_MASK); +} + +static uint32_t +get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) +{ + compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; + + switch (compile_flags) + { + case G_REGEX_NEWLINE_CR: + return PCRE2_NEWLINE_CR; + case G_REGEX_NEWLINE_LF: + return PCRE2_NEWLINE_LF; + case G_REGEX_NEWLINE_CRLF: + return PCRE2_NEWLINE_CRLF; + case G_REGEX_NEWLINE_ANYCRLF: + return PCRE2_NEWLINE_ANYCRLF; + default: + if (compile_flags != 0) + return 0; + + return PCRE2_NEWLINE_ANY; + } +} + +static uint32_t +get_pcre2_newline_match_options (GRegexMatchFlags match_flags) +{ + switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) + { + case 
G_REGEX_MATCH_NEWLINE_CR: + return PCRE2_NEWLINE_CR; + case G_REGEX_MATCH_NEWLINE_LF: + return PCRE2_NEWLINE_LF; + case G_REGEX_MATCH_NEWLINE_CRLF: + return PCRE2_NEWLINE_CRLF; + case G_REGEX_MATCH_NEWLINE_ANY: + return PCRE2_NEWLINE_ANY; + case G_REGEX_MATCH_NEWLINE_ANYCRLF: + return PCRE2_NEWLINE_ANYCRLF; + default: + return 0; + } +} - return pcre1_flags; +static uint32_t +get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) +{ + if (compile_flags & G_REGEX_BSR_ANYCRLF) + return PCRE2_BSR_ANYCRLF; + + return PCRE2_BSR_UNICODE; +} + +static uint32_t +get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) +{ + if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) + return PCRE2_BSR_ANYCRLF; + + if (match_flags & G_REGEX_MATCH_BSR_ANY) + return PCRE2_BSR_UNICODE; + + return 0; } static const gchar * @@ -440,6 +478,7 @@ match_error (gint errcode) /* not used by pcre2_match() */ break; case PCRE2_ERROR_MATCHLIMIT: + case PCRE2_ERROR_JIT_STACKLIMIT: return _("backtracking limit reached"); case PCRE2_ERROR_CALLOUT: /* callouts are not implemented */ @@ -744,12 +783,12 @@ translate_compile_error (gint *errcode, const gchar **errmsg) /* GMatchInfo */ static GMatchInfo * -match_info_new (const GRegex *regex, - const gchar *string, - gint string_len, - gint start_position, - gint match_options, - gboolean is_dfa) +match_info_new (const GRegex *regex, + const gchar *string, + gint string_len, + gint start_position, + GRegexMatchFlags match_options, + gboolean is_dfa) { GMatchInfo *match_info; @@ -763,7 +802,8 @@ match_info_new (const GRegex *regex, match_info->string_len = string_len; match_info->matches = PCRE2_ERROR_NOMATCH; match_info->pos = start_position; - match_info->match_opts = match_options; + match_info->match_opts = + get_pcre2_match_options (match_options, regex->orig_compile_opts); pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &match_info->n_subpatterns); @@ -774,15 +814,11 @@ match_info_new (const GRegex *regex, { /* These values should 
be enough for most cases, if they are not * enough g_regex_match_all_full() will expand them. */ - match_info->n_offsets = 24; match_info->n_workspace = 100; match_info->workspace = g_new (gint, match_info->n_workspace); } - else - { - match_info->n_offsets = (match_info->n_subpatterns + 1) * 3; - } + match_info->n_offsets = 2; match_info->offsets = g_new0 (gint, match_info->n_offsets); /* Set an invalid position for the previous match. */ match_info->offsets[0] = -1; @@ -800,9 +836,20 @@ recalc_match_offsets (GMatchInfo *match_info, GError **error) { PCRE2_SIZE *ovector; - gint i; + uint32_t ovector_size = 0; + uint32_t pre_n_offset; + uint32_t i; + + g_assert (!IS_PCRE2_ERROR (match_info->matches)); + + if (match_info->matches == PCRE2_ERROR_PARTIAL) + ovector_size = 1; + else if (match_info->matches > 0) + ovector_size = match_info->matches; - if (pcre2_get_ovector_count (match_info->match_data) > G_MAXINT / 2) + g_assert (ovector_size != 0); + + if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) { g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, _("Error while matching regular expression %s: %s"), @@ -810,11 +857,17 @@ recalc_match_offsets (GMatchInfo *match_info, return FALSE; } - match_info->n_offsets = pcre2_get_ovector_count (match_info->match_data) * 2; + pre_n_offset = match_info->n_offsets; + match_info->n_offsets = ovector_size * 2; ovector = pcre2_get_ovector_pointer (match_info->match_data); - match_info->offsets = g_realloc_n (match_info->offsets, - match_info->n_offsets, - sizeof (gint)); + + if (match_info->n_offsets != pre_n_offset) + { + match_info->offsets = g_realloc_n (match_info->offsets, + match_info->n_offsets, + sizeof (gint)); + } + for (i = 0; i < match_info->n_offsets; i++) { match_info->offsets[i] = (int) ovector[i]; @@ -823,16 +876,21 @@ recalc_match_offsets (GMatchInfo *match_info, return TRUE; } -static void -enable_jit_with_match_options (GRegex *regex, - GRegexMatchFlags match_options) +static JITStatus 
+enable_jit_with_match_options (GRegex *regex, + uint32_t match_options) { - gint old_jit_options, new_jit_options, retval; + gint retval; + uint32_t old_jit_options, new_jit_options; if (!(regex->orig_compile_opts & G_REGEX_OPTIMIZE)) - return; + return JIT_STATUS_DISABLED; + if (regex->jit_status == JIT_STATUS_DISABLED) - return; + return JIT_STATUS_DISABLED; + + if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) + return JIT_STATUS_DISABLED; old_jit_options = regex->jit_options; new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; @@ -843,34 +901,34 @@ enable_jit_with_match_options (GRegex *regex, /* no new options enabled */ if (new_jit_options == old_jit_options) - return; + return regex->jit_status; retval = pcre2_jit_compile (regex->pcre_re, new_jit_options); switch (retval) { case 0: /* JIT enabled successfully */ - regex->jit_status = JIT_STATUS_ENABLED; regex->jit_options = new_jit_options; - break; + return JIT_STATUS_ENABLED; case PCRE2_ERROR_NOMEMORY: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " "but JIT was unable to allocate executable memory for the " "compiler. Falling back to interpretive code."); - regex->jit_status = JIT_STATUS_DISABLED; - break; + return JIT_STATUS_DISABLED; case PCRE2_ERROR_JIT_BADOPTION: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " "but JIT support is not available. Falling back to " "interpretive code."); - regex->jit_status = JIT_STATUS_DISABLED; + return JIT_STATUS_DISABLED; break; default: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " - "but request for JIT support had unexpectedly failed. " - "Falling back to interpretive code."); - regex->jit_status = JIT_STATUS_DISABLED; + "but request for JIT support had unexpectedly failed (error %d). " + "Falling back to interpretive code.", retval); + return JIT_STATUS_DISABLED; break; } + + return regex->jit_status; } /** @@ -881,7 +939,7 @@ enable_jit_with_match_options (GRegex *regex, * and must not be freed. 
Use g_regex_ref() if you need to keep it * after you free @match_info object. * - * Returns: #GRegex object used in @match_info + * Returns: (transfer none): #GRegex object used in @match_info * * Since: 2.14 */ @@ -992,9 +1050,10 @@ gboolean g_match_info_next (GMatchInfo *match_info, GError **error) { + JITStatus jit_status; gint prev_match_start; gint prev_match_end; - gint opts; + uint32_t opts; g_return_val_if_fail (match_info != NULL, FALSE); g_return_val_if_fail (error == NULL || *error == NULL, FALSE); @@ -1011,16 +1070,16 @@ g_match_info_next (GMatchInfo *match_info, return FALSE; } - opts = map_to_pcre2_match_flags (match_info->regex->match_opts | match_info->match_opts); + opts = match_info->regex->match_opts | match_info->match_opts; - enable_jit_with_match_options (match_info->regex, opts); - if (match_info->regex->jit_status == JIT_STATUS_ENABLED) + jit_status = enable_jit_with_match_options (match_info->regex, opts); + if (jit_status == JIT_STATUS_ENABLED) { match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, (PCRE2_SPTR8) match_info->string, match_info->string_len, match_info->pos, - opts & ~G_REGEX_FLAGS_CONVERTED, + opts, match_info->match_data, match_info->match_context); } @@ -1030,7 +1089,7 @@ g_match_info_next (GMatchInfo *match_info, (PCRE2_SPTR8) match_info->string, match_info->string_len, match_info->pos, - opts & ~G_REGEX_FLAGS_CONVERTED, + opts, match_info->match_data, match_info->match_context); } @@ -1042,6 +1101,25 @@ g_match_info_next (GMatchInfo *match_info, match_info->regex->pattern, match_error (match_info->matches)); return FALSE; } + else if (match_info->matches == 0) + { + /* info->offsets is too small. 
*/ + match_info->n_offsets *= 2; + match_info->offsets = g_realloc_n (match_info->offsets, + match_info->n_offsets, + sizeof (gint)); + + pcre2_match_data_free (match_info->match_data); + match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); + + return g_match_info_next (match_info, error); + } + else if (match_info->matches == PCRE2_ERROR_NOMATCH) + { + /* We're done with this match info */ + match_info->pos = -1; + return FALSE; + } else if (!recalc_match_offsets (match_info, error)) return FALSE; @@ -1067,7 +1145,8 @@ g_match_info_next (GMatchInfo *match_info, match_info->pos = match_info->offsets[1]; } - g_assert (match_info->matches <= match_info->n_subpatterns + 1); + g_assert (match_info->matches < 0 || + (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); /* it's possible to get two identical matches when we are matching * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and @@ -1350,7 +1429,7 @@ g_match_info_fetch_pos (const GMatchInfo *match_info, /* make sure the sub expression number they're requesting is less than * the total number of sub expressions in the regex. 
When matching all * (g_regex_match_all()), also compare against the number of matches */ - if (match_num >= MAX (match_info->n_subpatterns + 1, match_info->matches)) + if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) return FALSE; if (start_pos != NULL) @@ -1565,14 +1644,14 @@ g_regex_unref (GRegex *regex) } } -/* - * @match_options: (inout) (optional): - */ -static pcre2_code *regex_compile (const gchar *pattern, - GRegexCompileFlags compile_options, - GRegexCompileFlags *compile_options_out, - GRegexMatchFlags *match_options, - GError **error); +static pcre2_code * regex_compile (const gchar *pattern, + uint32_t compile_options, + uint32_t newline_options, + uint32_t bsr_options, + GError **error); + +static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, + uint32_t compile_options); /** * g_regex_new: @@ -1598,11 +1677,10 @@ g_regex_new (const gchar *pattern, GRegex *regex; pcre2_code *re; static gsize initialised = 0; - GRegexCompileFlags orig_compile_opts; - - orig_compile_opts = compile_options; - compile_options = map_to_pcre2_compile_flags (compile_options); - match_options = map_to_pcre2_match_flags (match_options); + uint32_t pcre_compile_options; + uint32_t pcre_match_options; + uint32_t newline_options; + uint32_t bsr_options; g_return_val_if_fail (pattern != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); @@ -1620,113 +1698,97 @@ g_regex_new (const gchar *pattern, g_once_init_leave (&initialised, supports_utf8 ? 
1 : 2); } - if (G_UNLIKELY (initialised != 1)) + if (G_UNLIKELY (initialised != 1)) { g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, _("PCRE library is compiled with incompatible options")); return NULL; } - switch (compile_options & G_REGEX_NEWLINE_MASK) + pcre_compile_options = get_pcre2_compile_options (compile_options); + pcre_match_options = get_pcre2_match_options (match_options, compile_options); + + newline_options = get_pcre2_newline_match_options (match_options); + if (newline_options == 0) + newline_options = get_pcre2_newline_compile_options (compile_options); + + if (newline_options == 0) { - case 0: /* PCRE2_NEWLINE_ANY */ - case PCRE2_NEWLINE_CR: - case PCRE2_NEWLINE_LF: - case PCRE2_NEWLINE_CRLF: - case PCRE2_NEWLINE_ANYCRLF: - break; - default: g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, "Invalid newline flags"); return NULL; } - re = regex_compile (pattern, compile_options, &compile_options, - &match_options, error); + bsr_options = get_pcre2_bsr_match_options (match_options); + if (!bsr_options) + bsr_options = get_pcre2_bsr_compile_options (compile_options); + + re = regex_compile (pattern, pcre_compile_options, + newline_options, bsr_options, error); if (re == NULL) return NULL; + pcre_compile_options |= + get_pcre2_inline_compile_options (re, pcre_compile_options); + regex = g_new0 (GRegex, 1); regex->ref_count = 1; regex->pattern = g_strdup (pattern); regex->pcre_re = re; - regex->compile_opts = compile_options; - regex->orig_compile_opts = orig_compile_opts; - regex->match_opts = match_options; - enable_jit_with_match_options (regex, regex->match_opts); + regex->compile_opts = pcre_compile_options; + regex->orig_compile_opts = compile_options; + regex->match_opts = pcre_match_options; + regex->orig_match_opts = match_options; + regex->jit_status = enable_jit_with_match_options (regex, regex->match_opts); return regex; } -static gint -extract_newline_options (const GRegexCompileFlags 
compile_options, - const GRegexMatchFlags *match_options) -{ - gint newline_options = PCRE2_NEWLINE_ANY; - - if (compile_options & G_REGEX_NEWLINE_MASK) - newline_options = compile_options & G_REGEX_NEWLINE_MASK; - if (match_options && *match_options & G_REGEX_MATCH_NEWLINE_MASK) - newline_options = *match_options & G_REGEX_MATCH_NEWLINE_MASK; - - return newline_options; -} - -static gint -extract_bsr_options (const GRegexCompileFlags compile_options, - const GRegexMatchFlags *match_options) -{ - gint bsr_options = PCRE2_BSR_UNICODE; - - if (compile_options & PCRE2_BSR_ANYCRLF) - bsr_options = PCRE2_BSR_ANYCRLF; - if (match_options && *match_options & PCRE2_BSR_ANYCRLF) - bsr_options = PCRE2_BSR_ANYCRLF; - if (match_options && *match_options & PCRE2_BSR_UNICODE) - bsr_options = PCRE2_BSR_UNICODE; - - return bsr_options; -} - static pcre2_code * -regex_compile (const gchar *pattern, - GRegexCompileFlags compile_options, - GRegexCompileFlags *compile_options_out, - GRegexMatchFlags *match_options, - GError **error) +regex_compile (const gchar *pattern, + uint32_t compile_options, + uint32_t newline_options, + uint32_t bsr_options, + GError **error) { pcre2_code *re; pcre2_compile_context *context; const gchar *errmsg; PCRE2_SIZE erroffset; gint errcode; - GRegexCompileFlags nonpcre_compile_options; - uint32_t pcre_compile_options; - - nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; context = pcre2_compile_context_create (NULL); /* set newline options */ - pcre2_set_newline (context, extract_newline_options (compile_options, match_options)); + if (pcre2_set_newline (context, newline_options) != 0) + { + g_set_error (error, G_REGEX_ERROR, + G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, + "Invalid newline flags"); + pcre2_compile_context_free (context); + return NULL; + } /* set bsr options */ - pcre2_set_bsr (context, extract_bsr_options (compile_options, match_options)); + if (pcre2_set_bsr (context, bsr_options) != 0) + { + g_set_error (error, 
G_REGEX_ERROR, + G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, + "Invalid BSR flags"); + pcre2_compile_context_free (context); + return NULL; + } /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ if (compile_options & PCRE2_UTF) - { - compile_options |= PCRE2_NO_UTF_CHECK; - if (match_options != NULL) - *match_options |= PCRE2_NO_UTF_CHECK; - } + compile_options |= PCRE2_NO_UTF_CHECK; compile_options |= PCRE2_UCP; /* compile the pattern */ re = pcre2_compile ((PCRE2_SPTR8) pattern, PCRE2_ZERO_TERMINATED, - compile_options & ~G_REGEX_FLAGS_CONVERTED, + compile_options, &errcode, &erroffset, context); @@ -1757,30 +1819,33 @@ regex_compile (const gchar *pattern, return NULL; } + return re; +} + +static uint32_t +get_pcre2_inline_compile_options (pcre2_code *re, + uint32_t compile_options) +{ + uint32_t pcre_compile_options; + uint32_t nonpcre_compile_options; + /* For options set at the beginning of the pattern, pcre puts them into * compile options, e.g. "(?i)foo" will make the pcre structure store * PCRE2_CASELESS even though it wasn't explicitly given for compilation. 
*/ + nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); - compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK; - - /* Don't leak PCRE2_NEWLINE_ANY, which is part of PCRE2_NEWLINE_ANYCRLF */ - if ((pcre_compile_options & PCRE2_NEWLINE_ANYCRLF) != PCRE2_NEWLINE_ANYCRLF) - compile_options &= ~PCRE2_NEWLINE_ANY; - + compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; compile_options |= nonpcre_compile_options; if (!(compile_options & PCRE2_DUPNAMES)) { - gboolean jchanged = FALSE; + uint32_t jchanged = 0; pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); if (jchanged) compile_options |= PCRE2_DUPNAMES; } - if (compile_options_out != 0) - *compile_options_out = compile_options; - - return re; + return compile_options; } /** @@ -1817,7 +1882,7 @@ g_regex_get_pattern (const GRegex *regex) gint g_regex_get_max_backref (const GRegex *regex) { - gint value; + uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); @@ -1837,7 +1902,7 @@ g_regex_get_max_backref (const GRegex *regex) gint g_regex_get_capture_count (const GRegex *regex) { - gint value; + uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); @@ -1857,7 +1922,7 @@ g_regex_get_capture_count (const GRegex *regex) gboolean g_regex_get_has_cr_or_lf (const GRegex *regex) { - gint value; + uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); @@ -1879,7 +1944,7 @@ g_regex_get_has_cr_or_lf (const GRegex *regex) gint g_regex_get_max_lookbehind (const GRegex *regex) { - gint max_lookbehind; + uint32_t max_lookbehind; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, &max_lookbehind); @@ -1904,7 +1969,8 @@ g_regex_get_max_lookbehind (const GRegex *regex) GRegexCompileFlags g_regex_get_compile_flags (const GRegex *regex) { - gint extra_flags, info_value; + GRegexCompileFlags extra_flags; + 
uint32_t info_value; g_return_val_if_fail (regex != NULL, 0); @@ -1942,7 +2008,7 @@ g_regex_get_compile_flags (const GRegex *regex) break; } - return map_to_pcre1_compile_flags (regex->compile_opts) | extra_flags; + return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; } /** @@ -1958,9 +2024,15 @@ g_regex_get_compile_flags (const GRegex *regex) GRegexMatchFlags g_regex_get_match_flags (const GRegex *regex) { + uint32_t flags; + g_return_val_if_fail (regex != NULL, 0); - return map_to_pcre1_match_flags (regex->match_opts & G_REGEX_MATCH_MASK); + flags = g_regex_match_flags_from_pcre2 (regex->match_opts); + flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); + flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); + + return flags; } /** @@ -1994,9 +2066,6 @@ g_regex_match_simple (const gchar *pattern, GRegex *regex; gboolean result; - compile_options = map_to_pcre2_compile_flags (compile_options); - match_options = map_to_pcre2_match_flags (match_options); - regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); if (!regex) return FALSE; @@ -2064,8 +2133,6 @@ g_regex_match (const GRegex *regex, GRegexMatchFlags match_options, GMatchInfo **match_info) { - match_options = map_to_pcre2_match_flags (match_options); - return g_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL); } @@ -2149,8 +2216,6 @@ g_regex_match_full (const GRegex *regex, GMatchInfo *info; gboolean match_ok; - match_options = map_to_pcre2_match_flags (match_options); - g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (string != NULL, FALSE); g_return_val_if_fail (start_position >= 0, FALSE); @@ -2201,8 +2266,6 @@ g_regex_match_all (const GRegex *regex, GRegexMatchFlags match_options, GMatchInfo **match_info) { - match_options = map_to_pcre2_match_flags (match_options); - return g_regex_match_all_full (regex, string, -1, 0, match_options, match_info, NULL); } @@ -2274,8 
+2337,8 @@ g_regex_match_all_full (const GRegex *regex, gboolean done; pcre2_code *pcre_re; gboolean retval; - - match_options = map_to_pcre2_match_flags (match_options); + uint32_t newline_options; + uint32_t bsr_options; g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (string != NULL, FALSE); @@ -2283,6 +2346,14 @@ g_regex_match_all_full (const GRegex *regex, g_return_val_if_fail (error == NULL || *error == NULL, FALSE); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); + newline_options = get_pcre2_newline_match_options (match_options); + if (!newline_options) + newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); + + bsr_options = get_pcre2_bsr_match_options (match_options); + if (!bsr_options) + bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); + /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an * optimization for normal regex matching, but results in omitting some * shorter matches here, and an observable behaviour change. @@ -2291,7 +2362,7 @@ g_regex_match_all_full (const GRegex *regex, * codesearch.debian.net, so don't bother caching the recompiled RE. */ pcre_re = regex_compile (regex->pattern, regex->compile_opts | PCRE2_NO_AUTO_POSSESS, - NULL, NULL, error); + newline_options, bsr_options, error); if (pcre_re == NULL) return FALSE; @@ -2305,17 +2376,10 @@ g_regex_match_all_full (const GRegex *regex, info->matches = pcre2_dfa_match (pcre_re, (PCRE2_SPTR8) info->string, info->string_len, info->pos, - (regex->match_opts | match_options | PCRE2_NO_UTF_CHECK) & ~G_REGEX_FLAGS_CONVERTED, + (regex->match_opts | info->match_opts), info->match_data, info->match_context, info->workspace, info->n_workspace); - - if (!recalc_match_offsets (info, error)) - { - g_match_info_free (info); - return FALSE; - } - if (info->matches == PCRE2_ERROR_DFA_WSSIZE) { /* info->workspace is too small. 
*/ @@ -2342,6 +2406,11 @@ g_regex_match_all_full (const GRegex *regex, _("Error while matching regular expression %s: %s"), regex->pattern, match_error (info->matches)); } + else if (info->matches != PCRE2_ERROR_NOMATCH) + { + if (!recalc_match_offsets (info, error)) + info->matches = PCRE2_ERROR_NOMATCH; + } } pcre2_code_free (pcre_re); @@ -2438,9 +2507,6 @@ g_regex_split_simple (const gchar *pattern, GRegex *regex; gchar **result; - compile_options = map_to_pcre2_compile_flags (compile_options); - match_options = map_to_pcre2_match_flags (match_options); - regex = g_regex_new (pattern, compile_options, 0, NULL); if (!regex) return NULL; @@ -2484,8 +2550,6 @@ g_regex_split (const GRegex *regex, const gchar *string, GRegexMatchFlags match_options) { - match_options = map_to_pcre2_match_flags (match_options); - return g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); } @@ -2550,8 +2614,6 @@ g_regex_split_full (const GRegex *regex, /* the returned array of char **s */ gchar **string_list; - match_options = map_to_pcre2_match_flags (match_options); - g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (string != NULL, NULL); g_return_val_if_fail (start_position >= 0, NULL); @@ -3176,8 +3238,6 @@ g_regex_replace (const GRegex *regex, GList *list; GError *tmp_error = NULL; - match_options = map_to_pcre2_match_flags (match_options); - g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (string != NULL, NULL); g_return_val_if_fail (start_position >= 0, NULL); @@ -3247,8 +3307,6 @@ g_regex_replace_literal (const GRegex *regex, GRegexMatchFlags match_options, GError **error) { - match_options = map_to_pcre2_match_flags (match_options); - g_return_val_if_fail (replacement != NULL, NULL); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); @@ -3337,8 +3395,6 @@ g_regex_replace_eval (const GRegex *regex, gboolean done = FALSE; GError *tmp_error = NULL; - match_options = map_to_pcre2_match_flags (match_options); - 
g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (string != NULL, NULL); g_return_val_if_fail (start_position >= 0, NULL); diff --git a/glib/tests/regex.c b/glib/tests/regex.c index acb082b70..9803d4965 100644 --- a/glib/tests/regex.c +++ b/glib/tests/regex.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2005 - 2006, Marco Barisione <marco@barisione.org> * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> * * SPDX-License-Identifier: LGPL-2.1-or-later * @@ -105,7 +106,7 @@ test_new (gconstpointer d) data = g_new0 (TestNewData, 1); \ data->pattern = _pattern; \ data->compile_opts = _compile_opts; \ - data->match_opts = 0; \ + data->match_opts = _match_opts; \ data->expected_error = 0; \ data->check_flags = TRUE; \ data->real_compile_opts = _real_compile_opts; \ @@ -172,7 +173,24 @@ test_match_simple (gconstpointer d) data->compile_opts = _compile_opts; \ data->match_opts = _match_opts; \ data->expected = _expected; \ - path = g_strdup_printf ("/regex/match-%s/%d", _name, ++total); \ + total++; \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + path = g_strdup_printf ("/regex/match-%s-optimized/%d", _name, total); \ + else \ + path = g_strdup_printf ("/regex/match-%s/%d", _name, total); \ + g_test_add_data_func_full (path, data, test_match_simple, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestMatchData)); \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + { \ + data->compile_opts &= ~G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match-%s/%d", _name, total); \ + } \ + else \ + { \ + data->compile_opts |= G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match-%s-optimized/%d", _name, total); \ + } \ g_test_add_data_func_full (path, data, test_match_simple, g_free); \ g_free (path); \ } @@ -184,6 +202,108 @@ test_match_simple (gconstpointer d) #define TEST_MATCH_NOTEMPTY_ATSTART(_pattern, _string, _expected) \ TEST_MATCH_SIMPLE_NAMED("notempty-atstart", _pattern, _string, 0, 
G_REGEX_MATCH_NOTEMPTY_ATSTART, _expected) +static char * +compile_options_to_string (GRegexCompileFlags compile_flags) +{ + GStrvBuilder *builder = g_strv_builder_new(); + GStrv strv; + char *ret; + + if (compile_flags & G_REGEX_DEFAULT) + g_strv_builder_add (builder, "default"); + if (compile_flags & G_REGEX_CASELESS) + g_strv_builder_add (builder, "caseless"); + if (compile_flags & G_REGEX_MULTILINE) + g_strv_builder_add (builder, "multiline"); + if (compile_flags & G_REGEX_DOTALL) + g_strv_builder_add (builder, "dotall"); + if (compile_flags & G_REGEX_EXTENDED) + g_strv_builder_add (builder, "extended"); + if (compile_flags & G_REGEX_ANCHORED) + g_strv_builder_add (builder, "anchored"); + if (compile_flags & G_REGEX_DOLLAR_ENDONLY) + g_strv_builder_add (builder, "dollar-endonly"); + if (compile_flags & G_REGEX_UNGREEDY) + g_strv_builder_add (builder, "ungreedy"); + if (compile_flags & G_REGEX_RAW) + g_strv_builder_add (builder, "raw"); + if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) + g_strv_builder_add (builder, "no-auto-capture"); + if (compile_flags & G_REGEX_OPTIMIZE) + g_strv_builder_add (builder, "optimize"); + if (compile_flags & G_REGEX_FIRSTLINE) + g_strv_builder_add (builder, "firstline"); + if (compile_flags & G_REGEX_DUPNAMES) + g_strv_builder_add (builder, "dupnames"); + if (compile_flags & G_REGEX_NEWLINE_CR) + g_strv_builder_add (builder, "newline-cr"); + if (compile_flags & G_REGEX_NEWLINE_LF) + g_strv_builder_add (builder, "newline-lf"); + if (compile_flags & G_REGEX_NEWLINE_CRLF) + g_strv_builder_add (builder, "newline-crlf"); + if (compile_flags & G_REGEX_NEWLINE_ANYCRLF) + g_strv_builder_add (builder, "newline-anycrlf"); + if (compile_flags & G_REGEX_BSR_ANYCRLF) + g_strv_builder_add (builder, "bsr-anycrlf"); + + strv = g_strv_builder_end (builder); + ret = g_strjoinv ("|", strv); + + g_strfreev (strv); + g_strv_builder_unref (builder); + + return ret; +} + +static char * +match_options_to_string (GRegexMatchFlags match_flags) +{ + 
GStrvBuilder *builder = g_strv_builder_new(); + GStrv strv; + char *ret; + + if (match_flags & G_REGEX_MATCH_DEFAULT) + g_strv_builder_add (builder, "default"); + if (match_flags & G_REGEX_MATCH_ANCHORED) + g_strv_builder_add (builder, "anchored"); + if (match_flags & G_REGEX_MATCH_NOTBOL) + g_strv_builder_add (builder, "notbol"); + if (match_flags & G_REGEX_MATCH_NOTEOL) + g_strv_builder_add (builder, "noteol"); + if (match_flags & G_REGEX_MATCH_NOTEMPTY) + g_strv_builder_add (builder, "notempty"); + if (match_flags & G_REGEX_MATCH_PARTIAL) + g_strv_builder_add (builder, "partial"); + if (match_flags & G_REGEX_MATCH_NEWLINE_CR) + g_strv_builder_add (builder, "newline-cr"); + if (match_flags & G_REGEX_MATCH_NEWLINE_LF) + g_strv_builder_add (builder, "newline-lf"); + if (match_flags & G_REGEX_MATCH_NEWLINE_CRLF) + g_strv_builder_add (builder, "newline-crlf"); + if (match_flags & G_REGEX_MATCH_NEWLINE_ANY) + g_strv_builder_add (builder, "newline-any"); + if (match_flags & G_REGEX_MATCH_NEWLINE_ANYCRLF) + g_strv_builder_add (builder, "newline-anycrlf"); + if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) + g_strv_builder_add (builder, "bsr-anycrlf"); + if (match_flags & G_REGEX_MATCH_BSR_ANY) + g_strv_builder_add (builder, "bsr-any"); + if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) + g_strv_builder_add (builder, "partial-soft"); + if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) + g_strv_builder_add (builder, "partial-hard"); + if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) + g_strv_builder_add (builder, "notempty-atstart"); + + strv = g_strv_builder_end (builder); + ret = g_strjoinv ("|", strv); + + g_strfreev (strv); + g_strv_builder_unref (builder); + + return ret; +} + static void test_match (gconstpointer d) { @@ -191,6 +311,9 @@ test_match (gconstpointer d) GRegex *regex; gboolean match; GError *error = NULL; + gchar *compile_opts_str; + gchar *match_opts_str; + gchar *match_opts2_str; regex = g_regex_new (data->pattern, data->compile_opts, data->match_opts, &error); 
g_assert (regex != NULL); @@ -199,31 +322,35 @@ test_match (gconstpointer d) match = g_regex_match_full (regex, data->string, data->string_len, data->start_position, data->match_opts2, NULL, NULL); + compile_opts_str = compile_options_to_string (data->compile_opts); + match_opts_str = match_options_to_string (data->match_opts); + match_opts2_str = match_options_to_string (data->match_opts2); + if (data->expected) { if (!match) - g_error ("Regex '%s' (with compile options %u and " - "match options %u) should have matched '%.*s' " - "(of length %d, at position %d, with match options %u) but did not", - data->pattern, data->compile_opts, data->match_opts, + g_error ("Regex '%s' (with compile options '%s' and " + "match options '%s') should have matched '%.*s' " + "(of length %d, at position %d, with match options '%s') but did not", + data->pattern, compile_opts_str, match_opts_str, data->string_len == -1 ? (int) strlen (data->string) : (int) data->string_len, data->string, (int) data->string_len, - data->start_position, data->match_opts2); + data->start_position, match_opts2_str); g_assert_cmpint (match, ==, TRUE); } else { if (match) - g_error ("Regex '%s' (with compile options %u and " - "match options %u) should not have matched '%.*s' " - "(of length %d, at position %d, with match options %u) but did", - data->pattern, data->compile_opts, data->match_opts, + g_error ("Regex '%s' (with compile options '%s' and " + "match options '%s') should not have matched '%.*s' " + "(of length %d, at position %d, with match options '%s') but did", + data->pattern, compile_opts_str, match_opts_str, data->string_len == -1 ? 
(int) strlen (data->string) : (int) data->string_len, data->string, (int) data->string_len, - data->start_position, data->match_opts2); + data->start_position, match_opts2_str); } if (data->string_len == -1 && data->start_position == 0) @@ -232,6 +359,9 @@ test_match (gconstpointer d) g_assert_cmpint (match, ==, data->expected); } + g_free (compile_opts_str); + g_free (match_opts_str); + g_free (match_opts2_str); g_regex_unref (regex); } @@ -248,7 +378,24 @@ test_match (gconstpointer d) data->start_position = _start_position; \ data->match_opts2 = _match_opts2; \ data->expected = _expected; \ - path = g_strdup_printf ("/regex/match/%d", ++total); \ + total++; \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + path = g_strdup_printf ("/regex/match-optimized/%d", total); \ + else \ + path = g_strdup_printf ("/regex/match/%d", total); \ + g_test_add_data_func_full (path, data, test_match, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestMatchData)); \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + { \ + data->compile_opts &= ~G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match/%d", total); \ + } \ + else \ + { \ + data->compile_opts |= G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match-optimized/%d", total); \ + } \ g_test_add_data_func_full (path, data, test_match, g_free); \ g_free (path); \ } @@ -467,6 +614,7 @@ typedef struct { const gchar *pattern; const gchar *string; gint start_position; + GRegexCompileFlags compile_flags; GRegexMatchFlags match_opts; gint expected_count; } TestMatchCountData; @@ -479,7 +627,8 @@ test_match_count (gconstpointer d) GMatchInfo *match_info; gint count; - regex = g_regex_new (data->pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); + regex = g_regex_new (data->pattern, data->compile_flags, + G_REGEX_MATCH_DEFAULT, NULL); g_assert (regex != NULL); @@ -504,7 +653,14 @@ test_match_count (gconstpointer d) data->start_position = _start_position; \ data->match_opts = _match_opts; \ 
data->expected_count = _expected_count; \ - path = g_strdup_printf ("/regex/match/count/%d", ++total); \ + data->compile_flags = G_REGEX_DEFAULT; \ + total++; \ + path = g_strdup_printf ("/regex/match/count/%d", total); \ + g_test_add_data_func_full (path, data, test_match_count, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestMatchCountData)); \ + data->compile_flags |= G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match/count-optimized/%d", total); \ g_test_add_data_func_full (path, data, test_match_count, g_free); \ g_free (path); \ } @@ -543,7 +699,24 @@ test_partial (gconstpointer d) data->compile_opts = _compile_opts; \ data->match_opts = _match_opts; \ data->expected = _expected; \ - path = g_strdup_printf ("/regex/match/partial/%d", ++total); \ + total++; \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + path = g_strdup_printf ("/regex/match/partial-optimized/%d", total); \ + else \ + path = g_strdup_printf ("/regex/match/partial%d", total); \ + g_test_add_data_func_full (path, data, test_partial, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestMatchData)); \ + if (data->compile_opts & G_REGEX_OPTIMIZE) \ + { \ + data->compile_opts &= ~G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match/partial%d", total); \ + } \ + else \ + { \ + data->compile_opts |= G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match/partial-optimized/%d", total); \ + } \ g_test_add_data_func_full (path, data, test_partial, g_free); \ g_free (path); \ } @@ -553,6 +726,7 @@ test_partial (gconstpointer d) typedef struct { const gchar *pattern; const gchar *string; + GRegexCompileFlags compile_flags; gint start_position; gint sub_n; const gchar *expected_sub; @@ -569,7 +743,7 @@ test_sub_pattern (gconstpointer d) gchar *sub_expr; gint start = UNTOUCHED, end = UNTOUCHED; - regex = g_regex_new (data->pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); + regex = g_regex_new (data->pattern, data->compile_flags, 
G_REGEX_MATCH_DEFAULT, NULL); g_assert (regex != NULL); @@ -599,7 +773,14 @@ test_sub_pattern (gconstpointer d) data->expected_sub = _expected_sub; \ data->expected_start = _expected_start; \ data->expected_end = _expected_end; \ - path = g_strdup_printf ("/regex/match/subpattern/%d", ++total); \ + data->compile_flags = G_REGEX_DEFAULT; \ + total++; \ + path = g_strdup_printf ("/regex/match/subpattern/%d", total); \ + g_test_add_data_func_full (path, data, test_sub_pattern, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestSubData)); \ + data->compile_flags = G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/match/subpattern-optimized/%d", total); \ g_test_add_data_func_full (path, data, test_sub_pattern, g_free); \ g_free (path); \ } @@ -1094,6 +1275,8 @@ typedef struct { gint start_position; const gchar *replacement; const gchar *expected; + GRegexCompileFlags compile_flags; + GRegexMatchFlags match_flags; } TestReplaceData; static void @@ -1102,17 +1285,25 @@ test_replace (gconstpointer d) const TestReplaceData *data = d; GRegex *regex; gchar *res; + GError *error = NULL; - regex = g_regex_new (data->pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); - res = g_regex_replace (regex, data->string, -1, data->start_position, data->replacement, 0, NULL); + regex = g_regex_new (data->pattern, data->compile_flags, G_REGEX_MATCH_DEFAULT, &error); + g_assert_no_error (error); + + res = g_regex_replace (regex, data->string, -1, data->start_position, + data->replacement, data->match_flags, &error); g_assert_cmpstr (res, ==, data->expected); + if (data->expected) + g_assert_no_error (error); + g_free (res); g_regex_unref (regex); + g_clear_error (&error); } -#define TEST_REPLACE(_pattern, _string, _start_position, _replacement, _expected) { \ +#define TEST_REPLACE_OPTIONS(_pattern, _string, _start_position, _replacement, _expected, _compile_flags, _match_flags) { \ TestReplaceData *data; \ gchar *path; \ data = g_new0 (TestReplaceData, 1); \ @@ 
-1121,11 +1312,33 @@ test_replace (gconstpointer d) data->start_position = _start_position; \ data->replacement = _replacement; \ data->expected = _expected; \ - path = g_strdup_printf ("/regex/replace/%d", ++total); \ + data->compile_flags = _compile_flags; \ + data->match_flags = _match_flags; \ + total++; \ + if (data->compile_flags & G_REGEX_OPTIMIZE) \ + path = g_strdup_printf ("/regex/replace-optimized/%d", total); \ + else \ + path = g_strdup_printf ("/regex/replace/%d", total); \ + g_test_add_data_func_full (path, data, test_replace, g_free); \ + g_free (path); \ + data = g_memdup2 (data, sizeof (TestReplaceData)); \ + if (data->compile_flags & G_REGEX_OPTIMIZE) \ + { \ + data->compile_flags &= ~G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/replace/%d", total); \ + } \ + else \ + { \ + data->compile_flags |= G_REGEX_OPTIMIZE; \ + path = g_strdup_printf ("/regex/replace-optimized/%d", total); \ + } \ g_test_add_data_func_full (path, data, test_replace, g_free); \ g_free (path); \ } +#define TEST_REPLACE(_pattern, _string, _start_position, _replacement, _expected) \ + TEST_REPLACE_OPTIONS (_pattern, _string, _start_position, _replacement, _expected, 0, 0) + static void test_replace_lit (gconstpointer d) { @@ -1556,6 +1769,12 @@ test_class (void) res = g_match_info_next (match, NULL); g_assert (!res); + /* Accessing match again should not crash */ + g_test_expect_message ("GLib", G_LOG_LEVEL_CRITICAL, + "*match_info->pos >= 0*"); + g_assert_false (g_match_info_next (match, NULL)); + g_test_assert_expected_messages (); + g_match_info_free (match); g_regex_unref (regex); } @@ -2200,6 +2419,67 @@ test_compile_errors (void) g_clear_error (&error); } +static void +test_jit_unsupported_matching_options (void) +{ + GRegex *regex; + GMatchInfo *info; + gchar *substring; + + regex = g_regex_new ("(\\w+)#(\\w+)", G_REGEX_OPTIMIZE, G_REGEX_MATCH_DEFAULT, NULL); + + g_assert_true (g_regex_match (regex, "aa#bb cc#dd", G_REGEX_MATCH_DEFAULT, &info)); + 
g_assert_cmpint (g_match_info_get_match_count (info), ==, 3); + substring = g_match_info_fetch (info, 1); + g_assert_cmpstr (substring, ==, "aa"); + g_clear_pointer (&substring, g_free); + substring = g_match_info_fetch (info, 2); + g_assert_cmpstr (substring, ==, "bb"); + g_clear_pointer (&substring, g_free); + g_assert_true (g_match_info_next (info, NULL)); + g_assert_cmpint (g_match_info_get_match_count (info), ==, 3); + substring = g_match_info_fetch (info, 1); + g_assert_cmpstr (substring, ==, "cc"); + g_clear_pointer (&substring, g_free); + substring = g_match_info_fetch (info, 2); + g_assert_cmpstr (substring, ==, "dd"); + g_clear_pointer (&substring, g_free); + g_assert_false (g_match_info_next (info, NULL)); + g_match_info_free (info); + + g_assert_true (g_regex_match (regex, "aa#bb cc#dd", G_REGEX_MATCH_ANCHORED, &info)); + g_assert_cmpint (g_match_info_get_match_count (info), ==, 3); + substring = g_match_info_fetch (info, 1); + g_assert_cmpstr (substring, ==, "aa"); + g_clear_pointer (&substring, g_free); + substring = g_match_info_fetch (info, 2); + g_assert_cmpstr (substring, ==, "bb"); + g_clear_pointer (&substring, g_free); + g_assert_false (g_match_info_next (info, NULL)); + g_match_info_free (info); + + g_assert_true (g_regex_match (regex, "aa#bb cc#dd", G_REGEX_MATCH_DEFAULT, &info)); + g_assert_cmpint (g_match_info_get_match_count (info), ==, 3); + substring = g_match_info_fetch (info, 1); + g_assert_cmpstr (substring, ==, "aa"); + g_clear_pointer (&substring, g_free); + substring = g_match_info_fetch (info, 2); + g_assert_cmpstr (substring, ==, "bb"); + g_clear_pointer (&substring, g_free); + g_assert_true (g_match_info_next (info, NULL)); + g_assert_cmpint (g_match_info_get_match_count (info), ==, 3); + substring = g_match_info_fetch (info, 1); + g_assert_cmpstr (substring, ==, "cc"); + g_clear_pointer (&substring, g_free); + substring = g_match_info_fetch (info, 2); + g_assert_cmpstr (substring, ==, "dd"); + g_clear_pointer (&substring, 
g_free); + g_assert_false (g_match_info_next (info, NULL)); + g_match_info_free (info); + + g_regex_unref (regex); +} + int main (int argc, char *argv[]) { @@ -2218,6 +2498,7 @@ main (int argc, char *argv[]) g_test_add_func ("/regex/explicit-crlf", test_explicit_crlf); g_test_add_func ("/regex/max-lookbehind", test_max_lookbehind); g_test_add_func ("/regex/compile-errors", test_compile_errors); + g_test_add_func ("/regex/jit-unsupported-matching", test_jit_unsupported_matching_options); /* TEST_NEW(pattern, compile_opts, match_opts) */ TEST_NEW("[A-Z]+", G_REGEX_CASELESS | G_REGEX_EXTENDED | G_REGEX_OPTIMIZE, G_REGEX_MATCH_NOTBOL | G_REGEX_MATCH_PARTIAL); @@ -2243,7 +2524,13 @@ main (int argc, char *argv[]) /* TEST_NEW_CHECK_FLAGS(pattern, compile_opts, match_ops, real_compile_opts, real_match_opts) */ TEST_NEW_CHECK_FLAGS ("a", G_REGEX_OPTIMIZE, 0, G_REGEX_OPTIMIZE, 0); + TEST_NEW_CHECK_FLAGS ("a", G_REGEX_OPTIMIZE, G_REGEX_MATCH_NOTEMPTY, + G_REGEX_OPTIMIZE, G_REGEX_MATCH_NOTEMPTY); + TEST_NEW_CHECK_FLAGS ("a", 0, G_REGEX_MATCH_NEWLINE_ANYCRLF | G_REGEX_MATCH_BSR_ANYCRLF, + G_REGEX_NEWLINE_ANYCRLF | G_REGEX_BSR_ANYCRLF, + G_REGEX_MATCH_NEWLINE_ANYCRLF | G_REGEX_MATCH_BSR_ANYCRLF); TEST_NEW_CHECK_FLAGS ("a", G_REGEX_RAW, 0, G_REGEX_RAW, 0); + TEST_NEW_CHECK_FLAGS ("(?J)a", 0, 0, G_REGEX_DUPNAMES, 0); TEST_NEW_CHECK_FLAGS ("^.*", 0, 0, G_REGEX_ANCHORED, 0); TEST_NEW_CHECK_FLAGS ("(*UTF8)a", 0, 0, 0 /* this is the default in GRegex */, 0); TEST_NEW_CHECK_FLAGS ("(*UCP)a", 0, 0, 0 /* this always on in GRegex */, 0); @@ -2315,6 +2602,16 @@ main (int argc, char *argv[]) TEST_NEW_FAIL ("\\k", 0, G_REGEX_ERROR_MISSING_NAME); TEST_NEW_FAIL ("a[\\NB]c", 0, G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS); TEST_NEW_FAIL ("(*:0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEFG)XX", 
0, G_REGEX_ERROR_NAME_TOO_LONG); + /* See https://gitlab.gnome.org/GNOME/gtksourceview/-/issues/278 */ + TEST_NEW_FAIL ("(?i-x)((?:(?i-x)[^\\x00\\t\\n\\f\\r \"'/<=>\\x{007F}-\\x{009F}" \ + "\\x{FDD0}-\\x{FDEF}\\x{FFFE}\\x{FFFF}\\x{1FFFE}\\x{1FFFF}" \ + "\\x{2FFFE}\\x{2FFFF}\\x{3FFFE}\\x{3FFFF}\\x{4FFFE}\\x{4FFFF}" \ + "\\x{5FFFE}\\x{5FFFF}\\x{6FFFE}\\x{6FFFF}\\x{7FFFE}\\x{7FFFF}" \ + "\\x{8FFFE}\\x{8FFFF}\\x{9FFFE}\\x{9FFFF}\\x{AFFFE}\\x{AFFFF}" \ + "\\x{BFFFE}\\x{BFFFF}\\x{CFFFE}\\x{CFFFF}\\x{DFFFE}\\x{DFFFF}" \ + "\\x{EFFFE}\\x{EFFFF}\\x{FFFFE}\\x{FFFFF}\\x{10FFFE}\\x{10FFFF}]+)" \ + "\\s*=\\s*)(\\\")", + G_REGEX_RAW, G_REGEX_ERROR_HEX_CODE_TOO_LARGE); /* These errors can't really be tested easily: * G_REGEX_ERROR_EXPRESSION_TOO_LARGE @@ -2338,6 +2635,7 @@ main (int argc, char *argv[]) TEST_MATCH_SIMPLE("a", "ab", 0, G_REGEX_MATCH_ANCHORED, TRUE); TEST_MATCH_SIMPLE("a", "a", G_REGEX_CASELESS, 0, TRUE); TEST_MATCH_SIMPLE("a", "A", G_REGEX_CASELESS, 0, TRUE); + TEST_MATCH_SIMPLE("\\C\\C", "ab", G_REGEX_OPTIMIZE | G_REGEX_RAW, 0, TRUE); /* These are needed to test extended properties. 
*/ TEST_MATCH_SIMPLE(AGRAVE, AGRAVE, G_REGEX_CASELESS, 0, TRUE); TEST_MATCH_SIMPLE(AGRAVE, AGRAVE_UPPER, G_REGEX_CASELESS, 0, TRUE); @@ -2449,6 +2747,8 @@ main (int argc, char *argv[]) TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, 0, "a\rb\rc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_LF, 0, "a\rb\rc", -1, 0, 0, FALSE); TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CRLF, 0, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_ANYCRLF, 0, "a\r\nb\nc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_ANYCRLF, 0, "a\r\nb\rc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a\nb\nc", -1, 0, 0, FALSE); TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a\nb\nc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a\nb\nc", -1, 0, 0, FALSE); @@ -2458,6 +2758,8 @@ main (int argc, char *argv[]) TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a\rb\rc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a\rb\rc", -1, 0, 0, FALSE); TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a\rb\rc", -1, 0, 0, FALSE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_ANYCRLF, "a\r\nb\rc", -1, 0, 0, TRUE); + TEST_MATCH("^b$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_ANYCRLF, "a\r\nb\nc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_ANY, "a\nb\nc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_ANY, "a\rb\rc", -1, 0, 0, TRUE); @@ -2467,6 +2769,13 @@ main (int argc, char *argv[]) TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_CRLF, "a\r\nb\r\nc", -1, 0, 0, TRUE); TEST_MATCH("^b$", G_REGEX_MULTILINE | G_REGEX_NEWLINE_CR, G_REGEX_MATCH_NEWLINE_CRLF, "a\rb\rc", -1, 0, 0, FALSE); + /* See 
https://gitlab.gnome.org/GNOME/glib/-/issues/2729#note_1544130 */ + TEST_MATCH("^a$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_ANY, "a", -1, 0, 0, TRUE); + TEST_MATCH("^a$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_LF, "a", -1, 0, 0, TRUE); + TEST_MATCH("^a$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CR, "a", -1, 0, 0, TRUE); + TEST_MATCH("^a$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_CRLF, "a", -1, 0, 0, TRUE); + TEST_MATCH("^a$", G_REGEX_MULTILINE, G_REGEX_MATCH_NEWLINE_ANYCRLF, "a", -1, 0, 0, TRUE); + TEST_MATCH("a#\nb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); TEST_MATCH("a#\r\nb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); TEST_MATCH("a#\rb", G_REGEX_EXTENDED, 0, "a", -1, 0, 0, FALSE); @@ -2786,6 +3095,12 @@ main (int argc, char *argv[]) TEST_REPLACE("\\S+", "hello world", 0, "\\U-\\0-", "-HELLO- -WORLD-"); TEST_REPLACE(".", "a", 0, "\\A", NULL); TEST_REPLACE(".", "a", 0, "\\g", NULL); + TEST_REPLACE_OPTIONS("(\\w+)#(\\w+)", "aa#bb cc#dd", 0, "\\2#\\1", "bb#aa dd#cc", + G_REGEX_OPTIMIZE|G_REGEX_MULTILINE|G_REGEX_CASELESS, + 0); + TEST_REPLACE_OPTIONS("(\\w+)#(\\w+)", "aa#bb cc#dd", 0, "\\2#\\1", "bb#aa cc#dd", + G_REGEX_OPTIMIZE|G_REGEX_MULTILINE|G_REGEX_CASELESS, + G_REGEX_MATCH_ANCHORED); /* TEST_REPLACE_LIT(pattern, string, start_position, replacement, expected) */ TEST_REPLACE_LIT("a", "ababa", 0, "A", "AbAbA"); |