diff options
Diffstat (limited to 'glib/gregex.c')
-rw-r--r-- | glib/gregex.c | 1400 |
1 files changed, 929 insertions, 471 deletions
diff --git a/glib/gregex.c b/glib/gregex.c index 2fa069891..39b9edeec 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -3,6 +3,7 @@ * Copyright (C) 1999, 2000 Scott Wimer * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> + * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> * * SPDX-License-Identifier: LGPL-2.1-or-later * @@ -22,9 +23,11 @@ #include "config.h" +#include <stdint.h> #include <string.h> -#include <pcre.h> +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> #include "gtypes.h" #include "gregex.h" @@ -86,18 +89,6 @@ * unescaped "#" outside a character class is encountered. This indicates * a comment that lasts until after the next newline. * - * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern - * matching is changed to be compatible with the way that regular expressions - * work in JavaScript. More precisely, a lonely ']' character in the pattern - * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and - * you must use the '\u' escape sequence with 4 hex digits to specify a unicode - * codepoint instead of '\x' or 'x{....}'. If '\x' or '\u' are not followed by - * the specified number of hex digits, they match 'x' and 'u' literally; also - * '\U' always matches 'U' instead of being an error in the pattern. Finally, - * pattern matching is modified so that back references to an unset subpattern - * group produces a match with the empty string instead of an error. See - * pcreapi(3) for more information. - * * Creating and manipulating the same #GRegex structure from different * threads is not a problem as #GRegex does not modify its internal * state between creation and destruction, on the other hand #GMatchInfo @@ -109,33 +100,64 @@ * library written by Philip Hazel. */ +#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \ + PCRE2_NO_UTF_CHECK | \ + PCRE2_ENDANCHORED) + /* Mask of all the possible values for GRegexCompileFlags. */ -#define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \ - G_REGEX_MULTILINE | \ - G_REGEX_DOTALL | \ - G_REGEX_EXTENDED | \ - G_REGEX_ANCHORED | \ - G_REGEX_DOLLAR_ENDONLY | \ - G_REGEX_UNGREEDY | \ - G_REGEX_RAW | \ - G_REGEX_NO_AUTO_CAPTURE | \ - G_REGEX_OPTIMIZE | \ - G_REGEX_FIRSTLINE | \ - G_REGEX_DUPNAMES | \ - G_REGEX_NEWLINE_CR | \ - G_REGEX_NEWLINE_LF | \ - G_REGEX_NEWLINE_CRLF | \ - G_REGEX_NEWLINE_ANYCRLF | \ - G_REGEX_BSR_ANYCRLF | \ - G_REGEX_JAVASCRIPT_COMPAT) - -/* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */ -#define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK) -#define G_REGEX_COMPILE_NONPCRE_MASK (G_REGEX_RAW | \ - G_REGEX_OPTIMIZE) +#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \ + G_REGEX_CASELESS | \ + G_REGEX_MULTILINE | \ + G_REGEX_DOTALL | \ + G_REGEX_EXTENDED | \ + G_REGEX_ANCHORED | \ + G_REGEX_DOLLAR_ENDONLY | \ + G_REGEX_UNGREEDY | \ + G_REGEX_RAW | \ + G_REGEX_NO_AUTO_CAPTURE | \ + G_REGEX_OPTIMIZE | \ + G_REGEX_FIRSTLINE | \ + G_REGEX_DUPNAMES | \ + G_REGEX_NEWLINE_CR | \ + G_REGEX_NEWLINE_LF | \ + G_REGEX_NEWLINE_CRLF | \ + G_REGEX_NEWLINE_ANYCRLF | \ + G_REGEX_BSR_ANYCRLF) + +#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \ + PCRE2_ALT_BSUX | \ + PCRE2_AUTO_CALLOUT | \ + PCRE2_CASELESS | \ + PCRE2_DOLLAR_ENDONLY | \ + PCRE2_DOTALL | \ + PCRE2_DUPNAMES | \ + PCRE2_EXTENDED | \ + PCRE2_FIRSTLINE | \ + PCRE2_MATCH_UNSET_BACKREF | \ + PCRE2_MULTILINE | \ + PCRE2_NEVER_UCP | \ + PCRE2_NEVER_UTF | \ + PCRE2_NO_AUTO_CAPTURE | \ + PCRE2_NO_AUTO_POSSESS | \ + PCRE2_NO_DOTSTAR_ANCHOR | \ + PCRE2_NO_START_OPTIMIZE | \ + PCRE2_UCP | \ + PCRE2_UNGREEDY | \ + PCRE2_UTF | \ + PCRE2_NEVER_BACKSLASH_C | \ + PCRE2_ALT_CIRCUMFLEX | \ + PCRE2_ALT_VERBNAMES | \ + PCRE2_USE_OFFSET_LIMIT | \ + PCRE2_EXTENDED_MORE | \ + PCRE2_LITERAL | \ + PCRE2_MATCH_INVALID_UTF | \ + G_REGEX_PCRE_GENERIC_MASK) + +#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF) /* Mask of all the possible values for GRegexMatchFlags. */ -#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ +#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \ + G_REGEX_MATCH_ANCHORED | \ G_REGEX_MATCH_NOTBOL | \ G_REGEX_MATCH_NOTEOL | \ G_REGEX_MATCH_NOTEMPTY | \ @@ -151,45 +173,39 @@ G_REGEX_MATCH_PARTIAL_HARD | \ G_REGEX_MATCH_NOTEMPTY_ATSTART) -/* we rely on these flags having the same values */ -G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS); -G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE); -G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL); -G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED); -G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED); -G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY); -G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY); -G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE); -G_STATIC_ASSERT (G_REGEX_FIRSTLINE == PCRE_FIRSTLINE); -G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES); -G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR); -G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF); -G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); -G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT); - -G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART); - -/* These PCRE flags are unused or not exposed publicly in GRegexFlags, so - * it should be ok to reuse them for different things. +#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\ + PCRE2_NOTEOL |\ + PCRE2_NOTEMPTY |\ + PCRE2_NOTEMPTY_ATSTART |\ + PCRE2_PARTIAL_SOFT |\ + PCRE2_PARTIAL_HARD |\ + PCRE2_NO_JIT |\ + PCRE2_COPY_MATCHED_SUBJECT |\ + G_REGEX_PCRE_GENERIC_MASK) + +/* TODO: Support PCRE2_NEWLINE_NUL */ +#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ + PCRE2_NEWLINE_LF | \ + PCRE2_NEWLINE_CRLF | \ + PCRE2_NEWLINE_ANYCRLF) + +/* Some match options are not supported when using JIT as stated in the + * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section: + * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5 */ -G_STATIC_ASSERT (G_REGEX_OPTIMIZE == PCRE_NO_UTF8_CHECK); -G_STATIC_ASSERT (G_REGEX_RAW == PCRE_UTF8); +#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \ + PCRE2_ENDANCHORED) + +#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \ + G_REGEX_NEWLINE_LF | \ + G_REGEX_NEWLINE_CRLF | \ + G_REGEX_NEWLINE_ANYCRLF) + +#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \ + G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY | \ + G_REGEX_MATCH_NEWLINE_ANYCRLF) /* if the string is in UTF-8 use g_utf8_ functions, else use * use just +/- 1. */ @@ -204,30 +220,42 @@ struct _GMatchInfo { gint ref_count; /* the ref count (atomic) */ GRegex *regex; /* the regex */ - GRegexMatchFlags match_opts; /* options used at match time on the regex */ + uint32_t match_opts; /* pcre match options used at match time on the regex */ gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */ - gint n_subpatterns; /* total number of sub patterns in the regex */ + uint32_t n_subpatterns; /* total number of sub patterns in the regex */ gint pos; /* position in the string where last match left off */ - gint n_offsets; /* number of offsets */ + uint32_t n_offsets; /* number of offsets */ gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */ - gint *workspace; /* workspace for pcre_dfa_exec() */ - gint n_workspace; /* number of workspace elements */ + gint *workspace; /* workspace for pcre2_dfa_match() */ + PCRE2_SIZE n_workspace; /* number of workspace elements */ const gchar *string; /* string passed to the match function */ gssize string_len; /* length of string, in bytes */ + pcre2_match_context *match_context; + pcre2_match_data *match_data; }; +typedef enum +{ + JIT_STATUS_DEFAULT, + JIT_STATUS_ENABLED, + JIT_STATUS_DISABLED +} JITStatus; + struct _GRegex { gint ref_count; /* the ref count for the immutable part (atomic) */ gchar *pattern; /* the pattern */ - pcre *pcre_re; /* compiled form of the pattern */ - GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */ - GRegexMatchFlags match_opts; /* options used at match time on the regex */ - pcre_extra *extra; /* data stored when G_REGEX_OPTIMIZE is used */ + pcre2_code *pcre_re; /* compiled form of the pattern */ + uint32_t compile_opts; /* options used at compile time on the pattern, pcre2 values */ + GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */ + uint32_t match_opts; /* pcre2 options used at match time on the regex */ + GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */ + uint32_t jit_options; /* options which were enabled for jit compiler */ + JITStatus jit_status; /* indicates the status of jit compiler for this compiled regex */ }; /* TRUE if ret is an error code, FALSE otherwise. */ -#define IS_PCRE_ERROR(ret) ((ret) < PCRE_ERROR_NOMATCH && (ret) != PCRE_ERROR_PARTIAL) +#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL) typedef struct _InterpolationData InterpolationData; static gboolean interpolation_list_needs_match (GList *list); @@ -238,327 +266,548 @@ static GList *split_replacement (const gchar *replacement, GError **error); static void free_interpolation_data (InterpolationData *data); +static uint32_t +get_pcre2_compile_options (GRegexCompileFlags compile_flags) +{ + /* Maps compile flags to pcre2 values */ + uint32_t pcre2_flags = 0; + + if (compile_flags & G_REGEX_CASELESS) + pcre2_flags |= PCRE2_CASELESS; + if (compile_flags & G_REGEX_MULTILINE) + pcre2_flags |= PCRE2_MULTILINE; + if (compile_flags & G_REGEX_DOTALL) + pcre2_flags |= PCRE2_DOTALL; + if (compile_flags & G_REGEX_EXTENDED) + pcre2_flags |= PCRE2_EXTENDED; + if (compile_flags & G_REGEX_ANCHORED) + pcre2_flags |= PCRE2_ANCHORED; + if (compile_flags & G_REGEX_DOLLAR_ENDONLY) + pcre2_flags |= PCRE2_DOLLAR_ENDONLY; + if (compile_flags & G_REGEX_UNGREEDY) + pcre2_flags |= PCRE2_UNGREEDY; + if (!(compile_flags & G_REGEX_RAW)) + pcre2_flags |= PCRE2_UTF; + if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) + pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; + if (compile_flags & G_REGEX_FIRSTLINE) + pcre2_flags |= PCRE2_FIRSTLINE; + if (compile_flags & G_REGEX_DUPNAMES) + pcre2_flags |= PCRE2_DUPNAMES; + + return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; +} + +static uint32_t +get_pcre2_match_options (GRegexMatchFlags match_flags, + GRegexCompileFlags compile_flags) +{ + /* Maps match flags to pcre2 values */ + uint32_t pcre2_flags = 0; + + if (match_flags & G_REGEX_MATCH_ANCHORED) + pcre2_flags |= PCRE2_ANCHORED; + if (match_flags & G_REGEX_MATCH_NOTBOL) + pcre2_flags |= PCRE2_NOTBOL; + if (match_flags & G_REGEX_MATCH_NOTEOL) + pcre2_flags |= PCRE2_NOTEOL; + if (match_flags & G_REGEX_MATCH_NOTEMPTY) + pcre2_flags |= PCRE2_NOTEMPTY; + if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) + pcre2_flags |= PCRE2_PARTIAL_SOFT; + if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) + pcre2_flags |= PCRE2_PARTIAL_HARD; + if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) + pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; + + if (compile_flags & G_REGEX_RAW) + pcre2_flags |= PCRE2_NO_UTF_CHECK; + + return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; +} + +static GRegexCompileFlags +g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) +{ + GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; + + if (pcre2_flags & PCRE2_CASELESS) + compile_flags |= G_REGEX_CASELESS; + if (pcre2_flags & PCRE2_MULTILINE) + compile_flags |= G_REGEX_MULTILINE; + if (pcre2_flags & PCRE2_DOTALL) + compile_flags |= G_REGEX_DOTALL; + if (pcre2_flags & PCRE2_EXTENDED) + compile_flags |= G_REGEX_EXTENDED; + if (pcre2_flags & PCRE2_ANCHORED) + compile_flags |= G_REGEX_ANCHORED; + if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) + compile_flags |= G_REGEX_DOLLAR_ENDONLY; + if (pcre2_flags & PCRE2_UNGREEDY) + compile_flags |= G_REGEX_UNGREEDY; + if (!(pcre2_flags & PCRE2_UTF)) + compile_flags |= G_REGEX_RAW; + if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) + compile_flags |= G_REGEX_NO_AUTO_CAPTURE; + if (pcre2_flags & PCRE2_FIRSTLINE) + compile_flags |= G_REGEX_FIRSTLINE; + if (pcre2_flags & PCRE2_DUPNAMES) + compile_flags |= G_REGEX_DUPNAMES; + + return compile_flags & G_REGEX_COMPILE_MASK; +} + +static GRegexMatchFlags +g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) +{ + GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; + + if (pcre2_flags & PCRE2_ANCHORED) + match_flags |= G_REGEX_MATCH_ANCHORED; + if (pcre2_flags & PCRE2_NOTBOL) + match_flags |= G_REGEX_MATCH_NOTBOL; + if (pcre2_flags & PCRE2_NOTEOL) + match_flags |= G_REGEX_MATCH_NOTEOL; + if (pcre2_flags & PCRE2_NOTEMPTY) + match_flags |= G_REGEX_MATCH_NOTEMPTY; + if (pcre2_flags & PCRE2_PARTIAL_SOFT) + match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; + if (pcre2_flags & PCRE2_PARTIAL_HARD) + match_flags |= G_REGEX_MATCH_PARTIAL_HARD; + if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) + match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; + + return (match_flags & G_REGEX_MATCH_MASK); +} + +static uint32_t +get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) +{ + compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; + + switch (compile_flags) + { + case G_REGEX_NEWLINE_CR: + return PCRE2_NEWLINE_CR; + case G_REGEX_NEWLINE_LF: + return PCRE2_NEWLINE_LF; + case G_REGEX_NEWLINE_CRLF: + return PCRE2_NEWLINE_CRLF; + case G_REGEX_NEWLINE_ANYCRLF: + return PCRE2_NEWLINE_ANYCRLF; + default: + if (compile_flags != 0) + return 0; + + return PCRE2_NEWLINE_ANY; + } +} + +static uint32_t +get_pcre2_newline_match_options (GRegexMatchFlags match_flags) +{ + switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) + { + case G_REGEX_MATCH_NEWLINE_CR: + return PCRE2_NEWLINE_CR; + case G_REGEX_MATCH_NEWLINE_LF: + return PCRE2_NEWLINE_LF; + case G_REGEX_MATCH_NEWLINE_CRLF: + return PCRE2_NEWLINE_CRLF; + case G_REGEX_MATCH_NEWLINE_ANY: + return PCRE2_NEWLINE_ANY; + case G_REGEX_MATCH_NEWLINE_ANYCRLF: + return PCRE2_NEWLINE_ANYCRLF; + default: + return 0; + } +} + +static uint32_t +get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) +{ + if (compile_flags & G_REGEX_BSR_ANYCRLF) + return PCRE2_BSR_ANYCRLF; + + return PCRE2_BSR_UNICODE; +} + +static uint32_t +get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) +{ + if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) + return PCRE2_BSR_ANYCRLF; + + if (match_flags & G_REGEX_MATCH_BSR_ANY) + return PCRE2_BSR_UNICODE; + + return 0; +} + +static char * +get_pcre2_error_string (int errcode) +{ + PCRE2_UCHAR8 error_msg[2048]; + int err_length; + + err_length = pcre2_get_error_message (errcode, error_msg, + G_N_ELEMENTS (error_msg)); + + if (err_length <= 0) + return NULL; + + /* The array is always filled with a trailing zero */ + g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); + return g_memdup2 (error_msg, err_length + 1); +} static const gchar * -match_error (gint errcode) +translate_match_error (gint errcode) { switch (errcode) { - case PCRE_ERROR_NOMATCH: + case PCRE2_ERROR_NOMATCH: /* not an error */ break; - case PCRE_ERROR_NULL: + case PCRE2_ERROR_NULL: /* NULL argument, this should not happen in GRegex */ - g_warning ("A NULL argument was passed to PCRE"); + g_critical ("A NULL argument was passed to PCRE"); break; - case PCRE_ERROR_BADOPTION: + case PCRE2_ERROR_BADOPTION: return "bad options"; - case PCRE_ERROR_BADMAGIC: + case PCRE2_ERROR_BADMAGIC: return _("corrupted object"); - case PCRE_ERROR_UNKNOWN_OPCODE: - return N_("internal error or corrupted object"); - case PCRE_ERROR_NOMEMORY: + case PCRE2_ERROR_NOMEMORY: return _("out of memory"); - case PCRE_ERROR_NOSUBSTRING: - /* not used by pcre_exec() */ + case PCRE2_ERROR_NOSUBSTRING: + /* not used by pcre2_match() */ break; - case PCRE_ERROR_MATCHLIMIT: + case PCRE2_ERROR_MATCHLIMIT: + case PCRE2_ERROR_JIT_STACKLIMIT: return _("backtracking limit reached"); - case PCRE_ERROR_CALLOUT: + case PCRE2_ERROR_CALLOUT: /* callouts are not implemented */ break; - case PCRE_ERROR_BADUTF8: - case PCRE_ERROR_BADUTF8_OFFSET: + case PCRE2_ERROR_BADUTFOFFSET: /* we do not check if strings are valid */ break; - case PCRE_ERROR_PARTIAL: + case PCRE2_ERROR_PARTIAL: /* not an error */ break; - case PCRE_ERROR_BADPARTIAL: - return _("the pattern contains items not supported for partial matching"); - case PCRE_ERROR_INTERNAL: + case PCRE2_ERROR_INTERNAL: return _("internal error"); - case PCRE_ERROR_BADCOUNT: - /* negative ovecsize, this should not happen in GRegex */ - g_warning ("A negative ovecsize was passed to PCRE"); - break; - case PCRE_ERROR_DFA_UITEM: + case PCRE2_ERROR_DFA_UITEM: return _("the pattern contains items not supported for partial matching"); - case PCRE_ERROR_DFA_UCOND: + case PCRE2_ERROR_DFA_UCOND: return _("back references as conditions are not supported for partial matching"); - case PCRE_ERROR_DFA_UMLIMIT: - /* the match_field field is not used in GRegex */ - break; - case PCRE_ERROR_DFA_WSSIZE: + case PCRE2_ERROR_DFA_WSSIZE: /* handled expanding the workspace */ break; - case PCRE_ERROR_DFA_RECURSE: - case PCRE_ERROR_RECURSIONLIMIT: + case PCRE2_ERROR_DFA_RECURSE: + case PCRE2_ERROR_RECURSIONLIMIT: return _("recursion limit reached"); - case PCRE_ERROR_BADNEWLINE: - return _("invalid combination of newline flags"); - case PCRE_ERROR_BADOFFSET: + case PCRE2_ERROR_BADOFFSET: return _("bad offset"); - case PCRE_ERROR_SHORTUTF8: - return _("short utf8"); - case PCRE_ERROR_RECURSELOOP: + case PCRE2_ERROR_RECURSELOOP: return _("recursion loop"); + case PCRE2_ERROR_JIT_BADOPTION: + /* should not happen in GRegex since we check modes before each match */ + return _("matching mode is requested that was not compiled for JIT"); default: break; } - return _("unknown error"); + return NULL; +} + +static char * +get_match_error_message (int errcode) +{ + const char *msg = translate_match_error (errcode); + char *error_string; + + if (msg) + return g_strdup (msg); + + error_string = get_pcre2_error_string (errcode); + + if (error_string) + return error_string; + + return g_strdup (_("unknown error")); } static void translate_compile_error (gint *errcode, const gchar **errmsg) { - /* Compile errors are created adding 100 to the error code returned - * by PCRE. - * If errcode is known we put the translatable error message in - * erromsg. If errcode is unknown we put the generic - * G_REGEX_ERROR_COMPILE error code in errcode and keep the - * untranslated error message returned by PCRE. + /* If errcode is known we put the translatable error message in + * errmsg. If errcode is unknown we put the generic + * G_REGEX_ERROR_COMPILE error code in errcode. * Note that there can be more PCRE errors with the same GRegexError * and that some PCRE errors are useless for us. */ - *errcode += 100; + gint original_errcode = *errcode; + + *errcode = -1; + *errmsg = NULL; - switch (*errcode) + switch (original_errcode) { - case G_REGEX_ERROR_STRAY_BACKSLASH: + case PCRE2_ERROR_END_BACKSLASH: + *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; *errmsg = _("\\ at end of pattern"); break; - case G_REGEX_ERROR_MISSING_CONTROL_CHAR: + case PCRE2_ERROR_END_BACKSLASH_C: + *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; *errmsg = _("\\c at end of pattern"); break; - case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: + case PCRE2_ERROR_UNKNOWN_ESCAPE: + case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: + *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; *errmsg = _("unrecognized character following \\"); break; - case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: + case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: + *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; *errmsg = _("numbers out of order in {} quantifier"); break; - case G_REGEX_ERROR_QUANTIFIER_TOO_BIG: + case PCRE2_ERROR_QUANTIFIER_TOO_BIG: + *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; *errmsg = _("number too big in {} quantifier"); break; - case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: + case PCRE2_ERROR_MISSING_SQUARE_BRACKET: + *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; *errmsg = _("missing terminating ] for character class"); break; - case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: + case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: + *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; *errmsg = _("invalid escape sequence in character class"); break; - case G_REGEX_ERROR_RANGE_OUT_OF_ORDER: + case PCRE2_ERROR_CLASS_RANGE_ORDER: + *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; *errmsg = _("range out of order in character class"); break; - case G_REGEX_ERROR_NOTHING_TO_REPEAT: + case PCRE2_ERROR_QUANTIFIER_INVALID: + case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: + *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; *errmsg = _("nothing to repeat"); break; - case 111: /* internal error: unexpected repeat */ - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("unexpected repeat"); - break; - case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: + case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: + *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; *errmsg = _("unrecognized character after (? or (?-"); break; - case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: + case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: + *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; *errmsg = _("POSIX named classes are supported only within a class"); break; - case G_REGEX_ERROR_UNMATCHED_PARENTHESIS: + case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: + *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; + *errmsg = _("POSIX collating elements are not supported"); + break; + case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: + case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: + case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: + *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; *errmsg = _("missing terminating )"); break; - case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: + case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: + *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; *errmsg = _("reference to non-existent subpattern"); break; - case G_REGEX_ERROR_UNTERMINATED_COMMENT: + case PCRE2_ERROR_MISSING_COMMENT_CLOSING: + *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; *errmsg = _("missing ) after comment"); break; - case G_REGEX_ERROR_EXPRESSION_TOO_LARGE: + case PCRE2_ERROR_PATTERN_TOO_LARGE: + *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; *errmsg = _("regular expression is too large"); break; - case G_REGEX_ERROR_MEMORY_ERROR: - *errmsg = _("failed to get memory"); - break; - case 122: /* unmatched parentheses */ - *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; - *errmsg = _(") without opening ("); - break; - case 123: /* internal error: code overflow */ - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("code overflow"); - break; - case 124: /* "unrecognized character after (?<\0 */ - *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; - *errmsg = _("unrecognized character after (?<"); + case PCRE2_ERROR_MISSING_CONDITION_CLOSING: + *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; + *errmsg = _("malformed number or name after (?("); break; - case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: + case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: + *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; *errmsg = _("lookbehind assertion is not fixed length"); break; - case G_REGEX_ERROR_MALFORMED_CONDITION: - *errmsg = _("malformed number or name after (?("); - break; - case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: + case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: + *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; *errmsg = _("conditional group contains more than two branches"); break; - case G_REGEX_ERROR_ASSERTION_EXPECTED: + case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: + *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; *errmsg = _("assertion expected after (?("); break; - case 129: - *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; - /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of) - * sequences here, '(?-54' would be an example for the second group. - */ - *errmsg = _("(?R or (?[+-]digits must be followed by )"); + case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: + *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; + *errmsg = _("a numbered reference must not be zero"); break; - case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: + case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: + *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; *errmsg = _("unknown POSIX class name"); break; - case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: - *errmsg = _("POSIX collating elements are not supported"); - break; - case G_REGEX_ERROR_HEX_CODE_TOO_LARGE: + case PCRE2_ERROR_CODE_POINT_TOO_BIG: + case PCRE2_ERROR_INVALID_HEXADECIMAL: + *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; *errmsg = _("character value in \\x{...} sequence is too large"); break; - case G_REGEX_ERROR_INVALID_CONDITION: - *errmsg = _("invalid condition (?(0)"); - break; - case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: + case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: + *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; *errmsg = _("\\C not allowed in lookbehind assertion"); break; - case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */ - /* A number of Perl escapes are not handled by PCRE. - * Therefore it explicitly raises ERR37. - */ - *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; - *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported"); - break; - case G_REGEX_ERROR_INFINITE_LOOP: - *errmsg = _("recursive call could loop indefinitely"); - break; - case 141: /* unrecognized character after (?P\0 */ - *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; - *errmsg = _("unrecognized character after (?P"); - break; - case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: + case PCRE2_ERROR_MISSING_NAME_TERMINATOR: + *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; *errmsg = _("missing terminator in subpattern name"); break; - case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: + case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: + *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; *errmsg = _("two named subpatterns have the same name"); break; - case G_REGEX_ERROR_MALFORMED_PROPERTY: + case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: + *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; *errmsg = _("malformed \\P or \\p sequence"); break; - case G_REGEX_ERROR_UNKNOWN_PROPERTY: + case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: + *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; *errmsg = _("unknown property name after \\P or \\p"); break; - case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: + case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: + *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; *errmsg = _("subpattern name is too long (maximum 32 characters)"); break; - case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: + case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: + *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; *errmsg = _("too many named subpatterns (maximum 10,000)"); break; - case G_REGEX_ERROR_INVALID_OCTAL_VALUE: + case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: + *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; *errmsg = _("octal value is greater than \\377"); break; - case 152: /* internal error: overran compiling workspace */ - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("overran compiling workspace"); - break; - case 153: /* internal error: previously-checked referenced subpattern not found */ - *errcode = G_REGEX_ERROR_INTERNAL; - *errmsg = _("previously-checked referenced subpattern not found"); - break; - case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: + case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: + *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; *errmsg = _("DEFINE group contains more than one branch"); break; - case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: + case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: + *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; *errmsg = _("inconsistent NEWLINE options"); break; - case G_REGEX_ERROR_MISSING_BACK_REFERENCE: + case PCRE2_ERROR_BACKSLASH_G_SYNTAX: + *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " "number, or by a plain number"); break; - case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE: - *errmsg = _("a numbered reference must not be zero"); - break; - case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN: + case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: + *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); break; - case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB: + case PCRE2_ERROR_VERB_UNKNOWN: + *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; *errmsg = _("(*VERB) not recognized"); break; - case G_REGEX_ERROR_NUMBER_TOO_BIG: + case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: + *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; *errmsg = _("number is too big"); break; - case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME: + case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: + *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; *errmsg = _("missing subpattern name after (?&"); break; - case G_REGEX_ERROR_MISSING_DIGIT: - *errmsg = _("digit expected after (?+"); - break; - case G_REGEX_ERROR_INVALID_DATA_CHARACTER: - *errmsg = _("] is an invalid data character in JavaScript compatibility mode"); - break; - case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME: + case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: + *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; *errmsg = _("different names for subpatterns of the same number are not allowed"); break; - case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED: + case PCRE2_ERROR_MARK_MISSING_ARGUMENT: + *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; *errmsg = _("(*MARK) must have an argument"); break; - case G_REGEX_ERROR_INVALID_CONTROL_CHAR: + case PCRE2_ERROR_BACKSLASH_C_SYNTAX: + *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; *errmsg = _( "\\c must be followed by an ASCII character"); break; - case G_REGEX_ERROR_MISSING_NAME: + case PCRE2_ERROR_BACKSLASH_K_SYNTAX: + *errcode = G_REGEX_ERROR_MISSING_NAME; *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); break; - case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS: + case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: + *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; *errmsg = _("\\N is not supported in a class"); break; - case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES: - *errmsg = _("too many forward references"); - break; - case G_REGEX_ERROR_NAME_TOO_LONG: + case PCRE2_ERROR_VERB_NAME_TOO_LONG: + *errcode = G_REGEX_ERROR_NAME_TOO_LONG; *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); break; - case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE: - *errmsg = _("character value in \\u.... sequence is too large"); + case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("code overflow"); break; - - case 116: /* erroffset passed as NULL */ - /* This should not happen as we never pass a NULL erroffset */ - g_warning ("erroffset passed as NULL"); - *errcode = G_REGEX_ERROR_COMPILE; + case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: + *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; + *errmsg = _("unrecognized character after (?P"); break; - case 117: /* unknown option bit(s) set */ - /* This should not happen as we check options before passing them - * to pcre_compile2() */ - g_warning ("unknown option bit(s) set"); - *errcode = G_REGEX_ERROR_COMPILE; + case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("overran compiling workspace"); + break; + case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: + *errcode = G_REGEX_ERROR_INTERNAL; + *errmsg = _("previously-checked referenced subpattern not found"); break; - case 132: /* this version of PCRE is compiled without UTF support */ - case 144: /* invalid UTF-8 string */ - case 145: /* support for \\P, \\p, and \\X has not been compiled */ - case 167: /* this version of PCRE is not compiled with Unicode property support */ - case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */ - case 174: /* invalid UTF-16 string */ - /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE - * and we do not check if strings are valid */ - case 170: /* internal error: unknown opcode in find_fixedlength() */ + case PCRE2_ERROR_HEAP_FAILED: + case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: + case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: + case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: + case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: + case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: + case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: + case PCRE2_ERROR_INTERNAL_STUDY_ERROR: + case PCRE2_ERROR_UTF_IS_DISABLED: + case PCRE2_ERROR_UCP_IS_DISABLED: + case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: + case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: + case PCRE2_ERROR_INTERNAL_BAD_CODE: + case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: *errcode = G_REGEX_ERROR_INTERNAL; break; - + case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: + case PCRE2_ERROR_CLASS_INVALID_RANGE: + case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: + case PCRE2_ERROR_PARENTHESES_STACK_CHECK: + case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: + case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: + case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: + case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: + case PCRE2_ERROR_NULL_PATTERN: + case PCRE2_ERROR_BAD_OPTIONS: + case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: + case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: + case PCRE2_ERROR_INVALID_OCTAL: + case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: + case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: + case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: + case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: + case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: + case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: + case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: + case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: + case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: + case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: + case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: + case PCRE2_ERROR_BAD_LITERAL_OPTIONS: default: *errcode = G_REGEX_ERROR_COMPILE; + break; } + + g_assert (*errcode != -1); } /* GMatchInfo */ static GMatchInfo * -match_info_new (const GRegex *regex, - const gchar *string, - gint string_len, - gint start_position, - gint match_options, - gboolean is_dfa) +match_info_new (const GRegex *regex, + const gchar *string, + gint string_len, + gint start_position, + GRegexMatchFlags match_options, + gboolean is_dfa) { GMatchInfo *match_info; @@ -570,34 +819,137 @@ match_info_new (const GRegex *regex, match_info->regex = g_regex_ref ((GRegex *)regex); match_info->string = string; match_info->string_len = string_len; - match_info->matches = PCRE_ERROR_NOMATCH; + match_info->matches = PCRE2_ERROR_NOMATCH; match_info->pos = start_position; - match_info->match_opts = match_options; + match_info->match_opts = + get_pcre2_match_options (match_options, regex->orig_compile_opts); + + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, + &match_info->n_subpatterns); - pcre_fullinfo (regex->pcre_re, regex->extra, - PCRE_INFO_CAPTURECOUNT, &match_info->n_subpatterns); + match_info->match_context = pcre2_match_context_create (NULL); if (is_dfa) { /* These values should be enough for most cases, if they are not * enough g_regex_match_all_full() will expand them. */ - match_info->n_offsets = 24; match_info->n_workspace = 100; match_info->workspace = g_new (gint, match_info->n_workspace); } - else - { - match_info->n_offsets = (match_info->n_subpatterns + 1) * 3; - } + match_info->n_offsets = 2; match_info->offsets = g_new0 (gint, match_info->n_offsets); /* Set an invalid position for the previous match. */ match_info->offsets[0] = -1; match_info->offsets[1] = -1; + match_info->match_data = pcre2_match_data_create_from_pattern ( + match_info->regex->pcre_re, + NULL); + return match_info; } +static gboolean +recalc_match_offsets (GMatchInfo *match_info, + GError **error) +{ + PCRE2_SIZE *ovector; + uint32_t ovector_size = 0; + uint32_t pre_n_offset; + uint32_t i; + + g_assert (!IS_PCRE2_ERROR (match_info->matches)); + + if (match_info->matches == PCRE2_ERROR_PARTIAL) + ovector_size = 1; + else if (match_info->matches > 0) + ovector_size = match_info->matches; + + g_assert (ovector_size != 0); + + if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) + { + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, + _("Error while matching regular expression %s: %s"), + match_info->regex->pattern, _("code overflow")); + return FALSE; + } + + pre_n_offset = match_info->n_offsets; + match_info->n_offsets = ovector_size * 2; + ovector = pcre2_get_ovector_pointer (match_info->match_data); + + if (match_info->n_offsets != pre_n_offset) + { + match_info->offsets = g_realloc_n (match_info->offsets, + match_info->n_offsets, + sizeof (gint)); + } + + for (i = 0; i < match_info->n_offsets; i++) + { + match_info->offsets[i] = (int) ovector[i]; + } + + return TRUE; +} + +static JITStatus +enable_jit_with_match_options (GRegex *regex, + uint32_t match_options) +{ + gint retval; + uint32_t old_jit_options, new_jit_options; + + if (!(regex->orig_compile_opts & G_REGEX_OPTIMIZE)) + return JIT_STATUS_DISABLED; + + if (regex->jit_status == JIT_STATUS_DISABLED) + return JIT_STATUS_DISABLED; + + if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) + return JIT_STATUS_DISABLED; + + old_jit_options = regex->jit_options; + new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; + if (match_options & PCRE2_PARTIAL_HARD) + new_jit_options |= PCRE2_JIT_PARTIAL_HARD; + if (match_options & PCRE2_PARTIAL_SOFT) + new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; + + /* no new options enabled */ + if (new_jit_options == old_jit_options) + return regex->jit_status; + + retval = pcre2_jit_compile (regex->pcre_re, new_jit_options); + switch (retval) + { + case 0: /* JIT enabled successfully */ + regex->jit_options = new_jit_options; + return JIT_STATUS_ENABLED; + case PCRE2_ERROR_NOMEMORY: + g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " + "but JIT was unable to allocate executable memory for the " + "compiler. Falling back to interpretive code."); + return JIT_STATUS_DISABLED; + case PCRE2_ERROR_JIT_BADOPTION: + g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " + "but JIT support is not available. Falling back to " + "interpretive code."); + return JIT_STATUS_DISABLED; + break; + default: + g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " + "but request for JIT support had unexpectedly failed (error %d). " + "Falling back to interpretive code.", retval); + return JIT_STATUS_DISABLED; + break; + } + + g_assert_not_reached (); +} + /** * g_match_info_get_regex: * @match_info: a #GMatchInfo @@ -606,7 +958,7 @@ match_info_new (const GRegex *regex, * and must not be freed. Use g_regex_ref() if you need to keep it * after you free @match_info object. * - * Returns: #GRegex object used in @match_info + * Returns: (transfer none): #GRegex object used in @match_info * * Since: 2.14 */ @@ -669,6 +1021,10 @@ g_match_info_unref (GMatchInfo *match_info) if (g_atomic_int_dec_and_test (&match_info->ref_count)) { g_regex_unref (match_info->regex); + if (match_info->match_context) + pcre2_match_context_free (match_info->match_context); + if (match_info->match_data) + pcre2_match_data_free (match_info->match_data); g_free (match_info->offsets); g_free (match_info->workspace); g_free (match_info); @@ -713,8 +1069,10 @@ gboolean g_match_info_next (GMatchInfo *match_info, GError **error) { + JITStatus jit_status; gint prev_match_start; gint prev_match_end; + uint32_t opts; g_return_val_if_fail (match_info != NULL, FALSE); g_return_val_if_fail (error == NULL || *error == NULL, FALSE); @@ -727,25 +1085,66 @@ g_match_info_next (GMatchInfo *match_info, { /* we have reached the end of the string */ match_info->pos = -1; - match_info->matches = PCRE_ERROR_NOMATCH; + match_info->matches = PCRE2_ERROR_NOMATCH; return FALSE; } - match_info->matches = pcre_exec (match_info->regex->pcre_re, - match_info->regex->extra, - match_info->string, - match_info->string_len, - match_info->pos, - match_info->regex->match_opts | match_info->match_opts, - match_info->offsets, - match_info->n_offsets); - if (IS_PCRE_ERROR (match_info->matches)) + opts = match_info->regex->match_opts | match_info->match_opts; + + jit_status = enable_jit_with_match_options (match_info->regex, opts); + if (jit_status == JIT_STATUS_ENABLED) { + match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, + (PCRE2_SPTR8) match_info->string, + match_info->string_len, + match_info->pos, + opts, + match_info->match_data, + match_info->match_context); + } + else + { + match_info->matches = pcre2_match (match_info->regex->pcre_re, + (PCRE2_SPTR8) match_info->string, + match_info->string_len, + match_info->pos, + opts, + match_info->match_data, + match_info->match_context); + } + + if (IS_PCRE2_ERROR (match_info->matches)) + { + gchar *error_msg = get_match_error_message (match_info->matches); + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, _("Error while matching regular expression %s: %s"), - match_info->regex->pattern, match_error (match_info->matches)); + match_info->regex->pattern, error_msg); + g_clear_pointer (&error_msg, g_free); + return FALSE; + } + else if (match_info->matches == 0) + { + /* info->offsets is too small. */ + match_info->n_offsets *= 2; + match_info->offsets = g_realloc_n (match_info->offsets, + match_info->n_offsets, + sizeof (gint)); + + pcre2_match_data_free (match_info->match_data); + match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); + + return g_match_info_next (match_info, error); + } + else if (match_info->matches == PCRE2_ERROR_NOMATCH) + { + /* We're done with this match info */ + match_info->pos = -1; return FALSE; } + else + if (!recalc_match_offsets (match_info, error)) + return FALSE; /* avoid infinite loops if the pattern is an empty string or something * equivalent */ @@ -755,7 +1154,7 @@ g_match_info_next (GMatchInfo *match_info, { /* we have reached the end of the string */ match_info->pos = -1; - match_info->matches = PCRE_ERROR_NOMATCH; + match_info->matches = PCRE2_ERROR_NOMATCH; return FALSE; } @@ -768,7 +1167,8 @@ g_match_info_next (GMatchInfo *match_info, match_info->pos = match_info->offsets[1]; } - g_assert (match_info->matches <= match_info->n_subpatterns + 1); + g_assert (match_info->matches < 0 || + (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); /* it's possible to get two identical matches when we are matching * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and @@ -833,10 +1233,10 @@ g_match_info_get_match_count (const GMatchInfo *match_info) { g_return_val_if_fail (match_info, -1); - if (match_info->matches == PCRE_ERROR_NOMATCH) + if (match_info->matches == PCRE2_ERROR_NOMATCH) /* no match */ return 0; - else if (match_info->matches < PCRE_ERROR_NOMATCH) + else if (match_info->matches < PCRE2_ERROR_NOMATCH) /* error */ return -1; else @@ -891,7 +1291,7 @@ g_match_info_is_partial_match (const GMatchInfo *match_info) { g_return_val_if_fail (match_info != NULL, FALSE); - return match_info->matches == PCRE_ERROR_PARTIAL; + return match_info->matches == PCRE2_ERROR_PARTIAL; } /** @@ -988,8 +1388,6 @@ gchar * g_match_info_fetch (const GMatchInfo *match_info, gint match_num) { - /* we cannot use pcre_get_substring() because it allocates the - * string using pcre_malloc(). */ gchar *match = NULL; gint start, end; @@ -1053,7 +1451,7 @@ g_match_info_fetch_pos (const GMatchInfo *match_info, /* make sure the sub expression number they're requesting is less than * the total number of sub expressions in the regex. When matching all * (g_regex_match_all()), also compare against the number of matches */ - if (match_num >= MAX (match_info->n_subpatterns + 1, match_info->matches)) + if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) return FALSE; if (start_pos != NULL) @@ -1069,32 +1467,33 @@ g_match_info_fetch_pos (const GMatchInfo *match_info, * Returns number of first matched subpattern with name @name. * There may be more than one in case when DUPNAMES is used, * and not all subpatterns with that name match; - * pcre_get_stringnumber() does not work in that case. + * pcre2_substring_number_from_name() does not work in that case. */ static gint get_matched_substring_number (const GMatchInfo *match_info, const gchar *name) { gint entrysize; - gchar *first, *last; + PCRE2_SPTR first, last; guchar *entry; - if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES)) - return pcre_get_stringnumber (match_info->regex->pcre_re, name); + if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) + return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); - /* This code is copied from pcre_get.c: get_first_set() */ - entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re, - name, - &first, - &last); + /* This code is analogous to code from pcre2_substring.c: + * pcre2_substring_get_byname() */ + entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, + (PCRE2_SPTR8) name, + &first, + &last); if (entrysize <= 0) return entrysize; for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) { - gint n = (entry[0] << 8) + entry[1]; - if (match_info->offsets[n*2] >= 0) + guint n = (entry[0] << 8) + entry[1]; + if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0) return n; } @@ -1124,8 +1523,6 @@ gchar * g_match_info_fetch_named (const GMatchInfo *match_info, const gchar *name) { - /* we cannot use pcre_get_named_substring() because it allocates the - * string using pcre_malloc(). */ gint num; g_return_val_if_fail (match_info != NULL, NULL); @@ -1207,8 +1604,6 @@ g_match_info_fetch_named_pos (const GMatchInfo *match_info, gchar ** g_match_info_fetch_all (const GMatchInfo *match_info) { - /* we cannot use pcre_get_substring_list() because the returned value - * isn't suitable for g_strfreev(). */ gchar **result; gint i; @@ -1266,21 +1661,19 @@ g_regex_unref (GRegex *regex) { g_free (regex->pattern); if (regex->pcre_re != NULL) - pcre_free (regex->pcre_re); - if (regex->extra != NULL) - pcre_free (regex->extra); + pcre2_code_free (regex->pcre_re); g_free (regex); } } -/* - * @match_options: (inout) (optional): - */ -static pcre *regex_compile (const gchar *pattern, - GRegexCompileFlags compile_options, - GRegexCompileFlags *compile_options_out, - GRegexMatchFlags *match_options, - GError **error); +static pcre2_code * regex_compile (const gchar *pattern, + uint32_t compile_options, + uint32_t newline_options, + uint32_t bsr_options, + GError **error); + +static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, + uint32_t compile_options); /** * g_regex_new: @@ -1304,175 +1697,191 @@ g_regex_new (const gchar *pattern, GError **error) { GRegex *regex; - pcre *re; - const gchar *errmsg; - gboolean optimize = FALSE; + pcre2_code *re; static gsize initialised = 0; + uint32_t pcre_compile_options; + uint32_t pcre_match_options; + uint32_t newline_options; + uint32_t bsr_options; g_return_val_if_fail (pattern != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); - g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL); +G_GNUC_BEGIN_IGNORE_DEPRECATIONS + g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | + G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); +G_GNUC_END_IGNORE_DEPRECATIONS g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); if (g_once_init_enter (&initialised)) { - int supports_utf8, supports_ucp; + int supports_utf8; - pcre_config (PCRE_CONFIG_UTF8, &supports_utf8); + pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); if (!supports_utf8) g_critical (_("PCRE library is compiled without UTF8 support")); - pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp); - if (!supports_ucp) - g_critical (_("PCRE library is compiled without UTF8 properties support")); - - g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2); + g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); } - if (G_UNLIKELY (initialised != 1)) + if (G_UNLIKELY (initialised != 1)) { g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, _("PCRE library is compiled with incompatible options")); return NULL; } - /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK, - * as we do not need to wrap PCRE_NO_UTF8_CHECK. */ - if (compile_options & G_REGEX_OPTIMIZE) - optimize = TRUE; + pcre_compile_options = get_pcre2_compile_options (compile_options); + pcre_match_options = get_pcre2_match_options (match_options, compile_options); - re = regex_compile (pattern, compile_options, &compile_options, - &match_options, error); + newline_options = get_pcre2_newline_match_options (match_options); + if (newline_options == 0) + newline_options = get_pcre2_newline_compile_options (compile_options); + if (newline_options == 0) + { + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, + "Invalid newline flags"); + return NULL; + } + + bsr_options = get_pcre2_bsr_match_options (match_options); + if (!bsr_options) + bsr_options = get_pcre2_bsr_compile_options (compile_options); + + re = regex_compile (pattern, pcre_compile_options, + newline_options, bsr_options, error); if (re == NULL) return NULL; + pcre_compile_options |= + get_pcre2_inline_compile_options (re, pcre_compile_options); + regex = g_new0 (GRegex, 1); regex->ref_count = 1; regex->pattern = g_strdup (pattern); regex->pcre_re = re; - regex->compile_opts = compile_options; - regex->match_opts = match_options; - - if (optimize) - { - regex->extra = pcre_study (regex->pcre_re, 0, &errmsg); - if (errmsg != NULL) - { - GError *tmp_error = g_error_new (G_REGEX_ERROR, - G_REGEX_ERROR_OPTIMIZE, - _("Error while optimizing " - "regular expression %s: %s"), - regex->pattern, - errmsg); - g_propagate_error (error, tmp_error); - - g_regex_unref (regex); - return NULL; - } - } + regex->compile_opts = pcre_compile_options; + regex->orig_compile_opts = compile_options; + regex->match_opts = pcre_match_options; + regex->orig_match_opts = match_options; + regex->jit_status = enable_jit_with_match_options (regex, regex->match_opts); return regex; } -static pcre * -regex_compile (const gchar *pattern, - GRegexCompileFlags compile_options, - GRegexCompileFlags *compile_options_out, - GRegexMatchFlags *match_options, - GError **error) +static pcre2_code * +regex_compile (const gchar *pattern, + uint32_t compile_options, + uint32_t newline_options, + uint32_t bsr_options, + GError **error) { - pcre *re; + pcre2_code *re; + pcre2_compile_context *context; const gchar *errmsg; - gint erroffset; + PCRE2_SIZE erroffset; gint errcode; - GRegexCompileFlags nonpcre_compile_options; - unsigned long int pcre_compile_options; - nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; + context = pcre2_compile_context_create (NULL); - /* In GRegex the string are, by default, UTF-8 encoded. PCRE - * instead uses UTF-8 only if required with PCRE_UTF8. */ - if (compile_options & G_REGEX_RAW) - { - /* disable utf-8 */ - compile_options &= ~G_REGEX_RAW; - } - else + /* set newline options */ + if (pcre2_set_newline (context, newline_options) != 0) { - /* enable utf-8 */ - compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK; - - if (match_options != NULL) - *match_options |= PCRE_NO_UTF8_CHECK; + g_set_error (error, G_REGEX_ERROR, + G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, + "Invalid newline flags"); + pcre2_compile_context_free (context); + return NULL; } - /* PCRE_NEWLINE_ANY is the default for the internal PCRE but - * not for the system one. */ - if (!(compile_options & G_REGEX_NEWLINE_CR) && - !(compile_options & G_REGEX_NEWLINE_LF)) + /* set bsr options */ + if (pcre2_set_bsr (context, bsr_options) != 0) { - compile_options |= PCRE_NEWLINE_ANY; + g_set_error (error, G_REGEX_ERROR, + G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, + "Invalid BSR flags"); + pcre2_compile_context_free (context); + return NULL; } - compile_options |= PCRE_UCP; + /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ + if (compile_options & PCRE2_UTF) + compile_options |= PCRE2_NO_UTF_CHECK; - /* PCRE_BSR_UNICODE is the default for the internal PCRE but - * possibly not for the system one. - */ - if (~compile_options & G_REGEX_BSR_ANYCRLF) - compile_options |= PCRE_BSR_UNICODE; + compile_options |= PCRE2_UCP; /* compile the pattern */ - re = pcre_compile2 (pattern, compile_options, &errcode, - &errmsg, &erroffset, NULL); + re = pcre2_compile ((PCRE2_SPTR8) pattern, + PCRE2_ZERO_TERMINATED, + compile_options, + &errcode, + &erroffset, + context); + pcre2_compile_context_free (context); /* if the compilation failed, set the error member and return * immediately */ if (re == NULL) { GError *tmp_error; + gchar *offset_str; + gchar *pcre2_errmsg = NULL; + int original_errcode; /* Translate the PCRE error code to GRegexError and use a translated * error message if possible */ + original_errcode = errcode; translate_compile_error (&errcode, &errmsg); + if (!errmsg) + { + errmsg = _("unknown error"); + pcre2_errmsg = get_pcre2_error_string (original_errcode); + } + /* PCRE uses byte offsets but we want to show character offsets */ erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); + offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); tmp_error = g_error_new (G_REGEX_ERROR, errcode, - _("Error while compiling regular " - "expression %s at char %d: %s"), - pattern, erroffset, errmsg); + _("Error while compiling regular expression ‘%s’ " + "at char %s: %s"), + pattern, offset_str, + pcre2_errmsg ? pcre2_errmsg : errmsg); g_propagate_error (error, tmp_error); + g_free (offset_str); + g_clear_pointer (&pcre2_errmsg, g_free); return NULL; } - /* For options set at the beginning of the pattern, pcre puts them into - * compile options, e.g. "(?i)foo" will make the pcre structure store - * PCRE_CASELESS even though it wasn't explicitly given for compilation. */ - pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options); - compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK; + return re; +} - /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */ - if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF) - compile_options &= ~PCRE_NEWLINE_ANY; +static uint32_t +get_pcre2_inline_compile_options (pcre2_code *re, + uint32_t compile_options) +{ + uint32_t pcre_compile_options; + uint32_t nonpcre_compile_options; + /* For options set at the beginning of the pattern, pcre puts them into + * compile options, e.g. "(?i)foo" will make the pcre structure store + * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ + nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; + pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); + compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; compile_options |= nonpcre_compile_options; - if (!(compile_options & G_REGEX_DUPNAMES)) + if (!(compile_options & PCRE2_DUPNAMES)) { - gboolean jchanged = FALSE; - pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged); + uint32_t jchanged = 0; + pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); if (jchanged) - compile_options |= G_REGEX_DUPNAMES; + compile_options |= PCRE2_DUPNAMES; } - if (compile_options_out != 0) - *compile_options_out = compile_options; - - return re; + return compile_options; } /** @@ -1509,10 +1918,9 @@ g_regex_get_pattern (const GRegex *regex) gint g_regex_get_max_backref (const GRegex *regex) { - gint value; + uint32_t value; - pcre_fullinfo (regex->pcre_re, regex->extra, - PCRE_INFO_BACKREFMAX, &value); + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); return value; } @@ -1530,10 +1938,9 @@ g_regex_get_max_backref (const GRegex *regex) gint g_regex_get_capture_count (const GRegex *regex) { - gint value; + uint32_t value; - pcre_fullinfo (regex->pcre_re, regex->extra, - PCRE_INFO_CAPTURECOUNT, &value); + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); return value; } @@ -1551,10 +1958,9 @@ g_regex_get_capture_count (const GRegex *regex) gboolean g_regex_get_has_cr_or_lf (const GRegex *regex) { - gint value; + uint32_t value; - pcre_fullinfo (regex->pcre_re, regex->extra, - PCRE_INFO_HASCRORLF, &value); + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); return !!value; } @@ -1574,10 +1980,10 @@ g_regex_get_has_cr_or_lf (const GRegex *regex) gint g_regex_get_max_lookbehind (const GRegex *regex) { - gint max_lookbehind; + uint32_t max_lookbehind; - pcre_fullinfo (regex->pcre_re, regex->extra, - PCRE_INFO_MAXLOOKBEHIND, &max_lookbehind); + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, + &max_lookbehind); return max_lookbehind; } @@ -1599,9 +2005,46 @@ g_regex_get_max_lookbehind (const GRegex *regex) GRegexCompileFlags g_regex_get_compile_flags (const GRegex *regex) { + GRegexCompileFlags extra_flags; + uint32_t info_value; + g_return_val_if_fail (regex != NULL, 0); - return regex->compile_opts; + /* Preserve original G_REGEX_OPTIMIZE */ + extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); + + /* Also include the newline options */ + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); + switch (info_value) + { + case PCRE2_NEWLINE_ANYCRLF: + extra_flags |= G_REGEX_NEWLINE_ANYCRLF; + break; + case PCRE2_NEWLINE_CRLF: + extra_flags |= G_REGEX_NEWLINE_CRLF; + break; + case PCRE2_NEWLINE_LF: + extra_flags |= G_REGEX_NEWLINE_LF; + break; + case PCRE2_NEWLINE_CR: + extra_flags |= G_REGEX_NEWLINE_CR; + break; + default: + break; + } + + /* Also include the bsr options */ + pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); + switch (info_value) + { + case PCRE2_BSR_ANYCRLF: + extra_flags |= G_REGEX_BSR_ANYCRLF; + break; + default: + break; + } + + return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; } /** @@ -1617,9 +2060,15 @@ g_regex_get_compile_flags (const GRegex *regex) GRegexMatchFlags g_regex_get_match_flags (const GRegex *regex) { + uint32_t flags; + g_return_val_if_fail (regex != NULL, 0); - return regex->match_opts & G_REGEX_MATCH_MASK; + flags = g_regex_match_flags_from_pcre2 (regex->match_opts); + flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); + flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); + + return flags; } /** @@ -1653,7 +2102,7 @@ g_regex_match_simple (const gchar *pattern, GRegex *regex; gboolean result; - regex = g_regex_new (pattern, compile_options, 0, NULL); + regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); if (!regex) return FALSE; result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); @@ -1692,7 +2141,7 @@ g_regex_match_simple (const gchar *pattern, * GRegex *regex; * GMatchInfo *match_info; * - * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); + * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * g_regex_match (regex, string, 0, &match_info); * while (g_match_info_matches (match_info)) * { @@ -1768,7 +2217,7 @@ g_regex_match (const GRegex *regex, * GMatchInfo *match_info; * GError *error = NULL; * - * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); + * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); * while (g_match_info_matches (match_info)) * { @@ -1922,9 +2371,10 @@ g_regex_match_all_full (const GRegex *regex, { GMatchInfo *info; gboolean done; - pcre *pcre_re; - pcre_extra *extra; + pcre2_code *pcre_re; gboolean retval; + uint32_t newline_options; + uint32_t bsr_options; g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (string != NULL, FALSE); @@ -1932,29 +2382,26 @@ g_regex_match_all_full (const GRegex *regex, g_return_val_if_fail (error == NULL || *error == NULL, FALSE); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); -#ifdef PCRE_NO_AUTO_POSSESS - /* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which - * is an optimization for normal regex matching, but results in omitting - * some shorter matches here, and an observable behaviour change. + newline_options = get_pcre2_newline_match_options (match_options); + if (!newline_options) + newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); + + bsr_options = get_pcre2_bsr_match_options (match_options); + if (!bsr_options) + bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); + + /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an + * optimization for normal regex matching, but results in omitting some + * shorter matches here, and an observable behaviour change. * * DFA matching is rather niche, and very rarely used according to * codesearch.debian.net, so don't bother caching the recompiled RE. */ pcre_re = regex_compile (regex->pattern, - regex->compile_opts | PCRE_NO_AUTO_POSSESS, - NULL, NULL, error); - + regex->compile_opts | PCRE2_NO_AUTO_POSSESS, + newline_options, bsr_options, error); if (pcre_re == NULL) return FALSE; - /* Not bothering to cache the optimization data either, with similar - * reasoning */ - extra = NULL; -#else - /* For PCRE < 8.33 the precompiled regex is fine. */ - pcre_re = regex->pcre_re; - extra = regex->extra; -#endif - info = match_info_new (regex, string, string_len, start_position, match_options, TRUE); @@ -1962,39 +2409,50 @@ g_regex_match_all_full (const GRegex *regex, while (!done) { done = TRUE; - info->matches = pcre_dfa_exec (pcre_re, extra, - info->string, info->string_len, - info->pos, - regex->match_opts | match_options, - info->offsets, info->n_offsets, - info->workspace, info->n_workspace); - if (info->matches == PCRE_ERROR_DFA_WSSIZE) + info->matches = pcre2_dfa_match (pcre_re, + (PCRE2_SPTR8) info->string, info->string_len, + info->pos, + (regex->match_opts | info->match_opts), + info->match_data, + info->match_context, + info->workspace, info->n_workspace); + if (info->matches == PCRE2_ERROR_DFA_WSSIZE) { /* info->workspace is too small. */ info->n_workspace *= 2; - info->workspace = g_realloc (info->workspace, - info->n_workspace * sizeof (gint)); + info->workspace = g_realloc_n (info->workspace, + info->n_workspace, + sizeof (gint)); done = FALSE; } else if (info->matches == 0) { /* info->offsets is too small. */ info->n_offsets *= 2; - info->offsets = g_realloc (info->offsets, - info->n_offsets * sizeof (gint)); + info->offsets = g_realloc_n (info->offsets, + info->n_offsets, + sizeof (gint)); + pcre2_match_data_free (info->match_data); + info->match_data = pcre2_match_data_create (info->n_offsets, NULL); done = FALSE; } - else if (IS_PCRE_ERROR (info->matches)) + else if (IS_PCRE2_ERROR (info->matches)) { + gchar *error_msg = get_match_error_message (info->matches); + g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, _("Error while matching regular expression %s: %s"), - regex->pattern, match_error (info->matches)); + regex->pattern, error_msg); + g_clear_pointer (&error_msg, g_free); + } + else if (info->matches != PCRE2_ERROR_NOMATCH) + { + if (!recalc_match_offsets (info, error)) + info->matches = PCRE2_ERROR_NOMATCH; } } -#ifdef PCRE_NO_AUTO_POSSESS - pcre_free (pcre_re); -#endif + pcre2_code_free (pcre_re); /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only * holds true for a single match, rather than matching all */ @@ -2032,8 +2490,8 @@ g_regex_get_string_number (const GRegex *regex, g_return_val_if_fail (regex != NULL, -1); g_return_val_if_fail (name != NULL, -1); - num = pcre_get_stringnumber (regex->pcre_re, name); - if (num == PCRE_ERROR_NOSUBSTRING) + num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); + if (num == PCRE2_ERROR_NOSUBSTRING) num = -1; return num; @@ -2949,7 +3407,7 @@ g_regex_replace_literal (const GRegex *regex, * g_hash_table_insert (h, "3", "THREE"); * g_hash_table_insert (h, "4", "FOUR"); * - * reg = g_regex_new ("1|2|3|4", 0, 0, NULL); + * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); * g_hash_table_destroy (h); * @@ -3126,7 +3584,7 @@ g_regex_escape_nul (const gchar *string, /** * g_regex_escape_string: - * @string: (array length=length): the string to escape + * @string: the string to escape * @length: the length of @string, in bytes, or -1 if @string is nul-terminated * * Escapes the special characters used for regular expressions |