summary | refs | log | tree | commit | diff
path: root/pcre
diff options
context:
space:
mode:
author Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
committer Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
commit 5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032 (patch)
tree 0e88f2fe808ebab43f6f6c1c4bd314693525455c /pcre
parent d25d7ec589cb83acd00ae2c7251dd851ff3cc1a7 (diff)
download mariadb-git-5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032.tar.gz
MDEV-5357 REGEXP word boundaries don't work
Applied a patch from Philip Hazel that implements the non-standard word-boundary syntax in PCRE, for compatibility with Henry Spencer's old regex library.
Diffstat (limited to 'pcre')
-rw-r--r-- pcre/pcre_compile.c 35
-rw-r--r-- pcre/pcre_internal.h 2
2 files changed, 37 insertions, 0 deletions
diff --git a/pcre/pcre_compile.c b/pcre/pcre_compile.c
index 0ebb3f168f1..a307372fbfe 100644
--- a/pcre/pcre_compile.c
+++ b/pcre/pcre_compile.c
@@ -253,6 +253,19 @@ static const verbitem verbs[] = {
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
+/* Pattern text substituted for [[:<:]] and [[:>:]] (start and end of word in
+Henry Spencer's regex library): \b(?=\w) and \b(?<=\w), zero-terminated. */
+
+static const pcre_uchar sub_start_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
+
+static const pcre_uchar sub_end_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
+ CHAR_RIGHT_PARENTHESIS, '\0' };
+
+
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
@@ -4036,8 +4049,30 @@ for (;; ptr++)
goto FAILED;
}
goto NORMAL_CHAR;
+
+ /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
+ used for "start of word" and "end of word". As these are otherwise illegal
+ sequences, we don't break anything by recognizing them. They are replaced
+ by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
+ erroneous and are handled by the normal code below. */
case CHAR_LEFT_SQUARE_BRACKET:
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_start_of_word - 1;
+ continue;
+ }
+
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_end_of_word - 1;
+ continue;
+ }
+
+ /* Handle a real character class. */
+
previous = code;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
diff --git a/pcre/pcre_internal.h b/pcre/pcre_internal.h
index 307069ca9d6..cd6ef3ed83e 100644
--- a/pcre/pcre_internal.h
+++ b/pcre/pcre_internal.h
@@ -1794,6 +1794,8 @@ only. */
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
+#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
+#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS