MDEV-5357 REGEXP word boundaries don't work

Applied a patch from Philip Hazel that implements the non-standard [[:<:]] and [[:>:]] word-boundary syntax in PCRE, for compatibility with Henry Spencer's old regex library.
author: Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
committer: Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
commit: 5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032 (patch)
tree: 0e88f2fe808ebab43f6f6c1c4bd314693525455c /pcre
parent: d25d7ec589cb83acd00ae2c7251dd851ff3cc1a7 (diff)
download: mariadb-git-5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032.tar.gz
2 files changed, 37 insertions, 0 deletions
diff --git a/pcre/pcre_compile.c b/pcre/pcre_compile.c
index 0ebb3f168f1..a307372fbfe 100644
--- a/pcre/pcre_compile.c
+++ b/pcre/pcre_compile.c
@@ -253,6 +253,19 @@ static const verbitem verbs[] = {
 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
 
 
+/* Substitute patterns for [[:<:]] and [[:>:]], which mean start of word and
+end of word in another regex library: \b(?=\w) and \b(?<=\w) respectively. */
+
+static const pcre_uchar sub_start_of_word[] = {
+  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; 
+
+static const pcre_uchar sub_end_of_word[] = {
+  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
+  CHAR_RIGHT_PARENTHESIS, '\0' }; 
+
+
 /* Tables of names of POSIX character classes and their lengths. The names are
 now all in a single string, to reduce the number of relocations when a shared
 library is dynamically loaded. The list of lengths is terminated by a zero
@@ -4036,8 +4049,30 @@ for (;; ptr++)
       goto FAILED;
       }
     goto NORMAL_CHAR;
+    
+    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is 
+    used for "start of word" and "end of word". As these are otherwise illegal
+    sequences, we don't break anything by recognizing them. They are replaced
+    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
+    erroneous and are handled by the normal code below. */
 
     case CHAR_LEFT_SQUARE_BRACKET:
+    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
+      {
+      nestptr = ptr + 7;
+      ptr = sub_start_of_word - 1;
+      continue;  
+      }  
+
+    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
+      {
+      nestptr = ptr + 7;
+      ptr = sub_end_of_word - 1;
+      continue;  
+      }  
+
+    /* Handle a real character class. */
+ 
     previous = code;
 
     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
diff --git a/pcre/pcre_internal.h b/pcre/pcre_internal.h
index 307069ca9d6..cd6ef3ed83e 100644
--- a/pcre/pcre_internal.h
+++ b/pcre/pcre_internal.h
@@ -1794,6 +1794,8 @@ only. */
 #define STRING_xdigit               STR_x STR_d STR_i STR_g STR_i STR_t
 
 #define STRING_DEFINE               STR_D STR_E STR_F STR_I STR_N STR_E
+#define STRING_WEIRD_STARTWORD      STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
+#define STRING_WEIRD_ENDWORD        STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
 
 #define STRING_CR_RIGHTPAR             STR_C STR_R STR_RIGHT_PARENTHESIS
 #define STRING_LF_RIGHTPAR             STR_L STR_F STR_RIGHT_PARENTHESIS
author	Alexander Barkov <bar@mnogosearch.org>	2013-12-02 14:39:08 +0400
committer	Alexander Barkov <bar@mnogosearch.org>	2013-12-02 14:39:08 +0400
commit	5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032 (patch)
tree	0e88f2fe808ebab43f6f6c1c4bd314693525455c /pcre
parent	d25d7ec589cb83acd00ae2c7251dd851ff3cc1a7 (diff)
download	mariadb-git-5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032.tar.gz