diff options
author | Alexander Barkov <bar@mnogosearch.org> | 2013-12-02 14:39:08 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mnogosearch.org> | 2013-12-02 14:39:08 +0400 |
commit | 5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032 (patch) | |
tree | 0e88f2fe808ebab43f6f6c1c4bd314693525455c | |
parent | d25d7ec589cb83acd00ae2c7251dd851ff3cc1a7 (diff) | |
download | mariadb-git-5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032.tar.gz |
MDEV-5357 REGEXP word boundaries don't work
Applied a patch from Philip Hazel that implements the non-standard
[[:<:]] and [[:>:]] word-boundary syntax in PCRE, for compatibility
with Henry Spencer's old regex library.
-rw-r--r-- | mysql-test/include/ctype_regex_utf8.inc | 24 | ||||
-rw-r--r-- | mysql-test/include/ctype_utf8mb4.inc | 20 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8.result | 28 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4.result | 28 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4_heap.result | 28 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4_innodb.result | 28 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4_myisam.result | 28 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8.test | 20 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8mb4.test | 20 | ||||
-rw-r--r-- | pcre/pcre_compile.c | 35 | ||||
-rw-r--r-- | pcre/pcre_internal.h | 2 |
11 files changed, 192 insertions, 69 deletions
diff --git a/mysql-test/include/ctype_regex_utf8.inc b/mysql-test/include/ctype_regex_utf8.inc new file mode 100644 index 00000000000..d389cb214f7 --- /dev/null +++ b/mysql-test/include/ctype_regex_utf8.inc @@ -0,0 +1,24 @@ +# +# Bug #3928 regexp [[:>:]] and UTF-8 +# +SELECT @@character_set_client, @@collation_connection; + +# This should return TRUE +select 'вася' rlike '\\bвася\\b'; +select 'вася ' rlike '\\bвася\\b'; +select ' вася' rlike '\\bвася\\b'; +select ' вася ' rlike '\\bвася\\b'; + +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +select ' вася ' rlike '[[:<:]]вася[[:>:]]'; + +# This should return FALSE +select 'васяz' rlike '\\bвася\\b'; +select 'zвася' rlike '\\bвася\\b'; +select 'zвасяz' rlike '\\bвася\\b'; + +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; diff --git a/mysql-test/include/ctype_utf8mb4.inc b/mysql-test/include/ctype_utf8mb4.inc index af3a4564026..9ee2414e142 100644 --- a/mysql-test/include/ctype_utf8mb4.inc +++ b/mysql-test/include/ctype_utf8mb4.inc @@ -224,25 +224,9 @@ drop table t1; # # Testing regexp # -set collation_connection=utf8mb4_general_ci; ---source include/ctype_regex.inc -set names utf8mb4; - -# -# Bug #3928 regexp [[:>:]] and UTF-8 -# set names utf8mb4; - -# This should return TRUE -select 'вася' rlike '\\bвася\\b'; -select 'вася ' rlike '\\bвася\\b'; -select ' вася' rlike '\\bвася\\b'; -select ' вася ' rlike '\\bвася\\b'; - -# This should return FALSE -select 'васяz' rlike '\\bвася\\b'; -select 'zвася' rlike '\\bвася\\b'; -select 'zвасяz' rlike '\\bвася\\b'; +--source include/ctype_regex.inc +--source include/ctype_regex_utf8.inc # # Bug #4555 diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 7342dffa1e0..f98fe649f80 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -270,7 +270,7 @@ b 
select * from t1 where a = 'b' and a != 'b'; a drop table t1; -set collation_connection=utf8_general_ci; +set names utf8; drop table if exists t1; create table t1 as select repeat(' ', 64) as s1, repeat(' ',64) as s2 @@ -314,8 +314,9 @@ NULL NULL NULL drop table t1; -set names utf8; -set names utf8; +SELECT @@character_set_client, @@collation_connection; +@@character_set_client @@collation_connection +utf8 utf8_general_ci select 'вася' rlike '\\bвася\\b'; 'вася' rlike '\\bвася\\b' 1 @@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b'; select ' вася ' rlike '\\bвася\\b'; ' вася ' rlike '\\bвася\\b' 1 +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +'вася' rlike '[[:<:]]вася[[:>:]]' +1 +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +'вася ' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +' вася' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася ' rlike '[[:<:]]вася[[:>:]]'; +' вася ' rlike '[[:<:]]вася[[:>:]]' +1 select 'васяz' rlike '\\bвася\\b'; 'васяz' rlike '\\bвася\\b' 0 @@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b'; select 'zвасяz' rlike '\\bвася\\b'; 'zвасяz' rlike '\\bвася\\b' 0 +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +'васяz' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +'zвася' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; +'zвасяz' rlike '[[:<:]]вася[[:>:]]' +0 CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8_unicode_ci); ALTER TABLE t1 ADD COLUMN b CHAR(20); DROP TABLE t1; diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index e9608188e9f..4580d90c5bc 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -270,7 +270,7 @@ b select * from t1 where a = 'b' and a != 'b'; a drop table t1; -set collation_connection=utf8mb4_general_ci; +set names utf8mb4; drop table if exists t1; create table t1 as select repeat(' ', 64) as s1, repeat(' ',64) as s2 @@ -314,8 +314,9 @@ NULL NULL NULL drop table 
t1; -set names utf8mb4; -set names utf8mb4; +SELECT @@character_set_client, @@collation_connection; +@@character_set_client @@collation_connection +utf8mb4 utf8mb4_general_ci select 'вася' rlike '\\bвася\\b'; 'вася' rlike '\\bвася\\b' 1 @@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b'; select ' вася ' rlike '\\bвася\\b'; ' вася ' rlike '\\bвася\\b' 1 +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +'вася' rlike '[[:<:]]вася[[:>:]]' +1 +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +'вася ' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +' вася' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася ' rlike '[[:<:]]вася[[:>:]]'; +' вася ' rlike '[[:<:]]вася[[:>:]]' +1 select 'васяz' rlike '\\bвася\\b'; 'васяz' rlike '\\bвася\\b' 0 @@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b'; select 'zвасяz' rlike '\\bвася\\b'; 'zвасяz' rlike '\\bвася\\b' 0 +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +'васяz' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +'zвася' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; +'zвасяz' rlike '[[:<:]]вася[[:>:]]' +0 CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci); ALTER TABLE t1 ADD COLUMN b CHAR(20); DROP TABLE t1; diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result index 0ffe26b5f25..bcacdd4e16e 100644 --- a/mysql-test/r/ctype_utf8mb4_heap.result +++ b/mysql-test/r/ctype_utf8mb4_heap.result @@ -260,7 +260,7 @@ b select * from t1 where a = 'b' and a != 'b'; a drop table t1; -set collation_connection=utf8mb4_general_ci; +set names utf8mb4; drop table if exists t1; create table t1 as select repeat(' ', 64) as s1, repeat(' ',64) as s2 @@ -304,8 +304,9 @@ NULL NULL NULL drop table t1; -set names utf8mb4; -set names utf8mb4; +SELECT @@character_set_client, @@collation_connection; +@@character_set_client @@collation_connection +utf8mb4 utf8mb4_general_ci select 'вася' rlike '\\bвася\\b'; 'вася' rlike 
'\\bвася\\b' 1 @@ -318,6 +319,18 @@ select ' вася' rlike '\\bвася\\b'; select ' вася ' rlike '\\bвася\\b'; ' вася ' rlike '\\bвася\\b' 1 +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +'вася' rlike '[[:<:]]вася[[:>:]]' +1 +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +'вася ' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +' вася' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася ' rlike '[[:<:]]вася[[:>:]]'; +' вася ' rlike '[[:<:]]вася[[:>:]]' +1 select 'васяz' rlike '\\bвася\\b'; 'васяz' rlike '\\bвася\\b' 0 @@ -327,6 +340,15 @@ select 'zвася' rlike '\\bвася\\b'; select 'zвасяz' rlike '\\bвася\\b'; 'zвасяz' rlike '\\bвася\\b' 0 +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +'васяz' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +'zвася' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; +'zвасяz' rlike '[[:<:]]вася[[:>:]]' +0 CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE heap; ALTER TABLE t1 ADD COLUMN b CHAR(20); DROP TABLE t1; diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result index 3e1554cd0ae..2375ca3bb92 100644 --- a/mysql-test/r/ctype_utf8mb4_innodb.result +++ b/mysql-test/r/ctype_utf8mb4_innodb.result @@ -270,7 +270,7 @@ b select * from t1 where a = 'b' and a != 'b'; a drop table t1; -set collation_connection=utf8mb4_general_ci; +set names utf8mb4; drop table if exists t1; create table t1 as select repeat(' ', 64) as s1, repeat(' ',64) as s2 @@ -314,8 +314,9 @@ NULL NULL NULL drop table t1; -set names utf8mb4; -set names utf8mb4; +SELECT @@character_set_client, @@collation_connection; +@@character_set_client @@collation_connection +utf8mb4 utf8mb4_general_ci select 'вася' rlike '\\bвася\\b'; 'вася' rlike '\\bвася\\b' 1 @@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b'; select ' вася ' rlike '\\bвася\\b'; ' вася ' rlike '\\bвася\\b' 1 +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +'вася' rlike 
'[[:<:]]вася[[:>:]]' +1 +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +'вася ' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +' вася' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася ' rlike '[[:<:]]вася[[:>:]]'; +' вася ' rlike '[[:<:]]вася[[:>:]]' +1 select 'васяz' rlike '\\bвася\\b'; 'васяz' rlike '\\bвася\\b' 0 @@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b'; select 'zвасяz' rlike '\\bвася\\b'; 'zвасяz' rlike '\\bвася\\b' 0 +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +'васяz' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +'zвася' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; +'zвасяz' rlike '[[:<:]]вася[[:>:]]' +0 CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE InnoDB; ALTER TABLE t1 ADD COLUMN b CHAR(20); DROP TABLE t1; diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result index 4efcedf708f..b467f07ece9 100644 --- a/mysql-test/r/ctype_utf8mb4_myisam.result +++ b/mysql-test/r/ctype_utf8mb4_myisam.result @@ -270,7 +270,7 @@ b select * from t1 where a = 'b' and a != 'b'; a drop table t1; -set collation_connection=utf8mb4_general_ci; +set names utf8mb4; drop table if exists t1; create table t1 as select repeat(' ', 64) as s1, repeat(' ',64) as s2 @@ -314,8 +314,9 @@ NULL NULL NULL drop table t1; -set names utf8mb4; -set names utf8mb4; +SELECT @@character_set_client, @@collation_connection; +@@character_set_client @@collation_connection +utf8mb4 utf8mb4_general_ci select 'вася' rlike '\\bвася\\b'; 'вася' rlike '\\bвася\\b' 1 @@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b'; select ' вася ' rlike '\\bвася\\b'; ' вася ' rlike '\\bвася\\b' 1 +select 'вася' rlike '[[:<:]]вася[[:>:]]'; +'вася' rlike '[[:<:]]вася[[:>:]]' +1 +select 'вася ' rlike '[[:<:]]вася[[:>:]]'; +'вася ' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася' rlike '[[:<:]]вася[[:>:]]'; +' вася' rlike '[[:<:]]вася[[:>:]]' +1 +select ' вася ' 
rlike '[[:<:]]вася[[:>:]]'; +' вася ' rlike '[[:<:]]вася[[:>:]]' +1 select 'васяz' rlike '\\bвася\\b'; 'васяz' rlike '\\bвася\\b' 0 @@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b'; select 'zвасяz' rlike '\\bвася\\b'; 'zвасяz' rlike '\\bвася\\b' 0 +select 'васяz' rlike '[[:<:]]вася[[:>:]]'; +'васяz' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвася' rlike '[[:<:]]вася[[:>:]]'; +'zвася' rlike '[[:<:]]вася[[:>:]]' +0 +select 'zвасяz' rlike '[[:<:]]вася[[:>:]]'; +'zвасяz' rlike '[[:<:]]вася[[:>:]]' +0 CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE MyISAM; ALTER TABLE t1 ADD COLUMN b CHAR(20); DROP TABLE t1; diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 6f2222b8e45..468804130f4 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -199,25 +199,9 @@ drop table t1; # # Testing regexp # -set collation_connection=utf8_general_ci; ---source include/ctype_regex.inc -set names utf8; - -# -# Bug #3928 regexp [[:>:]] and UTF-8 -# set names utf8; - -# This should return TRUE -select 'вася' rlike '\\bвася\\b'; -select 'вася ' rlike '\\bвася\\b'; -select ' вася' rlike '\\bвася\\b'; -select ' вася ' rlike '\\bвася\\b'; - -# This should return FALSE -select 'васяz' rlike '\\bвася\\b'; -select 'zвася' rlike '\\bвася\\b'; -select 'zвасяz' rlike '\\bвася\\b'; +--source include/ctype_regex.inc +--source include/ctype_regex_utf8.inc # # Bug #4555 diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 934adb50cca..7a3c67bb417 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -197,25 +197,9 @@ drop table t1; # # Testing regexp # -set collation_connection=utf8mb4_general_ci; ---source include/ctype_regex.inc -set names utf8mb4; - -# -# Bug #3928 regexp [[:>:]] and UTF-8 -# set names utf8mb4; - -# This should return TRUE -select 'вася' rlike '\\bвася\\b'; -select 'вася ' rlike '\\bвася\\b'; -select ' вася' rlike '\\bвася\\b'; -select 
' вася ' rlike '\\bвася\\b'; - -# This should return FALSE -select 'васяz' rlike '\\bвася\\b'; -select 'zвася' rlike '\\bвася\\b'; -select 'zвасяz' rlike '\\bвася\\b'; +--source include/ctype_regex.inc +--source include/ctype_regex_utf8.inc # # Bug #4555 diff --git a/pcre/pcre_compile.c b/pcre/pcre_compile.c index 0ebb3f168f1..a307372fbfe 100644 --- a/pcre/pcre_compile.c +++ b/pcre/pcre_compile.c @@ -253,6 +253,19 @@ static const verbitem verbs[] = { static const int verbcount = sizeof(verbs)/sizeof(verbitem); +/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in +another regex library. */ + +static const pcre_uchar sub_start_of_word[] = { + CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, + CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; + +static const pcre_uchar sub_end_of_word[] = { + CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, + CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, + CHAR_RIGHT_PARENTHESIS, '\0' }; + + /* Tables of names of POSIX character classes and their lengths. The names are now all in a single string, to reduce the number of relocations when a shared library is dynamically loaded. The list of lengths is terminated by a zero @@ -4036,8 +4049,30 @@ for (;; ptr++) goto FAILED; } goto NORMAL_CHAR; + + /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is + used for "start of word" and "end of word". As these are otherwise illegal + sequences, we don't break anything by recognizing them. They are replaced + by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are + erroneous and are handled by the normal code below. 
*/ case CHAR_LEFT_SQUARE_BRACKET: + if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) + { + nestptr = ptr + 7; + ptr = sub_start_of_word - 1; + continue; + } + + if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) + { + nestptr = ptr + 7; + ptr = sub_end_of_word - 1; + continue; + } + + /* Handle a real character class. */ + previous = code; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if diff --git a/pcre/pcre_internal.h b/pcre/pcre_internal.h index 307069ca9d6..cd6ef3ed83e 100644 --- a/pcre/pcre_internal.h +++ b/pcre/pcre_internal.h @@ -1794,6 +1794,8 @@ only. */ #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E +#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET +#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS |