diff options
author | bar@bar.intranet.mysql.r18.ru <> | 2004-06-07 12:51:18 +0500 |
---|---|---|
committer | bar@bar.intranet.mysql.r18.ru <> | 2004-06-07 12:51:18 +0500 |
commit | b30b1ccc6ae5027ac315de1b60c5ec00858698f9 (patch) | |
tree | b95e6a3d5908a015cf9163612fd462eb0d498c8e /strings | |
parent | dabc0e774eeb6e70f2fda743c50f19a4c72f8d3d (diff) | |
download | mariadb-git-b30b1ccc6ae5027ac315de1b60c5ec00858698f9.tar.gz |
Bug #3928 regexp [[:>:]] and UTF-8
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-utf8.c | 20 |
1 files changed, 13 insertions, 7 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 29d2c5d1358..09b918b0777 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1524,8 +1524,12 @@ MY_UNICASE_INFO *uni_plane[256]={ #ifdef HAVE_CHARSET_utf8 -/* These arrays are taken from usa7 implementation */ - +/* + We consider bytes with code more than 127 as a letter. + This guarantees that word boundaries work fine with regular + expressions. Note, there is no need to mark byte 255 as a + letter, it is an illegal byte in UTF8. +*/ static uchar ctype_utf8[] = { 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, @@ -1536,16 +1540,18 @@ static uchar ctype_utf8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16, 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 }; +/* The below are taken from usa7 implementation */ + static uchar to_lower_utf8[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |