summary refs log tree commit diff
path: root/strings
diff options
context:
space:
mode:
author Sergei Golubchik <sergii@pisem.net> 2014-02-25 16:04:35 +0100
committer Sergei Golubchik <sergii@pisem.net> 2014-02-25 16:04:35 +0100
commit 0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2 (patch)
tree 5c67457ff8abbb89b203a7f55cda776b738c385b /strings
parent 6324c36bd703a0f55dcd49dd721af262f73cf7aa (diff)
parent ff2e82f4a175b7b023cd167b2fa6e6fcd1bd192e (diff)
download mariadb-git-0b9a0a3517ca2b75655f3af5c372cf333d3d5fe2.tar.gz
5.5 merge
Diffstat (limited to 'strings')
-rw-r--r-- strings/CMakeLists.txt 2
-rw-r--r-- strings/ctype-mb.c 4
-rw-r--r-- strings/ctype-simple.c 5
-rw-r--r-- strings/ctype-uca.c 4
-rw-r--r-- strings/ctype-ucs2.c 6
-rw-r--r-- strings/ctype-utf8.c 268
-rw-r--r-- strings/ctype-win1250ch.c 5
-rw-r--r-- strings/ctype.c 6
-rw-r--r-- strings/t_ctype.h 3
9 files changed, 248 insertions, 55 deletions
diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt
index 18fae59d394..cafd3292930 100644
--- a/strings/CMakeLists.txt
+++ b/strings/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2006, 2010, Oracle and/or its affiliates
+# Copyright (c) 2006, 2013, Oracle and/or its affiliates
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 0c0332ea3da..9f845511866 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
- Copyright (c) 2009-2011, Monty Program Ab
+/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2014, SkySQL Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 3cd6805158e..4556ed75f7e 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -1,6 +1,5 @@
-/* Copyright (c) 2002-2007 MySQL AB, 2009 Sun Microsystems, Inc.
- Copyright (c) 2009-2011, Monty Program Ab
- Use is subject to license terms.
+/* Copyright (c) 2002, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2014, SkySQL Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 0f464be607d..ad484acd21e 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2004, 2011, Oracle and/or its affiliates.
- Copyright (c) 2009, 2011, Monty Program Ab
+/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2014, SkySQL Ab.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 89df4ae0bc4..fee272e5d35 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2003, 2012, Oracle and/or its affiliates
- Copyright (c) 2009, 2013, Monty Program Ab.
+/* Copyright (c) 2003, 2013, Oracle and/or its affiliates
+ Copyright (c) 2009, 2014, SkySQL Ab.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
@@ -1664,7 +1664,7 @@ struct charset_info_st my_charset_utf16_general_ci=
struct charset_info_st my_charset_utf16_bin=
{
55,0,0, /* number */
- MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
+ MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
"utf16", /* cs name */
"utf16_bin", /* name */
"UTF-16 Unicode", /* comment */
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index cc19148b973..03fba7e51e7 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -26,6 +26,7 @@
#define EILSEQ ENOENT
#endif
+#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40) /* byte in 0x80..0xBF */
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@@ -56,6 +57,46 @@
#define HAVE_UNIDATA
#endif
+
+#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
+/*
+  Check, without decoding, the UTF-8 sequence of one to three bytes that
+  starts at s.
+
+  @param s  First byte of the sequence; s < e is asserted.
+  @param e  End of the input buffer.
+
+  @return 1, 2 or 3        Length in bytes of the well-formed sequence.
+  @return MY_CS_ILSEQ      The lead byte is in 0x80..0xc1, a continuation
+                           byte is missing, or lead byte 0xe0 is followed
+                           by a byte below 0xa0.
+  @return MY_CS_TOOSMALL2  A 2-byte sequence does not fit before e.
+  @return MY_CS_TOOSMALL3  A 3-byte sequence does not fit before e.
+
+  Lead bytes of 0xf0 and above are outside the scope of this function
+  (see the assertion on c below).
+*/
+static inline
+int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
+{
+ uchar c;
+
+ DBUG_ASSERT(s < e);
+ c= s[0];
+ if (c < 0x80) /* Single-byte character */
+ return 1;
+
+ if (c < 0xc2) /* 0x80..0xc1 is rejected as a lead byte */
+ return MY_CS_ILSEQ;
+
+ if (c < 0xe0) /* Lead byte 0xc2..0xdf: 2-byte sequence */
+ {
+ if (s+2 > e) /* We need 2 characters */
+ return MY_CS_TOOSMALL2;
+
+ if (!(IS_CONTINUATION_BYTE(s[1])))
+ return MY_CS_ILSEQ;
+
+ return 2;
+ }
+
+ DBUG_ASSERT(c < 0xf0); /* What is left is 0xe0..0xef: 3-byte sequence */
+ if (s+3 > e) /* We need 3 characters */
+ return MY_CS_TOOSMALL3;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
+ (c >= 0xe1 || s[1] >= 0xa0))) /* After 0xe0, s[1] must be >= 0xa0 */
+ return MY_CS_ILSEQ;
+
+ return 3;
+}
+
+#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
+
#ifdef HAVE_UNIDATA
#include "my_uctype.h"
@@ -2285,7 +2326,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -2296,7 +2337,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -2312,9 +2353,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
@@ -2330,10 +2371,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOSMALL5;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
@@ -2349,11 +2390,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOSMALL6;
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
- (s[5] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ IS_CONTINUATION_BYTE(s[5]) &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
@@ -2397,11 +2438,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
-
+
if (c < 0xf0)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -2876,10 +2917,91 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
return (len * 2 + 2) / 3;
}
+/*
+  Check, without decoding, the UTF-8 sequence that starts at s and return
+  its length in bytes.
+
+  @param cs  Character set; not used by this function.
+  @param s   Start of the sequence to check.
+  @param e   End of the input buffer.
+
+  @return 1..3             Result of my_valid_mbcharlen_utf8mb3() for lead
+                           bytes below 0xf0 (its error codes are passed
+                           through unchanged as well).
+  @return 4, 5 or 6        Length of a valid longer sequence; only when
+                           UNICODE_32BIT is defined and my_wc_t has at
+                           least 32 bits.
+  @return MY_CS_TOOSMALL   s >= e, there is nothing to examine.
+  @return MY_CS_TOOSMALL4, MY_CS_TOOSMALL5, MY_CS_TOOSMALL6
+                           The sequence announced by the lead byte does
+                           not fit before e.
+  @return MY_CS_ILSEQ      A continuation byte is missing, the second byte
+                           is below the minimum required for the smallest
+                           lead byte of that length, or no enabled form
+                           starts with this lead byte.
+*/
+static
+int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ uchar c;
+
+ if (s >= e)
+ return MY_CS_TOOSMALL;
+
+ c= s[0];
+ if (c < 0xf0) /* 1..3 byte forms are checked by the shared helper */
+ return my_valid_mbcharlen_utf8mb3(s, e);
+
+#ifdef UNICODE_32BIT
+ if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
+ {
+ if (s+4 > e) /* We need 4 characters */
+ return MY_CS_TOOSMALL4;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ (c >= 0xf1 || s[1] >= 0x90))) /* After 0xf0, s[1] must be >= 0x90 */
+ return MY_CS_ILSEQ;
+
+ return 4;
+ }
+ if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
+ {
+ if (s+5 >e) /* We need 5 characters */
+ return MY_CS_TOOSMALL5;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ (c >= 0xf9 || s[1] >= 0x88))) /* After 0xf8, s[1] must be >= 0x88 */
+ return MY_CS_ILSEQ;
+
+ return 5;
+ }
+ if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
+ {
+ if ( s+6 >e ) /* We need 6 characters */
+ return MY_CS_TOOSMALL6;
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ IS_CONTINUATION_BYTE(s[4]) &&
+ IS_CONTINUATION_BYTE(s[5]) &&
+ (c >= 0xfd || s[1] >= 0x84))) /* After 0xfc, s[1] must be >= 0x84 */
+ return MY_CS_ILSEQ;
+
+ return 6;
+ }
+#endif
+ return MY_CS_ILSEQ; /* No enabled form starts with this lead byte */
+}
+/*
+  Measure the leading well-formed part of the buffer [b, e), stepping
+  over at most pos characters.
+
+  @param cs          Character set, passed on to my_valid_mbcharlen_utf8().
+  @param b           Start of the buffer.
+  @param e           End of the buffer.
+  @param pos         Maximum number of characters to step over.
+  @param[out] error  Set to 1 when scanning stopped before e because the
+                     validator returned a non-positive value (malformed or
+                     truncated sequence); set to 0 otherwise.
+
+  @return Number of bytes, counted from the original b, that the accepted
+          characters occupy.
+*/
+static size_t
+my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
+ size_t pos, int *error)
+{
+ const char *b_start= b;
+ *error= 0;
+ while (pos)
+ {
+ int mb_len;
+
+ if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ *error= b < e ? 1 : 0; /* Running out of input is not an error */
+ break;
+ }
+ b+= mb_len;
+ pos--;
+ }
+ return (size_t) (b - b_start);
+}
+/*
+  Tell whether the buffer [b, e) starts with a multi-byte character.
+
+  @return Length in bytes of that character when the validator reports
+          more than one byte; 0 otherwise (single-byte character or an
+          error result).
+*/
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
- my_wc_t wc;
- int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
+ int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
return (res>1) ? res : 0;
}
@@ -2928,7 +3050,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
+ my_well_formed_len_utf8,
my_lengthsp_8bit,
my_numcells_mb,
my_utf8_uni,
@@ -4702,7 +4824,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -4713,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
- if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@@ -4746,9 +4868,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
[F4][80..8F][80..BF][80..BF]
*/
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@@ -4784,17 +4906,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (c < 0xe0)
{
- if (!((s[1] ^ 0x80) < 0x40))
+ if (!IS_CONTINUATION_BYTE(s[1]))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
-
+
if (c < 0xf0)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x0f) << 12) |
@@ -4805,9 +4927,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
}
else if (c < 0xf5)
{
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@@ -5296,11 +5418,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
}
+static int
+my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ uchar c;
+/*
+  Contract: check, without decoding, the UTF-8 sequence of one to four
+  bytes that starts at s and return its length in bytes.
+
+  cs is not used. s is the start of the sequence, e the end of the input
+  buffer.
+
+  Returns:
+    1..3            - result of my_valid_mbcharlen_utf8mb3() for lead bytes
+                      below 0xf0 (its error codes are passed through too);
+    4               - valid four-byte sequence, lead byte 0xf0..0xf4;
+    MY_CS_TOOSMALL  - s >= e, there is nothing to examine;
+    MY_CS_TOOSMALL4 - a four-byte sequence does not fit before e;
+    MY_CS_ILSEQ     - lead byte 0xf5 or above, a missing continuation
+                      byte, or a second byte outside the range listed
+                      under "Valid codes" below.
+*/
+ if (s >= e)
+ return MY_CS_TOOSMALL;
+
+ c= s[0];
+ if (c < 0xf0) /* 1..3 byte forms are checked by the shared helper */
+ return my_valid_mbcharlen_utf8mb3(s, e);
+
+ if (c < 0xf5)
+ {
+ if (s + 4 > e) /* We need 4 characters */
+ return MY_CS_TOOSMALL4;
+
+ /*
+ UTF-8 quick four-byte mask:
+ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ Encoding allows to encode U+00010000..U+001FFFFF
+
+ The maximum character defined in the Unicode standard is U+0010FFFF.
+ Higher characters U+00110000..U+001FFFFF are not used.
+
+ 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+ 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+ Valid codes:
+ [F0][90..BF][80..BF][80..BF]
+ [F1][80..BF][80..BF][80..BF]
+ [F2][80..BF][80..BF][80..BF]
+ [F3][80..BF][80..BF][80..BF]
+ [F4][80..8F][80..BF][80..BF]
+ */
+
+ if (!(IS_CONTINUATION_BYTE(s[1]) &&
+ IS_CONTINUATION_BYTE(s[2]) &&
+ IS_CONTINUATION_BYTE(s[3]) &&
+ (c >= 0xf1 || s[1] >= 0x90) && /* After 0xf0, s[1] must be >= 0x90 */
+ (c <= 0xf3 || s[1] <= 0x8F))) /* After 0xf4, s[1] must be <= 0x8F */
+ return MY_CS_ILSEQ;
+
+ return 4;
+ }
+
+ return MY_CS_ILSEQ; /* Lead bytes 0xf5..0xff are rejected */
+}
+
+/*
+  Measure the leading well-formed part of the buffer [b, e), stepping
+  over at most pos characters.
+
+  @param cs          Character set, passed on to
+                     my_valid_mbcharlen_utf8mb4().
+  @param b           Start of the buffer.
+  @param e           End of the buffer.
+  @param pos         Maximum number of characters to step over.
+  @param[out] error  Set to 1 when scanning stopped before e because the
+                     validator returned a non-positive value (malformed or
+                     truncated sequence); set to 0 otherwise.
+
+  @return Number of bytes, counted from the original b, that the accepted
+          characters occupy.
+*/
+static
+size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
+ const char *b, const char *e,
+ size_t pos, int *error)
+{
+ const char *b_start= b;
+ *error= 0;
+ while (pos)
+ {
+ int mb_len;
+
+ if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ *error= b < e ? 1 : 0; /* Running out of input is not an error */
+ break;
+ }
+ b+= mb_len;
+ pos--;
+ }
+ return (size_t) (b - b_start);
+}
+
+/*
+  Tell whether the buffer [b, e) starts with a multi-byte character.
+
+  @return Length in bytes of that character when the validator reports
+          more than one byte; 0 otherwise (single-byte character or an
+          error result).
+*/
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
- my_wc_t wc;
- int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
+ int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
return (res > 1) ? res : 0;
}
@@ -5361,7 +5556,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_mbcharlen_utf8mb4,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
+ my_well_formed_len_utf8mb4,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_utf8mb4,
@@ -5423,7 +5618,8 @@ struct charset_info_st my_charset_utf8mb4_general_ci=
struct charset_info_st my_charset_utf8mb4_bin=
{
46,0,0, /* number */
- MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */
+ MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|
+ MY_CS_UNICODE_SUPPLEMENT, /* state */
MY_UTF8MB4, /* cs name */
MY_UTF8MB4_BIN, /* name */
"UTF-8 Unicode", /* comment */
diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c
index ef54101cf7f..5e33d9ccbd6 100644
--- a/strings/ctype-win1250ch.c
+++ b/strings/ctype-win1250ch.c
@@ -1,6 +1,5 @@
-/* Copyright (c) 2001 Jan Pazdziora.
- Copyright (c) 2002-2007 MySQL AB
- Copyright (c) 2009-2011, Monty Program Ab
+/* Copyright (c) 2002, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2014, SkySQL Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/strings/ctype.c b/strings/ctype.c
index 23f18b6617b..df22d0f19e5 100644
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1,7 +1,5 @@
-/* Copyright (c) 2000-2007 MySQL AB, 2008, 2009 Sun Microsystems, Inc.
- Copyright (c) 2009-2011, Monty Program Ab
- Use is subject to license terms.
- Copyright (c) 2009-2011, Monty Program Ab
+/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2014, SkySQL Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
diff --git a/strings/t_ctype.h b/strings/t_ctype.h
index 8198d3eada8..a4fdd267c3f 100644
--- a/strings/t_ctype.h
+++ b/strings/t_ctype.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 2000 MySQL AB
+/* Copyright (c) 2000, 2001, 2003 MySQL AB
+ Use is subject to license terms
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by