diff options
author | Alexander Barkov <bar@mariadb.org> | 2015-03-04 09:16:43 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2015-03-04 09:16:43 +0400 |
commit | a7ed8523e35ff2e82701cd1f483c8f665f322f3b (patch) | |
tree | 436d9089a3028f07661d85fe28ed65bde23a4045 | |
parent | d8c1165c28ae6ce2e29ecd5492c2540bfd6b2177 (diff) | |
download | mariadb-git-a7ed8523e35ff2e82701cd1f483c8f665f322f3b.tar.gz |
Adding a shared include file ctype-mb.ic and removing a number
of very similar copies of my_well_formed_len_xxx(), implemented
for big5, cp932, euckr, eucjpms, gb2312, gbk, sjis, ujis.
-rw-r--r-- | strings/ctype-big5.c | 43 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 60 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 41 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 79 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 41 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 43 | ||||
-rw-r--r-- | strings/ctype-mb.ic | 94 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 55 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 81 |
9 files changed, 184 insertions, 353 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index a9eb2b1b318..d631bd0a34e 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -34,6 +34,7 @@ /* Support for Chinese(BIG5) characters, by jou@nematic.ieo.nctu.edu.tw + CP950 and HKSCS additional characters are also accepted. modified by Wei He (hewei@mail.ied.ac.cn) modified by Alex Barkov <bar@udm.net> */ @@ -47,6 +48,12 @@ #define big5head(e) ((uchar)(e>>8)) #define big5tail(e) ((uchar)(e&0xff)) +#define MY_FUNCTION_NAME(x) my_ ## x ## _big5 +#define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" + + static const uchar ctype_big5[257] = { 0, /* For standard library */ @@ -6843,42 +6850,6 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), } -/* - Returns a well formed length of a BIG5 string. - CP950 and HKSCS additional characters are also accepted. -*/ -static -size_t my_well_formed_len_big5(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - const char *emb= e - 1; /* Last possible end of an MB character */ - - *error= 0; - while (pos-- && b < e) - { - if ((uchar) b[0] < 128) - { - /* Single byte ascii character */ - b++; - } - else if ((b < emb) && isbig5code((uchar)*b, (uchar)b[1])) - { - /* Double byte character */ - b+= 2; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - - static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = { NULL, /* init */ diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 66b352721db..13129a6a874 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -176,10 +176,18 @@ static const uchar sort_order_cp932[]= (uchar) '\370',(uchar) '\371',(uchar) '\372',(uchar) '\373',(uchar) '\374',(uchar) '\375',(uchar) '\376',(uchar) '\377' }; -#define iscp932head(c) ((0x81<=(c) && (c)<=0x9f) || \ - ((0xe0<=(c)) && (c)<=0xfc)) -#define iscp932tail(c) 
((0x40<=(c) && (c)<=0x7e) || \ - (0x80<=(c) && (c)<=0xfc)) +#define iscp932head(c) ((0x81 <= (uchar) (c) && (uchar) (c) <= 0x9f) || \ + (0xe0 <= (uchar) (c) && (uchar) (c) <= 0xfc)) +#define iscp932tail(c) ((0x40 <= (uchar) (c) && (uchar) (c) <= 0x7e) || \ + (0x80 <= (uchar) (c) && (uchar) (c) <= 0xfc)) + +#define iscp932kata(c) (0xA1 <= (uchar) (c) && (uchar) (c) <= 0xDF) + +#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932 +#define IS_8BIT_CHAR(x) iscp932kata(x) +#define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" static uint ismbchar_cp932(CHARSET_INFO *cs __attribute__((unused)), @@ -34711,50 +34719,6 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), return clen; } -/* - Returns a well formed length of a cp932 string. - cp932 additional characters are also accepted. -*/ - -static -size_t my_well_formed_len_cp932(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - *error= 0; - while (pos-- && b < e) - { - /* - Cast to int8 for extra safety. - "char" can be unsigned by default - on some platforms. 
- */ - if (((int8)b[0]) >= 0) - { - /* Single byte ascii character */ - b++; - } - else if (iscp932head((uchar)*b) && (e-b)>1 && iscp932tail((uchar)b[1])) - { - /* Double byte character */ - b+= 2; - } - else if (((uchar)*b) >= 0xA1 && ((uchar)*b) <= 0xDF) - { - /* Half width kana */ - b++; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - static MY_COLLATION_HANDLER my_collation_ci_handler = { diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 36d99eec375..eab9539ad45 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -202,6 +202,12 @@ static const uchar sort_order_euc_kr[]= iseuc_kr_tail3(c)) +#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr +#define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" + + static uint ismbchar_euc_kr(CHARSET_INFO *cs __attribute__((unused)), const char* p, const char *e) { @@ -9929,41 +9935,6 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), } -/* - Returns well formed length of a EUC-KR string. 
-*/ -static size_t -my_well_formed_len_euckr(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - const char *emb= e - 1; /* Last possible end of an MB character */ - - *error= 0; - while (pos-- && b < e) - { - if ((uchar) b[0] < 128) - { - /* Single byte ascii character */ - b++; - } - else if (b < emb && iseuc_kr_head(*b) && iseuc_kr_tail(b[1])) - { - /* Double byte character */ - b+= 2; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - - static MY_COLLATION_HANDLER my_collation_ci_handler = { NULL, /* init */ diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 8c47b666cf4..52873c2f87e 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -180,10 +180,26 @@ static const uchar sort_order_eucjpms[]= }; -#define iseucjpms(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xfe)) -#define iskata(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xdf)) -#define iseucjpms_ss2(c) (((c)&0xff) == 0x8e) -#define iseucjpms_ss3(c) (((c)&0xff) == 0x8f) +/* + EUCJPMS encoding subcomponents: + [x00-x7F] # ASCII/JIS-Roman (one-byte/character) + [x8E][xA1-xDF] # half-width katakana (two bytes/char) + [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char) + [xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char) +*/ +#define iseucjpms(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xfe) +#define iskata(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xdf) +#define iseucjpms_ss2(c) ((uchar) (c) == 0x8e) +#define iseucjpms_ss3(c) ((uchar) (c) == 0x8f) + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _eucjpms +#define IS_MB2_JIS(x,y) (iseucjpms(x) && iseucjpms(y)) +#define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y)) +#define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) +#define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" static uint ismbchar_eucjpms(CHARSET_INFO *cs 
__attribute__((unused)), @@ -67416,61 +67432,6 @@ my_wc_mb_eucjpms(CHARSET_INFO *cs __attribute__((unused)), } -/* - EUCJPMS encoding subcomponents: - [x00-x7F] # ASCII/JIS-Roman (one-byte/character) - [x8E][xA1-xDF] # half-width katakana (two bytes/char) - [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char) - [xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char) -*/ - -static -size_t my_well_formed_len_eucjpms(CHARSET_INFO *cs __attribute__((unused)), - const char *beg, const char *end, size_t pos, - int *error) -{ - const uchar *b= (uchar *) beg; - *error=0; - - for ( ; pos && b < (uchar*) end; pos--, b++) - { - char *chbeg; - uint ch= *b; - - if (ch <= 0x7F) /* one byte */ - continue; - - chbeg= (char *) b++; - if (b >= (uchar *) end) /* need more bytes */ - return (uint) (chbeg - beg); /* unexpected EOL */ - - if (iseucjpms_ss2(ch)) /* [x8E][xA1-xDF] */ - { - if (iskata(*b)) - continue; - *error=1; - return (uint) (chbeg - beg); /* invalid sequence */ - } - - if (iseucjpms_ss3(ch)) /* [x8F][xA1-xFE][xA1-xFE] */ - { - ch= *b++; - if (b >= (uchar*) end) - { - *error= 1; - return (uint)(chbeg - beg); /* unexpected EOL */ - } - } - - if (iseucjpms(ch) && iseucjpms(*b)) /* [xA1-xFE][xA1-xFE] */ - continue; - *error=1; - return (size_t) (chbeg - beg); /* invalid sequence */ - } - return (size_t) (b - (uchar *) beg); -} - - static size_t my_numcells_eucjpms(CHARSET_INFO *cs __attribute__((unused)), const char *str, const char *str_end) diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index b5aeed2088f..a4268b8fd68 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -165,6 +165,12 @@ static const uchar sort_order_gb2312[]= #define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe) +#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312 +#define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" + + static uint ismbchar_gb2312(CHARSET_INFO *cs __attribute__((unused)), const char* p, 
const char *e) { @@ -6332,41 +6338,6 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), } -/* - Returns well formed length of a EUC-KR string. -*/ -static size_t -my_well_formed_len_gb2312(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - const char *emb= e - 1; /* Last possible end of an MB character */ - - *error= 0; - while (pos-- && b < e) - { - if ((uchar) b[0] < 128) - { - /* Single byte ascii character */ - b++; - } - else if (b < emb && isgb2312head(*b) && isgb2312tail(b[1])) - { - /* Double byte character */ - b+= 2; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - - static MY_COLLATION_HANDLER my_collation_ci_handler = { NULL, /* init */ diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index d282d96145d..392fdb487b6 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -43,6 +43,12 @@ #define gbkhead(e) ((uchar)(e>>8)) #define gbktail(e) ((uchar)(e&0xff)) +#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk +#define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" + + static const uchar ctype_gbk[257] = { 0, /* For standard library */ @@ -10726,43 +10732,6 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), } -/* - Returns well formed length of a GBK string. 
-*/ -static -size_t my_well_formed_len_gbk(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - const char *emb= e - 1; /* Last possible end of an MB character */ - - *error= 0; - while (pos-- && b < e) - { - if ((uchar) b[0] < 128) - { - /* Single byte ascii character */ - b++; - } - else if ((b < emb) && isgbkcode((uchar)*b, (uchar)b[1])) - { - /* Double byte character */ - b+= 2; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - - - - static MY_COLLATION_HANDLER my_collation_ci_handler = { NULL, /* init */ diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic new file mode 100644 index 00000000000..70cc89c9af0 --- /dev/null +++ b/strings/ctype-mb.ic @@ -0,0 +1,94 @@ +/* + Copyright (c) 2015, MariaDB Foundation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + + +#ifndef MY_FUNCTION_NAME +#error MY_FUNCTION_NAME is not defined +#endif + +#if defined(IS_MB3_CHAR) && !defined(IS_MB2_CHAR) +#error IS_MB3_CHAR is defined, while IS_MB2_CHAR is not! +#endif + +#if defined(IS_MB4_CHAR) && !defined(IS_MB3_CHAR) +#error IS_MB4_CHAR is defined, while IS_MB3_CHAR is not! 
+#endif + + +#ifdef WELL_FORMED_LEN +/** + Returns well formed length of a character string with + variable character length for character sets with: + - mbminlen == 1 + - mbmaxlen == 2, 3, or 4 +*/ +static size_t +MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, int *error) +{ + const char *b0= b; + + DBUG_ASSERT(cs->mbminlen == 1); + DBUG_ASSERT(cs->mbmaxlen <= 4); + + for (*error= 0 ; b < e && nchars-- ; ) + { + if ((uchar) b[0] < 128) + { + b++; /* Single byte ASCII character */ + continue; + } + + if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1])) + { + b+= 2; /* Double byte character */ + continue; + } + +#ifdef IS_MB3_CHAR + if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2])) + { + b+= 3; /* Three-byte character */ + continue; + } +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + { + b+= 4; /* Four-byte character */ + continue; + } +#endif + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + b++; /* Single byte non-ASCII character, e.g. 
half width kana in sjis */ + continue; + } +#endif + + /* Wrong byte sequence */ + *error= 1; + break; + } + return b - b0; +} + +#endif /* WELL_FORMED_LEN */ diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 2038632c9d3..432e2e5e823 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -176,10 +176,19 @@ static const uchar sort_order_sjis[]= (uchar) '\370',(uchar) '\371',(uchar) '\372',(uchar) '\373',(uchar) '\374',(uchar) '\375',(uchar) '\376',(uchar) '\377' }; -#define issjishead(c) ((0x81<=(c) && (c)<=0x9f) || \ - ((0xe0<=(c)) && (c)<=0xfc)) -#define issjistail(c) ((0x40<=(c) && (c)<=0x7e) || \ - (0x80<=(c) && (c)<=0xfc)) +#define issjishead(c) ((0x81 <= (uchar) (c) && (uchar) (c) <= 0x9f) || \ + (0xe0 <= (uchar) (c) && (uchar) (c) <= 0xfc)) +#define issjistail(c) ((0x40 <= (uchar) (c) && (uchar) (c) <= 0x7e) || \ + (0x80 <= (uchar) (c) && (uchar) (c) <= 0xfc)) + +#define issjiskata(c) ((0xA1 <= (uchar) (c) && (uchar) (c) <= 0xDF)) + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis +#define IS_8BIT_CHAR(x) issjiskata(x) +#define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" static uint ismbchar_sjis(CHARSET_INFO *cs __attribute__((unused)), @@ -34089,44 +34098,6 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)), return clen; } -/* - Returns a well formed length of a SJIS string. - CP932 additional characters are also accepted. 
-*/ -static -size_t my_well_formed_len_sjis(CHARSET_INFO *cs __attribute__((unused)), - const char *b, const char *e, - size_t pos, int *error) -{ - const char *b0= b; - *error= 0; - while (pos-- && b < e) - { - if ((uchar) b[0] < 128) - { - /* Single byte ascii character */ - b++; - } - else if (issjishead((uchar)*b) && (e-b)>1 && issjistail((uchar)b[1])) - { - /* Double byte character */ - b+= 2; - } - else if (((uchar)*b) >= 0xA1 && ((uchar)*b) <= 0xDF) - { - /* Half width kana */ - b++; - } - else - { - /* Wrong byte sequence */ - *error= 1; - break; - } - } - return (size_t) (b - b0); -} - static MY_COLLATION_HANDLER my_collation_ci_handler = { diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index f208d15f364..99f5be3fa38 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -179,10 +179,26 @@ static const uchar sort_order_ujis[]= }; -#define isujis(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xfe)) -#define iskata(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xdf)) -#define isujis_ss2(c) (((c)&0xff) == 0x8e) -#define isujis_ss3(c) (((c)&0xff) == 0x8f) +/* + EUC-JP encoding subcomponents: + [x00-x7F] # ASCII/JIS-Roman (one-byte/character) + [x8E][xA1-xDF] # half-width katakana (two bytes/char) + [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char) + [xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char) +*/ + +#define isujis(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xfe) +#define iskata(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xdf) +#define isujis_ss2(c) ((uchar) (c) == 0x8e) +#define isujis_ss3(c) ((uchar) (c) == 0x8f) + +#define MY_FUNCTION_NAME(x) my_ ## x ## _ujis +#define IS_MB2_JIS(x,y) (isujis(x) && isujis(y)) +#define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y)) +#define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) +#define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z)) +#define WELL_FORMED_LEN +#include "ctype-mb.ic" static uint ismbchar_ujis(CHARSET_INFO *cs __attribute__((unused)), @@ -201,63 +217,6 @@ static uint 
mbcharlen_ujis(CHARSET_INFO *cs __attribute__((unused)),uint c) } -/* - EUC-JP encoding subcomponents: - [x00-x7F] # ASCII/JIS-Roman (one-byte/character) - [x8E][xA1-xDF] # half-width katakana (two bytes/char) - [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char) - [xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char) -*/ - -static -size_t my_well_formed_len_ujis(CHARSET_INFO *cs __attribute__((unused)), - const char *beg, const char *end, - size_t pos, int *error) -{ - const uchar *b= (uchar *) beg; - - for ( *error= 0 ; pos && b < (uchar*) end; pos--, b++) - { - char *chbeg; - uint ch= *b; - - if (ch <= 0x7F) /* one byte */ - continue; - - chbeg= (char *) b++; - if (b >= (uchar *) end) /* need more bytes */ - { - *error= 1; - return (size_t) (chbeg - beg); /* unexpected EOL */ - } - - if (isujis_ss2(ch)) /* [x8E][xA1-xDF] */ - { - if (iskata(*b)) - continue; - *error= 1; - return (size_t) (chbeg - beg); /* invalid sequence */ - } - - if (isujis_ss3(ch)) /* [x8F][xA1-xFE][xA1-xFE] */ - { - ch= *b++; - if (b >= (uchar*) end) - { - *error= 1; - return (size_t) (chbeg - beg); /* unexpected EOL */ - } - } - - if (isujis(ch) && isujis(*b)) /* [xA1-xFE][xA1-xFE] */ - continue; - *error= 1; - return (size_t) (chbeg - beg); /* invalid sequence */ - } - return (size_t) (b - (uchar *) beg); -} - - static size_t my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)), const char *str, const char *str_end) |