summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorunknown <monty@mysql.com>2004-10-29 19:26:52 +0300
committerunknown <monty@mysql.com>2004-10-29 19:26:52 +0300
commitf095274fe8c3d3394d6c0ce0a68f4bea04311999 (patch)
tree23bcc9a71fe7237887a111b158e30f5a6bb665d3 /strings
parentf41bba8c6156a7adf4c67dfa75e16112767a5d3c (diff)
parent5be6c328f5a9f78f37176bbbd88a538fa3b65fe9 (diff)
downloadmariadb-git-f095274fe8c3d3394d6c0ce0a68f4bea04311999.tar.gz
merge with 4.1
BitKeeper/etc/ignore: auto-union BitKeeper/etc/logging_ok: auto-union BitKeeper/triggers/post-commit: Auto merged Docs/Support/texi2html: Auto merged Makefile.am: Auto merged client/Makefile.am: Auto merged client/mysql.cc: Auto merged client/mysqldump.c: Auto merged include/my_base.h: Auto merged include/my_global.h: Auto merged include/my_pthread.h: Auto merged include/my_sys.h: Auto merged include/my_time.h: Auto merged include/mysql.h: Auto merged include/mysql_com.h: Auto merged innobase/buf/buf0buf.c: Auto merged innobase/include/row0mysql.h: Auto merged innobase/row/row0sel.c: Auto merged libmysql/libmysql.c: Auto merged libmysqld/examples/Makefile.am: Auto merged myisam/mi_check.c: Auto merged mysql-test/include/ps_modify.inc: Auto merged mysql-test/install_test_db.sh: Auto merged mysql-test/r/alter_table.result: Auto merged mysql-test/r/auto_increment.result: Auto merged mysql-test/r/bdb.result: Auto merged mysql-test/r/ctype_latin1_de.result: Auto merged mysql-test/r/ctype_recoding.result: Auto merged mysql-test/r/fulltext.result: Auto merged mysql-test/r/func_gconcat.result: Auto merged mysql-test/r/func_group.result: Auto merged mysql-test/r/func_if.result: Auto merged mysql-test/t/derived.test: Auto merged mysql-test/t/insert.test: merge with 4.1 Fixed test case to not use 'if exists' when it shouldn't mysql-test/t/range.test: merge with 4.1 Added missing drop table sql/ha_ndbcluster.cc: merge with 4.1 Simple optimization: use max() instead of ? 
: sql/item_func.cc: merge with 4.1 (Added back old variable names for easier merges) sql/opt_range.cc: merge with 4.1 Removed argument 'parent_alloc' from QUICK_RANGE_SELECT as this was not used Added assert if using QUICK_GROUP_MIN_MAX_SELECT with parent_alloc as the init() function can't handle this Changed back get_quick_select_for_ref() to use it's own alloc root becasue this function may be called several times for one query sql/sql_handler.cc: merge with 4.1 change variable 'err' to 'error' as same function had a label named 'err' sql/sql_update.cc: Use multi-update code from 5.0 instead of 4.1 We will fix the locking code shortly in 5.0 to be faster than in 4.1
Diffstat (limited to 'strings')
-rw-r--r--strings/CHARSET_INFO.txt230
-rw-r--r--strings/Makefile.am2
-rw-r--r--strings/ctype-big5.c3
-rw-r--r--strings/ctype-bin.c1
-rw-r--r--strings/ctype-euc_kr.c3
-rw-r--r--strings/ctype-gb2312.c3
-rw-r--r--strings/ctype-gbk.c3
-rw-r--r--strings/ctype-latin1.c1
-rw-r--r--strings/ctype-mb.c182
-rw-r--r--strings/ctype-simple.c10
-rw-r--r--strings/ctype-sjis.c69
-rw-r--r--strings/ctype-tis620.c1
-rw-r--r--strings/ctype-uca.c157
-rw-r--r--strings/ctype-ucs2.c363
-rw-r--r--strings/ctype-ujis.c37
-rw-r--r--strings/ctype-utf8.c294
-rw-r--r--strings/xml.c7
17 files changed, 1083 insertions, 283 deletions
diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt
new file mode 100644
index 00000000000..f7a10f95880
--- /dev/null
+++ b/strings/CHARSET_INFO.txt
@@ -0,0 +1,230 @@
+
+CHARSET_INFO
+============
+A structure containing data for charset+collation pair implementation.
+
+Virtual functions which use this data are collected
+into separate structures MY_CHARSET_HANDLER and
+MY_COLLATION_HANDLER.
+
+
+typedef struct charset_info_st
+{
+ uint number;
+ uint primary_number;
+ uint binary_number;
+ uint state;
+
+ const char *csname;
+ const char *name;
+ const char *comment;
+
+ uchar *ctype;
+ uchar *to_lower;
+ uchar *to_upper;
+ uchar *sort_order;
+
+ uint16 *tab_to_uni;
+ MY_UNI_IDX *tab_from_uni;
+
+ uchar state_map[256];
+ uchar ident_map[256];
+
+ uint strxfrm_multiply;
+ uint mbminlen;
+ uint mbmaxlen;
+ char max_sort_char; /* For LIKE optimization */
+
+ MY_CHARSET_HANDLER *cset;
+ MY_COLLATION_HANDLER *coll;
+
+} CHARSET_INFO;
+
+
+CHARSET_INFO fields description:
+===============================
+
+
+Numbers (identifiers)
+---------------------
+
+number - an ID uniquely identifying this charset+collation pair.
+
+primary_number - ID of a charset+collation pair, which consists
+of the same character set and the default collation of this
+character set. Not really used now. Intended to optimize some
+parts of the code where we need to find the default collation
+using its non-default counterpart for the given character set.
+
+binary_number - ID of a charset+collation pair, which consists
+of the same character set and the binary collation of this
+character set. Not really used now.
+
+Names
+-----
+
+ csname - name of the character set for this charset+collation pair.
+ name - name of the collation for this charset+collation pair.
+ comment - a text comment, displayed in the "Description" column of
+ SHOW CHARACTER SET output.
+
+Conversion tables
+-----------------
+
+ ctype - pointer to array[257] of "type of characters"
+ bit mask for each character, e.g. whether a
+ character is a digit or a letter or a separator, etc.
+
+ Monty 2004-10-21:
+ If you look at the macros, we use ctype[(char)+1].
+ ctype[0] is traditionally in most ctype libraries
+ reserved for EOF (-1). The idea is that you can use
+ the result from fgetc() directly with ctype[]. As
+ we have to be compatible with external ctype[] versions,
+ it's better to do it the same way as they do...
+
+ to_lower - pointer to array[256] used in LCASE()
+ to_upper - pointer to array[256] used in UCASE()
+ sort_order - pointer to array[256] used for strings comparison
+
+
+
+Unicode conversion data
+-----------------------
+For 8bit character sets:
+
+tab_to_uni : array[256] of charset->Unicode translation
+tab_from_uni: a structure for Unicode->charset translation
+
+Non-8 bit charsets have their own structures per charset
+hidden in the corresponding ctype-xxx.c file and don't use
+the tab_to_uni and tab_from_uni tables.
+
+
+Parser maps
+-----------
+state_map[]
+ident_map[]
+
+ These maps are used to quickly identify whether a character is
+part of an identifier, a digit, a special character,
+or part of another SQL lexical item.
+
+They can probably be combined with the ctype array in the future.
+But for some reason these two arrays are used in the parser,
+while a separate ctype[] array is used in other parts of the
+code, such as fulltext, etc.
+
+
+Misc fields
+-----------
+
+ strxfrm_multiply - how many times longer a sort key (i.e. a string
+ which can be passed into memcmp() for comparison)
+ can be than the original string.
+ Usually it is 1. For some complex
+ collations it can be bigger. For example,
+ in latin1_german2_ci a sort key can be up to
+ twice as long as the original string,
+ e.g. the letter 'A' with two dots above is
+ substituted with 'AE'.
+ mbminlen - minimum multibyte sequence length.
+ Now always 1 except for ucs2. For ucs2
+ it is 2.
+ mbmaxlen - maximum multibyte sequence length.
+ 1 for 8bit charsets. Can also be 2 or 3.
+
+
+
+MY_CHARSET_HANDLER
+==================
+
+MY_CHARSET_HANDLER is a collection of character-set
+related routines. It is defined in m_ctype.h and has the
+following set of functions:
+
+Multibyte routines
+------------------
+ismbchar() - detects if the given string is a multibyte sequence
+mbcharlen() - returns length of multibyte sequence starting with
+ the given character
+numchars() - returns number of characters in the given string, e.g.
+ in SQL function CHAR_LENGTH().
+charpos() - calculates the offset of the given position in the string.
+ Used in SQL functions LEFT(), RIGHT(), SUBSTRING(),
+ INSERT()
+
+well_formed_length()
+ - finds the length of the correctly formed multibyte beginning
+ of the given string. Used in INSERTs to cut a beginning of
+ the given string which is
+ a) "well formed" according to the given character set.
+ b) can fit into the given data type
+ Terminates the string in a good position, taking into account
+ multibyte character boundaries.
+
+lengthsp() - returns the length of the given string without trailing spaces.
+
+
+Unicode conversion routines
+---------------------------
+mb_wc - converts the leading multibyte sequence into its Unicode code point.
+wc_mb - converts the given Unicode code point into a multibyte sequence.
+
+
+Case and sort conversion
+------------------------
+caseup_str - converts the given 0-terminated string into upper case
+casedn_str - converts the given 0-terminated string into lower case
+caseup - converts the given string into upper case using length
+casedn - converts the given string into lower case using length
+
+Number-to-string conversion routines
+------------------------------------
+snprintf()
+long10_to_str()
+longlong10_to_str()
+
+The names are pretty self-descriptive.
+
+String padding routines
+-----------------------
+fill() - writes the given Unicode value into the given string
+ with the given length. Used to pad the string, usually
+ with space character, according to the given charset.
+
+String-to-number conversion routines
+------------------------------------
+strntol()
+strntoul()
+strntoll()
+strntoull()
+strntod()
+
+These functions do almost the same thing as their
+STDLIB counterparts, but also:
+ - accept a length instead of a 0-terminator
+ - and are character set dependent
+
+Simple scanner routines
+-----------------------
+scan() - to skip leading spaces in the given string.
+ Used when a string value is inserted into a numeric field.
+
+
+
+MY_COLLATION_HANDLER
+====================
+strnncoll() - compares two strings according to the given collation
+strnncollsp() - like the above but ignores trailing spaces
+strnxfrm() - makes a sort key suitable for memcmp() corresponding
+ to the given string
+like_range() - creates a LIKE range, for optimizer
+wildcmp() - wildcard comparison, for LIKE
+strcasecmp() - 0-terminated string comparison
+instr() - finds the first occurrence of a substring in the string
+hash_sort() - calculates a hash value taking into account
+ the collation rules, e.g. case-insensitivity,
+ accent sensitivity, etc.
+
+ \ No newline at end of file
diff --git a/strings/Makefile.am b/strings/Makefile.am
index 31b5195d5cb..f8fcfbc5ea3 100644
--- a/strings/Makefile.am
+++ b/strings/Makefile.am
@@ -57,7 +57,7 @@ EXTRA_DIST = ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-win1250ch.c \
t_ctype.h
libmystrings_a_LIBADD=
-conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c
+conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c bcmp.c
conf_to_src_LDADD=
#force static linking of conf_to_src - essential when linking against
#custom installation of libc
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 3f35f7504ac..8345c53202c 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6290,7 +6290,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_charpos_mb,
my_well_formed_len_mb,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_8bit,
my_mb_wc_big5, /* mb_wc */
my_wc_mb_big5, /* wc_mb */
my_caseup_str_mb,
@@ -6306,6 +6306,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 42dc0ab086d..7d17f62c8d0 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -465,6 +465,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index 43a50b0dfbe..ee792d9c3e4 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -8657,7 +8657,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_charpos_mb,
my_well_formed_len_mb,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_8bit,
my_mb_wc_euc_kr, /* mb_wc */
my_wc_mb_euc_kr, /* wc_mb */
my_caseup_str_mb,
@@ -8673,6 +8673,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 8d97ac9ca1d..f17cc94723f 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -5708,7 +5708,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_charpos_mb,
my_well_formed_len_mb,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_8bit,
my_mb_wc_gb2312, /* mb_wc */
my_wc_mb_gb2312, /* wc_mb */
my_caseup_str_mb,
@@ -5724,6 +5724,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 9400fb08f2b..0be56e8d946 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -9939,7 +9939,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_charpos_mb,
my_well_formed_len_mb,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_8bit,
my_mb_wc_gbk,
my_wc_mb_gbk,
my_caseup_str_mb,
@@ -9955,6 +9955,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index aea517811ab..5f1850b7772 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -403,6 +403,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 2548a68ab19..7d81766c4cb 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -123,8 +123,7 @@ int my_strcasecmp_mb(CHARSET_INFO * cs,const char *s, const char *t)
** 1 if matched with wildcard
*/
-#define INC_PTR(cs,A,B) A+=((use_mb_flag && \
- my_ismbchar(cs,A,B)) ? my_ismbchar(cs,A,B) : 1)
+#define INC_PTR(cs,A,B) A+=(my_ismbchar(cs,A,B) ? my_ismbchar(cs,A,B) : 1)
#define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)]
@@ -135,8 +134,6 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
{
int result= -1; /* Not found, using wildcards */
- bool use_mb_flag=use_mb(cs);
-
while (wildstr != wildend)
{
while (*wildstr != w_many && *wildstr != w_one)
@@ -144,8 +141,7 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
int l;
if (*wildstr == escape && wildstr+1 != wildend)
wildstr++;
- if (use_mb_flag &&
- (l = my_ismbchar(cs, wildstr, wildend)))
+ if ((l = my_ismbchar(cs, wildstr, wildend)))
{
if (str+l > str_end || memcmp(str, wildstr, l) != 0)
return 1;
@@ -200,41 +196,30 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
cmp= *++wildstr;
mb=wildstr;
- LINT_INIT(mblen);
- if (use_mb_flag)
- mblen = my_ismbchar(cs, wildstr, wildend);
+ mblen= my_ismbchar(cs, wildstr, wildend);
INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */
cmp=likeconv(cs,cmp);
do
{
- if (use_mb_flag)
- {
- for (;;)
+ for (;;)
+ {
+ if (str >= str_end)
+ return -1;
+ if (mblen)
{
- if (str >= str_end)
- return -1;
- if (mblen)
+ if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
{
- if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
- {
- str += mblen;
- break;
- }
- }
- else if (!my_ismbchar(cs, str, str_end) &&
- likeconv(cs,*str) == cmp)
- {
- str++;
+ str += mblen;
break;
}
- INC_PTR(cs,str, str_end);
}
- }
- else
- {
- while (str != str_end && likeconv(cs,*str) != cmp)
+ else if (!my_ismbchar(cs, str, str_end) &&
+ likeconv(cs,*str) == cmp)
+ {
str++;
- if (str++ == str_end) return (-1);
+ break;
+ }
+ INC_PTR(cs,str, str_end);
}
{
int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,
@@ -458,6 +443,97 @@ static void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
}
}
+/*
+** Calculate min_str and max_str that bound the range of a LIKE string.
+** Arguments:
+** cs Character set; its mbmaxlen, min_sort_char, max_sort_char
+** and wc_mb() are used to build the two keys.
+** ptr Pointer to LIKE string.
+** ptr_length Length of LIKE string.
+** escape Escape character in LIKE. (Normally '\').
+** All escape characters should be removed from min_str and max_str
+** w_one Wildcard character matching one character ('_' in SQL).
+** w_many Wildcard character matching many characters ('%' in SQL).
+** res_length Length of min_str and max_str.
+** min_str Smallest case sensitive string that ranges LIKE.
+** Should be space padded to res_length.
+** max_str Largest case sensitive string that ranges LIKE.
+** Normally padded with the biggest character sort value.
+** min_length Out: length of the literal prefix stored in min_str
+** (before padding).
+** max_length Out: res_length if a wildcard was found, otherwise the
+** same value as *min_length.
+**
+** The function should return 0 if ok and 1 if the LIKE string can't be
+** optimized ! (As written here, every return path returns 0.)
+*/
+
+my_bool my_like_range_mb(CHARSET_INFO *cs,
+ const char *ptr,uint ptr_length,
+ pbool escape, pbool w_one, pbool w_many,
+ uint res_length,
+ char *min_str,char *max_str,
+ uint *min_length,uint *max_length)
+{
+ const char *end=ptr+ptr_length;
+ char *min_org=min_str;
+ char *min_end=min_str+res_length;
+ char *max_end=max_str+res_length;
+
+ /* Copy the literal prefix byte by byte into both keys */
+ for (; ptr != end && min_str != min_end ; ptr++)
+ {
+ if (*ptr == escape && ptr+1 != end)
+ {
+ ptr++; /* Skip escape */
+ *min_str++= *max_str++ = *ptr; /* Escaped byte is taken literally */
+ continue;
+ }
+ if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */
+ {
+ char buf[10];
+ uint buflen;
+ /*
+ Presumably the byte offset of character number
+ res_length/mbmaxlen within the prefix written so far
+ (see my_charpos) - TODO confirm.
+ */
+ uint charlen= my_charpos(cs, min_org, min_str, res_length/cs->mbmaxlen);
+
+ /* Cut the min prefix back if that offset lies before the write position */
+ if (charlen < (uint) (min_str - min_org))
+ min_str= min_org + charlen;
+
+ /* Write min key */
+ *min_length= (uint) (min_str - min_org);
+ *max_length=res_length;
+ do
+ {
+ *min_str++= (char) cs->min_sort_char;
+ } while (min_str != min_end);
+
+ /*
+ Write max key: create a buffer with multibyte
+ representation of the max_sort_char character,
+ and copy it into max_str in a loop.
+ */
+ buflen= cs->cset->wc_mb(cs, cs->max_sort_char, (uchar*) buf,
+ (uchar*) buf + sizeof(buf));
+ DBUG_ASSERT(buflen > 0);
+ do
+ {
+ if ((max_str + buflen) <= max_end)
+ {
+ /* Enough space for max character */
+ memcpy(max_str, buf, buflen);
+ max_str+= buflen;
+ }
+ else
+ {
+ /*
+ There is no space for a whole multibyte
+ character, so add trailing spaces instead.
+ */
+
+ *max_str++= ' ';
+ }
+ } while (max_str != max_end);
+ return 0;
+ }
+ *min_str++= *max_str++ = *ptr;
+ }
+ /* No wildcard found: both keys hold the same literal string */
+ *min_length= *max_length = (uint) (min_str - min_org);
+
+ while (min_str != min_end)
+ *min_str++ = *max_str++ = ' '; /* Because of key compression */
+ return 0;
+}
+
static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
@@ -465,8 +541,6 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
{
int result= -1; /* Not found, using wildcards */
- bool use_mb_flag=use_mb(cs);
-
while (wildstr != wildend)
{
while (*wildstr != w_many && *wildstr != w_one)
@@ -474,8 +548,7 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
int l;
if (*wildstr == escape && wildstr+1 != wildend)
wildstr++;
- if (use_mb_flag &&
- (l = my_ismbchar(cs, wildstr, wildend)))
+ if ((l = my_ismbchar(cs, wildstr, wildend)))
{
if (str+l > str_end || memcmp(str, wildstr, l) != 0)
return 1;
@@ -530,42 +603,31 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
cmp= *++wildstr;
mb=wildstr;
- LINT_INIT(mblen);
- if (use_mb_flag)
- mblen = my_ismbchar(cs, wildstr, wildend);
+ mblen= my_ismbchar(cs, wildstr, wildend);
INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */
do
{
- if (use_mb_flag)
- {
- for (;;)
+ for (;;)
+ {
+ if (str >= str_end)
+ return -1;
+ if (mblen)
{
- if (str >= str_end)
- return -1;
- if (mblen)
- {
- if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
- {
- str += mblen;
- break;
- }
- }
- else if (!my_ismbchar(cs, str, str_end) && *str == cmp)
+ if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
{
- str++;
+ str += mblen;
break;
}
- INC_PTR(cs,str, str_end);
}
- }
- else
- {
- while (str != str_end && *str != cmp)
+ else if (!my_ismbchar(cs, str, str_end) && *str == cmp)
+ {
str++;
- if (str++ == str_end) return (-1);
+ break;
+ }
+ INC_PTR(cs,str, str_end);
}
{
- int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,w_many);
+ int tmp=my_wildcmp_mb_bin(cs,str,str_end,wildstr,wildend,escape,w_one,w_many);
if (tmp <= 0)
return (tmp);
}
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 84bfcb0b171..a019665a235 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -27,8 +27,7 @@ int my_strnxfrm_simple(CHARSET_INFO * cs,
const uchar *src, uint srclen)
{
uchar *map= cs->sort_order;
- DBUG_ASSERT(len >= srclen);
- len= min(len,srclen);
+ set_if_smaller(len, srclen);
if (dest != src)
{
const uchar *end;
@@ -1284,6 +1283,12 @@ static my_bool my_coll_init_simple(CHARSET_INFO *cs,
}
+/*
+ strtoll10 entry for the 8bit charset handlers: forwards to
+ my_strtoll10() and ignores the (unused) charset argument.
+*/
+longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)),
+ const char *nptr, char **endptr, int *error)
+{
+ return my_strtoll10(nptr, endptr, error);
+}
+
MY_CHARSET_HANDLER my_charset_8bit_handler=
{
@@ -1310,6 +1315,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index b4cfee0f24a..4176ff2e538 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -4534,6 +4534,70 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)),
return 2;
}
+/*
+  Count the cells taken by a SJIS string.
+
+  Bytes up to 0x7F and bytes in the range 0xA1..0xDF are treated as
+  one-byte characters worth one cell each. Any other byte is treated
+  as the head of a two-byte character worth two cells.
+
+  str, strend   start and end (one past the last byte) of the string
+
+  Returns the total number of cells.
+*/
+static
+uint my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
+                      const char *str, const char *strend)
+{
+  const unsigned char *pos= (const unsigned char *) str;
+  const unsigned char *limit= (const unsigned char *) strend;
+  uint cells= 0;
+
+  while (pos < limit)
+  {
+    const unsigned char ch= *pos;
+    /* One-byte, one-cell character? Otherwise two bytes, two cells. */
+    const uint width= (ch <= 0x7F || (ch >= 0xA1 && ch <= 0xDF)) ? 1 : 2;
+
+    cells+= width;
+    pos+= width;
+  }
+  return cells;
+}
+
+/*
+  Returns the length in bytes of the longest well formed prefix of a
+  SJIS string that is at most 'pos' characters long.
+  CP932 additional characters are also accepted.
+
+  b, e   start and end (one past the last byte) of the string
+  pos    maximum number of characters to accept
+
+  Scanning stops at the first wrong or truncated byte sequence, at the
+  end of the string, or once 'pos' characters have been accepted.
+*/
+static
+uint my_well_formed_len_sjis(CHARSET_INFO *cs __attribute__((unused)),
+                             const char *b, const char *e, uint pos)
+{
+  const char *b0= b;
+  while (pos && b < e)
+  {
+    /*
+      Cast to int8 for extra safety.
+      "char" can be unsigned by default
+      on some platforms.
+    */
+    if (((int8)b[0]) >= 0)
+    {
+      /* Single byte character */
+      b+= 1;
+    }
+    else if (issjishead((uchar)*b) && (e-b)>1 && issjistail((uchar)b[1]))
+    {
+      /* Double byte character */
+      b+= 2;
+    }
+    else if (((uchar)*b) >= 0xA1 && ((uchar)*b) <= 0xDF)
+    {
+      /*
+        Single byte character in the 0xA1..0xDF range, which
+        my_numcells_sjis() also counts as a one-byte character.
+      */
+      b+= 1;
+    }
+    else
+    {
+      /* Wrong byte sequence */
+      break;
+    }
+    /*
+      One more character accepted. Without this decrement the
+      character limit was never enforced and the whole string was
+      scanned for any non-zero 'pos'.
+    */
+    pos--;
+  }
+  return (uint) (b - b0);
+}
+
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
@@ -4556,9 +4620,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
mbcharlen_sjis,
my_numchars_mb,
my_charpos_mb,
- my_well_formed_len_mb,
+ my_well_formed_len_sjis,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_sjis,
my_mb_wc_sjis, /* mb_wc */
my_wc_mb_sjis, /* wc_mb */
my_caseup_str_8bit,
@@ -4574,6 +4638,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 420c5b5582e..a2ba4783591 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -946,6 +946,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index cecc3be5045..91af7af0c54 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -6658,6 +6658,42 @@ static const char roman[]= /* i.e. Classical Latin */
"& V << u <<< U ";
/*
+ Persian collation support was provided by
+ Jody McIntyre <mysql@modernduck.com>
+
+ To: internals@lists.mysql.com
+ Subject: Persian UTF8 collation support
+ Date: 17.08.2004
+
+ Contraction is not implemented. Some implementations do perform
+ contraction but others do not, and it is able to sort all my test
+ strings correctly.
+
+ Jody.
+*/
+/*
+ Tailoring rules for the Persian collations
+ (used by ucs2_persian_ci and utf8_persian_ci).
+ Presumably "& X" selects a reset position and each "< Y" places Y
+ after the preceding character - TODO confirm against the rule parser.
+ NOTE(review): \uFE80 is used as a reset point and then listed again
+ with "<" in the same rule - confirm this is intended.
+*/
+static const char persian[]=
+ "& \\u066D < \\u064E < \\uFE76 < \\uFE77 < \\u0650 < \\uFE7A < \\uFE7B"
+ " < \\u064F < \\uFE78 < \\uFE79 < \\u064B < \\uFE70 < \\uFE71"
+ " < \\u064D < \\uFE74 < \\u064C < \\uFE72"
+ "& \\uFE7F < \\u0653 < \\u0654 < \\u0655 < \\u0670"
+ "& \\u0669 < \\u0622 < \\u0627 < \\u0671 < \\u0621 < \\u0623 < \\u0625"
+ " < \\u0624 < \\u0626"
+ "& \\u0642 < \\u06A9 < \\u0643"
+ "& \\u0648 < \\u0647 < \\u0629 < \\u06C0 < \\u06CC < \\u0649 < \\u064A"
+ "& \\uFE80 < \\uFE81 < \\uFE82 < \\uFE8D < \\uFE8E < \\uFB50 < \\uFB51"
+ " < \\uFE80 < \\uFE83 < \\uFE84 < \\uFE87 < \\uFE88 < \\uFE85"
+ " < \\uFE86 < \\u0689 < \\u068A"
+ "& \\uFEAE < \\uFDFC"
+ "& \\uFED8 < \\uFB8E < \\uFB8F < \\uFB90 < \\uFB91 < \\uFED9 < \\uFEDA"
+ " < \\uFEDB < \\uFEDC"
+ "& \\uFEEE < \\uFEE9 < \\uFEEA < \\uFEEB < \\uFEEC < \\uFE93 < \\uFE94"
+ " < \\uFBA4 < \\uFBA5 < \\uFBFC < \\uFBFD < \\uFBFE < \\uFBFF"
+ " < \\uFEEF < \\uFEF0 < \\uFEF1 < \\uFEF2 < \\uFEF3 < \\uFEF4"
+ " < \\uFEF5 < \\uFEF6 < \\uFEF7 < \\uFEF8 < \\uFEF9 < \\uFEFA"
+ " < \\uFEFB < \\uFEFC";
+
+
+/*
Unicode Collation Algorithm:
Collation element (weight) scanner,
for consequent scan of collations
@@ -6876,7 +6912,8 @@ static int my_uca_scanner_next_any(my_uca_scanner *scanner)
int mblen;
if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc,
- scanner->sbeg, scanner->send)) < 0))
+ scanner->sbeg,
+ scanner->send)) <= 0))
return -1;
scanner->page= wc >> 8;
@@ -7015,6 +7052,28 @@ static int my_strnncoll_uca(CHARSET_INFO *cs,
NOTES:
Works exactly the same with my_strnncoll_uca(),
but ignores trailing spaces.
+
+ In the while() comparison these situations are possible:
+ 1. (s_res>0) and (t_res>0) and (s_res == t_res)
+ Weights are the same so far, continue comparison
+ 2. (s_res>0) and (t_res>0) and (s_res!=t_res)
+ A difference has been found, return.
+ 3. (s_res>0) and (t_res<0)
+ We have reached the end of the second string, or found
+ an illegal multibyte sequence in the second string.
+ Compare the first string to an infinite array of
+ space characters until difference is found, or until
+ the end of the first string.
+ 4. (s_res<0) and (t_res>0)
+ We have reached the end of the first string, or found
+ an illegal multibyte sequence in the first string.
+ Compare the second string to an infinite array of
+ space characters until difference is found or until
+ the end of the second string.
+ 5. (s_res<0) and (t_res<0)
+ Both scanners returned -1. It means we have reached
+ the end-of-string or an illegal sequence in both strings
+ at the same time. Return 0, strings are equal.
RETURN
Difference between two strings, according to the collation:
@@ -7033,9 +7092,6 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs,
int s_res;
int t_res;
- slen= cs->cset->lengthsp(cs, (char*) s, slen);
- tlen= cs->cset->lengthsp(cs, (char*) t, tlen);
-
scanner_handler->init(&sscanner, cs, s, slen);
scanner_handler->init(&tscanner, cs, t, tlen);
@@ -7044,6 +7100,36 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs,
s_res= scanner_handler->next(&sscanner);
t_res= scanner_handler->next(&tscanner);
} while ( s_res == t_res && s_res >0);
+
+ if (s_res > 0 && t_res < 0)
+ {
+ /* Calculate weight for SPACE character */
+ t_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
+
+ /* compare the first string to spaces */
+ do
+ {
+ if (s_res != t_res)
+ return (s_res - t_res);
+ s_res= scanner_handler->next(&sscanner);
+ } while (s_res > 0);
+ return 0;
+ }
+
+ if (s_res < 0 && t_res > 0)
+ {
+ /* Calculate weight for SPACE character */
+ s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
+
+ /* compare the second string to spaces */
+ do
+ {
+ if (s_res != t_res)
+ return (s_res - t_res);
+ t_res= scanner_handler->next(&tscanner);
+ } while (t_res > 0);
+ return 0;
+ }
return ( s_res - t_res );
}
@@ -7670,7 +7756,7 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
return (size_t) nitems;
}
-#define MY_MAX_COLL_RULE 64
+#define MY_MAX_COLL_RULE 128
/*
This function copies an UCS2 collation from
@@ -7918,7 +8004,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
my_strnncoll_ucs2_uca,
my_strnncollsp_ucs2_uca,
my_strnxfrm_ucs2_uca,
- my_like_range_simple,
+ my_like_range_ucs2,
my_wildcmp_uca,
NULL,
my_instr_mb,
@@ -8359,6 +8445,35 @@ CHARSET_INFO my_charset_ucs2_roman_uca_ci=
&my_collation_ucs2_uca_handler
};
+
+/* ucs2 character set with the Persian tailoring */
+CHARSET_INFO my_charset_ucs2_persian_uca_ci=
+{
+ 144,0,0, /* number, primary_number, binary_number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state */
+ "ucs2", /* cs name */
+ "ucs2_persian_ci", /* name */
+ "", /* comment */
+ persian, /* tailoring */
+ NULL, /* ctype */
+ NULL, /* to_lower */
+ NULL, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 8, /* strxfrm_multiply */
+ 2, /* mbminlen */
+ 2, /* mbmaxlen */
+ 9, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ &my_charset_ucs2_handler, /* cset */
+ &my_collation_ucs2_uca_handler /* coll */
+};
+
+
#endif
@@ -8369,7 +8484,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler =
my_strnncoll_any_uca,
my_strnncollsp_any_uca,
my_strnxfrm_any_uca,
- my_like_range_simple,
+ my_like_range_mb,
my_wildcmp_uca,
NULL,
my_instr_mb,
@@ -8837,4 +8952,32 @@ CHARSET_INFO my_charset_utf8_roman_uca_ci=
&my_charset_utf8_handler,
&my_collation_any_uca_handler
};
+
+/*
+ utf8 character set with the Persian tailoring.
+ NOTE(review): mbmaxlen is 2 here, while max_sort_char is 0xFFFF,
+ which takes 3 bytes in UTF-8 - confirm against the other utf8
+ collations in this file.
+*/
+CHARSET_INFO my_charset_utf8_persian_uca_ci=
+{
+ 208,0,0, /* number, primary_number, binary_number */
+ MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state */
+ "utf8", /* cs name */
+ "utf8_persian_ci", /* name */
+ "", /* comment */
+ persian, /* tailoring */
+ ctype_utf8, /* ctype */
+ NULL, /* to_lower */
+ NULL, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 8, /* strxfrm_multiply */
+ 1, /* mbminlen */
+ 2, /* mbmaxlen */
+ 9, /* min_sort_char */
+ 0xFFFF, /* max_sort_char */
+ &my_charset_utf8_handler, /* cset */
+ &my_collation_any_uca_handler /* coll */
+};
+
#endif
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index c6e55ee8f0e..851c2044f47 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -18,6 +18,7 @@
/* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
#include <my_global.h>
+#include <my_sys.h>
#include "m_string.h"
#include "m_ctype.h"
#include <errno.h>
@@ -852,7 +853,6 @@ bs:
return (negative ? -((longlong) res) : (longlong) res);
}
-
double my_strntod_ucs2(CHARSET_INFO *cs __attribute__((unused)),
char *nptr, uint length,
char **endptr, int *err)
@@ -1000,6 +1000,188 @@ cnv:
}
+#undef ULONGLONG_MAX /* drop any earlier definition before redefining it */
+#define ULONGLONG_MAX (~(ulonglong) 0) /* all bits set */
+#define MAX_NEGATIVE_NUMBER ((ulonglong) LL(0x8000000000000000)) /* 2^63: largest magnitude accepted after a '-' sign */
+#define INIT_CNT 9 /* number of digits collected per unsigned long part */
+#define LFACTOR ULL(1000000000) /* 10^9: shifts i when j holds 9 digits */
+#define LFACTOR1 ULL(10000000000) /* 10^10: shifts i when j holds 9 digits and k holds 1 */
+#define LFACTOR2 ULL(100000000000) /* 10^11: shifts i when j holds 9 digits and k holds 2 */
+
+static unsigned long lfactor[9]= /* lfactor[n] == 10^n, for n = 0..8 */
+{
+ 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L
+};
+
+
+/*
+  Convert a decimal number stored as UCS2 text to longlong.
+
+  SYNOPSIS
+    my_strtoll10_ucs2()
+    cs       Character set (unused)
+    nptr     Start of the string. Every character is two bytes; only
+             characters whose first byte is 0 are recognised as spaces,
+             sign or digits
+    endptr   IN:  *endptr is the end of the string. Must be given, as
+                  0-terminated strings are not supported in UCS2
+             OUT: *endptr is the first byte that was not converted
+    error    OUT: 0 on success, -1 on success when the number had a '-'
+             sign, MY_ERRNO_ERANGE on overflow, MY_ERRNO_EDOM if there
+             was no number to convert
+
+  IMPLEMENTATION
+    Leading spaces/tabs, an optional sign and leading zeros are skipped.
+    Up to 20 digits are then collected in three unsigned long parts:
+    i (first 9 digits), j (next 9 digits) and k (last 1 or 2 digits).
+    A 21st digit is always an overflow.
+
+  RETURN
+    The converted value. A number without '-' is checked against
+    ULONGLONG_MAX and returned cast to longlong.
+    On overflow: LONGLONG_MIN for a negative number, otherwise
+    (longlong) ULONGLONG_MAX.
+    0 if there was no number to convert.
+*/
+
+longlong my_strtoll10_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+                           const char *nptr, char **endptr, int *error)
+{
+  const char *s, *end, *start, *n_end, *true_end;
+  unsigned char c;
+  unsigned long i, j, k;
+  ulonglong li;
+  int negative;
+  ulong cutoff, cutoff2, cutoff3;
+
+  s= nptr;
+  /* If fixed length string */
+  if (endptr)
+  {
+    /* Make sure string length is even */
+    end= s + ((*endptr - s) / 2) * 2;
+    while (s < end && !s[0] && (s[1] == ' ' || s[1] == '\t'))
+      s+= 2;
+    if (s == end)
+      goto no_conv;
+  }
+  else
+  {
+    /* We don't support null terminated strings in UCS2 */
+    goto no_conv;
+  }
+
+  /* Check for a sign. */
+  negative= 0;
+  if (!s[0] && s[1] == '-')
+  {
+    *error= -1;                                 /* Mark as negative number */
+    negative= 1;
+    s+= 2;
+    if (s == end)
+      goto no_conv;
+    cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
+    cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
+    cutoff3= MAX_NEGATIVE_NUMBER % 100;
+  }
+  else
+  {
+    *error= 0;
+    if (!s[0] && s[1] == '+')
+    {
+      s+= 2;
+      if (s == end)
+        goto no_conv;
+    }
+    cutoff=  ULONGLONG_MAX / LFACTOR2;
+    cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
+    cutoff3= ULONGLONG_MAX % 100;
+  }
+
+  /* Handle case where we have a lot of pre-zero */
+  if (!s[0] && s[1] == '0')
+  {
+    i= 0;
+    do
+    {
+      s+= 2;
+      if (s == end)
+        goto end_i;                             /* Return 0 */
+    }
+    while (!s[0] && s[1] == '0');
+    n_end= s + 2 * INIT_CNT;
+  }
+  else
+  {
+    /* Read first digit to check that it's a valid number */
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto no_conv;
+    i= c;
+    s+= 2;
+    n_end= s + 2 * (INIT_CNT-1);
+  }
+
+  /* Handle first 9 digits and store them in i */
+  if (n_end > end)
+    n_end= end;
+  for (; s != n_end ; s+= 2)
+  {
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto end_i;
+    i= i*10+c;
+  }
+  if (s == end)
+    goto end_i;
+
+  /* Handle next 9 digits and store them in j */
+  j= 0;
+  start= s;                             /* Used to know how much to shift i */
+  n_end= true_end= s + 2 * INIT_CNT;
+  if (n_end > end)
+    n_end= end;
+  do
+  {
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto end_i_and_j;
+    j= j*10+c;
+    s+= 2;
+  } while (s != n_end);
+  if (s == end)
+  {
+    if (s != true_end)
+      goto end_i_and_j;
+    goto end3;
+  }
+  if (s[0] || (c= (s[1]-'0')) > 9)
+    goto end3;
+
+  /* Handle the next 1 or 2 digits and store them in k */
+  k=c;
+  s+= 2;
+  if (s == end || s[0] || (c= (s[1]-'0')) > 9)
+    goto end4;
+  k= k*10+c;
+  s+= 2;
+  *endptr= (char*) s;
+
+  /* number string should have ended here */
+  if (s != end && !s[0] && (c= (s[1]-'0')) <= 9)
+    goto overflow;
+
+  /*
+    Check that we didn't get an overflow with the last digit.
+    (i, j, k) is compared part by part with (cutoff, cutoff2, cutoff3):
+    k only decides when both i and j are equal to their cutoffs; a larger
+    j is an overflow whatever the value of k.
+  */
+  if (i > cutoff || (i == cutoff && (j > cutoff2 || (j == cutoff2 &&
+                                                      k > cutoff3))))
+    goto overflow;
+  li=i*LFACTOR2+ (ulonglong) j*100 + k;
+  return (longlong) li;
+
+overflow:                                       /* *endptr is set here */
+  *error= MY_ERRNO_ERANGE;
+  return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
+
+end_i:
+  *endptr= (char*) s;
+  return (negative ? ((longlong) -(long) i) : (longlong) i);
+
+end_i_and_j:
+  /* j holds (s-start)/2 digits; shift i by that many decimal places */
+  li= (ulonglong) i * lfactor[(uint) (s-start) / 2] + j;
+  *endptr= (char*) s;
+  return (negative ? -((longlong) li) : (longlong) li);
+
+end3:
+  li=(ulonglong) i*LFACTOR+ (ulonglong) j;
+  *endptr= (char*) s;
+  return (negative ? -((longlong) li) : (longlong) li);
+
+end4:
+  li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
+  *endptr= (char*) s;
+  if (negative)
+  {
+    if (li > MAX_NEGATIVE_NUMBER)
+      goto overflow;
+    return -((longlong) li);
+  }
+  return (longlong) li;
+
+no_conv:
+  /* There was no number to convert. */
+  *error= MY_ERRNO_EDOM;
+  if (endptr)                           /* NULL if we came from the else above */
+    *endptr= (char *) nptr;
+  return 0;
+}
+
+
static
uint my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
const char *b, const char *e)
@@ -1049,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
}
-/*
-** Compare string against string with wildcard
-** 0 if matched
-** -1 if not matched with wildcard
-** 1 if matched with wildcard
-*/
-
-static
-int my_wildcmp_ucs2(CHARSET_INFO *cs,
- const char *str,const char *str_end,
- const char *wildstr,const char *wildend,
- int escape, int w_one, int w_many,
- MY_UNICASE_INFO **weights)
-{
- int result= -1; /* Not found, using wildcards */
- my_wc_t s_wc, w_wc;
- int scan, plane;
-
- while (wildstr != wildend)
- {
-
- while (1)
- {
- scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
- (const uchar*)wildend);
- if (scan <= 0)
- return 1;
-
- if (w_wc == (my_wc_t)escape)
- {
- wildstr+= scan;
- scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
- (const uchar*)wildend);
- if (scan <= 0)
- return 1;
- }
-
- if (w_wc == (my_wc_t)w_many)
- {
- result= 1; /* Found an anchor char */
- break;
- }
-
- wildstr+= scan;
- scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end);
- if (scan <=0)
- return 1;
- str+= scan;
-
- if (w_wc == (my_wc_t)w_one)
- {
- result= 1; /* Found an anchor char */
- }
- else
- {
- if (weights)
- {
- plane=(s_wc>>8) & 0xFF;
- s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
- plane=(w_wc>>8) & 0xFF;
- w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
- }
- if (s_wc != w_wc)
- return 1; /* No match */
- }
- if (wildstr == wildend)
- return (str != str_end); /* Match if both are at end */
- }
-
-
- if (w_wc == (my_wc_t)w_many)
- { /* Found w_many */
-
- /* Remove any '%' and '_' from the wild search string */
- for ( ; wildstr != wildend ; )
- {
- scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
- (const uchar*)wildend);
- if (scan <= 0)
- return 1;
-
- if (w_wc == (my_wc_t)w_many)
- {
- wildstr+= scan;
- continue;
- }
-
- if (w_wc == (my_wc_t)w_one)
- {
- wildstr+= scan;
- scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end);
- if (scan <=0)
- return 1;
- str+= scan;
- continue;
- }
- break; /* Not a wild character */
- }
-
- if (wildstr == wildend)
- return 0; /* Ok if w_many is last */
-
- if (str == str_end)
- return -1;
-
- scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
- (const uchar*)wildend);
- if (scan <= 0)
- return 1;
-
- if (w_wc == (my_wc_t)escape)
- {
- wildstr+= scan;
- scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
- (const uchar*)wildend);
- if (scan <= 0)
- return 1;
- }
-
- while (1)
- {
- /* Skip until the first character from wildstr is found */
- while (str != str_end)
- {
- scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str,
- (const uchar*)str_end);
- if (scan <= 0)
- return 1;
- if (weights)
- {
- plane=(s_wc>>8) & 0xFF;
- s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
- plane=(w_wc>>8) & 0xFF;
- w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
- }
-
- if (s_wc == w_wc)
- break;
- str+= scan;
- }
- if (str == str_end)
- return -1;
-
- result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape,
- w_one,w_many,weights);
-
- if (result <= 0)
- return result;
-
- str+= scan;
- }
- }
- }
- return (str != str_end ? 1 : 0);
-}
-
-
static
int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many)
{
- return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
- escape,w_one,w_many,uni_plane);
+ return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, /* matcher shared by UTF8 and UCS2 */
+ escape,w_one,w_many,uni_plane); /* uni_plane as weights: characters are compared by their .sort value */
}
@@ -1224,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many)
{
- return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
- escape,w_one,w_many,NULL);
+ return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+ escape,w_one,w_many,NULL);
}
@@ -1345,10 +1369,10 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
}
if (ptr[0] == '\0' && ptr[1] == w_one) /* '_' in SQL */
{
- *min_str++= (char) cs->min_sort_char >> 8;
- *min_str++= (char) cs->min_sort_char & 255;
- *max_str++= (char) cs->max_sort_char >> 8;
- *max_str++= (char) cs->max_sort_char & 255;
+ *min_str++= (char) (cs->min_sort_char >> 8);
+ *min_str++= (char) (cs->min_sort_char & 255);
+ *max_str++= (char) (cs->max_sort_char >> 8);
+ *max_str++= (char) (cs->max_sort_char & 255);
continue;
}
if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */
@@ -1358,8 +1382,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
do {
*min_str++ = 0;
*min_str++ = 0;
- *max_str++ = (char) cs->max_sort_char >>8;
- *max_str++ = (char) cs->max_sort_char & 255;
+ *max_str++ = (char) (cs->max_sort_char >> 8);
+ *max_str++ = (char) (cs->max_sort_char & 255);
} while (min_str + 1 < min_end);
return 0;
}
@@ -1439,6 +1463,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_strntoll_ucs2,
my_strntoull_ucs2,
my_strntod_ucs2,
+ my_strtoll10_ucs2,
my_scan_8bit
};
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 37c26a3bbc4..94673a20795 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -8252,6 +8252,40 @@ my_jisx0212_uni_onechar(int code){
[xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char)
*/
+/*
+  Count the cells (presumably display columns - confirm with the callers
+  of the numcells handler) taken by the EUC-JP string [str, strend).
+
+  The lead byte of every character selects its size:
+    0x8E              2 bytes, 1 cell
+    0x8F              3 bytes, 2 cells
+    other, bit 7 set  2 bytes, 2 cells
+    bit 7 clear       1 byte,  1 cell
+
+  A sequence that is cut off by strend is still counted, but the scan
+  never steps beyond strend.
+*/
+static
+uint my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)),
+                       const char *str, const char *strend)
+{
+  uint clen= 0;
+  const unsigned char *b= (const unsigned char *) str;
+  const unsigned char *e= (const unsigned char *) strend;
+
+  while (b < e)
+  {
+    uint cells;
+    int bytes;
+
+    if (*b == 0x8E)
+    {
+      cells= 1;
+      bytes= 2;
+    }
+    else if (*b == 0x8F)
+    {
+      cells= 2;
+      bytes= 3;
+    }
+    else if (*b & 0x80)
+    {
+      cells= 2;
+      bytes= 2;
+    }
+    else
+    {
+      cells= 1;
+      bytes= 1;
+    }
+    clen+= cells;
+
+    /*
+      Stop instead of advancing when this character reaches or crosses
+      strend: forming a pointer beyond the end of the buffer is undefined.
+    */
+    if (e - b <= bytes)
+      break;
+    b+= bytes;
+  }
+  return clen;
+}
+
static int
my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e)
{
@@ -8443,7 +8477,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_charpos_mb,
my_well_formed_len_mb,
my_lengthsp_8bit,
- my_numcells_mb,
+ my_numcells_eucjp,
my_mb_wc_euc_jp, /* mb_wc */
my_wc_mb_euc_jp, /* wc_mb */
my_caseup_str_mb,
@@ -8459,6 +8493,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 5e339725b1a..b3097649158 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={
};
+
+/*
+** Compare a string against a pattern with wildcards.
+** This function is used in UTF8 and UCS2.
+**
+** cs               Character set; cs->cset->mb_wc decodes the characters
+** str, str_end     String to test
+** wildstr, wildend Pattern
+** escape           The character that follows it in the pattern is taken
+**                  literally, even if it is w_one or w_many. An escape
+**                  that is the last character of the pattern stands for
+**                  itself
+** w_one            Pattern character matching exactly one character
+** w_many           Pattern character matching any number of characters
+** weights          If not NULL, characters are compared by
+**                  weights[high byte][low byte].sort instead of by code
+**
+** Returns:
+**  0 if matched
+**  1 if not matched, or if a character could not be decoded
+** -1 if not matched and str was exhausted while looking for the
+**    character that follows a w_many; the enclosing w_many searches
+**    stop on this value as well
+*/
+
+int my_wildcmp_unicode(CHARSET_INFO *cs,
+                       const char *str,const char *str_end,
+                       const char *wildstr,const char *wildend,
+                       int escape, int w_one, int w_many,
+                       MY_UNICASE_INFO **weights)
+{
+  int result= -1;                       /* Not found, using wildcards */
+  my_wc_t s_wc, w_wc;
+  int scan, plane;
+  int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
+               const unsigned char *s,const unsigned char *e);
+  mb_wc= cs->cset->mb_wc;
+
+  while (wildstr != wildend)
+  {
+    /* Match everything up to the next unescaped w_many */
+    while (1)
+    {
+      int escaped= 0;                   /* Set if w_wc followed an escape */
+
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <= 0)
+        return 1;
+
+      if (w_wc == (my_wc_t)w_many)
+      {
+        result= 1;                      /* Found an anchor char */
+        break;
+      }
+
+      wildstr+= scan;
+      if (w_wc == (my_wc_t)escape && wildstr != wildend)
+      {
+        /* Take the character after the escape literally */
+        if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+        wildstr+= scan;
+        escaped= 1;
+      }
+
+      if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                       (const uchar*)str_end)) <=0)
+        return 1;
+      str+= scan;
+
+      if (!escaped && w_wc == (my_wc_t)w_one)
+      {
+        result= 1;                      /* Found an anchor char */
+      }
+      else
+      {
+        if (weights)
+        {
+          plane=(s_wc>>8) & 0xFF;
+          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+          plane=(w_wc>>8) & 0xFF;
+          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+        }
+        if (s_wc != w_wc)
+          return 1;                     /* No match */
+      }
+      if (wildstr == wildend)
+        return (str != str_end);        /* Match if both are at end */
+    }
+
+
+    if (w_wc == (my_wc_t)w_many)
+    {                                   /* Found w_many */
+
+      /* Remove any '%' and '_' from the wild search string */
+      for ( ; wildstr != wildend ; )
+      {
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+
+        if (w_wc == (my_wc_t)w_many)
+        {
+          wildstr+= scan;
+          continue;
+        }
+
+        if (w_wc == (my_wc_t)w_one)
+        {
+          /* Every w_one in the run consumes one character of str */
+          wildstr+= scan;
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          str+= scan;
+          continue;
+        }
+        break;                          /* Not a wild character */
+      }
+
+      if (wildstr == wildend)
+        return 0;                       /* Ok if w_many is last */
+
+      if (str == str_end)
+        return -1;
+
+      /*
+        Read the character that must follow the w_many and step over it,
+        so that wildstr is the rest of the pattern for the recursive call.
+      */
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <=0)
+        return 1;
+      wildstr+= scan;
+
+      if (w_wc == (my_wc_t)escape && wildstr != wildend)
+      {
+        /* Escaped character: search for the character after the escape */
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <=0)
+          return 1;
+        wildstr+= scan;
+      }
+
+      while (1)
+      {
+        /* Skip until the first character from wildstr is found */
+        while (str != str_end)
+        {
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          if (weights)
+          {
+            plane=(s_wc>>8) & 0xFF;
+            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+            plane=(w_wc>>8) & 0xFF;
+            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+          }
+
+          if (s_wc == w_wc)
+            break;
+          str+= scan;
+        }
+        if (str == str_end)
+          return -1;
+
+        /* Step over the character found and match the rest of the pattern */
+        str+= scan;
+        result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
+                                   escape, w_one, w_many,
+                                   weights);
+
+        /* 0: matched, -1: str exhausted; 1: try the next occurrence */
+        if (result <= 0)
+          return result;
+      }
+    }
+  }
+  return (str != str_end ? 1 : 0);
+}
+
#endif
@@ -1948,50 +2103,120 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
}
-static int my_strncasecmp_utf8(CHARSET_INFO *cs,
- const char *s, const char *t, uint len)
-{
- int s_res,t_res;
- my_wc_t s_wc,t_wc;
- const char *se=s+len;
- const char *te=t+len;
+/*
+ Compare 0-terminated UTF8 strings, ignoring letter case.
- while ( s < se && t < te )
- {
- int plane;
+ SYNOPSIS
+ my_strcasecmp_utf8()
+ cs character set handler
+ s First 0-terminated string to compare
+ t Second 0-terminated string to compare
- s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se);
- t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te);
+ IMPLEMENTATION
+ Both strings are read one character at a time. Every character is
+ folded with the 'tolower' member of plane00 / uni_plane and the
+ folded codes are compared. If a multibyte sequence cannot be decoded,
+ the rest of both strings, each taken from the start of its current
+ character, is compared byte by byte with strcmp().
- if ( s_res <= 0 || t_res <= 0 )
+ RETURN
+ - negative number if s < t
+ - positive number if s > t
+ - 0 if the strings are equal
+*/
+
+static
+int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
+{
+ while (s[0] && t[0])
+ {
+ my_wc_t s_wc,t_wc;
+ /* Start of the current character of s; s itself is advanced below */
+ const char *s_chr= s;
+
+ /*
+ Cast to int8 for extra safety.
+ char can be unsigned by default
+ on some platforms.
+ */
+ if (((int8)s[0]) >= 0)
{
- /* Incorrect string, compare byte by byte value */
- return bincmp(s, se, t, te);
+ /*
+ s[0] is between 1 and 127.
+ It represents a single byte character.
+ Fold it to lower case using plane00.
+ */
+ s_wc= plane00[(uchar) s[0]].tolower;
+ s++;
}
-
- plane=(s_wc>>8) & 0xFF;
- s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
-
- plane=(t_wc>>8) & 0xFF;
- t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
-
+ else
+ {
+ int plane, res;
+
+ /*
+ Scan a multibyte character.
+
+ In the future it is worth writing a special version of my_utf8_uni()
+ for 0-terminated strings which will not take length into account. Now
+ we call the regular version of my_utf8_uni() with s+3 in the
+ last argument. s+3 is enough to scan any multibyte sequence.
+
+ Calling the regular version of my_utf8_uni is safe for 0-terminated
+ strings: we will never lose the end of the string:
+ If we have 0 character in the middle of a multibyte sequence,
+ then my_utf8_uni will always return a negative number, so the
+ loop will finish.
+ */
+
+ res= my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*) s + 3);
+
+ /*
+ In the case of wrong multibyte sequence we will
+ call strcmp() for byte-to-byte comparison.
+ Neither s nor t has been advanced in this iteration yet.
+ */
+ if (res <= 0)
+ return strcmp(s, t);
+ s+= res;
+
+ /* Fold the Unicode code to lower case using uni_plane */
+ plane=(s_wc>>8) & 0xFF;
+ s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
+ }
+
+
+ /* Do the same for the second string */
+
+ if (((int8)t[0]) >= 0)
+ {
+ /* Fold single byte character to lower case */
+ t_wc= plane00[(uchar) t[0]].tolower;
+ t++;
+ }
+ else
+ {
+ int plane;
+ int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3);
+ /*
+ s already points past its current character here, so fall back
+ to strcmp() from s_chr to compare the same position in both strings.
+ */
+ if (res <= 0)
+ return strcmp(s_chr, t);
+ t+= res;
+
+ /* Fold the Unicode code to lower case using uni_plane */
+ plane=(t_wc>>8) & 0xFF;
+ t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
+ }
+
+ /* Now we have two folded codes, let's compare them */
if ( s_wc != t_wc )
return ((int) s_wc) - ((int) t_wc);
-
- s+=s_res;
- t+=t_res;
}
- return ( (se-s) - (te-t) );
+ /* At least one string has ended: the longer one is the greater */
+ return ((int)(uchar)s[0]) - ((int) (uchar) t[0]);
}
-static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
+
+static
+int my_wildcmp_utf8(CHARSET_INFO *cs,
+ const char *str,const char *str_end,
+ const char *wildstr,const char *wildend,
+ int escape, int w_one, int w_many)
{
- uint s_len=strlen(s);
- uint t_len=strlen(t);
- uint len = (s_len > t_len) ? s_len : t_len;
- return my_strncasecmp_utf8(cs, s, t, len);
+ return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, /* matcher shared by UTF8 and UCS2 */
+ escape,w_one,w_many,uni_plane); /* uni_plane as weights: characters are compared by their .sort value */
}
+
static int my_strnxfrm_utf8(CHARSET_INFO *cs,
uchar *dst, uint dstlen,
const uchar *src, uint srclen)
@@ -2059,8 +2284,8 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_utf8,
my_strnncollsp_utf8,
my_strnxfrm_utf8,
- my_like_range_simple,
- my_wildcmp_mb,
+ my_like_range_mb,
+ my_wildcmp_utf8,
my_strcasecmp_utf8,
my_instr_mb,
my_hash_sort_utf8
@@ -2091,6 +2316,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_strntoll_8bit,
my_strntoull_8bit,
my_strntod_8bit,
+ my_strtoll10_8bit,
my_scan_8bit
};
@@ -2118,7 +2344,7 @@ CHARSET_INFO my_charset_utf8_general_ci=
1, /* mbminlen */
3, /* mbmaxlen */
0, /* min_sort_char */
- 255, /* max_sort_char */
+ 0xFFFF, /* max_sort_char */
&my_charset_utf8_handler,
&my_collation_ci_handler
};
diff --git a/strings/xml.c b/strings/xml.c
index 7d7839e1603..6ba52ea41a8 100644
--- a/strings/xml.c
+++ b/strings/xml.c
@@ -81,10 +81,11 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
a->beg=p->cur;
a->end=p->cur;
- if (!memcmp(p->cur,"<!--",4))
+ if (!bcmp(p->cur,"<!--",4))
{
- for( ; (p->cur < p->end) && memcmp(p->cur, "-->", 3); p->cur++);
- if(!memcmp(p->cur, "-->", 3))
+ for( ; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++)
+ {}
+ if (!bcmp(p->cur, "-->", 3))
p->cur+=3;
a->end=p->cur;
lex=MY_XML_COMMENT;