summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorMichael Widenius <monty@mysql.com>2008-10-10 18:28:41 +0300
committerMichael Widenius <monty@mysql.com>2008-10-10 18:28:41 +0300
commitf47e003e1bfc56c2bf5d0f144a35517f526b538b (patch)
treee2bfb9834c6e558381465ed2f57a9d873a9b2c90 /strings
parent51a92bbb03cc58ab8688fa9d8226afe32e6156ca (diff)
parent9daa56fd5ce3ccd33c32b5a505ac1d2b2c437460 (diff)
downloadmariadb-git-f47e003e1bfc56c2bf5d0f144a35517f526b538b.tar.gz
Merged 5.1 with maria 5.1
Diffstat (limited to 'strings')
-rw-r--r--strings/CHARSET_INFO.txt138
-rw-r--r--strings/ctype-big5.c6
-rw-r--r--strings/ctype-gbk.c6
-rw-r--r--strings/decimal.c22
4 files changed, 115 insertions, 57 deletions
diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt
index 1336d5ae3bb..bb8e40025c7 100644
--- a/strings/CHARSET_INFO.txt
+++ b/strings/CHARSET_INFO.txt
@@ -3,9 +3,8 @@ CHARSET_INFO
============
A structure containing data for charset+collation pair implementation.
-Virtual functions which use this data are collected
-into separate structures MY_CHARSET_HANDLER and
-MY_COLLATION_HANDLER.
+Virtual functions that use this data are collected into separate
+structures, MY_CHARSET_HANDLER and MY_COLLATION_HANDLER.
typedef struct charset_info_st
@@ -56,7 +55,7 @@ character set. Not really used now. Intended to optimize some
parts of the code where we need to find the default collation
using its non-default counterpart for the given character set.
-binary_numner - ID of a charset+collation pair, which consists
+binary_number - ID of a charset+collation pair, which consists
of the same character set and the binary collation of this
character set. Not really used now.
@@ -65,15 +64,15 @@ Names
csname - name of the character set for this charset+collation pair.
name - name of the collation for this charset+collation pair.
- comment - a text comment, dysplayed in "Description" column of
+ comment - a text comment, displayed in "Description" column of
SHOW CHARACTER SET output.
Conversion tables
-----------------
ctype - pointer to array[257] of "type of characters"
- bit mask for each chatacter, e.g. if a
- character is a digit or a letter or a separator, etc.
+ bit mask for each character, e.g., whether a
+ character is a digit, letter, separator, etc.
Monty 2004-10-21:
If you look at the macros, we use ctype[(char)+1].
@@ -87,17 +86,64 @@ Conversion tables
to_upper - pointer to array[256] used in UCASE()
sort_order - pointer to array[256] used for strings comparison
+In all Asian charsets these arrays are set up as follows:
+
+- All bytes in the range 0x80..0xFF are marked as letters in the
+ ctype array.
+
+- The to_lower and to_upper arrays map only ASCII letters.
+ UPPER() and LOWER() don't really work for multi-byte characters.
+ Most of the characters in Asian character sets are ideograms
+ anyway and they don't have case mapping. However, there are
+ still some characters from European alphabets.
+ For example:
+ _ujis 0x8FAAF2 - LATIN CAPITAL LETTER Y WITH ACUTE
+ _ujis 0x8FABF2 - LATIN SMALL LETTER Y WITH ACUTE
+
+ But they don't map to each other with UPPER and LOWER operations.
+
+- The sort_order array is filled case insensitively for the
+ ASCII range 0x00..0x7F, and in "binary" fashion for the multi-byte
+ range 0x80..0xFF for these collations:
+
+ cp932_japanese_ci,
+ euckr_korean_ci,
+ eucjpms_japanese_ci,
+ gb2312_chinese_ci,
+ sjis_japanese_ci,
+ ujis_japanese_ci.
+
+ So multi-byte characters are sorted just according to their codes.
+
+
+- Two collations are still case insensitive for the ASCII characters,
+ but have special sorting order for multi-byte characters
+ (something more complex than just according to codes):
+
+ big5_chinese_ci
+ gbk_chinese_ci
+
+ So handlers for these collations use only the 0x00..0x7F part
+ of their sort_order arrays, and apply the special functions
+ for multi-byte characters.
+
+In Unicode character sets we have full support for UPPER/LOWER mapping,
+sorting order, and character type detection.
+"utf8_general_ci" still has the "old-fashioned" arrays
+like to_upper, to_lower, sort_order and ctype, but they are
+not really used (maybe only in some rare legacy functions).
+
Unicode conversion data
-----------------------
-For 8bit character sets:
+For 8-bit character sets:
tab_to_uni : array[256] of charset->Unicode translation
tab_from_uni: a structure for Unicode->charset translation
-Non-8 bit charsets have their own structures per charset
-hidden in correspondent ctype-xxx.c file and don't use
+Non-8-bit charsets have their own structures per charset
+hidden in the corresponding ctype-xxx.c file and don't use
tab_to_uni and tab_from_uni tables.
@@ -106,9 +152,9 @@ Parser maps
state_map[]
ident_map[]
- These maps are to quickly identify if a character is
-an identificator part, a digit, a special character,
-or a part of other SQL language lexical item.
+These maps are used to quickly identify whether a character is an
+identifier part, a digit, a special character, or a part of another
+SQL language lexical item.
Probably can be combined with ctype array in the future.
But for some reasons these two arrays are used in the parser,
@@ -116,32 +162,32 @@ while a separate ctype[] array is used in the other part of the
code, like fulltext, etc.
-Misc fields
------------
+Miscellaneous fields
+--------------------
- strxfrm_multiply - how many times a sort key (i.e. a string
- which can be passed into memcmp() for comparison)
+ strxfrm_multiply - how many times a sort key (that is, a string
+ that can be passed into memcmp() for comparison)
can be longer than the original string.
Usually it is 1. For some complex
- collations it can be bigger. For example
+ collations it can be bigger. For example,
in latin1_german2_ci, a sort key is up to
- twice longer than the original string.
+ two times longer than the original string.
e.g. Letter 'A' with two dots above is
substituted with 'AE'.
- mbminlen - mininum multibyte sequence length.
- Now always 1 except ucs2. For ucs2
+ mbminlen - minimum multi-byte sequence length.
+ Now always 1 except for ucs2. For ucs2,
it is 2.
- mbmaxlen - maximum multibyte sequence length.
- 1 for 8bit charsets. Can be also 2 or 3.
+ mbmaxlen - maximum multi-byte sequence length.
+ 1 for 8-bit charsets. Can also be 2 or 3.
max_sort_char - for LIKE range
- in case of 8bit character sets - native code
+ in case of 8-bit character sets - native code
of maximum character (max_str pad byte);
in case of UTF8 and UCS2 - Unicode code of the maximum
possible character (usually U+FFFF). This code is
- converted to multibyte representation (usually 0xEFBFBF)
+ converted to multi-byte representation (usually 0xEFBFBF)
and then used as a pad sequence for max_str.
- in case of other multibyte character sets -
+ in case of other multi-byte character sets -
max_str pad byte (usually 0xFF).
MY_CHARSET_HANDLER
@@ -151,10 +197,10 @@ MY_CHARSET_HANDLER is a collection of character-set
related routines. Defined in m_ctype.h. Have the
following set of functions:
-Multibyte routines
+Multi-byte routines
------------------
-ismbchar() - detects if the given string is a multibyte sequence
-mbcharlen() - returns length of multibyte sequence starting with
+ismbchar() - detects whether the given string is a multi-byte sequence
+mbcharlen() - returns length of multi-byte sequence starting with
the given character
numchars() - returns number of characters in the given string, e.g.
in SQL function CHAR_LENGTH().
@@ -163,29 +209,29 @@ charpos() - calculates the offset of the given position in the string.
INSERT()
well_formed_length()
- - finds the length of correctly formed multybyte beginning.
+ - finds the length of correctly formed multi-byte beginning.
Used in INSERTs to cut a beginning of the given string
which is
a) "well formed" according to the given character set.
- b) can fit into the given data type
+ b) can fit into the given data type
Terminates the string in the good position, taking in account
- multibyte character boundaries.
+ multi-byte character boundaries.
-lengthsp() - returns the length of the given string without traling spaces.
+lengthsp() - returns the length of the given string without trailing spaces.
Unicode conversion routines
---------------------------
-mb_wc - converts the left multibyte sequence into it Unicode code.
-mc_mb - converts the given Unicode code into multibyte sequence.
+mb_wc - converts the left multi-byte sequence into its Unicode code.
+wc_mb - converts the given Unicode code into a multi-byte sequence.
Case and sort conversion
------------------------
-caseup_str - converts the given 0-terminated string into the upper case
-casedn_str - converts the given 0-terminated string into the lower case
-caseup - converts the given string into the lower case using length
-casedn - converts the given string into the lower case using length
+caseup_str - converts the given 0-terminated string to uppercase
+casedn_str - converts the given 0-terminated string to lowercase
+caseup - converts the given string to uppercase using length
+casedn - converts the given string to lowercase using length
Number-to-string conversion routines
------------------------------------
@@ -193,7 +239,7 @@ snprintf()
long10_to_str()
longlong10_to_str()
-The names are pretty self-descripting.
+The names are pretty self-describing.
String padding routines
-----------------------
@@ -201,7 +247,7 @@ fill() - writes the given Unicode value into the given string
with the given length. Used to pad the string, usually
with space character, according to the given charset.
-String-to-numner conversion routines
+String-to-number conversion routines
------------------------------------
strntol()
strntoul()
@@ -209,10 +255,10 @@ strntoll()
strntoull()
strntod()
-These functions are almost for the same thing with their
-STDLIB counterparts, but also:
+These functions are almost the same as their STDLIB counterparts,
+but also:
- accept length instead of 0-terminator
- - and are character set dependant
+ - are character set dependent
Simple scanner routines
-----------------------
@@ -230,8 +276,8 @@ strnxfrm() - makes a sort key suitable for memcmp() corresponding
like_range() - creates a LIKE range, for optimizer
wildcmp() - wildcard comparison, for LIKE
strcasecmp() - 0-terminated string comparison
-instr() - finds the first substring appearence in the string
-hash_sort() - calculates hash value taking in account
+instr() - finds the first substring appearance in the string
+hash_sort() - calculates hash value taking into account
the collation rules, e.g. case-insensitivity,
accent sensitivity, etc.
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index ecfd3d648e0..3da307b82fc 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -307,15 +307,17 @@ static size_t my_strnxfrm_big5(CHARSET_INFO *cs __attribute__((unused)),
{
uint16 e;
size_t dstlen= len;
+ uchar *dest_end= dest + dstlen;
len = srclen;
- while (len--)
+ while (len-- && dest < dest_end)
{
if ((len > 0) && isbig5code(*src, *(src+1)))
{
e = big5strokexfrm((uint16) big5code(*src, *(src+1)));
*dest++ = big5head(e);
- *dest++ = big5tail(e);
+ if (dest < dest_end)
+ *dest++ = big5tail(e);
src +=2;
len--;
} else
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index c7a2558eb37..7b8bb85652b 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -2668,15 +2668,17 @@ static size_t my_strnxfrm_gbk(CHARSET_INFO *cs __attribute__((unused)),
{
uint16 e;
size_t dstlen= len;
+ uchar *dest_end= dest + dstlen;
len = srclen;
- while (len--)
+ while (len-- && dest < dest_end)
{
if ((len > 0) && isgbkcode(*src, *(src+1)))
{
e = gbksortorder((uint16) gbkcode(*src, *(src+1)));
*dest++ = gbkhead(e);
- *dest++ = gbktail(e);
+ if (dest < dest_end)
+ *dest++ = gbktail(e);
src+=2;
len--;
} else
diff --git a/strings/decimal.c b/strings/decimal.c
index 0559dd97613..a7770fbb2e1 100644
--- a/strings/decimal.c
+++ b/strings/decimal.c
@@ -2005,18 +2005,18 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to)
sanity(to);
- i=intg0;
+ i=intg0; /* save 'ideal' values */
j=frac0;
- FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error);
+ FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error); /* bound size */
to->sign=from1->sign != from2->sign;
- to->frac=from1->frac+from2->frac;
+ to->frac=from1->frac+from2->frac; /* store size in digits */
to->intg=intg0*DIG_PER_DEC1;
if (unlikely(error))
{
set_if_smaller(to->frac, frac0*DIG_PER_DEC1);
set_if_smaller(to->intg, intg0*DIG_PER_DEC1);
- if (unlikely(i > intg0))
+ if (unlikely(i > intg0)) /* bounded integer-part */
{
i-=intg0;
j=i >> 1;
@@ -2024,12 +2024,20 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to)
intg2-=i-j;
frac1=frac2=0; /* frac0 is already 0 here */
}
- else
+ else /* bounded fract part */
{
j-=frac0;
i=j >> 1;
- frac1-= i;
- frac2-=j-i;
+ if (frac1 <= frac2)
+ {
+ frac1-= i;
+ frac2-=j-i;
+ }
+ else
+ {
+ frac2-= i;
+ frac1-=j-i;
+ }
}
}
start0=to->buf+intg0+frac0-1;