diff options
author | Michael Widenius <monty@mysql.com> | 2008-10-10 18:28:41 +0300 |
---|---|---|
committer | Michael Widenius <monty@mysql.com> | 2008-10-10 18:28:41 +0300 |
commit | f47e003e1bfc56c2bf5d0f144a35517f526b538b (patch) | |
tree | e2bfb9834c6e558381465ed2f57a9d873a9b2c90 /strings | |
parent | 51a92bbb03cc58ab8688fa9d8226afe32e6156ca (diff) | |
parent | 9daa56fd5ce3ccd33c32b5a505ac1d2b2c437460 (diff) | |
download | mariadb-git-f47e003e1bfc56c2bf5d0f144a35517f526b538b.tar.gz |
Merged 5.1 with maria 5.1
Diffstat (limited to 'strings')
-rw-r--r-- | strings/CHARSET_INFO.txt | 138 |
-rw-r--r-- | strings/ctype-big5.c | 6 |
-rw-r--r-- | strings/ctype-gbk.c | 6 |
-rw-r--r-- | strings/decimal.c | 22 |
4 files changed, 115 insertions, 57 deletions
diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt index 1336d5ae3bb..bb8e40025c7 100644 --- a/strings/CHARSET_INFO.txt +++ b/strings/CHARSET_INFO.txt @@ -3,9 +3,8 @@ CHARSET_INFO ============ A structure containing data for charset+collation pair implementation. -Virtual functions which use this data are collected -into separate structures MY_CHARSET_HANDLER and -MY_COLLATION_HANDLER. +Virtual functions that use this data are collected into separate +structures, MY_CHARSET_HANDLER and MY_COLLATION_HANDLER. typedef struct charset_info_st @@ -56,7 +55,7 @@ character set. Not really used now. Intended to optimize some parts of the code where we need to find the default collation using its non-default counterpart for the given character set. -binary_numner - ID of a charset+collation pair, which consists +binary_number - ID of a charset+collation pair, which consists of the same character set and the binary collation of this character set. Not really used now. @@ -65,15 +64,15 @@ Names csname - name of the character set for this charset+collation pair. name - name of the collation for this charset+collation pair. - comment - a text comment, dysplayed in "Description" column of + comment - a text comment, displayed in "Description" column of SHOW CHARACTER SET output. Conversion tables ----------------- ctype - pointer to array[257] of "type of characters" - bit mask for each chatacter, e.g. if a - character is a digit or a letter or a separator, etc. + bit mask for each character, e.g., whether a + character is a digit, letter, separator, etc. Monty 2004-10-21: If you look at the macros, we use ctype[(char)+1]. @@ -87,17 +86,64 @@ Conversion tables to_upper - pointer to array[256] used in UCASE() sort_order - pointer to array[256] used for strings comparison +In all Asian charsets these arrays are set up as follows: + +- All bytes in the range 0x80..0xFF were marked as letters in the + ctype array. 
+ +- The to_lower and to_upper arrays map only ASCII letters. + UPPER() and LOWER() doesn't really work for multi-byte characters. + Most of the characters in Asian character sets are ideograms + anyway and they don't have case mapping. However, there are + still some characters from European alphabets. + For example: + _ujis 0x8FAAF2 - LATIN CAPITAL LETTER Y WITH ACUTE + _ujis 0x8FABF2 - LATIN SMALL LETTER Y WITH ACUTE + + But they don't map to each other with UPPER and LOWER operations. + +- The sort_order array is filled case insensitively for the + ASCII range 0x00..0x7F, and in "binary" fashion for the multi-byte + range 0x80..0xFF for these collations: + + cp932_japanese_ci, + euckr_korean_ci, + eucjpms_japanese_ci, + gb2312_chinese_ci, + sjis_japanese_ci, + ujis_japanese_ci. + + So multi-byte characters are sorted just according to their codes. + + +- Two collations are still case insensitive for the ASCII characters, + but have special sorting order for multi-byte characters + (something more complex than just according to codes): + + big5_chinese_ci + gbk_chinese_ci + + So handlers for these collations use only the 0x00..0x7F part + of their sort_order arrays, and apply the special functions + for multi-byte characters + +In Unicode character sets we have full support of UPPER/LOWER mapping, +for sorting order, and for character type detection. +"utf8_general_ci" still has the "old-fashioned" arrays +like to_upper, to_lower, sort_order and ctype, but they are +not really used (maybe only in some rare legacy functions). 
+ Unicode conversion data ----------------------- -For 8bit character sets: +For 8-bit character sets: tab_to_uni : array[256] of charset->Unicode translation tab_from_uni: a structure for Unicode->charset translation -Non-8 bit charsets have their own structures per charset -hidden in correspondent ctype-xxx.c file and don't use +Non-8-bit charsets have their own structures per charset +hidden in corresponding ctype-xxx.c file and don't use tab_to_uni and tab_from_uni tables. @@ -106,9 +152,9 @@ Parser maps state_map[] ident_map[] - These maps are to quickly identify if a character is -an identificator part, a digit, a special character, -or a part of other SQL language lexical item. +These maps are used to quickly identify whether a character is an +identifier part, a digit, a special character, or a part of another +SQL language lexical item. Probably can be combined with ctype array in the future. But for some reasons these two arrays are used in the parser, @@ -116,32 +162,32 @@ while a separate ctype[] array is used in the other part of the code, like fulltext, etc. -Misc fields ------------ +Miscellaneous fields +-------------------- - strxfrm_multiply - how many times a sort key (i.e. a string - which can be passed into memcmp() for comparison) + strxfrm_multiply - how many times a sort key (that is, a string + that can be passed into memcmp() for comparison) can be longer than the original string. Usually it is 1. For some complex - collations it can be bigger. For example + collations it can be bigger. For example, in latin1_german2_ci, a sort key is up to - twice longer than the original string. + two times longer than the original string. e.g. Letter 'A' with two dots above is substituted with 'AE'. - mbminlen - mininum multibyte sequence length. - Now always 1 except ucs2. For ucs2 + mbminlen - minimum multi-byte sequence length. + Now always 1 except for ucs2. For ucs2, it is 2. - mbmaxlen - maximum multibyte sequence length. - 1 for 8bit charsets. 
Can be also 2 or 3. + mbmaxlen - maximum multi-byte sequence length. + 1 for 8-bit charsets. Can be also 2 or 3. max_sort_char - for LIKE range - in case of 8bit character sets - native code + in case of 8-bit character sets - native code of maximum character (max_str pad byte); in case of UTF8 and UCS2 - Unicode code of the maximum possible character (usually U+FFFF). This code is - converted to multibyte representation (usually 0xEFBFBF) + converted to multi-byte representation (usually 0xEFBFBF) and then used as a pad sequence for max_str. - in case of other multibyte character sets - + in case of other multi-byte character sets - max_str pad byte (usually 0xFF). MY_CHARSET_HANDLER @@ -151,10 +197,10 @@ MY_CHARSET_HANDLER is a collection of character-set related routines. Defined in m_ctype.h. Have the following set of functions: -Multibyte routines +Multi-byte routines ------------------ -ismbchar() - detects if the given string is a multibyte sequence -mbcharlen() - returns length of multibyte sequence starting with +ismbchar() - detects whether the given string is a multi-byte sequence +mbcharlen() - returns length of multi-byte sequence starting with the given character numchars() - returns number of characters in the given string, e.g. in SQL function CHAR_LENGTH(). @@ -163,29 +209,29 @@ charpos() - calculates the offset of the given position in the string. INSERT() well_formed_length() - - finds the length of correctly formed multybyte beginning. + - finds the length of correctly formed multi-byte beginning. Used in INSERTs to cut a beginning of the given string which is a) "well formed" according to the given character set. - b) can fit into the given data type + b) can fit into the given data type Terminates the string in the good position, taking in account - multibyte character boundaries. + multi-byte character boundaries. -lengthsp() - returns the length of the given string without traling spaces. 
+lengthsp() - returns the length of the given string without trailing spaces. Unicode conversion routines --------------------------- -mb_wc - converts the left multibyte sequence into it Unicode code. -mc_mb - converts the given Unicode code into multibyte sequence. +mb_wc - converts the left multi-byte sequence into its Unicode code. +mc_mb - converts the given Unicode code into multi-byte sequence. Case and sort conversion ------------------------ -caseup_str - converts the given 0-terminated string into the upper case -casedn_str - converts the given 0-terminated string into the lower case -caseup - converts the given string into the lower case using length -casedn - converts the given string into the lower case using length +caseup_str - converts the given 0-terminated string to uppercase +casedn_str - converts the given 0-terminated string to lowercase +caseup - converts the given string to lowercase using length +casedn - converts the given string to lowercase using length Number-to-string conversion routines ------------------------------------ @@ -193,7 +239,7 @@ snprintf() long10_to_str() longlong10_to_str() -The names are pretty self-descripting. +The names are pretty self-describing. String padding routines ----------------------- @@ -201,7 +247,7 @@ fill() - writes the given Unicode value into the given string with the given length. Used to pad the string, usually with space character, according to the given charset. 
-String-to-numner conversion routines +String-to-number conversion routines ------------------------------------ strntol() strntoul() @@ -209,10 +255,10 @@ strntoll() strntoull() strntod() -These functions are almost for the same thing with their -STDLIB counterparts, but also: +These functions are almost the same as their STDLIB counterparts, +but also: - accept length instead of 0-terminator - - and are character set dependant + - are character set dependent Simple scanner routines ----------------------- @@ -230,8 +276,8 @@ strnxfrm() - makes a sort key suitable for memcmp() corresponding like_range() - creates a LIKE range, for optimizer wildcmp() - wildcard comparison, for LIKE strcasecmp() - 0-terminated string comparison -instr() - finds the first substring appearence in the string -hash_sort() - calculates hash value taking in account +instr() - finds the first substring appearance in the string +hash_sort() - calculates hash value taking into account the collation rules, e.g. case-insensitivity, accent sensitivity, etc. 
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index ecfd3d648e0..3da307b82fc 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -307,15 +307,17 @@ static size_t my_strnxfrm_big5(CHARSET_INFO *cs __attribute__((unused)), { uint16 e; size_t dstlen= len; + uchar *dest_end= dest + dstlen; len = srclen; - while (len--) + while (len-- && dest < dest_end) { if ((len > 0) && isbig5code(*src, *(src+1))) { e = big5strokexfrm((uint16) big5code(*src, *(src+1))); *dest++ = big5head(e); - *dest++ = big5tail(e); + if (dest < dest_end) + *dest++ = big5tail(e); src +=2; len--; } else diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index c7a2558eb37..7b8bb85652b 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -2668,15 +2668,17 @@ static size_t my_strnxfrm_gbk(CHARSET_INFO *cs __attribute__((unused)), { uint16 e; size_t dstlen= len; + uchar *dest_end= dest + dstlen; len = srclen; - while (len--) + while (len-- && dest < dest_end) { if ((len > 0) && isgbkcode(*src, *(src+1))) { e = gbksortorder((uint16) gbkcode(*src, *(src+1))); *dest++ = gbkhead(e); - *dest++ = gbktail(e); + if (dest < dest_end) + *dest++ = gbktail(e); src+=2; len--; } else diff --git a/strings/decimal.c b/strings/decimal.c index 0559dd97613..a7770fbb2e1 100644 --- a/strings/decimal.c +++ b/strings/decimal.c @@ -2005,18 +2005,18 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to) sanity(to); - i=intg0; + i=intg0; /* save 'ideal' values */ j=frac0; - FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error); + FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error); /* bound size */ to->sign=from1->sign != from2->sign; - to->frac=from1->frac+from2->frac; + to->frac=from1->frac+from2->frac; /* store size in digits */ to->intg=intg0*DIG_PER_DEC1; if (unlikely(error)) { set_if_smaller(to->frac, frac0*DIG_PER_DEC1); set_if_smaller(to->intg, intg0*DIG_PER_DEC1); - if (unlikely(i > intg0)) + if (unlikely(i > intg0)) /* bounded integer-part */ { i-=intg0; j=i >> 1; @@ -2024,12 
+2024,20 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to) intg2-=i-j; frac1=frac2=0; /* frac0 is already 0 here */ } - else + else /* bounded fract part */ { j-=frac0; i=j >> 1; - frac1-= i; - frac2-=j-i; + if (frac1 <= frac2) + { + frac1-= i; + frac2-=j-i; + } + else + { + frac2-= i; + frac1-=j-i; + } } } start0=to->buf+intg0+frac0-1; |