From d8ebf276396252fc37169d108a993bf8b6b38157 Mon Sep 17 00:00:00 2001
From: unknown <gshchepa/uchum@host.loc>
Date: Wed, 23 Apr 2008 02:14:58 +0500
Subject: Fixed bug #35993: memory corruption and crash with multibyte
 conversion.

Grouping or ordering by long values in non-indexed BLOB/TEXT columns
with the GBK or BIG5 charsets crashes the server.

The MySQL server uses sorting (the filesort procedure) on a temporary
table to evaluate the GROUP BY clause when no suitable index exists.
That procedure takes into account only the first @max_sort_length bytes
(a system variable, usually 1024) of a TEXT/BLOB sort key string.
The my_strnxfrm_gbk and my_strnxfrm_big5 functions filled the temporary
keys with the whole blob instead of at most @max_sort_length bytes.
That buffer overrun has been fixed.


mysql-test/r/ctype_gbk.result:
  Added test case for bug #35993.
mysql-test/t/ctype_gbk.test:
  Added test case for bug #35993.
strings/ctype-big5.c:
  Fixed bug #35993: memory corruption and crash with multibyte conversion.

  Buffer overrun has been fixed in the my_strnxfrm_big5 function.
strings/ctype-gbk.c:
  Fixed bug #35993: memory corruption and crash with multibyte conversion.

  Buffer overrun has been fixed in the my_strnxfrm_gbk function.
---
 strings/ctype-big5.c | 6 ++++--
 strings/ctype-gbk.c  | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'strings')

diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 44b9951657d..c73247db404 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -307,15 +307,17 @@ static int my_strnxfrm_big5(CHARSET_INFO *cs __attribute__((unused)),
 {
   uint16 e;
   uint dstlen= len;
+  uchar *dest_end= dest + dstlen;
 
   len = srclen;
-  while (len--)
+  while (len-- && dest < dest_end)
   {
     if ((len > 0) && isbig5code(*src, *(src+1)))
     {
       e = big5strokexfrm((uint16) big5code(*src, *(src+1)));
       *dest++ = big5head(e);
-      *dest++ = big5tail(e);
+      if (dest < dest_end)
+        *dest++ = big5tail(e);
       src +=2;
       len--;
     } else
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 8ac7d62c9da..d0ba33aa3cc 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -2668,15 +2668,17 @@ static int my_strnxfrm_gbk(CHARSET_INFO *cs __attribute__((unused)),
 {
   uint16 e;
   uint dstlen= len;
+  uchar *dest_end= dest + dstlen;
 
   len = srclen;
-  while (len--)
+  while (len-- && dest < dest_end)
   {
     if ((len > 0) && isgbkcode(*src, *(src+1)))
     {
       e = gbksortorder((uint16) gbkcode(*src, *(src+1)));
       *dest++ = gbkhead(e);
-      *dest++ = gbktail(e);
+      if (dest < dest_end)
+        *dest++ = gbktail(e);
       src+=2;
       len--;
     } else 
-- 
cgit v1.2.1


From c9232b936d5695c64e23dab18c0cabae767d5cb9 Mon Sep 17 00:00:00 2001
From: Alexander Barkov <bar@mysql.com>
Date: Wed, 28 May 2008 15:03:47 +0500
Subject: Updating charset doc files. Thanks to Paul for preparing the
 up-to-date files reflecting 4.1 changes.

---
 strings/CHARSET_INFO.txt | 142 +++++++++++++++++++++++++++++++----------------
 1 file changed, 94 insertions(+), 48 deletions(-)

(limited to 'strings')

diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt
index 3fd262c6f12..bb8e40025c7 100644
--- a/strings/CHARSET_INFO.txt
+++ b/strings/CHARSET_INFO.txt
@@ -3,9 +3,8 @@ CHARSET_INFO
 ============
 A structure containing data for charset+collation pair implementation. 
 
-Virtual functions which use this data are collected
-into separate structures MY_CHARSET_HANDLER and
-MY_COLLATION_HANDLER.
+Virtual functions that use this data are collected into separate
+structures, MY_CHARSET_HANDLER and MY_COLLATION_HANDLER.
 
 
 typedef struct charset_info_st
@@ -56,7 +55,7 @@ character set. Not really used now. Intended to optimize some
 parts of the code where we need to find the default collation
 using its non-default counterpart for the given character set.
 
-binary_numner - ID of a charset+collation pair, which consists
+binary_number - ID of a charset+collation pair, which consists
 of the same character set and the binary collation of this
 character set. Not really used now. 
 
@@ -65,15 +64,15 @@ Names
 
   csname  - name of the character set for this charset+collation pair.
   name    - name of the collation for this charset+collation pair.
-  comment - a text comment, dysplayed in "Description" column of
+  comment - a text comment, displayed in "Description" column of
             SHOW CHARACTER SET output.
 
 Conversion tables
 -----------------
   
   ctype      - pointer to array[257] of "type of characters"
-               bit mask for each chatacter, e.g. if a 
-               character is a digit or a letter or a separator, etc.
+               bit mask for each character, e.g., whether a 
+               character is a digit, letter, separator, etc.
 
                Monty 2004-10-21:
                  If you look at the macros, we use ctype[(char)+1].
@@ -87,17 +86,64 @@ Conversion tables
   to_upper   - pointer to array[256] used in UCASE()
   sort_order - pointer to array[256] used for strings comparison
 
+In all Asian charsets these arrays are set up as follows:
+
+- All bytes in the range 0x80..0xFF are marked as letters in the
+  ctype array.
+
+- The to_lower and to_upper arrays map only ASCII letters.
+  UPPER() and LOWER() don't really work for multi-byte characters.
+  Most of the characters in Asian character sets are ideograms
+  anyway and they don't have case mapping. However, there are
+  still some characters from European alphabets.
+  For example:
+  _ujis 0x8FAAF2 - LATIN CAPITAL LETTER Y WITH ACUTE
+  _ujis 0x8FABF2 - LATIN SMALL LETTER Y WITH ACUTE
+
+  But they don't map to each other with UPPER and LOWER operations.
+
+- The sort_order array is filled case insensitively for the
+  ASCII range 0x00..0x7F, and in "binary" fashion for the multi-byte
+  range 0x80..0xFF for these collations:
+
+  cp932_japanese_ci,
+  euckr_korean_ci,
+  eucjpms_japanese_ci,
+  gb2312_chinese_ci,
+  sjis_japanese_ci,
+  ujis_japanese_ci.
+
+  So multi-byte characters are sorted just according to their codes.
+
+
+- Two collations are still case insensitive for the ASCII characters,
+  but have special sorting order for multi-byte characters
+  (something more complex than just according to codes):
+
+  big5_chinese_ci
+  gbk_chinese_ci
+
+  So handlers for these collations use only the 0x00..0x7F part
+  of their sort_order arrays, and apply the special functions
+  for multi-byte characters.
+
+In Unicode character sets we have full support for UPPER/LOWER mapping,
+sorting order, and character type detection.
+"utf8_general_ci" still has the "old-fashioned" arrays
+like to_upper, to_lower, sort_order and ctype, but they are
+not really used (maybe only in some rare legacy functions).
+
 
 
 Unicode conversion data
 -----------------------
-For 8bit character sets:
+For 8-bit character sets:
 
 tab_to_uni  : array[256] of charset->Unicode translation
 tab_from_uni: a structure for Unicode->charset translation
 
-Non-8 bit charsets have their own structures per charset
-hidden in correspondent ctype-xxx.c file and don't use
+Non-8-bit charsets have their own structures per charset
+hidden in the corresponding ctype-xxx.c file and don't use
 tab_to_uni and tab_from_uni tables.
 
 
@@ -106,9 +152,9 @@ Parser maps
 state_map[]
 ident_map[]
 
- These maps are to quickly identify if a character is
-an identificator part, a digit, a special character, 
-or a part of other SQL language lexical item.
+These maps are used to quickly identify whether a character is an
+identifier part, a digit, a special character, or a part of another
+SQL language lexical item.
 
 Probably can be combined with ctype array in the future.
 But for some reasons these two arrays are used in the parser,
@@ -116,32 +162,32 @@ while a separate ctype[] array is used in the other part of the
 code, like fulltext, etc.
 
 
-Misc fields
------------
+Miscellaneous fields
+--------------------
 
-  strxfrm_multiply - how many times a sort key (i.e. a string
-                     which can be passed into memcmp() for comparison)
+  strxfrm_multiply - how many times a sort key (that is, a string
+                     that can be passed into memcmp() for comparison)
                      can be longer than the original string. 
                      Usually it is 1. For some complex
-                     collations it can be bigger. For example
+                     collations it can be bigger. For example,
                      in latin1_german2_ci, a sort key is up to
-                     twice longer than the original string.
+                     two times longer than the original string.
                      e.g. Letter 'A' with two dots above is
                      substituted with 'AE'. 
-  mbminlen         - mininum multibyte sequence length.
-                     Now always 1 except ucs2. For ucs2
+  mbminlen         - minimum multi-byte sequence length.
+                     Now always 1 except for ucs2. For ucs2,
                      it is 2.
-  mbmaxlen         - maximum multibyte sequence length.
-                     1 for 8bit charsets. Can be also 2 or 3.
+  mbmaxlen         - maximum multi-byte sequence length.
+                     1 for 8-bit charsets. Can also be 2 or 3.
 
   max_sort_char    - for LIKE range
-                     in case of 8bit character sets - native code
+                     in case of 8-bit character sets - native code
 		     of maximum character (max_str pad byte);
                      in case of UTF8 and UCS2 - Unicode code of the maximum
 		     possible character (usually U+FFFF). This code is
-		     converted to multibyte representation (usually 0xEFBFBF)
+		     converted to multi-byte representation (usually 0xEFBFBF)
 		     and then used as a pad sequence for max_str.
-		     in case of other multibyte character sets -
+		     in case of other multi-byte character sets -
 		     max_str pad byte (usually 0xFF).
 
 MY_CHARSET_HANDLER
@@ -151,10 +197,10 @@ MY_CHARSET_HANDLER is a collection of character-set
 related routines. Defined in m_ctype.h. Have the 
 following set of functions:
 
-Multibyte routines
+Multi-byte routines
 ------------------
-ismbchar()  - detects if the given string is a multibyte sequence
-mbcharlen() - returns length of multibyte sequence starting with
+ismbchar()  - detects whether the given string is a multi-byte sequence
+mbcharlen() - returns length of multi-byte sequence starting with
               the given character
 numchars()  - returns number of characters in the given string, e.g.
               in SQL function CHAR_LENGTH().
@@ -163,29 +209,29 @@ charpos()   - calculates the offset of the given position in the string.
               INSERT()
 
 well_formed_length()
-            - finds the length of correctly formed multybyte beginning.
+            - finds the length of correctly formed multi-byte beginning.
               Used in INSERTs to cut a beginning of the given string
               which is
               a) "well formed" according to the given character set.
-              b)  can fit into the given data type
+              b) can fit into the given data type
               Terminates the string in the good position, taking in account
-              multibyte character boundaries.
+              multi-byte character boundaries.
 
-lengthsp()  - returns the length of the given string without traling spaces.
+lengthsp()  - returns the length of the given string without trailing spaces.
 
 
 Unicode conversion routines
 ---------------------------
-mb_wc       - converts the left multibyte sequence into it Unicode code.
-mc_mb       - converts the given Unicode code into multibyte sequence.
+mb_wc       - converts the left multi-byte sequence into its Unicode code.
+mc_mb       - converts the given Unicode code into multi-byte sequence.
 
 
-Case and sort convertion
+Case and sort conversion
 ------------------------
-caseup_str  - converts the given 0-terminated string into the upper case
-casedn_str  - converts the given 0-terminated string into the lower case
-caseup      - converts the given string into the lower case using length
-casedn      - converts the given string into the lower case using length
+caseup_str  - converts the given 0-terminated string to uppercase
+casedn_str  - converts the given 0-terminated string to lowercase
+caseup      - converts the given string to uppercase using length
+casedn      - converts the given string to lowercase using length
 
 Number-to-string conversion routines
 ------------------------------------
@@ -193,7 +239,7 @@ snprintf()
 long10_to_str()
 longlong10_to_str()
 
-The names are pretty self-descripting.
+The names are pretty self-describing.
 
 String padding routines
 -----------------------
@@ -201,7 +247,7 @@ fill()     - writes the given Unicode value into the given string
              with the given length. Used to pad the string, usually
              with space character, according to the given charset.
 
-String-to-numner conversion routines
+String-to-number conversion routines
 ------------------------------------
 strntol()
 strntoul()
@@ -209,10 +255,10 @@ strntoll()
 strntoull()
 strntod()
 
-These functions are almost for the same thing with their
-STDLIB counterparts, but also:
+These functions are almost the same as their STDLIB counterparts,
+but also:
   - accept length instead of 0-terminator
-  - and are character set dependant
+  - are character set dependent
 
 Simple scanner routines
 -----------------------
@@ -230,9 +276,9 @@ strnxfrm()    - makes a sort key suitable for memcmp() corresponding
 like_range()  - creates a LIKE range, for optimizer
 wildcmp()     - wildcard comparison, for LIKE
 strcasecmp()  - 0-terminated string comparison
-instr()       - finds the first substring appearence in the string
-hash_sort()   - calculates hash value taking in account
+instr()       - finds the first substring appearance in the string
+hash_sort()   - calculates hash value taking into account
                 the collation rules, e.g. case-insensitivity, 
                 accent sensitivity, etc.
 
- 
\ No newline at end of file
+ 
-- 
cgit v1.2.1


From 65c3870cfcad08f410a56e4b198b4a5078d04c20 Mon Sep 17 00:00:00 2001
From: Chad MILLER <chad@mysql.com>
Date: Fri, 15 Aug 2008 15:46:21 -0400
Subject: Bug#36270: incorrect calculation result - works in 4.1 but not in 5.0
 or 5.1

When the fractional part in a multiplication of DECIMALs
overflowed, we truncated the first operand rather than the
longest. Now we truncate the least significant places instead,
for more precise multiplications.

(Queuing at demand of Trudy/Davi.)

mysql-test/r/type_newdecimal.result:
  show that if we need to truncate the scale of an operand, we pick the
  right one (that is, we discard the least significant decimal places)
mysql-test/t/type_newdecimal.test:
  show that if we need to truncate the scale of an operand, we pick the
  right one (that is, we discard the least significant decimal places)
strings/decimal.c:
  when needing to disregard fractional parts, pick the least
  significant ones
---
 strings/decimal.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'strings')

diff --git a/strings/decimal.c b/strings/decimal.c
index 3176cf6afa7..8b431ad9bab 100644
--- a/strings/decimal.c
+++ b/strings/decimal.c
@@ -1999,18 +1999,18 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to)
 
   sanity(to);
 
-  i=intg0;
+  i=intg0;                                       /* save 'ideal' values */
   j=frac0;
-  FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error);
+  FIX_INTG_FRAC_ERROR(to->len, intg0, frac0, error);  /* bound size */
   to->sign=from1->sign != from2->sign;
-  to->frac=from1->frac+from2->frac;
+  to->frac=from1->frac+from2->frac;              /* store size in digits */
   to->intg=intg0*DIG_PER_DEC1;
 
   if (unlikely(error))
   {
     set_if_smaller(to->frac, frac0*DIG_PER_DEC1);
     set_if_smaller(to->intg, intg0*DIG_PER_DEC1);
-    if (unlikely(i > intg0))
+    if (unlikely(i > intg0))                     /* bounded integer-part */
     {
       i-=intg0;
       j=i >> 1;
@@ -2018,12 +2018,20 @@ int decimal_mul(decimal_t *from1, decimal_t *from2, decimal_t *to)
       intg2-=i-j;
       frac1=frac2=0; /* frac0 is already 0 here */
     }
-    else
+    else                                         /* bounded fract part */
     {
       j-=frac0;
       i=j >> 1;
-      frac1-= i;
-      frac2-=j-i;
+      if (frac1 <= frac2)
+      {
+        frac1-= i;
+        frac2-=j-i;
+      }
+      else
+      {
+        frac2-= i;
+        frac1-=j-i;
+      }
     }
   }
   start0=to->buf+intg0+frac0-1;
-- 
cgit v1.2.1