diff options
author | unknown <bar@mysql.com/bar.myoffice.izhnet.ru> | 2007-06-28 13:34:44 +0500 |
---|---|---|
committer | unknown <bar@mysql.com/bar.myoffice.izhnet.ru> | 2007-06-28 13:34:44 +0500 |
commit | 54344f681dfc75170867c3e92dd726f9c28a3f41 (patch) | |
tree | da899a52dc4bbef9ed8e4bc182401319f6095505 /strings | |
parent | 9b940d0cb70508ca295bc68a8bab5832d79ededd (diff) | |
download | mariadb-git-54344f681dfc75170867c3e92dd726f9c28a3f41.tar.gz |
Bug#27345 Incorrect data returned when range-read from utf8_danish_ci indexes
Problem: like_range() returned wrong ranges for contractions (like 'ch' in Czech).
Fix: add special code to handle the following tricky cases:
- contraction head followed by a wild character
- full contraction
- contraction part followed by another contraction part,
where the two parts do not form a contraction together.
mysql-test/r/ctype_uca.result:
Adding test case
mysql-test/t/ctype_uca.test:
Adding test case
strings/ctype-mb.c:
Adding special handling of contractions to my_like_range_mb().
strings/ctype-uca.c:
Allocate additional 256 bytes for flags "is contraction part".
strings/ctype-ucs2.c:
Adding special handling of contractions to my_like_range_ucs2().
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-mb.c | 71 | ||||
-rw-r--r-- | strings/ctype-uca.c | 11 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 35 |
3 files changed, 115 insertions, 2 deletions
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index c12426b555f..b370714e464 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -563,6 +563,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs, char *min_end= min_str + res_length; char *max_end= max_str + res_length; uint maxcharlen= res_length / cs->mbmaxlen; + const char *contraction_flags= cs->contractions ? + ((const char*) cs->contractions) + 0x40*0x40 : NULL; for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--) { @@ -571,6 +573,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs, ptr++; /* Skip escape */ else if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */ { +fill_max_and_min: /* Calculate length of keys: 'a\0\0... is the smallest possible string when we have space expand @@ -602,8 +605,74 @@ my_bool my_like_range_mb(CHARSET_INFO *cs, *min_str++= *max_str++= *ptr++; } else - *min_str++= *max_str++= *ptr++; + { + /* + Special case for collations with contractions. + For example, in Chezh, 'ch' is a separate letter + which is sorted between 'h' and 'i'. + If the pattern 'abc%', 'c' at the end can mean: + - letter 'c' itself, + - beginning of the contraction 'ch'. + + If we simply return this LIKE range: + + 'abc\min\min\min' and 'abc\max\max\max' + + then this query: SELECT * FROM t1 WHERE a LIKE 'abc%' + will only find values starting from 'abc[^h]', + but won't find values starting from 'abch'. + We must ignore contraction heads followed by w_one or w_many. + ('Contraction head' means any letter which can be the first + letter in a contraction) + + For example, for Czech 'abc%', we will return LIKE range, + which is equal to LIKE range for 'ab%': + + 'ab\min\min\min\min' and 'ab\max\max\max\max'. + + */ + if (contraction_flags && ptr + 1 < end && + contraction_flags[(uchar) *ptr]) + { + /* Ptr[0] is a contraction head. */ + + if (ptr[1] == w_one || ptr[1] == w_many) + { + /* Contraction head followed by a wildcard, quit. 
*/ + goto fill_max_and_min; + } + + /* + Some letters can be both contraction heads and contraction tails. + For example, in Danish 'aa' is a separate single letter which + is sorted after 'z'. So 'a' can be both head and tail. + + If ptr[0]+ptr[1] is a contraction, + then put both letters together. + + If ptr[1] can be a contraction part, but ptr[0]+ptr[1] + is not a contraction, then we put only ptr[0], + and continue with ptr[1] on the next loop. + */ + if (contraction_flags[(uchar) ptr[1]] && + cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40]) + { + /* Contraction found */ + if (maxcharlen == 1 || min_str + 1 >= min_end) + { + /* Both contraction parts don't fit, quit */ + goto fill_max_and_min; + } + + /* Put contraction head */ + *min_str++= *max_str++= *ptr++; + maxcharlen--; + } + } + /* Put contraction tail, or a single character */ + *min_str++= *max_str++= *ptr++; + } } *min_length= *max_length = (uint) (min_str - min_org); diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 4dbda0b9239..81fb9ee1970 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -7937,10 +7937,16 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint)) /* Now process contractions */ if (ncontractions) { - uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */ + /* + 8K for weights for basic latin letter pairs, + plus 256 bytes for "is contraction part" flags. 
+ */ + uint size= 0x40*0x40*sizeof(uint16) + 256; + char *contraction_flags; if (!(cs->contractions= (uint16*) (*alloc)(size))) return 1; bzero((void*)cs->contractions, size); + contraction_flags= ((char*) cs->contractions) + 0x40*0x40; for (i=0; i < rc; i++) { if (rule[i].curr[1]) @@ -7966,6 +7972,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint)) /* Copy base weight applying primary difference */ cs->contractions[offsc]= offsb[0] + rule[i].diff[0]; + /* Mark both letters as "is contraction part */ + contraction_flags[rule[i].curr[0]]= 1; + contraction_flags[rule[i].curr[1]]= 1; } } } diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index f3abbaa6e7e..b5353c55e4c 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1524,6 +1524,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, char *min_org=min_str; char *min_end=min_str+res_length; uint charlen= res_length / cs->mbmaxlen; + const char *contraction_flags= cs->contractions ? + ((const char*) cs->contractions) + 0x40*0x40 : NULL; for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0 ; ptr+=2, charlen--) @@ -1545,6 +1547,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, } if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */ { +fill_max_and_min: /* Calculate length of keys: 'a\0\0... is the smallest possible string when we have space expand @@ -1561,6 +1564,38 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, } while (min_str + 1 < min_end); return 0; } + + if (contraction_flags && ptr + 3 < end && + ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]]) + { + /* Contraction head found */ + if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many)) + { + /* Contraction head followed by a wildcard, quit */ + goto fill_max_and_min; + } + + /* + Check if the second letter can be contraction part, + and if two letters really produce a contraction. 
+ */ + if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] && + cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40]) + { + /* Contraction found */ + if (charlen == 1 || min_str + 2 >= min_end) + { + /* Full contraction doesn't fit, quit */ + goto fill_max_and_min; + } + + /* Put contraction head */ + *min_str++= *max_str++= *ptr++; + *min_str++= *max_str++= *ptr++; + charlen--; + } + } + /* Put contraction tail, or a single character */ *min_str++= *max_str++ = ptr[0]; *min_str++= *max_str++ = ptr[1]; } |