summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorunknown <bar@mysql.com/bar.myoffice.izhnet.ru>2007-06-28 13:34:44 +0500
committerunknown <bar@mysql.com/bar.myoffice.izhnet.ru>2007-06-28 13:34:44 +0500
commit54344f681dfc75170867c3e92dd726f9c28a3f41 (patch)
treeda899a52dc4bbef9ed8e4bc182401319f6095505 /strings
parent9b940d0cb70508ca295bc68a8bab5832d79ededd (diff)
downloadmariadb-git-54344f681dfc75170867c3e92dd726f9c28a3f41.tar.gz
Bug#27345 Incorrect data returned when range-read from utf8_danish_ci indexes
Problem: like_range() returned wrong ranges for contractions (like 'ch' in Czech). Fix: adding special code to handle tricky cases: - contraction head followed by a wild character - full contraction - contraction part followed by another contraction part, but they are not a contraction together. mysql-test/r/ctype_uca.result: Adding test case mysql-test/t/ctype_uca.test: Adding test case strings/ctype-mb.c: Adding contraction handling to my_like_range_mb() strings/ctype-uca.c: Allocate additional 256 bytes for flags "is contraction part". strings/ctype-ucs2.c: Adding contraction handling to my_like_range_ucs2()
Diffstat (limited to 'strings')
-rw-r--r--strings/ctype-mb.c71
-rw-r--r--strings/ctype-uca.c11
-rw-r--r--strings/ctype-ucs2.c35
3 files changed, 115 insertions, 2 deletions
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index c12426b555f..b370714e464 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -563,6 +563,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
uint maxcharlen= res_length / cs->mbmaxlen;
+ const char *contraction_flags= cs->contractions ?
+ ((const char*) cs->contractions) + 0x40*0x40 : NULL;
for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
{
@@ -571,6 +573,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
ptr++; /* Skip escape */
else if (*ptr == w_one || *ptr == w_many) /* '_' and '%' in SQL */
{
+fill_max_and_min:
/*
Calculate length of keys:
'a\0\0... is the smallest possible string when we have space expand
@@ -602,8 +605,74 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
*min_str++= *max_str++= *ptr++;
}
else
- *min_str++= *max_str++= *ptr++;
+ {
+ /*
+ Special case for collations with contractions.
+ For example, in Czech, 'ch' is a separate letter
+ which is sorted between 'h' and 'i'.
+ In the pattern 'abc%', the 'c' at the end can mean:
+ - letter 'c' itself,
+ - beginning of the contraction 'ch'.
+
+ If we simply return this LIKE range:
+
+ 'abc\min\min\min' and 'abc\max\max\max'
+
+ then this query: SELECT * FROM t1 WHERE a LIKE 'abc%'
+ will only find values starting from 'abc[^h]',
+ but won't find values starting from 'abch'.
+ We must ignore contraction heads followed by w_one or w_many.
+ ('Contraction head' means any letter which can be the first
+ letter in a contraction)
+
+ For example, for Czech 'abc%', we will return a LIKE range
+ which is equal to the LIKE range for 'ab%':
+
+ 'ab\min\min\min\min' and 'ab\max\max\max\max'.
+
+ */
+ if (contraction_flags && ptr + 1 < end &&
+ contraction_flags[(uchar) *ptr])
+ {
+ /* Ptr[0] is a contraction head. */
+
+ if (ptr[1] == w_one || ptr[1] == w_many)
+ {
+ /* Contraction head followed by a wildcard, quit. */
+ goto fill_max_and_min;
+ }
+
+ /*
+ Some letters can be both contraction heads and contraction tails.
+ For example, in Danish 'aa' is a separate single letter which
+ is sorted after 'z'. So 'a' can be both head and tail.
+
+ If ptr[0]+ptr[1] is a contraction,
+ then put both letters together.
+
+ If ptr[1] can be a contraction part, but ptr[0]+ptr[1]
+ is not a contraction, then we put only ptr[0],
+ and continue with ptr[1] on the next loop.
+ */
+ if (contraction_flags[(uchar) ptr[1]] &&
+ cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
+ {
+ /* Contraction found */
+ if (maxcharlen == 1 || min_str + 1 >= min_end)
+ {
+ /* Both contraction parts don't fit, quit */
+ goto fill_max_and_min;
+ }
+
+ /* Put contraction head */
+ *min_str++= *max_str++= *ptr++;
+ maxcharlen--;
+ }
+ }
+ /* Put contraction tail, or a single character */
+ *min_str++= *max_str++= *ptr++;
+ }
}
*min_length= *max_length = (uint) (min_str - min_org);
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 4dbda0b9239..81fb9ee1970 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -7937,10 +7937,16 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
/* Now process contractions */
if (ncontractions)
{
- uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */
+ /*
+ 8K for weights for basic latin letter pairs,
+ plus 256 bytes for "is contraction part" flags.
+ */
+ uint size= 0x40*0x40*sizeof(uint16) + 256;
+ char *contraction_flags;
if (!(cs->contractions= (uint16*) (*alloc)(size)))
return 1;
bzero((void*)cs->contractions, size);
+ contraction_flags= ((char*) cs->contractions) + 0x40*0x40;
for (i=0; i < rc; i++)
{
if (rule[i].curr[1])
@@ -7966,6 +7972,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
/* Copy base weight applying primary difference */
cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
+ /* Mark both letters as "is contraction part" */
+ contraction_flags[rule[i].curr[0]]= 1;
+ contraction_flags[rule[i].curr[1]]= 1;
}
}
}
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index f3abbaa6e7e..b5353c55e4c 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1524,6 +1524,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
char *min_org=min_str;
char *min_end=min_str+res_length;
uint charlen= res_length / cs->mbmaxlen;
+ const char *contraction_flags= cs->contractions ?
+ ((const char*) cs->contractions) + 0x40*0x40 : NULL;
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
@@ -1545,6 +1547,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
}
if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */
{
+fill_max_and_min:
/*
Calculate length of keys:
'a\0\0... is the smallest possible string when we have space expand
@@ -1561,6 +1564,38 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
} while (min_str + 1 < min_end);
return 0;
}
+
+ if (contraction_flags && ptr + 3 < end &&
+ ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
+ {
+ /* Contraction head found */
+ if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
+ {
+ /* Contraction head followed by a wildcard, quit */
+ goto fill_max_and_min;
+ }
+
+ /*
+ Check if the second letter can be contraction part,
+ and if two letters really produce a contraction.
+ */
+ if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
+ cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
+ {
+ /* Contraction found */
+ if (charlen == 1 || min_str + 2 >= min_end)
+ {
+ /* Full contraction doesn't fit, quit */
+ goto fill_max_and_min;
+ }
+
+ /* Put contraction head */
+ *min_str++= *max_str++= *ptr++;
+ *min_str++= *max_str++= *ptr++;
+ charlen--;
+ }
+ }
+ /* Put contraction tail, or a single character */
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
}