summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorMichael Widenius <monty@mariadb.org>2014-09-11 22:42:35 +0300
committerMichael Widenius <monty@mariadb.org>2014-09-11 22:42:35 +0300
commitc4f5326bb7bee1857d0cc6d5cdff1178e0854d00 (patch)
tree58552a3bbd9cb843672a3cc119c7b06236ff3e82 /strings
parent2362d98470801ddd1bbc3459c106368ffc215933 (diff)
downloadmariadb-git-c4f5326bb7bee1857d0cc6d5cdff1178e0854d00.tar.gz
MDEV-6255 DUPLICATE KEY Errors on SELECT .. GROUP BY that uses temporary and filesort.
The problem was that my_hash_sort didn't properly delete end-space characters properly, so strings that should compare identically was seen as different strings. (Space was handled correctly, but not NBSP) This caused duplicate key errors when a heap table was converted to Aria as part of overflow in group by. Fixed by removing all characters that compares as end space when creating a hash. Other things: - Fixed that --sorted_results also works for errors in mysqltest. - Speed up hash by not comparing strings that has different hash. - Speed up many my_hash_sort functions by using registers to calculate hash instead of pointers. This was previously done for some functions, but not for all. - Made a macro of the hash function, to simplify code and to be able to experiment with new hash functions. client/mysqltest.cc: Fixed that --sorted_results also works for error messages. mysql-test/r/ctype_partitions.result: New test to ensure that partitions on hash works mysql-test/suite/multi_source/gtid.result: Updated result mysql-test/suite/multi_source/gtid.test: Test that --sorted_result works for error messages mysql-test/suite/multi_source/gtid_ignore_duplicates.result: Updated result mysql-test/suite/multi_source/gtid_ignore_duplicates.test: Updated result mysql-test/suite/multi_source/load_data.result: Updated result mysql-test/suite/multi_source/load_data.test: Updated result mysql-test/t/ctype_partitions.test: New test to ensure that partitions on hash works storage/heap/hp_write.c: Speed up hash by not comparing strings that has different hash. storage/maria/ma_check.c: Extra debug strings/ctype-bin.c: Use macro for hash function strings/ctype-latin1.c: Use macro for hash function Use registers to calculate hash (speedup) strings/ctype-mb.c: Use macro for hash function Use registers to calculate hash (speedup) strings/ctype-simple.c: Use macro for hash function Use same variable names as in other my_hash_sort functions. Update my_hash_sort_simple() to properly remove end space (patch by Bar) strings/ctype-uca.c: Ignore duplicated space inside strings and end space in my_hash_sort_uca(). This fixed MDEV-6255 Use macro for hash function Use registers to calculate hash (speedup) strings/ctype-ucs2.c: Use macro for hash function Use registers to calculate hash (speedup) strings/ctype-utf8.c: Use macro for hash function Use registers to calculate hash (speedup) strings/strings_def.h: Made a macro of the hash function, to simplify code and to be able to experiment with new hash functions.
Diffstat (limited to 'strings')
-rw-r--r--strings/ctype-bin.c8
-rw-r--r--strings/ctype-latin1.c10
-rw-r--r--strings/ctype-mb.c8
-rw-r--r--strings/ctype-simple.c46
-rw-r--r--strings/ctype-uca.c38
-rw-r--r--strings/ctype-ucs2.c56
-rw-r--r--strings/ctype-utf8.c28
-rw-r--r--strings/strings_def.h16
8 files changed, 133 insertions, 77 deletions
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 4beb7047d00..3ca4ba2b430 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -288,9 +288,7 @@ void my_hash_sort_8bit_bin(CHARSET_INFO *cs __attribute__((unused)),
for (; key < end ; key++)
{
- tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) *
- ((uint) *key)) + (tmp1 << 8);
- tmp2+= 3;
+ MY_HASH_ADD(tmp1, tmp2, (uint) *key);
}
*nr1= tmp1;
@@ -307,9 +305,7 @@ void my_hash_sort_bin(CHARSET_INFO *cs __attribute__((unused)),
for (; key < end ; key++)
{
- tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) *
- ((uint) *key)) + (tmp1 << 8);
- tmp2+= 3;
+ MY_HASH_ADD(tmp1, tmp2, (uint) *key);
}
*nr1= tmp1;
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 2c84f86fad0..babf74599ea 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -691,6 +691,8 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)),
ulong *nr1, ulong *nr2)
{
const uchar *end;
+ register ulong m1= *nr1, m2= *nr2;
+
/*
Remove end space. We have to do this to be able to compare
'AE' and 'Ä' as identical
@@ -700,14 +702,14 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)),
for (; key < end ; key++)
{
uint X= (uint) combo1map[(uint) *key];
- nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * X) + (nr1[0] << 8);
- nr2[0]+=3;
+ MY_HASH_ADD(m1, m2, X);
if ((X= combo2map[*key]))
{
- nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * X) + (nr1[0] << 8);
- nr2[0]+=3;
+ MY_HASH_ADD(m1, m2, X);
}
}
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index f4e70fd1dd5..1ef772e1d5e 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -680,6 +680,8 @@ void
my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, size_t len,ulong *nr1, ulong *nr2)
{
+ register ulong m1= *nr1, m2= *nr2;
+
/*
Remove trailing spaces. We have to do this to be able to compare
'A ' and 'A' as identical
@@ -688,10 +690,10 @@ my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
for (; key < end ; key++)
{
- nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) *
- ((uint)*key)) + (nr1[0] << 8);
- nr2[0]+=3;
+ MY_HASH_ADD(m1, m2, (uint)*key);
}
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index f5484965314..0785ba35700 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -306,24 +306,48 @@ void my_hash_sort_simple(CHARSET_INFO *cs,
{
register const uchar *sort_order=cs->sort_order;
const uchar *end;
- ulong n1, n2;
+ register ulong m1= *nr1, m2= *nr2;
+ uint16 space_weight= sort_order[' '];
/*
- Remove end space. We have to do this to be able to compare
- 'A ' and 'A' as identical
+ Remove all trailing characters that are equal to space.
+ We have to do this to be able to compare 'A ' and 'A' as identical.
+
+ If the key is long enough, cut the trailing spaces (0x20) using an
+ optimized function implemented in skip_trailing_spaces().
+
+ "len > 16" is just some heuristic here.
+ Calling skip_triling_space() for short values is not desirable,
+ because its initialization block may be more expensive than the
+ performance gained.
+ */
+
+ end= len > 16 ? skip_trailing_space(key, len) : key + len;
+
+ /*
+ We removed all trailing characters that are binary equal to space 0x20.
+ Now remove all trailing characters that have weights equal to space.
+ Some 8bit simple collations may have such characters:
+ - cp1250_general_ci 0xA0 NO-BREAK SPACE == 0x20 SPACE
+ - cp1251_ukrainian_ci 0x60 GRAVE ACCENT == 0x20 SPACE
+ - koi8u_general_ci 0x60 GRAVE ACCENT == 0x20 SPACE
*/
- end= skip_trailing_space(key, len);
- n1= *nr1;
- n2= *nr2;
+ for ( ; key < end ; )
+ {
+ if (sort_order[*--end] != space_weight)
+ {
+ end++;
+ break;
+ }
+ }
+
for (; key < (uchar*) end ; key++)
{
- n1^=(ulong) ((((uint) n1 & 63)+n2) *
- ((uint) sort_order[(uint) *key])) + (n1 << 8);
- n2+=3;
+ MY_HASH_ADD(m1, m2, (uint) sort_order[(uint) *key]);
}
- *nr1= n1;
- *nr2= n2;
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 5165a43e852..19ce17c3856 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -20873,21 +20873,45 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs,
static void my_hash_sort_uca(CHARSET_INFO *cs,
my_uca_scanner_handler *scanner_handler,
const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
int s_res;
my_uca_scanner scanner;
-
- slen= cs->cset->lengthsp(cs, (char*) s, slen);
+ int space_weight= my_space_weight(cs);
+ register ulong m1= *nr1, m2= *nr2;
+
scanner_handler->init(&scanner, cs, &cs->uca->level[0], s, slen);
while ((s_res= scanner_handler->next(&scanner)) >0)
{
- n1[0]^= (((n1[0] & 63)+n2[0])*(s_res >> 8))+ (n1[0] << 8);
- n2[0]+=3;
- n1[0]^= (((n1[0] & 63)+n2[0])*(s_res & 0xFF))+ (n1[0] << 8);
- n2[0]+=3;
+ if (s_res == space_weight)
+ {
+ /* Combine all spaces to be able to skip end spaces */
+ uint count= 0;
+ do
+ {
+ count++;
+ if ((s_res= scanner_handler->next(&scanner)) <= 0)
+ {
+ /* Skip strings at end of string */
+ goto end;
+ }
+ }
+ while (s_res == space_weight);
+
+ /* Add back that has for the space characters */
+ do
+ {
+ MY_HASH_ADD_16(m1, m2, space_weight);
+ }
+ while (--count != 0);
+
+ }
+ MY_HASH_ADD_16(m1, m2, s_res);
}
+end:
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index d2f7d34f0aa..593f9a12950 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1222,23 +1222,23 @@ my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
static void
my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
my_wc_t wc;
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
int res;
const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen);
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
+ register ulong m1= *nr1, m2= *nr2;
while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
{
my_tosort_utf16(uni_plane, &wc);
- n1[0]^= (((n1[0] & 63) + n2[0]) * (wc & 0xFF)) + (n1[0] << 8);
- n2[0]+= 3;
- n1[0]^= (((n1[0] & 63) + n2[0]) * (wc >> 8)) + (n1[0] << 8);
- n2[0]+= 3;
+ MY_HASH_ADD_16(m1, m2, wc);
s+= res;
}
+ *nr1= m1;
+ *nr2= m2;
}
@@ -1611,12 +1611,14 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs,
const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
{
const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len);
+ register ulong m1= *nr1, m2= *nr2;
+
for ( ; pos < end ; pos++)
{
- nr1[0]^= (ulong) ((((uint) nr1[0] & 63) + nr2[0]) *
- ((uint)*pos)) + (nr1[0] << 8);
- nr2[0]+= 3;
+ MY_HASH_ADD(m1, m2, (uint)*pos);
}
+ *nr1= m1;
+ *nr2= m2;
}
@@ -2007,22 +2009,15 @@ my_caseup_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
}
-static inline void
-my_hash_add(ulong *n1, ulong *n2, uint ch)
-{
- n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8);
- n2[0]+= 3;
-}
-
-
static void
my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
my_wc_t wc;
int res;
const uchar *e= s + slen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
+ register ulong m1= *nr1, m2= *nr2;
/* Skip trailing spaces */
while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4])
@@ -2031,12 +2026,14 @@ my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_utf32(uni_plane, &wc);
- my_hash_add(n1, n2, (uint) (wc >> 24));
- my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF);
- my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF);
- my_hash_add(n1, n2, (uint) (wc & 0xFF));
+ MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
+ MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
+ MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF);
+ MY_HASH_ADD(m1, m2, (uint) (wc & 0xFF));
s+= res;
}
+ *nr1= m1;
+ *nr2= m2;
}
@@ -2976,12 +2973,13 @@ static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
my_wc_t wc;
int res;
const uchar *e=s+slen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
+ register ulong m1= *nr1, m2= *nr2;
while (e > s+1 && e[-1] == ' ' && e[-2] == '\0')
e-= 2;
@@ -2989,12 +2987,11 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
{
my_tosort_ucs2(uni_plane, &wc);
- n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8);
- n2[0]+=3;
- n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8);
- n2[0]+=3;
+ MY_HASH_ADD_16(m1, m2, wc);
s+=res;
}
+ *nr1= m1;
+ *nr2= m2;
}
@@ -3312,16 +3309,17 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, size_t len,ulong *nr1, ulong *nr2)
{
const uchar *end = key + len;
+ register ulong m1= *nr1, m2= *nr2;
while (end > key+1 && end[-1] == ' ' && end[-2] == '\0')
end-= 2;
for (; key < (uchar*) end ; key++)
{
- nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) *
- ((uint)*key)) + (nr1[0] << 8);
- nr2[0]+=3;
+ MY_HASH_ADD(m1, m2, (uint)*key);
}
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 96d5ea26a3c..d0a64d11c84 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5087,12 +5087,13 @@ static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen,
static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
my_wc_t wc;
int res;
const uchar *e=s+slen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
+ register ulong m1= *nr1, m2= *nr2;
/*
Remove end space. We have to do this to be able to compare
@@ -5104,12 +5105,11 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((s < e) && (res=my_utf8_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
{
my_tosort_unicode(uni_plane, &wc, cs->state);
- n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8);
- n2[0]+=3;
- n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8);
- n2[0]+=3;
+ MY_HASH_ADD_16(m1, m2, wc);
s+=res;
}
+ *nr1= m1;
+ *nr2= m2;
}
@@ -7597,22 +7597,15 @@ my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen,
}
-static inline void
-my_hash_add(ulong *n1, ulong *n2, uint ch)
-{
- n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8);
- n2[0]+= 3;
-}
-
-
static void
my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
- ulong *n1, ulong *n2)
+ ulong *nr1, ulong *nr2)
{
my_wc_t wc;
int res;
const uchar *e= s + slen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
+ register ulong m1= *nr1, m2= *nr2;
/*
Remove end space. We do this to be able to compare
@@ -7624,8 +7617,7 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_unicode(uni_plane, &wc, cs->state);
- my_hash_add(n1, n2, (uint) (wc & 0xFF));
- my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF);
+ MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF));
if (wc > 0xFFFF)
{
/*
@@ -7635,10 +7627,12 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen,
This is useful to keep order of records in
test results, e.g. for "SHOW GRANTS".
*/
- my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF);
+ MY_HASH_ADD(m1, m2, (uint) ((wc >> 16) & 0xFF));
}
s+= res;
}
+ *nr1= m1;
+ *nr2= m2;
}
diff --git a/strings/strings_def.h b/strings/strings_def.h
index 6bd8e8f5575..d601f5ca697 100644
--- a/strings/strings_def.h
+++ b/strings/strings_def.h
@@ -100,4 +100,20 @@ static inline const uchar *skip_trailing_space(const uchar *ptr,size_t len)
end--;
return (end);
}
+
+/* Macros for hashing characters */
+
+#define MY_HASH_ADD(A, B, value) \
+ do { A^= (((A & 63)+B)*((value)))+ (A << 8); B+=3; } while(0)
+
+#define MY_HASH_ADD_16(A, B, value) \
+ do { MY_HASH_ADD(A, B, ((value) & 0xFF)) ; MY_HASH_ADD(A, B, ((value >>8 ))); } while(0)
+
+/*
+ This one is needed to ensure we get the exact same hash as MariaDB 5.1
+ This is needed to ensure that old partitioned tables still work as before.
+*/
+#define MY_HASH_ADD_16_INV(A, B, value) \
+ do { MY_HASH_ADD(A, B, ((value >> 8))) ; MY_HASH_ADD(A, B, ((value & 0xFF ))); } while(0)
+
#endif