diff options
author | Michael Widenius <monty@mariadb.org> | 2014-09-11 22:42:35 +0300 |
---|---|---|
committer | Michael Widenius <monty@mariadb.org> | 2014-09-11 22:42:35 +0300 |
commit | c4f5326bb7bee1857d0cc6d5cdff1178e0854d00 (patch) | |
tree | 58552a3bbd9cb843672a3cc119c7b06236ff3e82 /strings | |
parent | 2362d98470801ddd1bbc3459c106368ffc215933 (diff) | |
download | mariadb-git-c4f5326bb7bee1857d0cc6d5cdff1178e0854d00.tar.gz |
MDEV-6255 DUPLICATE KEY Errors on SELECT .. GROUP BY that uses temporary and filesort.
The problem was that my_hash_sort didn't properly delete end-space characters properly, so strings that should compare
identically was seen as different strings. (Space was handled correctly, but not NBSP)
This caused duplicate key errors when a heap table was converted to Aria as part of overflow in group by.
Fixed by removing all characters that compares as end space when creating a hash.
Other things:
- Fixed that --sorted_results also works for errors in mysqltest.
- Speed up hash by not comparing strings that has different hash.
- Speed up many my_hash_sort functions by using registers to calculate hash instead of pointers.
This was previously done for some functions, but not for all.
- Made a macro of the hash function, to simplify code and to be able to experiment with new hash functions.
client/mysqltest.cc:
Fixed that --sorted_results also works for error messages.
mysql-test/r/ctype_partitions.result:
New test to ensure that partitions on hash works
mysql-test/suite/multi_source/gtid.result:
Updated result
mysql-test/suite/multi_source/gtid.test:
Test that --sorted_result works for error messages
mysql-test/suite/multi_source/gtid_ignore_duplicates.result:
Updated result
mysql-test/suite/multi_source/gtid_ignore_duplicates.test:
Updated result
mysql-test/suite/multi_source/load_data.result:
Updated result
mysql-test/suite/multi_source/load_data.test:
Updated result
mysql-test/t/ctype_partitions.test:
New test to ensure that partitions on hash works
storage/heap/hp_write.c:
Speed up hash by not comparing strings that has different hash.
storage/maria/ma_check.c:
Extra debug
strings/ctype-bin.c:
Use macro for hash function
strings/ctype-latin1.c:
Use macro for hash function
Use registers to calculate hash (speedup)
strings/ctype-mb.c:
Use macro for hash function
Use registers to calculate hash (speedup)
strings/ctype-simple.c:
Use macro for hash function
Use same variable names as in other my_hash_sort functions.
Update my_hash_sort_simple() to properly remove end space (patch by Bar)
strings/ctype-uca.c:
Ignore duplicated space inside strings and end space in my_hash_sort_uca(). This fixed MDEV-6255
Use macro for hash function
Use registers to calculate hash (speedup)
strings/ctype-ucs2.c:
Use macro for hash function
Use registers to calculate hash (speedup)
strings/ctype-utf8.c:
Use macro for hash function
Use registers to calculate hash (speedup)
strings/strings_def.h:
Made a macro of the hash function, to simplify code and to be able to experiment with new hash functions.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-bin.c | 8 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 10 | ||||
-rw-r--r-- | strings/ctype-mb.c | 8 | ||||
-rw-r--r-- | strings/ctype-simple.c | 46 | ||||
-rw-r--r-- | strings/ctype-uca.c | 38 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 56 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 28 | ||||
-rw-r--r-- | strings/strings_def.h | 16 |
8 files changed, 133 insertions, 77 deletions
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 4beb7047d00..3ca4ba2b430 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -288,9 +288,7 @@ void my_hash_sort_8bit_bin(CHARSET_INFO *cs __attribute__((unused)), for (; key < end ; key++) { - tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) * - ((uint) *key)) + (tmp1 << 8); - tmp2+= 3; + MY_HASH_ADD(tmp1, tmp2, (uint) *key); } *nr1= tmp1; @@ -307,9 +305,7 @@ void my_hash_sort_bin(CHARSET_INFO *cs __attribute__((unused)), for (; key < end ; key++) { - tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) * - ((uint) *key)) + (tmp1 << 8); - tmp2+= 3; + MY_HASH_ADD(tmp1, tmp2, (uint) *key); } *nr1= tmp1; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 2c84f86fad0..babf74599ea 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -691,6 +691,8 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)), ulong *nr1, ulong *nr2) { const uchar *end; + register ulong m1= *nr1, m2= *nr2; + /* Remove end space. We have to do this to be able to compare 'AE' and 'Ä' as identical @@ -700,14 +702,14 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)), for (; key < end ; key++) { uint X= (uint) combo1map[(uint) *key]; - nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * X) + (nr1[0] << 8); - nr2[0]+=3; + MY_HASH_ADD(m1, m2, X); if ((X= combo2map[*key])) { - nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * X) + (nr1[0] << 8); - nr2[0]+=3; + MY_HASH_ADD(m1, m2, X); } } + *nr1= m1; + *nr2= m2; } diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index f4e70fd1dd5..1ef772e1d5e 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -680,6 +680,8 @@ void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)), const uchar *key, size_t len,ulong *nr1, ulong *nr2) { + register ulong m1= *nr1, m2= *nr2; + /* Remove trailing spaces. We have to do this to be able to compare 'A ' and 'A' as identical @@ -688,10 +690,10 @@ my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)), for (; key < end ; key++) { - nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * - ((uint)*key)) + (nr1[0] << 8); - nr2[0]+=3; + MY_HASH_ADD(m1, m2, (uint)*key); } + *nr1= m1; + *nr2= m2; } diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index f5484965314..0785ba35700 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -306,24 +306,48 @@ void my_hash_sort_simple(CHARSET_INFO *cs, { register const uchar *sort_order=cs->sort_order; const uchar *end; - ulong n1, n2; + register ulong m1= *nr1, m2= *nr2; + uint16 space_weight= sort_order[' ']; /* - Remove end space. We have to do this to be able to compare - 'A ' and 'A' as identical + Remove all trailing characters that are equal to space. + We have to do this to be able to compare 'A ' and 'A' as identical. + + If the key is long enough, cut the trailing spaces (0x20) using an + optimized function implemented in skip_trailing_spaces(). + + "len > 16" is just some heuristic here. + Calling skip_triling_space() for short values is not desirable, + because its initialization block may be more expensive than the + performance gained. + */ + + end= len > 16 ? skip_trailing_space(key, len) : key + len; + + /* + We removed all trailing characters that are binary equal to space 0x20. + Now remove all trailing characters that have weights equal to space. + Some 8bit simple collations may have such characters: + - cp1250_general_ci 0xA0 NO-BREAK SPACE == 0x20 SPACE + - cp1251_ukrainian_ci 0x60 GRAVE ACCENT == 0x20 SPACE + - koi8u_general_ci 0x60 GRAVE ACCENT == 0x20 SPACE */ - end= skip_trailing_space(key, len); - n1= *nr1; - n2= *nr2; + for ( ; key < end ; ) + { + if (sort_order[*--end] != space_weight) + { + end++; + break; + } + } + for (; key < (uchar*) end ; key++) { - n1^=(ulong) ((((uint) n1 & 63)+n2) * - ((uint) sort_order[(uint) *key])) + (n1 << 8); - n2+=3; + MY_HASH_ADD(m1, m2, (uint) sort_order[(uint) *key]); } - *nr1= n1; - *nr2= n2; + *nr1= m1; + *nr2= m2; } diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 5165a43e852..19ce17c3856 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -20873,21 +20873,45 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, static void my_hash_sort_uca(CHARSET_INFO *cs, my_uca_scanner_handler *scanner_handler, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { int s_res; my_uca_scanner scanner; - - slen= cs->cset->lengthsp(cs, (char*) s, slen); + int space_weight= my_space_weight(cs); + register ulong m1= *nr1, m2= *nr2; + scanner_handler->init(&scanner, cs, &cs->uca->level[0], s, slen); while ((s_res= scanner_handler->next(&scanner)) >0) { - n1[0]^= (((n1[0] & 63)+n2[0])*(s_res >> 8))+ (n1[0] << 8); - n2[0]+=3; - n1[0]^= (((n1[0] & 63)+n2[0])*(s_res & 0xFF))+ (n1[0] << 8); - n2[0]+=3; + if (s_res == space_weight) + { + /* Combine all spaces to be able to skip end spaces */ + uint count= 0; + do + { + count++; + if ((s_res= scanner_handler->next(&scanner)) <= 0) + { + /* Skip strings at end of string */ + goto end; + } + } + while (s_res == space_weight); + + /* Add back that has for the space characters */ + do + { + MY_HASH_ADD_16(m1, m2, space_weight); + } + while (--count != 0); + + } + MY_HASH_ADD_16(m1, m2, s_res); } +end: + *nr1= m1; + *nr2= m2; } diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index d2f7d34f0aa..593f9a12950 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1222,23 +1222,23 @@ my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen, static void my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { my_wc_t wc; my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; int res; const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen); MY_UNICASE_INFO *uni_plane= cs->caseinfo; + register ulong m1= *nr1, m2= *nr2; while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0) { my_tosort_utf16(uni_plane, &wc); - n1[0]^= (((n1[0] & 63) + n2[0]) * (wc & 0xFF)) + (n1[0] << 8); - n2[0]+= 3; - n1[0]^= (((n1[0] & 63) + n2[0]) * (wc >> 8)) + (n1[0] << 8); - n2[0]+= 3; + MY_HASH_ADD_16(m1, m2, wc); s+= res; } + *nr1= m1; + *nr2= m2; } @@ -1611,12 +1611,14 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs, const uchar *pos, size_t len, ulong *nr1, ulong *nr2) { const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len); + register ulong m1= *nr1, m2= *nr2; + for ( ; pos < end ; pos++) { - nr1[0]^= (ulong) ((((uint) nr1[0] & 63) + nr2[0]) * - ((uint)*pos)) + (nr1[0] << 8); - nr2[0]+= 3; + MY_HASH_ADD(m1, m2, (uint)*pos); } + *nr1= m1; + *nr2= m2; } @@ -2007,22 +2009,15 @@ my_caseup_utf32(CHARSET_INFO *cs, char *src, size_t srclen, } -static inline void -my_hash_add(ulong *n1, ulong *n2, uint ch) -{ - n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8); - n2[0]+= 3; -} - - static void my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { my_wc_t wc; int res; const uchar *e= s + slen; MY_UNICASE_INFO *uni_plane= cs->caseinfo; + register ulong m1= *nr1, m2= *nr2; /* Skip trailing spaces */ while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4]) @@ -2031,12 +2026,14 @@ my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0) { my_tosort_utf32(uni_plane, &wc); - my_hash_add(n1, n2, (uint) (wc >> 24)); - my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF); - my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF); - my_hash_add(n1, n2, (uint) (wc & 0xFF)); + MY_HASH_ADD(m1, m2, (uint) (wc >> 24)); + MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF); + MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF); + MY_HASH_ADD(m1, m2, (uint) (wc & 0xFF)); s+= res; } + *nr1= m1; + *nr2= m2; } @@ -2976,12 +2973,13 @@ static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { my_wc_t wc; int res; const uchar *e=s+slen; MY_UNICASE_INFO *uni_plane= cs->caseinfo; + register ulong m1= *nr1, m2= *nr2; while (e > s+1 && e[-1] == ' ' && e[-2] == '\0') e-= 2; @@ -2989,12 +2987,11 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0) { my_tosort_ucs2(uni_plane, &wc); - n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8); - n2[0]+=3; - n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8); - n2[0]+=3; + MY_HASH_ADD_16(m1, m2, wc); s+=res; } + *nr1= m1; + *nr2= m2; } @@ -3312,16 +3309,17 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), const uchar *key, size_t len,ulong *nr1, ulong *nr2) { const uchar *end = key + len; + register ulong m1= *nr1, m2= *nr2; while (end > key+1 && end[-1] == ' ' && end[-2] == '\0') end-= 2; for (; key < (uchar*) end ; key++) { - nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) * - ((uint)*key)) + (nr1[0] << 8); - nr2[0]+=3; + MY_HASH_ADD(m1, m2, (uint)*key); } + *nr1= m1; + *nr2= m2; } diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 96d5ea26a3c..d0a64d11c84 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5087,12 +5087,13 @@ static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen, static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { my_wc_t wc; int res; const uchar *e=s+slen; MY_UNICASE_INFO *uni_plane= cs->caseinfo; + register ulong m1= *nr1, m2= *nr2; /* Remove end space. We have to do this to be able to compare @@ -5104,12 +5105,11 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((s < e) && (res=my_utf8_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 ) { my_tosort_unicode(uni_plane, &wc, cs->state); - n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8); - n2[0]+=3; - n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8); - n2[0]+=3; + MY_HASH_ADD_16(m1, m2, wc); s+=res; } + *nr1= m1; + *nr2= m2; } @@ -7597,22 +7597,15 @@ my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen, } -static inline void -my_hash_add(ulong *n1, ulong *n2, uint ch) -{ - n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8); - n2[0]+= 3; -} - - static void my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, - ulong *n1, ulong *n2) + ulong *nr1, ulong *nr2) { my_wc_t wc; int res; const uchar *e= s + slen; MY_UNICASE_INFO *uni_plane= cs->caseinfo; + register ulong m1= *nr1, m2= *nr2; /* Remove end space. We do this to be able to compare @@ -7624,8 +7617,7 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0) { my_tosort_unicode(uni_plane, &wc, cs->state); - my_hash_add(n1, n2, (uint) (wc & 0xFF)); - my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF); + MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF)); if (wc > 0xFFFF) { /* @@ -7635,10 +7627,12 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, This is useful to keep order of records in test results, e.g. for "SHOW GRANTS". */ - my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF); + MY_HASH_ADD(m1, m2, (uint) ((wc >> 16) & 0xFF)); } s+= res; } + *nr1= m1; + *nr2= m2; } diff --git a/strings/strings_def.h b/strings/strings_def.h index 6bd8e8f5575..d601f5ca697 100644 --- a/strings/strings_def.h +++ b/strings/strings_def.h @@ -100,4 +100,20 @@ static inline const uchar *skip_trailing_space(const uchar *ptr,size_t len) end--; return (end); } + +/* Macros for hashing characters */ + +#define MY_HASH_ADD(A, B, value) \ + do { A^= (((A & 63)+B)*((value)))+ (A << 8); B+=3; } while(0) + +#define MY_HASH_ADD_16(A, B, value) \ + do { MY_HASH_ADD(A, B, ((value) & 0xFF)) ; MY_HASH_ADD(A, B, ((value >>8 ))); } while(0) + +/* + This one is needed to ensure we get the exact same hash as MariaDB 5.1 + This is needed to ensure that old partitioned tables still work as before. +*/ +#define MY_HASH_ADD_16_INV(A, B, value) \ + do { MY_HASH_ADD(A, B, ((value >> 8))) ; MY_HASH_ADD(A, B, ((value & 0xFF ))); } while(0) + #endif |