diff options
-rw-r--r-- | mysql-test/main/natural_sort_key.result | 131 | ||||
-rw-r--r-- | mysql-test/main/natural_sort_key.test | 33 | ||||
-rw-r--r-- | sql/item_create.cc | 33 | ||||
-rw-r--r-- | sql/item_strfunc.cc | 372 | ||||
-rw-r--r-- | sql/item_strfunc.h | 10 |
5 files changed, 255 insertions, 324 deletions
diff --git a/mysql-test/main/natural_sort_key.result b/mysql-test/main/natural_sort_key.result index 6cfa04d76fa..501898fc7d4 100644 --- a/mysql-test/main/natural_sort_key.result +++ b/mysql-test/main/natural_sort_key.result @@ -64,106 +64,45 @@ SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( `c` varchar(30) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL, - `NATURAL_SORT_KEY(c)` varchar(45) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL + `NATURAL_SORT_KEY(c)` varchar(60) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL ) ENGINE=MyISAM DEFAULT CHARSET=latin1 DROP TABLE t1,t2; -SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead +SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead FROM ( SELECT 0 val -UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001') -UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20 +UNION VALUES ('1'),('01'),('0001') +UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27 ) AS numbers ORDER BY sortkey; value sortkey encoding_overhead -0 00 1 -0,15 00,115 2 -0.0 00.00 2 -0.1 00.01 2 -1 01 1 -1.1 01.01 2 -1.001 01.011 1 -1.02 01.020 2 -1.002 01.021 1 -1.3 01.03 2 -1.010 01.1100 2 -01 010 1 -0001 012 -1 -10 110 1 -100 2100 1 -1000 31000 1 -10000 410000 1 -100000 5100000 1 -1000000 61000000 1 -10000000 710000000 1 -100000000 8100000000 1 -1000000000 901000000000 2 -10000000000 9110000000000 2 -100000000000 92100000000000 2 -1000000000000 931000000000000 2 -10000000000000 9410000000000000 2 -100000000000000 95100000000000000 2 -1000000000000000 961000000000000000 2 -10000000000000000 9710000000000000000 2 -100000000000000000 98100000000000000000 2 -1000000000000000000 9901000000000000000000 3 -10000000000000000000 9912010000000000000000000 5 -100000000000000000000 99121100000000000000000000 5 -# Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY -SELECT val value -FROM -( -SELECT 0 val -UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) -AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL); -value -0 -0,1 -0.1 -1 -1,1 -1,001 -1,02 -1,002 -1,3 -1,010 -1.1 -1.001 -1.02 -1.002 -1.3 -1.010 -# Use ',' as decimal separator for NATURAL_SORT_KEY -SELECT val value, NATURAL_SORT_KEY(val,',') sortkey -FROM -( -SELECT 0 val -UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) -AS numbers ORDER BY sortkey; -value sortkey -0 00 -0,1 00,1 -1 01 -1,001 01,001 -1,002 01,002 -1,010 01,010 -1,02 01,02 -1,1 01,1 -1,3 01,3 -# Use '.' as decimal separator for NATURAL_SORT_KEY -SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey -FROM -( -SELECT 0 val -UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1')) -AS numbers ORDER BY sortkey; -value sortkey -0 00 -0.1 00.1 -1 01 -1.001 01.001 -1.002 01.002 -1.010 01.010 -1.02 01.02 -1.1 01.1 -1.3 01.3 -SET NAMES DEFAULT; +0 000 2 +1 010 2 +01 011 1 +0001 013 -1 +10 1100 2 +100 21000 2 +1000 310000 2 +10000 4100000 2 +100000 51000000 2 +1000000 610000000 2 +10000000 7100000000 2 +100000000 81000000000 2 +1000000000 9010000000000 3 +10000000000 91100000000000 3 +100000000000 921000000000000 3 +1000000000000 9310000000000000 3 +10000000000000 94100000000000000 3 +100000000000000 951000000000000000 3 +1000000000000000 9610000000000000000 3 +10000000000000000 97100000000000000000 3 +100000000000000000 981000000000000000000 3 +1000000000000000000 99010000000000000000000 4 +10000000000000000000 991100000000000000000000 4 +100000000000000000000 9921000000000000000000000 4 +1000000000000000000000 99310000000000000000000000 4 +10000000000000000000000 994100000000000000000000000 4 +100000000000000000000000 9951000000000000000000000000 4 +1000000000000000000000000 99610000000000000000000000000 4 +10000000000000000000000000 997100000000000000000000000000 4 +100000000000000000000000000 9981000000000000000000000000000 4 +1000000000000000000000000000 99901271000000000000000000000000000 8 diff --git a/mysql-test/main/natural_sort_key.test b/mysql-test/main/natural_sort_key.test index bee0d8e76bd..50f579c87d0 100644 --- a/mysql-test/main/natural_sort_key.test +++ b/mysql-test/main/natural_sort_key.test @@ -29,36 +29,11 @@ CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0; SHOW CREATE TABLE t2; DROP TABLE t1,t2; -#Show encoding of numbers, including fractions, and leading whitespace. -SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead +#Show encoding of numbers, with some leading whitespace. +SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead FROM ( SELECT 0 val -UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001') -UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20 +UNION VALUES ('1'),('01'),('0001') +UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27 ) AS numbers ORDER BY sortkey; - ---echo # Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY -SELECT val value -FROM -( -SELECT 0 val -UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) -AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL); - ---echo # Use ',' as decimal separator for NATURAL_SORT_KEY -SELECT val value, NATURAL_SORT_KEY(val,',') sortkey -FROM -( -SELECT 0 val -UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) -AS numbers ORDER BY sortkey; - ---echo # Use '.' as decimal separator for NATURAL_SORT_KEY -SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey -FROM -( -SELECT 0 val -UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1')) -AS numbers ORDER BY sortkey; -SET NAMES DEFAULT; diff --git a/sql/item_create.cc b/sql/item_create.cc index a8b43deb727..24ff9fb37ed 100644 --- a/sql/item_create.cc +++ b/sql/item_create.cc @@ -1622,11 +1622,10 @@ protected: virtual ~Create_func_name_const() {} }; -class Create_func_natural_sort_key : public Create_native_func +class Create_func_natural_sort_key : public Create_func_arg1 { public: - virtual Item *create_native(THD *thd, LEX_CSTRING *name, - List<Item> *item_list) override; + virtual Item *create_1_arg(THD *thd, Item *arg1) override; static Create_func_natural_sort_key s_singleton; protected: Create_func_natural_sort_key() {} @@ -4664,33 +4663,9 @@ Create_func_md5::create_1_arg(THD *thd, Item *arg1) Create_func_natural_sort_key Create_func_natural_sort_key::s_singleton; -Item *Create_func_natural_sort_key::create_native(THD *thd, LEX_CSTRING *name, - List<Item> *item_list) +Item *Create_func_natural_sort_key::create_1_arg(THD *thd, Item* arg1) { - Item *func= NULL; - int arg_count= 0; - - if (item_list != NULL) - arg_count= item_list->elements; - - Item *param_1, *param_2; - - switch (arg_count) - { - case 1: - param_1= item_list->pop(); - func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1); - break; - case 2: - param_1= item_list->pop(); - param_2= item_list->pop(); - func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1, param_2); - break; - default: - my_error(ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT, MYF(0), name->str); - break; - } - return func; + return new (thd->mem_root) Item_func_natural_sort_key(thd, arg1); } Create_func_monthname Create_func_monthname::s_singleton; diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index 59460134ef8..943c459d834 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -5440,212 +5440,264 @@ String *Item_temptable_rowid::val_str(String *str) } /** - Here is how we generate a key suitable for lexicographic comparison - from a numeric string (representing positive number). - - We prepend variable length prefix, that only encodes string length - The longer the string, the lexicographically larger is the prefix. - - Rules for generating prefix is following. - - 1. Small length (1 to 9) - Prefix is single char '0' + length -1 - This gives the range '0' - '8' - 2. Medium length(10 to 18) - Prefix is 2 chars - concat('9', '0'+length-10)' - This gives range '90'-'98' - 3. Large length(19) - Prefix is "990" - 4. Huge length (20 to 2^32-1) - Prefix stars with '99', then log10 of the length is added as char - then the number itself is added - The range is '99120' - '9994294967295' -*/ + Helper routine to encode length prefix + in natsort_encode_numeric_string(). -#define NATSORT_BUFSIZE 16 -static size_t natsort_num2str(size_t n, char *s) -{ - if (n < 10) - { - s[0]= '0' + (char)n -1; - return 1; - } + The idea is so that bigger input numbers correspond + lexicographically bigger output strings. - if (n < 19) - { - s[0]= '9'; - s[1]= '0'+ (char)n- 10; - return 2; - } + Note, that in real use the number would typically + small, as it only computes variable *length prefixes*. + + @param[in] n - the number + @param[in] s - output buffer (should be at least 26 bytes large) + + @return - length of encoding + + Here is how encoding works + + - n is from 0 to 8 + Output string calculated as '0'+n (range '0' - '8') + + - n is from 9 to 17 + Output calculated as concat('9', '0' + n -9)' + Output range: '90'-'98' - if (n == 19) + -n is from 18 to 26 + Output calculated as concat('99', '0' + n -18)' + Output range '990'-'998' + + - n is from 28 to SIZE_T_MAX + Output starts with '999', + then log10(n) is encoded as 2-digit decimal number + then the number itself is added. + Example : for 28 key is concat('999', '01' , '28') + i.e '9990128' + + Key legth is 5 + ceil(log10(n)) + + Output range is + (64bit)'9990128' - '9991918446744073709551615' + (32bit)'9990128' - '999094294967295' +*/ + +/* Largest length of encoded string.*/ +#define NATSORT_BUFSIZE 26 +static size_t natsort_encode_length(size_t n, char *s) +{ + if (n < 27) { - s[0]= '9'; - s[1]= '9'; - s[2]= '0'; - return 3; + size_t n_nines= n / 9; + if (n_nines) + memset(s, '9', n_nines); + s[n_nines]= char(n % 9 + '0'); + s[n_nines + 1]= 0; + return n_nines + 1; } - /* Here, n > 19*/ - s[0]= '9'; - s[1]= '9'; + memset(s, '9', 3); size_t log10n= 0; - for (size_t tmp= n/10; tmp; tmp /= 10) + for (size_t tmp= n / 10; tmp; tmp/= 10) log10n++; - DBUG_ASSERT(log10n && log10n < 10); - s[2]='0' + (char)log10n; - return ll2str(n,s+3,10,0) - s; + s[3]= '0' + (char) (log10n / 10); + s[4]= '0' + (char) (log10n % 10); + return longlong10_to_str(n, s + 5, 10) - s; +} + +enum class NATSORT_ERR +{ + SUCCESS= 0, + KEY_TOO_LARGE= 1, + ALLOC_ERROR= 2 +}; + +/* + Encode numeric string for natural sorting. + + @param[in] in - start of the numeric string + skipping leading zeros + + @param[in] n_digits - length of the string, + in characters, not counting leading zeros. + + @param[in] n_lead_zeros - leading zeros count. + + @param[out] out - String to write to. + + Note: + Special case, where there are only leading zeros + n_digits == 0 and n_leading_zeros > 0 + will treated as-if in = "0", n_digits = 1, n_leading_zeros + is decremented. + + The resulting encoding of the numeric string is then + + CONCAT(natsort_encode_length(n_digits), in, + natsort_encode_length(n_leading_zeros)) +*/ +static NATSORT_ERR natsort_encode_numeric_string(const char *in, + size_t n_digits, + size_t n_leading_zeros, + String *out) +{ + char buf[NATSORT_BUFSIZE]; + DBUG_ASSERT(in); + DBUG_ASSERT(n_digits); + + if (out->append(buf, natsort_encode_length(n_digits - 1, buf))) + return NATSORT_ERR::ALLOC_ERROR; + + if (out->append(in, n_digits)) + return NATSORT_ERR::ALLOC_ERROR; + + if (out->append(buf, natsort_encode_length(n_leading_zeros, buf))) + return NATSORT_ERR::ALLOC_ERROR; + + return NATSORT_ERR::SUCCESS; +} + +/* + Calculate max size of the natsort key. + + A digit expands to 3 chars - length_prefix, the digit, lead_zero_cnt(0) + + With even length L=2N, the largest key corresponds to input string + in form REPEAT('<letter><digit>',N) and the length of a key is + 3N + N = 4*N = 2*L + + With odd input length L=2*N+1, largest key corresponds to + CONCAT(<digit>,REPEAT('<letter><digit>', N) + with key length is 3 + 4*N = 2*L+1 +*/ +static size_t natsort_max_key_size(size_t input_size) +{ + return input_size * 2 + input_size % 2; } /** Convert a string to natural sort key. @param[in] in - input string - @param[out] s - output string + @param[out] out - output string We assume that memory is preallocated for the output string, so that appends do not fail. - - The lead zeros count, if positive, is stored after - the numeric string. - - Fractional parts are stored and compared as strings. - Even if they have numbers in them, they should just - be compared as strings. */ -static void to_natsort_key(const String *in, String *s, my_wc_t decimal_sep) +static NATSORT_ERR to_natsort_key(const String *in, String *out, + size_t max_key_size= (size_t) -1) { - CHARSET_INFO *cs= in->charset(); - uchar *pos= (uchar *) in->ptr(); - uchar *end= pos + in->length(); - size_t num_len= 0; - size_t leading_zeros= 0; - uchar *num_start= nullptr; - bool in_fraction= false; - size_t fraction_len=0; + size_t n_digits= 0; + size_t n_lead_zeros= 0; + size_t num_start; - for (;;) + for (size_t pos= 0;; pos++) { - my_wc_t c= 0; - int charlen= cs->mb_wc(&c, pos, end); - bool is_digit= charlen >= 1 && (c >= '0' && c <= '9'); - - if (!is_digit && (num_start || leading_zeros)) + char c= pos < in->length() ? (*in)[pos] : 0; + bool is_digit= (c >= '0' && c <= '9'); + if (!is_digit && (n_digits || n_lead_zeros)) { - /* - Handle end of digits run. - - Write prefix, natsort_num2str(length) is without leading zeros - Write numeric value, without leading zeros - If there were leading zeros, write variable length suffix - */ - char buf[NATSORT_BUFSIZE]; - if (!num_len) + /* Handle end of digits run.*/ + if (!n_digits) { - // number was zeros only - leading_zeros--; - num_len= 1; + /*We only have zeros.*/ + n_lead_zeros--; + num_start= pos - 1; + n_digits= 1; } + NATSORT_ERR err= natsort_encode_numeric_string( + in->ptr() + num_start, n_digits, n_lead_zeros, out); + if (err != NATSORT_ERR::SUCCESS) + return err; - s->append(buf, natsort_num2str(num_len, buf)); - if (!num_start) - s->append('0'); - else - s->append((const char *) num_start, pos - num_start, cs); - - if (leading_zeros) - s->append(buf, natsort_num2str(leading_zeros, buf)); + if (out->length() > max_key_size) + return NATSORT_ERR::KEY_TOO_LARGE; /* Reset state.*/ - num_len= 0; - num_start= nullptr; - leading_zeros= 0; - in_fraction= c == decimal_sep; - fraction_len= 0; + n_digits= 0; + num_start= size_t(-1); + n_lead_zeros= 0; } - if (charlen < 1) + if (pos == in->length()) break; - if (is_digit && !in_fraction) + if (!is_digit) { - /* Do not count leading zeros in num_len, and num_pos */ - if (c != '0' || num_len) - { - num_len++; - if (!num_start) - num_start= pos; - } - else - leading_zeros++; + if (out->append(c)) + return NATSORT_ERR::KEY_TOO_LARGE; } - else - { - /* Append a non-digit, or fractional part of the number (handling the same as string) */ - s->append((const char *) pos, charlen); - if (in_fraction) - { - if (!is_digit && fraction_len) - in_fraction= false; - fraction_len++; - } - } - pos+= charlen; + else if (c == '0' && !n_digits) + n_lead_zeros++; + else if (!n_digits++) + num_start= pos; + + if (out->length() + n_digits > max_key_size) + return NATSORT_ERR::KEY_TOO_LARGE; } + return NATSORT_ERR::SUCCESS; } - -String *Item_func_natural_sort_key::val_str(String *s) +String *Item_func_natural_sort_key::val_str(String *out) { if (args[0]->is_null()) { null_value= true; return nullptr; } - + NATSORT_ERR err= NATSORT_ERR::SUCCESS; String *in= args[0]->val_str(); CHARSET_INFO *cs= in->charset(); ulong max_allowed_packet= current_thd->variables.max_allowed_packet; - size_t reserve_length= (size_t) in->length() + in->length() / 2 + 2; - if (s->alloc((uint32)reserve_length)) - goto error_exit; - s->length(0); - s->set_charset(cs); - - to_natsort_key(in, s, decimal_separator()); - - DBUG_ASSERT(s->length() < reserve_length); + size_t reserve_length= std::max(natsort_max_key_size(in->length()), + (size_t) max_allowed_packet); + uint errs; + String tmp; + /* + to_natsort_key() only support charsets where digits are represented by + a single byte in range 0x30-0x39. Almost everything is OK, just utf16/32 + won't do. Full ASCII compatibility is not required, so that SJIS and SWE7 + are fine. + */ + if (cs->mbminlen != 1) + { + if (!tmp.copy(in, &my_charset_utf8mb4_bin, &errs)) + goto error_exit; + in= &tmp; + } - if (s->length() > max_allowed_packet) + if (out->alloc((uint32) reserve_length)) { - push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, - ER_WARN_ALLOWED_PACKET_OVERFLOWED, - ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(), - max_allowed_packet); + err= NATSORT_ERR::ALLOC_ERROR; goto error_exit; } - return s; - -error_exit: - null_value=true; - return nullptr; -} + out->length(0); + out->set_charset(in->charset()); + err= to_natsort_key(in, out, max_allowed_packet / cs->mbminlen); -my_wc_t Item_func_natural_sort_key::decimal_separator() -{ - if (m_decimal_separator != DECIMAL_SEP_UNDEFINED) - return m_decimal_separator; + if (err != NATSORT_ERR::SUCCESS) + { + if (err == NATSORT_ERR::KEY_TOO_LARGE) + { + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_WARN_ALLOWED_PACKET_OVERFLOWED, + ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(), + max_allowed_packet); + } + goto error_exit; + } - DBUG_ASSERT(arg_count > 1); - String *s= args[1]->val_str(); - if (s && s->length()) + if (cs->mbminlen != 1) { - my_wc_t c; - if (s->charset()->mb_wc(&c, (const uchar *) s->ptr(), - (const uchar *) s->end())) - return c; + /* output string is now utf8, convert to input charset.*/ + if (tmp.copy(out, cs, &errs) || out->copy(tmp)) + goto error_exit; } - return DECIMAL_SEP_NONE; + + return out; + +error_exit: + null_value= true; + return nullptr; } bool Item_func_natural_sort_key::fix_length_and_dec(void) @@ -5653,15 +5705,13 @@ bool Item_func_natural_sort_key::fix_length_and_dec(void) if (agg_arg_charsets_for_string_result(collation, args, 1)) return true; DBUG_ASSERT(collation.collation != NULL); - uint32 max_clen= args[0]->max_char_length(); - fix_char_length(max_clen + max_clen/2 + ((max_clen <= 3)?1:0)); - set_maybe_null(); + uint32 max_char_len= + (uint32) natsort_max_key_size(args[0]->max_char_length()); + fix_char_length(max_char_len); - if (arg_count> 1 && args[1]->can_eval_in_optimize()) - { - /* Initialize decimal separator */ - m_decimal_separator= decimal_separator(); - } + set_maybe_null(args[0]->maybe_null() || + max_char_len * collation.collation->mbmaxlen > + current_thd->variables.max_allowed_packet); return false; } diff --git a/sql/item_strfunc.h b/sql/item_strfunc.h index eb36422dbeb..63894f1d6d9 100644 --- a/sql/item_strfunc.h +++ b/sql/item_strfunc.h @@ -273,17 +273,9 @@ public: class Item_func_natural_sort_key : public Item_str_func { - my_wc_t m_decimal_separator; - static constexpr my_wc_t DECIMAL_SEP_UNDEFINED=ULONG_MAX-1; - static constexpr my_wc_t DECIMAL_SEP_NONE= ULONG_MAX; - public: Item_func_natural_sort_key(THD *thd, Item *a) - : Item_str_func(thd, a), m_decimal_separator(DECIMAL_SEP_NONE){} - - Item_func_natural_sort_key(THD *thd, Item *a, Item *b) - : Item_str_func(thd, a, b), m_decimal_separator(DECIMAL_SEP_UNDEFINED){} - my_wc_t decimal_separator(); + : Item_str_func(thd, a){}; String *val_str(String *) override; LEX_CSTRING func_name_cstring() const override { |