diff options
author | monty@hundin.mysql.fi <> | 2001-09-11 01:40:52 +0300 |
---|---|---|
committer | monty@hundin.mysql.fi <> | 2001-09-11 01:40:52 +0300 |
commit | fa23b20789dc271de6b2638fbea0ccefc333c872 (patch) | |
tree | 6a962f88c6b25e60b6f09ab8b8c7eb0cbb47bc22 | |
parent | c526f5d2ace049ba9f8cecd4800c44b0c41ace17 (diff) | |
download | mariadb-git-fa23b20789dc271de6b2638fbea0ccefc333c872.tar.gz |
Fixes for German sorting order.
-rw-r--r-- | Docs/manual.texi | 33 | ||||
-rw-r--r-- | configure.in | 2 | ||||
-rw-r--r-- | myisam/mi_delete_all.c | 4 | ||||
-rw-r--r-- | myisam/mi_search.c | 19 | ||||
-rw-r--r-- | mysql-test/r/ctype_latin1_de.result | 168 | ||||
-rw-r--r-- | mysql-test/t/ctype_latin1_de-master.opt | 1 | ||||
-rw-r--r-- | mysql-test/t/ctype_latin1_de.test | 36 | ||||
-rw-r--r-- | sql/item_cmpfunc.cc | 2 | ||||
-rw-r--r-- | strings/ctype-latin1_de.c | 218 |
9 files changed, 328 insertions, 155 deletions
diff --git a/Docs/manual.texi b/Docs/manual.texi index 52256013093..8e89c9d09d2 100644 --- a/Docs/manual.texi +++ b/Docs/manual.texi @@ -748,7 +748,7 @@ is also available through the SQL interface as well. @item Full support for several different character sets, including -ISO-8859-1 (Latin1), big5, ujis, and more. For example, the +ISO-8859-1 (Latin1), german, big5, ujis, and more. For example, the Scandinavian characters `@ringaccent{a}', `@"a' and `@"o' are allowed in table and column names. @@ -20442,6 +20442,35 @@ default-character-set=character-set-name but normally this is never needed. +@menu +* German character set:: +@end menu + +@node German character set, , Character sets, Character sets +@subsubsection German character set + +To get German sorting order, you should start @code{mysqld} with +@code{--default-character-set=latin1_de}. This will give you the following +characteristics. + +When sorting and comparing strings, the following mapping is done on the +strings before doing the comparison: + +@example +ä -> ae +ö -> oe +ü -> ue +ß -> ss +@end example + +All accented characters, except @code{é} and @code{É}, are converted to +their unaccented counterparts. All letters are converted to uppercase. + +When comparing strings with @code{LIKE}, the one -> two character mapping +is not done. All letters are converted to uppercase. Accents are removed +from all letters except: @code{Ü}, @code{ü}, @code{É}, @code{é}, @code{Ö}, +@code{ö}, @code{Ä} and @code{ä}. + @node Languages, Adding character set, Character sets, Localization @subsection Non-English Error Messages @@ -46753,6 +46782,8 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}. @itemize @bullet @item +New character set @code{latin1_de} which provides correct German sorting. +@item @code{TRUNCATE TABLE} and @code{DELETE FROM table_name} are now separate functions. One bonus is that @code{DELETE FROM table_name} now returns the number of deleted rows. 
diff --git a/configure.in b/configure.in index cef422b6ebd..1c7e212939d 100644 --- a/configure.in +++ b/configure.in @@ -1826,7 +1826,7 @@ CHARSETS_AVAILABLE="big5 cp1251 cp1257 latin1 latin1_de latin2 latin5 sjis swe7 tis620 ujis usa7 win1250 win1251ukr" CHARSETS_DEPRECATED="win1251" -DEFAULT_CHARSET=latin1_de +DEFAULT_CHARSET=latin1 AC_DIVERT_POP AC_ARG_WITH(charset, diff --git a/myisam/mi_delete_all.c b/myisam/mi_delete_all.c index c3ed9455e12..2c506da865f 100644 --- a/myisam/mi_delete_all.c +++ b/myisam/mi_delete_all.c @@ -15,7 +15,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Remove all rows from a MyISAM table */ -/* This only clears the status information; The files are not truncated */ +/* This only clears the status information and truncates the data file */ #include "myisamdef.h" @@ -50,6 +50,8 @@ int mi_delete_all_rows(MI_INFO *info) myisam_log_command(MI_LOG_DELETE_ALL,info,(byte*) 0,0,0); VOID(_mi_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); + if (my_chsize(info->dfile, 0, MYF(MY_WME))) + goto err; allow_break(); /* Allow SIGHUP & SIGINT */ DBUG_RETURN(0); diff --git a/myisam/mi_search.c b/myisam/mi_search.c index 18d8ea8a4b0..b9895e9d6cd 100644 --- a/myisam/mi_search.c +++ b/myisam/mi_search.c @@ -657,19 +657,19 @@ void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos) int _mi_compare_text(CHARSET_INFO *charset_info, uchar *a, uint a_length, uchar *b, uint b_length, my_bool part_key) { - uint length= min(a_length,b_length); - uchar *end= a+ length; int flag; #ifdef USE_STRCOLL if (use_strcoll(charset_info)) { - if ((flag = my_strnncoll(charset_info, a, a_length, b, b_length))) - return flag; + /* QQ: This needs to work with part keys at some point */ + return my_strnncoll(charset_info, a, a_length, b, b_length); } else #endif { + uint length= min(a_length,b_length); + uchar *end= a+ length; uchar *sort_order=charset_info->sort_order; while (a < end) if ((flag= (int) sort_order[*a++] - (int) sort_order[*b++])) @@ 
-768,8 +768,15 @@ int _mi_key_cmp(register MI_KEYSEG *keyseg, register uchar *a, } else { - uint length=(uint) (end-a); - if ((flag=_mi_compare_text(keyseg->charset,a,length,b,length, + uint length=(uint) (end-a), a_length=length, b_length=length; + if (!(nextflag & SEARCH_PREFIX)) + { + while (a_length && a[a_length-1] == ' ') + a_length--; + while (b_length && b[b_length-1] == ' ') + b_length--; + } + if ((flag=_mi_compare_text(keyseg->charset,a,a_length,b,b_length, (my_bool) ((nextflag & SEARCH_PREFIX) && next_key_length <= 0)))) return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag); diff --git a/mysql-test/r/ctype_latin1_de.result b/mysql-test/r/ctype_latin1_de.result new file mode 100644 index 00000000000..71bed79891a --- /dev/null +++ b/mysql-test/r/ctype_latin1_de.result @@ -0,0 +1,168 @@ +a b +a 35 +ac 2 +ad 4 +ä 1 +ae 3 +ää 31 +aeae 33 +ääa 32 +aeb 6 +Äc 5 +eä 28 +o 37 +oc 15 +od 18 +ö 14 +oe 17 +Öa 16 +oeb 20 +Öc 19 +öo 30 +q 34 +s 21 +ss 22 +ß 23 +ssa 25 +ßa 27 +ßb 24 +ssc 26 +u 36 +uc 8 +ud 10 +ue 9 +Ü 11 +ueb 12 +üc 7 +uf 13 +uü 29 +é 38 +É 39 +a b +a 35 +ac 2 +ad 4 +ä 1 +ae 3 +ää 31 +aeae 33 +ääa 32 +aeb 6 +Äc 5 +eä 28 +o 37 +oc 15 +od 18 +ö 14 +oe 17 +Öa 16 +oeb 20 +Öc 19 +öo 30 +q 34 +s 21 +ss 22 +ß 23 +ssa 25 +ßa 27 +ßb 24 +ssc 26 +u 36 +uc 8 +ud 10 +ue 9 +Ü 11 +ueb 12 +üc 7 +uf 13 +uü 29 +é 38 +É 39 +a +É +é +uü +uf +üc +ueb +Ü +ue +ud +uc +u +ssc +ßb +ßa +ssa +ß +ss +s +q +öo +Öc +oeb +Öa +oe +ö +od +oc +o +eä +Äc +aeb +ääa +aeae +ää +ae +ä +ad +ac +a +Table Op Msg_type Msg_text +test.t1 check status OK +a b +Öa 16 +Öc 19 +öo 30 +a b +é 38 +É 39 +a b +a 35 +ac 2 +ad 4 +ae 3 +aeae 33 +ääa 32 +aeb 6 +Öa 16 +ssa 25 +ßa 27 +a b +u 36 +uc 8 +ud 10 +ue 9 +ueb 12 +uf 13 +uü 29 +a b +ss 22 +ssa 25 +ssc 26 +strcmp('ä','ae') strcmp('ae','ä') strcmp('aeq','äq') strcmp('äq','aeq') +0 0 0 0 +strcmp('ss','ß') strcmp('ß','ss') strcmp('ßs','sss') strcmp('ßq','ssq') +0 0 0 0 +strcmp('ä','af') strcmp('a','ä') strcmp('ää','aeq') strcmp('ää','aeaeq') +-1 -1 -1 -1 
+strcmp('ss','ßa') strcmp('ß','ssa') strcmp('sßa','sssb') strcmp('s','ß') +-1 -1 -1 -1 +strcmp('ö','oö') strcmp('Ü','uü') strcmp('ö','oeb') +-1 -1 -1 +strcmp('af','ä') strcmp('ä','a') strcmp('aeq','ää') strcmp('aeaeq','ää') +1 1 1 1 +strcmp('ßa','ss') strcmp('ssa','ß') strcmp('sssb','sßa') strcmp('ß','s') +1 1 1 1 +strcmp('u','öa') strcmp('u','ö') +1 1 diff --git a/mysql-test/t/ctype_latin1_de-master.opt b/mysql-test/t/ctype_latin1_de-master.opt new file mode 100644 index 00000000000..98accd58c46 --- /dev/null +++ b/mysql-test/t/ctype_latin1_de-master.opt @@ -0,0 +1 @@ +--default-character-set=latin1_de diff --git a/mysql-test/t/ctype_latin1_de.test b/mysql-test/t/ctype_latin1_de.test new file mode 100644 index 00000000000..1df700340da --- /dev/null +++ b/mysql-test/t/ctype_latin1_de.test @@ -0,0 +1,36 @@ +# +# Test latin_de character set +# +drop table if exists t1; +create table t1 (a char (20) not null, b int not null auto_increment, index (a,b),index(b)); +insert into t1 (a) values ('ä'),('ac'),('ae'),('ad'),('Äc'),('aeb'); +insert into t1 (a) values ('üc'),('uc'),('ue'),('ud'),('Ü'),('ueb'),('uf'); +insert into t1 (a) values ('ö'),('oc'),('Öa'),('oe'),('od'),('Öc'),('oeb'); +insert into t1 (a) values ('s'),('ss'),('ß'),('ßb'),('ssa'),('ssc'),('ßa'); +insert into t1 (a) values ('eä'),('uü'),('öo'),('ää'),('ääa'),('aeae'); +insert into t1 (a) values ('q'),('a'),('u'),('o'),('é'),('É'); +select a,b from t1 order by a,b; +select a,b from t1 order by upper(a),b; +select a from t1 order by a desc; +check table t1; +select * from t1 where a like "ö%"; +select * from t1 where a like "%É%"; +select * from t1 where a like "%Á%"; +select * from t1 where a like "%U%"; +select * from t1 where a like "%ss%"; +drop table t1; + +# The following should all be true +select strcmp('ä','ae'),strcmp('ae','ä'),strcmp('aeq','äq'),strcmp('äq','aeq'); +select strcmp('ss','ß'),strcmp('ß','ss'),strcmp('ßs','sss'),strcmp('ßq','ssq'); + +# The following should all return -1 +select 
strcmp('ä','af'),strcmp('a','ä'),strcmp('ää','aeq'),strcmp('ää','aeaeq'); +select strcmp('ss','ßa'),strcmp('ß','ssa'),strcmp('sßa','sssb'),strcmp('s','ß'); +select strcmp('ö','oö'),strcmp('Ü','uü'),strcmp('ö','oeb'); + +# The following should all return 1 +select strcmp('af','ä'),strcmp('ä','a'),strcmp('aeq','ää'),strcmp('aeaeq','ää'); +select strcmp('ßa','ss'),strcmp('ssa','ß'),strcmp('sssb','sßa'),strcmp('ß','s'); +select strcmp('u','öa'),strcmp('u','ö'); + diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc index b18237ca4cf..db1d6911119 100644 --- a/sql/item_cmpfunc.cc +++ b/sql/item_cmpfunc.cc @@ -254,7 +254,7 @@ longlong Item_func_strcmp::val_int() null_value=1; return 0; } - int value=stringcmp(a,b); + int value= binary ? stringcmp(a,b) : sortcmp(a,b); null_value=0; return !value ? 0 : (value < 0 ? (longlong) -1 : (longlong) 1); } diff --git a/strings/ctype-latin1_de.c b/strings/ctype-latin1_de.c index c73c89cef46..a0a9686e0c1 100644 --- a/strings/ctype-latin1_de.c +++ b/strings/ctype-latin1_de.c @@ -99,12 +99,10 @@ uchar to_upper_latin1_de[] = { * This is a simple latin1 mapping table, which maps all accented * characters to their non-accented equivalents. Note: in this * table, 'ä' is mapped to 'A', 'ÿ' is mapped to 'Y', etc. - all - * accented characters are treated the same way. - * - * SPECIAL NOTE: 'ß' (the sz ligature), which isn't really an - * accented 's', is mapped to 'S', to simplify the sorting - * functions. + * accented characters except the following are treated the same way. 
+ * Ü, ü, É, é, Ö, ö, Ä, ä */ + uchar sort_order_latin1_de[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -118,10 +116,10 @@ uchar sort_order_latin1_de[] = { 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159, 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175, 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191, - 65, 65, 65, 65, 65, 65, 92, 67, 69, 69, 69, 69, 73, 73, 73, 73, - 68, 78, 79, 79, 79, 79, 79,215,216, 85, 85, 85, 85, 89,222, 83, - 65, 65, 65, 65, 65, 65, 92, 67, 69, 69, 69, 69, 73, 73, 73, 73, - 68, 78, 79, 79, 79, 79, 79,247,216, 85, 85, 85, 85, 89,222, 89 + 65, 65, 65, 65,196, 65, 92, 67, 69,201, 69, 69, 73, 73, 73, 73, + 68, 78, 79, 79, 79, 79,214,215,216, 85, 85, 85,220, 89,222,223, + 65, 65, 65, 65,196, 65, 92, 67, 69,201, 69, 69, 73, 73, 73, 73, + 68, 78, 79, 79, 79, 79,214,247,216, 85, 85, 85,220, 89,222, 89 }; #define L1_AE 196 @@ -132,6 +130,39 @@ uchar sort_order_latin1_de[] = { #define L1_ue 252 #define L1_ss 223 + +/* + Some notes about the following comparison rules: + By definition, my_strnncoll_latin1_de must work exactly as if we had called + my_strnxfrm_latin1_de() on both strings and compared the resulting strings. + + This means that: + Ä must also match ÁE and Aè, because my_strnxfrm_latin1_de() will convert + both to AE. 
+ + The other option would be to not do any accent removal in + sort_order_latin1_de[] at all +*/ + + +#define CHECK_S1_COMBO(ch1, ch2, str1, str1_end, res_if_str1_smaller, str2, fst, snd, accent) \ + /* Invariant: ch1 == fst == sort_order_latin1_de[accent] && ch1 != ch2 */ \ + if (ch2 != accent) \ + { \ + ch1= fst; \ + goto normal; \ + } \ + if (str1 == str1_end) \ + return res_if_str1_smaller; \ + { \ + int diff = (int) sort_order_latin1_de[*str1] - snd; \ + if (diff) \ + return diff*(-(res_if_str1_smaller)); \ + /* They are equal (e.g., "Ae" == 'ä') */ \ + str1++; \ + } + + int my_strnncoll_latin1_de(const uchar * s1, int len1, const uchar * s2, int len2) { @@ -140,172 +171,71 @@ int my_strnncoll_latin1_de(const uchar * s1, int len1, while (s1 < e1 && s2 < e2) { - /* to_upper is used instead of sort_order, because we don't want - * 'Ä' to match "ÁE", only "AE". This couples the to_upper and - * sort_order tables together, but that is acceptable. */ - uchar c1 = to_upper_latin1_de[*s1]; - uchar c2 = to_upper_latin1_de[*s2]; + /* + Because sort_order_latin1_de doesn't convert 'Ä', Ü or ß we + can use it here. 
+ */ + uchar c1 = sort_order_latin1_de[*s1++]; + uchar c2 = sort_order_latin1_de[*s2++]; if (c1 != c2) { - switch (c1) - { - -#define CHECK_S1_COMBO(fst, snd, accent) \ - /* Invariant: c1 == fst == sort_order_latin1_de[accent] && c1 != c2 */ \ - if (c2 == accent) \ - { \ - if (s1 + 1 < e1) \ - { \ - if (to_upper_latin1_de[*(s1 + 1)] == snd) \ - { \ - /* They are equal (e.g., "Ae" == 'ä') */ \ - s1 += 2; \ - s2 += 1; \ - } \ - else \ - { \ - int diff = sort_order_latin1_de[*(s1 + 1)] - snd; \ - if (diff) \ - return diff; \ - else \ - /* Comparison between, e.g., "AÉ" and 'Ä' */ \ - return 1; \ - } \ - } \ - else \ - return -1; \ - } \ - else \ - /* The following should work even if c2 is [ÄÖÜß] */ \ - return fst - sort_order_latin1_de[c2] - + switch (c1) { case 'A': - CHECK_S1_COMBO('A', 'E', L1_AE); + CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'A', 'E', L1_AE); break; case 'O': - CHECK_S1_COMBO('O', 'E', L1_OE); + CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'O', 'E', L1_OE); break; case 'U': - CHECK_S1_COMBO('U', 'E', L1_UE); + CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'U', 'E', L1_UE); break; case 'S': - CHECK_S1_COMBO('S', 'S', L1_ss); + CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'S', 'S', L1_ss); break; - -#define CHECK_S2_COMBO(fst, snd) \ - /* Invariant: sort_order_latin1_de[c1] == fst && c1 != c2 */ \ - if (c2 == fst) \ - { \ - if (s2 + 1 < e2) \ - { \ - if (to_upper_latin1_de[*(s2 + 1)] == snd) \ - { \ - /* They are equal (e.g., 'ä' == "Ae") */ \ - s1 += 1; \ - s2 += 2; \ - } \ - else \ - { \ - int diff = sort_order_latin1_de[*(s1 + 1)] - snd; \ - if (diff) \ - return diff; \ - else \ - /* Comparison between, e.g., 'Ä' and "AÉ" */ \ - return -1; \ - } \ - } \ - else \ - return 1; \ - } \ - else \ - /* The following should work even if c2 is [ÄÖÜß] */ \ - return fst - sort_order_latin1_de[c2] - case L1_AE: - CHECK_S2_COMBO('A', 'E'); + CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'A', 'E', 'A'); break; case L1_OE: - CHECK_S2_COMBO('O', 'E'); + CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 
'O', 'E', 'O'); break; case L1_UE: - CHECK_S2_COMBO('U', 'E'); + CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'U', 'E', 'U'); break; case L1_ss: - CHECK_S2_COMBO('S', 'S'); + CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'S', 'S', 'S'); break; default: + /* + Handle the case where 'c2' is a special character + If this is true, we know that c1 can't match this character. + */ + normal: switch (c2) { case L1_AE: + return (int) c1 - (int) 'A'; case L1_OE: + return (int) c1 - (int) 'O'; case L1_UE: + return (int) c1 - (int) 'U'; case L1_ss: - /* Make sure these do not match (e.g., "Ä" != "Á") */ - return sort_order_latin1_de[c1] - sort_order_latin1_de[c2]; - break; + return (int) c1 - (int) 'S'; default: - if (sort_order_latin1_de[*s1] != sort_order_latin1_de[*s2]) - return sort_order_latin1_de[*s1] - sort_order_latin1_de[*s2]; - ++s1; - ++s2; - break; + { + int diff= (int) c1 - (int) c2; + if (diff) + return diff; } break; - -#undef CHECK_S1_COMBO -#undef CHECK_S2_COMBO - - } - } - else - { - /* In order to consistently treat "ae" == 'ä', but to NOT allow - * "aé" == 'ä', we must look ahead here to ensure that the second - * letter in a combo really is the unaccented 'e' (or 's' for - * "ss") and is not an accented character with the same sort_order. 
*/ - ++s1; - ++s2; - if (s1 < e1 && s2 < e2) - { - switch (c1) - { - case 'A': - case 'O': - case 'U': - if (sort_order_latin1_de[*s1] == 'E' && - to_upper_latin1_de[*s1] != 'E' && - to_upper_latin1_de[*s2] == 'E') - /* Comparison between, e.g., "AÉ" and "AE" */ - return 1; - if (sort_order_latin1_de[*s2] == 'E' && - to_upper_latin1_de[*s2] != 'E' && - to_upper_latin1_de[*s1] == 'E') - /* Comparison between, e.g., "AE" and "AÉ" */ - return -1; - break; - case 'S': - if (sort_order_latin1_de[*s1] == 'S' && - to_upper_latin1_de[*s1] != 'S' && - to_upper_latin1_de[*s2] == 'S') - /* Comparison between, e.g., "Sß" and "SS" */ - return 1; - if (sort_order_latin1_de[*s2] == 'S' && - to_upper_latin1_de[*s2] != 'S' && - to_upper_latin1_de[*s1] == 'S') - /* Comparison between, e.g., "SS" and "Sß" */ - return -1; - break; - default: - break; } } } } - /* A simple test of string lengths won't work -- we test to see * which string ran out first */ return s1 < e1 ? 1 : s2 < e2 ? -1 : 0; } + int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen) { const uchar *dest_orig = dest; @@ -313,22 +243,19 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen) const uchar *se = src + srclen; while (src < se && dest < de) { - switch (*src) - { + uchar chr=sort_order_latin1_de[*src]; + switch (chr) { case L1_AE: - case L1_ae: *dest++ = 'A'; if (dest < de) *dest++ = 'E'; break; case L1_OE: - case L1_oe: *dest++ = 'O'; if (dest < de) *dest++ = 'E'; break; case L1_UE: - case L1_ue: *dest++ = 'U'; if (dest < de) *dest++ = 'E'; @@ -339,7 +266,7 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen) *dest++ = 'S'; break; default: - *dest++ = sort_order_latin1_de[*src]; + *dest++= chr; break; } ++src; @@ -347,6 +274,7 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen) return dest - dest_orig; } + int my_strcoll_latin1_de(const uchar * s1, const uchar * s2) { /* XXX QQ: This should be 
fixed to not call strlen */ |