summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2022-08-30 13:36:30 +0300
committer Marko Mäkelä <marko.makela@mariadb.com> 2022-08-30 13:36:30 +0300
commit fe1f8f2c6b6f3b8e3383168225f9ae7853028947 (patch)
tree c27d6c68c4772c344c5fe03f9880490f6041fcd8 /strings
parent 62b418bd2897b35fdaecd1aa41cc88023e3dd114 (diff)
parent e71aca8200558d590f8b1b8dbafa9693fcf5078b (diff)
download mariadb-git-fe1f8f2c6b6f3b8e3383168225f9ae7853028947.tar.gz
Merge 10.10 into 10.11
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-czech.c 75
-rw-r--r-- strings/ctype-latin1.c 16
2 files changed, 33 insertions, 58 deletions
diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c
index ca466c232eb..89eff459215 100644
--- a/strings/ctype-czech.c
+++ b/strings/ctype-czech.c
@@ -23,13 +23,13 @@
solution was needed than the one-to-one conversion table. To
note a few, here is an example of a Czech sorting sequence:
- co < hlaska < hlska < hlava < chlapec < krtek
+ co < hlaska < hláska < hlava < chlapec < krtek
It is because some of the rules are: double char 'ch' is sorted
- between 'h' and 'i'. Accented character '' (a with acute) is
+ between 'h' and 'i'. Accented character 'á' (a with acute) is
sorted after 'a' and before 'b', but only if the word is
otherwise the same. However, because 's' is sorted before 'v'
- in hlava, the accentness of '' is overridden. There are many
+ in hlava, the accentness of 'á' is overridden. There are many
more rules.
This file defines functions my_strxfrm and my_strcoll for
@@ -42,8 +42,9 @@
passes, that's why we need four times more space for expanded
string.
- This file also contains the ISO-Latin-2 definitions of
- characters.
+ The non-ASCII literal strings in this file are encoded
+ in the iso-8859-2 / latin-2 character set
+ (https://en.wikipedia.org/wiki/ISO/IEC_8859-2)
Author: (c) 1997--1998 Jan Pazdziora, adelton@fi.muni.cz
Jan Pazdziora has a shared copyright for this code
@@ -112,7 +113,7 @@ static const struct wordvalue doubles[] = {
};
/*
- Unformal description of the algorithm:
+ Informal description of the algorithm:
We walk the string left to right.
@@ -127,7 +128,7 @@ static const struct wordvalue doubles[] = {
End of pass is marked with value 1 on the output.
- For each character, we read it's value from the table.
+ For each character, we read its value from the table.
If the value is ignore (0), we go straight to the next character.
@@ -139,31 +140,6 @@ static const struct wordvalue doubles[] = {
exists behind it, find its value.
We append 0 to the end.
----
- Neformln popis algoritmu:
-
- Prochzme etzec zleva doprava.
-
- Konec etzce je pedn bu jako parametr, nebo je to *p == 0.
- Toto je oeteno makrem IS_END.
-
- Pokud jsme doli na konec etzce pi prchodu 0, nejdeme na
- zatek, ale na uloenou pozici, protoe prvn a druh prchod
- b souasn.
-
- Konec vstupu (prchodu) ozname na vstupu hodnotou 1.
-
- Pro kad znak etzce nateme hodnotu z tdc tabulky.
-
- Jde-li o hodnotu ignorovat (0), skome ihned na dal znak..
-
- Jde-li o hodnotu konec slova (2) a je to prchod 0 nebo 1,
- peskome vechny dal 0 -- 2 a prohodme prchody.
-
- Jde-li o kompozitn znak (255), otestujeme, zda nsleduje
- sprvn do dvojice, dohledme sprvnou hodnotu.
-
- Na konci pipojme znak 0
*/
#define ADD_TO_RESULT(dest, len, totlen, value) \
@@ -336,24 +312,23 @@ my_strnxfrm_czech(CHARSET_INFO *cs __attribute__((unused)),
/*
- Neformln popis algoritmu:
-
- prochzme etzec zleva doprava
- konec etzce poznme podle *p == 0
- pokud jsme doli na konec etzce pi prchodu 0, nejdeme na
- zatek, ale na uloenou pozici, protoe prvn a druh
- prchod b souasn
- konec vstupu (prchodu) ozname na vstupu hodnotou 1
-
- nateme hodnotu z tdc tabulky
- jde-li o hodnotu ignorovat (0), skome na dal prchod
- jde-li o hodnotu konec slova (2) a je to prchod 0 nebo 1,
- peskome vechny dal 0 -- 2 a prohodme
- prchody
- jde-li o kompozitn znak (255), otestujeme, zda nsleduje
- sprvn do dvojice, dohledme sprvnou hodnotu
-
- na konci pipojme znak 0
+ Informal description of the algorithm:
+
+ we walk the string from left to right
+ we recognize the end of the string by *p == 0
+ if we reached the end of the string during pass 0, we do not go to
+ the start, but to the saved position, because the first and second
+ passes run concurrently
+ we mark the end of the pass with the value 1 on the output
+
+ then we load the value from the sorting table
+ if the value is ignore (0), we jump to the next pass
+ if the value is end of word (2) and it is pass 0 or 1,
+ we skip all the following 0 -- 2 values and switch passes
+ if it is a composite character (255), we test whether the correct
+ character follows to form a pair, and look up the correct value
+
+ at the end we append the character 0
*/
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 335c4715bf4..ce2e84666bc 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -504,19 +504,19 @@ struct charset_info_st my_charset_latin1_nopad=
*
* The modern sort order is used, where:
*
- * '' -> "ae"
- * '' -> "oe"
- * '' -> "ue"
- * '' -> "ss"
+ * 'ä' -> "ae"
+ * 'ö' -> "oe"
+ * 'ü' -> "ue"
+ * 'ß' -> "ss"
*/
/*
* This is a simple latin1 mapping table, which maps all accented
* characters to their non-accented equivalents. Note: in this
- * table, '' is mapped to 'A', '' is mapped to 'Y', etc. - all
+ * table, 'ä' is mapped to 'A', 'ÿ' is mapped to 'Y', etc. - all
* accented characters except the following are treated the same way.
- * , , , , ,
+ * Ü, ü, Ö, ö, Ä, ä
*/
static const uchar sort_order_latin1_de[] = {
@@ -582,7 +582,7 @@ static const uchar combo2map[]={
my_strnxfrm_latin_de() on both strings and compared the result strings.
This means that:
- must also matches E and A, because my_strxn_frm_latin_de() will convert
+ Ä must also match ÁE and Aè, because my_strnxfrm_latin_de() will convert
both to AE.
The other option would be to not do any accent removal in
@@ -708,7 +708,7 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)),
/*
Remove end space. We have to do this to be able to compare
- 'AE' and '' as identical
+ 'AE' and 'Ä' as identical
*/
end= skip_trailing_space(key, len);