summaryrefslogtreecommitdiff
path: root/navit/linguistics.c
diff options
context:
space:
mode:
Diffstat (limited to 'navit/linguistics.c')
-rw-r--r--navit/linguistics.c876
1 files changed, 430 insertions, 446 deletions
diff --git a/navit/linguistics.c b/navit/linguistics.c
index 549f90e17..67b789482 100644
--- a/navit/linguistics.c
+++ b/navit/linguistics.c
@@ -4,260 +4,260 @@
#include "debug.h"
#include "linguistics.h"
-/* To have linguistics_casefold(linguistics_expand_special(s,i)) equal to linguistics_expand_special(linguistics_casefold(s),i),
+/* To have linguistics_casefold(linguistics_expand_special(s,i)) equal to linguistics_expand_special(linguistics_casefold(s),i),
* please always specify here lower case expansions for special letters not having case variants (like german ß).*/
-static const char *special[][3]={
-/* Capital Diacritics */
-/* ¨ Diaresis */
-{"Ä","A","AE"},
-{"Ë","E"},
-{"Ï","I"},
-{"Ö","O","OE"},
-{"Ü","U","UE"},
-{"Ÿ","Y"},
-/* ˝ Double Acute Accent */
-{"Ő","O","Ö"},
-{"Ű","U","Ü"},
-/* ´ Acute Accent */
-{"Á","A"},
-{"Ć","C"},
-{"É","E"},
-{"Í","I"},
-{"Ĺ","L"},
-{"Ń","N"},
-{"Ó","O"},
-{"Ŕ","R"},
-{"Ś","S"},
-{"Ú","U"},
-{"Ý","Y"},
-{"Ź","Z"},
-/* ˛ Ogonek (nosinė) */
-{"Ą","A"},
-{"Ę","E"},
-{"Į","I"},
-{"Ų","U"},
-/* ˙ Dot */
-{"Ċ","C"},
-{"Ė","E"},
-{"Ġ","G"},
-{"İ","I"},
-{"Ŀ","L"},
-{"Ż","Z"},
-/* – Stroke */
-{"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
-{"Ħ","H"},
-{"Ł","L"},
-{"Ŧ","T"},
-/* ˚ Ring */
-{"Å","A","AA"},
-{"Ů","U"},
-/* ˇ Caron (haček, paukščiukas) */
-{"Č","C"},
-{"Ď","D"},
-{"Ě","E"},
-{"Ľ","L"},
-{"Ň","N"},
-{"Ř","R"},
-{"Š","S"},
-{"Ť","T"},
-{"Ž","Z"},
-/* / Slash */
-{"Ø","O","OE"},
-/* ¯ Macron */
-{"Ā","A","AA"},
-{"Ē","E","EE"},
-{"Ī","I","II"},
-{"Ō","O","OO"},
-{"Ū","U","UU"},
-/* ˘ Brevis */
-{"Ă","A"},
-{"Ĕ","E"},
-{"Ğ","G"},
-{"Ĭ","I"},
-{"Ŏ","O"},
-{"Ŭ","U"},
-/* ^ Circumflex */
-{"Â","A"},
-{"Ĉ","C"},
-{"Ê","E"},
-{"Ĝ","G"},
-{"Ĥ","H"},
-{"Î","I"},
-{"Ĵ","J"},
-{"Ô","O"},
-{"Ŝ","S"},
-{"Û","U"},
-{"Ŵ","W"},
-{"Ŷ","Y"},
-/* ¸ Cedilla */
-{"Ç","C"},
-{"Ģ","G","GJ"},
-{"Ķ","K","KJ"},
-{"Ļ","L","LJ"},
-{"Ņ","N","NJ"},
-{"Ŗ","R"},
-{"Ş","S"},
-{"Ţ","T"},
-/* ~ Tilde */
-{"Ã","A"},
-{"Ĩ","I"},
-{"Ñ","N"},
-{"Õ","O"},
-{"Ũ","U"},
-/* ` Grave */
-{"À","A"},
-{"È","E"},
-{"Ì","I"},
-{"Ò","O"},
-{"Ù","U"},
-/* ligatures */
-{"Æ","A","AE"},
-{"IJ","IJ"},
-{"Œ","O","OE"},
-/* special letters */
-{"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
-{"Ŋ","N","NG"},
-{"Þ","T","TH"},
-/* Small Diacritics */
-/* ¨ Diaresis */
-{"ä","a","ae"},
-{"ë","e"},
-{"ï","i"},
-{"ö","o","oe"},
-{"ü","u","ue"},
-{"ÿ","y"},
-/* ˝ Double Acute Accent */
-{"ő","o","ö"},
-{"ű","u","ü"},
-/* ´ Acute Accent */
-{"á","a"},
-{"ć","c"},
-{"é","e"},
-{"í","i"},
-{"ĺ","l"},
-{"ń","n"},
-{"ó","o"},
-{"ŕ","r"},
-{"ś","s"},
-{"ú","u"},
-{"ý","y"},
-{"ź","z"},
-/* ˛ Ogonek (nosinė) */
-{"ą","a"},
-{"ę","e"},
-{"į","i"},
-{"ų","u"},
-/* ˙ Dot (and dotless i) */
-{"ċ","c"},
-{"ė","e"},
-{"ġ","g"},
-{"ı","i"},
-{"ŀ","l"},
-{"ż","z"},
-/* – Stroke */
-{"đ","d","dj"},
-{"ħ","h"},
-{"ł","l"},
-{"ŧ","t"},
-/* ˚ Ring */
-{"å","a", "aa"},
-{"ů","u"},
-/* ˇ Caron (haček, paukščiukas) */
-{"č","c"},
-{"ď","d"},
-{"ě","e"},
-{"ľ","l"},
-{"ň","n"},
-{"ř","r"},
-{"š","s"},
-{"ť","t"},
-{"ž","z"},
-/* / Slash */
-{"ø","o", "oe"},
-/* Macron */
-{"ā","a","aa"},
-{"ē","e","ee"},
-{"ī","i","ii"},
-{"ō","o","oo"},
-{"ū","u","uu"},
-/* ˘ Brevis */
-{"ă","a"},
-{"ĕ","e"},
-{"ğ","g"},
-{"ĭ","i"},
-{"ŏ","o"},
-{"ŭ","u"},
-/* ^ Circumflex */
-{"â","a"},
-{"ĉ","c"},
-{"ê","e"},
-{"ĝ","g"},
-{"ĥ","h"},
-{"î","i"},
-{"ĵ","j"},
-{"ô","o"},
-{"ŝ","s"},
-{"û","u"},
-{"ŵ","w"},
-{"ŷ","y"},
-/* ¸ Cedilla */
-{"ç","c"},
-{"ģ","g","gj"},
-{"ķ","k","kj"},
-{"ļ","l","lj"},
-{"ņ","n","nj"},
-{"ŗ","r"},
-{"ş","s"},
-{"ţ","t"},
-/* ~ Tilde */
-{"ã","a"},
-{"ĩ","i"},
-{"õ","o"},
-{"ñ","n"},
-{"ũ","u"},
-/* ` Grave */
-{"à","a"},
-{"è","e"},
-{"ì","i"},
-{"ò","o"},
-{"ù","u"},
-/* ligatures */
-{"æ","a","ae"},
-{"ij","ij"},
-{"œ","o","oe"},
-{"ß","s","ss"},
-/* special letters */
-{"ð","d","dh"},
-{"ŋ","n","ng"},
-{"þ","t","th"},
+static const char *special[][3]= {
+ /* Capital Diacritics */
+ /* ¨ Diaresis */
+ {"Ä","A","AE"},
+ {"Ë","E"},
+ {"Ï","I"},
+ {"Ö","O","OE"},
+ {"Ü","U","UE"},
+ {"Ÿ","Y"},
+ /* ˝ Double Acute Accent */
+ {"Ő","O","Ö"},
+ {"Ű","U","Ü"},
+ /* ´ Acute Accent */
+ {"Á","A"},
+ {"Ć","C"},
+ {"É","E"},
+ {"Í","I"},
+ {"Ĺ","L"},
+ {"Ń","N"},
+ {"Ó","O"},
+ {"Ŕ","R"},
+ {"Ś","S"},
+ {"Ú","U"},
+ {"Ý","Y"},
+ {"Ź","Z"},
+ /* ˛ Ogonek (nosinė) */
+ {"Ą","A"},
+ {"Ę","E"},
+ {"Į","I"},
+ {"Ų","U"},
+ /* ˙ Dot */
+ {"Ċ","C"},
+ {"Ė","E"},
+ {"Ġ","G"},
+ {"İ","I"},
+ {"Ŀ","L"},
+ {"Ż","Z"},
+ /* – Stroke */
+ {"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
+ {"Ħ","H"},
+ {"Ł","L"},
+ {"Ŧ","T"},
+ /* ˚ Ring */
+ {"Å","A","AA"},
+ {"Ů","U"},
+ /* ˇ Caron (haček, paukščiukas) */
+ {"Č","C"},
+ {"Ď","D"},
+ {"Ě","E"},
+ {"Ľ","L"},
+ {"Ň","N"},
+ {"Ř","R"},
+ {"Š","S"},
+ {"Ť","T"},
+ {"Ž","Z"},
+ /* / Slash */
+ {"Ø","O","OE"},
+ /* ¯ Macron */
+ {"Ā","A","AA"},
+ {"Ē","E","EE"},
+ {"Ī","I","II"},
+ {"Ō","O","OO"},
+ {"Ū","U","UU"},
+ /* ˘ Brevis */
+ {"Ă","A"},
+ {"Ĕ","E"},
+ {"Ğ","G"},
+ {"Ĭ","I"},
+ {"Ŏ","O"},
+ {"Ŭ","U"},
+ /* ^ Circumflex */
+ {"Â","A"},
+ {"Ĉ","C"},
+ {"Ê","E"},
+ {"Ĝ","G"},
+ {"Ĥ","H"},
+ {"Î","I"},
+ {"Ĵ","J"},
+ {"Ô","O"},
+ {"Ŝ","S"},
+ {"Û","U"},
+ {"Ŵ","W"},
+ {"Ŷ","Y"},
+ /* ¸ Cedilla */
+ {"Ç","C"},
+ {"Ģ","G","GJ"},
+ {"Ķ","K","KJ"},
+ {"Ļ","L","LJ"},
+ {"Ņ","N","NJ"},
+ {"Ŗ","R"},
+ {"Ş","S"},
+ {"Ţ","T"},
+ /* ~ Tilde */
+ {"Ã","A"},
+ {"Ĩ","I"},
+ {"Ñ","N"},
+ {"Õ","O"},
+ {"Ũ","U"},
+ /* ` Grave */
+ {"À","A"},
+ {"È","E"},
+ {"Ì","I"},
+ {"Ò","O"},
+ {"Ù","U"},
+ /* ligatures */
+ {"Æ","A","AE"},
+ {"IJ","IJ"},
+ {"Œ","O","OE"},
+ /* special letters */
+ {"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
+ {"Ŋ","N","NG"},
+ {"Þ","T","TH"},
+ /* Small Diacritics */
+ /* ¨ Diaresis */
+ {"ä","a","ae"},
+ {"ë","e"},
+ {"ï","i"},
+ {"ö","o","oe"},
+ {"ü","u","ue"},
+ {"ÿ","y"},
+ /* ˝ Double Acute Accent */
+ {"ő","o","ö"},
+ {"ű","u","ü"},
+ /* ´ Acute Accent */
+ {"á","a"},
+ {"ć","c"},
+ {"é","e"},
+ {"í","i"},
+ {"ĺ","l"},
+ {"ń","n"},
+ {"ó","o"},
+ {"ŕ","r"},
+ {"ś","s"},
+ {"ú","u"},
+ {"ý","y"},
+ {"ź","z"},
+ /* ˛ Ogonek (nosinė) */
+ {"ą","a"},
+ {"ę","e"},
+ {"į","i"},
+ {"ų","u"},
+ /* ˙ Dot (and dotless i) */
+ {"ċ","c"},
+ {"ė","e"},
+ {"ġ","g"},
+ {"ı","i"},
+ {"ŀ","l"},
+ {"ż","z"},
+ /* – Stroke */
+ {"đ","d","dj"},
+ {"ħ","h"},
+ {"ł","l"},
+ {"ŧ","t"},
+ /* ˚ Ring */
+ {"å","a", "aa"},
+ {"ů","u"},
+ /* ˇ Caron (haček, paukščiukas) */
+ {"č","c"},
+ {"ď","d"},
+ {"ě","e"},
+ {"ľ","l"},
+ {"ň","n"},
+ {"ř","r"},
+ {"š","s"},
+ {"ť","t"},
+ {"ž","z"},
+ /* / Slash */
+ {"ø","o", "oe"},
+ /* Macron */
+ {"ā","a","aa"},
+ {"ē","e","ee"},
+ {"ī","i","ii"},
+ {"ō","o","oo"},
+ {"ū","u","uu"},
+ /* ˘ Brevis */
+ {"ă","a"},
+ {"ĕ","e"},
+ {"ğ","g"},
+ {"ĭ","i"},
+ {"ŏ","o"},
+ {"ŭ","u"},
+ /* ^ Circumflex */
+ {"â","a"},
+ {"ĉ","c"},
+ {"ê","e"},
+ {"ĝ","g"},
+ {"ĥ","h"},
+ {"î","i"},
+ {"ĵ","j"},
+ {"ô","o"},
+ {"ŝ","s"},
+ {"û","u"},
+ {"ŵ","w"},
+ {"ŷ","y"},
+ /* ¸ Cedilla */
+ {"ç","c"},
+ {"ģ","g","gj"},
+ {"ķ","k","kj"},
+ {"ļ","l","lj"},
+ {"ņ","n","nj"},
+ {"ŗ","r"},
+ {"ş","s"},
+ {"ţ","t"},
+ /* ~ Tilde */
+ {"ã","a"},
+ {"ĩ","i"},
+ {"õ","o"},
+ {"ñ","n"},
+ {"ũ","u"},
+ /* ` Grave */
+ {"à","a"},
+ {"è","e"},
+ {"ì","i"},
+ {"ò","o"},
+ {"ù","u"},
+ /* ligatures */
+ {"æ","a","ae"},
+ {"ij","ij"},
+ {"œ","o","oe"},
+ {"ß","s","ss"},
+ /* special letters */
+ {"ð","d","dh"},
+ {"ŋ","n","ng"},
+ {"þ","t","th"},
-/* Cyrillic capital */
-{"Ё","Е"},
-{"І","I"},
-{"Ї","I"},
-{"Ў","У"},
-{"Є","Е","Э"},
-{"Ґ","Г"},
-{"Ѓ","Г"},
-{"Ђ","Д"},
-{"Ќ","К"},
-{"Љ","Л","ЛЬ"},
-{"Њ","Н","НЬ"},
-{"Џ","Ц"},
+ /* Cyrillic capital */
+ {"Ё","Е"},
+ {"І","I"},
+ {"Ї","I"},
+ {"Ў","У"},
+ {"Є","Е","Э"},
+ {"Ґ","Г"},
+ {"Ѓ","Г"},
+ {"Ђ","Д"},
+ {"Ќ","К"},
+ {"Љ","Л","ЛЬ"},
+ {"Њ","Н","НЬ"},
+ {"Џ","Ц"},
-/* Cyrillic small */
-{"ё","е"},
-{"і","i"},
-{"ї","i"},
-{"ў","у"},
-{"є","е","э"},
-{"ґ","г"},
-{"ѓ","г"},
-{"ђ","д"},
-{"ќ","к"},
-{"љ","л","ль"},
-{"њ","н","нь"},
-{"џ","ц"},
+ /* Cyrillic small */
+ {"ё","е"},
+ {"і","i"},
+ {"ї","i"},
+ {"ў","у"},
+ {"є","е","э"},
+ {"ґ","г"},
+ {"ѓ","г"},
+ {"ђ","д"},
+ {"ќ","к"},
+ {"љ","л","ль"},
+ {"њ","н","нь"},
+ {"џ","ц"},
};
@@ -266,15 +266,15 @@ static const char *special[][3]={
* Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceeding even element.
* Last element of array should be NULL.
*/
-static const char *upperlower[]={
-/*Latin diacritics*/
-"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
-"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæijœðŋþ",
-/*Cyrillic*/
-"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
-"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",
+static const char *upperlower[]= {
+ /*Latin diacritics*/
+ "ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
+ "äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæijœðŋþ",
+ /*Cyrillic*/
+ "АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
+ "абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",
-NULL
+ NULL
};
static GHashTable *casefold_hash, *special_hash;
@@ -285,53 +285,49 @@ static GHashTable *casefold_hash, *special_hash;
* @param in String to prepeare.
* @return String prepared for case insensitive search. Result shoud be g_free()d after use.
*/
-char*
-linguistics_casefold(const char *in)
-{
- int len=strlen(in);
- const char *src=in;
- char *ret=g_new(char,len+1);
- char *dest=ret;
- char buf[10];
- while(*src && dest-ret<len){
- if(*src>='A' && *src<='Z') {
- *dest++=*src++ - 'A' + 'a';
- } else if (!(*src&128)) {
- *dest++=*src++;
- } else {
- int charlen;
- char *tmp, *folded;
- tmp=g_utf8_find_next_char(src,NULL);
- charlen=tmp-src+1;
- g_strlcpy(buf,src,charlen>10?10:charlen);
- folded=g_hash_table_lookup(casefold_hash,buf);
- if(folded) {
- while(*folded && dest-ret<len)
- *dest++=*folded++;
- src=tmp;
- } else {
- while(src<tmp && dest-ret<len)
- *dest++=*src++;
- }
- }
- }
- *dest=0;
- if(*src)
- dbg(lvl_error,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret);
- return ret;
+char* linguistics_casefold(const char *in) {
+ int len=strlen(in);
+ const char *src=in;
+ char *ret=g_new(char,len+1);
+ char *dest=ret;
+ char buf[10];
+ while(*src && dest-ret<len) {
+ if(*src>='A' && *src<='Z') {
+ *dest++=*src++ - 'A' + 'a';
+ } else if (!(*src&128)) {
+ *dest++=*src++;
+ } else {
+ int charlen;
+ char *tmp, *folded;
+ tmp=g_utf8_find_next_char(src,NULL);
+ charlen=tmp-src+1;
+ g_strlcpy(buf,src,charlen>10?10:charlen);
+ folded=g_hash_table_lookup(casefold_hash,buf);
+ if(folded) {
+ while(*folded && dest-ret<len)
+ *dest++=*folded++;
+ src=tmp;
+ } else {
+ while(src<tmp && dest-ret<len)
+ *dest++=*src++;
+ }
+ }
+ }
+ *dest=0;
+ if(*src)
+ dbg(lvl_error,"Casefolded string for '%s' needs extra space, result is truncated to '%s'.",in,ret);
+ return ret;
}
-static char**
-linguistics_get_special(const char *str, const char *end)
-{
- char *buf;
- int len;
- if(!end)
- end=g_utf8_find_next_char(str,NULL);
- len=end-str+1;
- buf=g_alloca(len);
- g_strlcpy(buf,str,len);
- return g_hash_table_lookup(special_hash,buf);
+static char** linguistics_get_special(const char *str, const char *end) {
+ char *buf;
+ int len;
+ if(!end)
+ end=g_utf8_find_next_char(str,NULL);
+ len=end-str+1;
+ buf=g_alloca(len);
+ g_strlcpy(buf,str,len);
+ return g_hash_table_lookup(special_hash,buf);
}
/**
@@ -339,42 +335,41 @@ linguistics_get_special(const char *str, const char *end)
*
* @param s1 First string to process, for example, an item name from the map. Will be linguistics_casefold()ed before comparison.
* @param s2 Second string to process, usually user supplied search string. Should be linguistics_casefold()ed before calling this function.
- * @param mode set to composition of linguistics_cmp_mode flags to have s1 linguistics_expand_special()ed, allow matches shorter than whole s1, or
+ * @param mode set to composition of linguistics_cmp_mode flags to have s1 linguistics_expand_special()ed, allow matches shorter than whole s1, or
* @param let matches start from any word boundary within s1
* @returns 0 when strings are equal
*/
-int linguistics_compare(const char *s1, const char *s2, enum linguistics_cmp_mode mode)
-{
- int ret=0;
- int i;
- int s2len=strlen(s2);
- char *s1f;
- /* Calling linguistics_casefold() before linguistics_expand_special() requires that result is independent of calling order. This seems
- to be true at the time of writing this comment. */
- s1f=linguistics_casefold(s1);
- for(i=0; i<3; i++) {
- char *s, *word;
- if(i>0)
- s=linguistics_expand_special(s1f,i);
- else
- s=s1f;
- word=s;
- while(word) {
- if(mode & linguistics_cmp_partial)
- ret=strncmp(word,s2,s2len);
- else
- ret=strcmp(word,s2);
- if(!ret || !(mode & linguistics_cmp_words))
- break;
- word=linguistics_next_word(word);
- }
- if(i>0)
- g_free(s);
- if(!ret || !(mode & linguistics_cmp_expand))
- break;
- }
- g_free(s1f);
- return ret;
+int linguistics_compare(const char *s1, const char *s2, enum linguistics_cmp_mode mode) {
+ int ret=0;
+ int i;
+ int s2len=strlen(s2);
+ char *s1f;
+ /* Calling linguistics_casefold() before linguistics_expand_special() requires that result is independent of calling order. This seems
+ to be true at the time of writing this comment. */
+ s1f=linguistics_casefold(s1);
+ for(i=0; i<3; i++) {
+ char *s, *word;
+ if(i>0)
+ s=linguistics_expand_special(s1f,i);
+ else
+ s=s1f;
+ word=s;
+ while(word) {
+ if(mode & linguistics_cmp_partial)
+ ret=strncmp(word,s2,s2len);
+ else
+ ret=strcmp(word,s2);
+ if(!ret || !(mode & linguistics_cmp_words))
+ break;
+ word=linguistics_next_word(word);
+ }
+ if(i>0)
+ g_free(s);
+ if(!ret || !(mode & linguistics_cmp_expand))
+ break;
+ }
+ g_free(s1f);
+ return ret;
}
/**
@@ -387,86 +382,80 @@ int linguistics_compare(const char *s1, const char *s2, enum linguistics_cmp_mod
* replacement has multitple letter (e.g. a-umlaut -> ae)
* @returns copy of string, with characters replaced
*/
-char *
-linguistics_expand_special(const char *str, int mode)
-{
- const char *in=str;
- char *out,*ret;
- int found=0;
- int ret_len=strlen(str);
- int in_rest=ret_len;
- out=ret=g_strdup(str);
- if (!mode)
- return ret;
- while (*in) {
- char *next=g_utf8_find_next_char(in, NULL);
- int len;
- int match=0;
+char *linguistics_expand_special(const char *str, int mode) {
+ const char *in=str;
+ char *out,*ret;
+ int found=0;
+ int ret_len=strlen(str);
+ int in_rest=ret_len;
+ out=ret=g_strdup(str);
+ if (!mode)
+ return ret;
+ while (*in) {
+ char *next=g_utf8_find_next_char(in, NULL);
+ int len;
+ int match=0;
- if(next)
- len=next-in;
- else
- len=strlen(in);
+ if(next)
+ len=next-in;
+ else
+ len=strlen(in);
- in_rest-=len;
-
- if (len > 1) {
- char **spc=linguistics_get_special(in, next);
- if (spc) {
- const char *replace=spc[mode];
- if (replace) {
- int replace_len=strlen(replace);
- if(out-ret+replace_len+in_rest>ret_len) {
- char *new_ret;
- ret_len+=(replace_len-len)*10;
- new_ret=g_realloc(ret,ret_len+1);
- out=new_ret+(out-ret);
- ret=new_ret;
- }
- dbg(lvl_debug,"found %s %s %d %s %d\n",in,spc[0],len,replace,replace_len);
- strcpy(out, replace);
- out+=replace_len;
- match=1;
- }
- }
- }
- if (match) {
- found=1;
- in+=len;
- } else {
- while (len-- > 0)
- *out++=*in++;
- }
- }
- *out++='\0';
- if (!found) {
- g_free(ret);
- ret=NULL;
- }
- return ret;
+ in_rest-=len;
+
+ if (len > 1) {
+ char **spc=linguistics_get_special(in, next);
+ if (spc) {
+ const char *replace=spc[mode];
+ if (replace) {
+ int replace_len=strlen(replace);
+ if(out-ret+replace_len+in_rest>ret_len) {
+ char *new_ret;
+ ret_len+=(replace_len-len)*10;
+ new_ret=g_realloc(ret,ret_len+1);
+ out=new_ret+(out-ret);
+ ret=new_ret;
+ }
+ dbg(lvl_debug,"found %s %s %d %s %d",in,spc[0],len,replace,replace_len);
+ strcpy(out, replace);
+ out+=replace_len;
+ match=1;
+ }
+ }
+ }
+ if (match) {
+ found=1;
+ in+=len;
+ } else {
+ while (len-- > 0)
+ *out++=*in++;
+ }
+ }
+ *out++='\0';
+ if (!found) {
+ g_free(ret);
+ ret=NULL;
+ }
+ return ret;
}
-char *
-linguistics_next_word(char *str)
-{
- int len=strcspn(str, LINGUISTICS_WORD_SEPARATORS_ASCII);
- if (!str[len] || !str[len+1])
- return NULL;
- return str+len+1;
+char *linguistics_next_word(char *str) {
+ int len=strcspn(str, LINGUISTICS_WORD_SEPARATORS_ASCII);
+ if (!str[len] || !str[len+1])
+ return NULL;
+ return str+len+1;
}
-int
-linguistics_search(const char *str)
-{
- if (!g_ascii_strcasecmp(str,"str"))
- return 0;
- if (!g_ascii_strcasecmp(str,"str."))
- return 0;
- if (!g_ascii_strcasecmp(str,"strasse"))
- return 0;
- if (!g_ascii_strcasecmp(str,"weg"))
- return 0;
- return 1;
+int linguistics_search(const char *str) {
+ if (!g_ascii_strcasecmp(str,"str"))
+ return 0;
+ if (!g_ascii_strcasecmp(str,"str."))
+ return 0;
+ if (!g_ascii_strcasecmp(str,"strasse"))
+ return 0;
+ if (!g_ascii_strcasecmp(str,"weg"))
+ return 0;
+ return 1;
}
/**
@@ -475,46 +464,41 @@ linguistics_search(const char *str)
* @param s pointer to the beginning of the char.
* @return newly allocated nul-terminated string containing one utf8 encoded character.
*/
-static char
-*linguistics_dup_utf8_char(const char *s)
-{
- char *ret, *next;
- next=g_utf8_find_next_char(s,NULL);
- ret=g_new(char, next-s+1);
- g_strlcpy(ret,s,next-s+1);
- return ret;
+static char
+*linguistics_dup_utf8_char(const char *s) {
+ char *ret, *next;
+ next=g_utf8_find_next_char(s,NULL);
+ ret=g_new(char, next-s+1);
+ g_strlcpy(ret,s,next-s+1);
+ return ret;
}
-void
-linguistics_init(void)
-{
- int i;
+void linguistics_init(void) {
+ int i;
+
+ casefold_hash=g_hash_table_new_full(g_str_hash, g_str_equal,g_free,g_free);
+
+ for (i = 0 ; upperlower[i]; i+=2) {
+ int j,k;
+ for(j=0,k=0; upperlower[i][j] && upperlower[i+1][k];) {
+ char *s1=linguistics_dup_utf8_char(upperlower[i]+j);
+ char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k);
+ g_hash_table_insert(casefold_hash,s1,s2);
+ j+=strlen(s1);
+ k+=strlen(s2);
+ }
+ }
- casefold_hash=g_hash_table_new_full(g_str_hash, g_str_equal,g_free,g_free);
+ special_hash=g_hash_table_new(g_str_hash, g_str_equal);
+ for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
+ g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]);
- for (i = 0 ; upperlower[i]; i+=2) {
- int j,k;
- for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) {
- char *s1=linguistics_dup_utf8_char(upperlower[i]+j);
- char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k);
- g_hash_table_insert(casefold_hash,s1,s2);
- j+=strlen(s1);
- k+=strlen(s2);
- }
- }
-
- special_hash=g_hash_table_new(g_str_hash, g_str_equal);
- for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
- g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]);
-
}
-void
-linguistics_free(void)
-{
- g_hash_table_destroy(casefold_hash);
- g_hash_table_destroy(special_hash);
- casefold_hash=NULL;
- special_hash=NULL;
+void linguistics_free(void) {
+ g_hash_table_destroy(casefold_hash);
+ g_hash_table_destroy(special_hash);
+ casefold_hash=NULL;
+ special_hash=NULL;
}