summaryrefslogtreecommitdiff
path: root/navit
diff options
context:
space:
mode:
authormdankov <mdankov@ffa7fe5e-494d-0410-b361-a75ebd5db220>2012-12-18 20:20:33 +0000
committermdankov <mdankov@ffa7fe5e-494d-0410-b361-a75ebd5db220>2012-12-18 20:20:33 +0000
commit388f5c1ab48578143f87dcd913c4f4e388aa8318 (patch)
tree2e0ebf8d42683757d9c0828b9d0e4fb7eae44f57 /navit
parent39b2b37e6417d77aa457dd568d45f7649fe78e74 (diff)
downloadnavit-svn-388f5c1ab48578143f87dcd913c4f4e388aa8318.tar.gz
Add:core:Cyrillic search improvements: Do accent insensitive search for Cyrillic in town and POI filter. Do Cyrilic case insensitive filtering of POIs.
git-svn-id: http://svn.code.sf.net/p/navit/code/trunk/navit@5296 ffa7fe5e-494d-0410-b361-a75ebd5db220
Diffstat (limited to 'navit')
-rw-r--r--navit/gui/internal/gui_internal.c24
-rw-r--r--navit/linguistics.c155
-rw-r--r--navit/linguistics.h2
-rw-r--r--navit/start_real.c2
4 files changed, 170 insertions, 13 deletions
diff --git a/navit/gui/internal/gui_internal.c b/navit/gui/internal/gui_internal.c
index d77c531e..c91f2ebe 100644
--- a/navit/gui/internal/gui_internal.c
+++ b/navit/gui/internal/gui_internal.c
@@ -73,6 +73,7 @@
#include "xmlconfig.h"
#include "util.h"
#include "bookmarks.h"
+#include "linguistics.h"
#include "debug.h"
#include "fib.h"
#include "types.h"
@@ -2684,7 +2685,7 @@ static char *
removecase(char *s)
{
char *r;
- r=g_utf8_casefold(s,-1);
+ r=linguistics_casefold(s);
return r;
}
@@ -3003,6 +3004,7 @@ gui_internal_cmd_pois_item_selected(struct poi_param *param, struct item *item)
if (param->filter) {
char *long_name, *s;
GList *f;
+ int i;
if (param->isAddressFilter) {
s=gui_internal_compose_item_address_string(item);
} else if (item_attr_get(item, attr_label, &attr)) {
@@ -3014,14 +3016,20 @@ gui_internal_cmd_pois_item_selected(struct poi_param *param, struct item *item)
g_free(s);
item_attr_rewind(item);
- for(s=long_name,f=param->filter;f && s;f=g_list_next(f)) {
- s=strstr(s,f->data);
- if(!s)
- break;
- s=g_utf8_strchr(s,-1,' ');
+ match=0;
+ for(i=0;i<3 && !match;i++) {
+ char *long_name_exp=linguistics_expand_special(long_name, i);
+ for(s=long_name_exp,f=param->filter;f && s;f=g_list_next(f)) {
+ s=strstr(s,f->data);
+ if(!s) {
+ break;
+ }
+ s=g_utf8_strchr(s,-1,' ');
+ }
+ g_free(long_name_exp);
+ if(!f)
+ match=1;
}
- if(f)
- match=0;
g_free(long_name);
}
return match;
diff --git a/navit/linguistics.c b/navit/linguistics.c
index c79b8205..d72ad156 100644
--- a/navit/linguistics.c
+++ b/navit/linguistics.c
@@ -228,16 +228,108 @@ static const char *special[][3]={
{"ð","d","dh"},
{"ŋ","n","ng"},
{"þ","t","th"},
+
+/* Cyrillic capital */
+{"Ё","Е"},
+{"Й","И"},
+{"І","I"},
+{"Ї","I"},
+{"Ў","У"},
+{"Є","Е","Э"},
+{"Ґ","Г"},
+{"Ѓ","Г"},
+{"Ђ","Д"},
+{"Ќ","К"},
+{"Љ","Л","ЛЬ"},
+{"Њ","Н","НЬ"},
+{"Џ","Ц"},
+
+/* Cyrillic small */
+{"ё","е"},
+{"й","и"},
+{"і","i"},
+{"ї","i"},
+{"ў","у"},
+{"є","е","э"},
+{"ґ","г"},
+{"ѓ","г"},
+{"ђ","д"},
+{"ќ","к"},
+{"љ","л","ль"},
+{"њ","н","нь"},
+{"џ","ц"},
+
+};
+
+/* Array of strings for case conversion
+ * Even elements of array are strings of upper-case letters
+ * Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceeding even element.
+ * Last element of array should be NULL.
+ */
+static const char *upperlower[]={
+/*Latin diacritics*/
+"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
+"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæijœðŋþ",
+/*Cyrillic*/
+"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
+"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",
+
+NULL
};
+static GHashTable *casefold_hash;
+
+
+/*
+ * @brief Prepare an utf-8 string for case insensitive comparison.
+ * @param in String to prepeare.
+ * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
+ */
+char*
+linguistics_casefold(char *in)
+{
+ int len=strlen(in);
+ char *src=in;
+ char *ret=g_new(char,len+1);
+ char *dest=ret;
+ char buf[10];
+ while(*src && dest-ret<len){
+ if(*src>='A' && *src<='Z') {
+ *dest++=*src++ - 'A' + 'a';
+ } else if (!(*src&128)) {
+ *dest++=*src++;
+ } else {
+ int charlen;
+ char *tmp, *folded;
+ tmp=g_utf8_find_next_char(src,NULL);
+ charlen=tmp-src+1;
+ g_strlcpy(buf,src,charlen>10?10:charlen);
+ folded=g_hash_table_lookup(casefold_hash,buf);
+ if(folded) {
+ while(*folded && dest-ret<len)
+ *dest++=*folded++;
+ src=tmp;
+ } else {
+ while(src<tmp && dest-ret<len)
+ *dest++=*src++;
+ }
+ }
+ }
+ *dest=0;
+ if(*src)
+ dbg(0,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret);
+ return ret;
+}
+
+
/**
* @brief Replace special characters in string (e.g. umlauts) with plain letters.
* This is useful e.g. to canonicalize a string for comparison.
*
* @param str string to process
* @param mode Replacement mode. 0=do nothing, 1=replace with single
- * ASCII letter, 2=replace with multiple letters if the commonly used
- * ASCII replacement has multitple letter (e.g. a-umlaut -> ae)
+ * UTF character, 2=replace with multiple letters if the commonly used
+ * replacement has multitple letter (e.g. a-umlaut -> ae)
* @returns copy of string, with characters replaced
*/
char *
@@ -246,13 +338,23 @@ linguistics_expand_special(char *str, int mode)
char *in=str;
char *out,*ret;
int found=0;
+ int ret_len=strlen(str);
+ int in_rest=ret_len;
out=ret=g_strdup(str);
if (!mode)
return ret;
while (*in) {
char *next=g_utf8_find_next_char(in, NULL);
- int i,len=next-in;
+ int i,len;
int match=0;
+
+ if(next)
+ len=next-in;
+ else
+ len=strlen(in);
+
+ in_rest-=len;
+
if (len > 1) {
for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) {
const char *search=special[i][0];
@@ -260,7 +362,13 @@ linguistics_expand_special(char *str, int mode)
const char *replace=special[i][mode];
if (replace) {
int replace_len=strlen(replace);
- dbg_assert(replace_len <= len);
+ if(out-ret+replace_len+in_rest>ret_len) {
+ char *new_ret;
+ ret_len+=(replace_len-len)*10;
+ new_ret=g_realloc(ret,ret_len+1);
+ out=new_ret+(out-ret);
+ ret=new_ret;
+ }
dbg(1,"found %s %s %d %s %d\n",in,search,len,replace,replace_len);
strcpy(out, replace);
out+=replace_len;
@@ -272,7 +380,7 @@ linguistics_expand_special(char *str, int mode)
}
if (match) {
found=1;
- in=next;
+ in+=len;
} else {
while (len-- > 0)
*out++=*in++;
@@ -309,7 +417,44 @@ linguistics_search(char *str)
return 1;
}
+/**
+ * @brief Copy one utf8 encoded char to newly allocated buffer.
+ *
+ * @param s pointer to the beginning of the char.
+ * @return newly allocated nul-terminated string containing one utf8 encoded character.
+ */
+static char
+*linguistics_dup_utf8_char(const char *s)
+{
+ char *ret, *next;
+ next=g_utf8_find_next_char(s,NULL);
+ ret=g_new(char, next-s+1);
+ g_strlcpy(ret,s,next-s+1);
+ return ret;
+}
+
void
linguistics_init(void)
{
+ int i;
+
+ casefold_hash=g_hash_table_new_full(g_str_hash, g_str_equal,g_free,g_free);
+
+ for (i = 0 ; upperlower[i]; i+=2) {
+ int j,k;
+ for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) {
+ char *s1=linguistics_dup_utf8_char(upperlower[i]+j);
+ char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k);
+ g_hash_table_insert(casefold_hash,s1,s2);
+ j+=strlen(s1);
+ k+=strlen(s2);
+ }
+ }
}
+
+void
+linguistics_free(void)
+{
+ g_hash_table_destroy(casefold_hash);
+}
+
diff --git a/navit/linguistics.h b/navit/linguistics.h
index 2287b10a..8c15cdea 100644
--- a/navit/linguistics.h
+++ b/navit/linguistics.h
@@ -4,6 +4,8 @@ extern "C" {
char *linguistics_expand_special(char *str, int mode);
char *linguistics_next_word(char *str);
void linguistics_init(void);
+void linguistics_free(void);
+char *linguistics_casefold(char *in);
#ifdef __cplusplus
}
#endif
diff --git a/navit/start_real.c b/navit/start_real.c
index 9c301ec0..fc4a1c65 100644
--- a/navit/start_real.c
+++ b/navit/start_real.c
@@ -226,6 +226,8 @@ int main_real(int argc, const char **argv)
}
event_main_loop_run();
+ linguistics_free();
+
#ifndef HAVE_API_ANDROID
debug_finished();
#endif