From 20612c3b4bc03385f51fcfebf8001cb4abd41124 Mon Sep 17 00:00:00 2001 From: mdankov Date: Tue, 18 Dec 2012 20:20:33 +0000 Subject: Add:core:Cyrillic search improvements: Do accent insensitive search for Cyrillic in town and POI filter. Do Cyrilic case insensitive filtering of POIs. git-svn-id: http://svn.code.sf.net/p/navit/code/trunk/navit@5296 ffa7fe5e-494d-0410-b361-a75ebd5db220 --- navit/linguistics.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 150 insertions(+), 5 deletions(-) (limited to 'navit/linguistics.c') diff --git a/navit/linguistics.c b/navit/linguistics.c index c79b8205d..d72ad156b 100644 --- a/navit/linguistics.c +++ b/navit/linguistics.c @@ -228,16 +228,108 @@ static const char *special[][3]={ {"ð","d","dh"}, {"ŋ","n","ng"}, {"þ","t","th"}, + +/* Cyrillic capital */ +{"Ё","Е"}, +{"Й","И"}, +{"І","I"}, +{"Ї","I"}, +{"Ў","У"}, +{"Є","Е","Э"}, +{"Ґ","Г"}, +{"Ѓ","Г"}, +{"Ђ","Д"}, +{"Ќ","К"}, +{"Љ","Л","ЛЬ"}, +{"Њ","Н","НЬ"}, +{"Џ","Ц"}, + +/* Cyrillic small */ +{"ё","е"}, +{"й","и"}, +{"і","i"}, +{"ї","i"}, +{"ў","у"}, +{"є","е","э"}, +{"ґ","г"}, +{"ѓ","г"}, +{"ђ","д"}, +{"ќ","к"}, +{"љ","л","ль"}, +{"њ","н","нь"}, +{"џ","ц"}, + +}; + +/* Array of strings for case conversion + * Even elements of array are strings of upper-case letters + * Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceding even element. + * Last element of array should be NULL. + */ +static const char *upperlower[]={ +/*Latin diacritics*/ +"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ", +"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæijœðŋþ", +/*Cyrillic*/ +"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ", +"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў", + +NULL }; +static GHashTable *casefold_hash; + + +/* + * @brief Prepare an utf-8 string for case insensitive comparison. + * @param in String to prepare. 
+ * @return String prepared for case insensitive search. Result should be g_free()d after use. + */ +char* +linguistics_casefold(char *in) +{ + int len=strlen(in); + char *src=in; + char *ret=g_new(char,len+1); + char *dest=ret; + char buf[10]; + while(*src && dest-ret='A' && *src<='Z') { + *dest++=*src++ - 'A' + 'a'; + } else if (!(*src&128)) { + *dest++=*src++; + } else { + int charlen; + char *tmp, *folded; + tmp=g_utf8_find_next_char(src,NULL); + charlen=tmp-src+1; + g_strlcpy(buf,src,charlen>10?10:charlen); + folded=g_hash_table_lookup(casefold_hash,buf); + if(folded) { + while(*folded && dest-ret ae) + * UTF character, 2=replace with multiple letters if the commonly used + * replacement has multiple letter (e.g. a-umlaut -> ae) * @returns copy of string, with characters replaced */ char * @@ -246,13 +338,23 @@ linguistics_expand_special(char *str, int mode) char *in=str; char *out,*ret; int found=0; + int ret_len=strlen(str); + int in_rest=ret_len; out=ret=g_strdup(str); if (!mode) return ret; while (*in) { char *next=g_utf8_find_next_char(in, NULL); - int i,len=next-in; + int i,len; int match=0; + + if(next) + len=next-in; + else + len=strlen(in); + + in_rest-=len; + if (len > 1) { for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) { const char *search=special[i][0]; @@ -260,7 +362,13 @@ linguistics_expand_special(char *str, int mode) const char *replace=special[i][mode]; if (replace) { int replace_len=strlen(replace); - dbg_assert(replace_len <= len); + if(out-ret+replace_len+in_rest>ret_len) { + char *new_ret; + ret_len+=(replace_len-len)*10; + new_ret=g_realloc(ret,ret_len+1); + out=new_ret+(out-ret); + ret=new_ret; + } dbg(1,"found %s %s %d %s %d\n",in,search,len,replace,replace_len); strcpy(out, replace); out+=replace_len; @@ -272,7 +380,7 @@ linguistics_expand_special(char *str, int mode) } if (match) { found=1; - in=next; + in+=len; } else { while (len-- > 0) *out++=*in++; @@ -309,7 +417,44 @@ linguistics_search(char *str) return 1; } +/** + * 
@brief Copy one utf8 encoded char to newly allocated buffer. + * + * @param s pointer to the beginning of the char. + * @return newly allocated nul-terminated string containing one utf8 encoded character. + */ +static char +*linguistics_dup_utf8_char(const char *s) +{ + char *ret, *next; + next=g_utf8_find_next_char(s,NULL); + ret=g_new(char, next-s+1); + g_strlcpy(ret,s,next-s+1); + return ret; +} + void linguistics_init(void) { + int i; + + casefold_hash=g_hash_table_new_full(g_str_hash, g_str_equal,g_free,g_free); + + for (i = 0 ; upperlower[i] && upperlower[i+1]; i+=2) { + int j,k; + for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) { + char *s1=linguistics_dup_utf8_char(upperlower[i]+j); + char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k); + j+=strlen(s1); + k+=strlen(s2); + g_hash_table_insert(casefold_hash,s1,s2); /* table now owns s1/s2 and frees s1 at once if the key already exists, so both are measured before this call */ + } + } } + +void +linguistics_free(void) +{ + g_hash_table_destroy(casefold_hash); +} + -- cgit v1.2.1