/* Localization of proper names. Copyright (C) 2006-2016 Free Software Foundation, Inc. Written by Bruno Haible , 2006. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that the proper_name function might be candidate for attribute 'const' */ #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__ # pragma GCC diagnostic ignored "-Wsuggest-attribute=const" #endif #include /* Specification. */ #include "propername.h" #include #include #include #include #include #if HAVE_ICONV # include #endif #include "trim.h" #include "mbchar.h" #include "mbuiter.h" #include "localcharset.h" #include "c-strcase.h" #include "xstriconv.h" #include "xalloc.h" #include "gettext.h" /* Tests whether STRING contains trim (SUB), starting and ending at word boundaries. Here, instead of implementing Unicode Standard Annex #29 for determining word boundaries, we assume that trim (SUB) starts and ends with words and only test whether the part before it ends with a non-word and the part after it starts with a non-word. */ static bool mbsstr_trimmed_wordbounded (const char *string, const char *sub) { char *tsub = trim (sub); bool found = false; for (; *string != '\0';) { const char *tsub_in_string = mbsstr (string, tsub); if (tsub_in_string == NULL) break; else { if (MB_CUR_MAX > 1) { mbui_iterator_t string_iter; bool word_boundary_before; bool word_boundary_after; mbui_init (string_iter, string); word_boundary_before = true; if (mbui_cur_ptr (string_iter) < tsub_in_string) { mbchar_t last_char_before_tsub; do { if (!mbui_avail (string_iter)) abort (); last_char_before_tsub = mbui_cur (string_iter); mbui_advance (string_iter); } while (mbui_cur_ptr (string_iter) < tsub_in_string); if (mb_isalnum (last_char_before_tsub)) word_boundary_before = false; } mbui_init (string_iter, tsub_in_string); { mbui_iterator_t tsub_iter; for (mbui_init (tsub_iter, tsub); mbui_avail (tsub_iter); mbui_advance (tsub_iter)) { if (!mbui_avail (string_iter)) abort (); mbui_advance (string_iter); } } word_boundary_after = true; if (mbui_avail (string_iter)) { mbchar_t first_char_after_tsub = mbui_cur (string_iter); if (mb_isalnum (first_char_after_tsub)) word_boundary_after = false; } if (word_boundary_before && word_boundary_after) { found = true; break; } mbui_init (string_iter, tsub_in_string); if (!mbui_avail (string_iter)) break; string = tsub_in_string + mb_len (mbui_cur (string_iter)); } else { bool word_boundary_before; const char *p; bool word_boundary_after; word_boundary_before = true; if (string < tsub_in_string) if (isalnum ((unsigned char) tsub_in_string[-1])) word_boundary_before = false; p = tsub_in_string + strlen (tsub); word_boundary_after = true; if (*p != '\0') if (isalnum ((unsigned char) *p)) word_boundary_after = false; if (word_boundary_before && word_boundary_after) { found = true; break; } if (*tsub_in_string == '\0') break; string = tsub_in_string + 1; } } } free (tsub); return found; } /* Return the localization of NAME. NAME is written in ASCII. */ const char * proper_name (const char *name) { /* See whether there is a translation. */ const char *translation = gettext (name); if (translation != name) { /* See whether the translation contains the original name. */ if (mbsstr_trimmed_wordbounded (translation, name)) return translation; else { /* Return "TRANSLATION (NAME)". */ char *result = XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); sprintf (result, "%s (%s)", translation, name); return result; } } else return name; } /* Return the localization of a name whose original writing is not ASCII. NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal escape sequences. NAME_ASCII is a fallback written only with ASCII characters. */ const char * proper_name_utf8 (const char *name_ascii, const char *name_utf8) { /* See whether there is a translation. */ const char *translation = gettext (name_ascii); /* Try to convert NAME_UTF8 to the locale encoding. */ const char *locale_code = locale_charset (); char *alloc_name_converted = NULL; char *alloc_name_converted_translit = NULL; const char *name_converted = NULL; const char *name_converted_translit = NULL; const char *name; if (c_strcasecmp (locale_code, "UTF-8") != 0) { #if HAVE_ICONV name_converted = alloc_name_converted = xstr_iconv (name_utf8, "UTF-8", locale_code); # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ && !defined __UCLIBC__) \ || _LIBICONV_VERSION >= 0x0105 { char *converted_translit; size_t len = strlen (locale_code); char *locale_code_translit = XNMALLOC (len + 10 + 1, char); memcpy (locale_code_translit, locale_code, len); memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1); converted_translit = xstr_iconv (name_utf8, "UTF-8", locale_code_translit); free (locale_code_translit); if (converted_translit != NULL) { # if !_LIBICONV_VERSION /* Don't use the transliteration if it added question marks. glibc's transliteration falls back to question marks; libiconv's transliteration does not. mbschr is equivalent to strchr in this case. */ if (strchr (converted_translit, '?') != NULL) free (converted_translit); else # endif name_converted_translit = alloc_name_converted_translit = converted_translit; } } # endif #endif } else { name_converted = name_utf8; name_converted_translit = name_utf8; } /* The name in locale encoding. */ name = (name_converted != NULL ? name_converted : name_converted_translit != NULL ? name_converted_translit : name_ascii); /* See whether we have a translation. Some translators have not understood that they should use the UTF-8 form of the name, if possible. So if the translator provided a no-op translation, we ignore it. */ if (strcmp (translation, name_ascii) != 0) { /* See whether the translation contains the original name. */ if (mbsstr_trimmed_wordbounded (translation, name_ascii) || (name_converted != NULL && mbsstr_trimmed_wordbounded (translation, name_converted)) || (name_converted_translit != NULL && mbsstr_trimmed_wordbounded (translation, name_converted_translit))) { if (alloc_name_converted != NULL) free (alloc_name_converted); if (alloc_name_converted_translit != NULL) free (alloc_name_converted_translit); return translation; } else { /* Return "TRANSLATION (NAME)". */ char *result = XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); sprintf (result, "%s (%s)", translation, name); if (alloc_name_converted != NULL) free (alloc_name_converted); if (alloc_name_converted_translit != NULL) free (alloc_name_converted_translit); return result; } } else { if (alloc_name_converted != NULL && alloc_name_converted != name) free (alloc_name_converted); if (alloc_name_converted_translit != NULL && alloc_name_converted_translit != name) free (alloc_name_converted_translit); return name; } } #ifdef TEST1 # include int main (int argc, char *argv[]) { setlocale (LC_ALL, ""); if (mbsstr_trimmed_wordbounded (argv[1], argv[2])) printf("found\n"); return 0; } #endif #ifdef TEST2 # include # include int main (int argc, char *argv[]) { setlocale (LC_ALL, ""); printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard")); return 0; } #endif