summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2014-01-29 16:44:27 -0700
committerKarl Williamson <public@khwilliamson.com>2014-01-29 17:15:15 -0700
commit1d958db20afb78823396dfee8e55ed9c90a0e548 (patch)
tree341f5214f06cb96503a724a002f592c5885cda0b
parentb9df08e4b89fc268daccd0f9f31cc74c54dc9bf0 (diff)
downloadperl-1d958db20afb78823396dfee8e55ed9c90a0e548.tar.gz
locale.c: Fix failure to find UTF-8 locales
Commit 119ee68b changed the method to determine if a locale is a UTF-8 one to a method that was usable on more platforms, by using the C99 libc function mbtowc(). I didn't realize that there needs to be a special call to this function preceding the main call, to make sure it is in its initial shift state. This commit fixes that. In looking at the results from several different platforms, I decided it is best to use nl_langinfo() in preference to mbtowc() when available, and only use mbtowc() if nl_langinfo() doesn't exist on the platform or fails to return a real result, which happens for some locales on Darwin. This commit does that as well.
-rw-r--r--locale.c65
1 files changed, 34 insertions, 31 deletions
diff --git a/locale.c b/locale.c
index 82b8414ac2..5144d8aa00 100644
--- a/locale.c
+++ b/locale.c
@@ -709,9 +709,10 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
return FALSE;
}
-#if defined(USE_LOCALE_CTYPE) && (defined(MB_CUR_MAX) || defined(HAS_NL_LANGINFO) && defined(CODESET))
+#if defined(USE_LOCALE_CTYPE) \
+ && (defined(MB_CUR_MAX) || (defined(HAS_NL_LANGINFO) && defined(CODESET)))
- { /* Next try MB_CUR_MAX or nl_langinfo if available */
+ { /* Next try nl_langinfo or MB_CUR_MAX if available */
char *save_ctype_locale = NULL;
bool is_utf8;
@@ -739,17 +740,35 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
}
/* Here the current LC_CTYPE is set to the locale of the category whose
- * information is desired. This means that MB_CUR_MAX and
- * nl_langinfo() should give the correct results */
+ * information is desired. This means that nl_langinfo() and MB_CUR_MAX
+ * should give the correct results */
-# ifdef MB_CUR_MAX
+# if defined(HAS_NL_LANGINFO) && defined(CODESET)
+ {
+ char *codeset = savepv(nl_langinfo(CODESET));
+ if (codeset && strNE(codeset, "")) {
- /* If we switched LC_CTYPE, switch back */
- if (save_ctype_locale) {
- setlocale(LC_CTYPE, save_ctype_locale);
- Safefree(save_ctype_locale);
+ /* If we switched LC_CTYPE, switch back */
+ if (save_ctype_locale) {
+ setlocale(LC_CTYPE, save_ctype_locale);
+ Safefree(save_ctype_locale);
+ }
+
+ is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
+ || foldEQ(codeset, STR_WITH_LEN("UTF8"));
+
+ Safefree(codeset);
+ Safefree(save_input_locale);
+ return is_utf8;
+ }
}
+# endif
+# ifdef MB_CUR_MAX
+
+ /* Here, either we don't have nl_langinfo, or it didn't return a
+ * codeset. Try MB_CUR_MAX */
+
/* Standard UTF-8 needs at least 4 bytes to represent the maximum
* Unicode code point. Since UTF-8 is the only non-single byte
* encoding we handle, we just say any such encoding is UTF-8, and if
@@ -766,6 +785,7 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
* result */
if (is_utf8) {
wchar_t wc;
+ (void) mbtowc(&wc, NULL, 0); /* Reset any shift state */
if (mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
!= strlen(HYPHEN_UTF8)
|| wc != (wchar_t) 0x2010)
@@ -773,32 +793,15 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
is_utf8 = FALSE;
}
}
-
# endif
- return is_utf8;
-#else
-# if defined(HAS_NL_LANGINFO) && defined(CODESET)
- {
- char *codeset = savepv(nl_langinfo(CODESET));
- if (codeset) {
-
- /* If we switched LC_CTYPE, switch back */
- if (save_ctype_locale) {
- setlocale(LC_CTYPE, save_ctype_locale);
- Safefree(save_ctype_locale);
- }
-
- is_utf8 = foldEQ(codeset, STR_WITH_LEN("UTF-8"))
- || foldEQ(codeset, STR_WITH_LEN("UTF8"));
-
- Safefree(codeset);
- Safefree(save_input_locale);
- return is_utf8;
- }
+ /* If we switched LC_CTYPE, switch back */
+ if (save_ctype_locale) {
+ setlocale(LC_CTYPE, save_ctype_locale);
+ Safefree(save_ctype_locale);
}
-# endif
+ return is_utf8;
# endif
}