summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2014-01-21 16:25:42 -0700
committerKarl Williamson <public@khwilliamson.com>2014-01-22 11:45:59 -0700
commit119ee68b322bc73b342f2052597ac5187f18d365 (patch)
tree4feffcb3f10af8b075085c7c3f3ae97606ccd5e1
parentd646bffec1670f95f61ba05ca3da10a8deb21664 (diff)
downloadperl-119ee68b322bc73b342f2052597ac5187f18d365.tar.gz
locale.c: Find UTF-8 locales reliably on C99 platforms
locale.c has a function that tries to determine if the current POSIX locale is a UTF-8 locale. Prior to this patch, it used nl_langinfo() to determine this, falling back to heuristics if that is unavailable on the platform. nl_langinfo() is part of POSIX.1-2001. This patch adds the use of two facilities from C99, the mbtowc() function and the MB_CUR_MAX macro, that also give reliable results.
-rw-r--r--locale.c63
1 files changed, 53 insertions, 10 deletions
diff --git a/locale.c b/locale.c
index 2bae987e76..7057127762 100644
--- a/locale.c
+++ b/locale.c
@@ -676,8 +676,9 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
{
/* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
* otherwise. 'category' may not be LC_ALL. If the platform doesn't have
- * nl_langinfo(), this employs a heuristic, which hence could give the
- * wrong result. It errs on the side of not being a UTF-8 locale. */
+ * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
+ * could give the wrong result. It errs on the side of not being a UTF-8
+ * locale. */
char *save_input_locale = NULL;
STRLEN final_pos;
@@ -699,14 +700,14 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
return FALSE;
}
-#if defined(HAS_NL_LANGINFO) && defined(CODESET) && defined(USE_LOCALE_CTYPE)
+#if defined(USE_LOCALE_CTYPE) && (defined(MB_CUR_MAX) || defined(HAS_NL_LANGINFO) && defined(CODESET))
- { /* Next try nl_langinfo if available */
+ { /* Next try MB_CUR_MAX or nl_langinfo if available */
char *save_ctype_locale = NULL;
- char *codeset = NULL;
+ bool is_utf8;
- if (category != LC_CTYPE) { /* nl_langinfo works only on LC_CTYPE */
+ if (category != LC_CTYPE) { /* These work only on LC_CTYPE */
/* Get the current LC_CTYPE locale */
save_ctype_locale = stdize_locale(savepv(setlocale(LC_CTYPE, NULL)));
@@ -729,11 +730,49 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
}
/* Here the current LC_CTYPE is set to the locale of the category whose
- * information is desired. This means that nl_langinfo() should give
- * the correct results */
- codeset = savepv(nl_langinfo(CODESET));
+ * information is desired. This means that MB_CUR_MAX and
+ * nl_langinfo() should give the correct results */
+
+# ifdef MB_CUR_MAX
+
+ /* If we switched LC_CTYPE, switch back */
+ if (save_ctype_locale) {
+ setlocale(LC_CTYPE, save_ctype_locale);
+ Safefree(save_ctype_locale);
+ }
+
+ /* Standard UTF-8 needs at least 4 bytes to represent the maximum
+ * Unicode code point. Since UTF-8 is the only non-single byte
+ * encoding we handle, we just say any such encoding is UTF-8, and if
+ * turns out to be wrong, other things will fail */
+ is_utf8 = MB_CUR_MAX >= 4;
+
+ Safefree(save_input_locale);
+
+# ifdef HAS_MBTOWC
+
+ /* ... But, most system that have MB_CUR_MAX will also have mbtowc(),
+ * since they are both in the C99 standard. We can feed a known byte
+ * string to the latter function, and check that it gives the expected
+ * result */
+ if (is_utf8) {
+ wchar_t wc;
+ if (mbtowc(&wc, HYPHEN_UTF8, strlen(HYPHEN_UTF8))
+ != strlen(HYPHEN_UTF8)
+ || wc != (wchar_t) 0x2010)
+ {
+ is_utf8 = FALSE;
+ }
+ }
+
+# endif
+
+ return is_utf8;
+#else
+# if defined(HAS_NL_LANGINFO) && defined(CODESET)
+ {
+ char *codeset = savepv(nl_langinfo(CODESET));
if (codeset) {
- bool is_utf8;
/* If we switched LC_CTYPE, switch back */
if (save_ctype_locale) {
@@ -748,8 +787,12 @@ S_is_cur_LC_category_utf8(pTHX_ int category)
Safefree(save_input_locale);
return is_utf8;
}
+ }
+# endif
+# endif
}
+
cant_use_nllanginfo:
#endif /* HAS_NL_LANGINFO etc */