summaryrefslogtreecommitdiff
path: root/mg.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2017-08-18 13:46:25 -0600
committerKarl Williamson <khw@cpan.org>2017-08-18 14:35:48 -0600
commita8f4b0c691d6f1b08948976e74087b646bf8c6ef (patch)
tree16450eb766a4f749c876d5007584fed9ee5541a6 /mg.c
parent9d8092f2bf23932b723d0dbded2b652c8d9b135e (diff)
downloadperl-a8f4b0c691d6f1b08948976e74087b646bf8c6ef.tar.gz
Improve heuristic for UTF-8 detection in "$!"
Previously, the stringification of "$!" was considered to be UTF-8 if it had any characters with the high bit set, and everything was syntactically legal UTF-8. This may fail to guess correctly on short strings where there are only a few non-ASCII bytes. This could happen in languages based on the Latin script, where many words contain no non-ASCII characters. This commit adds a check that the locale is a UTF-8 one. That check is a call to an already-existing subroutine which goes to some lengths to get an accurate answer, and should be essentially completely reliable on modern systems that have nl_langinfo() and/or mbtowc(). See the thread starting at http://nntp.perl.org/group/perl.perl5.porters/245902
Diffstat (limited to 'mg.c')
-rw-r--r--mg.c22
1 files changed, 17 insertions, 5 deletions
diff --git a/mg.c b/mg.c
index 3b341d52d1..3d08df680e 100644
--- a/mg.c
+++ b/mg.c
@@ -787,12 +787,24 @@ S_fixup_errno_string(pTHX_ SV* sv)
* UTF-8 validity test"
* (http://en.wikipedia.org/wiki/Charset_detection). There is a
* potential that we will get it wrong however, especially on short
- * error message text. (If it turns out to be necessary, we could also
- * keep track if the current LC_MESSAGES locale is UTF-8) */
- if (! IN_BYTES /* respect 'use bytes' */
+ * error message text, so do an additional check. */
+ if ( ! IN_BYTES /* respect 'use bytes' */
&& ! is_utf8_invariant_string((U8*) SvPVX_const(sv), SvCUR(sv))
- && is_utf8_string((U8*) SvPVX_const(sv), SvCUR(sv)))
- {
+ && is_utf8_string((U8*) SvPVX_const(sv), SvCUR(sv))
+
+#ifdef USE_LOCALE_MESSAGES
+
+ && _is_cur_LC_category_utf8(LC_MESSAGES)
+
+#elif defined(USE_LOCLAE_CTYPE)
+
+ /* For systems that don't have a separate message category,
+ * this assumes that they follow the CTYPE one */
+ && _is_cur_LC_category_utf8(LC_CTYPE)
+
+#endif
+
+ ) {
SvUTF8_on(sv);
}
}