summaryrefslogtreecommitdiff
path: root/mg.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-06-19 21:00:53 -0600
committerKarl Williamson <public@khwilliamson.com>2013-07-05 22:30:00 -0600
commit1500bd919ffeae0f3252f8d1bb28b03b043d328e (patch)
tree7888c3a582de5de3e706304f20e44b7f7da7b6f7 /mg.c
parent69edd4754eb5832038093138e75d8325269476a4 (diff)
downloadperl-1500bd919ffeae0f3252f8d1bb28b03b043d328e.tar.gz
PATCH: [perl #112208]: Set utf8 flag on $! appropriately
This patch sets the utf8 flag on $! if the error string passes utf8 validity tests and has some bytes with the upper bit set. (If none have that bit set, it is an ASCII string, and whether or not it is UTF-8 is irrelevant.) This is a heuristic that could fail, but as the reference in the comments points out, this is unlikely. One can reasonably assume that a UTF-8 locale will return a UTF-8 result. So another approach would be to look at that (but we wouldn't want to turn the flag on for a purely ASCII string anyway, as that could change the semantics from existing behavior by making the string follow Unicode rules, whereas it didn't necessarily before). To do this, we could keep track of the utf8ness of the LC_MESSAGES locale. But until the heuristic in this patch is shown to not be good enough, I don't see the need to do this extra work.
Diffstat (limited to 'mg.c')
-rw-r--r--mg.c30
1 files changed, 29 insertions, 1 deletions
diff --git a/mg.c b/mg.c
index 7ff78c1e67..518d1085b4 100644
--- a/mg.c
+++ b/mg.c
@@ -1043,7 +1043,35 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
sv_setpv(sv, os2error(Perl_rc));
else
#endif
- sv_setpv(sv, errno ? Strerror(errno) : "");
+ if (! errno) {
+ sv_setpvs(sv, "");
+ }
+ else {
+
+ /* Strerror can return NULL on some platforms, which will result in
+ * 'sv' not being considered SvOK. The SvNOK_on() below will cause
+ * just the number part to be valid */
+ sv_setpv(sv, Strerror(errno));
+
+ /* In some locales the error string may come back as UTF-8, in
+ * which case we should turn on that flag. This didn't use to
+ * happen, and to avoid any possible backward compatibility issues,
+ * we don't turn on the flag unless we have to. So the flag stays
+ * off for an entirely ASCII string. We assume that if the string
+ * looks like UTF-8, it really is UTF-8: "text in any other
+ * encoding that uses bytes with the high bit set is extremely
+ * unlikely to pass a UTF-8 validity test"
+ * (http://en.wikipedia.org/wiki/Charset_detection). There is a
+ * potential that we will get it wrong however, especially on short
+ * error message text. (If it turns out to be necessary, we could
+ * also keep track if the current LC_MESSAGES locale is UTF-8) */
+ if (SvOK(sv) /* It could be that Strerror returned invalid */
+ && ! is_ascii_string((U8*) SvPVX_const(sv), SvCUR(sv))
+ && is_utf8_string((U8*) SvPVX_const(sv), SvCUR(sv)))
+ {
+ SvUTF8_on(sv);
+ }
+ }
RESTORE_ERRNO;
}