summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pp.c6
-rw-r--r--utf8.h10
2 files changed, 14 insertions, 2 deletions
diff --git a/pp.c b/pp.c
index ee82cd2c3a..444489b7aa 100644
--- a/pp.c
+++ b/pp.c
@@ -3382,8 +3382,10 @@ PP(pp_chr)
if (PL_encoding && !IN_BYTES) {
sv_recode_to_utf8(TARG, PL_encoding);
tmps = SvPVX(TARG);
- if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
- UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+ if (SvCUR(TARG) == 0
+ || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+ || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+ {
SvGROW(TARG, 2);
tmps = SvPVX(TARG);
SvCUR_set(TARG, 1);
diff --git a/utf8.h b/utf8.h
index 4d80d73b97..ad2b339a6c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define UTF8_IS_SURROGATE(s) (*(s) == UTF_TO_NATIVE(0xF1) \
&& ((*((s) +1) == UTF_TO_NATIVE(0xB6)) \
|| *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+ /* <send> points to one beyond the end of the string that starts at <s> */
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == UTF_TO_NATIVE(0xEF) \
+ && (send - s) >= 4 \
+ && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 3) == UTF_TO_NATIVE(0xBD)
#else
# define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == 0xEF \
+ && (send - s) >= 3 \
+ && *((s) + 1) == 0xBF \
+ && *((s) + 2) == 0xBD)
#endif
/* ASCII EBCDIC I8