diff options
-rw-r--r-- | pp.c | 6 | ||||
-rw-r--r-- | utf8.h | 10 |
2 files changed, 14 insertions, 2 deletions
@@ -3382,8 +3382,10 @@ PP(pp_chr) if (PL_encoding && !IN_BYTES) { sv_recode_to_utf8(TARG, PL_encoding); tmps = SvPVX(TARG); - if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) || - UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) { + if (SvCUR(TARG) == 0 + || ! is_utf8_string((U8*)tmps, SvCUR(TARG)) + || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG))) + { SvGROW(TARG, 2); tmps = SvPVX(TARG); SvCUR_set(TARG, 1); @@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF. # define UTF8_IS_SURROGATE(s) (*(s) == UTF_TO_NATIVE(0xF1) \ && ((*((s) +1) == UTF_TO_NATIVE(0xB6)) \ || *((s) + 1) == UTF_TO_NATIVE(0xB7))) + /* <send> points to one beyond the end of the string that starts at <s> */ +# define UTF8_IS_REPLACEMENT(s, send) (*(s) == UTF_TO_NATIVE(0xEF) \ + && (send - s) >= 4 \ + && *((s) + 1) == UTF_TO_NATIVE(0xBF) \ + && *((s) + 2) == UTF_TO_NATIVE(0xBF) \ + && *((s) + 3) == UTF_TO_NATIVE(0xBD) #else # define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0) +# define UTF8_IS_REPLACEMENT(s, send) (*(s) == 0xEF \ + && (send - s) >= 3 \ + && *((s) + 1) == 0xBF \ + && *((s) + 2) == 0xBD) #endif /* ASCII EBCDIC I8 |