diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-04-28 18:38:24 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-05-22 08:24:18 -0600 |
commit | 28936164408fd41cfaa353665e07fdb257254b20 (patch) | |
tree | 2f1507e368e61c34b7a8c75ab6639d4be7a5ea9d | |
parent | a4f7a67c079118a2f900d5f95d3cada67e3475ea (diff) | |
download | perl-28936164408fd41cfaa353665e07fdb257254b20.tar.gz |
utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it
This should speed things up slightly, as it looks directly at the UTF-8
source, instead of having to decode it first.
-rw-r--r-- | pp.c | 6 | ||||
-rw-r--r-- | utf8.h | 10 |
2 files changed, 14 insertions, 2 deletions
@@ -3382,8 +3382,10 @@ PP(pp_chr)
     if (PL_encoding && !IN_BYTES) {
         sv_recode_to_utf8(TARG, PL_encoding);
         tmps = SvPVX(TARG);
-        if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
-            UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+        if (SvCUR(TARG) == 0
+            || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+            || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+        {
             SvGROW(TARG, 2);
             tmps = SvPVX(TARG);
             SvCUR_set(TARG, 1);
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 # define UTF8_IS_SURROGATE(s) (*(s) == UTF_TO_NATIVE(0xF1) \
     && ((*((s) +1) == UTF_TO_NATIVE(0xB6)) \
     || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+ /* <send> points to one beyond the end of the string that starts at <s> */
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == UTF_TO_NATIVE(0xEF) \
+    && (send - s) >= 4 \
+    && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+    && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+    && *((s) + 3) == UTF_TO_NATIVE(0xBD)
 #else
 # define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == 0xEF \
+    && (send - s) >= 3 \
+    && *((s) + 1) == 0xBF \
+    && *((s) + 2) == 0xBD)
 #endif
 /* ASCII EBCDIC I8