summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-04-28 18:38:24 -0600
committerKarl Williamson <public@khwilliamson.com>2012-05-22 08:24:18 -0600
commit28936164408fd41cfaa353665e07fdb257254b20 (patch)
tree2f1507e368e61c34b7a8c75ab6639d4be7a5ea9d
parenta4f7a67c079118a2f900d5f95d3cada67e3475ea (diff)
downloadperl-28936164408fd41cfaa353665e07fdb257254b20.tar.gz
utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it
This should speed things up slightly, as it looks directly at the UTF-8 source, instead of having to decode it first.
-rw-r--r--pp.c6
-rw-r--r--utf8.h10
2 files changed, 14 insertions, 2 deletions
diff --git a/pp.c b/pp.c
index ee82cd2c3a..444489b7aa 100644
--- a/pp.c
+++ b/pp.c
@@ -3382,8 +3382,10 @@ PP(pp_chr)
if (PL_encoding && !IN_BYTES) {
sv_recode_to_utf8(TARG, PL_encoding);
tmps = SvPVX(TARG);
- if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
- UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) {
+ if (SvCUR(TARG) == 0
+ || ! is_utf8_string((U8*)tmps, SvCUR(TARG))
+ || UTF8_IS_REPLACEMENT((U8*) tmps, (U8*) tmps + SvCUR(TARG)))
+ {
SvGROW(TARG, 2);
tmps = SvPVX(TARG);
SvCUR_set(TARG, 1);
diff --git a/utf8.h b/utf8.h
index 4d80d73b97..ad2b339a6c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define UTF8_IS_SURROGATE(s) (*(s) == UTF_TO_NATIVE(0xF1) \
&& ((*((s) +1) == UTF_TO_NATIVE(0xB6)) \
|| *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+ /* <send> points to one beyond the end of the string that starts at <s>. NOTE(review): UTF8_IS_SURROGATE uses start byte 0xF1; confirm 0xEF is right for this 4-byte form */
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == UTF_TO_NATIVE(0xEF) \
+ && ((send) - (s)) >= 4 \
+ && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 3) == UTF_TO_NATIVE(0xBD))
#else
# define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
+# define UTF8_IS_REPLACEMENT(s, send) (*(s) == 0xEF \
+ && ((send) - (s)) >= 3 \
+ && *((s) + 1) == 0xBF \
+ && *((s) + 2) == 0xBD) /* bytes EF BF BD are UTF-8 for U+FFFD; *(s) is read before the length test */
#endif
/* ASCII EBCDIC I8