summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-04-28 18:38:24 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-05-22 08:24:18 -0600
commit: 28936164408fd41cfaa353665e07fdb257254b20 (patch)
tree: 2f1507e368e61c34b7a8c75ab6639d4be7a5ea9d /utf8.h
parent: a4f7a67c079118a2f900d5f95d3cada67e3475ea (diff)
download: perl-28936164408fd41cfaa353665e07fdb257254b20.tar.gz
utf8.h, pp.c: Add UTF8_IS_REPLACEMENT macro, and use it
This should speed things up slightly, as it looks directly at the UTF-8 source, instead of having to decode it first.
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 10
1 files changed, 10 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index 4d80d73b97..ad2b339a6c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -347,8 +347,18 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define UTF8_IS_SURROGATE(s) ((s)[0] == UTF_TO_NATIVE(0xF1) /* first byte must be this exact value */ \
    && ((s)[1] == UTF_TO_NATIVE(0xB6) /* second byte: one of exactly two values */ \
        || (s)[1] == UTF_TO_NATIVE(0xB7)))
+ /* Does <s> start with the byte sequence tested below (meant to be the REPLACEMENT CHARACTER)?  <send> points to one beyond the end of the string that starts at <s>; the length is tested first so no byte at or past <send> is read.  NOTE(review): the start byte here (0xEF) is lower than the 0xF1 that UTF8_IS_SURROGATE tests, although U+FFFD lies above the surrogates -- confirm against the I8 table. */
+# define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4 \
+ && *(s) == UTF_TO_NATIVE(0xEF) \
+ && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 3) == UTF_TO_NATIVE(0xBD))
#else
# define UTF8_IS_SURROGATE(s) (0xED == (s)[0] && 0xA0 <= (s)[1]) /* examines only the first two bytes at <s> */
+# define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3 /* length first: no byte at or past <send> is read */ \
+ && *(s) == 0xEF /* with the next two bytes: EF BF BD, the UTF-8 form of U+FFFD */ \
+ && *((s) + 1) == 0xBF \
+ && *((s) + 2) == 0xBD)
#endif
/* ASCII EBCDIC I8