summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2015-02-17 15:25:21 -0700
committer: Karl Williamson <khw@cpan.org> 2015-02-18 14:03:23 -0700
commit: e23c50db6337fb5f27e78e4d7e72f052a886113d (patch)
tree: e37faf9cafae1214269f0d29513636589bf13ad6 /utf8.c
parent: ce4fe27b699be446d76ea7ae21b2dce87c97165d (diff)
download: perl-e23c50db6337fb5f27e78e4d7e72f052a886113d.tar.gz
utf8.c: Slight refactor of UTF-16 code
This eliminates a branch in the usual case, at the expense of an extra one in the rarer case, which allows us to collapse some error condition code. It sprinkles some UNLIKELYs.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 23
1 files changed, 15 insertions, 8 deletions
diff --git a/utf8.c b/utf8.c
index bf5a36e232..179a96988e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1278,19 +1278,26 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
#define LAST_HIGH_SURROGATE 0xDBFF
#define FIRST_LOW_SURROGATE 0xDC00
#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
- if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
- if (p >= pend) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
- } else {
+
+ /* This assumes that most uses will be in the first Unicode plane, not
+ * needing surrogates */
+ if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
+ && uv <= UNICODE_SURROGATE_LAST))
+ {
+ if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
+ Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ else {
UV low = (p[0] << 8) + p[1];
- p += 2;
- if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
+ if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
+ || UNLIKELY(low > LAST_LOW_SURROGATE))
+ {
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ p += 2;
uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
+ (low - FIRST_LOW_SURROGATE) + 0x10000;
}
- } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
#ifdef EBCDIC
d = uvoffuni_to_utf8_flags(d, uv, 0);