summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-05-15 10:59:54 -0600
committerKarl Williamson <khw@cpan.org>2015-09-04 10:21:17 -0600
commita62b247b9f3d5cc6214f83defea2e06d12398275 (patch)
treeec2bd2e98a8464e9160031756326688ac8c7b98a /utf8.h
parent635e76f560b3b3ca075aa2cb5d6d661601968e04 (diff)
downloadperl-a62b247b9f3d5cc6214f83defea2e06d12398275.tar.gz
Add macro for converting Latin1 to UTF-8, and use it
This adds a macro that converts a code point in the ASCII 128-255 range to UTF-8, and changes existing code to use it when the range is known to be restricted to this one, rather than the previous macro which accepted a wider range (any code point representable by 2 bytes), but had an extra test on EBCDIC platforms, hence was larger than necessary and slightly slower.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h15
1 files changed, 15 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index 271796b881..85bf590162 100644
--- a/utf8.h
+++ b/utf8.h
@@ -350,6 +350,21 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
/* Longer, but more accurate name for UTF8_IS_ABOVE_LATIN1() */
#define UTF8_IS_ABOVE_LATIN1_START(c) UTF8_IS_ABOVE_LATIN1(c)
+/* Convert a UTF-8 variant Latin1 character to a native code point value.
+ * Needs just one iteration of accumulate. Use this only when it is known
+ * that the code point is < 256 and is not UTF-8 invariant; otherwise use the
+ * slower but more general TWO_BYTE_UTF8_TO_NATIVE(), which handles any code
+ * point representable by two bytes (which turns out to be up through
+ * MAX_PORTABLE_UTF8_TWO_BYTE). The two parameters are:
+ * HI: a downgradeable start byte, checked by UTF8_IS_DOWNGRADEABLE_START();
+ * LO: a continuation byte, checked by UTF8_IS_CONTINUATION().
+ * */
+#define EIGHT_BIT_UTF8_TO_NATIVE(HI, LO) \
+ ( __ASSERT_(UTF8_IS_DOWNGRADEABLE_START(HI)) \
+ __ASSERT_(UTF8_IS_CONTINUATION(LO)) \
+ LATIN1_TO_NATIVE(UTF8_ACCUMULATE(( \
+ NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), (LO))))
+
/* Convert a two (not one) byte utf8 character to a native code point value.
* Needs just one iteration of accumulate. Should not be used unless it is
* known that the two bytes are legal: 1) two-byte start, and 2) continuation.