summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-02-14 10:54:32 -0700
committerKarl Williamson <public@khwilliamson.com>2013-02-25 14:57:50 -0700
commit537124e4032962cd7c5f3bd4f0ee7995cd79e8ec (patch)
treee37330ba1f315956dc962f0c1f3217c6327c81d9 /utf8.h
parent2550367793db9f9f86124a38dc944f949b315d84 (diff)
downloadperl-537124e4032962cd7c5f3bd4f0ee7995cd79e8ec.tar.gz
Add, fix comments
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h7
1 files changed, 5 insertions, 2 deletions
diff --git a/utf8.h b/utf8.h
index f990f37995..01d8f5fa20 100644
--- a/utf8.h
+++ b/utf8.h
@@ -136,7 +136,7 @@ END_EXTERN_C
U+0800..U+0FFF E0 * A0..BF 80..BF
U+1000..U+CFFF E1..EC 80..BF 80..BF
U+D000..U+D7FF ED 80..9F 80..BF
- U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++
+ U+D800..U+DFFF ED A0..BF 80..BF (surrogates)
U+E000..U+FFFF EE..EF 80..BF 80..BF
U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
@@ -144,7 +144,7 @@ END_EXTERN_C
Below are non-Unicode code points
U+110000..U+13FFFF F4 90..BF 80..BF 80..BF
U+140000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF
- U+200000: F8.. * 88..BF 80..BF 80..BF 80..BF
+ U+200000..: F8.. * 88..BF 80..BF 80..BF 80..BF
Note the gaps before several of the byte entries above marked by '*'. These are
caused by legal UTF-8 avoiding non-shortest encodings: it is technically
@@ -275,6 +275,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */
+/* Adds a UTF8 continuation byte 'new' of information to a running total code
+ * point 'old' of all the continuation bytes so far. This is designed to be
+ * used in a loop to convert from UTF-8 to the code point represented */
/* Expands to 'old' shifted left by UTF_ACCUMULATION_SHIFT, OR-ed with the
 * bits of 'new' selected by UTF_CONTINUATION_MASK.  'new' is parenthesized
 * before the U8 cast so that an expression argument (e.g. 'w >> 8') is cast
 * as a whole instead of having the cast bind to its first operand only.
 * Each argument is evaluated exactly once. */
#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT)        \
                                   | (((U8)(new)) & UTF_CONTINUATION_MASK))