summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2016-09-10 22:18:16 -0600
committer Karl Williamson <khw@cpan.org> 2016-09-17 21:10:49 -0600
commit df863e438c2ab516c61ce7abbf3f8afdbdf56e7e (patch)
tree cb5c49f46a4f08d4e63415e4634ede1e94e4fc1e
parent 2b47960981adadbe81b9635d4ca7861c45ccdced (diff)
download perl-df863e438c2ab516c61ce7abbf3f8afdbdf56e7e.tar.gz
utf8.h: Add comment, white-space changes
-rw-r--r--utf8.h18
1 files changed, 11 insertions, 7 deletions
diff --git a/utf8.h b/utf8.h
index ac546049f5..c7cef67cdc 100644
--- a/utf8.h
+++ b/utf8.h
@@ -223,7 +223,11 @@ As you can see, the continuation bytes all begin with C<10>, and the
leading bits of the start byte tell how many bytes there are in the
encoded character.
-Perl's extended UTF-8 means we can have start bytes up to FF.
+Perl's extended UTF-8 means we can have start bytes up through FF, though any
+character beginning with FF yields a code point too large for 32-bit ASCII
+platforms. FF signals that the encoded character occupies 13 bytes. This breaks
+the paradigm that the number of leading 1 bits in the start byte gives how many
+total bytes there are in the character.
*/
@@ -914,12 +918,12 @@ is a valid UTF-8 character.
(UNLIKELY((e) <= (s)) \
? 0 \
: (UTF8_IS_INVARIANT(*s)) \
- ? 1 \
- : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
- ? 0 \
- : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \
- ? is_UTF8_CHAR_utf8_no_length_checks(s) \
- : _is_utf8_char_helper(s, e, 0))
+ ? 1 \
+ : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
+ ? 0 \
+ : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \
+ ? is_UTF8_CHAR_utf8_no_length_checks(s) \
+ : _is_utf8_char_helper(s, e, 0))
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)