utf8.h: Add comment, white-space changes

author: Karl Williamson <khw@cpan.org> 2016-09-10 22:18:16 -0600
committer: Karl Williamson <khw@cpan.org> 2016-09-17 21:10:49 -0600
commit: df863e438c2ab516c61ce7abbf3f8afdbdf56e7e (patch)
tree: cb5c49f46a4f08d4e63415e4634ede1e94e4fc1e /utf8.h
parent: 2b47960981adadbe81b9635d4ca7861c45ccdced (diff)
download: perl-df863e438c2ab516c61ce7abbf3f8afdbdf56e7e.tar.gz
1 files changed, 11 insertions, 7 deletions
diff --git a/utf8.h b/utf8.h
index ac546049f5..c7cef67cdc 100644
--- a/utf8.h
+++ b/utf8.h
@@ -223,7 +223,11 @@ As you can see, the continuation bytes all begin with C<10>, and the
 leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
-Perl's extended UTF-8 means we can have start bytes up to FF.
+Perl's extended UTF-8 means we can have start bytes up through FF, though any
+beginning with FF yields a code point that is too large for 32-bit ASCII
+platforms.  FF signals to use 13 bytes for the encoded character.  This breaks
+the paradigm that the number of leading 1 bits in the start byte gives how
+many total bytes there are in the character.
 
 */
 
@@ -914,12 +918,12 @@ is a valid UTF-8 character.
     (UNLIKELY((e) <= (s))                                                   \
     ? 0                                                                     \
     : (UTF8_IS_INVARIANT(*s))                                               \
-    ? 1                                                                     \
-    : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                   \
-      ? 0                                                                   \
-      : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE)   \
-      ? is_UTF8_CHAR_utf8_no_length_checks(s)                               \
-      : _is_utf8_char_helper(s, e, 0))
+      ? 1                                                                   \
+      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
+        ? 0                                                                 \
+        : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \
+          ? is_UTF8_CHAR_utf8_no_length_checks(s)                           \
+          : _is_utf8_char_helper(s, e, 0))
 
 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
author	Karl Williamson <khw@cpan.org>	2016-09-10 22:18:16 -0600
committer	Karl Williamson <khw@cpan.org>	2016-09-17 21:10:49 -0600
commit	df863e438c2ab516c61ce7abbf3f8afdbdf56e7e (patch)
tree	cb5c49f46a4f08d4e63415e4634ede1e94e4fc1e /utf8.h
parent	2b47960981adadbe81b9635d4ca7861c45ccdced (diff)
download	perl-df863e438c2ab516c61ce7abbf3f8afdbdf56e7e.tar.gz