utf8.h, utfebcdic.h: Add comments, align white space

author: Karl Williamson <khw@cpan.org> 2016-08-26 16:07:22 -0600
committer: Karl Williamson <khw@cpan.org> 2016-08-31 20:32:36 -0600
commit: a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9 (patch)
tree: e781cf0490f993a04008826a02696a7a6f7ae328
parent: 5ff889fb14e2876aef87fbed6e39779692ad2aa4 (diff)
download: perl-a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9.tar.gz
2 files changed, 56 insertions, 20 deletions
diff --git a/utf8.h b/utf8.h
index f0ee2d23a7..d600202dce 100644
--- a/utf8.h
+++ b/utf8.h
@@ -176,24 +176,30 @@ END_EXTERN_C
 
 /*
 
- The following table is from Unicode 3.2.
+ The following table is from Unicode 3.2, plus the Perl extensions for above
+ U+10FFFF
 
- Code Points		1st Byte  2nd Byte  3rd Byte  4th Byte
+ Code Points		1st Byte  2nd Byte  3rd    4th     5th     6th       7th   8th-13th
 
    U+0000..U+007F	00..7F
    U+0080..U+07FF     * C2..DF    80..BF
-   U+0800..U+0FFF	E0      * A0..BF    80..BF
-   U+1000..U+CFFF       E1..EC    80..BF    80..BF
-   U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       ED        A0..BF    80..BF  (surrogates)
-   U+E000..U+FFFF       EE..EF    80..BF    80..BF
-  U+10000..U+3FFFF	F0      * 90..BF    80..BF    80..BF
-  U+40000..U+FFFFF	F1..F3    80..BF    80..BF    80..BF
- U+100000..U+10FFFF	F4        80..8F    80..BF    80..BF
-    Below are non-Unicode code points
- U+110000..U+13FFFF	F4        90..BF    80..BF    80..BF
- U+110000..U+1FFFFF	F5..F7    80..BF    80..BF    80..BF
- U+200000..:            F8..    * 88..BF    80..BF    80..BF    80..BF
+   U+0800..U+0FFF	E0      * A0..BF  80..BF
+   U+1000..U+CFFF       E1..EC    80..BF  80..BF
+   U+D000..U+D7FF       ED        80..9F  80..BF
+   U+D800..U+DFFF       ED        A0..BF  80..BF  (surrogates)
+   U+E000..U+FFFF       EE..EF    80..BF  80..BF
+  U+10000..U+3FFFF	F0      * 90..BF  80..BF  80..BF
+  U+40000..U+FFFFF	F1..F3    80..BF  80..BF  80..BF
+ U+100000..U+10FFFF	F4        80..8F  80..BF  80..BF
+    Below are above-Unicode code points
+ U+110000..U+13FFFF	F4        90..BF  80..BF  80..BF
+ U+140000..U+1FFFFF	F5..F7    80..BF  80..BF  80..BF
+ U+200000..U+FFFFFF     F8      * 88..BF  80..BF  80..BF  80..BF
+U+1000000..U+3FFFFFF    F9..FB    80..BF  80..BF  80..BF  80..BF
+U+4000000..U+3FFFFFFF    FC     * 84..BF  80..BF  80..BF  80..BF  80..BF
+U+40000000..U+7FFFFFFF   FD       80..BF  80..BF  80..BF  80..BF  80..BF
+U+80000000..U+FFFFFFFFF  FE     * 82..BF  80..BF  80..BF  80..BF  80..BF    80..BF
+U+1000000000..           FF       80..BF  80..BF  80..BF  80..BF  80..BF  * 81..BF  80..BF
 
 Note the gaps before several of the byte entries above marked by '*'.  These are
 caused by legal UTF-8 avoiding non-shortest encodings: it is technically
@@ -654,12 +660,14 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  * BE AWARE that this test doesn't rule out malformed code points, in
  * particular overlongs */
 #ifdef EBCDIC /* Both versions assume well-formed UTF8 */
-#   define UTF8_IS_SUPER(s, e) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9          \
-                         && (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9              \
-                             || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2)))
+#   define UTF8_IS_SUPER(s, e)                                              \
+                    (        NATIVE_UTF8_TO_I8(* (U8*)  (s)) >= 0xF9        \
+                     && (    NATIVE_UTF8_TO_I8(* (U8*)  (s)) >  0xF9        \
+                         || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2)))
 #else
-#   define UTF8_IS_SUPER(s, e) (*(U8*) (s) >= 0xF4                              \
-                           && (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90)))
+#   define UTF8_IS_SUPER(s, e)                                              \
+                    (    *(U8*) (s) >= 0xF4                                 \
+                     && (*(U8*) (s) >  0xF4 || (*((U8*) (s) + 1) >= 0x90)))
 #endif
 
 /* These are now machine generated, and the 'given' clause is no longer
diff --git a/utfebcdic.h b/utfebcdic.h
index 10b666afe2..a6ba4fa6a3 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -185,8 +185,36 @@ U+40000000..U+FFFFFFFF ttuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111111 10100000 101000
 For 32-bit words, the 2nd through 7th bytes effectively function as leading
 zeros.  Above 32 bits, these fill up, with each byte yielding 5 bits of
 information, so that with 13 continuation bytes, we can handle 65 bits, just
-above what a 64 bit word can hold */
+above what a 64 bit word can hold
 
+ The following table gives the I8:
+
+   I8 Code Points      1st Byte  2nd Byte  3rd     4th     5th     6th     7th       8th  9th-14th
+
+   0x0000..0x009F       00..9F
+   0x00A0..0x00FF     * C5..C7    A0..BF
+   U+0100..U+03FF       C8..DF    A0..BF
+   U+0400..U+3FFF     * E1..EF    A0..BF  A0..BF
+   U+4000..U+7FFF       F0      * B0..BF  A0..BF  A0..BF
+   U+8000..U+D7FF       F1        A0..B5  A0..BF  A0..BF
+   U+D800..U+DFFF       F1        B6..B7  A0..BF  A0..BF (surrogates)
+   U+E000..U+FFFF       F1        B8..BF  A0..BF  A0..BF
+  U+10000..U+3FFFF	F2..F7    A0..BF  A0..BF  A0..BF
+  U+40000..U+FFFFF	F8      * A8..BF  A0..BF  A0..BF  A0..BF
+ U+100000..U+10FFFF	F9        A0..A1  A0..BF  A0..BF  A0..BF
+    Below are above-Unicode code points
+ U+110000..U+1FFFFF	F9        A2..BF  A0..BF  A0..BF  A0..BF
+ U+200000..U+3FFFFF	FA..FB    A0..BF  A0..BF  A0..BF  A0..BF
+ U+400000..U+1FFFFFF	FC      * A4..BF  A0..BF  A0..BF  A0..BF  A0..BF
+U+2000000..U+3FFFFFF	FD        A0..BF  A0..BF  A0..BF  A0..BF  A0..BF
+U+4000000..U+3FFFFFFF   FE      * A2..BF  A0..BF  A0..BF  A0..BF  A0..BF  A0..BF
+U+40000000..            FF        A0..BF  A0..BF  A0..BF  A0..BF  A0..BF  A0..BF  * A1..BF  A0..BF
+
+Note the gaps before several of the byte entries above marked by '*'.  These are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does). */
 
 /* This is a fundamental property of UTF-EBCDIC */
 #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) <  0xA0)
author	Karl Williamson <khw@cpan.org>	2016-08-26 16:07:22 -0600
committer	Karl Williamson <khw@cpan.org>	2016-08-31 20:32:36 -0600
commit	a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9 (patch)
tree	e781cf0490f993a04008826a02696a7a6f7ae328
parent	5ff889fb14e2876aef87fbed6e39779692ad2aa4 (diff)
download	perl-a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9.tar.gz