summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-02-27 19:06:39 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-02-27 19:06:39 +0000
commit8c007b5a50b3b203d888850956b2075ebfd49ce5 (patch)
tree2560db8cd912a4a93322711e666da1ed5a1b0c9a
parentc99da3702f996b1fc1415a829383240bdf2f0fd2 (diff)
downloadperl-8c007b5a50b3b203d888850956b2075ebfd49ce5.tar.gz
Update the UTF-8 explanation table.
p4raw-id: //depot/perl@14900
-rw-r--r--pod/perlunicode.pod15
-rw-r--r--utf8.h27
2 files changed, 35 insertions, 7 deletions
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 7fb473ebe5..7ea87141f0 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -700,18 +700,23 @@ UTF-8 is a variable-length (1 to 6 bytes, current character allocations
require 4 bytes), byteorder independent encoding. For ASCII, UTF-8 is
transparent (and we really do mean 7-bit ASCII, not another 8-bit encoding).
-The following table is from Unicode 3.1.
+The following table is from Unicode 3.2.
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
- U+0000..U+007F 00..7F   
- U+0080..U+07FF C2..DF 80..BF   
+ U+0000..U+007F 00..7F
+ U+0080..U+07FF C2..DF 80..BF
U+0800..U+0FFF E0 A0..BF 80..BF  
- U+1000..U+FFFF E1..EF 80..BF 80..BF  
+ U+1000..U+CFFF E1..EC 80..BF 80..BF  
+ U+D000..U+D7FF ED 80..9F 80..BF  
+ U+D800..U+DFFF ******* ill-formed *******
+ U+E000..U+FFFF EE..EF 80..BF 80..BF  
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000..U+D7FF,
+the 90..BF in U+10000..U+3FFFF, and the 80..8F in U+100000..U+10FFFF.
Or, another way to look at it, as bits:
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
@@ -722,7 +727,7 @@ Or, another way to look at it, as bits:
00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tells how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
encoded character.
=item
diff --git a/utf8.h b/utf8.h
index feff1b4696..2e0b5fdb2d 100644
--- a/utf8.h
+++ b/utf8.h
@@ -63,20 +63,43 @@ END_EXTERN_C
/*
- The following table is from Unicode 3.1.
+ The following table is from Unicode 3.2.
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
U+0000..U+007F 00..7F   
U+0080..U+07FF C2..DF 80..BF   
U+0800..U+0FFF E0 A0..BF 80..BF  
- U+1000..U+FFFF E1..EF 80..BF 80..BF  
+ U+1000..U+CFFF E1..EC 80..BF 80..BF  
+ U+D000..U+D7FF ED 80..9F 80..BF  
+ U+D800..U+DFFF ******* ill-formed *******
+ U+E000..U+FFFF EE..EF 80..BF 80..BF  
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000..U+D7FF,
+the 90..BF in U+10000..U+3FFFF, and the 80..8F in U+100000..U+10FFFF.
+
*/
+/*
+ Another way to look at it, as bits:
+
+ Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
+
+ 0aaaaaaa 0aaaaaaa
+ 00000bbbbbaaaaaa 110bbbbb 10aaaaaa
+ ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa
+ 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
+
+As you can see, the continuation bytes all begin with the bits 10, and the
+leading bits of the start byte tell how many bytes there are in the
+encoded character.
+
+*/
+
+
/* True when c, converted to UV, is below 0x80.  The argument is
 * parenthesized before the cast so that a compound expression such as
 * (a / b) is converted as a whole instead of only its first operand. */
#define UNI_IS_INVARIANT(c) (((UV)(c)) < 0x80)
/* Applies UNI_IS_INVARIANT to the result of NATIVE_TO_UTF(c).
 * NOTE(review): NATIVE_TO_UTF is defined elsewhere; its own argument
 * handling is not visible here. */
#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
/* Applies UNI_IS_INVARIANT to the result of NATIVE_TO_ASCII(c).
 * NOTE(review): NATIVE_TO_ASCII is defined elsewhere; its own argument
 * handling is not visible here. */
#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))