diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-04-28 18:30:27 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-05-22 08:24:18 -0600 |
commit | a4f7a67c079118a2f900d5f95d3cada67e3475ea (patch) | |
tree | a957c6c0d3a8dc3becde39014f267dc58a5a7ad7 /utf8.h | |
parent | b2635aa880f19db51ef9eeb9d4d2a0eeeca8228b (diff) | |
download | perl-a4f7a67c079118a2f900d5f95d3cada67e3475ea.tar.gz |
utf8.h: Simplify expressions
These expressions, while valid, are more complicated than they need to be.
They were written that way to make it easy to separate out problematic code
points, such as surrogates, in the future. However, we decided in 5.12 not to
go in that direction, and instead to accept such problematic code points in
general. I haven't heard any cause to regret that decision; if we ever want to
go back, the blame log will easily allow us to do so.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 36 |
1 files changed, 8 insertions, 28 deletions
@@ -495,6 +495,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define SHARP_S_SKIP 2 #ifndef EBCDIC +/* If you want to exclude surrogates, and beyond legal Unicode, see the blame + * log for earlier versions which gave details for these */ # define IS_UTF8_CHAR_1(p) \ ((p)[0] <= 0x7F) # define IS_UTF8_CHAR_2(p) \ @@ -505,18 +507,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \ (p)[2] >= 0x80 && (p)[2] <= 0xBF) # define IS_UTF8_CHAR_3b(p) \ - ((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF) -# define IS_UTF8_CHAR_3c(p) \ - ((p)[0] == 0xED && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF) - /* In IS_UTF8_CHAR_3c(p) one could use - * (p)[1] >= 0x80 && (p)[1] <= 0x9F - * if one wanted to exclude surrogates. */ -# define IS_UTF8_CHAR_3d(p) \ - ((p)[0] >= 0xEE && (p)[0] <= 0xEF && \ + ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ (p)[2] >= 0x80 && (p)[2] <= 0xBF) # define IS_UTF8_CHAR_4a(p) \ @@ -524,34 +515,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF. (p)[1] >= 0x90 && (p)[1] <= 0xBF && \ (p)[2] >= 0x80 && (p)[2] <= 0xBF && \ (p)[3] >= 0x80 && (p)[3] <= 0xBF) -# define IS_UTF8_CHAR_4b(p) \ - ((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF && \ - (p)[3] >= 0x80 && (p)[3] <= 0xBF) -/* In IS_UTF8_CHAR_4c(p) one could use - * (p)[0] == 0xF4 - * if one wanted to stop at the Unicode limit U+10FFFF. - * The 0xF7 allows us to go to 0x1fffff (0x200000 would +/* The 0xF7 allows us to go to 0x1fffff (0x200000 would * require five bytes). Not doing any further code points * since that is not needed (and that would not be strict * UTF-8, anyway). The "slow path" in Perl_is_utf8_char() * will take care of the "extended UTF-8". 
*/ -# define IS_UTF8_CHAR_4c(p) \ - ((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \ +# define IS_UTF8_CHAR_4b(p) \ + ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \ (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ (p)[2] >= 0x80 && (p)[2] <= 0xBF && \ (p)[3] >= 0x80 && (p)[3] <= 0xBF) # define IS_UTF8_CHAR_3(p) \ (IS_UTF8_CHAR_3a(p) || \ - IS_UTF8_CHAR_3b(p) || \ - IS_UTF8_CHAR_3c(p) || \ - IS_UTF8_CHAR_3d(p)) + IS_UTF8_CHAR_3b(p)) # define IS_UTF8_CHAR_4(p) \ (IS_UTF8_CHAR_4a(p) || \ - IS_UTF8_CHAR_4b(p) || \ - IS_UTF8_CHAR_4c(p)) + IS_UTF8_CHAR_4b(p)) /* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it * (1) allows UTF-8 encoded UTF-16 surrogates |