summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-04-28 18:30:27 -0600
committerKarl Williamson <public@khwilliamson.com>2012-05-22 08:24:18 -0600
commita4f7a67c079118a2f900d5f95d3cada67e3475ea (patch)
treea957c6c0d3a8dc3becde39014f267dc58a5a7ad7 /utf8.h
parentb2635aa880f19db51ef9eeb9d4d2a0eeeca8228b (diff)
downloadperl-a4f7a67c079118a2f900d5f95d3cada67e3475ea.tar.gz
utf8.h: Simplify expressions
These expressions, while valid, are more complicated than necessary. They were written that way to make it easy to separate out problematic code points, such as surrogates, in the future. However, we decided in 5.12 not to go in that direction, and instead to accept such problematic code points in general. I haven't heard any cause to regret that decision; if we ever want to go back, the blame log will easily allow us to do so.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h36
1 files changed, 8 insertions, 28 deletions
diff --git a/utf8.h b/utf8.h
index 8d9eb333ba..4d80d73b97 100644
--- a/utf8.h
+++ b/utf8.h
@@ -495,6 +495,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define SHARP_S_SKIP 2
#ifndef EBCDIC
+/* If you want to exclude surrogates and code points beyond the legal Unicode
+ * range, see the blame log for earlier versions, which show how to do so */
# define IS_UTF8_CHAR_1(p) \
((p)[0] <= 0x7F)
# define IS_UTF8_CHAR_2(p) \
@@ -505,18 +507,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
(p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_3b(p) \
- ((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_3c(p) \
- ((p)[0] == 0xED && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
- /* In IS_UTF8_CHAR_3c(p) one could use
- * (p)[1] >= 0x80 && (p)[1] <= 0x9F
- * if one wanted to exclude surrogates. */
-# define IS_UTF8_CHAR_3d(p) \
- ((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
+ ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_4a(p) \
@@ -524,34 +515,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
(p)[1] >= 0x90 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
-# define IS_UTF8_CHAR_4b(p) \
- ((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
- (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* In IS_UTF8_CHAR_4c(p) one could use
- * (p)[0] == 0xF4
- * if one wanted to stop at the Unicode limit U+10FFFF.
- * The 0xF7 allows us to go to 0x1fffff (0x200000 would
+/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
* require five bytes). Not doing any further code points
* since that is not needed (and that would not be strict
* UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
* will take care of the "extended UTF-8". */
-# define IS_UTF8_CHAR_4c(p) \
- ((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
+# define IS_UTF8_CHAR_4b(p) \
+ ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
# define IS_UTF8_CHAR_3(p) \
(IS_UTF8_CHAR_3a(p) || \
- IS_UTF8_CHAR_3b(p) || \
- IS_UTF8_CHAR_3c(p) || \
- IS_UTF8_CHAR_3d(p))
+ IS_UTF8_CHAR_3b(p))
# define IS_UTF8_CHAR_4(p) \
(IS_UTF8_CHAR_4a(p) || \
- IS_UTF8_CHAR_4b(p) || \
- IS_UTF8_CHAR_4c(p))
+ IS_UTF8_CHAR_4b(p))
/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
* (1) allows UTF-8 encoded UTF-16 surrogates