summaryrefslogtreecommitdiff
path: root/utfebcdic.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2019-10-02 22:04:12 -0600
committer Karl Williamson <khw@cpan.org> 2019-11-11 20:55:39 -0700
commit 82257e20fc6def7fe08a0c6dc5524db5b5b63011 (patch)
tree 9b13e89c1ad0ce0423e52ad332b1aff6dedddd12 /utfebcdic.h
parent e56dfd967ce460481a9922d14e931b438548093d (diff)
download perl-82257e20fc6def7fe08a0c6dc5524db5b5b63011.tar.gz
utfebcdic.h: Add comments
Diffstat (limited to 'utfebcdic.h')
-rw-r--r-- utfebcdic.h 17
1 files changed, 15 insertions, 2 deletions
diff --git a/utfebcdic.h b/utfebcdic.h
index 085013ab19..97b8f7001a 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -202,11 +202,24 @@ possible to UTF-8-encode a single code point in different ways, but that is
explicitly forbidden, and the shortest possible encoding should always be used
(and that is what Perl does). */
-/* Comments as to the meaning of each are given at their corresponding utf8.h
- * definitions. */
+/* It turns out that just this one number is sufficient to derive all the basic
+ * macros for UTF-8 and UTF-EBCDIC. Everything follows from the fact that
+ * there are 6 bits of real information in a UTF-8 continuation byte vs. 5 bits
+ * in a UTF-EBCDIC one. */
#define UTF_ACCUMULATION_SHIFT 5 /* info bits per continuation byte */
+/* Also needed is how perl handles a start byte of 8 one bits. The decision
+ * was made to just append the minimal number of bytes after that so that code
+ * points up to 64 bits wide could be represented. In UTF-8, that was an extra
+ * 5 bytes, and in UTF-EBCDIC it's 6. The result is in UTF8_MAXBYTES defined
+ * above. This implementation has the advantage that you have everything you
+ * need in the first byte. Other ways of extending UTF-8 have been devised,
+ * some to arbitrarily high code points. But they require looking at the next
+ * byte(s) when the first one is 8 one bits. */
+
+/* These others are for efficiency or for other decisions we've made */
+
/* Tests 'c' with _generic_isCC() against the character class
 * _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE.  NOTE(review): going by the
 * class name, this is presumably true only when 'c' is the start byte of a
 * sequence encoding a surrogate or some higher code point; confirm against
 * the definition of that class before relying on it. */
#define isUTF8_POSSIBLY_PROBLEMATIC(c) \
_generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)