summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2017-07-12 20:26:18 -0600
committer: Karl Williamson <khw@cpan.org> 2017-07-12 21:14:26 -0600
commit: 5f995336c78d31708a69477c3351b87e285d64b8 (patch)
tree: 7dcaff92af97a8be50e8a640da77b2ec1f6aceda /utf8.c
parent: c285bbc4a6321e4e787d0fac9f34c354c7647256 (diff)
download: perl-5f995336c78d31708a69477c3351b87e285d64b8.tar.gz
utf8.c: Move and slightly change comment block
This is so there are fewer real differences shown in the next commit
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 33
1 files changed, 18 insertions, 15 deletions
diff --git a/utf8.c b/utf8.c
index b55c93115e..fea8fae8c9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -422,19 +422,6 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
* several places in this file, so is centralized here. It is based on the
* following table:
*
- * U+7FFFFFFF (2 ** 31 - 1)
- * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
- * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
- * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
- * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
- * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
- * U+80000000 (2 ** 31):
- * ASCII: \xFE\x82\x80\x80\x80\x80\x80
- * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
- * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
*/
#ifdef EBCDIC
@@ -472,8 +459,24 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
return FALSE;
}
- /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
- * \x41 and \x42. */
+ /* U+7FFFFFFF (2 ** 31 - 1)
+ * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
+ * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
+ * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
+ * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
+ * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
+ * U+80000000 (2 ** 31):
+ * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+ */
+ /* (Note that in UTF-EBCDIC, the two lowest possible continuation bytes
+ * are \x41 and \x42.) If we have enough bytes available to determine
+ * the answer, or the bytes we do have differ from the UTF-8 prefix of
+ * the highest 30-bit code point, we can compare them to get a
+ * definitive answer */
+
return cBOOL(memGT(s + 1, prefix, cmp_len));
#endif