summary refs log tree commit diff
path: root/unicode_constants.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2016-08-27 21:17:49 -0600
committer Karl Williamson <khw@cpan.org> 2016-08-31 21:12:54 -0600
commit 69bc4c1f86bca21cf0baeb2b4812ea97d3bf438e (patch)
tree fe64fbb8692b47e4504b131383a5a49fabc73edc /unicode_constants.h
parent 8d19ebbca9eecf219cc453cffe88722722860dd9 (diff)
download perl-69bc4c1f86bca21cf0baeb2b4812ea97d3bf438e.tar.gz
Add C macros for UTF-8 for BOM and REPLACEMENT CHARACTER
This makes it easy for module authors to write XS code that can use these characters, and be automatically portable to EBCDIC systems.
Diffstat (limited to 'unicode_constants.h')
-rw-r--r-- unicode_constants.h 36
1 files changed, 36 insertions, 0 deletions
diff --git a/unicode_constants.h b/unicode_constants.h
index 02d38113ae..1c0e62e356 100644
--- a/unicode_constants.h
+++ b/unicode_constants.h
@@ -21,6 +21,30 @@
* "_TAIL" if instead it represents all but the first byte. This, and
* with no additional suffix are both string constants */
+/*
+=head1 Unicode Support
+
+=for apidoc AmU|placeholder|BOM_UTF8
+
+This is a macro that evaluates to a string constant of the UTF-8 bytes that
+define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
+is compiled on. This allows code to use a mnemonic for this character that
+works on both ASCII and EBCDIC platforms.
+S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
+bytes.
+
+=for apidoc AmU|placeholder|REPLACEMENT_CHARACTER_UTF8
+
+This is a macro that evaluates to a string constant of the UTF-8 bytes that
+define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
+is compiled on. This allows code to use a mnemonic for this character that
+works on both ASCII and EBCDIC platforms.
+S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
+bytes.
+
+=cut
+*/
+
#define UNICODE_MAJOR_VERSION 9
#define UNICODE_DOT_VERSION 0
#define UNICODE_DOT_DOT_VERSION 0
@@ -45,6 +69,10 @@
# define BOM_UTF8_FIRST_BYTE 0xEF /* U+FEFF */
# define BOM_UTF8_TAIL "\xBB\xBF" /* U+FEFF */
+# define BOM_UTF8 "\xEF\xBB\xBF" /* U+FEFF */
+
+# define REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" /* U+FFFD */
+
# define NBSP_NATIVE 0xA0 /* U+00A0 */
# define NBSP_UTF8 "\xC2\xA0" /* U+00A0 */
@@ -84,6 +112,10 @@
# define BOM_UTF8_FIRST_BYTE 0xDD /* U+FEFF */
# define BOM_UTF8_TAIL "\x73\x66\x73" /* U+FEFF */
+# define BOM_UTF8 "\xDD\x73\x66\x73" /* U+FEFF */
+
+# define REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" /* U+FFFD */
+
# define NBSP_NATIVE 0x41 /* U+00A0 */
# define NBSP_UTF8 "\x80\x41" /* U+00A0 */
@@ -123,6 +155,10 @@
# define BOM_UTF8_FIRST_BYTE 0xDD /* U+FEFF */
# define BOM_UTF8_TAIL "\x72\x65\x72" /* U+FEFF */
+# define BOM_UTF8 "\xDD\x72\x65\x72" /* U+FEFF */
+
+# define REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" /* U+FFFD */
+
# define NBSP_NATIVE 0x41 /* U+00A0 */
# define NBSP_UTF8 "\x78\x41" /* U+00A0 */