summary | refs | log | tree | commit | diff
path: root/regen
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2016-08-27 21:17:49 -0600
committer: Karl Williamson <khw@cpan.org> 2016-08-31 21:12:54 -0600
commit: 69bc4c1f86bca21cf0baeb2b4812ea97d3bf438e (patch)
tree: fe64fbb8692b47e4504b131383a5a49fabc73edc /regen
parent: 8d19ebbca9eecf219cc453cffe88722722860dd9 (diff)
download: perl-69bc4c1f86bca21cf0baeb2b4812ea97d3bf438e.tar.gz
Add C macros for UTF-8 for BOM and REPLACEMENT CHARACTER
This makes it easy for module authors to write XS code that can use these characters, and be automatically portable to EBCDIC systems.
Diffstat (limited to 'regen')
-rw-r--r-- regen/unicode_constants.pl 31
1 file changed, 31 insertions, 0 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index baf25f1258..acdbaa3d4e 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -28,6 +28,30 @@ print $out_fh <<END;
* "_TAIL" if instead it represents all but the first byte. This, and
* with no additional suffix are both string constants */
+/*
+=head1 Unicode Support
+
+=for apidoc AmU|placeholder|BOM_UTF8
+
+This is a macro that evaluates to a string constant of the UTF-8 bytes that
+define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
+is compiled on. This allows code to use a mnemonic for this character that
+works on both ASCII and EBCDIC platforms.
+S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
+bytes.
+
+=for apidoc AmU|placeholder|REPLACEMENT_CHARACTER_UTF8
+
+This is a macro that evaluates to a string constant of the UTF-8 bytes that
+define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
+is compiled on. This allows code to use a mnemonic for this character that
+works on both ASCII and EBCDIC platforms.
+S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
+bytes.
+
+=cut
+*/
+
END
my $version = Unicode::UCD::UnicodeVersion();
@@ -180,6 +204,9 @@ read_only_bottom_close_and_rename($out_fh);
# DATA FORMAT
#
+# Note that any apidoc comments you want in the file need to be added to one
+# of the prints above
+#
# A blank line is output as-is.
# Comments (lines whose first non-blank is a '#') are converted to C-style,
# though empty comments are converted to blank lines. Otherwise, each line
@@ -228,6 +255,10 @@ U+2010 string
BOM first
BOM tail
+BOM string
+
+U+FFFD string
+
NBSP native
NBSP string