summary | refs | log | tree | commit | diff
path: root/regen
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-09-02 09:44:22 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-09-13 21:14:00 -0600
commit: 525b6419a6e9037dd46cd37b578c11266e7cd2b2 (patch)
tree: a7cec91887ff1b88591f146dd70289e2176a2562 /regen
parent: 5f1720e99d3e7fd0da4056940fd040fe824fd2ca (diff)
download: perl-525b6419a6e9037dd46cd37b578c11266e7cd2b2.tar.gz
regen/utf8_strings.pl: Add ability to get native charset
This adds a new capability to this program: to input a Unicode code point and create a macro that expands to the platform's native value for it. This will allow removal of a bunch of EBCDIC dependencies in the core.
Diffstat (limited to 'regen')
-rw-r--r-- regen/utf8_strings.pl 33
1 files changed, 25 insertions, 8 deletions
diff --git a/regen/utf8_strings.pl b/regen/utf8_strings.pl
index b868e594a3..311afacd62 100644
--- a/regen/utf8_strings.pl
+++ b/regen/utf8_strings.pl
@@ -14,15 +14,16 @@ print $out_fh <<END;
#define H_UTF8_STRINGS 1
/* This file contains #defines for various Unicode code points. The values
- * for the macros are all or portions of the UTF-8 encoding for the code
- * point. Note that the names all have the suffix "_UTF8".
+ * the macros expand to are the native Unicode code point, or all or portions
+ * of the UTF-8 encoding for the code point. In the former case, the macro
+ * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
*
- * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
- * the first byte of the UTF-8 representation; the value will be a numeric
- * constant.
- *
- * The suffix "_TAIL" is appened if instead it represents all but the first
- * byte. This, and with no suffix are both string constants */
+ * The macros that have the suffix "_UTF8" may have further suffixes, as
+ * follows:
+ * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
+ * representation; the value will be a numeric constant.
+ * "_TAIL" if instead it represents all but the first byte. This, and
+ * with no additional suffix are both string constants */
END
@@ -40,6 +41,8 @@ END
# described in the comments above that are placed in the file.
# first indicates that the output is to be of the FIRST_BYTE form.
# tail indicates that the output is of the _TAIL form.
+# native indicates that the output is the code point, converted to the
+# platform's native character set if applicable
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
@@ -97,6 +100,12 @@ while ( <DATA> ) {
$suffix .= '_FIRST_BYTE';
$str = "0x$str"; # Is a numeric constant
}
+ elsif ($flag eq 'native') {
+ die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff;
+ $suffix = '_NATIVE';
+ # Emit the converted (platform-native) value, not the input code point; previously the conversion result was overwritten by "0x$cp"
+ $str = sprintf "0x%02X", utf8::unicode_to_native(hex $cp); # Is a numeric constant
+ }
else {
die "Unknown flag at line $.: $_\n";
}
@@ -122,3 +131,11 @@ __DATA__
1160
11A8
2010 string
+
+007F native
+00DF native
+00E5 native
+00C5 native
+00FF native
+00B5 native
+0085 native