1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
use charnames qw(:loose);
my $out_fh = open_new('unicode_constants.h', '>',
{style => '*', by => $0,
from => "Unicode data"});
print $out_fh <<END;
#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS 1
/* This file contains #defines for various Unicode code points. The values
* the macros expand to are the native Unicode code point, or all or portions
* of the UTF-8 encoding for the code point. In the former case, the macro
* name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
*
* The macros that have the suffix "_UTF8" may have further suffixes, as
* follows:
* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
* representation; the value will be a numeric constant.
* "_TAIL" if instead it represents all but the first byte. This, and
* with no additional suffix are both string constants */
END
# The data are at the end of this file. A blank line is output as-is.
# Otherwise, each line represents one #define, and begins with either a
# Unicode character name with the blanks in it squeezed out or replaced by
# underscores; or it may be a hexadecimal Unicode code point. In the latter
# case, the name will be looked-up to use as the name of the macro. In either
# case, the macro name will have suffixes as listed above, and all blanks will
# be replaced by underscores.
#
# Each line may optionally have one of the following flags on it, separated by
# white space from the initial token.
# string indicates that the output is to be of the string form
# described in the comments above that are placed in the file.
# first indicates that the output is to be of the FIRST_BYTE form.
# tail indicates that the output is of the _TAIL form.
# native indicates that the output is the code point, converted to the
# platform's native character set if applicable
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
# having to figure things out.
while ( <DATA> ) {
if ($_ !~ /\S/) {
print $out_fh "\n";
next;
}
chomp;
unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
(?: [\ ]+ ( .* ) )? # optional flag
/x)
{
die "Unexpected syntax at line $.: $_\n";
}
my $name_or_cp = $1;
my $flag = $2;
my $name;
my $cp;
if ($name_or_cp =~ /[^[:xdigit:]]/) {
# Anything that isn't a hex value must be a name.
$name = $name_or_cp;
$cp = charnames::vianame($name =~ s/_/ /gr);
die "Unknown name '$name' at line $.: $_\n" unless defined $name;
}
else {
$cp = $name_or_cp;
$name = charnames::viacode("0$cp"); # viacode requires a leading zero
# to be sure that the argument is hex
die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
}
$name =~ s/ /_/g; # The macro name can have no blanks in it
my $str = join "", map { sprintf "\\x%02X", $_ }
unpack("U0C*", pack("U", hex $cp));
my $suffix = '_UTF8';
if (! defined $flag || $flag eq 'string') {
$str = "\"$str\""; # Will be a string constant
} elsif ($flag eq 'tail') {
$str =~ s/\\x..//; # Remove the first byte
$suffix .= '_TAIL';
$str = "\"$str\""; # Will be a string constant
}
elsif ($flag eq 'first') {
$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
$suffix .= '_FIRST_BYTE';
$str = "0x$str"; # Is a numeric constant
}
elsif ($flag eq 'native') {
die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff;
$suffix = '_NATIVE';
$str = utf8::unicode_to_native(hex $cp);
$str = "0x$cp"; # Is a numeric constant
}
else {
die "Unknown flag at line $.: $_\n";
}
print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
}
print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
read_only_bottom_close_and_rename($out_fh);
__DATA__
0300 string
0301 string
0308 string
03B9 first
03B9 tail
03C5 first
03C5 tail
1100
1160
11A8
2010 string
007F native
00DF native
00E5 native
00C5 native
00FF native
00B5 native
0085 native
|