use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
use charnames qw(:loose);

# Generates unicode_constants.h.  Every non-blank line of the __DATA__
# section at the end of this file becomes one C "#define"; a blank line is
# copied to the output as a blank line.
#
# Each data line has up to three fields separated by blanks:
#
#   1. A hexadecimal code point, or a Unicode character name in which
#      underscores stand for blanks.  Anything containing a non-hex-digit
#      character is treated as a name.
#   2. An optional flag selecting what the macro expands to:
#        string  (the default) the UTF-8 bytes, as a C string constant;
#                macro suffix "_UTF8"
#        tail    all UTF-8 bytes but the first, as a C string constant;
#                macro suffix "_UTF8_TAIL"
#        first   just the first UTF-8 byte, as a numeric constant;
#                macro suffix "_UTF8_FIRST_BYTE"
#        native  the code point converted to the platform's native character
#                set, as a numeric constant; macro suffix "_NATIVE".  Only
#                code points up to 0xFF are accepted.
#   3. An optional macro base name, used only when the code point has no
#      official name.  Because of the line syntax, a flag must be present
#      for this field to be recognized.

my $out_fh = open_new('unicode_constants.h', '>',
                      {style => '*', by => $0, from => "Unicode data"});

# NOTE(review): in the copy of this script under review, the text between
# "print $out_fh <" and ") {" had been lost.  The include guard below is the
# minimum implied by the "#endif /* H_UNICODE_CONSTANTS */" trailer printed
# at the end of this script.  TODO: restore any further header commentary
# from version control.
print $out_fh <<'END';

#ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS   1

END

while (<DATA>) {

    # A blank input line is passed through as a blank output line.
    if ($_ !~ /\S/) {
        print $out_fh "\n";
        next;
    }

    chomp;
    unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
                       (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                       (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
                   /x)
    {
        die "Unexpected syntax at line $.: $_\n";
    }

    my $name_or_cp   = $1;
    my $flag         = $2;
    my $desired_name = $3;

    my $name;
    my $cp;     # Always kept as a string of hex digits from here on

    if ($name_or_cp =~ /[^[:xdigit:]]/) {

        # Anything that isn't a hex value must be a name.
        $name = $name_or_cp;
        my $ord = charnames::vianame($name =~ s/_/ /gr);

        # It is the lookup result, not the input token, that tells us
        # whether the name is known.
        die "Unknown name '$name' at line $.: $_\n" unless defined $ord;

        # vianame() yields a number, whereas the code below expects hex
        # digits (it calls hex() on $cp and prints it as "U+$cp").
        $cp = sprintf "%04X", $ord;
    }
    else {
        $cp = $name_or_cp;

        # viacode requires a leading zero to be sure that the argument is
        # taken as hex.
        $name = charnames::viacode("0$cp") // "";

        # With neither an official name nor a fallback name on the line
        # there is nothing to call the macro.
        die "Unknown code point '$cp' at line $.: $_\n"
            unless length($name) || length($desired_name // "");
    }

    $name = $desired_name if $name eq "";
    $name =~ s/ /_/g;   # The macro name can have no blanks in it

    # The UTF-8 encoding of the code point, written as \xHH escapes.
    my $str = join "", map { sprintf "\\x%02X", $_ }
                       unpack("U0C*", pack("U", hex $cp));

    my $suffix = '_UTF8';
    if (! defined $flag || $flag eq 'string') {
        $str = "\"$str\"";      # Will be a string constant
    }
    elsif ($flag eq 'tail') {
        $str =~ s/\\x..//;      # Remove the first byte
        $suffix .= '_TAIL';
        $str = "\"$str\"";      # Will be a string constant
    }
    elsif ($flag eq 'first') {
        $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
        $suffix .= '_FIRST_BYTE';
        $str = "0x$str";        # Is a numeric constant
    }
    elsif ($flag eq 'native') {
        die "Are you sure you want to run this on an above-Latin1 code point?"
            if hex $cp > 0xff;
        $suffix = '_NATIVE';

        # Emit the converted (native) value, not the Unicode input.  The
        # field width follows the input token so that the output is
        # unchanged on platforms where native and Unicode code points
        # coincide.
        $str = sprintf "0x%0*X", length($cp),
                                 utf8::unicode_to_native(hex $cp);
                                # Is a numeric constant
    }
    else {
        die "Unknown flag at line $.: $_\n";
    }

    print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
}

print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";

read_only_bottom_close_and_rename($out_fh);

__DATA__
0300 string
0301 string
0308 string
03B9 first
03B9 tail
03C5 first
03C5 tail
2010 string
D800 first FIRST_SURROGATE
007F native
00DF native
00E5 native
00C5 native
00FF native
00B5 native
0085 native