use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
use charnames qw(:loose);

# Generates unicode_constants.h: a C header of #defines giving, for each code
# point listed after __DATA__, either its UTF-8 byte sequence (or a portion of
# it) or its native code point value.
#
# Each non-blank data line has the form
#
#     NAME_OR_CODE_POINT [FLAG]
#
# where the first token is either a hexadecimal code point or a character
# name (underscores may stand in for blanks), and FLAG is one of:
#
#     string  (default) whole UTF-8 sequence, as a C string constant;
#                       macro suffix _UTF8
#     tail              all but the first UTF-8 byte, as a C string constant;
#                       macro suffix _UTF8_TAIL
#     first             only the first UTF-8 byte, as a numeric constant;
#                       macro suffix _UTF8_FIRST_BYTE
#     native            the native code point, as a numeric constant;
#                       macro suffix _NATIVE (Latin1 range only)
#
# A line with no non-space characters is copied to the output as an empty line.

my $out_fh = open_new('unicode_constants.h', '>',
                      {style => '*', by => $0, from => "Unicode data"});

# NOTE(review): the original preamble text was lost from the copy of this
# script under review.  Only the include guard, which the closing '#endif'
# printed at the bottom requires, is emitted here; restore any explanatory C
# comment from the original if it is available.
print $out_fh <<"END";

#ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS   1

END

while (my $line = <DATA>) {

    # Blank lines are passed through to keep the output grouped.
    if ($line !~ /\S/) {
        print $out_fh "\n";
        next;
    }

    chomp $line;

    unless ($line =~ m/ ^ ( [^\ ]* )           # Name or code point token
                        (?: [\ ]+ ( .* ) )?    # optional flag
                      /x)
    {
        die "Unexpected syntax at line $.: $line\n";
    }

    my $name_or_cp = $1;
    my $flag       = $2;

    my $name;
    my $cp;     # Always the code point as a hex string, no '0x' or 'U+'

    if ($name_or_cp =~ /[^[:xdigit:]]/) {

        # Anything that isn't a hex value must be a name.
        $name = $name_or_cp;
        my $ord = charnames::vianame($name =~ s/_/ /gr);

        # It is the lookup result, not the name we were handed, that tells
        # whether the name is known.
        die "Unknown name '$name' at line $.: $line\n" unless defined $ord;

        # vianame() returns a numeric ordinal; everything below expects a
        # hex string (it calls hex() on it), so convert here.
        $cp = sprintf "%04X", $ord;
    }
    else {
        $cp = $name_or_cp;

        # viacode requires a leading zero to be sure that the argument is hex
        $name = charnames::viacode("0$cp");

        # Again, check the lookup result rather than the input.
        die "Unknown code point '$cp' at line $.: $line\n"
                                                        unless defined $name;
    }

    $name =~ s/ /_/g;   # The macro name can have no blanks in it

    # The UTF-8 encoding of the code point, as a sequence of \xHH escapes
    my $str = join "", map { sprintf "\\x%02X", $_ }
                       unpack("U0C*", pack("U", hex $cp));

    my $suffix = '_UTF8';
    if (! defined $flag || $flag eq 'string') {
        $str = "\"$str\"";      # Will be a string constant
    }
    elsif ($flag eq 'tail') {
        $str =~ s/\\x..//;      # Remove the first byte
        $suffix .= '_TAIL';
        $str = "\"$str\"";      # Will be a string constant
    }
    elsif ($flag eq 'first') {
        $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
        $suffix .= '_FIRST_BYTE';
        $str = "0x$str";        # Is a numeric constant
    }
    elsif ($flag eq 'native') {
        die "Are you sure you want to run this on an above-Latin1 code point?"
                                                        if hex $cp > 0xff;
        $suffix = '_NATIVE';

        # Emit the *native* value.  Previously the converted value was
        # immediately overwritten with the Unicode one, which is only right
        # on ASCII platforms.  The field width follows the input token so the
        # output is unchanged where native and Unicode values coincide.
        $str = sprintf "0x%0*X", length($cp),
                                 utf8::unicode_to_native(hex $cp);
    }
    else {
        die "Unknown flag at line $.: $line\n";
    }

    print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
}

print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";

read_only_bottom_close_and_rename($out_fh);

__DATA__
0300 string
0301 string
0308 string
03B9 first
03B9 tail
03C5 first
03C5 tail
1100
1160
11A8
2010 string
007F native
00DF native
00E5 native
00C5 native
00FF native
00B5 native
0085 native