diff options
author | Karl Williamson <khw@cpan.org> | 2022-02-13 21:08:22 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-03-19 23:17:51 -0600 |
commit | c7b32e72c6b0a4931897121ac865a0fbc7445f17 (patch) | |
tree | b799bc89198aef4a0c40ede7280e7d367c25b095 /regen/unicode_constants.pl | |
parent | 63cd44e4d01aafda8bc32c13f34dbab0035ac382 (diff) | |
download | perl-c7b32e72c6b0a4931897121ac865a0fbc7445f17.tar.gz |
unicode_constants.pl: Generate paired string delimiters
This commit causes several C strings to be generated containing bytes
that match paired string delimiters beyond the four that have
traditionally been used in Perl. This will allow a future commit to
accept more matching delimiters around strings than those four.
The code explains how the added delimiters are chosen.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r-- | regen/unicode_constants.pl | 117 |
1 files changed, 113 insertions, 4 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 67bc75752d..ed7fa0e60e 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -1,9 +1,10 @@ use v5.16.0; use strict; use warnings; +no warnings 'experimental::regex_sets'; require './regen/regen_lib.pl'; require './regen/charset_translations.pl'; -use Unicode::UCD; +use Unicode::UCD qw(prop_invlist prop_invmap); use charnames qw(:loose); my $out_fh = open_new('unicode_constants.h', '>', @@ -85,7 +86,49 @@ print $out_fh <<END; END -# The data are at __DATA__ in this file. +# Gather the characters in Unicode that have left/right symmetry suitable for +# paired string delimiters +my %paireds = ( ord '<' => ord '>' ); # We don't normally use math ones, but + # this is traditionally included + +# This property is the universe of all characters in Unicode which +# are of some import to the Bidirectional Algorithm, and for which there is +# another Unicode character that is a mirror of it. +my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) = + prop_invmap("Bidi_Mirroring_Glyph"); + +# The current list of characters that Perl considers to be paired +# opening/closing delimiters is quite conservative, consisting of those +# from the above property that other Unicode properties classify as +# opening/closing. + +# Find the ones in the bmg list that Unicode thinks are opening ones. +for (my $i = 0; $i < $bmg_invlist->@*; $i++) { + my $mirror_code_point = $bmg_invmap->[$i]; + next if $mirror_code_point eq $bmg_default; # Doesn't map to a character. + + my $code_point = $bmg_invlist->[$i]; + + # Bidi_Paired_Bracket_Type=Open and General_Category=Open_Punctuation are + # definitely in the list. It is language-dependent whether members of + # General_Category=Initial_Punctuation are considered opening or closing; + # we take what Unicode considers the more likely scenario. + if (chr($code_point) =~ /(?[ \p{BPT=Open} + | \p{Gc=Open_Punctuation} + | \p{Gc=Initial_Punctuation} + ])/) + { + $paireds{$code_point} = $mirror_code_point; + } +} + +# There are several hundred characters other characters that clearly should be +# mirrors of each other, like LEFTWARDS ARROW and RIGHTWARDS ARROW. Unicode +# did not bother to classify them as mirrors mostly because they aren't of +# import in the Bidirectional Algorithm. Most of them are symbols. These +# are not considered opening/closing by Perl for now. + +# The rest of the data are at __DATA__ in this file. my @data = <DATA>; @@ -178,6 +221,74 @@ foreach my $charset (get_supported_code_pages()) { printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; } + # Now output the strings of opening/closing delimiters. The Unicode + # values were earlier entered into %paireds + my $utf8_opening = ""; + my $utf8_closing = ""; + my $non_utf8_opening = ""; + my $non_utf8_closing = ""; + my $deprecated_if_not_mirrored = ""; + my $non_utf8_deprecated_if_not_mirrored = ""; + + for my $from (sort { $a <=> $b } keys %paireds) { + my $to = $paireds{$from}; + my $utf8_from_backslashed = backslash_x_form($from, $charset); + my $utf8_to_backslashed = backslash_x_form($to, $charset); + my $non_utf8_from_backslashed; + my $non_utf8_to_backslashed; + + $utf8_opening .= $utf8_from_backslashed; + $utf8_closing .= $utf8_to_backslashed; + + if ($from < 256) { + $non_utf8_from_backslashed = + backslash_x_form($from, $charset, 'not_utf8'); + $non_utf8_to_backslashed = + backslash_x_form($to, $charset, 'not_utf8'); + + $non_utf8_opening .= $non_utf8_from_backslashed; + $non_utf8_closing .= $non_utf8_to_backslashed; + } + + # Only the ASCII range paired delimiters have traditionally been + # accepted. Until the feature is considered standard, the non-ASCII + # opening ones must be deprecated when the feature isn't in effect, so + # as to warn about behavior that is planned to change. + if ($from > 127) { + $deprecated_if_not_mirrored .= $utf8_from_backslashed; + $non_utf8_deprecated_if_not_mirrored .= + $non_utf8_from_backslashed if $from < 256; + } + + # The implementing code in toke.c assumes that the byte length of each + # opening delimiter is the same as its mirrored closing one. This + # makes sure of that by checking upon each iteration of the loop. + if (length $utf8_opening != length $utf8_closing) { + die "Byte length of representation of '" + . charnames::viacode($from) + . " differs from its mapping '" + . charnames::viacode($to) + . "'"; + } + } + + print $out_fh <<~"EOT"; + + # ifdef PERL_IN_TOKE_C + /* Paired characters for quote-like operators, in UTF-8 */ + # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening" + # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing" + + /* And not in UTF-8 */ + # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening" + # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing" + + /* And what's deprecated */ + # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored" + # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored" + # endif + EOT + my $max_PRINT_A = 0; for my $i (0x20 .. 0x7E) { $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; @@ -194,8 +305,6 @@ EOT } -use Unicode::UCD 'prop_invlist'; - my $count = 0; my @other_invlist = prop_invlist("Other"); for (my $i = 0; $i < @other_invlist; $i += 2) { |