diff options
author | Karl Williamson <khw@cpan.org> | 2022-03-10 09:40:10 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-03-19 23:17:51 -0600 |
commit | 7e4a71c64c41ee74930b339d65a6d0ecdea70316 (patch) | |
tree | c583224cf6eb12e51d577a0fa741e6e688dd6bd6 /regen/unicode_constants.pl | |
parent | 565fbe1bf153b300641c08e2ec3668869422cb4d (diff) | |
download | perl-7e4a71c64c41ee74930b339d65a6d0ecdea70316.tar.gz |
Add < > variants to paired delimiters
Perl considers '< >' to be delimiters for strings; this commit adds
most of the Unicode variants of these to also be string delimiters. The
ones that are combinations of both < and >, aren't included, as that
would be visually confusing.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r-- | regen/unicode_constants.pl | 79 |
1 files changed, 74 insertions, 5 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 4880e4c9d5..d8086b5fb6 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -86,11 +86,55 @@ sub backslash_x_form($$;$) { } } +my @bidi_strong_lefts = ( 'LESS-THAN', + ); +my @bidi_strong_rights = ( 'GREATER-THAN', + ); + +# Create an array of hashes for these, so as to translate between them, and +# avoid recompiling patterns in the loop. +my @bidi_strong_directionals; +for (my $i = 0; $i < @bidi_strong_lefts; $i++) { + push @bidi_strong_directionals, + { + LHS => $bidi_strong_lefts[$i], + RHS => $bidi_strong_rights[$i], + L_pattern => qr/\b$bidi_strong_lefts[$i]\b/, + R_pattern => qr/\b$bidi_strong_rights[$i]\b/, + }; +} + +my @other_directionals = + { + LHS => 'LEFT', + RHS => 'RIGHT', + L_pattern => + qr/ \b LEFT \b /nx, + R_pattern => + qr/ \b RIGHT \b /nx, + }; + my $reverse_re = qr/ \b REVERSE D? [- ] /x; -my %opposite_of = ( LEFT => 'RIGHT', RIGHT =>'LEFT' ); +# Create a mapping from each direction to its opposite one +my %opposite_of; +foreach my $directional (@bidi_strong_directionals, @other_directionals) { + $opposite_of{$directional->{LHS}} = $directional->{RHS}; + $opposite_of{$directional->{RHS}} = $directional->{LHS}; +} + +# Join the two types of each direction as alternatives +my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals, + @other_directionals; +my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals, + @other_directionals; +# And anything containing directionality will be either one of these two +my $directional_re = join "|", $L_re, $R_re; -my $directional_re = qr/\b(LEFT|RIGHT)\b/; # Make sure to capture $1 +# Now compile the strings that result from above +$L_re = qr/$L_re/; +$R_re = qr/$R_re/; +$directional_re = qr/($directional_re)/; # Make sure to capture $1 sub format_pairs_line($;$) { my ($from, $to) = @_; @@ -195,8 +239,7 @@ END # Gather the characters in Unicode that have left/right symmetry suitable for # paired string delimiters -my %paireds = ( ord '<' => ord '>' ); # We don't normally use math ones, but - # this is traditionally included +my %paireds; # So don't have to grep an array to determine if have already dealt with the # characters that are the keys @@ -214,15 +257,18 @@ my $non_directional = 'No perceived horizontal direction'; my $not_considered_directional_because = "Not considered directional because"; my $unpaired = "Didn't find a mirror"; my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror"; +my $bidirectional = "Bidirectional"; # The current list of characters that Perl considers to be paired # opening/closing delimiters is quite conservative, consisting of those # from the above property that other Unicode properties classify as # opening/closing. -foreach my $list (qw(PI PF PS PE)) { +foreach my $list (qw(PI PF PS PE Symbol)) { my @invlist = prop_invlist($list); die "Empty list $list" unless @invlist; + my $is_Symbol = $list eq 'Symbol'; + # Convert from an inversion list to an array containing everything that # matches. (This uses the recipe given in Unicode::UCD.) my @full_list; @@ -296,6 +342,28 @@ foreach my $list (qw(PI PF PS PE)) { next; } + # Unicode doesn't consider '< >' to be brackets, but Perl does. There are + # lots of variants of these in Unicode; easiest to accept all of + # them that aren't bidirectional (which would be visually + # confusing). + for (my $i = 0; $i < @bidi_strong_directionals; $i++) { + my $hash_ref = $bidi_strong_directionals[$i]; + + next if $name !~ $hash_ref->{L_pattern}; + + if ($name =~ $hash_ref->{R_pattern}) { + $discards{$code_point} = { reason => $bidirectional, + mirror => $mirror_code_point + }; + next CODE_POINT; + } + + $paireds{$code_point} = $mirror_code_point; + $inverted_paireds{$mirror_code_point} = $code_point; + next CODE_POINT; + } + + # Only do the above currently next; } else { # Here is not involved with the bidirectional algorithm. @@ -314,6 +382,7 @@ foreach my $list (qw(PI PF PS PE)) { # because they aren't of import in the Bidirectional Algorithm. Most # of them are symbols. These are not considered opening/closing by # Perl for now. + next if $is_Symbol; # Certain names are always treated as non directional. if ($name =~ m{ \b ( |