diff options
author | Karl Williamson <khw@cpan.org> | 2022-03-09 13:13:02 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-03-19 23:17:51 -0600 |
commit | 210ad843ab44bc50a506b9ac9c6713b00b27ce44 (patch) | |
tree | 669dbcb8ca1dd8087d20b4a3df5a7fafb18bbf04 /regen/unicode_constants.pl | |
parent | 7e4a71c64c41ee74930b339d65a6d0ecdea70316 (diff) | |
download | perl-210ad843ab44bc50a506b9ac9c6713b00b27ce44.tar.gz |
unicode_constants.pl: Consider all \pP for delims
Previously, only the punctuation characters that Unicode had classed as
being opening/closing were considered in looking for suitable paired
delimiters.
This commit looks at all punctuation characters. There are actually
only 7 new pairs found.
This gives us ꧁ ꧂ as string delimiters, if your font allows,
which are Javanese and used to surround an honorific title, according to
Wikipedia.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r-- | regen/unicode_constants.pl | 28 |
1 files changed, 27 insertions, 1 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index d8086b5fb6..205c651106 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -256,6 +256,7 @@ my %discards; my $non_directional = 'No perceived horizontal direction'; my $not_considered_directional_because = "Not considered directional because"; my $unpaired = "Didn't find a mirror"; +my $illegal = "Mirror illegal"; my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror"; my $bidirectional = "Bidirectional"; @@ -263,7 +264,7 @@ my $bidirectional = "Bidirectional"; # opening/closing delimiters is quite conservative, consisting of those # from the above property that other Unicode properties classify as # opening/closing. -foreach my $list (qw(PI PF PS PE Symbol)) { +foreach my $list (qw(Punctuation Symbol)) { my @invlist = prop_invlist($list); die "Empty list $list" unless @invlist; @@ -424,6 +425,31 @@ foreach my $list (qw(PI PF PS PE Symbol)) { next; } + # There are a few characters like REVERSED SEMICOLON that are mirrors, + # but have always commonly been used unmirrored. There is also the + # PILCROW SIGN and its mirror which might be considered to be + # legitimate mirrors, but maybe not. Additionally the current + # algorithm for finding the mirror depends on each member of a pair + # being respresented by the same number of bytes as its mate. By + # skipping these, we solve both problems + if ($code_point < 256 != $mirror_code_point < 256) { + $discards{$code_point} = { reason => $illegal, + mirror => $mirror_code_point + }; + next; + } + + # And '/' and '\' are mirrors that we don't accept + if ( $name =~ /SOLIDUS/ + && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r + eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r) + { + $discards{$code_point} = { reason => $illegal, + mirror => $mirror_code_point + }; + next; + } + $paireds{$code_point} = $mirror_code_point; $inverted_paireds{$mirror_code_point} = $code_point; |