summaryrefslogtreecommitdiff
path: root/regen/unicode_constants.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-03-09 13:13:02 -0700
committerKarl Williamson <khw@cpan.org>2022-03-19 23:17:51 -0600
commit210ad843ab44bc50a506b9ac9c6713b00b27ce44 (patch)
tree669dbcb8ca1dd8087d20b4a3df5a7fafb18bbf04 /regen/unicode_constants.pl
parent7e4a71c64c41ee74930b339d65a6d0ecdea70316 (diff)
downloadperl-210ad843ab44bc50a506b9ac9c6713b00b27ce44.tar.gz
unicode_constants.pl: Consider all \pP for delims
Previously, only the punctuation characters that Unicode had classed as being opening/closing were considered in looking for suitable paired delimiters. This commit looks at all punctuation characters. There are actually only 7 new pairs found. This gives us ꧁ ꧂ as string delimiters, if your font allows, which are Javanese and used to surround an honorific title, according to Wikipedia.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r--regen/unicode_constants.pl28
1 files changed, 27 insertions, 1 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index d8086b5fb6..205c651106 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -256,6 +256,7 @@ my %discards;
my $non_directional = 'No perceived horizontal direction';
my $not_considered_directional_because = "Not considered directional because";
my $unpaired = "Didn't find a mirror";
+my $illegal = "Mirror illegal";
my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror";
my $bidirectional = "Bidirectional";
@@ -263,7 +264,7 @@ my $bidirectional = "Bidirectional";
# opening/closing delimiters is quite conservative, consisting of those
# from the above property that other Unicode properties classify as
# opening/closing.
-foreach my $list (qw(PI PF PS PE Symbol)) {
+foreach my $list (qw(Punctuation Symbol)) {
my @invlist = prop_invlist($list);
die "Empty list $list" unless @invlist;
@@ -424,6 +425,31 @@ foreach my $list (qw(PI PF PS PE Symbol)) {
next;
}
+ # There are a few characters like REVERSED SEMICOLON that are mirrors,
+ # but have always commonly been used unmirrored. There is also the
+ # PILCROW SIGN and its mirror which might be considered to be
+ # legitimate mirrors, but maybe not. Additionally the current
+ # algorithm for finding the mirror depends on each member of a pair
+ # being represented by the same number of bytes as its mate. By
+ # skipping these, we solve both problems.
+ if ($code_point < 256 != $mirror_code_point < 256) {
+ $discards{$code_point} = { reason => $illegal,
+ mirror => $mirror_code_point
+ };
+ next;
+ }
+
+ # And '/' and '\' are mirrors that we don't accept
+ if ( $name =~ /SOLIDUS/
+ && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r
+ eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r)
+ {
+ $discards{$code_point} = { reason => $illegal,
+ mirror => $mirror_code_point
+ };
+ next;
+ }
+
$paireds{$code_point} = $mirror_code_point;
$inverted_paireds{$mirror_code_point} = $code_point;