unicode_constants.pl: Consider all \pP for delims

Previously, only the punctuation characters that Unicode had classed as being opening/closing were considered in looking for suitable paired delimiters. This commit looks at all punctuation characters. There are actually only 7 new pairs found. This gives us ꧁ ꧂ as string delimiters (if your font can display them); these are Javanese characters used to surround an honorific title, according to Wikipedia.
author: Karl Williamson <khw@cpan.org> 2022-03-09 13:13:02 -0700
committer: Karl Williamson <khw@cpan.org> 2022-03-19 23:17:51 -0600
commit: 210ad843ab44bc50a506b9ac9c6713b00b27ce44 (patch)
tree: 669dbcb8ca1dd8087d20b4a3df5a7fafb18bbf04 /regen/unicode_constants.pl
parent: 7e4a71c64c41ee74930b339d65a6d0ecdea70316 (diff)
download: perl-210ad843ab44bc50a506b9ac9c6713b00b27ce44.tar.gz
1 files changed, 27 insertions, 1 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index d8086b5fb6..205c651106 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -256,6 +256,7 @@ my %discards;
 my $non_directional = 'No perceived horizontal direction';
 my $not_considered_directional_because = "Not considered directional because";
 my $unpaired = "Didn't find a mirror";
+my $illegal = "Mirror illegal";
 my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror";
 my $bidirectional = "Bidirectional";
 
@@ -263,7 +264,7 @@ my $bidirectional = "Bidirectional";
 # opening/closing delimiters is quite conservative, consisting of those
 # from the above property that other Unicode properties classify as
 # opening/closing.
-foreach my $list (qw(PI PF PS PE Symbol)) {
+foreach my $list (qw(Punctuation Symbol)) {
     my @invlist = prop_invlist($list);
     die "Empty list $list" unless @invlist;
 
@@ -424,6 +425,31 @@ foreach my $list (qw(PI PF PS PE Symbol)) {
             next;
         }
 
+        # There are a few characters like REVERSED SEMICOLON that are mirrors,
+        # but have always commonly been used unmirrored.  There is also the
+        # PILCROW SIGN and its mirror which might be considered to be
+        # legitimate mirrors, but maybe not.  Additionally the current
+        # algorithm for finding the mirror depends on each member of a pair
+        # being represented by the same number of bytes as its mate.  By
+        # skipping these, we solve both problems.
+        if ($code_point < 256 != $mirror_code_point < 256) {
+            $discards{$code_point} = { reason => $illegal,
+                                        mirror => $mirror_code_point
+                                     };
+            next;
+        }
+
+        # And '/' and '\' are mirrors that we don't accept
+        if (   $name =~ /SOLIDUS/
+            &&    $name   =~ s/REVERSE SOLIDUS/SOLIDUS/r
+               eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r)
+        {
+            $discards{$code_point} = { reason => $illegal,
+                                        mirror => $mirror_code_point
+                                     };
+            next;
+        }
+
             $paireds{$code_point} = $mirror_code_point;
             $inverted_paireds{$mirror_code_point} = $code_point;
author	Karl Williamson <khw@cpan.org>	2022-03-09 13:13:02 -0700
committer	Karl Williamson <khw@cpan.org>	2022-03-19 23:17:51 -0600
commit	210ad843ab44bc50a506b9ac9c6713b00b27ce44 (patch)
tree	669dbcb8ca1dd8087d20b4a3df5a7fafb18bbf04 /regen/unicode_constants.pl
parent	7e4a71c64c41ee74930b339d65a6d0ecdea70316 (diff)
download	perl-210ad843ab44bc50a506b9ac9c6713b00b27ce44.tar.gz