summaryrefslogtreecommitdiff
path: root/regen/unicode_constants.pl
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2022-03-09 07:25:46 -0700
committer Karl Williamson <khw@cpan.org> 2022-03-19 23:17:51 -0600
commit 835f2666d2ae366f7af912303f061f066b8376c4 (patch)
tree 900bc30040ac282972c9d24f9562550fa5802a06 /regen/unicode_constants.pl
parent 9c9853e81d56a8abb664dd3e6332722675cc9a7c (diff)
download perl-835f2666d2ae366f7af912303f061f066b8376c4.tar.gz
Allow reversal of some paired delimiters; deprecations
Unicode says certain opening punctuation characters may be used as closing ones in some languages, with their mirror instead being the opening one. This commit allows either member of each such set to be the opening one. It also deprecates using any of the new mirrored delimiters as an unmirrored delimiter outside the feature, and using the normal closing delimiter as an unpaired opening one while in the feature. This gives us the freedom to make some or all of the new paired delimiters reversible.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r--regen/unicode_constants.pl13
1 files changed, 12 insertions, 1 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index 5578766554..1bc654e740 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -210,7 +210,7 @@ for (my $i = 0; $i < $bmg_invlist->@*; $i++) {
# Bidi_Paired_Bracket_Type=Open and General_Category=Open_Punctuation are
# definitely in the list. It is language-dependent whether members of
# General_Category=Initial_Punctuation are considered opening or closing;
- # we take what Unicode considers the more likely scenario.
+ # we allow either member of such a pair to be the opening one.
if (chr($code_point) =~ /(?[ \p{BPT=Open}
| \p{Gc=Open_Punctuation}
| \p{Gc=Initial_Punctuation}
@@ -218,6 +218,10 @@ for (my $i = 0; $i < $bmg_invlist->@*; $i++) {
{
$paireds{$code_point} = $mirror_code_point;
}
+
+ if (chr($code_point) =~ /\p{Gc=Initial_Punctuation}/) {
+ $paireds{$mirror_code_point} = $code_point;
+ }
}
# There are several hundred other characters that clearly should be
@@ -356,6 +360,13 @@ foreach my $charset (get_supported_code_pages()) {
$deprecated_if_not_mirrored .= $utf8_from_backslashed;
$non_utf8_deprecated_if_not_mirrored .=
$non_utf8_from_backslashed if $from < 256;
+
+ # We deprecate using any of these strongly directional characters
+ # at either end of the string, in part so we could allow them to
+ # be reversed.
+ $deprecated_if_not_mirrored .= $utf8_to_backslashed
+ if index ($deprecated_if_not_mirrored,
+ $utf8_to_backslashed) < 0;
}
# The implementing code in toke.c assumes that the byte length of each