diff options
author | Karl Williamson <khw@cpan.org> | 2022-03-08 07:31:09 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-03-19 23:17:51 -0600 |
commit | dce1e563ba694cbba6b45aaa354e0e0bacb26f69 (patch) | |
tree | de3d3b327629d9d08dfde89e3222d0b5a9657843 /regen/unicode_constants.pl | |
parent | c7b32e72c6b0a4931897121ac865a0fbc7445f17 (diff) | |
download | perl-dce1e563ba694cbba6b45aaa354e0e0bacb26f69.tar.gz |
regen/unicode_constants.pl: List paired delimiters
This adds the capability to temporarily change a scalar to true to cause
this to print on stderr a list of the paired string delimiters, suitable
for pasting into a pod.
Diffstat (limited to 'regen/unicode_constants.pl')
-rw-r--r-- | regen/unicode_constants.pl | 101 |
1 files changed, 101 insertions, 0 deletions
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index ed7fa0e60e..5578766554 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -6,6 +6,12 @@ require './regen/regen_lib.pl'; require './regen/charset_translations.pl'; use Unicode::UCD qw(prop_invlist prop_invmap); use charnames qw(:loose); +binmode(STDERR, ":utf8"); + +# Set this to 1 temporarily to get on stderr the complete list of paired +# string delimiters this generates. This list is suitable for plugging into a +# pod. +my $output_lists = 0; my $out_fh = open_new('unicode_constants.h', '>', {style => '*', by => $0, @@ -75,6 +81,98 @@ sub backslash_x_form($$;$) { } } + +my %opposite_of = ( LEFT => 'RIGHT', RIGHT =>'LEFT' ); + +my $directional_re = qr/\b(LEFT|RIGHT)\b/; # Make sure to capture $1 + +sub format_pairs_line($$) { + my ($from, $to) = @_; + + # Format a line containing a character pair in preparation + # for output, suitable for pod. + + my $lhs_name = charnames::viacode($from); + my $lhs_hex = sprintf "%04X", $from; + my $rhs_name; + my $rhs_hex; + my $name = $lhs_name; + + my $hanging_indent = 26; + + if (defined $to) { + my $rhs_name = charnames::viacode($to); + $rhs_hex = sprintf "%04X", $to; + + # Most of the names differ only in LEFT vs RIGHT; some in + # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to + # understand if they are displayed combined. + if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) { + $name =~ s,$directional_re,$1/$opposite_of{$1},g; + } + else { # Otherwise, display them sequentially + $name .= ", " . $rhs_name; + } + } + + # Handle double-width characters, based on the East Asian Width property. + # Add an extra space to non-wide ones so things stay vertically aligned. + my $extra = 0; + my $output_line = " " # Indent in case output being used for verbatim + # pod + . chr $from; + if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) { + $extra++; # The length() will be shorter than the displayed + # width + } + else { + $output_line .= " "; + } + if (defined $to) { + $output_line .= " " . chr $to; + if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) { + $extra++; + } + else { + $output_line .= " "; + } + } + else { + $output_line .= " "; + } + + $output_line .= " U+$lhs_hex"; + $output_line .= ", U+$rhs_hex" if defined $to;; + my $cur_len = $extra + length $output_line; + $output_line .= " " x ($hanging_indent - $cur_len); + + my $max_len = 74; # Pod formatter will indent 4 spaces + $cur_len = length $output_line; + + if ($cur_len + length $name <= $max_len) { + $output_line .= $name; # It will fit + } + else { # It won't fit. Append a segment that is unbreakable until would + # exceed the available width; then start on a new line + # Doesn't handle the case where the whole segment doesn't fit; + # this just doesn't come up with the input data. + while ($name =~ / ( .+? ) \b{lb} /xg) { + my $segment = $1; + my $added_length = length $segment; + if ($cur_len + $added_length > $max_len) { + $output_line =~ s/ +$//; + $output_line .= "\n" . " " x $hanging_indent; + $cur_len = $hanging_indent; + } + + $output_line .= $segment; + $cur_len += $added_length; + } + } + + return $output_line . "\n"; +} + my $version = Unicode::UCD::UnicodeVersion(); my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; $dotdot = 0 unless defined $dotdot; @@ -270,7 +368,10 @@ foreach my $charset (get_supported_code_pages()) { . charnames::viacode($to) . "'"; } + + print STDERR format_pairs_line($from, $to) if $output_lists; } + $output_lists = 0; # Only output in first iteration print $out_fh <<~"EOT"; |