summaryrefslogtreecommitdiff
path: root/regen/regcharclass.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-05-05 18:50:01 -0600
committerKarl Williamson <khw@cpan.org>2014-05-31 11:34:56 -0600
commita1b2a50fb33671d2474d83dc6a3d62dbcb99965b (patch)
tree9a895b677f79428804017ffa979c0576ba42dc80 /regen/regcharclass.pl
parent09be812375d15ad2e53923ffde9ee8117d89d3ef (diff)
downloadperl-a1b2a50fb33671d2474d83dc6a3d62dbcb99965b.tar.gz
regen/regcharclass.pl: Update to use EBCDIC utilities
This causes the generated regcharclass.h to be valid on all supported platforms
Diffstat (limited to 'regen/regcharclass.pl')
-rwxr-xr-xregen/regcharclass.pl86
1 files changed, 46 insertions, 40 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 8c36b03a02..0c1993dcbe 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -12,9 +12,8 @@ our $hex_fmt= "0x%02X";
sub DEBUG () { 0 }
$|=1 if DEBUG;
-sub ASCII_PLATFORM { (ord('A') == 65) }
-
require 'regen/regen_lib.pl';
+require 'regen/charset_translations.pl';
require "regen/regcharclass_multi_char_folds.pl";
=head1 NAME
@@ -162,38 +161,36 @@ License or the Artistic License, as specified in the README file.
#
sub __uni_latin1 {
+ my $charset= shift;
my $str= shift;
my $max= 0;
my @cp;
my @cp_high;
my $only_has_invariants = 1;
+ my @a2n = get_a2n($charset);
for my $ch ( split //, $str ) {
my $cp= ord $ch;
- push @cp, $cp;
- push @cp_high, $cp if $cp > 255;
$max= $cp if $max < $cp;
- if (! ASCII_PLATFORM && $only_has_invariants) {
- if ($cp > 255) {
- $only_has_invariants = 0;
- }
- else {
- my $temp = chr($cp);
- utf8::upgrade($temp);
- my @utf8 = unpack "U0C*", $temp;
- $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
- }
+ if ($cp > 255) {
+ push @cp, $cp;
+ push @cp_high, $cp;
+ }
+ else {
+ push @cp, $a2n[$cp];
}
}
my ( $n, $l, $u );
- $only_has_invariants = $max < 128 if ASCII_PLATFORM;
+ $only_has_invariants = ($charset =~ /ascii/i) ? $max < 128 : $max < 160;
if ($only_has_invariants) {
$n= [@cp];
} else {
$l= [@cp] if $max && $max < 256;
- $u= $str;
- utf8::upgrade($u);
- $u= [ unpack "U0C*", $u ] if defined $u;
+ my @u;
+ for my $ch ( split //, $str ) {
+ push @u, map { ord } split //, cp_2_utfbytes(ord $ch, $charset);
+ }
+ $u = \@u;
}
return ( \@cp, \@cp_high, $n, $l, $u );
}
@@ -354,10 +351,6 @@ sub new {
$str= chr eval $str;
} elsif ( $str =~ /^0x/ ) {
$str= eval $str;
-
- # Convert from Unicode/ASCII to native, if necessary
- $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
- && $str <= 0xFF;
$str = chr $str;
} elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
my $property = $1;
@@ -393,7 +386,7 @@ sub new {
} else {
die "Unparsable line: $txt\n";
}
- my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
+ my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $opt{charset}, $str );
my $UTF8= $low || $utf8;
my $LATIN1= $low || $latin1;
my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
@@ -1107,8 +1100,8 @@ sub _cond_as_str {
# bounds. (No legal UTF-8 character can begin with anything in
# this range, so we don't have to worry about this being a
# continuation byte or not.)
- if (ASCII_PLATFORM
- && ! $opts_ref->{safe}
+ if ($opts_ref->{charset} =~ /ascii/i
+ && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
&& $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
{
my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
@@ -1359,19 +1352,21 @@ WARNING: These macros are for internal Perl core use only, and may be
changed or removed without notice.
EOF
);
- print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
+ print $out_fh "\n#ifndef H_REGCHARCLASS /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n";
my ( $op, $title, @txt, @types, %mods );
- my $doit= sub {
+ my $doit= sub ($) {
return unless $op;
+ my $charset = shift;
+
# Skip if to compile on a different platform.
- return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
- return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
+ return if delete $mods{only_ascii_platform} && $charset !~ /ascii/i;
+ return if delete $mods{only_ebcdic_platform} && $charset !~ /ebcdic/i;
print $out_fh "/*\n\t$op: $title\n\n";
print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
- my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
+ my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt, charset => $charset);
#die Dumper(\@types,\%mods);
@@ -1402,18 +1397,30 @@ EOF
type => $type,
ret_type => $ret,
safe => $mod eq 'safe' && $type !~ /^cp/,
+ charset => $charset,
);
print $out_fh $macro, "\n";
}
}
};
- while ( <DATA> ) {
+ my @data = <DATA>;
+ foreach my $charset (get_supported_code_pages()) {
+ my $first_time = 1;
+ undef $op;
+ undef $title;
+ undef @txt;
+ undef @types;
+ undef %mods;
+ print $out_fh "\n", get_conditional_compile_line_start($charset);
+ my @data_copy = @data;
+ for (@data_copy) {
s/^ \s* (?: \# .* ) ? $ //x; # squeeze out comment and blanks
next unless /\S/;
chomp;
if ( /^[A-Z]/ ) {
- $doit->(); # This starts a new definition; do the previous one
+ $doit->($charset) unless $first_time; # This starts a new definition; do the previous one
+ $first_time = 0;
( $op, $title )= split /\s*:\s*/, $_, 2;
@txt= ();
} elsif ( s/^=>// ) {
@@ -1425,7 +1432,9 @@ EOF
push @txt, "$_";
}
}
- $doit->();
+ $doit->($charset);
+ print $out_fh get_conditional_compile_line_end();
+ }
print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
@@ -1526,9 +1535,9 @@ EOF
# fast The input string is valid UTF-8. No bounds checking is done,
# and the macro can make assumptions that lead to faster
# execution.
-# only_ascii_platform Skip this definition if this program is being run on
+# only_ascii_platform Skip this definition if the character set is for
# a non-ASCII platform.
-# only_ebcdic_platform Skip this definition if this program is being run on
+# only_ebcdic_platform Skip this definition if the character set is for
# a non-EBCDIC platform.
# No modifier need be specified; fast is assumed for this case. If both
# 'fast', and 'safe' are specified, two macros will be created for each
@@ -1623,12 +1632,9 @@ GCB_V: Grapheme_Cluster_Break=V
#=> UTF8 :safe only_ascii_platform
#0x0 - 0x1FFFFF
-# This hasn't been commented out, because we haven't an EBCDIC platform to run
-# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
-# different results
-UTF8_CHAR: Matches utf8 from 1 to 5 bytes
+UTF8_CHAR: Matches utf8 from 1 to 3 bytes
=> UTF8 :safe only_ebcdic_platform
-0x0 - 0x3FFFFF:
+0x0 - 0x3FFF
QUOTEMETA: Meta-characters that \Q should quote
=> high :fast