diff options
author | Karl Williamson <khw@cpan.org> | 2014-04-28 16:57:16 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2014-05-31 10:20:54 -0600 |
commit | 4bc3dcfa1b5036dd61b541e01ed1ea7eb668f907 (patch) | |
tree | 8bfe4f85b6c36df6456d256cff41e108f79b2dc2 /regen | |
parent | 6ff677df5d6fe0f52ca0b6736f8b5a46ac402943 (diff) | |
download | perl-4bc3dcfa1b5036dd61b541e01ed1ea7eb668f907.tar.gz |
Make many EBCDIC tables generated instead of hand-coded
This causes the generated file ebcdic_tables.h to be #included by
utfebcdic.h instead of the hand-coded tables that were formerly there.
This makes it much easier to add or remove support for EBCDIC code
pages.
The UTF-EBCDIC-related tables for 037 and POSIX-BC are somewhat modified
from what they were before. They were changed by hand minimally a long
time ago to prevent segfaults, but in so doing, they lost an important
sorting characteristic of UTF-EBCDIC. The machine-generated versions
retain the sorting, while also not doing the segfaults. utfebcdic.h has
more detail about this, regarding tr16.
Diffstat (limited to 'regen')
-rw-r--r-- | regen/ebcdic.pl | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl new file mode 100644 index 0000000000..055403ea12 --- /dev/null +++ b/regen/ebcdic.pl @@ -0,0 +1,204 @@ +use v5.16.0; +use strict; +use warnings; +require 'regen/regen_lib.pl'; +require 'regen/charset_translations.pl'; + +# Generates the EBCDIC translation tables that were formerly hard-coded into +# utfebcdic.h + +my $out_fh = open_new('ebcdic_tables.h', '>', + {style => '*', by => $0, }); + +sub output_table ($$) { + my $table_ref = shift; + my $name = shift; + + die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256; + + print $out_fh "EXTCONST U8 $name\[\] = {\n"; + + for my $i (0 .. 255) { + printf $out_fh "%4d", $table_ref->[$i]; + #printf $out_fh " 0x%02X", $table_ref->[$i]; + print $out_fh ",", if $i < 255; + print $out_fh "\n" if $i % 16 == 15; + } + print $out_fh "};\n\n"; +} + +print $out_fh <<END; + +#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */ +#define H_EBCDIC_TABLES 1 + +/* This file contains definitions for various tables used in EBCDIC handling. + * More info is in utfebcdic.h */ +END + +my @charsets = get_supported_code_pages(); +shift @charsets; # ASCII is the 0th, and we don't deal with that here. +foreach my $charset (@charsets) { + my @a2e = get_a2n($charset); + + print $out_fh "\n" . get_conditional_compile_line_start($charset); + print $out_fh "\n"; + + print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; + output_table(\@a2e, "PL_a2e"); + + { # Construct the inverse + my @e2a; + for my $i (0 .. 255) { + $e2a[$a2e[$i]] = $i; + } + print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; + output_table(\@e2a, "PL_e2a"); + } + + my @i82utf = get_I8_2_utf($charset); + print $out_fh <<END; +/* (Confusingly named) Index is $charset I8 byte; value is + * $charset UTF-EBCDIC equivalent */ +END + output_table(\@i82utf, "PL_utf2e"); + + { #Construct the inverse + my @utf2i8; + for my $i (0 .. 255) { + $utf2i8[$i82utf[$i]] = $i; + } + print $out_fh <<END; +/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is + * $charset I8 equivalent */ +END + output_table(\@utf2i8, "PL_e2utf"); + } + + { + my @utf8skip; + + # These are invariants or continuation bytes. + for my $i (0 .. 0xBF) { + $utf8skip[$i82utf[$i]] = 1; + } + + # These are start bytes; The skip is the number of consecutive highest + # order 1-bits (up to 7) + for my $i (0xC0 .. 255) { + my $count; + if (($i & 0b11111110) == 0b11111110) { + $count= 7; + } + elsif (($i & 0b11111100) == 0b11111100) { + $count= 6; + } + elsif (($i & 0b11111000) == 0b11111000) { + $count= 5; + } + elsif (($i & 0b11110000) == 0b11110000) { + $count= 4; + } + elsif (($i & 0b11100000) == 0b11100000) { + $count= 3; + } + elsif (($i & 0b11000000) == 0b11000000) { + $count= 2; + } + else { + die "Something wrong for UTF8SKIP calculation for $i"; + } + $utf8skip[$i82utf[$i]] = $count; + } + + print $out_fh <<END; +/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes; + * 1 for continuation. Adapted from the shadow flags table in tr16. The + * entries marked 9 in tr16 are continuation bytes and are marked as length 1 + * here so that we can recover. */ +END + output_table(\@utf8skip, "PL_utf8skip"); + } + + use feature 'unicode_strings'; + + { + my @lc; + for my $i (0 .. 255) { + $lc[$a2e[$i]] = $a2e[ord lc chr $i]; + } + print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n"; + output_table(\@lc, "PL_latin1_lc"); + } + + { + my @uc; + for my $i (0 .. 255) { + my $uc = uc chr $i; + if (length $uc > 1 || ord $uc > 255) { + $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; + } + $uc[$a2e[$i]] = $a2e[ord $uc]; + } + print $out_fh <<END; +/* Index is $charset code point; value is its uppercase equivalent. + * The 'mod' in the name means that codepoints whose uppercase is above 255 or + * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ +END + output_table(\@uc, "PL_mod_latin1_uc"); + } + + { # PL_fold + my @ascii_fold; + for my $i (0 .. 255) { # Initialise to identity map + $ascii_fold[$i] = $i; + } + + # Overwrite the entries that aren't identity + for my $chr ('A' .. 'Z') { + $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; + } + for my $chr ('a' .. 'z') { + $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; + } + print $out_fh <<END; +/* Index is $charset code point; For A-Z, value is a-z; for a-z, value + * is A-Z; all other code points map to themselves */ +END + output_table(\@ascii_fold, "PL_fold"); + } + + { + my @latin1_fold; + for my $i (0 .. 255) { + my $char = chr $i; + my $lc = lc $char; + + # lc and uc adequately proxy for fold-case pairs in this 0-255 + # range + my $uc = uc $char; + $uc = $char if length $uc > 1 || ord $uc > 255; + if ($lc ne $char) { + $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; + } + elsif ($uc ne $char) { + $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; + } + else { + $latin1_fold[$a2e[$i]] = $a2e[$i]; + } + } + print $out_fh <<END; +/* Index is $charset code point; value is its other fold-pair equivalent + * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is + * the code point itself */ +END + output_table(\@latin1_fold, "PL_fold_latin1"); + } + + print $out_fh get_conditional_compile_line_end(); +} + +print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n"; + +read_only_bottom_close_and_rename($out_fh); |