summaryrefslogtreecommitdiff
path: root/regen/ebcdic.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-04-28 16:57:16 -0600
committerKarl Williamson <khw@cpan.org>2014-05-31 10:20:54 -0600
commit4bc3dcfa1b5036dd61b541e01ed1ea7eb668f907 (patch)
tree8bfe4f85b6c36df6456d256cff41e108f79b2dc2 /regen/ebcdic.pl
parent6ff677df5d6fe0f52ca0b6736f8b5a46ac402943 (diff)
downloadperl-4bc3dcfa1b5036dd61b541e01ed1ea7eb668f907.tar.gz
Make many EBCDIC tables generated instead of hand-coded
This causes the generated file ebcdic_tables.h to be #included by utfebcdic.h instead of the hand-coded tables that were formerly there. This makes it much easier to add or remove support for EBCDIC code pages. The UTF-EBCDIC-related tables for 037 and POSIX-BC are somewhat modified from what they were before. They were changed by hand minimally a long time ago to prevent segfaults, but in so doing, they lost an important sorting characteristic of UTF-EBCDIC. The machine-generated versions retain the sorting, while also not doing the segfaults. utfebcdic.h has more detail about this, regarding tr16.
Diffstat (limited to 'regen/ebcdic.pl')
-rw-r--r--regen/ebcdic.pl204
1 files changed, 204 insertions, 0 deletions
diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl
new file mode 100644
index 0000000000..055403ea12
--- /dev/null
+++ b/regen/ebcdic.pl
@@ -0,0 +1,204 @@
+use v5.16.0;
+use strict;
+use warnings;
+require 'regen/regen_lib.pl';
+require 'regen/charset_translations.pl';
+
+# Generates the EBCDIC translation tables that were formerly hard-coded into
+# utfebcdic.h
+
+my $out_fh = open_new('ebcdic_tables.h', '>',
+ {style => '*', by => $0, });
+
+sub output_table ($$) {
+ my $table_ref = shift;
+ my $name = shift;
+
+ die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
+
+ print $out_fh "EXTCONST U8 $name\[\] = {\n";
+
+ for my $i (0 .. 255) {
+ printf $out_fh "%4d", $table_ref->[$i];
+ #printf $out_fh " 0x%02X", $table_ref->[$i];
+ print $out_fh ",", if $i < 255;
+ print $out_fh "\n" if $i % 16 == 15;
+ }
+ print $out_fh "};\n\n";
+}
+
+print $out_fh <<END;
+
+#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
+#define H_EBCDIC_TABLES 1
+
+/* This file contains definitions for various tables used in EBCDIC handling.
+ * More info is in utfebcdic.h */
+END
+
+my @charsets = get_supported_code_pages();
+shift @charsets; # ASCII is the 0th, and we don't deal with that here.
+foreach my $charset (@charsets) {
+ my @a2e = get_a2n($charset);
+
+ print $out_fh "\n" . get_conditional_compile_line_start($charset);
+ print $out_fh "\n";
+
+ print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
+ output_table(\@a2e, "PL_a2e");
+
+ { # Construct the inverse
+ my @e2a;
+ for my $i (0 .. 255) {
+ $e2a[$a2e[$i]] = $i;
+ }
+ print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
+ output_table(\@e2a, "PL_e2a");
+ }
+
+ my @i82utf = get_I8_2_utf($charset);
+ print $out_fh <<END;
+/* (Confusingly named) Index is $charset I8 byte; value is
+ * $charset UTF-EBCDIC equivalent */
+END
+ output_table(\@i82utf, "PL_utf2e");
+
+ { #Construct the inverse
+ my @utf2i8;
+ for my $i (0 .. 255) {
+ $utf2i8[$i82utf[$i]] = $i;
+ }
+ print $out_fh <<END;
+/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
+ * $charset I8 equivalent */
+END
+ output_table(\@utf2i8, "PL_e2utf");
+ }
+
+ {
+ my @utf8skip;
+
+ # These are invariants or continuation bytes.
+ for my $i (0 .. 0xBF) {
+ $utf8skip[$i82utf[$i]] = 1;
+ }
+
+ # These are start bytes; The skip is the number of consecutive highest
+ # order 1-bits (up to 7)
+ for my $i (0xC0 .. 255) {
+ my $count;
+ if (($i & 0b11111110) == 0b11111110) {
+ $count= 7;
+ }
+ elsif (($i & 0b11111100) == 0b11111100) {
+ $count= 6;
+ }
+ elsif (($i & 0b11111000) == 0b11111000) {
+ $count= 5;
+ }
+ elsif (($i & 0b11110000) == 0b11110000) {
+ $count= 4;
+ }
+ elsif (($i & 0b11100000) == 0b11100000) {
+ $count= 3;
+ }
+ elsif (($i & 0b11000000) == 0b11000000) {
+ $count= 2;
+ }
+ else {
+ die "Something wrong for UTF8SKIP calculation for $i";
+ }
+ $utf8skip[$i82utf[$i]] = $count;
+ }
+
+ print $out_fh <<END;
+/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
+ * 1 for continuation. Adapted from the shadow flags table in tr16. The
+ * entries marked 9 in tr16 are continuation bytes and are marked as length 1
+ * here so that we can recover. */
+END
+ output_table(\@utf8skip, "PL_utf8skip");
+ }
+
+ use feature 'unicode_strings';
+
+ {
+ my @lc;
+ for my $i (0 .. 255) {
+ $lc[$a2e[$i]] = $a2e[ord lc chr $i];
+ }
+ print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
+ output_table(\@lc, "PL_latin1_lc");
+ }
+
+ {
+ my @uc;
+ for my $i (0 .. 255) {
+ my $uc = uc chr $i;
+ if (length $uc > 1 || ord $uc > 255) {
+ $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
+ }
+ $uc[$a2e[$i]] = $a2e[ord $uc];
+ }
+ print $out_fh <<END;
+/* Index is $charset code point; value is its uppercase equivalent.
+ * The 'mod' in the name means that codepoints whose uppercase is above 255 or
+ * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
+END
+ output_table(\@uc, "PL_mod_latin1_uc");
+ }
+
+ { # PL_fold
+ my @ascii_fold;
+ for my $i (0 .. 255) { # Initialise to identity map
+ $ascii_fold[$i] = $i;
+ }
+
+ # Overwrite the entries that aren't identity
+ for my $chr ('A' .. 'Z') {
+ $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
+ }
+ for my $chr ('a' .. 'z') {
+ $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
+ }
+ print $out_fh <<END;
+/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
+ * is A-Z; all other code points map to themselves */
+END
+ output_table(\@ascii_fold, "PL_fold");
+ }
+
+ {
+ my @latin1_fold;
+ for my $i (0 .. 255) {
+ my $char = chr $i;
+ my $lc = lc $char;
+
+ # lc and uc adequately proxy for fold-case pairs in this 0-255
+ # range
+ my $uc = uc $char;
+ $uc = $char if length $uc > 1 || ord $uc > 255;
+ if ($lc ne $char) {
+ $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
+ }
+ elsif ($uc ne $char) {
+ $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
+ }
+ else {
+ $latin1_fold[$a2e[$i]] = $a2e[$i];
+ }
+ }
+ print $out_fh <<END;
+/* Index is $charset code point; value is its other fold-pair equivalent
+ * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
+ * the code point itself */
+END
+ output_table(\@latin1_fold, "PL_fold_latin1");
+ }
+
+ print $out_fh get_conditional_compile_line_end();
+}
+
+print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
+
+read_only_bottom_close_and_rename($out_fh);