mk_PL_charclass.pl: Allow to work on early Unicodes

If the version of Unicode being compiled doesn't have the modern casefolding .txt file, get the values from Unicode::UCD. Also for EBCDIC, where otherwise the file would have to be translated.
author: Karl Williamson <public@khwilliamson.com> 2012-03-27 09:54:53 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-06-02 08:29:15 -0600
commit: dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9 (patch)
tree: e4a7b7f8dc0564036fe8f8d8ffa4b075ed8ca7ea
parent: 83521fcf4b2a24304c04352af0f6116208474ca7 (diff)
download: perl-dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9.tar.gz
2 files changed, 43 insertions, 5 deletions
diff --git a/l1_char_class_tab.h b/l1_char_class_tab.h
index 28df339fe9..dc57acfa42 100644
--- a/l1_char_class_tab.h
+++ b/l1_char_class_tab.h
@@ -1,7 +1,6 @@
 /* -*- buffer-read-only: t -*-
  * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
- * This file is built by regen/mk_PL_charclass.pl from property definitions
- * and lib/unicore/CaseFolding.txt.
+ * This file is built by regen/mk_PL_charclass.pl from property definitions.
  * Any changes made here will be lost!
  */
 
diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl
index 5a3dbbe1f3..6a7dc92c45 100644
--- a/regen/mk_PL_charclass.pl
+++ b/regen/mk_PL_charclass.pl
@@ -57,8 +57,47 @@ my @properties = qw(
 # Read in the case fold mappings.
 my %folded_closure;
 my $file="lib/unicore/CaseFolding.txt";
-open my $fh, "<", $file or die "Failed to read '$file': $!";
-while (<$fh>) {
+my @folds;
+use Unicode::UCD;
+
+# Use the Unicode data file if we are on an ASCII platform (which its data is
+# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
+# available.  This avoids being affected by potential bugs introduced by other
+# layers of Perl
+if (ord('A') == 65
+    && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
+    && open my $fh, "<", $file)
+{
+    @folds = <$fh>;
+}
+else {
+    my ($invlist_ref, $invmap_ref, undef, $default)
+                                    = Unicode::UCD::prop_invmap('Case_Folding');
+    for my $i (0 .. @$invlist_ref - 1 - 1) {
+        next if $invmap_ref->[$i] == $default;
+        my $adjust = -1;
+        for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
+            $adjust++;
+
+            # Single-code point maps go to a 'C' type
+            if (! ref $invmap_ref->[$i]) {
+                push @folds, sprintf("%04X; C; %04X\n",
+                                     $j,
+                                     $invmap_ref->[$i] + $adjust);
+            }
+            else {  # Multi-code point maps go to 'F'.  prop_invmap()
+                    # guarantees that no adjustment is needed for these,
+                    # as the range will contain just one element
+                push @folds, sprintf("%04X; F; %s\n",
+                                    $j,
+                                    join " ", map { sprintf "%04X", $_ }
+                                                    @{$invmap_ref->[$i]});
+            }
+        }
+    }
+}
+
+for (@folds) {
     chomp;
 
     # Lines look like (without the initial '#'
@@ -230,7 +269,7 @@ my @C1 = qw(
 
 my $out_fh = open_new('l1_char_class_tab.h', '>',
 		      {style => '*', by => $0,
-                      from => "property definitions and $file"});
+                      from => "property definitions"});
 
 # Output the table using fairly short names for each char.
 for my $ord (0..255) {
author	Karl Williamson <public@khwilliamson.com>	2012-03-27 09:54:53 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-06-02 08:29:15 -0600
commit	dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9 (patch)
tree	e4a7b7f8dc0564036fe8f8d8ffa4b075ed8ca7ea
parent	83521fcf4b2a24304c04352af0f6116208474ca7 (diff)
download	perl-dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9.tar.gz