mktables: Add code for easier handling of early Unicode versions

This adds infrastructure to the constructor of the Input_file class to allow an alternative to be specified when compiling a Unicode release that is earlier than the file first became available. This is only used when the property is used by core perl and has to work in all releases. For example the qr/\X/ construct should always work, but relies on a property that isn't specified before Unicode 4.1. This allows for easier specification of how to handle this type of case.
author: Karl Williamson <khw@cpan.org> 2015-07-26 22:35:02 -0600
committer: Karl Williamson <khw@cpan.org> 2015-07-28 22:15:57 -0600
commit: a0204bb1e96bad730a658424442f873e50357928 (patch)
tree: 96dd7dc4f3733507a8ae21242f6873a2638d8d91 /lib
parent: d47a7529759ebdc12b1c99229f4d7c6bc649df62 (diff)
download: perl-a0204bb1e96bad730a658424442f873e50357928.tar.gz
1 files changed, 265 insertions, 3 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index baea9df2ed..b22442368d 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1382,6 +1382,8 @@ my @named_sequences;       # NamedSequences.txt contents.
 my %potential_files;       # Generated list of all .txt files in the directory
                            # structure so we can warn if something is being
                            # ignored.
+my @missing_early_files;   # Generated list of absent files that we need to
+                           # proceed in compiling this early Unicode version
 my @files_actually_output; # List of files we generated.
 my @more_Names;            # Some code point names are compound; this is used
                            # to store the extra components of them.
@@ -2027,6 +2029,15 @@ package Input_file;
 # is also stackable, but none of the others are, but could easily be changed
 # to be so.
 #
+# Some properties are used by the Perl core but aren't defined until later
+# Unicode releases.  The perl interpreter would have problems working when
+# compiled with an earlier Unicode version that doesn't have them, so we need
+# to define them somehow for those releases.  The 'Early' constructor
+# parameter can be used to automatically handle this.  It is essentially
+# ignored if the Unicode version being compiled has a data file for this
+# property.  Either code to execute or a file to read can be specified.
+# Details are at the %early definition.
+#
 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
 # which insert the parameters as lines to be processed before the next input
 # file line is read.  This allows the EOF handler(s) to flush buffers, for
@@ -2182,6 +2193,50 @@ sub trace { return main::trace(@_); }
     # storage of '@missing' defaults lines
     main::set_access('missings', \%missings);
 
+    my %early;
+    # Used for properties that must be defined (for Perl's purposes) on
+    # versions of Unicode earlier than Unicode itself defines them.  The
+    # parameter is an array (it would be better to be a hash, but not worth
+    # bothering about due to its rare use).
+    #
+    # The first element is either a code reference to call when in a release
+    # earlier than the Unicode file is available in, or it is an alternate
+    # file to use instead of the non-existent one.  This file must have been
+    # plunked down in the same directory as mktables.  Should you be compiling
+    # on a release that needs such a file, mktables will abort the
+    # compilation, and tell you where to get the necessary file(s), and what
+    # name(s) to use to store them as.
+    # In the case of specifying an alternate file, the array must contain two
+    # further elements:
+    #
+    # [1] is the name of the property that will be generated by this file.
+    # The class automatically takes the input file and excludes any code
+    # points in it that were not assigned in the Unicode version being
+    # compiled.  It then uses this result to define the property in the given
+    # version.  Since the property doesn't actually exist in the Unicode
+    # version being compiled, this should be a name accessible only by core
+    # perl.  If it is the same name as the regular property, the constructor
+    # will mark the output table as a $PLACEHOLDER so that it doesn't actually
+    # get output, and so will be unusable by non-core code.  Otherwise it gets
+    # marked as $INTERNAL_ONLY.
+    #
+    # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
+    # the Hangul syllables in that release (which were ripped out in version
+    # 2) for the given property .  (Hence it is ignored except when compiling
+    # version 1.  You only get one value that applies to all of them, which
+    # may not be the actual reality, but probably nobody cares anyway for
+    # these obsolete characters.)
+    #
+    # Not all files can be handled in the above way, and so the code ref
+    # alternative is available.  It can do whatever it needs to.  The other
+    # array elements are optional in this case, and the code is free to use or
+    # ignore them if they are present.
+    #
+    # Internally, the constructor unshifts a 0 or 1 onto this array to
+    # indicate if an early alternative is actually being used or not.  This
+    # makes for easier testing later on.
+    main::set_access('early', \%early, 'c');
+
     my %required_even_in_debug_skip;
     # debug_skip is used to speed up compilation during debugging by skipping
     # processing files that are not needed for the task at hand.  However,
@@ -2222,6 +2277,7 @@ sub trace { return main::trace(@_); }
         $eof_handler{$addr} = [ ];
         $errors{$addr} = { };
         $missings{$addr} = [ ];
+        $early{$addr} = [ ];
         $optional{$addr} = [ ];
 
         # Two positional parameters.
@@ -2289,9 +2345,90 @@ sub trace { return main::trace(@_); }
         }
 
         my $progress;
+        my $function_instead_of_file = 0;
+
+        # If we are compiling a Unicode release earlier than the file became
+        # available, the constructor may have supplied a substitute
+        if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
+
+            # Yes, we have a substitute, that we will use; mark it so
+            unshift $early{$addr}->@*, 1;
+
+            # See the definition of %early for what the array elements mean.
+            # If we have a property this defines, create a table and default
+            # map for it now (at essentially compile time), so that it will be
+            # available for the whole of run time.  (We will want to add this
+            # name as an alias when we are using the official property name;
+            # but this must be deferred until run(), because at construction
+            # time the official names have yet to be defined.)
+            if ($early{$addr}[2]) {
+                my $fate = ($property{$addr}
+                            && $property{$addr} eq $early{$addr}[2])
+                          ? $PLACEHOLDER
+                          : $INTERNAL_ONLY;
+                my $prop_object = Property->new($early{$addr}[2],
+                                                Fate => $fate,
+                                                Perl_Extension => 1,
+                                                );
+
+                # Use the default mapping for the regular property for this
+                # substitute one.
+                if (    defined $property{$addr}
+                    &&  defined $default_mapping{$property{$addr}})
+                {
+                    $prop_object
+                        ->set_default_map($default_mapping{$property{$addr}});
+                }
+            }
+
+            if (ref $early{$addr}[1] eq 'CODE') {
+                $function_instead_of_file = 1;
+
+                # If the first element of the array is a code ref, the others
+                # are optional.
+                $handler{$addr} = $early{$addr}[1];
+                $property{$addr} = $early{$addr}[2]
+                                                if defined $early{$addr}[2];
+                $progress = "substitute $file{$addr}";
+
+                undef $file{$addr};
+            }
+            else {  # Specifying a substitute file
+
+                if (! main::file_exists($early{$addr}[1])) {
+
+                    # If we don't see the substitute file, generate an error
+                    # message giving the needed things, and add it to the list
+                    # of such to output before actual processing happens
+                    # (hence the user finds out all of them in one run).
+                    my $string_version = sprintf "%vd", $first_released{$addr};
+                    push @missing_early_files, <<END;
+'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
+END
+                    ;
+                    return;
+                }
+                $progress = $early{$addr}[1];
+                $progress .= ", substituting for $file{$addr}" if $file{$addr};
+                $file{$addr} = $early{$addr}[1];
+                $property{$addr} = $early{$addr}[2];
 
-        if ($first_released{$addr} le $v_version) {
+                # Ignore code points not in the version being compiled
+                push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
+
+                if (   $v_version lt v2.0        # Hanguls in this release ...
+                    && defined $early{$addr}[3]) # ... need special treatment
+                {
+                    push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
+                }
+            }
+
+            # And this substitute is valid for all releases.
+            $first_released{$addr} = v0;
+        }
+        else {  # Normal behavior
             $progress = $file{$addr};
+            unshift $early{$addr}->@*, 0; # No substitute
         }
 
         my $file = $file{$addr};
@@ -2307,8 +2444,10 @@ sub trace { return main::trace(@_); }
         else {
             $in_this_release{$addr} = $first_released{$addr} le $v_version;
 
-            # Check that the file for this object exists
-            if (! main::file_exists($file))
+            # Check that the file for this object (possibly using a substitute
+            # for early releases) exists or we have a function alternative
+            if (   ! $function_instead_of_file
+                && ! main::file_exists($file))
             {
                 # Here there is nothing available for this release.  This is
                 # fine if we aren't expecting anything in this release.
@@ -2485,6 +2624,29 @@ END
             Carp::my_carp("Was not expecting '$file'.")
                                     if $exists && ! $in_this_release{$addr};
 
+            # If there is special handling for compiling Unicode releases
+            # earlier than the first one in which Unicode defines this
+            # property ...
+            if ($early{$addr}->@* > 1) {
+
+                # Mark as processed any substitute file that would be used in
+                # such a release
+                $fkey = File::Spec->rel2abs($early{$addr}[1]);
+                delete $potential_files{lc($fkey)};
+
+                # As commented in the constructor code, when using the
+                # official property, we still have to allow the publicly
+                # inaccessible early name so that the core code which uses it
+                # will work regardless.
+                if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
+                    my $early_property_name = $early{$addr}[2];
+                    if ($property{$addr} ne $early_property_name) {
+                        main::property_ref($property{$addr})
+                                            ->add_alias($early_property_name);
+                    }
+                }
+            }
+
             # We may be skipping this file ...
             if (defined $skip{$addr}) {
 
@@ -2532,6 +2694,7 @@ END
             # substitute file instead of the official one (though the code
             # could be extended to do so).
             if ($in_this_release{$addr}
+                && ! $early{$addr}[0]
                 && lc($file) ne 'unicodedata.txt')
             {
                 if ($file !~ /^Unihan/i) {
@@ -2970,6 +3133,82 @@ END
         return @return;
     }
 
+    sub _exclude_unassigned {
+
+        # Takes the range in $_ and excludes code points that aren't assigned
+        # in this release
+
+        state $skip_inserted_count = 0;
+
+        # Ignore recursive calls.
+        if ($skip_inserted_count) {
+            $skip_inserted_count--;
+            return;
+        }
+
+        # Find what code points are assigned in this release
+        main::calculate_Assigned() if ! defined $Assigned;
+
+        my $self = shift;
+        my $addr = do { no overloading; pack 'J', $self; };
+        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+        my ($range, @remainder)
+            = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+        # Examine the range.
+        if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
+        {
+            my $low = hex $1;
+            my $high = (defined $2) ? hex $2 : $low;
+
+            # Split the range into subranges of just those code points in it
+            # that are assigned.
+            my @ranges = (Range_List->new(Initialize
+                              => Range->new($low, $high)) & $Assigned)->ranges;
+
+            # Do nothing if nothing in the original range is assigned in this
+            # release; handle normally if everything is in this release.
+            if (! @ranges) {
+                $_ = "";
+            }
+            elsif (@ranges != 1) {
+
+                # Here, some code points in the original range aren't in this
+                # release; @ranges gives the ones that are.  Create fake input
+                # lines for each of the ranges, and set things up so that when
+                # this routine is called on that fake input, it will do
+                # nothing.
+                $skip_inserted_count = @ranges;
+                my $remainder = join ";", @remainder;
+                for my $range (@ranges) {
+                    $self->insert_lines(sprintf("%04X..%04X;%s",
+                                    $range->start, $range->end, $remainder));
+                }
+                $_ = "";    # The original range is now defunct.
+            }
+        }
+
+        return;
+    }
+
+    sub _fixup_obsolete_hanguls {
+
+        # This is called only when compiling Unicode version 1.  All Unicode
+        # data for subsequent releases assumes that the code points that were
+        # Hangul syllables in this release only are something else, so if
+        # using such data, we have to override it
+
+        my $self = shift;
+        my $addr = do { no overloading; pack 'J', $self; };
+        Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+        my $object = main::property_ref($property{$addr});
+        $object->add_map(0x3400, 0x4DFF,
+                         $early{$addr}[3],  # Passed-in value for these
+                         Replace => $UNCONDITIONALLY);
+    }
+
     sub _insert_property_into_line {
         # Add a property field to $_, if this file requires it.
 
@@ -18737,6 +18976,29 @@ my @input_file_objects = (
 # End of all the preliminaries.
 # Do it...
 
+if (@missing_early_files) {
+    print simple_fold(join_lines(<<END
+
+The compilation cannot be completed because one or more required input files,
+listed below, are missing.  This is because you are compiling Unicode version
+$string_version, which predates the existence of these file(s).  To fully
+function, perl needs the data that these files would have contained if they
+had been in this release.  To work around this, create copies of later
+versions of the missing files in the directory containing '$0'.  (Perl will
+make the necessary adjustments to the data to compensate for it not being the
+same version as is being compiled.)  The files are available from unicode.org,
+via either ftp or http.  If using http, they will be under
+www.unicode.org/versions/.  Below are listed the source file name of each
+missing file, the Unicode version to copy it from, and the name to store it
+as.  (Note that the listed source file name may not be exactly the one that
+Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
+to get the correct name.)
+END
+    ));
+    print simple_fold(join_lines("\n$_")) for @missing_early_files;
+    exit 2;
+}
+
 if ($compare_versions) {
     Carp::my_carp(<<END
 Warning.  \$compare_versions is set.  Output is not suitable for production
author	Karl Williamson <khw@cpan.org>	2015-07-26 22:35:02 -0600
committer	Karl Williamson <khw@cpan.org>	2015-07-28 22:15:57 -0600
commit	a0204bb1e96bad730a658424442f873e50357928 (patch)
tree	96dd7dc4f3733507a8ae21242f6873a2638d8d91 /lib
parent	d47a7529759ebdc12b1c99229f4d7c6bc649df62 (diff)
download	perl-a0204bb1e96bad730a658424442f873e50357928.tar.gz