diff options
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | lib/Unicode/UCD.pm | 26 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 60 | ||||
-rw-r--r-- | lib/unicore/mktables | 78 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | regcomp.c | 28 | ||||
-rw-r--r-- | utf8.c | 28 |
8 files changed, 195 insertions, 33 deletions
@@ -1480,6 +1480,7 @@ EXpM |void |_invlist_invert_prop|NN SV* const invlist EXMpR |SV* |_new_invlist |IV initial_size EXMpR |SV* |_swash_to_invlist |NN SV* const swash EXMpR |SV* |_add_range_to_invlist |NULLOK SV* invlist|const UV start|const UV end +EXMpR |SV* |_setup_canned_invlist|const STRLEN size|const UV element0|NN UV** other_elements_ptr EXMp |void |_invlist_populate_swatch |NN SV* const invlist|const UV start|const UV end|NN U8* swatch #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_TOKE_C) @@ -990,6 +990,7 @@ #define _invlist_populate_swatch(a,b,c,d) Perl__invlist_populate_swatch(aTHX_ a,b,c,d) #define _invlist_union_maybe_complement_2nd(a,b,c,d) Perl__invlist_union_maybe_complement_2nd(aTHX_ a,b,c,d) #define _new_invlist(a) Perl__new_invlist(aTHX_ a) +#define _setup_canned_invlist(a,b,c) Perl__setup_canned_invlist(aTHX_ a,b,c) #define _swash_to_invlist(a) Perl__swash_to_invlist(aTHX_ a) # endif # if defined(PERL_IN_REGEXEC_C) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index e4ae34e270..106fe7e678 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -2225,6 +2225,15 @@ sub prop_invlist ($;$) { my @invlist; + if ($swash->{'LIST'} =~ /^V/) { + + # A 'V' as the first character marks the input as already an inversion + # list, in which case, all we need to do is put the remaining lines + # into our array. + @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; + shift @invlist; + } + else { # The input lines look like: # 0041\t005A # [26] # 005F @@ -2259,6 +2268,7 @@ sub prop_invlist ($;$) { push @invlist, $begin + 1; } } + } # Could need to be inverted: add or subtract a 0 at the beginning of the # list. @@ -3173,6 +3183,21 @@ RETRY: my $requires_adjustment = $format =~ /^a/; + if ($swash->{'LIST'} =~ /^V/) { + @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; + shift @invlist; + foreach my $i (0 .. @invlist - 1) { + $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N' + } + + # The map includes lines for all code points; add one for the range + # from 0 to the first Y. + if ($invlist[0] != 0) { + unshift @invlist, 0; + unshift @invmap, 'N'; + } + } + else { # The LIST input lines look like: # ... # 0374\t\tCommon @@ -3329,6 +3354,7 @@ RETRY: push @invmap, $missing; } } + } # If the property is empty, make all code points use the value for missing # ones. diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index b2caf8934c..0d709b1c15 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -1067,30 +1067,19 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of } # Now construct a string from the list that should match the file. - # The file gives ranges of code points with starting and ending values - # in hex, like this: - # 41\t5A - # 61\t7A - # AA - # Our list has even numbered elements start ranges that are in the - # list, and odd ones that aren't in the list. Therefore the odd - # numbered ones are one beyond the end of the previous range, but - # otherwise don't get reflected in the file. - my $tested = ""; - my $i = 0; - for (; $i < @tested; $i += 2) { - my $start = $tested[$i]; - my $end = ($i + 1 < @tested) - ? $tested[$i+1] - 1 - : $Unicode::UCD::MAX_CP; - if ($start == $end) { - $tested .= sprintf("%X\n", $start); - } - else { - $tested .= sprintf "%X\t%X\n", $start, $end; - } - } - + # The file is inversion list format code points, like this: + # V1216 + # 65 # [26] + # 91 + # 192 # [23] + # ... + # The V indicates it's an inversion list, and is followed immediately + # by the number of elements (lines) that follow giving its contents. + # The list has even numbered elements (0th, 2nd, ...) start ranges + # that are in the list, and odd ones that aren't in the list. + # Therefore the odd numbered ones are one beyond the end of the + # previous range, but otherwise don't get reflected in the file. + my $tested = join "\n", ("V" . scalar @tested), @tested; local $/ = "\n"; chomp $tested; $/ = $input_record_separator; @@ -1665,6 +1654,11 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # appends the next line to the running string. my $tested_map = ""; + # For use with files for binary properties only, which are stored in + # inversion list format. This counts the number of data lines in the + # file. + my $binary_count = 0; + # Create a copy of the file's specials hash. (It has been undef'd if # we know it isn't relevant to this property, so if it exists, it's an # error or is relevant). As we go along, we delete from that copy. @@ -1870,6 +1864,20 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { my $end = (defined $invlist_ref->[$i+1]) ? $invlist_ref->[$i+1] - 1 : $Unicode::UCD::MAX_CP; + if ($is_binary) { + + # Files for binary properties are in inversion list format, + # without ranges. + $tested_map .= "$start\n"; + $binary_count++; + + # If the final value is infinity, no line for it exists. + if ($end < $Unicode::UCD::MAX_CP) { + $tested_map .= ($end + 1) . "\n"; + $binary_count++; + } + } + else { $end = ($start == $end) ? "" : sprintf($file_range_format, $end); if ($invmap_ref->[$i] ne "") { $tested_map .= sprintf "$file_range_format\t%s\t%s\n", @@ -1881,8 +1889,12 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { else { $tested_map .= sprintf "$file_range_format\n", $start; } + } } # End of looping over all elements. + # Binary property files begin with a line count line. + $tested_map = "V$binary_count\n$tested_map" if $binary_count; + # Here are done with generating what the file should look like local $/ = "\n"; diff --git a/lib/unicore/mktables b/lib/unicore/mktables index b94433fe2c..4a58886eee 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -5019,6 +5019,11 @@ sub trace { return main::trace(@_); } # The constructor can override the global flag of the same name. main::set_access('output_range_counts', \%output_range_counts, 'r'); + my %write_as_invlist; + # A boolean set iff the output file for this table is to be in the form of + # an inversion list/map. + main::set_access('write_as_invlist', \%write_as_invlist, 'r'); + my %format; # The format of the entries of the table. This is calculated from the # data in the table (or passed in the constructor). This is an enum e.g., @@ -5055,6 +5060,7 @@ sub trace { return main::trace(@_); } $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; $fate{$addr} = delete $args{'Fate'} || $ORDINARY; + $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default my $ucd = delete $args{'UCD'}; my $description = delete $args{'Description'}; @@ -5572,6 +5578,7 @@ END Carp::carp_extra_args(\@_) if main::DEBUG && @_; my $addr = do { no overloading; pack 'J', $self; }; + my $write_as_invlist = $write_as_invlist{$addr}; # Start with the header my @HEADER = $self->header; @@ -5624,16 +5631,23 @@ END else { my $range_size_1 = $range_size_1{$addr}; + # To make it more readable, use a minimum indentation + my $comment_indent; + # These are used only in $annotate option my $format; # e.g. $HEX_ADJUST_FORMAT my $include_name; # ? Include the character's name in the # annotation? my $include_cp; # ? Include its code point - # To make it more readable, use a minimum indentation - my $comment_indent = 16; - - if ($annotate) { + if (! $annotate) { + $comment_indent = ($self->isa('Map_Table')) + ? 24 + : ($write_as_invlist) + ? 8 + : 16; + } + else { $format = $self->format; # The name of the character is output only for tables that @@ -5651,7 +5665,11 @@ END # the first column $include_cp = ! $range_size_1; - if ($self->isa('Map_Table')) { + if (! $self->isa('Map_Table')) { + $comment_indent = ($write_as_invlist) ? 8 : 16; + } + else { + $comment_indent = 16; # There are just a few short ranges in this table, so no # need to include the code point in the annotation. @@ -5879,8 +5897,26 @@ END $previous_value = $value; } - # If there is a range - if ($start != $end) { + if ($write_as_invlist) { + + # Inversion list format has a single number per line, + # the starting code point of a range that matches the + # property + push @OUT, $start, "\n"; + $invlist_count++; + + # Add a comment with the size of the range, if + # requested. + if ($output_range_counts{$addr}) { + $OUT[-1] = merge_single_annotation_line( + $OUT[-1], + "# [" + . main::clarify_code_point_count($end - $start + 1) + . "]\n", + $comment_indent); + } + } + elsif ($start != $end) { # If there is a range if ($end == $MAX_WORKING_CODEPOINT) { push @OUT, sprintf "$hex_format\t$hex_format", $start, @@ -6116,6 +6152,15 @@ END } } + # Add the beginning of the range that doesn't match the + # property, except if the just added match range extends + # to infinity. We do this after any annotations for the + # match range. + if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { + push @OUT, $end + 1, "\n"; + $invlist_count++; + } + # If we split the range, set up so the next time through # we get the remainder, and redo. if ($next_start) { @@ -6129,6 +6174,8 @@ END } # End of loop through all the table's ranges push @OUT, @annotation; # Add orphaned annotation, if any + + splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; } # Add anything that goes after the main body, but within the here @@ -6408,6 +6455,7 @@ sub trace { return main::trace(@_); } Full_Name => $full_name, _Property => $property, _Range_List => $range_list, + Write_As_Invlist => 0, %args); my $addr = do { no overloading; pack 'J', $self; }; @@ -7330,6 +7378,7 @@ sub trace { return main::trace(@_); } _Property => $property, _Range_List => $range_list, Format => $EMPTY_FORMAT, + Write_As_Invlist => 1, ); my $addr = do { no overloading; pack 'J', $self; }; @@ -8051,14 +8100,25 @@ END if ($count) { # The format differs if no code points, and needs no # explanation in that case + if ($leader->write_as_invlist) { $comment.= <<END; -The format of the lines of this file is: +The first data line of this file begins with the letter V to indicate it is in +inversion list format. The number following the V gives the number of lines +remaining. Each of those remaining lines is a single number representing the +starting code point of a range which goes up to but not including the number +on the next line; The 0th, 2nd, 4th... ranges are for code points that match +the property; the 1st, 3rd, 5th... are ranges of code points that don't match +the property. The final line's range extends to the platform's infinity. END - $comment.= <<END; + } + else { + $comment.= <<END; +The format of the lines of this file is: START\\tSTOP\\twhere START is the starting code point of the range, in hex; STOP is the ending point, or if omitted, the range has just one code point. END + } if ($leader->output_range_counts) { $comment .= <<END; Numbers in comments in [brackets] indicate how many code points are in the @@ -7093,6 +7093,12 @@ PERL_CALLCONV void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, S PERL_CALLCONV SV* Perl__new_invlist(pTHX_ IV initial_size) __attribute__warn_unused_result__; +PERL_CALLCONV SV* Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST \ + assert(other_elements_ptr) + PERL_CALLCONV SV* Perl__swash_to_invlist(pTHX_ SV* const swash) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); @@ -8575,6 +8575,34 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -4140,6 +4140,33 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) loc = (char *) l; lend = l + lcur; + if (*l == 'V') { /* Inversion list format */ + char *after_strtol = (char *) lend; + UV element0; + UV* other_elements_ptr; + + /* The first number is a count of the rest */ + l++; + elements = Strtoul((char *)l, &after_strtol, 10); + l = (U8 *) after_strtol; + + /* Get the 0th element, which is needed to setup the inversion list */ + element0 = (UV) Strtoul((char *)l, &after_strtol, 10); + l = (U8 *) after_strtol; + invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr); + elements--; + + /* Then just populate the rest of the input */ + while (elements-- > 0) { + if (l > lend) { + Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements); + } + *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10); + l = (U8 *) after_strtol; + } + } + else { + /* Scan the input to count the number of lines to preallocate array size * based on worst possible case, which is each line in the input creates 2 * elements in the inversion list: 1) the beginning of a range in the list; @@ -4173,6 +4200,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) invlist = _add_range_to_invlist(invlist, start, end); } + } /* Invert if the data says it should be */ if (invert_it_svp && SvUV(*invert_it_svp)) { |