summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--lib/Unicode/UCD.pm26
-rw-r--r--lib/Unicode/UCD.t60
-rw-r--r--lib/unicore/mktables78
-rw-r--r--proto.h6
-rw-r--r--regcomp.c28
-rw-r--r--utf8.c28
8 files changed, 195 insertions, 33 deletions
diff --git a/embed.fnc b/embed.fnc
index 42ab356823..9c4607b5cd 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1480,6 +1480,7 @@ EXpM |void |_invlist_invert_prop|NN SV* const invlist
EXMpR |SV* |_new_invlist |IV initial_size
EXMpR |SV* |_swash_to_invlist |NN SV* const swash
EXMpR |SV* |_add_range_to_invlist |NULLOK SV* invlist|const UV start|const UV end
+EXMpR |SV* |_setup_canned_invlist|const STRLEN size|const UV element0|NN UV** other_elements_ptr
EXMp |void |_invlist_populate_swatch |NN SV* const invlist|const UV start|const UV end|NN U8* swatch
#endif
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_TOKE_C)
diff --git a/embed.h b/embed.h
index 0f8e82c502..c7e9579c0d 100644
--- a/embed.h
+++ b/embed.h
@@ -990,6 +990,7 @@
#define _invlist_populate_swatch(a,b,c,d) Perl__invlist_populate_swatch(aTHX_ a,b,c,d)
#define _invlist_union_maybe_complement_2nd(a,b,c,d) Perl__invlist_union_maybe_complement_2nd(aTHX_ a,b,c,d)
#define _new_invlist(a) Perl__new_invlist(aTHX_ a)
+#define _setup_canned_invlist(a,b,c) Perl__setup_canned_invlist(aTHX_ a,b,c)
#define _swash_to_invlist(a) Perl__swash_to_invlist(aTHX_ a)
# endif
# if defined(PERL_IN_REGEXEC_C)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index e4ae34e270..106fe7e678 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -2225,6 +2225,15 @@ sub prop_invlist ($;$) {
my @invlist;
+ if ($swash->{'LIST'} =~ /^V/) {
+
+ # A 'V' as the first character marks the input as already an inversion
+ # list, in which case, all we need to do is put the remaining lines
+ # into our array.
+ @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
+ shift @invlist;
+ }
+ else {
# The input lines look like:
# 0041\t005A # [26]
# 005F
@@ -2259,6 +2268,7 @@ sub prop_invlist ($;$) {
push @invlist, $begin + 1;
}
}
+ }
# Could need to be inverted: add or subtract a 0 at the beginning of the
# list.
@@ -3173,6 +3183,21 @@ RETRY:
my $requires_adjustment = $format =~ /^a/;
+ if ($swash->{'LIST'} =~ /^V/) {
+ @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
+ shift @invlist;
+ foreach my $i (0 .. @invlist - 1) {
+ $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
+ }
+
+ # The map includes lines for all code points; add one for the range
+ # from 0 to the first Y.
+ if ($invlist[0] != 0) {
+ unshift @invlist, 0;
+ unshift @invmap, 'N';
+ }
+ }
+ else {
# The LIST input lines look like:
# ...
# 0374\t\tCommon
@@ -3329,6 +3354,7 @@ RETRY:
push @invmap, $missing;
}
}
+ }
# If the property is empty, make all code points use the value for missing
# ones.
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b2caf8934c..0d709b1c15 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1067,30 +1067,19 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
}
# Now construct a string from the list that should match the file.
- # The file gives ranges of code points with starting and ending values
- # in hex, like this:
- # 41\t5A
- # 61\t7A
- # AA
- # Our list has even numbered elements start ranges that are in the
- # list, and odd ones that aren't in the list. Therefore the odd
- # numbered ones are one beyond the end of the previous range, but
- # otherwise don't get reflected in the file.
- my $tested = "";
- my $i = 0;
- for (; $i < @tested; $i += 2) {
- my $start = $tested[$i];
- my $end = ($i + 1 < @tested)
- ? $tested[$i+1] - 1
- : $Unicode::UCD::MAX_CP;
- if ($start == $end) {
- $tested .= sprintf("%X\n", $start);
- }
- else {
- $tested .= sprintf "%X\t%X\n", $start, $end;
- }
- }
-
+ # The file is inversion list format code points, like this:
+ # V1216
+ # 65 # [26]
+ # 91
+ # 192 # [23]
+ # ...
+ # The V indicates it's an inversion list, and is followed immediately
+ # by the number of elements (lines) that follow giving its contents.
+ # The list has even numbered elements (0th, 2nd, ...) start ranges
+ # that are in the list, and odd ones that aren't in the list.
+ # Therefore the odd numbered ones are one beyond the end of the
+ # previous range, but otherwise don't get reflected in the file.
+ my $tested = join "\n", ("V" . scalar @tested), @tested;
local $/ = "\n";
chomp $tested;
$/ = $input_record_separator;
@@ -1665,6 +1654,11 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
# appends the next line to the running string.
my $tested_map = "";
+ # For use with files for binary properties only, which are stored in
+ # inversion list format. This counts the number of data lines in the
+ # file.
+ my $binary_count = 0;
+
# Create a copy of the file's specials hash. (It has been undef'd if
# we know it isn't relevant to this property, so if it exists, it's an
# error or is relevant). As we go along, we delete from that copy.
@@ -1870,6 +1864,20 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
my $end = (defined $invlist_ref->[$i+1])
? $invlist_ref->[$i+1] - 1
: $Unicode::UCD::MAX_CP;
+ if ($is_binary) {
+
+ # Files for binary properties are in inversion list format,
+ # without ranges.
+ $tested_map .= "$start\n";
+ $binary_count++;
+
+ # If the final value is infinity, no line for it exists.
+ if ($end < $Unicode::UCD::MAX_CP) {
+ $tested_map .= ($end + 1) . "\n";
+ $binary_count++;
+ }
+ }
+ else {
$end = ($start == $end) ? "" : sprintf($file_range_format, $end);
if ($invmap_ref->[$i] ne "") {
$tested_map .= sprintf "$file_range_format\t%s\t%s\n",
@@ -1881,8 +1889,12 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
else {
$tested_map .= sprintf "$file_range_format\n", $start;
}
+ }
} # End of looping over all elements.
+ # Binary property files begin with a line count line.
+ $tested_map = "V$binary_count\n$tested_map" if $binary_count;
+
# Here are done with generating what the file should look like
local $/ = "\n";
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index b94433fe2c..4a58886eee 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -5019,6 +5019,11 @@ sub trace { return main::trace(@_); }
# The constructor can override the global flag of the same name.
main::set_access('output_range_counts', \%output_range_counts, 'r');
+ my %write_as_invlist;
+ # A boolean set iff the output file for this table is to be in the form of
+ # an inversion list/map.
+ main::set_access('write_as_invlist', \%write_as_invlist, 'r');
+
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
@@ -5055,6 +5060,7 @@ sub trace { return main::trace(@_); }
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
$caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
$fate{$addr} = delete $args{'Fate'} || $ORDINARY;
+ $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
@@ -5572,6 +5578,7 @@ END
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $self; };
+ my $write_as_invlist = $write_as_invlist{$addr};
# Start with the header
my @HEADER = $self->header;
@@ -5624,16 +5631,23 @@ END
else {
my $range_size_1 = $range_size_1{$addr};
+ # To make it more readable, use a minimum indentation
+ my $comment_indent;
+
# These are used only in $annotate option
my $format; # e.g. $HEX_ADJUST_FORMAT
my $include_name; # ? Include the character's name in the
# annotation?
my $include_cp; # ? Include its code point
- # To make it more readable, use a minimum indentation
- my $comment_indent = 16;
-
- if ($annotate) {
+ if (! $annotate) {
+ $comment_indent = ($self->isa('Map_Table'))
+ ? 24
+ : ($write_as_invlist)
+ ? 8
+ : 16;
+ }
+ else {
$format = $self->format;
# The name of the character is output only for tables that
@@ -5651,7 +5665,11 @@ END
# the first column
$include_cp = ! $range_size_1;
- if ($self->isa('Map_Table')) {
+ if (! $self->isa('Map_Table')) {
+ $comment_indent = ($write_as_invlist) ? 8 : 16;
+ }
+ else {
+ $comment_indent = 16;
# There are just a few short ranges in this table, so no
# need to include the code point in the annotation.
@@ -5879,8 +5897,26 @@ END
$previous_value = $value;
}
- # If there is a range
- if ($start != $end) {
+ if ($write_as_invlist) {
+
+ # Inversion list format has a single number per line,
+ # the starting code point of a range that matches the
+ # property
+ push @OUT, $start, "\n";
+ $invlist_count++;
+
+ # Add a comment with the size of the range, if
+ # requested.
+ if ($output_range_counts{$addr}) {
+ $OUT[-1] = merge_single_annotation_line(
+ $OUT[-1],
+ "# ["
+ . main::clarify_code_point_count($end - $start + 1)
+ . "]\n",
+ $comment_indent);
+ }
+ }
+ elsif ($start != $end) { # If there is a range
if ($end == $MAX_WORKING_CODEPOINT) {
push @OUT, sprintf "$hex_format\t$hex_format",
$start,
@@ -6116,6 +6152,15 @@ END
}
}
+ # Add the beginning of the range that doesn't match the
+ # property, except if the just added match range extends
+ # to infinity. We do this after any annotations for the
+ # match range.
+ if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
+ push @OUT, $end + 1, "\n";
+ $invlist_count++;
+ }
+
# If we split the range, set up so the next time through
# we get the remainder, and redo.
if ($next_start) {
@@ -6129,6 +6174,8 @@ END
} # End of loop through all the table's ranges
push @OUT, @annotation; # Add orphaned annotation, if any
+
+ splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
}
# Add anything that goes after the main body, but within the here
@@ -6408,6 +6455,7 @@ sub trace { return main::trace(@_); }
Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
+ Write_As_Invlist => 0,
%args);
my $addr = do { no overloading; pack 'J', $self; };
@@ -7330,6 +7378,7 @@ sub trace { return main::trace(@_); }
_Property => $property,
_Range_List => $range_list,
Format => $EMPTY_FORMAT,
+ Write_As_Invlist => 1,
);
my $addr = do { no overloading; pack 'J', $self; };
@@ -8051,14 +8100,25 @@ END
if ($count) { # The format differs if no code points, and needs no
# explanation in that case
+ if ($leader->write_as_invlist) {
$comment.= <<END;
-The format of the lines of this file is:
+The first data line of this file begins with the letter V to indicate it is in
+inversion list format. The number following the V gives the number of lines
+remaining. Each of those remaining lines is a single number representing the
+starting code point of a range which goes up to but not including the number
+on the next line; The 0th, 2nd, 4th... ranges are for code points that match
+the property; the 1st, 3rd, 5th... are ranges of code points that don't match
+the property. The final line's range extends to the platform's infinity.
END
- $comment.= <<END;
+ }
+ else {
+ $comment.= <<END;
+The format of the lines of this file is:
START\\tSTOP\\twhere START is the starting code point of the range, in hex;
STOP is the ending point, or if omitted, the range has just one code point.
END
+ }
if ($leader->output_range_counts) {
$comment .= <<END;
Numbers in comments in [brackets] indicate how many code points are in the
diff --git a/proto.h b/proto.h
index 32dfa1a7df..086642abd4 100644
--- a/proto.h
+++ b/proto.h
@@ -7093,6 +7093,12 @@ PERL_CALLCONV void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, S
PERL_CALLCONV SV* Perl__new_invlist(pTHX_ IV initial_size)
__attribute__warn_unused_result__;
+PERL_CALLCONV SV* Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST \
+ assert(other_elements_ptr)
+
PERL_CALLCONV SV* Perl__swash_to_invlist(pTHX_ SV* const swash)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
diff --git a/regcomp.c b/regcomp.c
index 128ac01f34..0e86710dcd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -8575,6 +8575,34 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
return invlist;
}
+SV*
+Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, UV** other_elements_ptr)
+{
+ /* Create and return an inversion list whose contents are to be populated
+ * by the caller. The caller gives the number of elements (in 'size') and
+ * the very first element ('element0'). This function will set
+ * '*other_elements_ptr' to an array of UVs, where the remaining elements
+ * are to be placed.
+ *
+ * Obviously there is some trust involved that the caller will properly
+ * fill in the other elements of the array.
+ *
+ * (The first element needs to be passed in, as the underlying code does
+ * things differently depending on whether it is zero or non-zero) */
+
+ SV* invlist = _new_invlist(size);
+ bool offset;
+
+ PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
+
+ _append_range_to_invlist(invlist, element0, element0);
+ offset = *get_invlist_offset_addr(invlist);
+
+ invlist_set_len(invlist, size, offset);
+ *other_elements_ptr = invlist_array(invlist) + 1;
+ return invlist;
+}
+
#endif
PERL_STATIC_INLINE SV*
diff --git a/utf8.c b/utf8.c
index e584aaa537..818efb11e6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -4140,6 +4140,33 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
loc = (char *) l;
lend = l + lcur;
+ if (*l == 'V') { /* Inversion list format */
+ char *after_strtol = (char *) lend;
+ UV element0;
+ UV* other_elements_ptr;
+
+ /* The first number is a count of the rest */
+ l++;
+ elements = Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+
+ /* Get the 0th element, which is needed to setup the inversion list */
+ element0 = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
+ elements--;
+
+ /* Then just populate the rest of the input */
+ while (elements-- > 0) {
+ if (l > lend) {
+ Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
+ }
+ *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ }
+ }
+ else {
+
/* Scan the input to count the number of lines to preallocate array size
* based on worst possible case, which is each line in the input creates 2
* elements in the inversion list: 1) the beginning of a range in the list;
@@ -4173,6 +4200,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
invlist = _add_range_to_invlist(invlist, start, end);
}
+ }
/* Invert if the data says it should be */
if (invert_it_svp && SvUV(*invert_it_svp)) {