summaryrefslogtreecommitdiff
path: root/regen
diff options
context:
space:
mode:
Diffstat (limited to 'regen')
-rw-r--r--regen/mk_invlists.pl70
1 files changed, 69 insertions, 1 deletions
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index 5b69708b46..46114bcfb1 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -71,12 +71,80 @@ sub output_invlist ($$) {
output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);
+# We construct lists for all the POSIX and backslash sequence character
+# classes in two forms:
+# 1) ones which match only in the ASCII range
+# 2) ones which match either in the Latin1 range, or the entire Unicode range
+#
+# These get compiled in, and hence affect the memory footprint of every Perl
+# program, even those not using Unicode. To minimize the size, currently
+# the Latin1 version is generated for the beyond ASCII range except for those
+# lists that are quite small for the entire range, such as for \s, which is 22
+# UVs long plus 4 UVs (currently) for the header.
+#
+# To save even more memory, the ASCII versions could be derived from the
+# larger ones at runtime (at the expense of the machine instructions needed
+# to do so), but these are all small anyway: their total is only about 100
+# UVs.
+#
+# In the list of properties below that get generated, the L1 prefix is a fake
+# property that means just the Latin1 range of the full property (whose name
+# has an X prefix instead of L1).
+
for my $prop (qw(
    ASCII
+    HorizSpace
+    VertSpace
+    PerlSpace
+    XPerlSpace
+    PosixAlnum
+    L1PosixAlnum
+    PosixAlpha
+    L1PosixAlpha
+    PosixBlank
+    XPosixBlank
+    PosixCntrl
+    XPosixCntrl
+    PosixDigit
+    PosixGraph
+    L1PosixGraph
+    PosixLower
+    L1PosixLower
+    PosixPrint
+    L1PosixPrint
+    PosixPunct
+    L1PosixPunct
+    PosixSpace
+    XPosixSpace
+    PosixUpper
+    L1PosixUpper
+    PosixWord
+    L1PosixWord
+    PosixXDigit
+    XPosixXDigit
    )
) {
-    my @invlist = prop_invlist($prop);
+    # For the Latin1 properties, we change to use the eXtended version of the
+    # base property, then go through the result and get rid of everything not
+    # in Latin1 (above 255).  Actually, we retain the element that ends a
+    # matching range crossing the 255/256 boundary.  For example, in the Word
+    # property, there is a range of code points that starts at U+00F8 and goes
+    # through U+02C1.  Instead of artificially cutting that off at 256 because
+    # 256 is the first code point above Latin1, we let the range go to its
+    # natural ending.  That gives us extra information with no added space
+    # taken.  But if the first element above 255 instead begins a matching
+    # range, we drop it too: a list ending on a range start would read as
+    # matching everything from there to infinity.  For example, Upper doesn't
+    # match U+00FF but does match U+0100; U+0100 must not end the list.
+    my $lookup_prop = $prop =~ s/^L1/X/r;
+    my @invlist = prop_invlist($lookup_prop);
+
+    if ($lookup_prop ne $prop) {
+        for my $i (0 .. $#invlist) {
+            next unless $invlist[$i] > 255;
+
+            # In an inversion list, even-indexed elements begin ranges that
+            # match the property and odd-indexed elements begin ranges that
+            # don't.  An odd $i therefore closes a matching range that
+            # crosses the boundary, so keep it and truncate after it; an
+            # even $i would open a new range above Latin1, so truncate
+            # before it.
+            splice @invlist, ($i % 2 ? $i + 1 : $i);
+            last;
+        }
+    }
+
    output_invlist($prop, \@invlist);
}