summaryrefslogtreecommitdiff
path: root/lib/unicore
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-05-15 22:11:16 -0600
committerKarl Williamson <public@khwilliamson.com>2012-06-02 08:29:23 -0600
commitc157f5d2977bb9a463059e5684f4630a38194592 (patch)
treeb97df616828a7e48c1c7f5ae53cf5b54d19c856e /lib/unicore
parentaff65f9f696b8f05e42888af5095261c302f1d8b (diff)
downloadperl-c157f5d2977bb9a463059e5684f4630a38194592.tar.gz
mktables: Improve \p{xids} defn for early Unicodes
Diffstat (limited to 'lib/unicore')
-rw-r--r--lib/unicore/mktables48
1 files changed, 41 insertions, 7 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index cc1de6fe1e..b993759187 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -13144,14 +13144,48 @@ sub compile_perl() {
}
else {
# For Unicode versions that don't have the property, construct our own
- # from first principles. The actual definition is: Letters + letter
- # numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code
- # points, plus stability extensions. PatSyn and PatWS are not defined
- # in releases that don't have XIDS defined, so are irrelevant.
- $perl_xids += $gc->table('Letter');
- my $nl = $gc->table('Letter_Number');
- $perl_xids += $nl if defined $nl;
+ # from first principles. The actual definition is:
+ # Letters
+ # + letter numbers (Nl)
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NFKC modifications
+ #
+ # What we do in the code below is to include the identical code points
+ # that are in the first release that had Unicode's version of this
+ # property, essentially extrapolating backwards. There were no
+ # stability extensions until v4.1, so none are included; likewise in
+ # no Unicode version so far does subtracting PatSyn and PatWS make any
+ # difference, so those also are ignored.
+ $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
+
+ # We do subtract the NFKC modifications that are in the first version
+ # that had this property. We don't bother to test if they are in the
+ # version in question, because if they aren't, the operation is a
+ # no-op. The NFKC modifications are discussed in
+ # http://www.unicode.org/reports/tr31/#NFKC_Modifications
+ foreach my $range ( 0x037A,
+ 0x0E33,
+ 0x0EB3,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE70 ],
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ [ 0xFF9E, 0xFF9F ],
+ ) {
+ if (ref $range) {
+ $perl_xids->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xids->delete_range($range, $range);
+ }
+ }
}
+
$perl_xids &= $Word;
my $gcb = property_ref('Grapheme_Cluster_Break');