diff options
author | Karl Williamson <khw@cpan.org> | 2015-07-28 22:17:01 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-07-28 22:17:01 -0600 |
commit | a0b05c4bde4f97301b663b8de70677551eacff4c (patch) | |
tree | e9329726efb7aa27f5fd08f1441943f95b68a594 | |
parent | 16d89be8495ff4e0a9b99e837b5444df8a4a6cc6 (diff) | |
parent | 912fd71114648888551a4750534463f20ae16c7c (diff) | |
download | perl-a0b05c4bde4f97301b663b8de70677551eacff4c.tar.gz |
Allow perl to work again with all Unicode releases
Compiling perl to work with early Unicode releases is a goal, but has
been broken since 5.18. This sequence of commits gets this working
again, actually better than before.
-rw-r--r-- | charclass_invlists.h | 3814 | ||||
-rw-r--r-- | ext/XS-APItest/t/handy.t | 17 | ||||
-rw-r--r-- | lib/Unicode/UCD.pm | 57 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 668 | ||||
-rw-r--r-- | lib/locale.t | 3 | ||||
-rw-r--r-- | lib/unicore/README.perl | 46 | ||||
-rw-r--r-- | lib/unicore/mktables | 2432 | ||||
-rw-r--r-- | lib/utf8_heavy.pl | 96 | ||||
-rw-r--r-- | perl.c | 6 | ||||
-rw-r--r-- | perl.h | 9 | ||||
-rw-r--r-- | pod/perldelta.pod | 5 | ||||
-rw-r--r-- | pp.c | 10 | ||||
-rw-r--r-- | regcharclass.h | 32 | ||||
-rw-r--r-- | regcomp.c | 65 | ||||
-rw-r--r-- | regen/mk_PL_charclass.pl | 33 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 62 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 8 | ||||
-rw-r--r-- | regen/regcharclass_multi_char_folds.pl | 2 | ||||
-rw-r--r-- | regen/unicode_constants.pl | 89 | ||||
-rw-r--r-- | regexec.c | 8 | ||||
-rw-r--r-- | t/uni/variables.t | 2 | ||||
-rw-r--r-- | unicode_constants.h | 21 | ||||
-rw-r--r-- | utf8.c | 104 | ||||
-rw-r--r-- | utf8.h | 6 |
24 files changed, 4282 insertions, 3313 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 076d223724..64406bfe78 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -312,7 +312,387 @@ static const UV Cased_invlist[] = { /* for ASCII/Latin1 */ 0x1F18A }; -static const UV Grapheme_Cluster_Break_invlist[] = { /* for ASCII/Latin1 */ +#endif /* defined(PERL_IN_PERL_C) */ + +#if defined(PERL_IN_REGCOMP_C) + +static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for ASCII/Latin1 */ + 45, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x2BC, + 0x2BD, + 0x308, + 0x309, + 0x313, + 0x314, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x574, + 0x575, + 0x57E, + 0x57F, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +static const UV _Perl_Any_Folds_invlist[] = { /* for ASCII/Latin1 */ + 247, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x41, + 0x5B, + 0x61, + 0x7B, + 0xB5, + 0xB6, + 0xC0, + 0xD7, + 0xD8, + 0xF7, + 0xF8, + 0x131, + 0x132, + 0x138, + 0x139, + 0x18D, + 0x18E, + 0x19B, + 0x19C, + 0x1AA, + 0x1AC, + 0x1BA, + 0x1BC, + 0x1BE, + 0x1BF, + 0x1C0, + 0x1C4, + 0x221, + 0x222, + 0x234, + 0x23A, + 0x255, + 0x256, + 0x258, + 0x259, + 0x25A, + 0x25B, + 0x25D, + 0x260, + 0x262, + 0x263, + 0x264, + 0x265, + 0x267, + 0x268, + 0x26A, + 0x26B, + 0x26D, + 0x26F, + 0x270, + 0x271, + 0x273, + 0x275, + 0x276, + 0x27D, + 0x27E, + 0x280, + 0x281, + 0x283, + 0x284, + 0x287, + 0x28D, + 0x292, + 0x293, + 0x29D, + 0x29F, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x345, + 0x346, + 0x370, + 0x374, + 0x376, + 0x378, + 0x37B, + 0x37E, + 0x37F, + 0x380, + 0x386, + 0x387, + 0x388, + 0x38B, + 0x38C, + 0x38D, + 0x38E, + 0x3A2, + 0x3A3, + 0x3D2, + 0x3D5, + 0x3F6, + 0x3F7, + 0x3FC, + 0x3FD, + 0x482, + 0x48A, + 0x530, + 0x531, + 0x557, + 0x561, + 0x588, + 0x10A0, + 0x10C6, + 0x10C7, + 0x10C8, + 0x10CD, + 0x10CE, + 0x13A0, + 0x13F6, + 0x13F8, + 0x13FE, + 0x1D79, + 0x1D7A, + 0x1D7D, + 0x1D7E, + 0x1E00, + 0x1E9C, + 0x1E9E, + 0x1E9F, + 0x1EA0, + 0x1F16, + 0x1F18, + 0x1F1E, + 0x1F20, + 0x1F46, + 0x1F48, + 0x1F4E, + 0x1F50, + 0x1F58, + 0x1F59, + 0x1F5A, + 0x1F5B, + 0x1F5C, + 0x1F5D, + 0x1F5E, + 0x1F5F, + 0x1F7E, + 0x1F80, + 0x1FB5, + 0x1FB6, + 0x1FBD, + 0x1FBE, + 0x1FBF, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FCD, + 0x1FD0, + 0x1FD4, + 0x1FD6, + 0x1FDC, + 0x1FE0, + 0x1FED, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FFD, + 0x2126, + 0x2127, + 0x212A, + 0x212C, + 0x2132, + 0x2133, + 0x214E, + 0x214F, + 0x2160, + 0x2180, + 0x2183, + 0x2185, + 0x24B6, + 0x24EA, + 0x2C00, + 0x2C2F, + 0x2C30, + 0x2C5F, + 0x2C60, + 0x2C71, + 0x2C72, + 0x2C74, + 0x2C75, + 0x2C77, + 0x2C7E, + 0x2CE4, + 0x2CEB, + 0x2CEF, + 0x2CF2, + 0x2CF4, + 0x2D00, + 0x2D26, + 0x2D27, + 0x2D28, + 0x2D2D, + 0x2D2E, + 0xA640, + 0xA66E, + 0xA680, + 0xA69C, + 0xA722, + 0xA730, + 0xA732, + 0xA770, + 0xA779, + 0xA788, + 0xA78B, + 0xA78E, + 0xA790, + 0xA794, + 0xA796, + 0xA7AE, + 0xA7B0, + 0xA7B8, + 0xAB53, + 0xAB54, + 0xAB70, + 0xABC0, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18, + 0xFF21, + 0xFF3B, + 0xFF41, + 0xFF5B, + 0x10400, + 0x10450, + 0x10C80, + 0x10CB3, + 0x10CC0, + 0x10CF3, + 0x118A0, + 0x118E0 +}; + +static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for ASCII/Latin1 */ + 59, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0xDF, + 0xE0, + 0x130, + 0x131, + 0x149, + 0x14A, + 0x1F0, + 0x1F1, + 0x390, + 0x391, + 0x3B0, + 0x3B1, + 0x587, + 0x588, + 0x1E96, + 0x1E9B, + 0x1E9E, + 0x1E9F, + 0x1F50, + 0x1F51, + 0x1F52, + 0x1F53, + 0x1F54, + 0x1F55, + 0x1F56, + 0x1F57, + 0x1F80, + 0x1FB0, + 0x1FB2, + 0x1FB5, + 0x1FB6, + 0x1FB8, + 0x1FBC, + 0x1FBD, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FC8, + 0x1FCC, + 0x1FCD, + 0x1FD2, + 0x1FD4, + 0x1FD6, + 0x1FD8, + 0x1FE2, + 0x1FE5, + 0x1FE6, + 0x1FE8, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FF8, + 0x1FFC, + 0x1FFD, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + +#if defined(PERL_IN_PERL_C) + +static const UV _Perl_GCB_invlist[] = { /* for ASCII/Latin1 */ 1502, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -1844,7 +2224,7 @@ typedef enum { GCB_V = 13 } GCB_enum; -static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for ASCII/Latin1 */ +static const GCB_enum _Perl_GCB_invmap[] = { /* for ASCII/Latin1 */ GCB_Control, GCB_LF, GCB_Control, @@ -3351,382 +3731,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for ASCII/Latin1 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for ASCII/Latin1 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -static const UV _Perl_Any_Folds_invlist[] = { /* for ASCII/Latin1 */ - 247, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x41, - 0x5B, - 0x61, - 0x7B, - 0xB5, - 0xB6, - 0xC0, - 0xD7, - 0xD8, - 0xF7, - 0xF8, - 0x131, - 0x132, - 0x138, - 0x139, - 0x18D, - 0x18E, - 0x19B, - 0x19C, - 0x1AA, - 0x1AC, - 0x1BA, - 0x1BC, - 0x1BE, - 0x1BF, - 0x1C0, - 0x1C4, - 0x221, - 0x222, - 0x234, - 0x23A, - 0x255, - 0x256, - 0x258, - 0x259, - 0x25A, - 0x25B, - 0x25D, - 0x260, - 0x262, - 0x263, - 0x264, - 0x265, - 0x267, - 0x268, - 0x26A, - 0x26B, - 0x26D, - 0x26F, - 0x270, - 0x271, - 0x273, - 0x275, - 0x276, - 0x27D, - 0x27E, - 0x280, - 0x281, - 0x283, - 0x284, - 0x287, - 0x28D, - 0x292, - 0x293, - 0x29D, - 0x29F, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x345, - 0x346, - 0x370, - 0x374, - 0x376, - 0x378, - 0x37B, - 0x37E, - 0x37F, - 0x380, - 0x386, - 0x387, - 0x388, - 0x38B, - 0x38C, - 0x38D, - 0x38E, - 0x3A2, - 0x3A3, - 0x3D2, - 0x3D5, - 0x3F6, - 0x3F7, - 0x3FC, - 0x3FD, - 0x482, - 0x48A, - 0x530, - 0x531, - 0x557, - 0x561, - 0x588, - 0x10A0, - 0x10C6, - 0x10C7, - 0x10C8, - 0x10CD, - 0x10CE, - 0x13A0, - 0x13F6, - 0x13F8, - 0x13FE, - 0x1D79, - 0x1D7A, - 0x1D7D, - 0x1D7E, - 0x1E00, - 0x1E9C, - 0x1E9E, - 0x1E9F, - 0x1EA0, - 0x1F16, - 0x1F18, - 0x1F1E, - 0x1F20, - 0x1F46, - 0x1F48, - 0x1F4E, - 0x1F50, - 0x1F58, - 0x1F59, - 0x1F5A, - 0x1F5B, - 0x1F5C, - 0x1F5D, - 0x1F5E, - 0x1F5F, - 0x1F7E, - 0x1F80, - 0x1FB5, - 0x1FB6, - 0x1FBD, - 0x1FBE, - 0x1FBF, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FCD, - 0x1FD0, - 0x1FD4, - 0x1FD6, - 0x1FDC, - 0x1FE0, - 0x1FED, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FFD, - 0x2126, - 0x2127, - 0x212A, - 0x212C, - 0x2132, - 0x2133, - 0x214E, - 0x214F, - 0x2160, - 0x2180, - 0x2183, - 0x2185, - 0x24B6, - 0x24EA, - 0x2C00, - 0x2C2F, - 0x2C30, - 0x2C5F, - 0x2C60, - 0x2C71, - 0x2C72, - 0x2C74, - 0x2C75, - 0x2C77, - 0x2C7E, - 0x2CE4, - 0x2CEB, - 0x2CEF, - 0x2CF2, - 0x2CF4, - 0x2D00, - 0x2D26, - 0x2D27, - 0x2D28, - 0x2D2D, - 0x2D2E, - 0xA640, - 0xA66E, - 0xA680, - 0xA69C, - 0xA722, - 0xA730, - 0xA732, - 0xA770, - 0xA779, - 0xA788, - 0xA78B, - 0xA78E, - 0xA790, - 0xA794, - 0xA796, - 0xA7AE, - 0xA7B0, - 0xA7B8, - 0xAB53, - 0xAB54, - 0xAB70, - 0xABC0, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18, - 0xFF21, - 0xFF3B, - 0xFF41, - 0xFF5B, - 0x10400, - 0x10450, - 0x10C80, - 0x10CB3, - 0x10CC0, - 0x10CF3, - 0x118A0, - 0x118E0 -}; - -static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for ASCII/Latin1 */ - 59, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0xDF, - 0xE0, - 0x130, - 0x131, - 0x149, - 0x14A, - 0x1F0, - 0x1F1, - 0x390, - 0x391, - 0x3B0, - 0x3B1, - 0x587, - 0x588, - 0x1E96, - 0x1E9B, - 0x1E9E, - 0x1E9F, - 0x1F50, - 0x1F51, - 0x1F52, - 0x1F53, - 0x1F54, - 0x1F55, - 0x1F56, - 0x1F57, - 0x1F80, - 0x1FB0, - 0x1FB2, - 0x1FB5, - 0x1FB6, - 0x1FB8, - 0x1FBC, - 0x1FBD, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FC8, - 0x1FCC, - 0x1FCD, - 0x1FD2, - 0x1FD4, - 0x1FD6, - 0x1FD8, - 0x1FE2, - 0x1FE5, - 0x1FE6, - 0x1FE8, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FF8, - 0x1FFC, - 0x1FFD, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_UTF8_C) static const UV _Perl_IDCont_invlist[] = { /* for ASCII/Latin1 */ @@ -6193,7 +6197,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for ASCII/Latin1 */ #if defined(PERL_IN_PERL_C) -static const UV Sentence_Break_invlist[] = { /* for ASCII/Latin1 */ +static const UV _Perl_SB_invlist[] = { /* for ASCII/Latin1 */ 2896, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -9121,7 +9125,7 @@ typedef enum { SB_Upper = 15 } SB_enum; -static const SB_enum Sentence_Break_invmap[] = { /* for ASCII/Latin1 */ +static const SB_enum _Perl_SB_invmap[] = { /* for ASCII/Latin1 */ SB_Other, SB_Sp, SB_LF, @@ -12022,37 +12026,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for ASCII/Latin1 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV UpperLatin1_invlist[] = { /* for ASCII/Latin1 */ - 3, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x80, - 0x100 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_PERL_C) -static const UV VertSpace_invlist[] = { /* for ASCII/Latin1 */ - 7, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0xA, - 0xE, - 0x85, - 0x86, - 0x2028, - 0x202A -}; - -static const UV Word_Break_invlist[] = { /* for ASCII/Latin1 */ +static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */ 1524, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -13611,7 +13587,7 @@ typedef enum { WB_UNKNOWN = 18 } WB_enum; -static const WB_enum Word_Break_invmap[] = { /* for ASCII/Latin1 */ +static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */ WB_Other, WB_LF, WB_Newline, @@ -15140,8 +15116,36 @@ static const WB_enum Word_Break_invmap[] = { /* for ASCII/Latin1 */ #endif /* defined(PERL_IN_REGEXEC_C) */ +#if defined(PERL_IN_REGCOMP_C) + +static const UV UpperLatin1_invlist[] = { /* for ASCII/Latin1 */ + 3, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x80, + 0x100 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + #if defined(PERL_IN_PERL_C) +static const UV VertSpace_invlist[] = { /* for ASCII/Latin1 */ + 7, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0xA, + 0xE, + 0x85, + 0x86, + 0x2028, + 0x202A +}; + static const UV XPerlSpace_invlist[] = { /* for ASCII/Latin1 */ 21, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -24947,7 +24951,419 @@ static const UV Cased_invlist[] = { /* for EBCDIC 1047 */ 0x1F18A }; -static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC 1047 */ +#endif /* defined(PERL_IN_PERL_C) */ + +#if defined(PERL_IN_REGCOMP_C) + +static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 1047 */ + 45, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x2BC, + 0x2BD, + 0x308, + 0x309, + 0x313, + 0x314, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x574, + 0x575, + 0x57E, + 0x57F, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 1047 */ + 279, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x42, + 0x4A, + 0x51, + 0x5A, + 0x62, + 0x6A, + 0x70, + 0x79, + 0x80, + 0x8A, + 0x8C, + 0x8F, + 0x91, + 0x9A, + 0x9C, + 0x9D, + 0x9E, + 0x9F, + 0xA0, + 0xA1, + 0xA2, + 0xAA, + 0xAC, + 0xAD, + 0xAE, + 0xAF, + 0xBA, + 0xBB, + 0xC1, + 0xCA, + 0xCB, + 0xD0, + 0xD1, + 0xDA, + 0xDB, + 0xE0, + 0xE2, + 0xEA, + 0xEB, + 0xF0, + 0xFB, + 0xFF, + 0x100, + 0x131, + 0x132, + 0x138, + 0x139, + 0x18D, + 0x18E, + 0x19B, + 0x19C, + 0x1AA, + 0x1AC, + 0x1BA, + 0x1BC, + 0x1BE, + 0x1BF, + 0x1C0, + 0x1C4, + 0x221, + 0x222, + 0x234, + 0x23A, + 0x255, + 0x256, + 0x258, + 0x259, + 0x25A, + 0x25B, + 0x25D, + 0x260, + 0x262, + 0x263, + 0x264, + 0x265, + 0x267, + 0x268, + 0x26A, + 0x26B, + 0x26D, + 0x26F, + 0x270, + 0x271, + 0x273, + 0x275, + 0x276, + 0x27D, + 0x27E, + 0x280, + 0x281, + 0x283, + 0x284, + 0x287, + 0x28D, + 0x292, + 0x293, + 0x29D, + 0x29F, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x345, + 0x346, + 0x370, + 0x374, + 0x376, + 0x378, + 0x37B, + 0x37E, + 0x37F, + 0x380, + 0x386, + 0x387, + 0x388, + 0x38B, + 0x38C, + 0x38D, + 0x38E, + 0x3A2, + 0x3A3, + 0x3D2, + 0x3D5, + 0x3F6, + 0x3F7, + 0x3FC, + 0x3FD, + 0x482, + 0x48A, + 0x530, + 0x531, + 0x557, + 0x561, + 0x588, + 0x10A0, + 0x10C6, + 0x10C7, + 0x10C8, + 0x10CD, + 0x10CE, + 0x13A0, + 0x13F6, + 0x13F8, + 0x13FE, + 0x1D79, + 0x1D7A, + 0x1D7D, + 0x1D7E, + 0x1E00, + 0x1E9C, + 0x1E9E, + 0x1E9F, + 0x1EA0, + 0x1F16, + 0x1F18, + 0x1F1E, + 0x1F20, + 0x1F46, + 0x1F48, + 0x1F4E, + 0x1F50, + 0x1F58, + 0x1F59, + 0x1F5A, + 0x1F5B, + 0x1F5C, + 0x1F5D, + 0x1F5E, + 0x1F5F, + 0x1F7E, + 0x1F80, + 0x1FB5, + 0x1FB6, + 0x1FBD, + 0x1FBE, + 0x1FBF, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FCD, + 0x1FD0, + 0x1FD4, + 0x1FD6, + 0x1FDC, + 0x1FE0, + 0x1FED, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FFD, + 0x2126, + 0x2127, + 0x212A, + 0x212C, + 0x2132, + 0x2133, + 0x214E, + 0x214F, + 0x2160, + 0x2180, + 0x2183, + 0x2185, + 0x24B6, + 0x24EA, + 0x2C00, + 0x2C2F, + 0x2C30, + 0x2C5F, + 0x2C60, + 0x2C71, + 0x2C72, + 0x2C74, + 0x2C75, + 0x2C77, + 0x2C7E, + 0x2CE4, + 0x2CEB, + 0x2CEF, + 0x2CF2, + 0x2CF4, + 0x2D00, + 0x2D26, + 0x2D27, + 0x2D28, + 0x2D2D, + 0x2D2E, + 0xA640, + 0xA66E, + 0xA680, + 0xA69C, + 0xA722, + 0xA730, + 0xA732, + 0xA770, + 0xA779, + 0xA788, + 0xA78B, + 0xA78E, + 0xA790, + 0xA794, + 0xA796, + 0xA7AE, + 0xA7B0, + 0xA7B8, + 0xAB53, + 0xAB54, + 0xAB70, + 0xABC0, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18, + 0xFF21, + 0xFF3B, + 0xFF41, + 0xFF5B, + 0x10400, + 0x10450, + 0x10C80, + 0x10CB3, + 0x10CC0, + 0x10CF3, + 0x118A0, + 0x118E0 +}; + +static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 1047 */ + 59, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x59, + 0x5A, + 0x130, + 0x131, + 0x149, + 0x14A, + 0x1F0, + 0x1F1, + 0x390, + 0x391, + 0x3B0, + 0x3B1, + 0x587, + 0x588, + 0x1E96, + 0x1E9B, + 0x1E9E, + 0x1E9F, + 0x1F50, + 0x1F51, + 0x1F52, + 0x1F53, + 0x1F54, + 0x1F55, + 0x1F56, + 0x1F57, + 0x1F80, + 0x1FB0, + 0x1FB2, + 0x1FB5, + 0x1FB6, + 0x1FB8, + 0x1FBC, + 0x1FBD, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FC8, + 0x1FCC, + 0x1FCD, + 0x1FD2, + 0x1FD4, + 0x1FD6, + 0x1FD8, + 0x1FE2, + 0x1FE5, + 0x1FE6, + 0x1FE8, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FF8, + 0x1FFC, + 0x1FFD, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + +#if defined(PERL_IN_PERL_C) + +static const UV _Perl_GCB_invlist[] = { /* for EBCDIC 1047 */ 1502, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -26479,7 +26895,7 @@ typedef enum { GCB_V = 13 } GCB_enum; -static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 1047 */ +static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 1047 */ GCB_Control, GCB_CR, GCB_Control, @@ -27986,414 +28402,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 1047 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 1047 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 1047 */ - 279, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x42, - 0x4A, - 0x51, - 0x5A, - 0x62, - 0x6A, - 0x70, - 0x79, - 0x80, - 0x8A, - 0x8C, - 0x8F, - 0x91, - 0x9A, - 0x9C, - 0x9D, - 0x9E, - 0x9F, - 0xA0, - 0xA1, - 0xA2, - 0xAA, - 0xAC, - 0xAD, - 0xAE, - 0xAF, - 0xBA, - 0xBB, - 0xC1, - 0xCA, - 0xCB, - 0xD0, - 0xD1, - 0xDA, - 0xDB, - 0xE0, - 0xE2, - 0xEA, - 0xEB, - 0xF0, - 0xFB, - 0xFF, - 0x100, - 0x131, - 0x132, - 0x138, - 0x139, - 0x18D, - 0x18E, - 0x19B, - 0x19C, - 0x1AA, - 0x1AC, - 0x1BA, - 0x1BC, - 0x1BE, - 0x1BF, - 0x1C0, - 0x1C4, - 0x221, - 0x222, - 0x234, - 0x23A, - 0x255, - 0x256, - 0x258, - 0x259, - 0x25A, - 0x25B, - 0x25D, - 0x260, - 0x262, - 0x263, - 0x264, - 0x265, - 0x267, - 0x268, - 0x26A, - 0x26B, - 0x26D, - 0x26F, - 0x270, - 0x271, - 0x273, - 0x275, - 0x276, - 0x27D, - 0x27E, - 0x280, - 0x281, - 0x283, - 0x284, - 0x287, - 0x28D, - 0x292, - 0x293, - 0x29D, - 0x29F, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x345, - 0x346, - 0x370, - 0x374, - 0x376, - 0x378, - 0x37B, - 0x37E, - 0x37F, - 0x380, - 0x386, - 0x387, - 0x388, - 0x38B, - 0x38C, - 0x38D, - 0x38E, - 0x3A2, - 0x3A3, - 0x3D2, - 0x3D5, - 0x3F6, - 0x3F7, - 0x3FC, - 0x3FD, - 0x482, - 0x48A, - 0x530, - 0x531, - 0x557, - 0x561, - 0x588, - 0x10A0, - 0x10C6, - 0x10C7, - 0x10C8, - 0x10CD, - 0x10CE, - 0x13A0, - 0x13F6, - 0x13F8, - 0x13FE, - 0x1D79, - 0x1D7A, - 0x1D7D, - 0x1D7E, - 0x1E00, - 0x1E9C, - 0x1E9E, - 0x1E9F, - 0x1EA0, - 0x1F16, - 0x1F18, - 0x1F1E, - 0x1F20, - 0x1F46, - 0x1F48, - 0x1F4E, - 0x1F50, - 0x1F58, - 0x1F59, - 0x1F5A, - 0x1F5B, - 0x1F5C, - 0x1F5D, - 0x1F5E, - 0x1F5F, - 0x1F7E, - 0x1F80, - 0x1FB5, - 0x1FB6, - 0x1FBD, - 0x1FBE, - 0x1FBF, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FCD, - 0x1FD0, - 0x1FD4, - 0x1FD6, - 0x1FDC, - 0x1FE0, - 0x1FED, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FFD, - 0x2126, - 0x2127, - 0x212A, - 0x212C, - 0x2132, - 0x2133, - 0x214E, - 0x214F, - 0x2160, - 0x2180, - 0x2183, - 0x2185, - 0x24B6, - 0x24EA, - 0x2C00, - 0x2C2F, - 0x2C30, - 0x2C5F, - 0x2C60, - 0x2C71, - 0x2C72, - 0x2C74, - 0x2C75, - 0x2C77, - 0x2C7E, - 0x2CE4, - 0x2CEB, - 0x2CEF, - 0x2CF2, - 0x2CF4, - 0x2D00, - 0x2D26, - 0x2D27, - 0x2D28, - 0x2D2D, - 0x2D2E, - 0xA640, - 0xA66E, - 0xA680, - 0xA69C, - 0xA722, - 0xA730, - 0xA732, - 0xA770, - 0xA779, - 0xA788, - 0xA78B, - 0xA78E, - 0xA790, - 0xA794, - 0xA796, - 0xA7AE, - 0xA7B0, - 0xA7B8, - 0xAB53, - 0xAB54, - 0xAB70, - 0xABC0, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18, - 0xFF21, - 0xFF3B, - 0xFF41, - 0xFF5B, - 0x10400, - 0x10450, - 0x10C80, - 0x10CB3, - 0x10CC0, - 0x10CF3, - 0x118A0, - 0x118E0 -}; - -static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 1047 */ - 59, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x59, - 0x5A, - 0x130, - 0x131, - 0x149, - 0x14A, - 0x1F0, - 0x1F1, - 0x390, - 0x391, - 0x3B0, - 0x3B1, - 0x587, - 0x588, - 0x1E96, - 0x1E9B, - 0x1E9E, - 0x1E9F, - 0x1F50, - 0x1F51, - 0x1F52, - 0x1F53, - 0x1F54, - 0x1F55, - 0x1F56, - 0x1F57, - 0x1F80, - 0x1FB0, - 0x1FB2, - 0x1FB5, - 0x1FB6, - 0x1FB8, - 0x1FBC, - 0x1FBD, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FC8, - 0x1FCC, - 0x1FCD, - 0x1FD2, - 0x1FD4, - 0x1FD6, - 0x1FD8, - 0x1FE2, - 0x1FE5, - 0x1FE6, - 0x1FE8, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FF8, - 0x1FFC, - 0x1FFD, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_UTF8_C) static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC 1047 */ @@ -30910,7 +30918,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC 1047 */ #if defined(PERL_IN_PERL_C) -static const UV Sentence_Break_invlist[] = { /* for EBCDIC 1047 */ +static const UV _Perl_SB_invlist[] = { /* for EBCDIC 1047 */ 2920, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -33862,7 +33870,7 @@ typedef enum { SB_Upper = 15 } SB_enum; -static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 1047 */ +static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 1047 */ SB_Other, SB_Sp, SB_Other, @@ -36787,91 +36795,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 1047 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV UpperLatin1_invlist[] = { /* for EBCDIC 1047 */ - 55, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0xB, - 0x14, - 0x15, - 0x17, - 0x18, - 0x1A, - 0x1C, - 0x20, - 0x26, - 0x28, - 0x2D, - 0x30, - 0x32, - 0x33, - 0x37, - 0x38, - 0x3C, - 0x3E, - 0x3F, - 0x41, - 0x4B, - 0x51, - 0x5A, - 0x62, - 0x6B, - 0x70, - 0x79, - 0x80, - 0x81, - 0x8A, - 0x91, - 0x9A, - 0xA1, - 0xAA, - 0xAD, - 0xAE, - 0xBD, - 0xBE, - 0xC0, - 0xCA, - 0xD0, - 0xDA, - 0xE0, - 0xE1, - 0xE2, - 0xEA, - 0xF0, - 0xFA, - 0x100 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_PERL_C) -static const UV VertSpace_invlist[] = { /* for EBCDIC 1047 */ - 9, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0xB, - 0xE, - 0x15, - 0x16, - 0x25, - 0x26, - 0x2028, - 0x202A -}; - -static const UV Word_Break_invlist[] = { /* for EBCDIC 1047 */ +static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */ 1549, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -38455,7 +38381,7 @@ typedef enum { WB_UNKNOWN = 18 } WB_enum; -static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 1047 */ +static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */ WB_Other, WB_Newline, WB_CR, @@ -40009,8 +39935,90 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 1047 */ #endif /* defined(PERL_IN_REGEXEC_C) */ +#if defined(PERL_IN_REGCOMP_C) + +static const UV UpperLatin1_invlist[] = { /* for EBCDIC 1047 */ + 55, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0xB, + 0x14, + 0x15, + 0x17, + 0x18, + 0x1A, + 0x1C, + 0x20, + 0x26, + 0x28, + 0x2D, + 0x30, + 0x32, + 0x33, + 0x37, + 0x38, + 0x3C, + 0x3E, + 0x3F, + 0x41, + 0x4B, + 0x51, + 0x5A, + 0x62, + 0x6B, + 0x70, + 0x79, + 0x80, + 0x81, + 0x8A, + 0x91, + 0x9A, + 0xA1, + 0xAA, + 0xAD, + 0xAE, + 0xBD, + 0xBE, + 0xC0, + 0xCA, + 0xD0, + 0xDA, + 0xE0, + 0xE1, + 0xE2, + 0xEA, + 0xF0, + 0xFA, + 0x100 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + #if defined(PERL_IN_PERL_C) +static const UV VertSpace_invlist[] = { /* for EBCDIC 1047 */ + 9, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0xB, + 0xE, + 0x15, + 0x16, + 0x25, + 0x26, + 0x2028, + 0x202A +}; + static const UV XPerlSpace_invlist[] = { /* for EBCDIC 1047 */ 23, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -49924,7 +49932,415 @@ static const UV Cased_invlist[] = { /* for EBCDIC 037 */ 0x1F18A }; -static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC 037 */ +#endif /* defined(PERL_IN_PERL_C) */ + +#if defined(PERL_IN_REGCOMP_C) + +static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 037 */ + 45, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x2BC, + 0x2BD, + 0x308, + 0x309, + 0x313, + 0x314, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x574, + 0x575, + 0x57E, + 0x57F, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 037 */ + 275, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x42, + 0x4A, + 0x51, + 0x5A, + 0x62, + 0x6A, + 0x70, + 0x79, + 0x80, + 0x8A, + 0x8C, + 0x8F, + 0x91, + 0x9A, + 0x9C, + 0x9D, + 0x9E, + 0x9F, + 0xA0, + 0xA1, + 0xA2, + 0xAA, + 0xAC, + 0xAF, + 0xC1, + 0xCA, + 0xCB, + 0xD0, + 0xD1, + 0xDA, + 0xDB, + 0xE0, + 0xE2, + 0xEA, + 0xEB, + 0xF0, + 0xFB, + 0xFF, + 0x100, + 0x131, + 0x132, + 0x138, + 0x139, + 0x18D, + 0x18E, + 0x19B, + 0x19C, + 0x1AA, + 0x1AC, + 0x1BA, + 0x1BC, + 0x1BE, + 0x1BF, + 0x1C0, + 0x1C4, + 0x221, + 0x222, + 0x234, + 0x23A, + 0x255, + 0x256, + 0x258, + 0x259, + 0x25A, + 0x25B, + 0x25D, + 0x260, + 0x262, + 0x263, + 0x264, + 0x265, + 0x267, + 0x268, + 0x26A, + 0x26B, + 0x26D, + 0x26F, + 0x270, + 0x271, + 0x273, + 0x275, + 0x276, + 0x27D, + 0x27E, + 0x280, + 0x281, + 0x283, + 0x284, + 0x287, + 0x28D, + 0x292, + 0x293, + 0x29D, + 0x29F, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x345, + 0x346, + 0x370, + 0x374, + 0x376, + 0x378, + 0x37B, + 0x37E, + 0x37F, + 0x380, + 0x386, + 0x387, + 0x388, + 0x38B, + 0x38C, + 0x38D, + 0x38E, + 0x3A2, + 0x3A3, + 0x3D2, + 0x3D5, + 0x3F6, + 0x3F7, + 0x3FC, + 0x3FD, + 0x482, + 0x48A, + 0x530, + 0x531, + 0x557, + 0x561, + 0x588, + 0x10A0, + 0x10C6, + 0x10C7, + 0x10C8, + 0x10CD, + 0x10CE, + 0x13A0, + 0x13F6, + 0x13F8, + 0x13FE, + 0x1D79, + 0x1D7A, + 0x1D7D, + 0x1D7E, + 0x1E00, + 0x1E9C, + 0x1E9E, + 0x1E9F, + 0x1EA0, + 0x1F16, + 0x1F18, + 0x1F1E, + 0x1F20, + 0x1F46, + 0x1F48, + 0x1F4E, + 0x1F50, + 0x1F58, + 0x1F59, + 0x1F5A, + 0x1F5B, + 0x1F5C, + 0x1F5D, + 0x1F5E, + 0x1F5F, + 0x1F7E, + 0x1F80, + 0x1FB5, + 0x1FB6, + 0x1FBD, + 0x1FBE, + 0x1FBF, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FCD, + 0x1FD0, + 0x1FD4, + 0x1FD6, + 0x1FDC, + 0x1FE0, + 0x1FED, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FFD, + 0x2126, + 0x2127, + 0x212A, + 0x212C, + 0x2132, + 0x2133, + 0x214E, + 0x214F, + 0x2160, + 0x2180, + 0x2183, + 0x2185, + 0x24B6, + 0x24EA, + 0x2C00, + 0x2C2F, + 0x2C30, + 0x2C5F, + 0x2C60, + 0x2C71, + 0x2C72, + 0x2C74, + 0x2C75, + 0x2C77, + 0x2C7E, + 0x2CE4, + 0x2CEB, + 0x2CEF, + 0x2CF2, + 0x2CF4, + 0x2D00, + 0x2D26, + 0x2D27, + 0x2D28, + 0x2D2D, + 0x2D2E, + 0xA640, + 0xA66E, + 0xA680, + 0xA69C, + 0xA722, + 0xA730, + 0xA732, + 0xA770, + 0xA779, + 0xA788, + 0xA78B, + 0xA78E, + 0xA790, + 0xA794, + 0xA796, + 0xA7AE, + 0xA7B0, + 0xA7B8, + 0xAB53, + 0xAB54, + 0xAB70, + 0xABC0, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18, + 0xFF21, + 0xFF3B, + 0xFF41, + 0xFF5B, + 0x10400, + 0x10450, + 0x10C80, + 0x10CB3, + 0x10CC0, + 0x10CF3, + 0x118A0, + 0x118E0 +}; + +static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 037 */ + 59, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x59, + 0x5A, + 0x130, + 0x131, + 0x149, + 0x14A, + 0x1F0, + 0x1F1, + 0x390, + 0x391, + 0x3B0, + 0x3B1, + 0x587, + 0x588, + 0x1E96, + 0x1E9B, + 0x1E9E, + 0x1E9F, + 0x1F50, + 0x1F51, + 0x1F52, + 0x1F53, + 0x1F54, + 0x1F55, + 0x1F56, + 0x1F57, + 0x1F80, + 0x1FB0, + 0x1FB2, + 0x1FB5, + 0x1FB6, + 0x1FB8, + 0x1FBC, + 0x1FBD, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FC8, + 0x1FCC, + 0x1FCD, + 0x1FD2, + 0x1FD4, + 0x1FD6, + 0x1FD8, + 0x1FE2, + 0x1FE5, + 0x1FE6, + 0x1FE8, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FF8, + 0x1FFC, + 0x1FFD, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + +#if defined(PERL_IN_PERL_C) + +static const UV _Perl_GCB_invlist[] = { /* for EBCDIC 037 */ 1502, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -51456,7 +51872,7 @@ typedef enum { GCB_V = 13 } GCB_enum; -static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 037 */ +static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 037 */ GCB_Control, GCB_CR, GCB_Control, @@ -52963,410 +53379,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 037 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 037 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 037 */ - 275, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x42, - 0x4A, - 0x51, - 0x5A, - 0x62, - 0x6A, - 0x70, - 0x79, - 0x80, - 0x8A, - 0x8C, - 0x8F, - 0x91, - 0x9A, - 0x9C, - 0x9D, - 0x9E, - 0x9F, - 0xA0, - 0xA1, - 0xA2, - 0xAA, - 0xAC, - 0xAF, - 0xC1, - 0xCA, - 0xCB, - 0xD0, - 0xD1, - 0xDA, - 0xDB, - 0xE0, - 0xE2, - 0xEA, - 0xEB, - 0xF0, - 0xFB, - 0xFF, - 0x100, - 0x131, - 0x132, - 0x138, - 0x139, - 0x18D, - 0x18E, - 0x19B, - 0x19C, - 0x1AA, - 0x1AC, - 0x1BA, - 0x1BC, - 0x1BE, - 0x1BF, - 0x1C0, - 0x1C4, - 0x221, - 0x222, - 0x234, - 0x23A, - 0x255, - 0x256, - 0x258, - 0x259, - 0x25A, - 0x25B, - 0x25D, - 0x260, - 0x262, - 0x263, - 0x264, - 0x265, - 0x267, - 0x268, - 0x26A, - 0x26B, - 0x26D, - 0x26F, - 0x270, - 0x271, - 0x273, - 0x275, - 0x276, - 0x27D, - 0x27E, - 0x280, - 0x281, - 0x283, - 0x284, - 0x287, - 0x28D, - 0x292, - 0x293, - 0x29D, - 0x29F, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x345, - 0x346, - 0x370, - 0x374, - 0x376, - 0x378, - 0x37B, - 0x37E, - 0x37F, - 0x380, - 0x386, - 0x387, - 0x388, - 0x38B, - 0x38C, - 0x38D, - 0x38E, - 0x3A2, - 0x3A3, - 0x3D2, - 0x3D5, - 0x3F6, - 0x3F7, - 0x3FC, - 0x3FD, - 0x482, - 0x48A, - 0x530, - 0x531, - 0x557, - 0x561, - 0x588, - 0x10A0, - 0x10C6, - 0x10C7, - 0x10C8, - 0x10CD, - 0x10CE, - 0x13A0, - 0x13F6, - 0x13F8, - 0x13FE, - 0x1D79, - 0x1D7A, - 0x1D7D, - 0x1D7E, - 0x1E00, - 0x1E9C, - 0x1E9E, - 0x1E9F, - 0x1EA0, - 0x1F16, - 0x1F18, - 0x1F1E, - 0x1F20, - 0x1F46, - 0x1F48, - 0x1F4E, - 0x1F50, - 0x1F58, - 0x1F59, - 0x1F5A, - 0x1F5B, - 0x1F5C, - 0x1F5D, - 0x1F5E, - 0x1F5F, - 0x1F7E, - 0x1F80, - 0x1FB5, - 0x1FB6, - 0x1FBD, - 0x1FBE, - 0x1FBF, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FCD, - 0x1FD0, - 0x1FD4, - 0x1FD6, - 0x1FDC, - 0x1FE0, - 0x1FED, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FFD, - 0x2126, - 0x2127, - 0x212A, - 0x212C, - 0x2132, - 0x2133, - 0x214E, - 0x214F, - 0x2160, - 0x2180, - 0x2183, - 0x2185, - 0x24B6, - 0x24EA, - 0x2C00, - 0x2C2F, - 0x2C30, - 0x2C5F, - 0x2C60, - 0x2C71, - 0x2C72, - 0x2C74, - 0x2C75, - 0x2C77, - 0x2C7E, - 0x2CE4, - 0x2CEB, - 0x2CEF, - 0x2CF2, - 0x2CF4, - 0x2D00, - 0x2D26, - 0x2D27, - 0x2D28, - 0x2D2D, - 0x2D2E, - 0xA640, - 0xA66E, - 0xA680, - 0xA69C, - 0xA722, - 0xA730, - 0xA732, - 0xA770, - 0xA779, - 0xA788, - 0xA78B, - 0xA78E, - 0xA790, - 0xA794, - 0xA796, - 0xA7AE, - 0xA7B0, - 0xA7B8, - 0xAB53, - 0xAB54, - 0xAB70, - 0xABC0, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18, - 0xFF21, - 0xFF3B, - 0xFF41, - 0xFF5B, - 0x10400, - 0x10450, - 0x10C80, - 0x10CB3, - 0x10CC0, - 0x10CF3, - 0x118A0, - 0x118E0 -}; - -static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 037 */ - 59, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x59, - 0x5A, - 0x130, - 0x131, - 0x149, - 0x14A, - 0x1F0, - 0x1F1, - 0x390, - 0x391, - 0x3B0, - 0x3B1, - 0x587, - 0x588, - 0x1E96, - 0x1E9B, - 0x1E9E, - 0x1E9F, - 0x1F50, - 0x1F51, - 0x1F52, - 0x1F53, - 0x1F54, - 0x1F55, - 0x1F56, - 0x1F57, - 0x1F80, - 0x1FB0, - 0x1FB2, - 0x1FB5, - 0x1FB6, - 0x1FB8, - 0x1FBC, - 0x1FBD, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FC8, - 0x1FCC, - 0x1FCD, - 0x1FD2, - 0x1FD4, - 0x1FD6, - 0x1FD8, - 0x1FE2, - 0x1FE5, - 0x1FE6, - 0x1FE8, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FF8, - 0x1FFC, - 0x1FFD, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_UTF8_C) static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC 037 */ @@ -55875,7 +55887,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC 037 */ #if defined(PERL_IN_PERL_C) -static const UV Sentence_Break_invlist[] = { /* for EBCDIC 037 */ +static const UV _Perl_SB_invlist[] = { /* for EBCDIC 037 */ 2916, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -58823,7 +58835,7 @@ typedef enum { SB_Upper = 15 } SB_enum; -static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 037 */ +static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 037 */ SB_Other, SB_Sp, SB_Other, @@ -61744,93 +61756,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 037 */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV UpperLatin1_invlist[] = { /* for EBCDIC 037 */ - 57, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0xB, - 0x14, - 0x16, - 0x17, - 0x18, - 0x1A, - 0x1C, - 0x20, - 0x25, - 0x28, - 0x2D, - 0x30, - 0x32, - 0x33, - 0x37, - 0x38, - 0x3C, - 0x3E, - 0x3F, - 0x41, - 0x4B, - 0x51, - 0x5A, - 0x5F, - 0x60, - 0x62, - 0x6B, - 0x70, - 0x79, - 0x80, - 0x81, - 0x8A, - 0x91, - 0x9A, - 0xA1, - 0xAA, - 0xB0, - 0xB1, - 0xBA, - 0xBC, - 0xC0, - 0xCA, - 0xD0, - 0xDA, - 0xE0, - 0xE1, - 0xE2, - 0xEA, - 0xF0, - 0xFA, - 0x100 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_PERL_C) -static const UV VertSpace_invlist[] = { /* for EBCDIC 037 */ - 9, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0xB, - 0xE, - 0x15, - 0x16, - 0x25, - 0x26, - 0x2028, - 0x202A -}; - -static const UV Word_Break_invlist[] = { /* for EBCDIC 037 */ +static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */ 1545, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -63410,7 +63338,7 @@ typedef enum { WB_UNKNOWN = 18 } WB_enum; -static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 037 */ +static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */ WB_Other, WB_Newline, WB_CR, @@ -64960,8 +64888,92 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 037 */ #endif /* defined(PERL_IN_REGEXEC_C) */ +#if defined(PERL_IN_REGCOMP_C) + +static const UV UpperLatin1_invlist[] = { /* for EBCDIC 037 */ + 57, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0xB, + 0x14, + 0x16, + 0x17, + 0x18, + 0x1A, + 0x1C, + 0x20, + 0x25, + 0x28, + 0x2D, + 0x30, + 0x32, + 0x33, + 0x37, + 0x38, + 0x3C, + 0x3E, + 0x3F, + 0x41, + 0x4B, + 0x51, + 0x5A, + 0x5F, + 0x60, + 0x62, + 0x6B, + 0x70, + 0x79, + 0x80, + 0x81, + 0x8A, + 0x91, + 0x9A, + 0xA1, + 0xAA, + 0xB0, + 0xB1, + 0xBA, + 0xBC, + 0xC0, + 0xCA, + 0xD0, + 0xDA, + 0xE0, + 0xE1, + 0xE2, + 0xEA, + 0xF0, + 0xFA, + 0x100 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + #if defined(PERL_IN_PERL_C) +static const UV VertSpace_invlist[] = { /* for EBCDIC 037 */ + 9, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0xB, + 0xE, + 0x15, + 0x16, + 0x25, + 0x26, + 0x2028, + 0x202A +}; + static const UV XPerlSpace_invlist[] = { /* for EBCDIC 037 */ 23, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -74865,7 +74877,417 @@ static const UV Cased_invlist[] = { /* for EBCDIC POSIX-BC */ 0x1F18A }; -static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC POSIX-BC */ +#endif /* defined(PERL_IN_PERL_C) */ + +#if defined(PERL_IN_REGCOMP_C) + +static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC POSIX-BC */ + 45, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x2BC, + 0x2BD, + 0x308, + 0x309, + 0x313, + 0x314, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x574, + 0x575, + 0x57E, + 0x57F, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC POSIX-BC */ + 277, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x42, + 0x4A, + 0x51, + 0x5A, + 0x62, + 0x6A, + 0x70, + 0x79, + 0x80, + 0x8A, + 0x8C, + 0x8F, + 0x91, + 0x9A, + 0x9C, + 0x9D, + 0x9E, + 0x9F, + 0xA0, + 0xA1, + 0xA2, + 0xAA, + 0xAC, + 0xAF, + 0xC0, + 0xCA, + 0xCB, + 0xD0, + 0xD1, + 0xDA, + 0xDB, + 0xE1, + 0xE2, + 0xEA, + 0xEB, + 0xF0, + 0xFC, + 0xFD, + 0xFE, + 0xFF, + 0x100, + 0x131, + 0x132, + 0x138, + 0x139, + 0x18D, + 0x18E, + 0x19B, + 0x19C, + 0x1AA, + 0x1AC, + 0x1BA, + 0x1BC, + 0x1BE, + 0x1BF, + 0x1C0, + 0x1C4, + 0x221, + 0x222, + 0x234, + 0x23A, + 0x255, + 0x256, + 0x258, + 0x259, + 0x25A, + 0x25B, + 0x25D, + 0x260, + 0x262, + 0x263, + 0x264, + 0x265, + 0x267, + 0x268, + 0x26A, + 0x26B, + 0x26D, + 0x26F, + 0x270, + 0x271, + 0x273, + 0x275, + 0x276, + 0x27D, + 0x27E, + 0x280, + 0x281, + 0x283, + 0x284, + 0x287, + 0x28D, + 0x292, + 0x293, + 0x29D, + 0x29F, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x345, + 0x346, + 0x370, + 0x374, + 0x376, + 0x378, + 0x37B, + 0x37E, + 0x37F, + 0x380, + 0x386, + 0x387, + 0x388, + 0x38B, + 0x38C, + 0x38D, + 0x38E, + 0x3A2, + 0x3A3, + 0x3D2, + 0x3D5, + 0x3F6, + 0x3F7, + 0x3FC, + 0x3FD, + 0x482, + 0x48A, + 0x530, + 0x531, + 0x557, + 0x561, + 0x588, + 0x10A0, + 0x10C6, + 0x10C7, + 0x10C8, + 0x10CD, + 0x10CE, + 0x13A0, + 0x13F6, + 0x13F8, + 0x13FE, + 0x1D79, + 0x1D7A, + 0x1D7D, + 0x1D7E, + 0x1E00, + 0x1E9C, + 0x1E9E, + 0x1E9F, + 0x1EA0, + 0x1F16, + 0x1F18, + 0x1F1E, + 0x1F20, + 0x1F46, + 0x1F48, + 0x1F4E, + 0x1F50, + 0x1F58, + 0x1F59, + 0x1F5A, + 0x1F5B, + 0x1F5C, + 0x1F5D, + 0x1F5E, + 0x1F5F, + 0x1F7E, + 0x1F80, + 0x1FB5, + 0x1FB6, + 0x1FBD, + 0x1FBE, + 0x1FBF, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FCD, + 0x1FD0, + 0x1FD4, + 0x1FD6, + 0x1FDC, + 0x1FE0, + 0x1FED, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FFD, + 0x2126, + 0x2127, + 0x212A, + 0x212C, + 0x2132, + 0x2133, + 0x214E, + 0x214F, + 0x2160, + 0x2180, + 0x2183, + 0x2185, + 0x24B6, + 0x24EA, + 0x2C00, + 0x2C2F, + 0x2C30, + 0x2C5F, + 0x2C60, + 0x2C71, + 0x2C72, + 0x2C74, + 0x2C75, + 0x2C77, + 0x2C7E, + 0x2CE4, + 0x2CEB, + 0x2CEF, + 0x2CF2, + 0x2CF4, + 0x2D00, + 0x2D26, + 0x2D27, + 0x2D28, + 0x2D2D, + 0x2D2E, + 0xA640, + 0xA66E, + 0xA680, + 0xA69C, + 0xA722, + 0xA730, + 0xA732, + 0xA770, + 0xA779, + 0xA788, + 0xA78B, + 0xA78E, + 0xA790, + 0xA794, + 0xA796, + 0xA7AE, + 0xA7B0, + 0xA7B8, + 0xAB53, + 0xAB54, + 0xAB70, + 0xABC0, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18, + 0xFF21, + 0xFF3B, + 0xFF41, + 0xFF5B, + 0x10400, + 0x10450, + 0x10C80, + 0x10CB3, + 0x10CC0, + 0x10CF3, + 0x118A0, + 0x118E0 +}; + +static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC POSIX-BC */ + 59, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x59, + 0x5A, + 0x130, + 0x131, + 0x149, + 0x14A, + 0x1F0, + 0x1F1, + 0x390, + 0x391, + 0x3B0, + 0x3B1, + 0x587, + 0x588, + 0x1E96, + 0x1E9B, + 0x1E9E, + 0x1E9F, + 0x1F50, + 0x1F51, + 0x1F52, + 0x1F53, + 0x1F54, + 0x1F55, + 0x1F56, + 0x1F57, + 0x1F80, + 0x1FB0, + 0x1FB2, + 0x1FB5, + 0x1FB6, + 0x1FB8, + 0x1FBC, + 0x1FBD, + 0x1FC2, + 0x1FC5, + 0x1FC6, + 0x1FC8, + 0x1FCC, + 0x1FCD, + 0x1FD2, + 0x1FD4, + 0x1FD6, + 0x1FD8, + 0x1FE2, + 0x1FE5, + 0x1FE6, + 0x1FE8, + 0x1FF2, + 0x1FF5, + 0x1FF6, + 0x1FF8, + 0x1FFC, + 0x1FFD, + 0xFB00, + 0xFB07, + 0xFB13, + 0xFB18 +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + +#if defined(PERL_IN_PERL_C) + +static const UV _Perl_GCB_invlist[] = { /* for EBCDIC POSIX-BC */ 1502, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -76397,7 +76819,7 @@ typedef enum { GCB_V = 13 } GCB_enum; -static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC POSIX-BC */ +static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC POSIX-BC */ GCB_Control, GCB_CR, GCB_Control, @@ -77904,412 +78326,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC POSIX-BC #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC POSIX-BC */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC POSIX-BC */ - 277, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x42, - 0x4A, - 0x51, - 0x5A, - 0x62, - 0x6A, - 0x70, - 0x79, - 0x80, - 0x8A, - 0x8C, - 0x8F, - 0x91, - 0x9A, - 0x9C, - 0x9D, - 0x9E, - 0x9F, - 0xA0, - 0xA1, - 0xA2, - 0xAA, - 0xAC, - 0xAF, - 0xC0, - 0xCA, - 0xCB, - 0xD0, - 0xD1, - 0xDA, - 0xDB, - 0xE1, - 0xE2, - 0xEA, - 0xEB, - 0xF0, - 0xFC, - 0xFD, - 0xFE, - 0xFF, - 0x100, - 0x131, - 0x132, - 0x138, - 0x139, - 0x18D, - 0x18E, - 0x19B, - 0x19C, - 0x1AA, - 0x1AC, - 0x1BA, - 0x1BC, - 0x1BE, - 0x1BF, - 0x1C0, - 0x1C4, - 0x221, - 0x222, - 0x234, - 0x23A, - 0x255, - 0x256, - 0x258, - 0x259, - 0x25A, - 0x25B, - 0x25D, - 0x260, - 0x262, - 0x263, - 0x264, - 0x265, - 0x267, - 0x268, - 0x26A, - 0x26B, - 0x26D, - 0x26F, - 0x270, - 0x271, - 0x273, - 0x275, - 0x276, - 0x27D, - 0x27E, - 0x280, - 0x281, - 0x283, - 0x284, - 0x287, - 0x28D, - 0x292, - 0x293, - 0x29D, - 0x29F, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x345, - 0x346, - 0x370, - 0x374, - 0x376, - 0x378, - 0x37B, - 0x37E, - 0x37F, - 0x380, - 0x386, - 0x387, - 0x388, - 0x38B, - 0x38C, - 0x38D, - 0x38E, - 0x3A2, - 0x3A3, - 0x3D2, - 0x3D5, - 0x3F6, - 0x3F7, - 0x3FC, - 0x3FD, - 0x482, - 0x48A, - 0x530, - 0x531, - 0x557, - 0x561, - 0x588, - 0x10A0, - 0x10C6, - 0x10C7, - 0x10C8, - 0x10CD, - 0x10CE, - 0x13A0, - 0x13F6, - 0x13F8, - 0x13FE, - 0x1D79, - 0x1D7A, - 0x1D7D, - 0x1D7E, - 0x1E00, - 0x1E9C, - 0x1E9E, - 0x1E9F, - 0x1EA0, - 0x1F16, - 0x1F18, - 0x1F1E, - 0x1F20, - 0x1F46, - 0x1F48, - 0x1F4E, - 0x1F50, - 0x1F58, - 0x1F59, - 0x1F5A, - 0x1F5B, - 0x1F5C, - 0x1F5D, - 0x1F5E, - 0x1F5F, - 0x1F7E, - 0x1F80, - 0x1FB5, - 0x1FB6, - 0x1FBD, - 0x1FBE, - 0x1FBF, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FCD, - 0x1FD0, - 0x1FD4, - 0x1FD6, - 0x1FDC, - 0x1FE0, - 0x1FED, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FFD, - 0x2126, - 0x2127, - 0x212A, - 0x212C, - 0x2132, - 0x2133, - 0x214E, - 0x214F, - 0x2160, - 0x2180, - 0x2183, - 0x2185, - 0x24B6, - 0x24EA, - 0x2C00, - 0x2C2F, - 0x2C30, - 0x2C5F, - 0x2C60, - 0x2C71, - 0x2C72, - 0x2C74, - 0x2C75, - 0x2C77, - 0x2C7E, - 0x2CE4, - 0x2CEB, - 0x2CEF, - 0x2CF2, - 0x2CF4, - 0x2D00, - 0x2D26, - 0x2D27, - 0x2D28, - 0x2D2D, - 0x2D2E, - 0xA640, - 0xA66E, - 0xA680, - 0xA69C, - 0xA722, - 0xA730, - 0xA732, - 0xA770, - 0xA779, - 0xA788, - 0xA78B, - 0xA78E, - 0xA790, - 0xA794, - 0xA796, - 0xA7AE, - 0xA7B0, - 0xA7B8, - 0xAB53, - 0xAB54, - 0xAB70, - 0xABC0, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18, - 0xFF21, - 0xFF3B, - 0xFF41, - 0xFF5B, - 0x10400, - 0x10450, - 0x10C80, - 0x10CB3, - 0x10CC0, - 0x10CF3, - 0x118A0, - 0x118E0 -}; - -static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC POSIX-BC */ - 59, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x59, - 0x5A, - 0x130, - 0x131, - 0x149, - 0x14A, - 0x1F0, - 0x1F1, - 0x390, - 0x391, - 0x3B0, - 0x3B1, - 0x587, - 0x588, - 0x1E96, - 0x1E9B, - 0x1E9E, - 0x1E9F, - 0x1F50, - 0x1F51, - 0x1F52, - 0x1F53, - 0x1F54, - 0x1F55, - 0x1F56, - 0x1F57, - 0x1F80, - 0x1FB0, - 0x1FB2, - 0x1FB5, - 0x1FB6, - 0x1FB8, - 0x1FBC, - 0x1FBD, - 0x1FC2, - 0x1FC5, - 0x1FC6, - 0x1FC8, - 0x1FCC, - 0x1FCD, - 0x1FD2, - 0x1FD4, - 0x1FD6, - 0x1FD8, - 0x1FE2, - 0x1FE5, - 0x1FE6, - 0x1FE8, - 0x1FF2, - 0x1FF5, - 0x1FF6, - 0x1FF8, - 0x1FFC, - 0x1FFD, - 0xFB00, - 0xFB07, - 0xFB13, - 0xFB18 -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_UTF8_C) static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC POSIX-BC */ @@ -80822,7 +80838,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC POSIX-BC */ #if defined(PERL_IN_PERL_C) -static const UV Sentence_Break_invlist[] = { /* for EBCDIC POSIX-BC */ +static const UV _Perl_SB_invlist[] = { /* for EBCDIC POSIX-BC */ 2924, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -83778,7 +83794,7 @@ typedef enum { SB_Upper = 15 } SB_enum; -static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC POSIX-BC */ +static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC POSIX-BC */ SB_Other, SB_Sp, SB_Other, @@ -86707,93 +86723,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC POSIX-BC */ #endif /* defined(PERL_IN_REGEXEC_C) */ -#if defined(PERL_IN_REGCOMP_C) - -static const UV UpperLatin1_invlist[] = { /* for EBCDIC POSIX-BC */ - 57, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0xB, - 0x14, - 0x15, - 0x17, - 0x18, - 0x1A, - 0x1C, - 0x20, - 0x26, - 0x28, - 0x2D, - 0x30, - 0x32, - 0x33, - 0x37, - 0x38, - 0x3C, - 0x3E, - 0x3F, - 0x41, - 0x4A, - 0x51, - 0x5A, - 0x5F, - 0x60, - 0x62, - 0x6A, - 0x70, - 0x7A, - 0x80, - 0x81, - 0x8A, - 0x91, - 0x9A, - 0xA2, - 0xAA, - 0xBB, - 0xBE, - 0xC1, - 0xCA, - 0xD1, - 0xDA, - 0xE2, - 0xEA, - 0xF0, - 0xFA, - 0xFB, - 0xFC, - 0xFD, - 0xFE, - 0xFF -}; - -#endif /* defined(PERL_IN_REGCOMP_C) */ - #if defined(PERL_IN_PERL_C) -static const UV VertSpace_invlist[] = { /* for EBCDIC POSIX-BC */ - 9, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0xB, - 0xE, - 0x15, - 0x16, - 0x25, - 0x26, - 0x2028, - 0x202A -}; - -static const UV Word_Break_invlist[] = { /* for EBCDIC POSIX-BC */ +static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */ 1547, /* Number of elements */ 148565664, /* Version and data structure type */ 0, /* 0 if the list starts at 0; @@ -88375,7 +88307,7 @@ typedef enum { WB_UNKNOWN = 18 } WB_enum; -static const WB_enum Word_Break_invmap[] = { /* for EBCDIC POSIX-BC */ +static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */ WB_Other, WB_Newline, WB_CR, @@ -89927,8 +89859,92 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC POSIX-BC */ #endif /* defined(PERL_IN_REGEXEC_C) */ +#if defined(PERL_IN_REGCOMP_C) + +static const UV UpperLatin1_invlist[] = { /* for EBCDIC POSIX-BC */ + 57, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x4, + 0x5, + 0x6, + 0x7, + 0x8, + 0xB, + 0x14, + 0x15, + 0x17, + 0x18, + 0x1A, + 0x1C, + 0x20, + 0x26, + 0x28, + 0x2D, + 0x30, + 0x32, + 0x33, + 0x37, + 0x38, + 0x3C, + 0x3E, + 0x3F, + 0x41, + 0x4A, + 0x51, + 0x5A, + 0x5F, + 0x60, + 0x62, + 0x6A, + 0x70, + 0x7A, + 0x80, + 0x81, + 0x8A, + 0x91, + 0x9A, + 0xA2, + 0xAA, + 0xBB, + 0xBE, + 0xC1, + 0xCA, + 0xD1, + 0xDA, + 0xE2, + 0xEA, + 0xF0, + 0xFA, + 0xFB, + 0xFC, + 0xFD, + 0xFE, + 0xFF +}; + +#endif /* defined(PERL_IN_REGCOMP_C) */ + #if defined(PERL_IN_PERL_C) +static const UV VertSpace_invlist[] = { /* for EBCDIC POSIX-BC */ + 9, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0xB, + 0xE, + 0x15, + 0x16, + 0x25, + 0x26, + 0x2028, + 0x202A +}; + static const UV XPerlSpace_invlist[] = { /* for EBCDIC POSIX-BC */ 23, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -99479,7 +99495,7 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */ #endif /* EBCDIC POSIX-BC */ /* Generated from: - * 083180df694deb1fc173361406c1a75619fb8376403db3a76dc585c1e3951eca lib/Unicode/UCD.pm + * 0bca60a25eb4ccf2e04f50446db5f882322f50a9c61dc57bb806ccfc9b2e26a4 lib/Unicode/UCD.pm * ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt * 1687fe5994eb7e5c0dab8503fc2a1b3b479d91af9d3b8055941c9bd791f7d0b5 lib/unicore/BidiBrackets.txt * 350d1302116194b0b21def287434b55c5088098fbc726e879f7420a391965643 lib/unicore/BidiMirroring.txt @@ -99521,8 +99537,8 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * c9326eab8d7861c3543963e555d5b927348f4467c93071db23154dece7619654 lib/unicore/mktables + * 46f739fb5c9daf6fb457ed67f821d88d9eadd2df17b098f385b3b50f99c01acf lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl - * f199f92c0b5f87882b0198936ea8ef3dc43627b57a77ac3eb9250bd2664bbd88 regen/mk_invlists.pl + * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl * ex: set ro: */ diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t index 9ebe0d3dd8..eea81e8a8f 100644 --- a/ext/XS-APItest/t/handy.t +++ b/ext/XS-APItest/t/handy.t @@ -42,7 +42,7 @@ my %properties = ( alnum => 'Word', wordchar => 'Word', alphanumeric => 'Alnum', - alpha => 'Alpha', + alpha => 'XPosixAlpha', ascii => 'ASCII', blank => 'Blank', cntrl => 'Control', @@ -50,14 +50,14 @@ my %properties = ( graph => 'Graph', idfirst => '_Perl_IDStart', idcont => '_Perl_IDCont', - lower => 'Lower', + lower => 'XPosixLower', print => 'Print', psxspc => 'XPosixSpace', punct => 'XPosixPunct', quotemeta => '_Perl_Quotemeta', space => 'XPerlSpace', vertws => 'VertSpace', - upper => 'Upper', + upper => 'XPosixUpper', xdigit => 'XDigit', ); @@ -69,8 +69,13 @@ foreach my $name (sort keys %properties) { my $property = $properties{$name}; my @invlist = prop_invlist($property, '_perl_core_internal_ok'); if (! @invlist) { - fail("No inversion list found for $property"); - next; + + # An empty return could mean an unknown property, or merely that it is + # empty. Call in scalar context to differentiate + if (! prop_invlist($property, '_perl_core_internal_ok')) { + fail("No inversion list found for $property"); + next; + } } # Include all the Latin1 code points, plus 0x100. @@ -270,7 +275,7 @@ foreach my $name (sort keys %to_properties) { fail("No inversion map found for $property"); next; } - if ($format ne "al") { + if ($format !~ / ^ a l? $ /x) { fail("Unexpected inversion map format ('$format') found for $property"); next; } diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index 06fbfd1143..1854982491 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.61'; +our $VERSION = '0.62'; require Exporter; @@ -775,7 +775,6 @@ sub charprop ($$) { } else { croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this"; - return undef; } } @@ -877,6 +876,10 @@ sub _charblocks { local $_; local $/ = "\n"; while (<$BLOCKSFH>) { + + # Old versions used a different syntax to mark the range. + $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0; + if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { my ($lo, $hi) = (hex($1), hex($2)); my $subrange = [ $lo, $hi, $3 ]; @@ -2652,9 +2655,11 @@ or even better, C<"Gc=LC">). Many Unicode properties have more than one name (or alias). C<prop_invmap> understands all of these, including Perl extensions to them. Ambiguities are -resolved as described above for L</prop_aliases()>. The Perl internal -property "Perl_Decimal_Digit, described below, is also accepted. An empty -list is returned if the property name is unknown. +resolved as described above for L</prop_aliases()> (except if a property has +both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the +property name prefixed by C<"is"> causes the binary one to be returned). The +Perl internal property "Perl_Decimal_Digit, described below, is also accepted. +An empty list is returned if the property name is unknown. See L<perluniprops/Properties accessible through Unicode::UCD> for the properties acceptable as inputs to this function. @@ -3253,8 +3258,8 @@ RETRY: # we need to also read in that table. Create a hash with the keys # being the code points, and the values being a list of the # aliases for the code point key. - my ($aliases_code_points, $aliases_maps, undef, undef) = - &prop_invmap('Name_Alias'); + my ($aliases_code_points, $aliases_maps, undef, undef) + = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); my %aliases; for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; @@ -3545,7 +3550,19 @@ RETRY: if ($swash->{'LIST'} =~ /^V/) { @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; - shift @invlist; + + shift @invlist; # Get rid of 'V'; + + # Could need to be inverted: add or subtract a 0 at the beginning of + # the list. + if ($swash->{'INVERT_IT'}) { + if (@invlist && $invlist[0] == 0) { + shift @invlist; + } + else { + unshift @invlist, 0; + } + } foreach my $i (0 .. @invlist - 1) { $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N' } @@ -3558,6 +3575,10 @@ RETRY: } } else { + if ($swash->{'INVERT_IT'}) { + croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted"; + } + # The LIST input lines look like: # ... # 0374\t\tCommon @@ -3873,7 +3894,7 @@ RETRY: map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap; $format = 'sl'; } - elsif ($returned_prop eq 'ToNameAlias') { + elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) { # This property currently doesn't have any lists, but theoretically # could @@ -3888,7 +3909,14 @@ RETRY: # to indicate that need to add code point to it. $format = 'ar'; } - elsif ($format ne 'n' && $format ne 'a') { + elsif ($format eq 'ax') { + + # Normally 'ax' properties have overrides, and will have been handled + # above, but if not, they still need adjustment, and the hex values + # have already been converted to decimal + $format = 'a'; + } + elsif ($format ne 'n' && $format !~ / ^ a /x) { # All others are simple scalars $format = 's'; @@ -4079,6 +4107,15 @@ for its block using C<charblock>). Note that starting in Unicode 6.1, many of the block names have shorter synonyms. These are always given in the new style. +=head2 Use with older Unicode versions + +The functions in this module work as well as can be expected when +used on earlier Unicode versions. But, obviously, they use the available data +from that Unicode version. For example, if the Unicode version predates the +definition of the script property (Unicode 3.1), then any function that deals +with scripts is going to return C<undef> for the script portion of the return +value. + =head1 AUTHOR Jarkko Hietaniemi. Now maintained by perl5 porters. diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index a799dd036b..22b2edbc93 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -19,14 +19,22 @@ use Test::More; use Unicode::UCD qw(charinfo charprop charprops_all); +my $expected_version = '8.0.0'; +my $current_version = Unicode::UCD::UnicodeVersion; +my $v_unicode_version = pack "C*", split /\./, $current_version; +my $unknown_script = ($v_unicode_version lt v5.0.0) + ? 'Common' + : 'Unknown'; my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by $/ = $input_record_separator; # setting this. my $charinfo; is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef"); -is(charprop(0x110000, 'age'), "Unassigned", "Verify charprop(age) of non-unicode is Unassigned"); -is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works"); +if ($v_unicode_version ge v3.2.0) { + is(lc charprop(0x110000, 'age'), lc "Unassigned", "Verify charprop(age) of non-unicode is Unassigned"); + is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works"); +} is(charprop(0x110000, 'Any'), undef, "Verify charprop of non-bipartite Perl extension returns undef"); my $cp = 0; @@ -37,9 +45,10 @@ is($charinfo->{code}, "0000", is($charinfo->{name}, "<control>"); is(charprop($cp, "name"), ""); -# This gets a sl-type property returning a flattened list -is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation"); - +if ($v_unicode_version ge v6.1.0) { + # This gets a sl-type property returning a flattened list + is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation"); +} is($charinfo->{category}, "Cc"); is(charprop($cp, "category"), "Control"); is($charinfo->{combining}, "0"); @@ -66,8 +75,8 @@ is($charinfo->{title}, ""); is(charprop($cp, "tc"), "\0"); is($charinfo->{block}, "Basic Latin"); is(charprop($cp, "block"), "Basic_Latin"); -is($charinfo->{script}, "Common"); -is(charprop($cp, "script"), "Common"); +is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1; +is(charprop($cp, "script"), "Common") if $v_unicode_version gt v3.0.1; $cp = utf8::unicode_to_native(0x41); my $A_code = sprintf("%04X", ord("A")); @@ -103,8 +112,8 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "A"); is($charinfo->{block}, "Basic Latin"); is(charprop($cp, 'block'), "Basic_Latin"); -is($charinfo->{script}, "Latin"); -is(charprop($cp, 'script'), "Latin"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; $cp = 0x100; $charinfo = charinfo($cp); @@ -138,8 +147,8 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "\x{100}"); is($charinfo->{block}, "Latin Extended-A"); is(charprop($cp, 'block'), "Latin_Extended_A"); -is($charinfo->{script}, "Latin"); -is(charprop($cp, 'script'), "Latin"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; $cp = 0x590; # 0x0590 is in the Hebrew block but unused. $charinfo = charinfo($cp); @@ -152,7 +161,9 @@ is(charprop($cp, 'gc'), "Unassigned"); is($charinfo->{combining}, undef); is(charprop($cp, 'ccc'), "Not_Reordered"); is($charinfo->{bidi}, undef); -is(charprop($cp, 'bc'), "Right_To_Left"); +if ($v_unicode_version gt v3.2.0) { + is(charprop($cp, 'bc'), "Right_To_Left"); +} is($charinfo->{decomposition}, undef); is(charprop($cp, 'dm'), "\x{590}"); is($charinfo->{decimal}, undef); @@ -174,7 +185,8 @@ is(charprop($cp, 'tc'), "\x{590}"); is($charinfo->{block}, undef); is(charprop($cp, 'block'), "Hebrew"); is($charinfo->{script}, undef); -is(charprop($cp, 'script'), "Unknown"); +is(charprop($cp, 'script'), $unknown_script) if $v_unicode_version gt +v3.0.1; # 0x05d0 is in the Hebrew block and used. @@ -210,8 +222,8 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "\x{5d0}"); is($charinfo->{block}, "Hebrew"); is(charprop($cp, 'block'), "Hebrew"); -is($charinfo->{script}, "Hebrew"); -is(charprop($cp, 'script'), "Hebrew"); +is($charinfo->{script}, "Hebrew") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hebrew") if $v_unicode_version gt v3.0.1; # An open syllable in Hangul. @@ -247,8 +259,8 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "\x{AC00}"); is($charinfo->{block}, "Hangul Syllables"); is(charprop($cp, 'block'), "Hangul_Syllables"); -is($charinfo->{script}, "Hangul"); -is(charprop($cp, 'script'), "Hangul"); +is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1; # A closed syllable in Hangul. @@ -284,85 +296,89 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "\x{AE00}"); is($charinfo->{block}, "Hangul Syllables"); is(charprop($cp, 'block'), "Hangul_Syllables"); -is($charinfo->{script}, "Hangul"); -is(charprop($cp, 'script'), "Hangul"); - -$cp = 0x1D400; -$charinfo = charinfo($cp); +is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1; + +if ($v_unicode_version gt v3.0.1) { + $cp = 0x1D400; + $charinfo = charinfo($cp); + + is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A"); + is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A"); + is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A"); + is($charinfo->{category}, "Lu"); + is(charprop($cp, 'gc'), "Uppercase_Letter"); + is($charinfo->{combining}, "0"); + is(charprop($cp, 'ccc'), "Not_Reordered"); + is($charinfo->{bidi}, "L"); + is(charprop($cp, 'bc'), "Left_To_Right"); + is($charinfo->{decomposition}, "<font> $A_code"); + is(charprop($cp, 'dm'), "A"); + is($charinfo->{decimal}, ""); + is($charinfo->{digit}, ""); + is($charinfo->{numeric}, ""); + is(charprop($cp, 'nv'), "NaN"); + is($charinfo->{mirrored}, "N"); + is(charprop($cp, 'bidim'), "No"); + is($charinfo->{unicode10}, ""); + is(charprop($cp, 'na1'), ""); + is($charinfo->{comment}, ""); + is(charprop($cp, 'isc'), ""); + is($charinfo->{upper}, ""); + is(charprop($cp, 'uc'), "\x{1D400}"); + is($charinfo->{lower}, ""); + is(charprop($cp, 'lc'), "\x{1D400}"); + is($charinfo->{title}, ""); + is(charprop($cp, 'tc'), "\x{1D400}"); + is($charinfo->{block}, "Mathematical Alphanumeric Symbols"); + is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols"); + is($charinfo->{script}, "Common"); + is(charprop($cp, 'script'), "Common"); +} -is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A"); -is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A"); -is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A"); -is($charinfo->{category}, "Lu"); -is(charprop($cp, 'gc'), "Uppercase_Letter"); -is($charinfo->{combining}, "0"); -is(charprop($cp, 'ccc'), "Not_Reordered"); -is($charinfo->{bidi}, "L"); -is(charprop($cp, 'bc'), "Left_To_Right"); -is($charinfo->{decomposition}, "<font> $A_code"); -is(charprop($cp, 'dm'), "A"); -is($charinfo->{decimal}, ""); -is($charinfo->{digit}, ""); -is($charinfo->{numeric}, ""); -is(charprop($cp, 'nv'), "NaN"); -is($charinfo->{mirrored}, "N"); -is(charprop($cp, 'bidim'), "No"); -is($charinfo->{unicode10}, ""); -is(charprop($cp, 'na1'), ""); -is($charinfo->{comment}, ""); -is(charprop($cp, 'isc'), ""); -is($charinfo->{upper}, ""); -is(charprop($cp, 'uc'), "\x{1D400}"); -is($charinfo->{lower}, ""); -is(charprop($cp, 'lc'), "\x{1D400}"); -is($charinfo->{title}, ""); -is(charprop($cp, 'tc'), "\x{1D400}"); -is($charinfo->{block}, "Mathematical Alphanumeric Symbols"); -is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols"); -is($charinfo->{script}, "Common"); -is(charprop($cp, 'script'), "Common"); - -$cp = 0x9FBA; #Bug 58428 -$charinfo = charinfo(0x9FBA); - -is($charinfo->{code}, "9FBA", "U+9FBA"); -is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA"); -is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA"); -is($charinfo->{category}, "Lo"); -is(charprop($cp, 'gc'), "Other_Letter"); -is($charinfo->{combining}, "0"); -is(charprop($cp, 'ccc'), "Not_Reordered"); -is($charinfo->{bidi}, "L"); -is(charprop($cp, 'bc'), "Left_To_Right"); -is($charinfo->{decomposition}, ""); -is(charprop($cp, 'dm'), "\x{9FBA}"); -is($charinfo->{decimal}, ""); -is($charinfo->{digit}, ""); -is($charinfo->{numeric}, ""); -is(charprop($cp, 'nv'), "NaN"); -is($charinfo->{mirrored}, "N"); -is(charprop($cp, 'bidim'), "No"); -is($charinfo->{unicode10}, ""); -is(charprop($cp, 'na1'), ""); -is($charinfo->{comment}, ""); -is(charprop($cp, 'isc'), ""); -is($charinfo->{upper}, ""); -is(charprop($cp, 'uc'), "\x{9FBA}"); -is($charinfo->{lower}, ""); -is(charprop($cp, 'lc'), "\x{9FBA}"); -is($charinfo->{title}, ""); -is(charprop($cp, 'tc'), "\x{9FBA}"); -is($charinfo->{block}, "CJK Unified Ideographs"); -is(charprop($cp, 'block'), "CJK_Unified_Ideographs"); -is($charinfo->{script}, "Han"); -is(charprop($cp, 'script'), "Han"); +if ($v_unicode_version ge v4.1.0) { + $cp = 0x9FBA; #Bug 58428 + $charinfo = charinfo(0x9FBA); + + is($charinfo->{code}, "9FBA", "U+9FBA"); + is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA"); + is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA"); + is($charinfo->{category}, "Lo"); + is(charprop($cp, 'gc'), "Other_Letter"); + is($charinfo->{combining}, "0"); + is(charprop($cp, 'ccc'), "Not_Reordered"); + is($charinfo->{bidi}, "L"); + is(charprop($cp, 'bc'), "Left_To_Right"); + is($charinfo->{decomposition}, ""); + is(charprop($cp, 'dm'), "\x{9FBA}"); + is($charinfo->{decimal}, ""); + is($charinfo->{digit}, ""); + is($charinfo->{numeric}, ""); + is(charprop($cp, 'nv'), "NaN"); + is($charinfo->{mirrored}, "N"); + is(charprop($cp, 'bidim'), "No"); + is($charinfo->{unicode10}, ""); + is(charprop($cp, 'na1'), ""); + is($charinfo->{comment}, ""); + is(charprop($cp, 'isc'), ""); + is($charinfo->{upper}, ""); + is(charprop($cp, 'uc'), "\x{9FBA}"); + is($charinfo->{lower}, ""); + is(charprop($cp, 'lc'), "\x{9FBA}"); + is($charinfo->{title}, ""); + is(charprop($cp, 'tc'), "\x{9FBA}"); + is($charinfo->{block}, "CJK Unified Ideographs"); + is(charprop($cp, 'block'), "CJK_Unified_Ideographs"); + is($charinfo->{script}, "Han"); + is(charprop($cp, 'script'), "Han"); +} use Unicode::UCD qw(charblock charscript); # 0x0590 is in the Hebrew block but unused. is(charblock(0x590), "Hebrew", "0x0590 - Hebrew unused charblock"); -is(charscript(0x590), "Unknown", "0x0590 - Hebrew unused charscript"); +is(charscript(0x590), $unknown_script, "0x0590 - Hebrew unused charscript") if $v_unicode_version gt v3.0.1; is(charblock(0x1FFFF), "No_Block", "0x1FFFF - unused charblock"); my $fraction_3_4_code = sprintf("%04X", utf8::unicode_to_native(0xbe)); @@ -401,8 +417,8 @@ is($charinfo->{title}, ""); is(charprop($cp, 'tc'), chr hex $cp); is($charinfo->{block}, "Latin-1 Supplement"); is(charprop($cp, 'block'), "Latin_1_Supplement"); -is($charinfo->{script}, "Common"); -is(charprop($cp, 'script'), "Common"); +is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Common") if $v_unicode_version gt v3.0.1; # This is to test a case where both simple and full lowercases exist and # differ @@ -435,13 +451,13 @@ is(charprop($cp, 'isc'), ""); is($charinfo->{upper}, ""); is(charprop($cp, 'uc'), "\x{130}"); is($charinfo->{lower}, $i_code); -is(charprop($cp, 'lc'), "i\x{307}"); +is(charprop($cp, 'lc'), "i\x{307}") if $v_unicode_version ge v3.2.0; is($charinfo->{title}, ""); is(charprop($cp, 'tc'), "\x{130}"); is($charinfo->{block}, "Latin Extended-A"); is(charprop($cp, 'block'), "Latin_Extended_A"); -is($charinfo->{script}, "Latin"); -is(charprop($cp, 'script'), "Latin"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; # This is to test a case where both simple and full uppercases exist and # differ @@ -478,19 +494,23 @@ is($charinfo->{title}, "1F88"); is(charprop($cp, "tc"), "\x{1F88}"); is($charinfo->{block}, "Greek Extended"); is(charprop($cp, "block"), "Greek_Extended"); -is($charinfo->{script}, "Greek"); -is(charprop($cp, "script"), "Greek"); +is($charinfo->{script}, "Greek") if $v_unicode_version gt v3.0.1; +is(charprop($cp, "script"), "Greek") if $v_unicode_version gt v3.0.1; is(charprop(ord("A"), "foo"), undef, "Verify charprop of unknown property returns <undef>"); # These were created from inspection of the code to exercise the branches -is(charprop(ord("("), "bpb"), ")", +if ($v_unicode_version ge v6.3.0) { + is(charprop(ord("("), "bpb"), ")", "Verify charprop figures out that s-type properties can be char"); +} is(charprop(ord("9"), "nv"), 9, "Verify charprop can adjust an ar-type property"); -is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "", +if ($v_unicode_version ge v5.2.0) { + is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "", "Verify charprop can handle an \"\" in ae-type property"); +} my $mark_props_ref = charprops_all(0x300); is($mark_props_ref->{'Bidi_Class'}, "Nonspacing_Mark", @@ -499,9 +519,13 @@ is($mark_props_ref->{'Bidi_Mirrored'}, "No"); is($mark_props_ref->{'Canonical_Combining_Class'}, "Above"); is($mark_props_ref->{'Case_Folding'}, "\x{300}"); is($mark_props_ref->{'Decomposition_Mapping'}, "\x{300}"); -is($mark_props_ref->{'Decomposition_Type'}, "None"); +is($mark_props_ref->{'Decomposition_Type'}, ($v_unicode_version le v4.0.0) + ? "none" + : "None"); is($mark_props_ref->{'General_Category'}, "Nonspacing_Mark"); -is($mark_props_ref->{'ISO_Comment'}, ""); +if ($v_unicode_version gt v5.1.0) { + is($mark_props_ref->{'ISO_Comment'}, ""); +} is($mark_props_ref->{'Lowercase_Mapping'}, "\x{300}"); is($mark_props_ref->{'Name'}, "COMBINING GRAVE ACCENT"); is($mark_props_ref->{'Numeric_Type'}, "None"); @@ -522,36 +546,40 @@ ok(exists $charblocks->{Thai}, 'Thai charblock exists'); is($charblocks->{Thai}->[0]->[0], hex('0e00')); ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist'); -my $charscripts = charscripts(); +if ($v_unicode_version gt v3.0.1) { + my $charscripts = charscripts(); -ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); -is($charscripts->{Armenian}->[0]->[0], hex('0531')); -ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); + ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); + is($charscripts->{Armenian}->[0]->[0], hex('0531')); + ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); -my $charscript; + my $charscript; -$charscript = charscript("12ab"); -is($charscript, 'Ethiopic', 'Ethiopic charscript'); + $charscript = charscript("12ab"); + is($charscript, 'Ethiopic', 'Ethiopic charscript'); -$charscript = charscript("0x12ab"); -is($charscript, 'Ethiopic'); + $charscript = charscript("0x12ab"); + is($charscript, 'Ethiopic'); -$charscript = charscript("U+12ab"); -is($charscript, 'Ethiopic'); + $charscript = charscript("U+12ab"); + is($charscript, 'Ethiopic'); -my $ranges; + my $ranges; -$ranges = charscript('Ogham'); -is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); -is($ranges->[0]->[1], hex('169C')); + if ($v_unicode_version gt v4.0.0) { + $ranges = charscript('Ogham'); + is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); + is($ranges->[0]->[1], hex('169C')); + } -use Unicode::UCD qw(charinrange); + use Unicode::UCD qw(charinrange); -$ranges = charscript('Cherokee'); -ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); -ok( charinrange($ranges, "13a0")); -ok( charinrange($ranges, "13f4")); -ok(!charinrange($ranges, "13ff")); + $ranges = charscript('Cherokee'); + ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); + ok( charinrange($ranges, "13a0")); + ok( charinrange($ranges, "13f4")); + ok(!charinrange($ranges, "13ff")); +} use Unicode::UCD qw(general_categories); @@ -571,7 +599,8 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); # If this fails, then maybe one should look at the Unicode changes to see # what else might need to be updated. -is(Unicode::UCD::UnicodeVersion, '8.0.0', 'UnicodeVersion'); +ok($current_version le $expected_version, + "Verify there isn't a new Unicode version to upgrade to"); use Unicode::UCD qw(compexcl); @@ -593,66 +622,93 @@ is($casefold->{full}, $a_code, 'casefold native(0x41) full'); is($casefold->{simple}, $a_code, 'casefold native(0x41) simple'); is($casefold->{turkic}, "", 'casefold native(0x41) turkic'); -$casefold = casefold(utf8::unicode_to_native(0xdf)); my $sharp_s_code = sprintf("%04X", utf8::unicode_to_native(0xdf)); my $S_code = sprintf("%04X", ord "S"); my $s_code = sprintf("%04X", ord "s"); -is($casefold->{code}, $sharp_s_code, 'casefold native(0xDF) code'); -is($casefold->{status}, 'F', 'casefold native(0xDF) status'); -is($casefold->{mapping}, "$s_code $s_code", 'casefold native(0xDF) mapping'); -is($casefold->{full}, "$s_code $s_code", 'casefold native(0xDF) full'); -is($casefold->{simple}, "", 'casefold native(0xDF) simple'); -is($casefold->{turkic}, "", 'casefold native(0xDF) turkic'); - -# Do different tests depending on if version < 3.2, or not. -my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion(); -if ($v_unicode_version lt v3.2.0) { - $casefold = casefold(0x130); - - is($casefold->{code}, '0130', 'casefold 0x130 code'); - is($casefold->{status}, 'I' , 'casefold 0x130 status'); - is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping'); - is($casefold->{full}, $i_code, 'casefold 0x130 full'); - is($casefold->{simple}, $i_code, 'casefold 0x130 simple'); - is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); - - $casefold = casefold(0x131); - - is($casefold->{code}, '0131', 'casefold 0x131 code'); - is($casefold->{status}, 'I' , 'casefold 0x131 status'); - is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping'); - is($casefold->{full}, $i_code, 'casefold 0x131 full'); - is($casefold->{simple}, $i_code, 'casefold 0x131 simple'); - is($casefold->{turkic}, $i_code, 'casefold 0x131 turkic'); -} else { - $casefold = casefold(utf8::unicode_to_native(0x49)); - - is($casefold->{code}, $I_code, 'casefold native(0x49) code'); - is($casefold->{status}, 'C' , 'casefold native(0x49) status'); - is($casefold->{mapping}, $i_code, 'casefold native(0x49) mapping'); - is($casefold->{full}, $i_code, 'casefold native(0x49) full'); - is($casefold->{simple}, $i_code, 'casefold native(0x49) simple'); - is($casefold->{turkic}, "0131", 'casefold native(0x49) turkic'); - - $casefold = casefold(0x130); - - is($casefold->{code}, '0130', 'casefold 0x130 code'); - is($casefold->{status}, 'F' , 'casefold 0x130 status'); - is($casefold->{mapping}, "$i_code 0307", 'casefold 0x130 mapping'); - is($casefold->{full}, "$i_code 0307", 'casefold 0x130 full'); - is($casefold->{simple}, "", 'casefold 0x130 simple'); - is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); -} +if ($v_unicode_version gt v3.0.0) { # These special ones don't work on early + # perls + $casefold = casefold(utf8::unicode_to_native(0xdf)); + + is($casefold->{code}, $sharp_s_code, 'casefold native(0xDF) code'); + is($casefold->{status}, 'F', 'casefold native(0xDF) status'); + is($casefold->{mapping}, "$s_code $s_code", 'casefold native(0xDF) mapping'); + is($casefold->{full}, "$s_code $s_code", 'casefold native(0xDF) full'); + is($casefold->{simple}, "", 'casefold native(0xDF) simple'); + is($casefold->{turkic}, "", 'casefold native(0xDF) turkic'); + + # Do different tests depending on if version < 3.2, or not. + if ($v_unicode_version eq v3.0.1) { + # In this release, there was no special Turkic values. + # Both 0x130 and 0x131 folded to 'i'. + + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'C' , 'casefold 0x130 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x130 full'); + is($casefold->{simple}, $i_code, 'casefold 0x130 simple'); + is($casefold->{turkic}, "", 'casefold 0x130 turkic'); + + $casefold = casefold(0x131); + + is($casefold->{code}, '0131', 'casefold 0x131 code'); + is($casefold->{status}, 'C' , 'casefold 0x131 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x131 full'); + is($casefold->{simple}, $i_code, 'casefold 0x131 simple'); + is($casefold->{turkic}, "", 'casefold 0x131 turkic'); + } + elsif ($v_unicode_version lt v3.2.0) { + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'I' , 'casefold 0x130 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x130 full'); + is($casefold->{simple}, $i_code, 'casefold 0x130 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); + + $casefold = casefold(0x131); + + is($casefold->{code}, '0131', 'casefold 0x131 code'); + is($casefold->{status}, 'I' , 'casefold 0x131 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x131 full'); + is($casefold->{simple}, $i_code, 'casefold 0x131 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x131 turkic'); + } else { + $casefold = casefold(utf8::unicode_to_native(0x49)); + + is($casefold->{code}, $I_code, 'casefold native(0x49) code'); + is($casefold->{status}, 'C' , 'casefold native(0x49) status'); + is($casefold->{mapping}, $i_code, 'casefold native(0x49) mapping'); + is($casefold->{full}, $i_code, 'casefold native(0x49) full'); + is($casefold->{simple}, $i_code, 'casefold native(0x49) simple'); + is($casefold->{turkic}, "0131", 'casefold native(0x49) turkic'); + + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'F' , 'casefold 0x130 status'); + is($casefold->{mapping}, "$i_code 0307", 'casefold 0x130 mapping'); + is($casefold->{full}, "$i_code 0307", 'casefold 0x130 full'); + is($casefold->{simple}, "", 'casefold 0x130 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); + } -$casefold = casefold(0x1F88); + if ($v_unicode_version gt v3.0.1) { + $casefold = casefold(0x1F88); -is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); -is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); -is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); -is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); -is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); -is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); + is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); + is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); + is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); + is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); + is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); + is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); + } +} ok(!casefold(utf8::unicode_to_native(0x20))); @@ -672,12 +728,16 @@ ok($casespec->{code} eq $sharp_s_code && $casespec = casespec(0x307); -ok($casespec->{az}->{code} eq '0307' && - !defined $casespec->{az}->{lower} && - $casespec->{az}->{title} eq '0307' && - $casespec->{az}->{upper} eq '0307' && - $casespec->{az}->{condition} eq 'az After_I', - 'casespec 0x307'); +if ($v_unicode_version gt v3.1.0) { + ok($casespec->{az}->{code} eq '0307' + && !defined $casespec->{az}->{lower} + && $casespec->{az}->{title} eq '0307' + && $casespec->{az}->{upper} eq '0307' + && $casespec->{az}->{condition} eq ($v_unicode_version le v3.2) + ? 'az After_Soft_Dotted' + : 'az After_I', + 'casespec 0x307'); +} # perl #7305 UnicodeCD::compexcl is weird @@ -699,11 +759,15 @@ is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)"); is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)"); is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); +SKIP: { + skip("Script property not in this release", 3) if $v_unicode_version lt v3.1.0; my $r1 = charscript('Latin'); if (ok(defined $r1, "Found Latin script")) { + skip("Latin range count will be wrong when using older Unicode release", + 2) if $v_unicode_version lt $expected_version; my $n1 = @$r1; - is($n1, 31, "number of ranges in Latin script (Unicode 7.0.0)") if $::IS_ASCII; + is($n1, 31, "number of ranges in Latin script (Unicode $expected_version)") if $::IS_ASCII; shift @$r1 while @$r1; my $r2 = charscript('Latin'); is(@$r2, $n1, "modifying results should not mess up internal caches"); @@ -714,38 +778,72 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD"); } -use Unicode::UCD qw(namedseq); - -is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); -is(namedseq("KATAKANA LETTER AINU Q"), undef); -is(namedseq(), undef); -is(namedseq(qw(foo bar)), undef); -my @ns = namedseq("KATAKANA LETTER AINU P"); -is(scalar @ns, 2); -is($ns[0], 0x31F7); -is($ns[1], 0x309A); -my %ns = namedseq(); -is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); -@ns = namedseq(42); -is(@ns, 0); +if ($v_unicode_version ge v4.1.0) { + use Unicode::UCD qw(namedseq); + + is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); + is(namedseq("KATAKANA LETTER AINU Q"), undef); + is(namedseq(), undef); + is(namedseq(qw(foo bar)), undef); + my @ns = namedseq("KATAKANA LETTER AINU P"); + is(scalar @ns, 2); + is($ns[0], 0x31F7); + is($ns[1], 0x309A); + my %ns = namedseq(); + is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); + @ns = namedseq(42); + is(@ns, 0); +} use Unicode::UCD qw(num); -use charnames ":full"; +use charnames (); # Don't use \N{} on things not in original Unicode + # version; else will get a compilation error when this .t + # is run on an older version. is(num("0"), 0, 'Verify num("0") == 0'); is(num("98765"), 98765, 'Verify num("98765") == 98765'); -ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); -is(num("\N{NEW TAI LUE DIGIT TWO}"), 2, 'Verify num("\N{NEW TAI LUE DIGIT TWO}") == 2'); -is(num("\N{NEW TAI LUE DIGIT ONE}"), 1, 'Verify num("\N{NEW TAI LUE DIGIT ONE}") == 1'); -is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); -ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); -is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); -ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); +ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), + 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); +my $tai_lue_2; +if ($v_unicode_version ge v4.1.0) { + my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE"); + $tai_lue_2 = charnames::string_vianame("NEW TAI LUE DIGIT TWO"); + is(num($tai_lue_2), 2, 'Verify num("\N{NEW TAI LUE DIGIT TWO}") == 2'); + is(num($tai_lue_1), 1, 'Verify num("\N{NEW TAI LUE DIGIT ONE}") == 1'); + is(num($tai_lue_2 . $tai_lue_1), 21, + 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); +} +if ($v_unicode_version ge v5.2.0) { + ok(! defined num($tai_lue_2 + . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")), + 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); +} +if ($v_unicode_version ge v5.1.0) { + my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO"); + is(num($cham_0 . charnames::string_vianame("CHAM DIGIT THREE")), 3, + 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); + if ($v_unicode_version ge v5.2.0) { + ok(! defined num( $cham_0 + . charnames::string_vianame("JAVANESE DIGIT NINE")), + 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); + } +} is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); -is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); -is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); -is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); -is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000'); +if ($v_unicode_version ge v3.0.0) { + is(num(charnames::string_vianame("ETHIOPIC NUMBER TEN THOUSAND")), 10000, + 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); +} +if ($v_unicode_version ge v5.2.0) { + is(num(charnames::string_vianame("NORTH INDIC FRACTION ONE HALF")), + .5, + 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); + is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); +} +if ($v_unicode_version gt v3.2.0) { # Is missing from non-Unihan files before + # this + is(num("\N{U+5146}"), 1000000000000, + 'Verify num("\N{U+5146}") == 1000000000000'); +} # Create a user-defined property sub InKana {<<'END'} @@ -979,6 +1077,12 @@ while (<$propvalues>) { my @fields = split /\s*;\s*/; # Fields are separated by semi-colons my $prop = shift @fields; # 0th field is the property, + # 'qc' is short in early versions of the file for any of the quick check + # properties. Choose one of them. + if ($prop eq 'qc' && $v_unicode_version le v4.0.0) { + $prop = "NFKC_QC"; + } + # When changing properties, we examine the accumulated values for the old # one to see if our function that returns them matches. if ($prev_prop ne $prop) { @@ -986,6 +1090,11 @@ while (<$propvalues>) { my @ucd_function_values = prop_values($prev_prop); @ucd_function_values = () unless @ucd_function_values; + # The file didn't include strictly numeric values until after this + if ($prev_prop eq 'ccc' && $v_unicode_version le v6.0.0) { + @ucd_function_values = grep { /\D/ } @ucd_function_values; + } + # This perl extension doesn't appear in the official file push @this_prop_values, "Non_Canon" if $prev_prop eq 'dt'; @@ -1008,6 +1117,12 @@ while (<$propvalues>) { # characters that are ignored under loose matching to test that my $mod_prop = "$extra_chars$prop"; + if ($prop eq 'blk' && $v_unicode_version le v5.0.0) { + foreach my $element (@fields) { + $element =~ s/-/_/g; + } + } + if ($fields[0] eq 'n/a') { # See comments in input file, essentially # means full name and short name are identical $fields[0] = $fields[1]; @@ -1190,36 +1305,39 @@ if ($::IS_ASCII) { # On EBCDIC, other things will come first, and can vary $prop = "lc"; ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); - is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); + my $lc_format = ($v_unicode_version ge v3.2.0) ? 'al' : 'a'; + is($format, $lc_format, "prop_invmap() format of '$prop' is '$lc_format"); is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); } # This property is stable and small, so can test all of it -$prop = "ASCII_Hex_Digit"; -($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 's', "prop_invmap() format of '$prop' is 's'"); -is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); -if ($::IS_ASCII) { - is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, - 0x0041, 0x0047, - 0x0061, 0x0067, 0x110000 - ], - "prop_invmap('$prop') code point list is correct"); -} -elsif ($::IS_EBCDIC) { - is_deeply($invlist_ref, [ - utf8::unicode_to_native(0x0000), - utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, - utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, - utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, - utf8::unicode_to_native(0x110000) - ], - "prop_invmap('$prop') code point list is correct"); +if ($v_unicode_version gt v3.1.0) { + $prop = "ASCII_Hex_Digit"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + is($format, 's', "prop_invmap() format of '$prop' is 's'"); + is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); + if ($::IS_ASCII) { + is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, + 0x0041, 0x0047, + 0x0061, 0x0067, 0x110000 + ], + "prop_invmap('$prop') code point list is correct"); + } + elsif ($::IS_EBCDIC) { + is_deeply($invlist_ref, [ + utf8::unicode_to_native(0x0000), + utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, + utf8::unicode_to_native(0x110000) + ], + "prop_invmap('$prop') code point list is correct"); + } + is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , + "prop_invmap('$prop') map list is correct"); } -is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , - "prop_invmap('$prop') map list is correct"); is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef"); is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef"); @@ -1240,36 +1358,38 @@ is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns u # are there in the files. As a small hedge against that, test some # prop_invlist() tables fully with the known correct result. We choose # ASCII_Hex_Digit again, as it is stable. -if ($::IS_ASCII) { - @invlist = prop_invlist("AHex"); - is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, - 0x0047, 0x0061, 0x0067 ], - "prop_invlist('AHex') is exactly the expected set of points"); - @invlist = prop_invlist("AHex=f"); - is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, - 0x0047, 0x0061, 0x0067 ], - "prop_invlist('AHex=f') is exactly the expected set of points"); -} -elsif ($::IS_EBCDIC) { # Relies on the ranges 0-9, a-f, and A-F each being - # contiguous - @invlist = prop_invlist("AHex"); - is_deeply(\@invlist, [ - utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, - utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, - utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, - ], - "prop_invlist('AHex') is exactly the expected set of points"); - @invlist = prop_invlist("AHex=f"); - is_deeply(\@invlist, [ - utf8::unicode_to_native(0x0000), - utf8::unicode_to_native(0x0061), - utf8::unicode_to_native(0x0066) + 1, - utf8::unicode_to_native(0x0041), - utf8::unicode_to_native(0x0046) + 1, - utf8::unicode_to_native(0x0030), - utf8::unicode_to_native(0x0039) + 1, - ], - "prop_invlist('AHex=f') is exactly the expected set of points"); +if ($v_unicode_version gt v3.1.0) { + if ($::IS_ASCII) { + @invlist = prop_invlist("AHex"); + is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, + 0x0047, 0x0061, 0x0067 ], + "prop_invlist('AHex') is exactly the expected set of points"); + @invlist = prop_invlist("AHex=f"); + is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, + 0x0047, 0x0061, 0x0067 ], + "prop_invlist('AHex=f') is exactly the expected set of points"); + } + elsif ($::IS_EBCDIC) { # Relies on the ranges 0-9, a-f, and A-F each being + # contiguous + @invlist = prop_invlist("AHex"); + is_deeply(\@invlist, [ + utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, + ], + "prop_invlist('AHex') is exactly the expected set of points"); + @invlist = prop_invlist("AHex=f"); + is_deeply(\@invlist, [ + utf8::unicode_to_native(0x0000), + utf8::unicode_to_native(0x0061), + utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), + utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), + utf8::unicode_to_native(0x0039) + 1, + ], + "prop_invlist('AHex=f') is exactly the expected set of points"); + } } sub fail_with_diff ($$$$) { @@ -1547,7 +1667,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { fail("prop_invmap('$prop')"); diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap"); } - next PROPERTY; + next PROPERTY; } } @@ -1555,6 +1675,14 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # normalized version. $name = &utf8::_loose_name(lc $name); + # In the case of a combination property, both a map table and a match + # table are generated. For all the tests except prop_invmap(), this is + # irrelevant, but for prop_invmap, having an 'is' prefix forces it to + # return the match table; otherwise the map. We thus need to distinguish + # between the two forms. The property name is what has this information. + $name = &utf8::_loose_name(lc $prop) + if exists $Unicode::UCD::combination_property{$name}; + # Add in the characters that are supposed to be ignored to test loose # matching, which the tested function applies to all properties $display_prop = "$extra_chars$prop" unless $display_prop; @@ -2309,7 +2437,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # And remove the aliases. We read in the Name_Alias property, and go # through them one by one. my ($aliases_code_points, $aliases_maps, undef, undef) - = &prop_invmap('Name_Alias'); + = &prop_invmap('_Perl_Name_Alias', '_perl_core_internal_ok'); for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; @@ -2534,11 +2662,13 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # A few tests of search_invlist use Unicode::UCD qw(search_invlist); -my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); -my $index = search_invlist($scripts_ranges_ref, 0x390); -is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); -my @alpha_invlist = prop_invlist("Alpha"); -is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); +if ($v_unicode_version ge v3.1.0) { # No Script property before this + my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); + my $index = search_invlist($scripts_ranges_ref, 0x390); + is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); + my @alpha_invlist = prop_invlist("Alpha"); + is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); +} ok($/ eq $input_record_separator, "The record separator didn't get overridden"); diff --git a/lib/locale.t b/lib/locale.t index 1ebd0ce39c..1b510d25e6 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -848,7 +848,8 @@ sub disp_str ($) { } else { $result .= " " unless $prev_was_punct; - $result .= charnames::viacode(ord $char); + my $name = charnames::viacode(ord $char); + $result .= (defined $name) ? $name : ':unknown:'; $prev_was_punct = 0; } } diff --git a/lib/unicore/README.perl b/lib/unicore/README.perl index ef5fec6295..f892334d45 100644 --- a/lib/unicore/README.perl +++ b/lib/unicore/README.perl @@ -1,15 +1,12 @@ # The goal is for perl to compile and reasonably run any version of Unicode. -# But in v5.22, the earliest version that this works for is Unicode 5.1. # Working reasonably well doesn't mean that the test suite will run without -# showing errors. You may be able to compile an earlier version, and get -# things to sort-of work. A few of the very-Unicode specific test files have been +# showing errors. A few of the very-Unicode specific test files have been # modified to account for different versions, but most have not. For example, # some tests use characters that aren't encoded in all Unicode versions; others # have hard-coded the General Categories for a code point that were correct at # the time the test was written. Perl itself will not compile under Unicode # releases prior to 3.0 without a simple change to Unicode::Normalize. -# mktables contains instructions for this, as well as other hints for using -# older Unicode versions. +# mktables contains instructions for this. # The *.txt files were copied from @@ -17,14 +14,20 @@ # (which always points to the latest version) with subdirectories 'extracted' and # 'auxiliary'. Older versions are located under Public with an appropriate name. +# They are also available via http at www.unicode.org/versions/ +# # The Unihan files were not included due to space considerations. Also NOT -# included were any *.html files. It is possible to add the Unihan files, and -# edit mktables (see instructions near its beginning) to look at them. +# included were any *.html files. It is possible to add the Unihan files and +# have some properties from them automatically compiled. By editing mktables +# (see instructions near its beginning) you can add other Unihan properties. # The file named 'version' should exist and be a single line with the Unicode # version, like: +# # 5.2.0 +# +# (without the initial '# ') # To be 8.3 filesystem friendly, the names of some of the input files have been # changed from the values that are in the Unicode DB. Not all of the Test @@ -58,6 +61,8 @@ mv extracted/DerivedJoiningType.txt extracted/DJoinType.txt mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt mv extracted/DerivedNumericType.txt extracted/DNumType.txt mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt +rmdir extracted 2>/dev/null # Will fail if non-empty, but if it is empty + # was an early release that didn't have it. mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt @@ -81,14 +86,31 @@ mv Unihan_Variants.txt UnihanVariants.txt # filesystems. # mktables is used to generate the tables used by the rest of Perl. It will -# warn you about any *.txt files in the directory substructure that it doesn't -# know about. You should remove any so-identified, or edit mktables to add -# them to its lists to process. You can run +# warn you about any *.txt and *.html files in the directory substructure that +# it doesn't know about. You should remove any so-identified, or edit mktables +# to add them to its lists to process. You can run # # mktables -globlist # -#to have it try to process these tables generically. -# +# to have it try to process these tables generically. + +# COMPILING ON OLDER UNICODE VERSIONS +# +# To compile perl for use with an older Unicode release, delete everything in +# the lib/unicore directory except mktables and Makefile. Then download the +# Unicode-supplied files for the desired version to that directory (A url for +# these is given earlier in this file). Then create the 'version' file with a +# single line, like '6.1.0'. Do a 'make test' from the project level. You +# will get some porting errors for needing to regen. Regenerate what it tells +# you are needed, and make test again. If you compile an old enough version, +# you will also have to download a few files from later Unicode versions, +# following the instructions that will be given if warranted. It should +# compile in any release without warnings, except for some casing conflicts +# in Unicode 2.1.8, and some extraneous files will show up in very early +# releases of the form qr/diff.*\.txt/. If you add Unihan.txt, one line is in error in +# +# Other glitches are noted in mktables under 'UNICODE VERSIONS NOTES' + # FOR PUMPKINS # # The files are inter-related. If you take the latest UnicodeData.txt, for diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 572c299038..27cb45ab9d 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -4,17 +4,9 @@ # Any files created or read by this program should be listed in 'mktables.lst' # Use -makelist to regenerate it. -# Needs 'no overloading' to run faster on miniperl. Code commented out at the -# subroutine objaddr can be used instead to work as far back (untested) as -# 5.8: needs pack "U". But almost all occurrences of objaddr have been -# removed in favor of using 'no overloading'. You also would have to go -# through and replace occurrences like: -# my $addr = do { no overloading; pack 'J', $self; } -# with -# my $addr = main::objaddr $self; -# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b -# that instituted the change to main::objaddr, and subsequent commits that -# changed 0+$self to pack 'J', $self.) +# There was an attempt when this was first rewritten to make it 5.8 +# compatible, but that has now been abandoned, and newer constructs are used +# as convenient. my $start_time; BEGIN { # Get the time the script started running; do it at compilation to @@ -32,6 +24,7 @@ use File::Path; use File::Spec; use Text::Tabs; use re "/aa"; +use feature 'state'; sub DEBUG () { 0 } # Set to 0 for production; 1 for development my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; @@ -292,8 +285,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # As mentioned earlier, some properties are given in more than one file. In # particular, the files in the extracted directory are supposedly just # reformattings of the others. But they contain information not easily -# derivable from the other files, including results for Unihan, which this -# program doesn't ordinarily look at, and for unassigned code points. They +# derivable from the other files, including results for Unihan (which isn't +# usually available to this program) and for unassigned code points. They # also have historically had errors or been incomplete. In an attempt to # create the best possible data, this program thus processes them first to # glean information missing from the other files; then processes those other @@ -411,24 +404,19 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # A NOTE ON UNIHAN # -# This program can generate tables from the Unihan database. But it doesn't -# by default, letting the CPAN module Unicode::Unihan handle them. Prior to -# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the -# database was split into 8 different files, all beginning with the letters -# 'Unihan'. This program will read those file(s) if present, but it needs to -# know which of the many properties in the file(s) should have tables created -# for them. It will create tables for any properties listed in -# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the -# @cjk_properties array and the @cjk_property_values array. Thus, if a -# property you want is not in those files of the release you are building -# against, you must add it to those two arrays. Starting in 4.0, the -# Unicode_Radical_Stroke was listed in those files, so if the Unihan database -# is present in the directory, a table will be generated for that property. -# In 5.2, several more properties were added. For your convenience, the two -# arrays are initialized with all the 6.0 listed properties that are also in -# earlier releases. But these are commented out. You can just uncomment the -# ones you want, or use them as a template for adding entries for other -# properties. +# This program can generate tables from the Unihan database. But that db +# isn't normally available, so it is marked as optional. Prior to version +# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database +# was split into 8 different files, all beginning with the letters 'Unihan'. +# If you plunk those files down into the directory mktables ($0) is in, this +# program will read them and automatically create tables for the properties +# from it that are listed in PropertyAliases.txt and PropValueAliases.txt, +# plus any you add to the @cjk_properties array and the @cjk_property_values +# array, being sure to add necessary '# @missings' lines to the latter. For +# Unicode versions earlier than 5.2, most of the Unihan properties are not +# listed at all in PropertyAliases nor PropValueAliases. This program assumes +# for these early releases that you want the properties that are specified in +# the 5.2 release. # # You may need to adjust the entries to suit your purposes. setup_unihan(), # and filter_unihan_line() are the functions where this is done. This program @@ -437,8 +425,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # There is a bug in the 3.2 data file in which some values for the # kPrimaryNumeric property have commas and an unexpected comment. A filter -# could be added for these; or for a particular installation, the Unihan.txt -# file could be edited to fix them. +# could be added to correct these; or for a particular installation, the +# Unihan.txt file could be edited to fix them. # # HOW TO ADD A FILE TO BE PROCESSED # @@ -484,13 +472,13 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # handled by Unicode::Normalize, nor will it compile when presented a version # that has them. However, you can trivially get it to compile by simply # ignoring those decompositions, by changing the croak to a carp. At the time -# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads +# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or +# cpan/Unicode-Normalize/mkheader) reads # # croak("Weird Canonical Decomposition of U+$h"); # # Simply comment it out. It will compile, but will not know about any three -# character decompositions. If using the .pm version, there is a similar -# line. +# character decompositions. # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out # that the reason is that the CJK block starting at 4E00 was removed from @@ -513,10 +501,13 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # name for the class, it would not have been affected, but if it used the # mnemonic, it would have been. # -# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code +# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code # points which eventually came to have this script property value, instead # mapped to "Unknown". But in the next release all these code points were # moved to \p{sc=common} instead. + +# The tests furnished by Unicode for testing WordBreak and SentenceBreak +# generate errors in 5.0 and earlier. # # The default for missing code points for BidiClass is complicated. Starting # in 3.1.1, the derived file DBidiClass.txt handles this, but this program @@ -596,8 +587,8 @@ our $to_trace = 0; || $caller_name eq 'trace'); my $output = ""; + #print STDERR __LINE__, ": ", join ", ", @input, "\n"; foreach my $string (@input) { - #print STDERR __LINE__, ": ", join ", ", @input, "\n"; if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { $output .= simple_dumper($string); } @@ -623,10 +614,11 @@ our $to_trace = 0; # This is for a rarely used development feature that allows you to compare two # versions of the Unicode standard without having to deal with changes caused -# by the code points introduced in the later version. Change the 0 to a -# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only -# code points introduced in that release and earlier will be used; later ones -# are thrown away. You use the version number of the earliest one you want to +# by the code points introduced in the later version. You probably also want +# to use the -annotate option when using this. Change the 0 to a string +# containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only code +# points introduced in that release and earlier will be used; later ones are +# thrown away. You use the version number of the earliest one you want to # compare; then run this program on directory structures containing each # release, and compare the outputs. These outputs will therefore include only # the code points common to both releases, and you can see the changes caused @@ -861,33 +853,8 @@ if ($v_version gt v3.2.0) { 'Canonical_Combining_Class=Attached_Below_Left' } -# These are listed in the Property aliases file in 6.0, but Unihan is ignored -# unless explicitly added. -if ($v_version ge v5.2.0) { - my $unihan = 'Unihan; remove from list if using Unihan'; - foreach my $table (qw ( - kAccountingNumeric - kOtherNumeric - kPrimaryNumeric - kCompatibilityVariant - kIICore - kIRG_GSource - kIRG_HSource - kIRG_JSource - kIRG_KPSource - kIRG_MSource - kIRG_KSource - kIRG_TSource - kIRG_USource - kIRG_VSource - kRSUnicode - )) - { - $why_suppress_if_empty_warn_if_not{$table} = $unihan; - } -} - -# Enum values for to_output_map() method in the Map_Table package. +# Enum values for to_output_map() method in the Map_Table package. (0 is don't +# output) my $EXTERNAL_MAP = 1; my $INTERNAL_MAP = 2; my $OUTPUT_ADJUSTED = 3; @@ -913,13 +880,6 @@ my %global_to_output_map = ( Decomposition_Type => 0, ); -# Properties that this program ignores. -my @unimplemented_properties; - -# With this release, it is automatically handled if the Unihan db is -# downloaded -push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0; - # There are several types of obsolete properties defined by Unicode. These # must be hand-edited for every new Unicode release. my %why_deprecated; # Generates a deprecated warning message if used. @@ -959,8 +919,6 @@ my %why_obsolete; # Documentation only # existence is not noted in the comment. 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', - 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional", - # Don't suppress ISO_Comment, as otherwise special handling is needed # to differentiate between it and gc=c, which can be written as 'isc', # which is the same characters as ISO_Comment's short name. @@ -1046,45 +1004,13 @@ if ($v_version ge v6.0.0) { my @output_mapped_properties = split "\n", <<END; END -# If you are using the Unihan database in a Unicode version before 5.2, you -# need to add the properties that you want to extract from it to this table. -# For your convenience, the properties in the 6.0 PropertyAliases.txt file are -# listed, commented out +# If you want more Unihan properties than the default, you need to add them to +# these arrays. Depending on the property type, @missing lines might have to +# be added to the second array. A sample entry would be (including the '#'): +# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN my @cjk_properties = split "\n", <<'END'; -#cjkAccountingNumeric; kAccountingNumeric -#cjkOtherNumeric; kOtherNumeric -#cjkPrimaryNumeric; kPrimaryNumeric -#cjkCompatibilityVariant; kCompatibilityVariant -#cjkIICore ; kIICore -#cjkIRG_GSource; kIRG_GSource -#cjkIRG_HSource; kIRG_HSource -#cjkIRG_JSource; kIRG_JSource -#cjkIRG_KPSource; kIRG_KPSource -#cjkIRG_KSource; kIRG_KSource -#cjkIRG_TSource; kIRG_TSource -#cjkIRG_USource; kIRG_USource -#cjkIRG_VSource; kIRG_VSource -#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS END - -# Similarly for the property values. For your convenience, the lines in the -# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both -# '#' marks (for Unicode versions before 5.2) my @cjk_property_values = split "\n", <<'END'; -## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN -## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> -## @missing: 0000..10FFFF; cjkIICore; <none> -## @missing: 0000..10FFFF; cjkIRG_GSource; <none> -## @missing: 0000..10FFFF; cjkIRG_HSource; <none> -## @missing: 0000..10FFFF; cjkIRG_JSource; <none> -## @missing: 0000..10FFFF; cjkIRG_KPSource; <none> -## @missing: 0000..10FFFF; cjkIRG_KSource; <none> -## @missing: 0000..10FFFF; cjkIRG_TSource; <none> -## @missing: 0000..10FFFF; cjkIRG_USource; <none> -## @missing: 0000..10FFFF; cjkIRG_VSource; <none> -## @missing: 0000..10FFFF; cjkOtherNumeric; NaN -## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN -## @missing: 0000..10FFFF; cjkRSUnicode; <none> END # The input files don't list every code point. Those not listed are to be @@ -1109,7 +1035,7 @@ my %default_mapping = ( Decomposition_Type => 'None', East_Asian_Width => "Neutral", FC_NFKC_Closure => $CODE_POINT, - General_Category => 'Cn', + General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned', Grapheme_Cluster_Break => 'Other', Hangul_Syllable_Type => 'NA', ISO_Comment => "", @@ -1140,39 +1066,6 @@ my %default_mapping = ( Word_Break => 'Other', ); -# Below are files that Unicode furnishes, but this program ignores, and why. -# NormalizationCorrections.txt requires some more explanation. It documents -# the cumulative fixes to erroneous normalizations in earlier Unicode -# versions. Its main purpose is so that someone running on an earlier version -# can use this file to override what got published in that earlier release. -# It would be easy for mktables to read and handle this file. But all the -# corrections in it should already be in the other files for the release it -# is. To get it to actually mean something useful, someone would have to be -# using an earlier Unicode release, and copy it to the files for that release -# and recomplile. So far there has been no demand to do that, so this hasn't -# been implemented. -my %ignored_files = ( - 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', - 'Index.txt' => 'Alphabetical index of Unicode characters', - 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl', - 'NamesList.txt' => 'Annotated list of characters', - 'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>', - 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', - 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)', - 'ReadMe.txt' => 'Documentation', - 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>', - 'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.', - 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', - 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', - 'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>', - 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', - 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', - 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', - 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', -); - -my %skipped_files; # List of files that we skip - ### End of externally interesting definitions, except for @input_file_objects my $HEADER=<<"EOF"; @@ -1199,7 +1092,9 @@ my $DEVELOPMENT_ONLY=<<"EOF"; EOF -my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; +my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) + ? "10FFFF" + : "FFFF"; my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; @@ -1229,8 +1124,7 @@ my $code_point_re = qr/\b$run_on_code_point_re/; # defaults for code points not listed (i.e., missing) in the file. The code # depends on this ending with a semi-colon, so it can assume it is a valid # field when the line is split() by semi-colons -my $missing_defaults_prefix = - qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/; +my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/; # Property types. Unicode has more types, but these are sufficient for our # purposes. @@ -1307,11 +1201,15 @@ my $OBSOLETE = 'O'; my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; +# Aliases can also have an extra status: +my $INTERNAL_ALIAS = 'P'; + my %status_past_participles = ( $DISCOURAGED => 'discouraged', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', $DEPRECATED => 'deprecated', + $INTERNAL_ALIAS => 'reserved for Perl core internal use only', ); # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be @@ -1374,18 +1272,21 @@ my %loose_to_file_of; # loosely maps table names to their respective # files my %stricter_to_file_of; # same; but for stricter mapping. my %loose_property_to_file_of; # Maps a loose property name to its map file +my %strict_property_to_file_of; # Same, but strict my @inline_definitions = "V0"; # Each element gives a definition of a unique # inversion list. When a definition is inlined, # its value in the hash it's in (one of the two # defined just above) will include an index into # this array. The 0th element is initialized to - # the definition for a zero length invwersion list + # the definition for a zero length inversion list my %file_to_swash_name; # Maps the file name to its corresponding key name # in the hash %utf8::SwashInfo my %nv_floating_to_rational; # maps numeric values floating point numbers to # their rational equivalent my %loose_property_name_of; # Loosely maps (non_string) property names to # standard form +my %strict_property_name_of; # Strictly maps (non_string) property names to + # standard form my %string_property_loose_to_name; # Same, for string properties. my %loose_defaults; # keys are of form "prop=value", where 'prop' is # the property name in standard loose form, and @@ -1395,12 +1296,16 @@ my %loose_to_standard_value; # loosely maps table names to the canonical # alias for them my %ambiguous_names; # keys are alias names (in standard form) that # have more than one possible meaning. +my %combination_property; # keys are alias names (in standard form) that + # have both a map table, and a binary one that + # yields true for all non-null maps. my %prop_aliases; # Keys are standard property name; values are each # one's aliases my %prop_value_aliases; # Keys of top level are standard property name; # values are keys to another hash, Each one is # one of the property's values, in standard form. # The values are that prop-val's aliases. +my %skipped_files; # List of files that we skip my %ucd_pod; # Holds entries that will go into the UCD section of the pod # Most properties are immune to caseless matching, otherwise you would get @@ -1477,6 +1382,8 @@ my @named_sequences; # NamedSequences.txt contents. my %potential_files; # Generated list of all .txt files in the directory # structure so we can warn if something is being # ignored. +my @missing_early_files; # Generated list of absent files that we need to + # proceed in compiling this early Unicode version my @files_actually_output; # List of files we generated. my @more_Names; # Some code point names are compound; this is used # to store the extra components of them. @@ -1493,6 +1400,7 @@ my $block; my $perl_charname; my $print; my $All; +my $Assigned; # All assigned characters in this Unicode release my $script; # Are there conflicting names because of beginning with 'In_', or 'Is_' @@ -1616,7 +1524,7 @@ sub populate_char_info ($) { elsif ($gc-> table('Unassigned')->contains($i)) { $annotate_char_type[$i] = $UNASSIGNED_TYPE; $printable[$i] = 0; - if ($v_version lt v2.0.0) { # No blocks in earliest releases + if (defined $block) { # No blocks in earliest releases $viacode[$i] = 'Unassigned'; $end = $gc-> table('Unassigned')->containing_range($i)->end; } @@ -1633,12 +1541,7 @@ sub populate_char_info ($) { containing_range($i)->end); } } - elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases - $viacode[$i] = $gc->value_of($i); - $annotate_char_type[$i] = $UNKNOWN_TYPE; - $printable[$i] = 0; - } - elsif ($gc-> table('Surrogate')->contains($i)) { + elsif ($perl->table('_Perl_Surrogate')->contains($i)) { $viacode[$i] = 'Surrogate'; $annotate_char_type[$i] = $SURROGATE_TYPE; $printable[$i] = 0; @@ -2110,6 +2013,7 @@ package Input_file; # while(next_line()) {...} loop. # # You can also set up handlers to +# 0) call during object construction time, after everything else is done # 1) call before the first line is read, for pre processing # 2) call to adjust each line of the input before the main handler gets # them. This can be automatically generated, if appropriately simple @@ -2121,19 +2025,29 @@ package Input_file; # each_line_handler()s. So, if the format of the line is not in the desired # format for the main handler, these are used to do that adjusting. They can # be stacked (by enclosing them in an [ anonymous array ] in the constructor, -# so the $_ output of one is used as the input to the next. None of the other -# handlers are stackable, but could easily be changed to be so. +# so the $_ output of one is used as the input to the next. The eof handler +# is also stackable, but none of the others are, but could easily be changed +# to be so. +# +# Some properties are used by the Perl core but aren't defined until later +# Unicode releases. The perl interpreter would have problems working when +# compiled with an earlier Unicode version that doesn't have them, so we need +# to define them somehow for those releases. The 'Early' constructor +# parameter can be used to automatically handle this. It is essentially +# ignored if the Unicode version being compiled has a data file for this +# property. Either code to execute or a file to read can be specified. +# Details are at the %early definition. # # Most of the handlers can call insert_lines() or insert_adjusted_lines() # which insert the parameters as lines to be processed before the next input -# file line is read. This allows the EOF handler to flush buffers, for +# file line is read. This allows the EOF handler(s) to flush buffers, for # example. The difference between the two routines is that the lines inserted # by insert_lines() are subjected to the each_line_handler()s. (So if you -# called it from such a handler, you would get infinite recursion.) Lines -# inserted by insert_adjusted_lines() go directly to the main handler without -# any adjustments. If the post-processing handler calls any of these, there -# will be no effect. Some error checking for these conditions could be added, -# but it hasn't been done. +# called it from such a handler, you would get infinite recursion without some +# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go +# directly to the main handler without any adjustments. If the +# post-processing handler calls any of these, there will be no effect. Some +# error checking for these conditions could be added, but it hasn't been done. # # carp_bad_line() should be called to warn of bad input lines, which clears $_ # to prevent further processing of the line. This routine will output the @@ -2169,10 +2083,16 @@ sub trace { return main::trace(@_); } main::set_access('property', \%property, qw{ c r }); my %optional; - # If this is true, the file is optional. If not present, no warning is - # output. If it is present, the string given by this parameter is - # evaluated, and if false the file is not processed. - main::set_access('optional', \%optional, 'c', 'r'); + # This is either an unsigned number, or a list of property names. In the + # former case, if it is non-zero, it means the file is optional, so if the + # file is absent, no warning about that is output. In the latter case, it + # is a list of properties that the file (exclusively) defines. If the + # file is present, tables for those properties will be produced; if + # absent, none will, even if they are listed elsewhere (namely + # PropertyAliases.txt and PropValueAliases.txt) as being in this release, + # and no warnings will be raised about them not being available. (And no + # warning about the file itself will be raised.) + main::set_access('optional', \%optional, qw{ c readable_array } ); my %non_skip; # This is used for debugging, to skip processing of all but a few input @@ -2181,16 +2101,19 @@ sub trace { return main::trace(@_); } main::set_access('non_skip', \%non_skip, 'c'); my %skip; - # This is used to skip processing of this input file semi-permanently, - # when it evaluates to true. The value should be the reason the file is - # being skipped. It is used for files that we aren't planning to process - # anytime soon, but want to allow to be in the directory and not raise a - # message that we are not handling. Mostly for test files. This is in - # contrast to the non_skip element, which is supposed to be used very - # temporarily for debugging. Sets 'optional' to 1. Also, files that we - # pretty much will never look at can be placed in the global - # %ignored_files instead. Ones used here will be added to %skipped files - main::set_access('skip', \%skip, 'c'); + # This is used to skip processing of this input file (semi-) permanently. + # The value should be the reason the file is being skipped. It is used + # for files that we aren't planning to process anytime soon, but want to + # allow to be in the directory and be checked for their names not + # conflicting with any other files on a DOS 8.3 name filesystem, but to + # not otherwise be processed, and to not raise a warning about not being + # handled. In the constructor call, any value that evaluates to a numeric + # 0 or undef means don't skip. Any other value is a string giving the + # reason it is being skippped, and this will appear in generated pod. + # However, an empty string reason will suppress the pod entry. + # Internally, calls that evaluate to numeric 0 are changed into undef to + # distinguish them from an empty string call. + main::set_access('skip', \%skip, 'c', 'r'); my %each_line_handler; # list of subroutines to look at and filter each non-comment line in the @@ -2221,21 +2144,28 @@ sub trace { return main::trace(@_); } main::set_access('has_missings_defaults', \%has_missings_defaults, qw{ c r }); + my %construction_time_handler; + # Subroutine to call at the end of the new method. If undef, no such + # handler is called. + main::set_access('construction_time_handler', + \%construction_time_handler, qw{ c }); + my %pre_handler; # Subroutine to call before doing anything else in the file. If undef, no # such handler is called. main::set_access('pre_handler', \%pre_handler, qw{ c }); my %eof_handler; - # Subroutine to call upon getting an EOF on the input file, but before + # Subroutines to call upon getting an EOF on the input file, but before # that is returned to the main handler. This is to allow buffers to be # flushed. The handler is expected to call insert_lines() or # insert_adjusted() with the buffered material - main::set_access('eof_handler', \%eof_handler, qw{ c r }); + main::set_access('eof_handler', \%eof_handler, qw{ c }); my %post_handler; # Subroutine to call after all the lines of the file are read in and - # processed. If undef, no such handler is called. + # processed. If undef, no such handler is called. Note that this cannot + # add lines to be processed; instead use eof_handler main::set_access('post_handler', \%post_handler, qw{ c }); my %progress_message; @@ -2263,6 +2193,69 @@ sub trace { return main::trace(@_); } # storage of '@missing' defaults lines main::set_access('missings', \%missings); + my %early; + # Used for properties that must be defined (for Perl's purposes) on + # versions of Unicode earlier than Unicode itself defines them. The + # parameter is an array (it would be better to be a hash, but not worth + # bothering about due to its rare use). + # + # The first element is either a code reference to call when in a release + # earlier than the Unicode file is available in, or it is an alternate + # file to use instead of the non-existent one. This file must have been + # plunked down in the same directory as mktables. Should you be compiling + # on a release that needs such a file, mktables will abort the + # compilation, and tell you where to get the necessary file(s), and what + # name(s) to use to store them as. + # In the case of specifying an alternate file, the array must contain two + # further elements: + # + # [1] is the name of the property that will be generated by this file. + # The class automatically takes the input file and excludes any code + # points in it that were not assigned in the Unicode version being + # compiled. It then uses this result to define the property in the given + # version. Since the property doesn't actually exist in the Unicode + # version being compiled, this should be a name accessible only by core + # perl. If it is the same name as the regular property, the constructor + # will mark the output table as a $PLACEHOLDER so that it doesn't actually + # get output, and so will be unusable by non-core code. Otherwise it gets + # marked as $INTERNAL_ONLY. + # + # [2] is a property value to assign (only when compiling Unicode 1.1.5) to + # the Hangul syllables in that release (which were ripped out in version + # 2) for the given property . (Hence it is ignored except when compiling + # version 1. You only get one value that applies to all of them, which + # may not be the actual reality, but probably nobody cares anyway for + # these obsolete characters.) + # + # Not all files can be handled in the above way, and so the code ref + # alternative is available. It can do whatever it needs to. The other + # array elements are optional in this case, and the code is free to use or + # ignore them if they are present. + # + # Internally, the constructor unshifts a 0 or 1 onto this array to + # indicate if an early alternative is actually being used or not. This + # makes for easier testing later on. + main::set_access('early', \%early, 'c'); + + my %required_even_in_debug_skip; + # debug_skip is used to speed up compilation during debugging by skipping + # processing files that are not needed for the task at hand. However, + # some files pretty much can never be skipped, and this is used to specify + # that this is one of them. In order to skip this file, the call to the + # constructor must be edited to comment out this parameter. + main::set_access('required_even_in_debug_skip', + \%required_even_in_debug_skip, 'c'); + + my %withdrawn; + # Some files get removed from the Unicode DB. This is a version object + # giving the first release without this file. + main::set_access('withdrawn', \%withdrawn, 'c'); + + my %in_this_release; + # Calculated value from %first_released and %withdrawn. Are we compiling + # a Unicode release which includes this file? + main::set_access('in_this_release', \%in_this_release); + sub _next_line; sub _next_line_with_remapped_range; @@ -2275,22 +2268,23 @@ sub trace { return main::trace(@_); } # Set defaults $handler{$addr} = \&main::process_generic_property_file; $non_skip{$addr} = 0; - $skip{$addr} = 0; + $skip{$addr} = undef; $has_missings_defaults{$addr} = $NO_DEFAULTS; $handle{$addr} = undef; $added_lines{$addr} = [ ]; $remapped_lines{$addr} = [ ]; $each_line_handler{$addr} = [ ]; + $eof_handler{$addr} = [ ]; $errors{$addr} = { }; $missings{$addr} = [ ]; + $early{$addr} = [ ]; + $optional{$addr} = [ ]; # Two positional parameters. return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; $file{$addr} = main::internal_file_to_platform(shift); $first_released{$addr} = shift; - undef $file{$addr} if $first_released{$addr} gt $v_version; - # The rest of the arguments are key => value pairs # %constructor_fields has been set up earlier to list all possible # ones. Either set or push, depending on how the default has been set @@ -2322,30 +2316,206 @@ sub trace { return main::trace(@_); } delete $args{$key}; }; - # If the file has a property for it, it means that the property is not - # listed in the file's entries. So add a handler to the list of line - # handlers to insert the property name into the lines, to provide a - # uniform interface to the final processing subroutine. - # the final code doesn't have to worry about that. - if ($property{$addr}) { - push @{$each_line_handler{$addr}}, \&_insert_property_into_line; + $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr}; + + # Convert 0 (meaning don't skip) to undef + undef $skip{$addr} unless $skip{$addr}; + + # Handle the case where this file is optional + my $pod_message_for_non_existent_optional = ""; + if ($optional{$addr}->@*) { + + # First element is the pod message + $pod_message_for_non_existent_optional + = shift $optional{$addr}->@*; + # Convert a 0 'Optional' argument to an empty list to make later + # code more concise. + if ( $optional{$addr}->@* + && $optional{$addr}->@* == 1 + && $optional{$addr}[0] ne "" + && $optional{$addr}[0] !~ /\D/ + && $optional{$addr}[0] == 0) + { + $optional{$addr} = [ ]; + } + else { # But if the only element doesn't evaluate to 0, make sure + # that this file is indeed considered optional below. + unshift $optional{$addr}->@*, 1; + } + } + + my $progress; + my $function_instead_of_file = 0; + + # If we are compiling a Unicode release earlier than the file became + # available, the constructor may have supplied a substitute + if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { + + # Yes, we have a substitute, that we will use; mark it so + unshift $early{$addr}->@*, 1; + + # See the definition of %early for what the array elements mean. + # If we have a property this defines, create a table and default + # map for it now (at essentially compile time), so that it will be + # available for the whole of run time. (We will want to add this + # name as an alias when we are using the official property name; + # but this must be deferred until run(), because at construction + # time the official names have yet to be defined.) + if ($early{$addr}[2]) { + my $fate = ($property{$addr} + && $property{$addr} eq $early{$addr}[2]) + ? $PLACEHOLDER + : $INTERNAL_ONLY; + my $prop_object = Property->new($early{$addr}[2], + Fate => $fate, + Perl_Extension => 1, + ); + + # Use the default mapping for the regular property for this + # substitute one. + if ( defined $property{$addr} + && defined $default_mapping{$property{$addr}}) + { + $prop_object + ->set_default_map($default_mapping{$property{$addr}}); + } + } + + if (ref $early{$addr}[1] eq 'CODE') { + $function_instead_of_file = 1; + + # If the first element of the array is a code ref, the others + # are optional. + $handler{$addr} = $early{$addr}[1]; + $property{$addr} = $early{$addr}[2] + if defined $early{$addr}[2]; + $progress = "substitute $file{$addr}"; + + undef $file{$addr}; + } + else { # Specifying a substitute file + + if (! main::file_exists($early{$addr}[1])) { + + # If we don't see the substitute file, generate an error + # message giving the needed things, and add it to the list + # of such to output before actual processing happens + # (hence the user finds out all of them in one run). + # Instead of creating a general method for NameAliases, + # hard-code it here, as there is unlikely to ever be a + # second one which needs special handling. + my $string_version = ($file{$addr} eq "NameAliases.txt") + ? 'at least 6.1 (the later, the better)' + : sprintf "%vd", $first_released{$addr}; + push @missing_early_files, <<END; +'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. +END + ; + return; + } + $progress = $early{$addr}[1]; + $progress .= ", substituting for $file{$addr}" if $file{$addr}; + $file{$addr} = $early{$addr}[1]; + $property{$addr} = $early{$addr}[2]; + + # Ignore code points not in the version being compiled + push $each_line_handler{$addr}->@*, \&_exclude_unassigned; + + if ( $v_version lt v2.0 # Hanguls in this release ... + && defined $early{$addr}[3]) # ... need special treatment + { + push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; + } + } + + # And this substitute is valid for all releases. + $first_released{$addr} = v0; + } + else { # Normal behavior + $progress = $file{$addr}; + unshift $early{$addr}->@*, 0; # No substitute } - if ($non_skip{$addr} && ! $debug_skip && $verbosity) { - print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; + my $file = $file{$addr}; + $progress_message{$addr} = "Processing $progress" + unless $progress_message{$addr}; + + # A file should be there if it is within the window of versions for + # which Unicode supplies it + if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) { + $in_this_release{$addr} = 0; + $skip{$addr} = ""; } + else { + $in_this_release{$addr} = $first_released{$addr} le $v_version; - # If skipping, set to optional, and add to list of ignored files, - # including its reason - if ($skip{$addr}) { - $optional{$addr} = 1; - $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr}; + # Check that the file for this object (possibly using a substitute + # for early releases) exists or we have a function alternative + if ( ! $function_instead_of_file + && ! main::file_exists($file)) + { + # Here there is nothing available for this release. This is + # fine if we aren't expecting anything in this release. + if (! $in_this_release{$addr}) { + $skip{$addr} = ""; # Don't remark since we expected + # nothing and got nothing + } + elsif ($optional{$addr}->@*) { + + # Here the file is optional in this release; Use the + # passed in text to document this case in the pod. + $skip{$addr} = $pod_message_for_non_existent_optional; + } + elsif ( $in_this_release{$addr} + && ! defined $skip{$addr} + && defined $file) + { # Doesn't exist but should. + $skip{$addr} = "'$file' not found. Possibly Big problems"; + Carp::my_carp($skip{$addr}); + } + } + elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr}) + { + + # The file exists; if not skipped for another reason, and we are + # skipping most everything during debugging builds, use that as + # the skip reason. + $skip{$addr} = '$debug_skip is on' + } + } + + if ( ! $debug_skip + && $non_skip{$addr} + && ! $required_even_in_debug_skip{$addr} + && $verbosity) + { + print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n"; + } + + # Here, we have figured out if we will be skipping this file or not. + # If so, we add any single property it defines to any passed in + # optional property list. These will be dealt with at run time. + if (defined $skip{$addr}) { + if ($property{$addr}) { + push $optional{$addr}->@*, $property{$addr}; + } + } # Otherwise, are going to process the file. + elsif ($property{$addr}) { + + # If the file has a property defined in the constructor for it, it + # means that the property is not listed in the file's entries. So + # add a handler (to the list of line handlers) to insert the + # property name into the lines, to provide a uniform interface to + # the final processing subroutine. + push @{$each_line_handler{$addr}}, \&_insert_property_into_line; } elsif ($properties{$addr}) { - # Add a handler for each line in the input so that it creates a - # separate input line for each property in those input lines, thus - # making them suitable for process_generic_property_file(). + # Similarly, there may be more than one property represented on + # each line, with no clue but the constructor input what those + # might be. Add a handler for each line in the input so that it + # creates a separate input line for each property in those input + # lines, thus making them suitable to handle generically. push @{$each_line_handler{$addr}}, sub { @@ -2376,7 +2546,7 @@ sub trace { return main::trace(@_); } }; } - { # On non-ascii platforms, we use a special handler + { # On non-ascii platforms, we use a special pre-handler no strict; no warnings 'once'; *next_line = (main::NON_ASCII_PLATFORM) @@ -2384,6 +2554,9 @@ sub trace { return main::trace(@_); } : *_next_line; } + &{$construction_time_handler{$addr}}($self) + if $construction_time_handler{$addr}; + return $self; } @@ -2401,13 +2574,13 @@ sub trace { return main::trace(@_); } return __PACKAGE__ . " object for " . $self->file; } - # flag to make sure extracted files are processed early - my $seen_non_extracted_non_age = 0; - sub run { # Process the input object $self. This opens and closes the file and # calls all the handlers for it. Currently, this can only be called - # once per file, as it destroy's the EOF handler + # once per file, as it destroy's the EOF handlers + + # flag to make sure extracted files are processed early + state $seen_non_extracted_non_age = 0; my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -2416,61 +2589,14 @@ sub trace { return main::trace(@_); } my $file = $file{$addr}; - # Don't process if not expecting this file (because released later - # than this Unicode version), and isn't there. This means if someone - # copies it into an earlier version's directory, we will go ahead and - # process it. - return if $first_released{$addr} gt $v_version - && (! defined $file || ! -e $file); - - # If in debugging mode and this file doesn't have the non-skip - # flag set, and isn't one of the critical files, skip it. - if ($debug_skip - && $first_released{$addr} ne v0 - && ! $non_skip{$addr}) - { - print "Skipping $file in debugging\n" if $verbosity; - return; - } - - # File could be optional - if ($optional{$addr}) { - return unless -e $file; - my $result = eval $optional{$addr}; - if (! defined $result) { - Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped."); - return; - } - if (! $result) { - if ($verbosity) { - print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n"; - } - return; - } - } - - if (! defined $file || ! -e $file) { - - # If the file doesn't exist, see if have internal data for it - # (based on first_released being 0). - if ($first_released{$addr} eq v0) { - $handle{$addr} = 'pretend_is_open'; - } - else { - if (! $optional{$addr} # File could be optional - && $v_version ge $first_released{$addr}) - { - print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr}; - } - return; - } + if (! $file) { + $handle{$addr} = 'pretend_is_open'; } else { - - # Here, the file exists. Some platforms may change the case of - # its name if ($seen_non_extracted_non_age) { - if ($file =~ /$EXTRACTED/i) { + if ($file =~ /$EXTRACTED/i) # Some platforms may change the + # case of the file's name + { Carp::my_carp_bug(main::join_lines(<<END $file should be processed just after the 'Prop...Alias' files, and before anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may @@ -2480,7 +2606,10 @@ END } } elsif ($EXTRACTED_DIR - && $first_released{$addr} ne v0 + + # We only do this check for generic property files + && $handler{$addr} == \&main::process_generic_property_file + && $file !~ /$EXTRACTED/i && lc($file) ne 'dage.txt') { @@ -2490,71 +2619,137 @@ END $seen_non_extracted_non_age = 1; } - # And mark the file as having being processed, and warn if it + # Mark the file as having being processed, and warn if it # isn't a file we are expecting. As we process the files, # they are deleted from the hash, so any that remain at the # end of the program are files that we didn't process. my $fkey = File::Spec->rel2abs($file); - my $expecting = delete $potential_files{lc($fkey)}; + my $exists = delete $potential_files{lc($fkey)}; + + Carp::my_carp("Was not expecting '$file'.") + if $exists && ! $in_this_release{$addr}; + + # If there is special handling for compiling Unicode releases + # earlier than the first one in which Unicode defines this + # property ... + if ($early{$addr}->@* > 1) { + + # Mark as processed any substitute file that would be used in + # such a release + $fkey = File::Spec->rel2abs($early{$addr}[1]); + delete $potential_files{lc($fkey)}; + + # As commented in the constructor code, when using the + # official property, we still have to allow the publicly + # inaccessible early name so that the core code which uses it + # will work regardless. + if (! $early{$addr}[0] && $early{$addr}->@* > 2) { + my $early_property_name = $early{$addr}[2]; + if ($property{$addr} ne $early_property_name) { + main::property_ref($property{$addr}) + ->add_alias($early_property_name); + } + } + } + + # We may be skipping this file ... + if (defined $skip{$addr}) { - Carp::my_carp("Was not expecting '$file'.") if - ! $expecting - && ! defined $handle{$addr}; + # If the file isn't supposed to be in this release, there is + # nothing to do + if ($in_this_release{$addr}) { + + # But otherwise, we may print a message + if ($debug_skip) { + print STDERR "Skipping input file '$file'", + " because '$skip{$addr}'\n"; + } + + # And add it to the list of skipped files, which is later + # used to make the pod + $skipped_files{$file} = $skip{$addr}; + + # The 'optional' list contains properties that are also to + # be skipped along with the file. (There may also be + # digits which are just placeholders to make sure it isn't + # an empty list + foreach my $property ($optional{$addr}->@*) { + next unless $property =~ /\D/; + my $prop_object = main::property_ref($property); + next unless defined $prop_object; + $prop_object->set_fate($SUPPRESSED, $skip{$addr}); + } + } - # Having deleted from expected files, we can quit if not to do - # anything. Don't print progress unless really want verbosity - if ($skip{$addr}) { - print "Skipping $file.\n" if $verbosity >= $VERBOSE; return; } - # Open the file, converting the slashes used in this program - # into the proper form for the OS + # Here, we are going to process the file. Open it, converting the + # slashes used in this program into the proper form for the OS my $file_handle; if (not open $file_handle, "<", $file) { Carp::my_carp("Can't open $file. Skipping: $!"); - return 0; + return; } $handle{$addr} = $file_handle; # Cache the open file handle - if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') { + # If possible, make sure that the file is the correct version. + # (This data isn't available on early Unicode releases or in + # UnicodeData.txt.) We don't do this check if we are using a + # substitute file instead of the official one (though the code + # could be extended to do so). + if ($in_this_release{$addr} + && ! $early{$addr}[0] + && lc($file) ne 'unicodedata.txt') + { if ($file !~ /^Unihan/i) { - $_ = <$file_handle>; - if ($_ !~ / - $string_version \. /x) { - chomp; - $_ =~ s/^#\s*//; - die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version"); + + # The non-Unihan files started getting version numbers in + # 3.2, but some files in 4.0 are unchanged from 3.2, and + # marked as 3.2. 4.0.1 is the first version where there + # are no files marked as being from less than 4.0, though + # some are marked as 4.0. In versions after that, the + # numbers are correct. + if ($v_version ge v4.0.1) { + $_ = <$file_handle>; # The version number is in the + # very first line + if ($_ !~ / - $string_version \. /x) { + chomp; + $_ =~ s/^#\s*//; + + # 4.0.1 had some valid files that weren't updated. + if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) { + die Carp::my_carp("File '$file' is version " + . "'$_'. It should be " + . "version $string_version"); + } + } } } - else { + elsif ($v_version ge v6.0.0) { # Unihan + + # Unihan files didn't get accurate version numbers until + # 6.0. The version is somewhere in the first comment + # block while (<$file_handle>) { if ($_ !~ /^#/) { - Carp::my_carp_bug("Could not find the expected version info in file '$file'"); + Carp::my_carp_bug("Could not find the expected " + . "version info in file '$file'"); last; } chomp; $_ =~ s/^#\s*//; next if $_ !~ / version: /x; last if $_ =~ /$string_version/; - die Carp::my_carp("File '$file' is '$_'. It should be version $string_version"); + die Carp::my_carp("File '$file' is version " + . "'$_'. It should be " + . "version $string_version"); } } } } - if ($verbosity >= $PROGRESS) { - if ($progress_message{$addr}) { - print "$progress_message{$addr}\n"; - } - else { - # If using a virtual file, say so. - print "Processing ", (-e $file) - ? $file - : "substitute $file", - "\n"; - } - } - + print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; # Call any special handler for before the file. &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; @@ -2742,11 +2937,11 @@ END return 1; } # End of looping through lines. - # If there is an EOF handler, call it (only once) and if it generates + # If there are EOF handlers, call each (only once) and if it generates # more lines to process go back in the loop to handle them. - if ($eof_handler{$addr}) { - &{$eof_handler{$addr}}($self); - $eof_handler{$addr} = ""; # Currently only get one shot at it. + while ($eof_handler{$addr}->@*) { + &{$eof_handler{$addr}[0]}($self); + shift $eof_handler{$addr}->@*; # Currently only get one shot at it. goto LINE if $added_lines{$addr}; } @@ -2943,6 +3138,82 @@ END return @return; } + sub _exclude_unassigned { + + # Takes the range in $_ and excludes code points that aren't assigned + # in this release + + state $skip_inserted_count = 0; + + # Ignore recursive calls. + if ($skip_inserted_count) { + $skip_inserted_count--; + return; + } + + # Find what code points are assigned in this release + main::calculate_Assigned() if ! defined $Assigned; + + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my ($range, @remainder) + = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + + # Examine the range. + if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) + { + my $low = hex $1; + my $high = (defined $2) ? hex $2 : $low; + + # Split the range into subranges of just those code points in it + # that are assigned. + my @ranges = (Range_List->new(Initialize + => Range->new($low, $high)) & $Assigned)->ranges; + + # Do nothing if nothing in the original range is assigned in this + # release; handle normally if everything is in this release. + if (! @ranges) { + $_ = ""; + } + elsif (@ranges != 1) { + + # Here, some code points in the original range aren't in this + # release; @ranges gives the ones that are. Create fake input + # lines for each of the ranges, and set things up so that when + # this routine is called on that fake input, it will do + # nothing. + $skip_inserted_count = @ranges; + my $remainder = join ";", @remainder; + for my $range (@ranges) { + $self->insert_lines(sprintf("%04X..%04X;%s", + $range->start, $range->end, $remainder)); + } + $_ = ""; # The original range is now defunct. + } + } + + return; + } + + sub _fixup_obsolete_hanguls { + + # This is called only when compiling Unicode version 1. All Unicode + # data for subsequent releases assumes that the code points that were + # Hangul syllables in this release only are something else, so if + # using such data, we have to override it + + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $object = main::property_ref($property{$addr}); + $object->add_map(0x3400, 0x4DFF, + $early{$addr}[3], # Passed-in value for these + Replace => $UNCONDITIONALLY); + } + sub _insert_property_into_line { # Add a property field to $_, if this file requires it. @@ -3027,6 +3298,8 @@ package Multi_Default; # . # . # 'U')); + # It is best to leave the final value be the one that matches the + # above-Unicode code points. my $class = shift; @@ -3732,7 +4005,7 @@ sub trace { return main::trace(@_); } # => $MULTIPLE_BEFORE means that if this range duplicates an # existing one, but has a different value, # don't replace the existing one, but insert - # this, one so that the same range can occur + # this one so that the same range can occur # multiple times. They are stored LIFO, so # that the final one inserted is the first one # returned in an ordered search of the table. @@ -3747,6 +4020,7 @@ sub trace { return main::trace(@_); } # existing range, this one is discarded # (leaving the existing one in its original, # higher priority position + # => $CROAK Die with an error if is already there # => anything else is the same as => $IF_NOT_EQUIVALENT # # "same value" means identical for non-type-0 ranges, and it means @@ -3835,7 +4109,7 @@ sub trace { return main::trace(@_); } # Here, the new range starts just after the current highest in # the range list, and they have the same type and value. - # Extend the current range to incorporate the new one. + # Extend the existing range to incorporate the new one. @{$r}[-1]->set_end($end); } @@ -4228,7 +4502,7 @@ sub trace { return main::trace(@_); } # In other words, # r[$i-1]->end < $start <= r[$i]->end # And: - # r[$i-1]->end < $start <= $end <= r[$j+1]->start + # r[$i-1]->end < $start <= $end < r[$j+1]->start # # Also: # $clean_insert is a boolean which is set true if and only if @@ -5093,6 +5367,7 @@ sub trace { return main::trace(@_); } my $note = delete $args{'Note'}; my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; my $perl_extension = delete $args{'Perl_Extension'}; + my $suppression_reason = delete $args{'Suppression_Reason'}; # Shouldn't have any left over Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -5134,11 +5409,12 @@ END { $fate{$addr} = $SUPPRESSED; } - elsif ($fate{$addr} == $SUPPRESSED - && ! exists $why_suppressed{$property{$addr}->complete_name}) - { - Carp::my_carp_bug("There is no current capability to set the reason for suppressing."); - # perhaps Fate => [ $SUPPRESSED, "reason" ] + elsif ($fate{$addr} == $SUPPRESSED) { + Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason; + # Though currently unused + } + elsif ($suppression_reason) { + Carp::my_carp_bug("A reason was given for suppressing, but not suppressed"); } # If hasn't set its status already, see if it is on one of the @@ -5266,17 +5542,18 @@ END my %args = @_; my $loose_match = delete $args{'Fuzzy'}; - my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; - $make_re_pod_entry = $YES unless defined $make_re_pod_entry; - my $ok_as_filename = delete $args{'OK_as_Filename'}; $ok_as_filename = 1 unless defined $ok_as_filename; - my $status = delete $args{'Status'}; - $status = $NORMAL unless defined $status; - # An internal name does not get documented, unless overridden by the - # input. + # input; same for making tests for it. + my $status = delete $args{'Status'} || (($name =~ /^_/) + ? $INTERNAL_ALIAS + : $NORMAL); + my $make_re_pod_entry = delete $args{'Re_Pod_Entry'} + // (($status ne $INTERNAL_ALIAS) + ? (($name =~ /^_/) ? $NO : $YES) + : $NO); my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -5346,7 +5623,7 @@ END $insert_position, 0, Alias->new($name, $loose_match, $make_re_pod_entry, - $ok_as_filename, $status, $ucd); + $ok_as_filename, $status, $ucd); # This name may be shorter than any existing ones, so clear the cache # of the shortest, so will have to be recalculated. @@ -6267,7 +6544,8 @@ END } # Save the reason for suppression for output - if ($fate == $SUPPRESSED && defined $reason) { + if ($fate >= $SUPPRESSED) { + $reason = "" unless defined $reason; $why_suppressed{$complete_name{$addr}} = $reason; } @@ -6763,7 +7041,7 @@ END # The ranges that map to the default aren't output, so subtract that # to get those actually output. A property with matching tables # already has the information calculated. - if ($property->type != $STRING) { + if ($property->type != $STRING && $property->type != $FORCED_BINARY) { $count -= $property->table($default_map)->count; } elsif (defined $default_map) { @@ -6839,9 +7117,11 @@ END $comment .= "This file returns the $mapping:\n"; my $ucd_accessible_name = ""; + my $has_underscore_name = 0; my $full_name = $self->property->full_name; for my $i (0 .. @property_aliases - 1) { my $name = $property_aliases[$i]->name; + $has_underscore_name = 1 if $name =~ /^_/; $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); if ($property_aliases[$i]->ucd) { if ($name eq $full_name) { @@ -6854,7 +7134,12 @@ END } $comment .= "\nwhere 'cp' is $cp."; if ($ucd_accessible_name) { - $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; + $comment .= " Note that $these_mappings"; + if ($has_underscore_name) { + $comment .= " (except for the one(s) that begin with an underscore)"; + } + $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; + } # And append any commentary already set from the actual property. @@ -6888,7 +7173,7 @@ END # There are tables which end up only having one element per # range, but it is not worth keeping track of for making just # this comment a little better. - $comment.= <<END; + $comment .= <<END; non-comment portions of the main body of lines of this file is: START\\tSTOP\\tMAPPING where START is the starting code point of the range, in hex; STOP is the ending point, or if omitted, the range has just one @@ -7848,7 +8133,6 @@ END main::uniques($leader, @{$equivalents{$addr}}); my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated # tables - for my $parent (@parents) { my $property = $parent->property; @@ -7906,7 +8190,7 @@ END # commentary that the other combinations are possible. # Because regular expressions don't recognize things like # \p{jsn=}, only look at non-null right-hand-sides - my @property_aliases = $table_property->aliases; + my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases; my @table_aliases = grep { $_->name ne "" } $table->aliases; # The alias lists above are already ordered in the order we @@ -7918,8 +8202,7 @@ END ? main::max(scalar @table_aliases, scalar @property_aliases) : 0; - trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG; - + trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG; my $property_had_compound_name = 0; @@ -8070,8 +8353,14 @@ END foreach my $flag (sort keys %flags) { $comment .= <<END; '$flag' below means that this form is $flags{$flag}. -Consult $pod_file.pod END + if ($flag eq $INTERNAL_ALIAS) { + $comment .= "DO NOT USE!!!"; + } + else { + $comment .= "Consult $pod_file.pod"; + } + $comment .= "\n"; } $comment .= "\n"; } @@ -8436,18 +8725,27 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } $perl_extension = $self->perl_extension if ! defined $perl_extension; + my $fate; + my $suppression_reason = ""; + if ($self->name =~ /^_/) { + $fate = $SUPPRESSED; + $suppression_reason = "Parent property is internal only"; + } + elsif ($self->fate >= $SUPPRESSED) { + $fate = $self->fate; + $suppression_reason = $why_suppressed{$self->complete_name}; + + } + elsif ($name =~ /^_/) { + $fate = $INTERNAL_ONLY; + } $table = Match_Table->new( Name => $name, Perl_Extension => $perl_extension, _Alias_Hash => $table_ref{$addr}, _Property => $self, - - # gets property's fate and status by default, - # except if the name begind with an - # underscore, default it to internal - Fate => ($name =~ /^_/) - ? $INTERNAL_ONLY - : $self->fate, + Fate => $fate, + Suppression_Reason => $suppression_reason, Status => $self->status, _Status_Info => $self->status_info, %args); @@ -8562,10 +8860,13 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # Swash names are used only on either # 1) legacy-only properties, because the formats for these are # unchangeable, and they have had these lines in them; or - # 2) regular map tables; otherwise there should be no access to the + # 2) regular or internal-only map tables + # 3) otherwise there should be no access to the # property map table from other parts of Perl. return if $map{$addr}->fate != $ORDINARY - && $map{$addr}->fate != $LEGACY_ONLY; + && $map{$addr}->fate != $LEGACY_ONLY + && ! ($map{$addr}->name =~ /^_/ + && $map{$addr}->fate == $INTERNAL_ONLY); return $file{$addr} if defined $file{$addr}; return $map{$addr}->external_name; @@ -8589,9 +8890,6 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # to it. return 0 if $type{$addr} == $STRING; - # Don't generate anything for unimplemented properties. - return 0 if grep { $self->complete_name eq $_ } - @unimplemented_properties; # Otherwise, do. return 1; } @@ -8808,7 +9106,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } Carp::carp_extra_args(\@_) if main::DEBUG && @_; my $addr = do { no overloading; pack 'J', $self; }; - if ($fate == $SUPPRESSED) { + if ($fate >= $SUPPRESSED) { $why_suppressed{$self->complete_name} = $reason; } @@ -8894,15 +9192,15 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } package main; - sub display_chr { - # Converts an ordinal printable character value to a displayable - # string, using a dotted circle to hold combining characters. +sub display_chr { + # Converts an ordinal printable character value to a displayable string, + # using a dotted circle to hold combining characters. - my $ord = shift; - my $chr = chr $ord; - return $chr if $ccc->table(0)->contains($ord); - return "\x{25CC}$chr"; - } + my $ord = shift; + my $chr = chr $ord; + return $chr if $ccc->table(0)->contains($ord); + return "\x{25CC}$chr"; +} sub join_lines($) { # Returns lines of the input joined together, so that they can be folded @@ -9443,7 +9741,6 @@ sub dump_inside_out { my $object = shift; my $fields_ref = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; my $addr = do { no overloading; pack 'J', $object; }; @@ -9520,6 +9817,17 @@ sub _operator_not_equal { return ! _operator_equal($self, $other); } +sub substitute_PropertyAliases($) { + # Deal with early releases that don't have the crucial PropertyAliases.txt + # file. + + my $file_object = shift; + $file_object->insert_lines(get_old_property_aliases()); + + process_PropertyAliases($file_object); +} + + sub process_PropertyAliases($) { # This reads in the PropertyAliases.txt file, which contains almost all # the character properties in Unicode and their equivalent aliases: @@ -9532,11 +9840,6 @@ sub process_PropertyAliases($) { my $file= shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # This whole file was non-existent in early releases, so use our own - # internal one. - $file->insert_lines(get_old_property_aliases()) - if ! -e 'PropertyAliases.txt'; - # Add any cjk properties that may have been defined. $file->insert_lines(@cjk_properties); @@ -9546,8 +9849,17 @@ sub process_PropertyAliases($) { my $full = $data[1]; + # This line is defective in early Perls. The property in Unihan.txt + # is kRSUnicode. + if ($full eq 'Unicode_Radical_Stroke' && @data < 3) { + push @data, qw(cjkRSUnicode kRSUnicode); + } + my $this = Property->new($data[0], Full_Name => $full); + $this->set_fate($SUPPRESSED, $why_suppressed{$full}) + if $why_suppressed{$full}; + # Start looking for more aliases after these two. for my $i (2 .. @data - 1) { $this->add_alias($data[$i]); @@ -9573,18 +9885,6 @@ sub finish_property_setup { Property->new('JSN', Full_Name => 'Jamo_Short_Name'); } - # These two properties must be defined in all releases so we can generate - # the tables from them to make regex \X work, but suppress their output so - # aren't application visible prior to releases where they should be - if (! defined property_ref('GCB')) { - Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break', - Fate => $PLACEHOLDER); - } - if (! defined property_ref('hst')) { - Property->new('hst', Full_Name => 'Hangul_Syllable_Type', - Fate => $PLACEHOLDER); - } - # These are used so much, that we set globals for them. $gc = property_ref('General_Category'); $block = property_ref('Block'); @@ -9701,22 +10001,15 @@ sub finish_property_setup { # for non-assigned code points; 'AL' for assigned. if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { my $lb = property_ref('Line_Break'); - if ($v_version gt 3.2.0) { + if (file_exists("${EXTRACTED}DLineBreak.txt")) { $lb->set_default_map('Unknown'); } else { - my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")', - 'AL'); + my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")', + 'Unknown', + ); $lb->set_default_map($default); } - - # If has the URS property, make sure that the standard aliases are in - # it, since not in the input tables in some versions. - my $urs = property_ref('Unicode_Radical_Stroke'); - if (defined $urs) { - $urs->add_alias('cjkRSUnicode'); - $urs->add_alias('kRSUnicode'); - } } # For backwards compatibility with applications that may read the mapping @@ -9874,6 +10167,16 @@ END return @return; } +sub substitute_PropValueAliases($) { + # Deal with early releases that don't have the crucial + # PropValueAliases.txt file. + + my $file_object = shift; + $file_object->insert_lines(get_old_property_value_aliases()); + + process_PropValueAliases($file_object); +} + sub process_PropValueAliases { # This file contains values that properties look like: # bc ; AL ; Arabic_Letter @@ -9899,35 +10202,29 @@ sub process_PropValueAliases { my $file= shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # This whole file was non-existent in early releases, so use our own - # internal one if necessary. - if (! -e 'PropValueAliases.txt') { - $file->insert_lines(get_old_property_value_aliases()); - } - if ($v_version lt 4.0.0) { $file->insert_lines(split /\n/, <<'END' -hst; L ; Leading_Jamo -hst; LV ; LV_Syllable -hst; LVT ; LVT_Syllable -hst; NA ; Not_Applicable -hst; T ; Trailing_Jamo -hst; V ; Vowel_Jamo +Hangul_Syllable_Type; L ; Leading_Jamo +Hangul_Syllable_Type; LV ; LV_Syllable +Hangul_Syllable_Type; LVT ; LVT_Syllable +Hangul_Syllable_Type; NA ; Not_Applicable +Hangul_Syllable_Type; T ; Trailing_Jamo +Hangul_Syllable_Type; V ; Vowel_Jamo END ); } if ($v_version lt 4.1.0) { $file->insert_lines(split /\n/, <<'END' -GCB; CN ; Control -GCB; CR ; CR -GCB; EX ; Extend -GCB; L ; L -GCB; LF ; LF -GCB; LV ; LV -GCB; LVT ; LVT -GCB; T ; T -GCB; V ; V -GCB; XX ; Other +_Perl_GCB; CN ; Control +_Perl_GCB; CR ; CR +_Perl_GCB; EX ; Extend +_Perl_GCB; L ; L +_Perl_GCB; LF ; LF +_Perl_GCB; LV ; LV +_Perl_GCB; LVT ; LVT +_Perl_GCB; T ; T +_Perl_GCB; V ; V +_Perl_GCB; XX ; Other END ); } @@ -9942,7 +10239,6 @@ END # program generates for this block property value #$file->insert_lines('blk; n/a; Herited'); - # Process each line of the file ... while ($file->next_line) { @@ -9959,6 +10255,11 @@ END # thus shifting the former field 0 to after them.) splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc'; + if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) { + my $new_style = $data[1] =~ s/-/_/gr; + splice @data, 1, 0, $new_style; + } + # Field 0 is a short name unless "n/a"; field 1 is the full name. If # there is no short name, use the full one in element 1 if ($data[0] eq "n/a") { @@ -10552,7 +10853,8 @@ sub output_perl_charnames_line ($$) { $line)); } - # And process the first range, like any other. + # And set things up so that the below will process this first + # range, like any other. $low = $this_range->start; $high = $this_range->end; } @@ -10963,11 +11265,12 @@ END my $file = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # Create a new property specially located that is a combination of the + # Create a new property specially located that is a combination of # various Name properties: Name, Unicode_1_Name, Named Sequences, and - # Name_Alias properties. (The final duplicates elements of the - # first.) A comment for it will later be constructed based on the - # actual properties present and used + # _Perl_Name_Alias properties. (The final one duplicates elements of the + # first, and starting in v6.1, is the same as the 'Name_Alias + # property.) A comment for the new property will later be constructed + # based on the actual properties present and used $perl_charname = Property->new('Perl_Charnames', Default_Map => "", Directory => File::Spec->curdir(), @@ -12028,10 +12331,8 @@ sub filter_old_style_case_folding { Carp::carp_extra_args(\@_) if main::DEBUG && @_; my @fields = split /\s*;\s*/; - if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields - $fields[1] = 'I'; - } - elsif ($fields[1] eq 'L') { + + if ($fields[1] eq 'L') { $fields[1] = 'C'; # L => C always } elsif ($fields[1] eq 'E') { @@ -12339,6 +12640,68 @@ sub filter_numeric_value_line { { # Closure my %unihan_properties; + sub construct_unihan { + + my $file_object = shift; + + return unless file_exists($file_object->file); + + if ($v_version lt v4.0.0) { + push @cjk_properties, 'URS ; Unicode_Radical_Stroke'; + push @cjk_property_values, split "\n", <<'END'; +# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none> +END + } + + if ($v_version ge v3.0.0) { + push @cjk_properties, split "\n", <<'END'; +cjkIRG_GSource; kIRG_GSource +cjkIRG_JSource; kIRG_JSource +cjkIRG_KSource; kIRG_KSource +cjkIRG_TSource; kIRG_TSource +cjkIRG_VSource; kIRG_VSource +END + push @cjk_property_values, split "\n", <<'END'; +# @missing: 0000..10FFFF; cjkIRG_GSource; <none> +# @missing: 0000..10FFFF; cjkIRG_JSource; <none> +# @missing: 0000..10FFFF; cjkIRG_KSource; <none> +# @missing: 0000..10FFFF; cjkIRG_TSource; <none> +# @missing: 0000..10FFFF; cjkIRG_VSource; <none> +END + } + if ($v_version ge v3.1.0) { + push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource'; + push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>'; + } + if ($v_version ge v3.1.1) { + push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource'; + push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>'; + } + if ($v_version ge v3.2.0) { + push @cjk_properties, split "\n", <<'END'; +cjkAccountingNumeric; kAccountingNumeric +cjkCompatibilityVariant; kCompatibilityVariant +cjkOtherNumeric; kOtherNumeric +cjkPrimaryNumeric; kPrimaryNumeric +END + push @cjk_property_values, split "\n", <<'END'; +# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN +# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point> +# @missing: 0000..10FFFF; cjkOtherNumeric; NaN +# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN +END + } + if ($v_version gt v4.0.0) { + push @cjk_properties, 'cjkIRG_USource; kIRG_USource'; + push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>'; + } + + if ($v_version ge v4.1.0) { + push @cjk_properties, 'cjkIICore ; kIICore'; + push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>'; + } + } + sub setup_unihan { # Do any special setup for Unihan properties. @@ -12351,16 +12714,16 @@ sub filter_numeric_value_line { my $iicore = property_ref('kIICore'); if (defined $iicore) { $iicore->set_type($FORCED_BINARY); - $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38."); + $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38."); # Unicode doesn't include the maps for this property, so don't # warn that they are missing. $iicore->set_pre_declared_maps(0); $iicore->add_comment(join_lines( <<END -This property contains enum values, but Unicode UAX #38 says it should be -interpreted as binary, so Perl creates tables for both 1) its enum values, -plus 2) true/false tables in which it is considered true for all code points -that have a non-null value +This property contains string values, but any non-empty ones are considered to +be 'core', so Perl creates tables for both: 1) its string values, plus 2) +tables so that \\p{kIICore} matches any code point which has a non-empty +value for this property. END )); } @@ -12439,7 +12802,7 @@ sub filter_blocks_lines { # Change hyphens and blanks in the block name field only $fields[1] =~ s/[ -]/_/g; - $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word + $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word $_ = join("; ", @fields); return; @@ -12657,12 +13020,22 @@ sub generate_hst { END ); - # The Hangul syllables in version 1 are completely different than what came - # after, so just ignore them there. + # The Hangul syllables in version 1 are at different code points than + # those that came along starting in version 2, and have different names; + # they comprise about 60% of the code points of the later version. + # From my (khw) research on them (see <558493EB.4000807@att.net>), the + # initial set is a subset of the later version, with different English + # transliterations. I did not see an easy mapping between them. The + # later set includes essentially all possibilities, even ones that aren't + # in modern use (if they ever were), and over 96% of the new ones are type + # LVT. Mathematically, the early set must also contain a preponderance of + # LVT values. In lieu of doing nothing, we just set them all to LVT, and + # expect that this will be right most of the time, which is better than + # not being right at all. if ($v_version lt v2.0.0) { my $property = property_ref($file->property); + $file->insert_lines("3400..4DFF; LVT\n"); push @tables_that_may_be_empty, $property->table('LV')->complete_name; - push @tables_that_may_be_empty, $property->table('LVT')->complete_name; return; } @@ -12712,7 +13085,6 @@ sub generate_GCB { # Also from http://www.unicode.org/reports/tr29/tr29-3.html. foreach my $code_point ( qw{ - 40000 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F } @@ -12735,275 +13107,50 @@ sub generate_GCB { generate_hst($file); } - return; + main::process_generic_property_file($file); } -sub setup_early_name_alias { - my $file= shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # This has the effect of pretending that the Name_Alias property was - # available in all Unicode releases. Strictly speaking, this property - # should not be availabe in early releases, but doing this allows - # charnames.pm to work on older releases without change. Prior to v5.16 - # it had these names hard-coded inside it. Unicode 6.1 came along and - # created these names, and so they were removed from charnames. +sub fixup_early_perl_name_alias { - my $aliases = property_ref('Name_Alias'); - if (! defined $aliases) { - $aliases = Property->new('Name_Alias', Default_Map => ""); - } + # Different versions of Unicode have varying support for the name synonyms + # below. Just include everything. As of 6.1, all these are correct in + # the Unicode-supplied file. - $file->insert_lines(get_old_name_aliases()); + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - return; -} -sub get_old_name_aliases () { + # ALERT did not come along until 6.0, at which point it became preferred + # over BELL. By inserting it last in early releases, BELL is preferred + # over it; and vice-vers in 6.0 + my $type_for_bell = ($v_version lt v6.0.0) + ? 'correction' + : 'alternate'; + $file->insert_lines(split /\n/, <<END +0007;BELL; $type_for_bell +000A;LINE FEED (LF);alternate +000C;FORM FEED (FF);alternate +000D;CARRIAGE RETURN (CR);alternate +0085;NEXT LINE (NEL);alternate +END + + ); - # The Unicode_1_Name field, contains most of these names. One would - # expect, given the field's name, that its values would be fixed across - # versions, giving the true Unicode version 1 name for the character. - # Sadly, this is not the case. Actually Version 1.1.5 had no names for - # any of the controls; Version 2.0 introduced names for the C0 controls, - # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2 - # changed some names: it + # One might think that the the 'Unicode_1_Name' field, could work for most + # of the above names, but sadly that field varies depending on the + # release. Version 1.1.5 had no names for any of the controls; Version + # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. + # 3.0.1 removed the name INDEX; and 3.2 changed some names: # changed to parenthesized versions like "NEXT LINE" to # "NEXT LINE (NEL)"; # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR - # This list contains all the names that were defined so that - # charnames::vianame(), etc. understand them all EVEN if this version of - # Unicode didn't specify them (this could be construed as a bug). - # mktables elsewhere gives preference to the Unicode_1_Name field over - # these names, so that viacode() will return the correct value for that - # version of Unicode, except when that version doesn't define a name, - # viacode() will return one anyway (this also could be construed as a - # bug). But these potential "bugs" allow for the smooth working of code - # on earlier Unicode releases. - - my @return = split /\n/, <<'END'; -0000;NULL;control -0000;NUL;abbreviation -0001;START OF HEADING;control -0001;SOH;abbreviation -0002;START OF TEXT;control -0002;STX;abbreviation -0003;END OF TEXT;control -0003;ETX;abbreviation -0004;END OF TRANSMISSION;control -0004;EOT;abbreviation -0005;ENQUIRY;control -0005;ENQ;abbreviation -0006;ACKNOWLEDGE;control -0006;ACK;abbreviation -0007;BELL;control -0007;BEL;abbreviation -0008;BACKSPACE;control -0008;BS;abbreviation -0009;CHARACTER TABULATION;control -0009;HORIZONTAL TABULATION;control -0009;HT;abbreviation -0009;TAB;abbreviation -000A;LINE FEED;control -000A;LINE FEED (LF);control -000A;NEW LINE;control -000A;END OF LINE;control -000A;LF;abbreviation -000A;NL;abbreviation -000A;EOL;abbreviation -000B;LINE TABULATION;control -000B;VERTICAL TABULATION;control -000B;VT;abbreviation -000C;FORM FEED;control -000C;FORM FEED (FF);control -000C;FF;abbreviation -000D;CARRIAGE RETURN;control -000D;CARRIAGE RETURN (CR);control -000D;CR;abbreviation -000E;SHIFT OUT;control -000E;LOCKING-SHIFT ONE;control -000E;SO;abbreviation -000F;SHIFT IN;control -000F;LOCKING-SHIFT ZERO;control -000F;SI;abbreviation -0010;DATA LINK ESCAPE;control -0010;DLE;abbreviation -0011;DEVICE CONTROL ONE;control -0011;DC1;abbreviation -0012;DEVICE CONTROL TWO;control -0012;DC2;abbreviation -0013;DEVICE CONTROL THREE;control -0013;DC3;abbreviation -0014;DEVICE CONTROL FOUR;control -0014;DC4;abbreviation -0015;NEGATIVE ACKNOWLEDGE;control -0015;NAK;abbreviation -0016;SYNCHRONOUS IDLE;control -0016;SYN;abbreviation -0017;END OF TRANSMISSION BLOCK;control -0017;ETB;abbreviation -0018;CANCEL;control -0018;CAN;abbreviation -0019;END OF MEDIUM;control -0019;EOM;abbreviation -001A;SUBSTITUTE;control -001A;SUB;abbreviation -001B;ESCAPE;control -001B;ESC;abbreviation -001C;INFORMATION SEPARATOR FOUR;control -001C;FILE SEPARATOR;control -001C;FS;abbreviation -001D;INFORMATION SEPARATOR THREE;control -001D;GROUP SEPARATOR;control -001D;GS;abbreviation -001E;INFORMATION SEPARATOR TWO;control -001E;RECORD SEPARATOR;control -001E;RS;abbreviation -001F;INFORMATION SEPARATOR ONE;control -001F;UNIT SEPARATOR;control -001F;US;abbreviation -0020;SP;abbreviation -007F;DELETE;control -007F;DEL;abbreviation -0080;PADDING CHARACTER;figment -0080;PAD;abbreviation -0081;HIGH OCTET PRESET;figment -0081;HOP;abbreviation -0082;BREAK PERMITTED HERE;control -0082;BPH;abbreviation -0083;NO BREAK HERE;control -0083;NBH;abbreviation -0084;INDEX;control -0084;IND;abbreviation -0085;NEXT LINE;control -0085;NEXT LINE (NEL);control -0085;NEL;abbreviation -0086;START OF SELECTED AREA;control -0086;SSA;abbreviation -0087;END OF SELECTED AREA;control -0087;ESA;abbreviation -0088;CHARACTER TABULATION SET;control -0088;HORIZONTAL TABULATION SET;control -0088;HTS;abbreviation -0089;CHARACTER TABULATION WITH JUSTIFICATION;control -0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control -0089;HTJ;abbreviation -008A;LINE TABULATION SET;control -008A;VERTICAL TABULATION SET;control -008A;VTS;abbreviation -008B;PARTIAL LINE FORWARD;control -008B;PARTIAL LINE DOWN;control -008B;PLD;abbreviation -008C;PARTIAL LINE BACKWARD;control -008C;PARTIAL LINE UP;control -008C;PLU;abbreviation -008D;REVERSE LINE FEED;control -008D;REVERSE INDEX;control -008D;RI;abbreviation -008E;SINGLE SHIFT TWO;control -008E;SINGLE-SHIFT-2;control -008E;SS2;abbreviation -008F;SINGLE SHIFT THREE;control -008F;SINGLE-SHIFT-3;control -008F;SS3;abbreviation -0090;DEVICE CONTROL STRING;control -0090;DCS;abbreviation -0091;PRIVATE USE ONE;control -0091;PRIVATE USE-1;control -0091;PU1;abbreviation -0092;PRIVATE USE TWO;control -0092;PRIVATE USE-2;control -0092;PU2;abbreviation -0093;SET TRANSMIT STATE;control -0093;STS;abbreviation -0094;CANCEL CHARACTER;control -0094;CCH;abbreviation -0095;MESSAGE WAITING;control -0095;MW;abbreviation -0096;START OF GUARDED AREA;control -0096;START OF PROTECTED AREA;control -0096;SPA;abbreviation -0097;END OF GUARDED AREA;control -0097;END OF PROTECTED AREA;control -0097;EPA;abbreviation -0098;START OF STRING;control -0098;SOS;abbreviation -0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment -0099;SGC;abbreviation -009A;SINGLE CHARACTER INTRODUCER;control -009A;SCI;abbreviation -009B;CONTROL SEQUENCE INTRODUCER;control -009B;CSI;abbreviation -009C;STRING TERMINATOR;control -009C;ST;abbreviation -009D;OPERATING SYSTEM COMMAND;control -009D;OSC;abbreviation -009E;PRIVACY MESSAGE;control -009E;PM;abbreviation -009F;APPLICATION PROGRAM COMMAND;control -009F;APC;abbreviation -00A0;NBSP;abbreviation -00AD;SHY;abbreviation -200B;ZWSP;abbreviation -200C;ZWNJ;abbreviation -200D;ZWJ;abbreviation -200E;LRM;abbreviation -200F;RLM;abbreviation -202A;LRE;abbreviation -202B;RLE;abbreviation -202C;PDF;abbreviation -202D;LRO;abbreviation -202E;RLO;abbreviation -FEFF;BYTE ORDER MARK;alternate -FEFF;BOM;abbreviation -FEFF;ZWNBSP;abbreviation -END - - if ($v_version ge v3.0.0) { - push @return, split /\n/, <<'END'; -180B; FVS1; abbreviation -180C; FVS2; abbreviation -180D; FVS3; abbreviation -180E; MVS; abbreviation -202F; NNBSP; abbreviation -END - } - - if ($v_version ge v3.2.0) { - push @return, split /\n/, <<'END'; -034F; CGJ; abbreviation -205F; MMSP; abbreviation -2060; WJ; abbreviation -END - # Add in VS1..VS16 - my $cp = 0xFE00 - 1; - for my $i (1..16) { - push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); - } - } - if ($v_version ge v4.0.0) { # Add in VS17..VS256 - my $cp = 0xE0100 - 17; - for my $i (17..256) { - push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); - } - } - - # ALERT did not come along until 6.0, at which point it became preferred - # over BELL, and was never in the Unicode_1_Name field. For the same - # reasons, that the other names are made known to all releases by this - # function, we make ALERT known too. By inserting it - # last in early releases, BELL is preferred over it; and vice-vers in 6.0 - my $alert = '0007; ALERT; control'; - if ($v_version lt v6.0.0) { - push @return, $alert; - } - else { - unshift @return, $alert; - } + # + # All these are present in the 6.1 NameAliases.txt - return @return; + return; } sub filter_later_version_name_alias_line { @@ -13030,14 +13177,29 @@ sub filter_later_version_name_alias_line { sub filter_early_version_name_alias_line { # Early versions did not have the trailing alias type field; implicitly it - # was 'correction'. But our synthetic lines we add in this program do - # have it, so test for the type field. - $_ .= "; correction" if $_ !~ /;.*;/; + # was 'correction'. + $_ .= "; correction"; filter_later_version_name_alias_line; return; } +sub filter_all_caps_script_names { + + # Some early Unicode releases had the script names in all CAPS. This + # converts them to just the first letter of each word being capital. + + my ($range, $script, @remainder) + = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + my @words = split "_", $script; + for my $word (@words) { + $word = + ucfirst(lc($word)) if $word ne 'CJK'; + } + $script = join "_", @words; + $_ = join ";", $range, $script, @remainder; +} + sub finish_Unicode() { # This routine should be called after all the Unicode files have been read # in. It: @@ -13188,72 +13350,81 @@ END # Add any remaining code points to the mapping, using the default for # missing code points. my $default_table; - if (defined (my $default_map = $property->default_map)) { + my $default_map = $property->default_map; + if ($property_type == $FORCED_BINARY) { - # Make sure there is a match table for the default - if (! defined ($default_table = $property->table($default_map))) { - $default_table = $property->add_match_table($default_map); + # A forced binary property creates a 'Y' table that matches all + # non-default values. The actual string values are also written out + # as a map table. (The default value will almost certainly be the + # empty string, so the pod glosses over the distinction, and just + # talks about empty vs non-empty.) + my $yes = $property->table("Y"); + foreach my $range ($property->ranges) { + next if $range->value eq $default_map; + $yes->add_range($range->start, $range->end); } + $property->table("N")->set_complement($yes); + } + else { + if (defined $default_map) { - # And, if the property is binary, the default table will just - # be the complement of the other table. - if ($property_type == $BINARY) { - my $non_default_table; - - # Find the non-default table. - for my $table ($property->tables) { - next if $table == $default_table; - $non_default_table = $table; + # Make sure there is a match table for the default + if (! defined ($default_table = $property->table($default_map))) + { + $default_table = $property->add_match_table($default_map); } - $default_table->set_complement($non_default_table); - } - else { - # This fills in any missing values with the default. It's not - # necessary to do this with binary properties, as the default - # is defined completely in terms of the Y table. - $property->add_map(0, $MAX_WORKING_CODEPOINT, - $default_map, Replace => $NO); - } - } + # And, if the property is binary, the default table will just + # be the complement of the other table. + if ($property_type == $BINARY) { + my $non_default_table; - # Have all we need to populate the match tables. - my $maps_should_be_defined = $property->pre_declared_maps; - foreach my $range ($property->ranges) { - my $map = $range->value; - my $table = $property->table($map); - if (! defined $table) { + # Find the non-default table. + for my $table ($property->tables) { + if ($table == $default_table) { + if ($v_version le v5.0.0) { + $table->add_alias($_) for qw(N No F False); + } + next; + } elsif ($v_version le v5.0.0) { + $table->add_alias($_) for qw(Y Yes T True); + } + $non_default_table = $table; + } + $default_table->set_complement($non_default_table); + } + else { - # Integral and rational property values are not necessarily - # defined in PropValueAliases, but whether all the other ones - # should be depends on the property. - if ($maps_should_be_defined - && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) - { - Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.") + # This fills in any missing values with the default. It's + # not necessary to do this with binary properties, as the + # default is defined completely in terms of the Y table. + $property->add_map(0, $MAX_WORKING_CODEPOINT, + $default_map, Replace => $NO); } - $table = $property->add_match_table($map); } - next if $table->complement != 0; # Don't need to populate these - $table->add_range($range->start, $range->end); - } + # Have all we need to populate the match tables. + my $maps_should_be_defined = $property->pre_declared_maps; + foreach my $range ($property->ranges) { + my $map = $range->value; + my $table = $property->table($map); + if (! defined $table) { - # A forced binary property has additional true/false tables which - # should have been set up when it was forced into binary. The false - # table matches exactly the same set as the property's default table. - # The true table matches the complement of that. The false table is - # not the same as an additional set of aliases on top of the default - # table, so use 'set_equivalent_to'. If it were implemented as - # additional aliases, various things would have to be adjusted, but - # especially, if the user wants to get a list of names for the table - # using Unicode::UCD::prop_value_aliases(), s/he should get a - # different set depending on whether they want the default table or - # the false table. - if ($property_type == $FORCED_BINARY) { - $property->table('N')->set_equivalent_to($default_table, - Related => 1); - $property->table('Y')->set_complement($default_table); + # Integral and rational property values are not + # necessarily defined in PropValueAliases, but whether all + # the other ones should be depends on the property. + if ($maps_should_be_defined + && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) + { + Carp::my_carp("Table '$property_name=$map' should " + . "have been defined. Defining it now.") + } + $table = $property->add_match_table($map); + } + + next if $table->complement != 0; # Don't need to populate these + $table->add_range($range->start, $range->end); + } } # For Perl 5.6 compatibility, all properties matchable in regexes can @@ -13321,8 +13492,6 @@ END $gc->table('Ll')->set_caseless_equivalent($LC); $gc->table('Lu')->set_caseless_equivalent($LC); - my $Cs = $gc->table('Cs'); - # Create digit and case fold tables with the original file names for # backwards compatibility with applications that read them directly. my $Digit = Property->new("Legacy_Perl_Decimal_Digit", @@ -13408,6 +13577,26 @@ sub pre_3_dot_1_Nl () { return $Nl; } +sub calculate_Assigned() { # Calculate the gc != Cn code points; may be + # called before the Cn's are completely filled. + # Works on Unicodes earlier than ones that + # explicitly specify Cn. + return if defined $Assigned; + + if (! defined $gc || $gc->is_empty()) { + Carp::my_carp_bug("calculate_Assigned() called before $gc is populated"); + } + + $Assigned = $perl->add_match_table('Assigned', + Description => "All assigned code points", + ); + while (defined (my $range = $gc->each_range())) { + my $standard_value = standardize($range->value); + next if $standard_value eq 'cn' || $standard_value eq 'unassigned'; + $Assigned->add_range($range->start, $range->end); + } +} + sub compile_perl() { # Create perl-defined tables. Almost all are part of the pseudo-property # named 'perl' internally to this program. Many of these are recommended @@ -13446,16 +13635,12 @@ sub compile_perl() { } my $Any = $perl->add_match_table('Any', - Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]", + Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", ); - $Any->add_range(0, 0x10FFFF); + $Any->add_range(0, $MAX_UNICODE_CODEPOINT); $Any->add_alias('Unicode'); - # Assigned is the opposite of gc=unassigned - my $Assigned = $perl->add_match_table('Assigned', - Description => "All assigned code points", - Initialize => ~ $gc->table('Unassigned'), - ); + calculate_Assigned(); # Our internal-only property should be treated as more than just a # synonym; grandfather it in to the pod. @@ -13501,32 +13686,27 @@ sub compile_perl() { # There are quite a few code points in Lower, that aren't in gc=lc, # and not all are in all releases. - foreach my $code_point ( utf8::unicode_to_native(0xAA), - utf8::unicode_to_native(0xBA), - 0x02B0 .. 0x02B8, - 0x02C0 .. 0x02C1, - 0x02E0 .. 0x02E4, - 0x0345, - 0x037A, - 0x1D2C .. 0x1D6A, - 0x1D78, - 0x1D9B .. 0x1DBF, - 0x2071, - 0x207F, - 0x2090 .. 0x209C, - 0x2170 .. 0x217F, - 0x24D0 .. 0x24E9, - 0x2C7C .. 0x2C7D, - 0xA770, - 0xA7F8 .. 0xA7F9, - ) { - # Don't include the code point unless it is assigned in this - # release - my $category = $gc->value_of(hex $code_point); - next if ! defined $category || $category eq 'Cn'; - - $Lower += $code_point; - } + my $temp = Range_List->new(Initialize => [ + utf8::unicode_to_native(0xAA), + utf8::unicode_to_native(0xBA), + 0x02B0 .. 0x02B8, + 0x02C0 .. 0x02C1, + 0x02E0 .. 0x02E4, + 0x0345, + 0x037A, + 0x1D2C .. 0x1D6A, + 0x1D78, + 0x1D9B .. 0x1DBF, + 0x2071, + 0x207F, + 0x2090 .. 0x209C, + 0x2170 .. 0x217F, + 0x24D0 .. 0x24E9, + 0x2C7C .. 0x2C7D, + 0xA770, + 0xA7F8 .. 0xA7F9, + ]); + $Lower += $temp & $Assigned; } my $Posix_Lower = $perl->add_match_table("PosixLower", Description => "[a-z]", @@ -13832,6 +14012,7 @@ sub compile_perl() { ); $Space->add_alias('XPerlSpace'); # Pre-existing synonyms $Space->add_alias('SpacePerl'); + $Space->add_alias('Space') if $v_version lt v4.1.0; my $Posix_space = $perl->add_match_table("PosixSpace", Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", @@ -13847,11 +14028,18 @@ sub compile_perl() { Initialize => $Cntrl & $ASCII, ); + my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate'); + if (defined (my $Cs = $gc->table('Cs'))) { + $perl_surrogate += $Cs; + } + else { + push @tables_that_may_be_empty, '_Perl_Surrogate'; + } + # $controls is a temporary used to construct Graph. my $controls = Range_List->new(Initialize => $gc->table('Unassigned') - + $gc->table('Control')); - # Cs not in release 1 - $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate'); + + $gc->table('Control') + + $perl_surrogate); # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', @@ -13957,14 +14145,21 @@ sub compile_perl() { # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description # of the MU issue. foreach my $range ($loc_problem_folds->ranges) { - foreach my $code_point($range->start .. $range->end) { + foreach my $code_point ($range->start .. $range->end) { my $fold_range = $cf->containing_range($code_point); next unless defined $fold_range; + # Skip if folds to itself + next if $fold_range->value eq $CODE_POINT; + my @hex_folds = split " ", $fold_range->value; - my $start_cp = hex $hex_folds[0]; + my $start_cp = $hex_folds[0]; + next if $start_cp eq $CODE_POINT; + $start_cp = hex $start_cp; foreach my $i (0 .. @hex_folds - 1) { - my $cp = hex $hex_folds[$i]; + my $cp = $hex_folds[$i]; + next if $cp eq $CODE_POINT; + $cp = hex $cp; next unless $cp > 255; # Already have the < 256 ones $loc_problem_folds->add_range($cp, $cp); @@ -13978,9 +14173,13 @@ sub compile_perl() { Description => "Code points whose fold is a string of more than one character", ); + if ($v_version lt v3.0.1) { + push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char'; + } # Look through all the known folds to populate these tables. foreach my $range ($cf->ranges) { + next if $range->value eq $CODE_POINT; my $start = $range->start; my $end = $range->end; $any_folds->add_range($start, $end); @@ -14190,7 +14389,7 @@ sub compile_perl() { + utf8::unicode_to_native(0xA0) # NBSP ); - my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias'); + my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); if (@named_sequences) { push @composition, 'Named_Sequence'; @@ -14201,15 +14400,15 @@ sub compile_perl() { my $alias_sentence = ""; my %abbreviations; - my $alias = property_ref('Name_Alias'); - $perl_charname->set_proxy_for('Name_Alias'); - - # Add each entry in Name_Alias to Perl_Charnames. Where these go with - # respect to any existing entry depends on the entry type. Corrections go - # before said entry, as they should be returned in preference over the - # existing entry. (A correction to a correction should be later in the - # Name_Alias table, so it will correctly precede the erroneous correction - # in Perl_Charnames.) + my $alias = property_ref('_Perl_Name_Alias'); + $perl_charname->set_proxy_for('_Perl_Name_Alias'); + + # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go + # with respect to any existing entry depends on the entry type. + # Corrections go before said entry, as they should be returned in + # preference over the existing entry. (A correction to a correction + # should be later in the _Perl_Name_Alias table, so it will correctly + # precede the erroneous correction in Perl_Charnames.) # # Abbreviations go after everything else, so they are saved temporarily in # a hash for later. @@ -14244,7 +14443,7 @@ sub compile_perl() { $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); } $alias_sentence = <<END; -The Name_Alias property adds duplicate code point entries that are +The _Perl_Name_Alias property adds duplicate code point entries that are alternatives to the original name. If an addition is a corrected name, it will be physically first in the table. The original (less correct, but still valid) name will be next; then any alternatives, in no particular @@ -14252,8 +14451,9 @@ order; and finally any abbreviations, again in no particular order. END # Now add the Unicode_1 names for the controls. The Unicode_1 names had - # precedence before 6.1, so should be first in the file; the other names - # have precedence starting in 6.1, + # precedence before 6.1, including the awful ones like "LINE FEED (LF)", + # so should be first in the file; the other names have precedence starting + # in 6.1, my $before_or_after = ($v_version lt v6.1.0) ? $MULTIPLE_BEFORE : $MULTIPLE_AFTER; @@ -14283,12 +14483,6 @@ END Replace => $before_or_after); } - # But in this version only, the ALERT has precedence over BELL, the - # Unicode_1_Name that would otherwise have precedence. - if ($v_version eq v6.0.0) { - $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE); - } - # Now that have everything added, add in abbreviations after # everything else. Sort so results don't change between runs of this # program @@ -14398,6 +14592,25 @@ END $unassigned->set_equivalent_to($age_default, Related => 1); } + my $patws = $perl->add_match_table('_Perl_PatWS', + Perl_Extension => 1, + Fate => $INTERNAL_ONLY); + if (defined (my $off_patws = property_ref('Pattern_White_Space'))) { + $patws->initialize($off_patws->table('Y')); + } + else { + $patws->initialize([ ord("\t"), + ord("\n"), + utf8::unicode_to_native(0x0B), # VT + ord("\f"), + ord("\r"), + ord(" "), + utf8::unicode_to_native(0x85), # NEL + 0x200E..0x200F, # Left, Right marks + 0x2028..0x2029 # Line, Paragraph seps + ] ); + } + # See L<perlfunc/quotemeta> my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', Perl_Extension => 1, @@ -14406,22 +14619,78 @@ END # Initialize to what's common in # all Unicode releases. Initialize => - $Space - + $gc->table('Control') + $gc->table('Control') + + $Space + + $patws + + ((~ $Word) & $ASCII) ); - # In early releases without the proper Unicode properties, just set to \W. - if (! defined (my $patsyn = property_ref('Pattern_Syntax')) - || ! defined (my $patws = property_ref('Pattern_White_Space')) - || ! defined (my $di = property_ref('Default_Ignorable_Code_Point'))) - { - $quotemeta += ~ $Word; + if (defined (my $patsyn = property_ref('Pattern_Syntax'))) { + $quotemeta += $patsyn->table('Y'); } else { - $quotemeta += $patsyn->table('Y') - + $patws->table('Y') - + $di->table('Y') - + ((~ $Word) & $ASCII); + $quotemeta += ((~ $Word) & Range->new(0, 255)) + - utf8::unicode_to_native(0xA8) + - utf8::unicode_to_native(0xAF) + - utf8::unicode_to_native(0xB2) + - utf8::unicode_to_native(0xB3) + - utf8::unicode_to_native(0xB4) + - utf8::unicode_to_native(0xB7) + - utf8::unicode_to_native(0xB8) + - utf8::unicode_to_native(0xB9) + - utf8::unicode_to_native(0xBC) + - utf8::unicode_to_native(0xBD) + - utf8::unicode_to_native(0xBE); + $quotemeta += [ # These are above-Latin1 patsyn; hence should be the + # same in all releases + 0x2010 .. 0x2027, + 0x2030 .. 0x203E, + 0x2041 .. 0x2053, + 0x2055 .. 0x205E, + 0x2190 .. 0x245F, + 0x2500 .. 0x2775, + 0x2794 .. 0x2BFF, + 0x2E00 .. 0x2E7F, + 0x3001 .. 0x3003, + 0x3008 .. 0x3020, + 0x3030 .. 0x3030, + 0xFD3E .. 0xFD3F, + 0xFE45 .. 0xFE46 + ]; + } + + if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) { + $quotemeta += $di->table('Y') + } + else { + if ($v_version ge v2.0) { + $quotemeta += $gc->table('Cf') + + $gc->table('Cs'); + } + $quotemeta += $gc->table('Cc') + - $Space; + my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D, + 0x2060 .. 0x206F, + 0xFE00 .. 0xFE0F, + 0xFFF0 .. 0xFFFB, + 0xE0000 .. 0xE0FFF, + ]); + $quotemeta += $temp & $Assigned; + } + + my $nchar = $perl->add_match_table('_Perl_Nchar', + Perl_Extension => 1, + Fate => $INTERNAL_ONLY); + if (defined (my $off_nchar = property_ref('Nchar'))) { + $nchar->initialize($off_nchar->table('Y')); + } + else { + $nchar->initialize([ 0xFFFE .. 0xFFFF ]); + if ($v_version ge v2.0) { # First release with these nchars + for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) { + $nchar += [ $i .. $i+1 ]; + } + } } # Finished creating all the perl properties. All non-internal non-string @@ -14743,14 +15012,15 @@ sub register_file_for_name($$$) { my $file = shift; # The file name in the final directory. Carp::carp_extra_args(\@_) if main::DEBUG && @_; - trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace; + trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace; if ($table->isa('Property')) { $table->set_file_path(@$directory_ref, $file); push @map_properties, $table; # No swash means don't do the rest of this. - return if $table->fate != $ORDINARY; + return if $table->fate != $ORDINARY + && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY); # Get the path to the file my @path = $table->file_path; @@ -14764,7 +15034,12 @@ sub register_file_for_name($$$) { # property's map table foreach my $alias ($table->aliases) { my $name = $alias->name; - $loose_property_to_file_of{standardize($name)} = $file; + if ($name =~ /^_/) { + $strict_property_to_file_of{lc $name} = $file; + } + else { + $loose_property_to_file_of{standardize($name)} = $file; + } } # And a way for utf8_heavy to find the proper key in the SwashInfo @@ -14972,7 +15247,22 @@ sub register_file_for_name($$$) { # Remove interior underscores. (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; - # Change any non-word character into an underscore, and truncate to 8. + # Convert the dot in floating point numbers to an underscore + $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x; + + my $suffix = ""; + + # Extract any suffix, delete any non-word character, and truncate to 3 + # after the dot + if ($filename =~ m/ ( .*? ) ( \. .* ) /x) { + $filename = $1; + $suffix = $2; + $suffix =~ s/\W+//g; + substr($suffix, 4) = "" if length($suffix) > 4; + } + + # Change any non-word character outside the suffix into an underscore, + # and truncate to 8. $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" substr($filename, 8) = "" if length($filename) > 8; @@ -14984,7 +15274,7 @@ sub register_file_for_name($$$) { # InGreekE # InGreek2 my $warned = 0; - while (my $num = $base_names{$path}{lc $filename}++) { + while (my $num = $base_names{$path}{lc "$filename$suffix"}++) { $num++; # so basenames with numbers start with '2', which # just looks more natural. @@ -15491,9 +15781,9 @@ sub make_ucd_table_pod_entries { $$info_ref .= $full_name; } - # And the full-name entry includes the short name, if different + # And the full-name entry includes the short name, if shorter if ($info_ref == \$full_info - && $standard_short_name ne $standard_full_name) + && length $standard_short_name < length $standard_full_name) { $full_info =~ s/\.\Z//; $full_info .= " " if $full_info; @@ -15517,6 +15807,17 @@ sub make_ucd_table_pod_entries { $full_info .= ". " if $full_info; $full_info .= $more_info; } + if ($table->property->type == $FORCED_BINARY) { + if ($full_info) { + $full_info =~ s/\.\Z//; + $full_info .= ". "; + } + $full_info .= "This is a combination property which has both:" + . " 1) a map to various string values; and" + . " 2) a map to boolean Y/N, where 'Y' means the" + . " string value is non-empty. Add the prefix 'is'" + . " to the prop_invmap() call to get the latter"; + } # These keep track if have created full and short name pod entries for the # property @@ -15548,6 +15849,9 @@ sub make_ucd_table_pod_entries { $info = $other_info; } + $combination_property{$standard} = 1 + if $table->property->type == $FORCED_BINARY; + # Here, we have set up the two columns for this entry. But if an # entry already exists for this name, we have to decide which one # we're going to later output. @@ -15614,9 +15918,9 @@ sub pod_alphanumeric_sort { # The first few character columns are filler, plus the '\p{'; and get rid # of all the trailing stuff, starting with the trailing '}', so as to sort # on just 'Name=Value' - (my $a = lc $a) =~ s/^ .*? { //x; + (my $a = lc $a) =~ s/^ .*? \{ //x; $a =~ s/}.*//; - (my $b = lc $b) =~ s/^ .*? { //x; + (my $b = lc $b) =~ s/^ .*? \{ //x; $b =~ s/}.*//; # Determine if the two operands are both internal only or both not. @@ -15761,6 +16065,7 @@ END # The sort will cause the alphabetically first properties to be added to # each list first, so each list will be sorted. foreach my $property (sort keys %why_suppressed) { + next unless $why_suppressed{$property}; push @{$why_list{$why_suppressed{$property}}}, $property; } @@ -15823,13 +16128,12 @@ END } # Similiarly, generate a list of files that we don't use, grouped by the - # reasons why. First, create a hash whose keys are the reasons, and whose - # values are anonymous arrays of all the files that share that reason. + # reasons why (Don't output if the reason is empty). First, create a hash + # whose keys are the reasons, and whose values are anonymous arrays of all + # the files that share that reason. my %grouped_by_reason; - foreach my $file (keys %ignored_files) { - push @{$grouped_by_reason{$ignored_files{$file}}}, $file; - } foreach my $file (keys %skipped_files) { + next unless $skipped_files{$file}; push @{$grouped_by_reason{$skipped_files{$file}}}, $file; } @@ -16284,6 +16588,10 @@ sub make_Heavy () { = simple_dumper(\%loose_property_name_of, ' ' x 4); chomp $loose_property_name_of; + my $strict_property_name_of + = simple_dumper(\%strict_property_name_of, ' ' x 4); + chomp $strict_property_name_of; + my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); chomp $stricter_to_file_of; @@ -16322,6 +16630,10 @@ sub make_Heavy () { = simple_dumper(\%loose_property_to_file_of, ' ' x 4); chomp $loose_property_to_file_of; + my $strict_property_to_file_of + = simple_dumper(\%strict_property_to_file_of, ' ' x 4); + chomp $strict_property_to_file_of; + my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4); chomp $file_to_swash_name; @@ -16337,6 +16649,11 @@ $INTERNAL_ONLY_HEADER $loose_property_name_of ); +# Same, but strict names +\%utf8::strict_property_name_of = ( +$strict_property_name_of +); + # Gives the definitions (in the form of inversion lists) for those properties # whose definitions aren't kept in files \@utf8::inline_definitions = ( @@ -16385,6 +16702,11 @@ $caseless_equivalent_to $loose_property_to_file_of ); +# Property names to mapping files +\%utf8::strict_property_to_file_of = ( +$strict_property_to_file_of +); + # Files to the swash names within them. \%utf8::file_to_swash_name = ( $file_to_swash_name @@ -16756,8 +17078,8 @@ sub make_UCD () { # an element for the Hangul syllables in the appropriate place, and # otherwise changes the name to include the "-<code point>" suffix. my @algorithm_names; - my $done_hangul = 0; - + my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came + # along in this version # Copy it linearly. for my $i (0 .. @code_points_ending_in_code_point - 1) { @@ -16808,6 +17130,9 @@ sub make_UCD () { my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4); chomp $ambiguous_names; + my $combination_property = simple_dumper(\%combination_property, ' ' x 4); + chomp $combination_property; + my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4); chomp $loose_defaults; @@ -16872,6 +17197,13 @@ $ambiguous_names $loose_defaults ); +# The properties that are combinations, in that they have both a map table and +# a match table. This is actually for UCD.t, so it knows how to test for +# these. +\%Unicode::UCD::combination_property = ( +$combination_property +); + # All combinations of names that are suppressed. # This is actually for UCD.t, so it knows which properties shouldn't have # entries. If it got any bigger, would probably want to put it in its own @@ -16972,14 +17304,8 @@ sub write_all_tables() { # with it or not. my $expected_empty = - # $perl should be empty, as well as properties that we just - # don't do anything with - ($is_property - && ($table == $perl - || grep { $complete_name eq $_ } - @unimplemented_properties - ) - ) + # $perl should be empty + ($is_property && ($table == $perl)) # Match tables in properties we skipped populating should be # empty @@ -17045,7 +17371,7 @@ sub write_all_tables() { : ($is_property) ? # All these types of map tables will be full because # they will have been populated with defaults - ($type == $ENUM || $type == $FORCED_BINARY) + ($type == $ENUM) : # A match table should match everything if its method # shows it should @@ -17206,12 +17532,14 @@ sub write_all_tables() { } } else { - if (exists ($loose_property_name_of{$alias_standard})) - { - Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained"); + my $hash_ref = ($alias_standard =~ /^_/) + ? \%strict_property_name_of + : \%loose_property_name_of; + if (exists $hash_ref->{$alias_standard}) { + Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained"); } else { - $loose_property_name_of{$alias_standard} + $hash_ref->{$alias_standard} = $standard_property_name; } @@ -17462,7 +17790,7 @@ sub generate_tests($$$$$) { my @output; # Create a complete set of tests, with complements. if (defined $valid_code) { - push @output, <<"EOC" + push @output, <<"EOC" Expect(1, $valid_code, '\\p{$name}', $warning); Expect(0, $valid_code, '\\p{^$name}', $warning); Expect(0, $valid_code, '\\P{$name}', $warning); @@ -17470,7 +17798,7 @@ Expect(1, $valid_code, '\\P{^$name}', $warning); EOC } if (defined $invalid_code) { - push @output, <<"EOC" + push @output, <<"EOC" Expect(0, $invalid_code, '\\p{$name}', $warning); Expect(1, $invalid_code, '\\p{^$name}', $warning); Expect(1, $invalid_code, '\\P{$name}', $warning); @@ -17708,8 +18036,10 @@ sub make_property_test_script() { # Test each possible combination of the property's aliases with # the table's. If this gets to be too many, could do what is done # in the set_final_comment() for Tables - my @table_aliases = $table->aliases; - my @property_aliases = $table->property->aliases; + my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases; + next unless @table_aliases; + my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases; + next unless @property_aliases; # Every property can be optionally be prefixed by 'Is_', so test # that those work, by creating such a new alias for each @@ -17941,84 +18271,97 @@ END return; } +# Skip reasons, so will be exact same text and hence the files with each +# reason will get grouped together in perluniprops. +my $Documentation = "Documentation"; +my $Indic_Skip + = "Provisional; for the analysis and processing of Indic scripts"; +my $Validation = "Validation Tests"; +my $Validation_Documentation = "Documentation of validation Tests"; + # This is a list of the input files and how to handle them. The files are # processed in their order in this list. Some reordering is possible if -# desired, but the v0 files should be first, and the extracted before the -# others except DAge.txt (as data in an extracted file can be over-ridden by -# the non-extracted. Some other files depend on data derived from an earlier -# file, like UnicodeData requires data from Jamo, and the case changing and -# folding requires data from Unicode. Mostly, it is safest to order by first -# version releases in (except the Jamo). DAge.txt is read before the -# extracted ones because of the rarely used feature $compare_versions. In the -# unlikely event that there were ever an extracted file that contained the Age -# property information, it would have to go in front of DAge. +# desired, but the PropertyAliases and PropValueAliases files should be first, +# and the extracted before the others except DAge.txt (as data in an extracted +# file can be over-ridden by the non-extracted. Some other files depend on +# data derived from an earlier file, like UnicodeData requires data from Jamo, +# and the case changing and folding requires data from Unicode. Mostly, it is +# safest to order by first version releases in (except the Jamo). DAge.txt is +# read before the extracted ones because of the rarely used feature +# $compare_versions. In the unlikely event that there were ever an extracted +# file that contained the Age property information, it would have to go in +# front of DAge. # # The version strings allow the program to know whether to expect a file or # not, but if a file exists in the directory, it will be processed, even if it # is in a version earlier than expected, so you can copy files from a later # release into an earlier release's directory. my @input_file_objects = ( - Input_file->new('PropertyAliases.txt', v0, + Input_file->new('PropertyAliases.txt', v3.2, Handler => \&process_PropertyAliases, - ), + Early => [ \&substitute_PropertyAliases ], + Required_Even_in_Debug_Skip => 1, + ), Input_file->new(undef, v0, # No file associated with this Progress_Message => 'Finishing property setup', Handler => \&finish_property_setup, - ), - Input_file->new('PropValueAliases.txt', v0, + ), + Input_file->new('PropValueAliases.txt', v3.2, Handler => \&process_PropValueAliases, + Early => [ \&substitute_PropValueAliases ], Has_Missings_Defaults => $NOT_IGNORED, - ), + Required_Even_in_Debug_Skip => 1, + ), Input_file->new('DAge.txt', v3.2.0, Has_Missings_Defaults => $NOT_IGNORED, Property => 'Age' - ), + ), Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, Property => 'General_Category', - ), + ), Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, Property => 'Canonical_Combining_Class', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, Property => 'Numeric_Type', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, Property => 'East_Asian_Width', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, Property => 'Line_Break', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, Property => 'Bidi_Class', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, Property => 'Decomposition_Type', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, Property => 'Numeric_Value', Each_Line_Handler => \&filter_numeric_value_line, Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, Property => 'Joining_Group', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, Property => 'Joining_Type', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new('Jamo.txt', v2.0.0, Property => 'Jamo_Short_Name', Each_Line_Handler => \&filter_jamo_line, - ), + ), Input_file->new('UnicodeData.txt', v1.1.5, Pre_Handler => \&setup_UnicodeData, @@ -18053,7 +18396,12 @@ my @input_file_objects = ( \&filter_UnicodeData_line, ], EOF_Handler => \&EOF_UnicodeData, - ), + ), + Input_file->new('CJKXREF.TXT', v1.1.5, + Withdrawn => v2.0.0, + Skip => 'Gives the mapping of CJK code points ' + . 'between Unicode and various other standards', + ), Input_file->new('ArabicShaping.txt', v2.0.0, Each_Line_Handler => ($v_version lt 4.1.0) @@ -18063,29 +18411,46 @@ my @input_file_objects = ( # not used by Perl Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ], Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new('Blocks.txt', v2.0.0, Property => 'Block', Has_Missings_Defaults => $NOT_IGNORED, Each_Line_Handler => \&filter_blocks_lines - ), + ), + Input_file->new('Index.txt', v2.0.0, + Skip => 'Alphabetical index of Unicode characters', + ), + Input_file->new('NamesList.txt', v2.0.0, + Skip => 'Annotated list of characters', + ), Input_file->new('PropList.txt', v2.0.0, Each_Line_Handler => (($v_version lt v3.1.0) ? \&filter_old_style_proplist : undef), - ), + ), + Input_file->new('Props.txt', v2.0.0, + Withdrawn => v3.0.0, + Skip => 'A subset of F<PropList.txt> (which is used instead)', + ), + Input_file->new('ReadMe.txt', v2.0.0, + Skip => $Documentation, + ), Input_file->new('Unihan.txt', v2.0.0, + Withdrawn => v5.2.0, + Construction_Time_Handler => \&construct_unihan, Pre_Handler => \&setup_unihan, - Optional => 1, + Optional => [ "", + 'Unicode_Radical_Stroke' + ], Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('SpecialCasing.txt', v2.1.8, Each_Line_Handler => ($v_version eq 2.1.8) ? \&filter_2_1_8_special_casing_line : \&filter_special_casing_line, Pre_Handler => \&setup_special_casing, Has_Missings_Defaults => $IGNORED, - ), + ), Input_file->new( 'LineBreak.txt', v3.0.0, Has_Missings_Defaults => $NOT_IGNORED, @@ -18094,7 +18459,7 @@ my @input_file_objects = ( Each_Line_Handler => (($v_version lt v3.1.0) ? \&filter_early_ea_lb : undef), - ), + ), Input_file->new('EastAsianWidth.txt', v3.0.0, Property => 'East_Asian_Width', Has_Missings_Defaults => $NOT_IGNORED, @@ -18102,10 +18467,14 @@ my @input_file_objects = ( Each_Line_Handler => (($v_version lt v3.1.0) ? \&filter_early_ea_lb : undef), - ), + ), Input_file->new('CompositionExclusions.txt', v3.0.0, Property => 'Composition_Exclusion', - ), + ), + Input_file->new('UnicodeData.html', v3.0.0, + Withdrawn => v4.0.1, + Skip => $Documentation, + ), Input_file->new('BidiMirroring.txt', v3.0.1, Property => 'Bidi_Mirroring_Glyph', Has_Missings_Defaults => ($v_version lt v6.2.0) @@ -18114,12 +18483,15 @@ my @input_file_objects = ( # anything to us, we will use the # null string : $IGNORED, - - ), - Input_file->new("NormTest.txt", v3.0.0, - Handler => \&process_NormalizationsTest, - Skip => ($make_norm_test_script) ? 0 : 'Validation Tests', - ), + ), + Input_file->new('NamesList.html', v3.0.0, + Skip => 'Describes the format and contents of ' + . 'F<NamesList.txt>', + ), + Input_file->new('UnicodeCharacterDatabase.html', v3.0.0, + Withdrawn => v5.1, + Skip => $Documentation, + ), Input_file->new('CaseFolding.txt', v3.0.1, Pre_Handler => \&setup_case_folding, Each_Line_Handler => @@ -18129,105 +18501,222 @@ my @input_file_objects = ( \&filter_case_folding_line ], Has_Missings_Defaults => $IGNORED, - ), + ), + Input_file->new("NormTest.txt", v3.0.1, + Handler => \&process_NormalizationsTest, + Skip => ($make_norm_test_script) ? 0 : $Validation, + ), Input_file->new('DCoreProperties.txt', v3.1.0, # 5.2 changed this file Has_Missings_Defaults => (($v_version ge v5.2.0) ? $NOT_IGNORED : $NO_DEFAULTS), - ), + ), + Input_file->new('DProperties.html', v3.1.0, + Withdrawn => v3.2.0, + Skip => $Documentation, + ), + Input_file->new('PropList.html', v3.1.0, + Withdrawn => v5.1, + Skip => $Documentation, + ), Input_file->new('Scripts.txt', v3.1.0, Property => 'Script', + Each_Line_Handler => (($v_version le v4.0.0) + ? \&filter_all_caps_script_names + : undef), Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new('DNormalizationProps.txt', v3.1.0, Has_Missings_Defaults => $NOT_IGNORED, Each_Line_Handler => (($v_version lt v4.0.1) ? \&filter_old_style_normalization_lines : undef), - ), - Input_file->new('HangulSyllableType.txt', v0, + ), + Input_file->new('DerivedProperties.html', v3.1.1, + Withdrawn => v5.1, + Skip => $Documentation, + ), + Input_file->new('HangulSyllableType.txt', v4.0, Has_Missings_Defaults => $NOT_IGNORED, - Property => 'Hangul_Syllable_Type', - Pre_Handler => ($v_version lt v4.0.0) - ? \&generate_hst - : undef, - ), + Early => [ \&generate_hst, 'Hangul_Syllable_Type' ], + Property => 'Hangul_Syllable_Type' + ), + Input_file->new('NormalizationCorrections.txt', v3.2.0, + # This documents the cumulative fixes to erroneous + # normalizations in earlier Unicode versions. Its main + # purpose is so that someone running on an earlier + # version can use this file to override what got + # published in that earlier release. It would be easy + # for mktables to handle this file. But all the + # corrections in it should already be in the other files + # for the release it is. To get it to actually mean + # something useful, someone would have to be using an + # earlier Unicode release, and copy it into the directory + # for that release and recomplile. So far there has been + # no demand to do that, so this hasn't been implemented. + Skip => 'Documentation of corrections already ' + . 'incorporated into the Unicode data base', + ), + Input_file->new('StandardizedVariants.html', v3.2.0, + Skip => 'Provides a visual display of the standard ' + . 'variant sequences derived from ' + . 'F<StandardizedVariants.txt>.', + # I don't know why the html came earlier than the + # .txt, but both are skipped anyway, so it doesn't + # matter. + ), + Input_file->new('StandardizedVariants.txt', v4.0.0, + Skip => 'Certain glyph variations for character display ' + . 'are standardized. This lists the non-Unihan ' + . 'ones; the Unihan ones are also not used by ' + . 'Perl, and are in a separate Unicode data base ' + . 'L<http://www.unicode.org/ivd>', + ), + Input_file->new('UCD.html', v4.0.0, + Withdrawn => v5.2, + Skip => $Documentation, + ), Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, + Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ], Property => 'Word_Break', Has_Missings_Defaults => $NOT_IGNORED, - ), - Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0, + ), + Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1, + Early => [ \&generate_GCB, '_Perl_GCB' ], Property => 'Grapheme_Cluster_Break', Has_Missings_Defaults => $NOT_IGNORED, - Pre_Handler => ($v_version lt v4.1.0) - ? \&generate_GCB - : undef, - ), + ), Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, Handler => \&process_GCB_test, - ), - Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, - Skip => 'Validation Tests', - ), + ), + Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0, + Skip => $Validation_Documentation, + ), Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, Handler => \&process_SB_test, - ), + ), + Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0, + Skip => $Validation_Documentation, + ), Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, Handler => \&process_WB_test, - ), + ), + Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0, + Skip => $Validation_Documentation, + ), Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, Property => 'Sentence_Break', + Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ], Has_Missings_Defaults => $NOT_IGNORED, - ), + ), Input_file->new('NamedSequences.txt', v4.1.0, Handler => \&process_NamedSequences - ), - Input_file->new('NameAliases.txt', v0, + ), + Input_file->new('Unihan.html', v4.1.0, + Withdrawn => v5.2, + Skip => $Documentation, + ), + Input_file->new('NameAliases.txt', v5.0, Property => 'Name_Alias', - Pre_Handler => ($v_version le v6.0.0) - ? \&setup_early_name_alias - : undef, Each_Line_Handler => ($v_version le v6.0.0) ? \&filter_early_version_name_alias_line : \&filter_later_version_name_alias_line, - ), + ), + # NameAliases.txt came along in v5.0. The above constructor handles + # this. But until 6.1, it was lacking some information needed by core + # perl. The constructor below handles that. It is either a kludge or + # clever, depending on your point of view. The 'Withdrawn' parameter + # indicates not to use it at all starting in 6.1 (so the above + # constructor applies), and the 'v6.1' parameter indicates to use the + # Early parameter before 6.1. Therefore 'Early" is always used, + # yielding the internal-only property '_Perl_Name_Alias', which it + # gets from a NameAliases.txt from 6.1 or later stored in + # N_Asubst.txt. In combination with the above constructor, + # 'Name_Alias' is publicly accessible starting with v5.0, and the + # better 6.1 version is accessible to perl core in all releases. + Input_file->new("NameAliases.txt", v6.1, + Withdrawn => v6.1, + Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], + Property => 'Name_Alias', + EOF_Handler => \&fixup_early_perl_name_alias, + Each_Line_Handler => + \&filter_later_version_name_alias_line, + ), + Input_file->new('NamedSqProv.txt', v5.0.0, + Skip => 'Named sequences proposed for inclusion in a ' + . 'later version of the Unicode Standard; if you ' + . 'need them now, you can append this file to ' + . 'F<NamedSequences.txt> and recompile perl', + ), + Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0, + Skip => $Validation, + ), + Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0, + Skip => $Validation_Documentation, + ), Input_file->new("BidiTest.txt", v5.2.0, - Skip => 'Validation Tests', - ), + Skip => $Validation, + ), Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, - Optional => 1, + Optional => "", Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, - Optional => 1, + Optional => "", Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanIRGSources.txt', v5.2.0, - Optional => 1, + Optional => [ "", + 'kCompatibilityVariant', + 'kIICore', + 'kIRG_GSource', + 'kIRG_HSource', + 'kIRG_JSource', + 'kIRG_KPSource', + 'kIRG_MSource', + 'kIRG_KSource', + 'kIRG_TSource', + 'kIRG_USource', + 'kIRG_VSource', + ], Pre_Handler => \&setup_unihan, Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanNumericValues.txt', v5.2.0, - Optional => 1, + Optional => [ "", + 'kAccountingNumeric', + 'kOtherNumeric', + 'kPrimaryNumeric', + ], Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanOtherMappings.txt', v5.2.0, - Optional => 1, + Optional => "", Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, - Optional => 1, + Optional => [ "", + 'Unicode_Radical_Stroke' + ], Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanReadings.txt', v5.2.0, - Optional => 1, + Optional => "", Each_Line_Handler => \&filter_unihan_line, - ), + ), Input_file->new('UnihanVariants.txt', v5.2.0, - Optional => 1, + Optional => "", Each_Line_Handler => \&filter_unihan_line, - ), + ), + Input_file->new('CJKRadicals.txt', v5.2.0, + Skip => 'Maps the kRSUnicode property values to ' + . 'corresponding code points', + ), + Input_file->new('EmojiSources.txt', v6.0.0, + Skip => 'Maps certain Unicode code points to their ' + . 'legacy Japanese cell-phone values', + ), Input_file->new('ScriptExtensions.txt', v6.0.0, Property => 'Script_Extensions', Pre_Handler => \&setup_script_extensions, @@ -18235,39 +18724,74 @@ my @input_file_objects = ( Has_Missings_Defaults => (($v_version le v6.0.0) ? $NO_DEFAULTS : $IGNORED), - ), - # The two Indic files are actually available starting in v6.0.0, but their - # property values are missing from PropValueAliases.txt in that release, - # so that further work would have to be done to get them to work properly - # for that release. - Input_file->new('IndicMatraCategory.txt', v6.1.0, + ), + # These two Indic files are actually not usable as-is until 6.1.0, + # because their property values are missing from PropValueAliases.txt + # until that release, so that further work would have to be done to get + # them to work properly, which isn't worth it because of them being + # provisional. + Input_file->new('IndicMatraCategory.txt', v6.0.0, + Withdrawn => v8.0.0, Property => 'Indic_Matra_Category', Has_Missings_Defaults => $NOT_IGNORED, - Skip => "Withdrawn by Unicode while still provisional", - ), - Input_file->new('IndicSyllabicCategory.txt', v6.1.0, + Skip => $Indic_Skip, + ), + Input_file->new('IndicSyllabicCategory.txt', v6.0.0, Property => 'Indic_Syllabic_Category', Has_Missings_Defaults => $NOT_IGNORED, Skip => (($v_version lt v8.0.0) - ? "Provisional; for the analysis and processing of Indic scripts" + ? $Indic_Skip : 0), - ), + ), + Input_file->new('USourceData.txt', v6.2.0, + Skip => 'Documentation of status and cross reference of ' + . 'proposals for encoding by Unicode of Unihan ' + . 'characters', + ), + Input_file->new('USourceGlyphs.pdf', v6.2.0, + Skip => 'Pictures of the characters in F<USourceData.txt>', + ), Input_file->new('BidiBrackets.txt', v6.3.0, - Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ], + Properties => [ 'Bidi_Paired_Bracket', + 'Bidi_Paired_Bracket_Type' + ], Has_Missings_Defaults => $NO_DEFAULTS, - ), + ), Input_file->new("BidiCharacterTest.txt", v6.3.0, - Skip => 'Validation Tests', - ), + Skip => $Validation, + ), Input_file->new('IndicPositionalCategory.txt', v8.0.0, Property => 'Indic_Positional_Category', Has_Missings_Defaults => $NOT_IGNORED, - ), + ), ); # End of all the preliminaries. # Do it... +if (@missing_early_files) { + print simple_fold(join_lines(<<END + +The compilation cannot be completed because one or more required input files, +listed below, are missing. This is because you are compiling Unicode version +$string_version, which predates the existence of these file(s). To fully +function, perl needs the data that these files would have contained if they +had been in this release. To work around this, create copies of later +versions of the missing files in the directory containing '$0'. (Perl will +make the necessary adjustments to the data to compensate for it not being the +same version as is being compiled.) The files are available from unicode.org, +via either ftp or http. If using http, they will be under +www.unicode.org/versions/. Below are listed the source file name of each +missing file, the Unicode version to copy it from, and the name to store it +as. (Note that the listed source file name may not be exactly the one that +Unicode calls it. If you don't find it, you can look it up in 'README.perl' +to get the correct name.) +END + )); + print simple_fold(join_lines("\n$_")) for @missing_early_files; + exit 2; +} + if ($compare_versions) { Carp::my_carp(<<END Warning. \$compare_versions is set. Output is not suitable for production @@ -18276,17 +18800,13 @@ END } # Put into %potential_files a list of all the files in the directory structure -# that could be inputs to this program, excluding those that we should ignore. -# Use absolute file names because it makes it easier across machine types. -my @ignored_files_full_names = map { File::Spec->rel2abs( - internal_file_to_platform($_)) - } keys %ignored_files; +# that could be inputs to this program File::Find::find({ wanted=>sub { - return unless /\.txt$/i; # Some platforms change the name's case + return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the + # name's case my $full = lc(File::Spec->rel2abs($_)); - $potential_files{$full} = 1 - if ! grep { $full eq lc($_) } @ignored_files_full_names; + $potential_files{$full} = 1; return; } }, File::Spec->curdir()); @@ -18344,8 +18864,7 @@ else { # The paths are stored with relative names, and with '/' as the # delimiter; convert to absolute on this machine my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); - $potential_files{lc $full} = 1 - if ! grep { lc($full) eq lc($_) } @ignored_files_full_names; + $potential_files{lc $full} = 1; } } @@ -18422,7 +18941,7 @@ my @input_files = qw(version Makefile); foreach my $object (@input_file_objects) { my $file = $object->file; next if ! defined $file; # Not all objects have files - next if $object->optional && ! -e $file; + next if defined $object->skip;; push @input_files, $file; } @@ -18444,7 +18963,6 @@ foreach my $in (@input_files) { my ($volume, $directories, $file ) = File::Spec->splitpath($in); $directories =~ s;/$;;; # Can have extraneous trailing '/' my @directories = File::Spec->splitdir($directories); - my $base = $file =~ s/\.txt$//; construct_filename($file, 'mutable', \@directories); } } @@ -18593,6 +19111,16 @@ use warnings; my $Tests = 0; my $Fails = 0; +# loc_tools.pl requires this function to be defined +sub ok($@) { + my ($pass, @msg) = @_; + print "not " unless $pass; + print "ok "; + print ++$Tests; + print " - ", join "", @msg if @msg; + print "\n"; +} + sub Expect($$$$) { my $expected = shift; my $ord = shift; diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 1ba73b2874..51137b5623 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -404,7 +404,11 @@ sub _loose_name ($) { # If didn't find it, try again with looser matching by editing # out the applicable characters on the rhs and looking up # again. + my $strict_property_and_table; if (! defined $file) { + + # This isn't used unless the name begins with 'to' + $strict_property_and_table = $property_and_table =~ s/^to//r; $table = _loose_name($table); $property_and_table = "$prefix$table"; print STDERR __LINE__, ": $property_and_table\n" if DEBUG; @@ -443,61 +447,51 @@ sub _loose_name ($) { ## is to use Unicode::UCD. ## # Only check if caller wants non-binary - my $retried = 0; - if ($minbits != 1 && $property_and_table =~ s/^to//) {{ + if ($minbits != 1) { + if ($property_and_table =~ s/^to//) { # Look input up in list of properties for which we have - # mapping files. - if (defined ($file = + # mapping files. First do it with the strict approach + if (defined ($file = $utf8::strict_property_to_file_of{ + $strict_property_and_table})) + { + $type = $utf8::file_to_swash_name{$file}; + print STDERR __LINE__, ": type set to $type\n" + if DEBUG; + $file = "$unicore_dir/$file.pl"; + last GETFILE; + } + elsif (defined ($file = $utf8::loose_property_to_file_of{$property_and_table})) - { - $type = $utf8::file_to_swash_name{$file}; - print STDERR __LINE__, ": type set to $type\n" if DEBUG; - $file = "$unicore_dir/$file.pl"; - last GETFILE; - } # If that fails see if there is a corresponding binary - # property file - elsif (defined ($file = - $utf8::loose_to_file_of{$property_and_table})) - { + { + $type = $utf8::file_to_swash_name{$file}; + print STDERR __LINE__, ": type set to $type\n" + if DEBUG; + $file = "$unicore_dir/$file.pl"; + last GETFILE; + } # If that fails see if there is a corresponding binary + # property file + elsif (defined ($file = + $utf8::loose_to_file_of{$property_and_table})) + { - # Here, there is no map file for the property we are - # trying to get the map of, but this is a binary - # property, and there is a file for it that can easily - # be translated to a mapping. - - # In the case of properties that are forced to binary, - # they are a combination. We return the actual - # mapping instead of the binary. If the input is - # something like 'Tocjkkiicore', it will be found in - # %loose_property_to_file_of above as => 'To/kIICore'. - # But the form like ToIskiicore won't be. To fix - # this, it was easiest to do it here. These - # properties are the complements of the default - # property, so there is an entry in %loose_to_file_of - # that is 'iskiicore' => '!kIICore/N', If we find such - # an entry, strip off things and try again, which - # should find the entry in %loose_property_to_file_of. - # Actual binary properties that are of this form, such - # as this entry: 'ishrkt' => '!Perl/Any' will also be - # retried, but won't be in %loose_property_to_file_of, - # and instead the next time through, it will find - # 'hrkt' => '!Perl/Any' and proceed. - redo if ! $retried - && $file =~ /^!/ - && $property_and_table =~ s/^is//; - - # This is a binary property. Setting this here causes - # it to be stored as such in the cache, so if someone - # comes along later looking for just a binary, they - # get it. - $minbits = 1; - - # The 0+ makes sure is numeric - $invert_it = 0 + $file =~ s/!//; - $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!; - last GETFILE; + # Here, there is no map file for the property we + # are trying to get the map of, but this is a + # binary property, and there is a file for it that + # can easily be translated to a mapping, so use + # that, treating this as a binary property. + # Setting 'minbits' here causes it to be stored as + # such in the cache, so if someone comes along + # later looking for just a binary, they get it. + $minbits = 1; + + # The 0+ makes sure is numeric + $invert_it = 0 + $file =~ s/!//; + $file = "$unicore_dir/lib/$file.pl" + unless $file =~ m!^#/!; + last GETFILE; + } } - } } + } ## ## If we reach this line, it's because we couldn't figure @@ -389,9 +389,9 @@ perl_construct(pTHXx) PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist); PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist); - PL_SB_invlist = _new_invlist_C_array(Sentence_Break_invlist); - PL_WB_invlist = _new_invlist_C_array(Word_Break_invlist); + PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist); + PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist); + PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist); ENTER; } @@ -4779,7 +4779,14 @@ EXTCONST unsigned char PL_mod_latin1_uc[] = { 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, - 216, 217, 218, 219, 220, 221, 222, 255 /*sharp s*/, + 216, 217, 218, 219, 220, 221, 222, +#if UNICODE_MAJOR_VERSION > 2 \ + || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ + && UNICODE_DOT_DOT_VERSION >= 8) + 255 /*sharp s*/, +#else /* uc() is itself in early unicode */ + 223, +#endif 224-32, 225-32, 226-32, 227-32, 228-32, 229-32, 230-32, 231-32, 232-32, 233-32, 234-32, 235-32, 236-32, 237-32, 238-32, 239-32, 240-32, 241-32, 242-32, 243-32, 244-32, 245-32, 246-32, 247, diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 06f2ee271f..1580b43471 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -338,7 +338,10 @@ files in F<ext/> and F<lib/> are best summarized in L</Modules and Pragmata>. =item * -XXX +Perl can again be compiled with any Unicode version. This used to +(mostly) work, but was lost in v5.18 through v5.20. The property +C<Name_Alias> did not exist prior to Unicode 5.0. L<Unicode::UCD> +incorrectly said it did. This has been fixed. =back @@ -4096,6 +4096,9 @@ PP(pp_uc) * just above. * Use the source to distinguish between the three cases */ +#if UNICODE_MAJOR_VERSION > 2 \ + || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ + && UNICODE_DOT_DOT_VERSION >= 8) if (*s == LATIN_SMALL_LETTER_SHARP_S) { /* uc() of this requires 2 characters, but they are @@ -4108,6 +4111,7 @@ PP(pp_uc) *d++ = 'S'; *d = 'S'; /* upper case is 'SS' */ continue; /* Back to the tight loop; still in ASCII */ } +#endif /* The other two special handling characters have their * upper cases outside the latin1 range, hence need to be @@ -4401,8 +4405,14 @@ PP(pp_fc) const U8 *send; U8 *d; U8 tmpbuf[UTF8_MAXBYTES_CASE + 1]; +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) const bool full_folding = TRUE; /* This variable is here so we can easily move to more generality later */ +#else + const bool full_folding = FALSE; +#endif const U8 flags = ( full_folding ? FOLD_FLAGS_FULL : 0 ) #ifdef USE_LOCALE_CTYPE | ( IN_LC_RUNTIME(LC_CTYPE) ? FOLD_FLAGS_LOCALE : 0 ) diff --git a/regcharclass.h b/regcharclass.h index c59ea78b3e..f7cd2fde5b 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -168,7 +168,7 @@ /* NONCHAR: Non character code points - \p{Nchar} + \p{_Perl_Nchar} */ /*** GENERATED CODE ***/ #define is_NONCHAR_utf8(s) \ @@ -185,7 +185,7 @@ /* SURROGATE: Surrogate characters - \p{Gc=Cs} + \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ #define is_SURROGATE_utf8(s) \ @@ -591,7 +591,7 @@ /* PATWS: pattern white space - \p{PatWS} + \p{_Perl_PatWS} */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ @@ -761,7 +761,7 @@ /* NONCHAR: Non character code points - \p{Nchar} + \p{_Perl_Nchar} */ /*** GENERATED CODE ***/ #define is_NONCHAR_utf8(s) \ @@ -782,7 +782,7 @@ /* SURROGATE: Surrogate characters - \p{Gc=Cs} + \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ #define is_SURROGATE_utf8(s) \ @@ -1205,7 +1205,7 @@ /* PATWS: pattern white space - \p{PatWS} + \p{_Perl_PatWS} */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ @@ -1373,7 +1373,7 @@ /* NONCHAR: Non character code points - \p{Nchar} + \p{_Perl_Nchar} */ /*** GENERATED CODE ***/ #define is_NONCHAR_utf8(s) \ @@ -1394,7 +1394,7 @@ /* SURROGATE: Surrogate characters - \p{Gc=Cs} + \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ #define is_SURROGATE_utf8(s) \ @@ -1825,7 +1825,7 @@ /* PATWS: pattern white space - \p{PatWS} + \p{_Perl_PatWS} */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ @@ -1993,7 +1993,7 @@ /* NONCHAR: Non character code points - \p{Nchar} + \p{_Perl_Nchar} */ /*** GENERATED CODE ***/ #define is_NONCHAR_utf8(s) \ @@ -2014,7 +2014,7 @@ /* SURROGATE: Surrogate characters - \p{Gc=Cs} + \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ #define is_SURROGATE_utf8(s) \ @@ -2445,7 +2445,7 @@ /* PATWS: pattern white space - \p{PatWS} + \p{_Perl_PatWS} */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ @@ -2472,7 +2472,7 @@ #endif /* H_REGCHARCLASS */ /* Generated from: - * 083180df694deb1fc173361406c1a75619fb8376403db3a76dc585c1e3951eca lib/Unicode/UCD.pm + * 0bca60a25eb4ccf2e04f50446db5f882322f50a9c61dc57bb806ccfc9b2e26a4 lib/Unicode/UCD.pm * ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt * 1687fe5994eb7e5c0dab8503fc2a1b3b479d91af9d3b8055941c9bd791f7d0b5 lib/unicore/BidiBrackets.txt * 350d1302116194b0b21def287434b55c5088098fbc726e879f7420a391965643 lib/unicore/BidiMirroring.txt @@ -2514,9 +2514,9 @@ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * c9326eab8d7861c3543963e555d5b927348f4467c93071db23154dece7619654 lib/unicore/mktables + * 46f739fb5c9daf6fb457ed67f821d88d9eadd2df17b098f385b3b50f99c01acf lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl - * 5e47f645eac3a918246254e19c06b604c8ea088cf62da5be84dcb953ef2bf16c regen/regcharclass.pl - * 206b60035ff0cec9f7d1701937ecf9226a943faa42dfc4827c37306be64ff18e regen/regcharclass_multi_char_folds.pl + * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl + * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ @@ -3652,6 +3652,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this function, we need to flag any occurrences of the sharp s. * This character forbids trie formation (because of added * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; @@ -3659,7 +3662,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, break; } s++; - continue; } } else { @@ -3705,6 +3707,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, *min_subtract += len - 1; s += len; } +#endif } } @@ -11505,8 +11508,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *character = (U8) code_point; len = 1; } /* Else is folded non-UTF8 */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { - +#else + else if (1) { +#endif /* We don't fold any non-UTF8 except possibly the Sharp s (see * comments at join_exact()); */ *character = (U8) code_point; @@ -11550,9 +11558,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, /* A single character node is SIMPLE, except for the special-cased SHARP S * under /di. */ if ((len == 1 || (UTF && len == UNISKIP(code_point))) - && (code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS)) - { +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + && ( code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS) +#endif + ) { *flagp |= SIMPLE; } @@ -12650,11 +12662,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else /* A regular FOLD code point */ if (! ( UTF +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S))) - { + && ender == LATIN_SMALL_LETTER_SHARP_S) +#endif + )) { /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in @@ -12667,11 +12683,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * 'ss' */ if (maybe_exactfu && (PL_fold[ender] != PL_fold_latin1[ender] +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) || ender == LATIN_SMALL_LETTER_SHARP_S || (len > 0 && isALPHA_FOLD_EQ(ender, 's') - && isALPHA_FOLD_EQ(*(s-1), 's')))) - { + && isALPHA_FOLD_EQ(*(s-1), 's')) +#endif + )) { maybe_exactfu = FALSE; } } @@ -14014,9 +14034,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); break; + +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + case LATIN_SMALL_LETTER_SHARP_S: *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); break; + +#endif + +#if UNICODE_MAJOR_VERSION < 3 \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0) + + /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did + * U+0131. */ + case 'i': + case 'I': + *invlist = + add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); +# if UNICODE_DOT_DOT_VERSION == 1 + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I); +# endif + break; +#endif + default: /* Use deprecated warning to increase the chances of this being * output */ @@ -14209,6 +14250,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); +#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 0) + allow_multi_folds = FALSE; +#endif + /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, (LOC) diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index 4b46bd0542..1cabfc4de9 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -51,6 +51,7 @@ my @properties = qw( # Read in the case fold mappings. my %folded_closure; +my %simple_folded_closure; my @hex_non_final_folds; my @non_latin1_simple_folds; my @folds; @@ -118,8 +119,14 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property for my $i (0 .. @folded - 1) { my $hex_fold = $folded[$i]; my $fold = hex $hex_fold; - push @{$folded_closure{$fold}}, $from if $fold < 256; - push @{$folded_closure{$from}}, $fold if $from < 256; + if ($fold < 256) { + push @{$folded_closure{$fold}}, $from; + push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F'; + } + if ($from < 256) { + push @{$folded_closure{$from}}, $fold; + push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F'; + } if (($fold_type eq 'C' || $fold_type eq 'S') && ($fold < 256 != $from < 256)) @@ -153,11 +160,16 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; } } + foreach my $folded (keys %simple_folded_closure) { + foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) { + push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}}; + } + } # We have the single-character folds that cross the 255/256, like KELVIN # SIGN => 'k', but we need the closure, so add like 'K' to it foreach my $folded (@non_latin1_simple_folds) { - foreach my $fold (@{$folded_closure{$folded}}) { + foreach my $fold (@{$simple_folded_closure{$folded}}) { if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { push @non_latin1_simple_folds, $fold; } @@ -217,12 +229,18 @@ for my $ord (0..255) { } elsif ($name eq 'SPACE') {; $re = qr/\p{XPerlSpace}/; } elsif ($name eq 'IDFIRST') { - $re = qr/[_\p{Alpha}]/; + $re = qr/[_\p{XPosixAlpha}]/; } elsif ($name eq 'WORDCHAR') { $re = qr/\p{XPosixWord}/; + } elsif ($name eq 'LOWER') { + $re = qr/\p{XPosixLower}/; + } elsif ($name eq 'UPPER') { + $re = qr/\p{XPosixUpper}/; } elsif ($name eq 'ALPHANUMERIC') { # Like \w, but no underscore $re = qr/\p{Alnum}/; + } elsif ($name eq 'ALPHA') { + $re = qr/\p{XPosixAlpha}/; } elsif ($name eq 'QUOTEMETA') { $re = qr/\p{_Perl_Quotemeta}/; } elsif ($name eq 'NONLATIN1_FOLD') { @@ -241,7 +259,7 @@ for my $ord (0..255) { use Carp; carp $@ if ! defined $re; } - #print "$ord, $name $property, $re\n"; + #print STDERR __LINE__, ": $ord, $name $property, $re\n"; if ($char =~ $re) { # Add this property if matches $bits[$ord] .= '|' if $bits[$ord]; $bits[$ord] .= "(1U<<_CC_$property)"; @@ -295,10 +313,11 @@ foreach my $charset (get_supported_code_pages()) { } else { use Unicode::UCD qw(prop_invmap); - my ($list_ref, $map_ref, $format) = prop_invmap("Name_Alias"); + my ($list_ref, $map_ref, $format) + = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); if ($format !~ /^s/) { use Carp; - carp "Unexpected format '$format' for 'Name_Alias"; + carp "Unexpected format '$format' for '_Perl_Name_Alias"; last; } my $which = Unicode::UCD::search_invlist($list_ref, $ord); diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 42f5b1c925..bc27d0f208 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -176,15 +176,14 @@ sub output_invlist ($$;$) { my $charset = shift // ""; # name of character set for comment die "No inversion list for $name" unless defined $invlist - && ref $invlist eq 'ARRAY' - && @$invlist; + && ref $invlist eq 'ARRAY'; # Output the inversion list $invlist using the name $name for it. # It is output in the exact internal form for inversion lists. # Is the last element of the header 0, or 1 ? my $zero_or_one = 0; - if ($invlist->[0] != 0) { + if (@$invlist && $invlist->[0] != 0) { unshift @$invlist, 0; $zero_or_one = 1; } @@ -235,8 +234,8 @@ sub output_invmap ($$$$$$$) { my $name_prefix; if ($input_format eq 's') { - $prop_name = (prop_aliases($prop_name))[1]; # Get full name - my $short_name = (prop_aliases($prop_name))[0]; + $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name + my $short_name = (prop_aliases($prop_name))[0] // $prop_name; my @enums = prop_values($prop_name); if (! @enums) { die "Only enum properties are currently handled; '$prop_name' isn't one"; @@ -267,7 +266,8 @@ sub output_invmap ($$$$$$$) { # Assign a value to each element of the enum. The default # value always gets 0; the others are arbitrarily assigned. my $enum_val = 0; - $default = prop_value_aliases($prop_name, $default); + my $canonical_default = prop_value_aliases($prop_name, $default); + $default = $canonical_default if defined $canonical_default; $enums{$default} = $enum_val++; for my $enum (@enums) { $enums{$enum} = $enum_val++ unless exists $enums{$enum}; @@ -315,7 +315,9 @@ sub output_invmap ($$$$$$$) { # The main body are the scalars passed in to this routine. for my $i (0 .. $count - 1) { my $element = $invmap->[$i]; - $element = $name_prefix . prop_value_aliases($prop_name, $element); + my $full_element_name = prop_value_aliases($prop_name, $element); + $element = $full_element_name if defined $full_element_name; + $element = $name_prefix . $element; print $out_fh "\t$element"; print $out_fh "," if $i < $count - 1; print $out_fh "\n"; @@ -353,7 +355,8 @@ sub mk_invlist_from_sorted_cp_list { my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" - unless $format eq 'al'; + unless $format eq 'al' + || $format eq 'a'; my @has_multi_char_fold; my @is_non_final_fold; @@ -447,9 +450,9 @@ for my $charset (get_supported_code_pages()) { &UpperLatin1 _Perl_IDStart _Perl_IDCont - Grapheme_Cluster_Break,EDGE - Word_Break,EDGE,UNKNOWN - Sentence_Break,EDGE + _Perl_GCB,EDGE + _Perl_WB,EDGE,UNKNOWN + _Perl_SB,EDGE ) ) { @@ -489,20 +492,37 @@ for my $charset (get_supported_code_pages()) { else { @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); if (! @invlist) { - my ($list_ref, $map_ref, $format, $default); - ($list_ref, $map_ref, $format, $default) + # If couldn't find a non-empty inversion list, see if it is + # instead an inversion map + my ($list_ref, $map_ref, $format, $default) = prop_invmap($lookup_prop, '_perl_core_internal_ok'); - die "Could not find inversion list for '$lookup_prop'" unless $list_ref; - @invlist = @$list_ref; - @invmap = @$map_ref; - $map_format = $format; - $map_default = $default; - $maps_to_code_point = $map_format =~ /x/; - $to_adjust = $map_format =~ /a/; + if (! $list_ref) { + # An empty return here could mean an unknown property, or + # merely that the original inversion list is empty. Call + # in scalar context to differentiate + my $count = prop_invlist($lookup_prop, + '_perl_core_internal_ok'); + die "Could not find inversion list for '$lookup_prop'" + unless defined $count; + } + else { + @invlist = @$list_ref; + @invmap = @$map_ref; + $map_format = $format; + $map_default = $default; + $maps_to_code_point = $map_format =~ /x/; + $to_adjust = $map_format =~ /a/; + } } } - die "Could not find inversion list for '$lookup_prop'" unless @invlist; + + + # Short-circuit an empty inversion list. + if (! @invlist) { + output_invlist($prop_name, \@invlist, $charset); + next; + } # Re-order the Unicode code points to native ones for this platform. # This is only needed for code points below 256, because native code diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 184437e88f..9115eafeb6 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -610,6 +610,8 @@ sub length_optree { my $else= ( $opt{else} ||= 0 ); + return $else if $self->{count} == 0; + my $method = $type =~ /generic/ ? 'generic_optree' : 'optree'; if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) { @@ -1629,11 +1631,11 @@ REPLACEMENT: Unicode REPLACEMENT CHARACTER NONCHAR: Non character code points => UTF8 :fast -\p{Nchar} +\p{_Perl_Nchar} SURROGATE: Surrogate characters => UTF8 :fast -\p{Gc=Cs} +\p{_Perl_Surrogate} # This program was run with this enabled, and the results copied to utf8.h; # then this was commented out because it takes so long to figure out these 2 @@ -1695,4 +1697,4 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are PATWS: pattern white space => generic cp : safe -\p{PatWS} +\p{_Perl_PatWS} diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl index dfc8f9f3e5..5ea9d33a25 100644 --- a/regen/regcharclass_multi_char_folds.pl +++ b/regen/regcharclass_multi_char_folds.pl @@ -63,6 +63,8 @@ sub multi_char_folds ($) { # multi-char folds; false if just the ones that # are all ascii + return () if pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) lt v3.0.1; + my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 936c1a8a6c..5004e1d50e 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -3,6 +3,7 @@ use strict; use warnings; require 'regen/regen_lib.pl'; require 'regen/charset_translations.pl'; +use Unicode::UCD; use charnames qw(:loose); my $out_fh = open_new('unicode_constants.h', '>', @@ -14,10 +15,11 @@ print $out_fh <<END; #ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */ #define H_UNICODE_CONSTANTS 1 -/* This file contains #defines for various Unicode code points. The values - * the macros expand to are the native Unicode code point, or all or portions - * of the UTF-8 encoding for the code point. In the former case, the macro - * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8". +/* This file contains #defines for the version of Unicode being used and + * various Unicode code points. The values the code point macros expand to + * are the native Unicode code point, or all or portions of the UTF-8 encoding + * for the code point. In the former case, the macro name has the suffix + * "_NATIVE"; otherwise, the suffix "_UTF8". * * The macros that have the suffix "_UTF8" may have further suffixes, as * follows: @@ -28,34 +30,18 @@ print $out_fh <<END; END -# The data are at the end of this file. A blank line is output as-is. -# Comments (lines whose first non-blank is a '#') are converted to C-style, -# though empty comments are converted to blank lines. Otherwise, each line -# represents one #define, and begins with either a Unicode character name with -# the blanks and dashes in it squeezed out or replaced by underscores; or it -# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter -# case, the name will be looked-up to use as the name of the macro. In either -# case, the macro name will have suffixes as listed above, and all blanks and -# dashes will be replaced by underscores. -# -# Each line may optionally have one of the following flags on it, separated by -# white space from the initial token. -# string indicates that the output is to be of the string form -# described in the comments above that are placed in the file. -# string_skip_ifundef is the same as 'string', but instead of dying if the -# code point doesn't exist, the line is just skipped: no output is -# generated for it -# first indicates that the output is to be of the FIRST_BYTE form. -# tail indicates that the output is of the _TAIL form. -# native indicates that the output is the code point, converted to the -# platform's native character set if applicable -# -# If the code point has no official name, the desired name may be appended -# after the flag, which will be ignored if there is an official name. -# -# This program is used to make it convenient to create compile time constants -# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually -# having to figure things out. +my $version = Unicode::UCD::UnicodeVersion(); +my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; +$dotdot = 0 unless defined $dotdot; + +print $out_fh <<END; +#define UNICODE_MAJOR_VERSION $major +#define UNICODE_DOT_VERSION $dot +#define UNICODE_DOT_DOT_VERSION $dotdot + +END + +# The data are at __DATA__ in this file. my @data = <DATA>; @@ -95,13 +81,13 @@ foreach my $charset (get_supported_code_pages()) { my $name; my $cp; my $U_cp; # code point in Unicode (not-native) terms - my $undef_ok = $desired_name || $flag =~ /skip_if_undef/; if ($name_or_cp =~ /^U\+(.*)/) { $U_cp = hex $1; $name = charnames::viacode($name_or_cp); if (! defined $name) { - die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok; + next if $flag =~ /skip_if_undef/; + die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; $name = ""; } } @@ -176,6 +162,37 @@ print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; read_only_bottom_close_and_rename($out_fh); +# DATA FORMAT +# +# A blank line is output as-is. +# Comments (lines whose first non-blank is a '#') are converted to C-style, +# though empty comments are converted to blank lines. Otherwise, each line +# represents one #define, and begins with either a Unicode character name with +# the blanks and dashes in it squeezed out or replaced by underscores; or it +# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter +# case, the name will be looked-up to use as the name of the macro. In either +# case, the macro name will have suffixes as listed above, and all blanks and +# dashes will be replaced by underscores. +# +# Each line may optionally have one of the following flags on it, separated by +# white space from the initial token. +# string indicates that the output is to be of the string form +# described in the comments above that are placed in the file. +# string_skip_ifundef is the same as 'string', but instead of dying if the +# code point doesn't exist, the line is just skipped: no output is +# generated for it +# first indicates that the output is to be of the FIRST_BYTE form. +# tail indicates that the output is of the _TAIL form. +# native indicates that the output is the code point, converted to the +# platform's native character set if applicable +# +# If the code point has no official name, the desired name may be appended +# after the flag, which will be ignored if there is an official name. +# +# This program is used to make it convenient to create compile time constants +# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually +# having to figure things out. + __DATA__ U+017F string @@ -184,10 +201,12 @@ U+0300 string U+0399 string U+03BC string -U+1E9E string +U+1E9E string_skip_if_undef U+FB05 string U+FB06 string +U+0130 string +U+0131 string U+2010 string U+D800 first FIRST_SURROGATE @@ -192,7 +192,7 @@ static const char* const non_utf8_target_but_utf8_required PL_utf8_swash_ptrs[_CC_WORDCHAR], \ "", \ PL_XPosix_ptrs[_CC_WORDCHAR], \ - LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + LATIN_SMALL_LIGATURE_LONG_S_T_UTF8); #define PLACEHOLDER /* Something for the preprocessor to grab onto */ /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ @@ -1753,7 +1753,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define getGCB_VAL_CP(cp) \ _generic_GET_BREAK_VAL_CP( \ PL_GCB_invlist, \ - Grapheme_Cluster_Break_invmap, \ + _Perl_GCB_invmap, \ (cp)) /* Returns the GCB value for the first code point in the UTF-8 encoded string @@ -1766,7 +1766,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define getSB_VAL_CP(cp) \ _generic_GET_BREAK_VAL_CP( \ PL_SB_invlist, \ - Sentence_Break_invmap, \ + _Perl_SB_invmap, \ (cp)) /* Returns the SB value for the first code point in the UTF-8 encoded string @@ -1778,7 +1778,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define getWB_VAL_CP(cp) \ _generic_GET_BREAK_VAL_CP( \ PL_WB_invlist, \ - Word_Break_invmap, \ + _Perl_WB_invmap, \ (cp)) /* Returns the WB value for the first code point in the UTF-8 encoded string diff --git a/t/uni/variables.t b/t/uni/variables.t index 33f057a645..5601b9767f 100644 --- a/t/uni/variables.t +++ b/t/uni/variables.t @@ -231,7 +231,7 @@ for ( 0x0 .. 0xff ) { if ($chr =~ /[#*]/) { # Length-1 variables with these two characters used to be used by - # Perl, but now it generates a warning that they're gone. + # Perl, but now a warning is generated that they're gone. # Ignore such warnings. for (my $i = @warnings - 1; $i >= 0; $i--) { splice @warnings, $i, 1 if $warnings[$i] =~ /is no longer supported/; diff --git a/unicode_constants.h b/unicode_constants.h index 7da4eb3a48..be7a38e92a 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -8,10 +8,11 @@ #ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */ #define H_UNICODE_CONSTANTS 1 -/* This file contains #defines for various Unicode code points. The values - * the macros expand to are the native Unicode code point, or all or portions - * of the UTF-8 encoding for the code point. In the former case, the macro - * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8". +/* This file contains #defines for the version of Unicode being used and + * various Unicode code points. The values the code point macros expand to + * are the native Unicode code point, or all or portions of the UTF-8 encoding + * for the code point. In the former case, the macro name has the suffix + * "_NATIVE"; otherwise, the suffix "_UTF8". * * The macros that have the suffix "_UTF8" may have further suffixes, as * follows: @@ -20,6 +21,10 @@ * "_TAIL" if instead it represents all but the first byte. This, and * with no additional suffix are both string constants */ +#define UNICODE_MAJOR_VERSION 8 +#define UNICODE_DOT_VERSION 0 +#define UNICODE_DOT_DOT_VERSION 0 + #if 'A' == 65 /* ASCII/Latin1 */ # define LATIN_SMALL_LETTER_LONG_S_UTF8 "\xC5\xBF" /* U+017F */ @@ -33,6 +38,8 @@ # define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xEF\xAC\x85" /* U+FB05 */ # define LATIN_SMALL_LIGATURE_ST_UTF8 "\xEF\xAC\x86" /* U+FB06 */ +# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\xC4\xB0" /* U+0130 */ +# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\xC4\xB1" /* U+0131 */ # define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */ # define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xED /* U+D800 */ @@ -71,6 +78,8 @@ # define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDD\x72\x67\x46" /* U+FB05 */ # define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDD\x72\x67\x47" /* U+FB06 */ +# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8D\x57" /* U+0130 */ +# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8D\x58" /* U+0131 */ # define HYPHEN_UTF8 "\xCA\x41\x57" /* U+2010 */ # define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDD /* U+D800 */ @@ -109,6 +118,8 @@ # define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDD\x71\x66\x46" /* U+FB05 */ # define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDD\x71\x66\x47" /* U+FB06 */ +# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8C\x57" /* U+0130 */ +# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8C\x58" /* U+0131 */ # define HYPHEN_UTF8 "\xCA\x41\x57" /* U+2010 */ # define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDD /* U+D800 */ @@ -147,6 +158,8 @@ # define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDC\x74\x68\x46" /* U+FB05 */ # define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDC\x74\x68\x47" /* U+FB06 */ +# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8E\x58" /* U+0130 */ +# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8E\x59" /* U+0131 */ # define HYPHEN_UTF8 "\xCA\x41\x58" /* U+2010 */ # define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDC /* U+D800 */ @@ -1413,11 +1413,15 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ case MICRO_SIGN: converted = GREEK_CAPITAL_LETTER_MU; break; +#if UNICODE_MAJOR_VERSION > 2 \ + || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ + && UNICODE_DOT_DOT_VERSION >= 8) case LATIN_SMALL_LETTER_SHARP_S: *(p)++ = 'S'; *p = S_or_s; *lenp = 2; return 'S'; +#endif default: Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); NOT_REACHED; /* NOTREACHED */ @@ -1540,6 +1544,9 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f if (c == MICRO_SIGN) { converted = GREEK_SMALL_LETTER_MU; } +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { /* If can't cross 127/128 boundary, can't return "ss"; instead return @@ -1558,6 +1565,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f return 's'; } } +#endif else { /* In this range the fold of all other characters is their lower case */ converted = toLOWER_LATIN1(c); @@ -2200,11 +2208,13 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) if (flags & FOLD_FLAGS_LOCALE) { -# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 + const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; + +# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 +# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1; - const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; /* Special case these two characters, as what normally gets * returned under locale doesn't work */ @@ -2217,7 +2227,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) "resolved to \"\\x{17F}\\x{17F}\"."); goto return_long_s; } - else if (UTF8SKIP(p) == long_s_t_len + else +#endif + if (UTF8SKIP(p) == long_s_t_len && memEQ((char *) p, LONG_S_T, long_s_t_len)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ @@ -2226,6 +2238,28 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) "resolved to \"\\x{FB06}\"."); goto return_ligature_st; } + +#if UNICODE_MAJOR_VERSION == 3 \ + && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 1 +# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 + + /* And special case this on this Unicode version only, for the same + * reaons the other two are special cased. They would cross the + * 255/256 boundary which is forbidden under /l, and so the code + * wouldn't catch that they are equivalent (which they are only in + * this release) */ + else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1 + && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1)) + { + /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), + "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " + "resolved to \"\\x{0131}\"."); + goto return_dotless_i; + } +#endif + return check_locale_boundary_crossing(p, result, ustrp, lenp); } else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { @@ -2249,14 +2283,24 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) /* But in these instances, there is an alternative we can * return that is valid */ - if (original == LATIN_CAPITAL_LETTER_SHARP_S - || original == LATIN_SMALL_LETTER_SHARP_S) - { + if (original == LATIN_SMALL_LETTER_SHARP_S +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + || original == LATIN_CAPITAL_LETTER_SHARP_S +#endif + ) { goto return_long_s; } else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { goto return_ligature_st; } +#if UNICODE_MAJOR_VERSION == 3 \ + && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 1 + + else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { + goto return_dotless_i; + } +#endif Copy(p, ustrp, *lenp, char); return original; } @@ -2300,6 +2344,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1; Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); return LATIN_SMALL_LIGATURE_ST; + +#if UNICODE_MAJOR_VERSION == 3 \ + && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 1 + + return_dotless_i: + *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1; + Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8); + return LATIN_SMALL_LETTER_DOTLESS_I; + +#endif + } /* Note: @@ -3252,7 +3308,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) * currently handle. But it also means that FB05 and FB06 are equivalent in * a 1-1 mapping which we should handle, and this relationship may not be in * the main table. Therefore this function examines all the multi-char - * sequences and adds the 1-1 mappings that come out of that. */ + * sequences and adds the 1-1 mappings that come out of that. + * + * XXX This function was originally intended to be multipurpose, but its + * only use is quite likely to remain for constructing the inversion of + * the CaseFolding (//i) property. If it were more general purpose for + * regex patterns, it would have to do the FB05/FB06 game for simple folds, + * because certain folds are prohibited under /iaa and /il. As an example, + * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both + * equivalent under /i. But under /iaa and /il, the folds to 'i' are + * prohibited, so we would not figure out that they fold to each other. + * Code could be written to automatically figure this out, similar to the + * code that does this for multi-character folds, but this is the only case + * where something like this is ever likely to happen, as all the single + * char folds to The 0-255 range are now quite settled. Instead there is a + * little special code that is compiled only for this Unicode version. This + * is smaller and didn't require much coding time to do. But this makes + * this routine strongly tied to being used just for CaseFolding. If ever + * it should be generalized, this would have to be fixed */ U8 *l, *lend; STRLEN lcur; @@ -3395,7 +3468,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* End of specials */ /* read $swash->{LIST} */ + +#if UNICODE_MAJOR_VERSION == 3 \ + && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 1 + + /* For this version only U+130 and U+131 are equivalent under qr//i. Add a + * rule so that things work under /iaa and /il */ + + SV * mod_listsv = sv_mortalcopy(*listsvp); + sv_catpv(mod_listsv, "130\t130\t131\n"); + l = (U8*)SvPV(mod_listsv, lcur); + +#else + l = (U8*)SvPV(*listsvp, lcur); + +#endif + lend = l + lcur; /* Go through each input line */ @@ -611,7 +611,11 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */ #define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */ -#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E +#ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 +# define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E +#endif +#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x130 +#define LATIN_SMALL_LETTER_DOTLESS_I 0x131 #define LATIN_SMALL_LETTER_LONG_S 0x017F #define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05 #define LATIN_SMALL_LIGATURE_ST 0xFB06 |