summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-07-28 22:17:01 -0600
committerKarl Williamson <khw@cpan.org>2015-07-28 22:17:01 -0600
commita0b05c4bde4f97301b663b8de70677551eacff4c (patch)
treee9329726efb7aa27f5fd08f1441943f95b68a594
parent16d89be8495ff4e0a9b99e837b5444df8a4a6cc6 (diff)
parent912fd71114648888551a4750534463f20ae16c7c (diff)
downloadperl-a0b05c4bde4f97301b663b8de70677551eacff4c.tar.gz
Allow perl to work again with all Unicode releases
Compiling perl to work with early Unicode releases is a goal, but has been broken since 5.18. This sequence of commits gets this working again, actually better than before.
-rw-r--r--charclass_invlists.h3814
-rw-r--r--ext/XS-APItest/t/handy.t17
-rw-r--r--lib/Unicode/UCD.pm57
-rw-r--r--lib/Unicode/UCD.t668
-rw-r--r--lib/locale.t3
-rw-r--r--lib/unicore/README.perl46
-rw-r--r--lib/unicore/mktables2432
-rw-r--r--lib/utf8_heavy.pl96
-rw-r--r--perl.c6
-rw-r--r--perl.h9
-rw-r--r--pod/perldelta.pod5
-rw-r--r--pp.c10
-rw-r--r--regcharclass.h32
-rw-r--r--regcomp.c65
-rw-r--r--regen/mk_PL_charclass.pl33
-rw-r--r--regen/mk_invlists.pl62
-rwxr-xr-xregen/regcharclass.pl8
-rw-r--r--regen/regcharclass_multi_char_folds.pl2
-rw-r--r--regen/unicode_constants.pl89
-rw-r--r--regexec.c8
-rw-r--r--t/uni/variables.t2
-rw-r--r--unicode_constants.h21
-rw-r--r--utf8.c104
-rw-r--r--utf8.h6
24 files changed, 4282 insertions, 3313 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 076d223724..64406bfe78 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -312,7 +312,387 @@ static const UV Cased_invlist[] = { /* for ASCII/Latin1 */
0x1F18A
};
-static const UV Grapheme_Cluster_Break_invlist[] = { /* for ASCII/Latin1 */
+#endif /* defined(PERL_IN_PERL_C) */
+
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for ASCII/Latin1 */
+ 45, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x2BC,
+ 0x2BD,
+ 0x308,
+ 0x309,
+ 0x313,
+ 0x314,
+ 0x342,
+ 0x343,
+ 0x3AC,
+ 0x3AD,
+ 0x3AE,
+ 0x3AF,
+ 0x3B1,
+ 0x3B2,
+ 0x3B7,
+ 0x3B8,
+ 0x3B9,
+ 0x3BA,
+ 0x3C1,
+ 0x3C2,
+ 0x3C5,
+ 0x3C6,
+ 0x3C9,
+ 0x3CA,
+ 0x3CE,
+ 0x3CF,
+ 0x565,
+ 0x566,
+ 0x574,
+ 0x575,
+ 0x57E,
+ 0x57F,
+ 0x1F00,
+ 0x1F08,
+ 0x1F20,
+ 0x1F28,
+ 0x1F60,
+ 0x1F68,
+ 0x1F70,
+ 0x1F71,
+ 0x1F74,
+ 0x1F75,
+ 0x1F7C,
+ 0x1F7D
+};
+
+static const UV _Perl_Any_Folds_invlist[] = { /* for ASCII/Latin1 */
+ 247, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x41,
+ 0x5B,
+ 0x61,
+ 0x7B,
+ 0xB5,
+ 0xB6,
+ 0xC0,
+ 0xD7,
+ 0xD8,
+ 0xF7,
+ 0xF8,
+ 0x131,
+ 0x132,
+ 0x138,
+ 0x139,
+ 0x18D,
+ 0x18E,
+ 0x19B,
+ 0x19C,
+ 0x1AA,
+ 0x1AC,
+ 0x1BA,
+ 0x1BC,
+ 0x1BE,
+ 0x1BF,
+ 0x1C0,
+ 0x1C4,
+ 0x221,
+ 0x222,
+ 0x234,
+ 0x23A,
+ 0x255,
+ 0x256,
+ 0x258,
+ 0x259,
+ 0x25A,
+ 0x25B,
+ 0x25D,
+ 0x260,
+ 0x262,
+ 0x263,
+ 0x264,
+ 0x265,
+ 0x267,
+ 0x268,
+ 0x26A,
+ 0x26B,
+ 0x26D,
+ 0x26F,
+ 0x270,
+ 0x271,
+ 0x273,
+ 0x275,
+ 0x276,
+ 0x27D,
+ 0x27E,
+ 0x280,
+ 0x281,
+ 0x283,
+ 0x284,
+ 0x287,
+ 0x28D,
+ 0x292,
+ 0x293,
+ 0x29D,
+ 0x29F,
+ 0x2BC,
+ 0x2BD,
+ 0x2BE,
+ 0x2BF,
+ 0x300,
+ 0x302,
+ 0x307,
+ 0x309,
+ 0x30A,
+ 0x30B,
+ 0x30C,
+ 0x30D,
+ 0x313,
+ 0x314,
+ 0x331,
+ 0x332,
+ 0x342,
+ 0x343,
+ 0x345,
+ 0x346,
+ 0x370,
+ 0x374,
+ 0x376,
+ 0x378,
+ 0x37B,
+ 0x37E,
+ 0x37F,
+ 0x380,
+ 0x386,
+ 0x387,
+ 0x388,
+ 0x38B,
+ 0x38C,
+ 0x38D,
+ 0x38E,
+ 0x3A2,
+ 0x3A3,
+ 0x3D2,
+ 0x3D5,
+ 0x3F6,
+ 0x3F7,
+ 0x3FC,
+ 0x3FD,
+ 0x482,
+ 0x48A,
+ 0x530,
+ 0x531,
+ 0x557,
+ 0x561,
+ 0x588,
+ 0x10A0,
+ 0x10C6,
+ 0x10C7,
+ 0x10C8,
+ 0x10CD,
+ 0x10CE,
+ 0x13A0,
+ 0x13F6,
+ 0x13F8,
+ 0x13FE,
+ 0x1D79,
+ 0x1D7A,
+ 0x1D7D,
+ 0x1D7E,
+ 0x1E00,
+ 0x1E9C,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1EA0,
+ 0x1F16,
+ 0x1F18,
+ 0x1F1E,
+ 0x1F20,
+ 0x1F46,
+ 0x1F48,
+ 0x1F4E,
+ 0x1F50,
+ 0x1F58,
+ 0x1F59,
+ 0x1F5A,
+ 0x1F5B,
+ 0x1F5C,
+ 0x1F5D,
+ 0x1F5E,
+ 0x1F5F,
+ 0x1F7E,
+ 0x1F80,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FBD,
+ 0x1FBE,
+ 0x1FBF,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FCD,
+ 0x1FD0,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FDC,
+ 0x1FE0,
+ 0x1FED,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FFD,
+ 0x2126,
+ 0x2127,
+ 0x212A,
+ 0x212C,
+ 0x2132,
+ 0x2133,
+ 0x214E,
+ 0x214F,
+ 0x2160,
+ 0x2180,
+ 0x2183,
+ 0x2185,
+ 0x24B6,
+ 0x24EA,
+ 0x2C00,
+ 0x2C2F,
+ 0x2C30,
+ 0x2C5F,
+ 0x2C60,
+ 0x2C71,
+ 0x2C72,
+ 0x2C74,
+ 0x2C75,
+ 0x2C77,
+ 0x2C7E,
+ 0x2CE4,
+ 0x2CEB,
+ 0x2CEF,
+ 0x2CF2,
+ 0x2CF4,
+ 0x2D00,
+ 0x2D26,
+ 0x2D27,
+ 0x2D28,
+ 0x2D2D,
+ 0x2D2E,
+ 0xA640,
+ 0xA66E,
+ 0xA680,
+ 0xA69C,
+ 0xA722,
+ 0xA730,
+ 0xA732,
+ 0xA770,
+ 0xA779,
+ 0xA788,
+ 0xA78B,
+ 0xA78E,
+ 0xA790,
+ 0xA794,
+ 0xA796,
+ 0xA7AE,
+ 0xA7B0,
+ 0xA7B8,
+ 0xAB53,
+ 0xAB54,
+ 0xAB70,
+ 0xABC0,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18,
+ 0xFF21,
+ 0xFF3B,
+ 0xFF41,
+ 0xFF5B,
+ 0x10400,
+ 0x10450,
+ 0x10C80,
+ 0x10CB3,
+ 0x10CC0,
+ 0x10CF3,
+ 0x118A0,
+ 0x118E0
+};
+
+static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for ASCII/Latin1 */
+ 59, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0xDF,
+ 0xE0,
+ 0x130,
+ 0x131,
+ 0x149,
+ 0x14A,
+ 0x1F0,
+ 0x1F1,
+ 0x390,
+ 0x391,
+ 0x3B0,
+ 0x3B1,
+ 0x587,
+ 0x588,
+ 0x1E96,
+ 0x1E9B,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1F50,
+ 0x1F51,
+ 0x1F52,
+ 0x1F53,
+ 0x1F54,
+ 0x1F55,
+ 0x1F56,
+ 0x1F57,
+ 0x1F80,
+ 0x1FB0,
+ 0x1FB2,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FB8,
+ 0x1FBC,
+ 0x1FBD,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FC8,
+ 0x1FCC,
+ 0x1FCD,
+ 0x1FD2,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FD8,
+ 0x1FE2,
+ 0x1FE5,
+ 0x1FE6,
+ 0x1FE8,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FF8,
+ 0x1FFC,
+ 0x1FFD,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
+#if defined(PERL_IN_PERL_C)
+
+static const UV _Perl_GCB_invlist[] = { /* for ASCII/Latin1 */
1502, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -1844,7 +2224,7 @@ typedef enum {
GCB_V = 13
} GCB_enum;
-static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for ASCII/Latin1 */
+static const GCB_enum _Perl_GCB_invmap[] = { /* for ASCII/Latin1 */
GCB_Control,
GCB_LF,
GCB_Control,
@@ -3351,382 +3731,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for ASCII/Latin1 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for ASCII/Latin1 */
- 45, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x2BC,
- 0x2BD,
- 0x308,
- 0x309,
- 0x313,
- 0x314,
- 0x342,
- 0x343,
- 0x3AC,
- 0x3AD,
- 0x3AE,
- 0x3AF,
- 0x3B1,
- 0x3B2,
- 0x3B7,
- 0x3B8,
- 0x3B9,
- 0x3BA,
- 0x3C1,
- 0x3C2,
- 0x3C5,
- 0x3C6,
- 0x3C9,
- 0x3CA,
- 0x3CE,
- 0x3CF,
- 0x565,
- 0x566,
- 0x574,
- 0x575,
- 0x57E,
- 0x57F,
- 0x1F00,
- 0x1F08,
- 0x1F20,
- 0x1F28,
- 0x1F60,
- 0x1F68,
- 0x1F70,
- 0x1F71,
- 0x1F74,
- 0x1F75,
- 0x1F7C,
- 0x1F7D
-};
-
-static const UV _Perl_Any_Folds_invlist[] = { /* for ASCII/Latin1 */
- 247, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x41,
- 0x5B,
- 0x61,
- 0x7B,
- 0xB5,
- 0xB6,
- 0xC0,
- 0xD7,
- 0xD8,
- 0xF7,
- 0xF8,
- 0x131,
- 0x132,
- 0x138,
- 0x139,
- 0x18D,
- 0x18E,
- 0x19B,
- 0x19C,
- 0x1AA,
- 0x1AC,
- 0x1BA,
- 0x1BC,
- 0x1BE,
- 0x1BF,
- 0x1C0,
- 0x1C4,
- 0x221,
- 0x222,
- 0x234,
- 0x23A,
- 0x255,
- 0x256,
- 0x258,
- 0x259,
- 0x25A,
- 0x25B,
- 0x25D,
- 0x260,
- 0x262,
- 0x263,
- 0x264,
- 0x265,
- 0x267,
- 0x268,
- 0x26A,
- 0x26B,
- 0x26D,
- 0x26F,
- 0x270,
- 0x271,
- 0x273,
- 0x275,
- 0x276,
- 0x27D,
- 0x27E,
- 0x280,
- 0x281,
- 0x283,
- 0x284,
- 0x287,
- 0x28D,
- 0x292,
- 0x293,
- 0x29D,
- 0x29F,
- 0x2BC,
- 0x2BD,
- 0x2BE,
- 0x2BF,
- 0x300,
- 0x302,
- 0x307,
- 0x309,
- 0x30A,
- 0x30B,
- 0x30C,
- 0x30D,
- 0x313,
- 0x314,
- 0x331,
- 0x332,
- 0x342,
- 0x343,
- 0x345,
- 0x346,
- 0x370,
- 0x374,
- 0x376,
- 0x378,
- 0x37B,
- 0x37E,
- 0x37F,
- 0x380,
- 0x386,
- 0x387,
- 0x388,
- 0x38B,
- 0x38C,
- 0x38D,
- 0x38E,
- 0x3A2,
- 0x3A3,
- 0x3D2,
- 0x3D5,
- 0x3F6,
- 0x3F7,
- 0x3FC,
- 0x3FD,
- 0x482,
- 0x48A,
- 0x530,
- 0x531,
- 0x557,
- 0x561,
- 0x588,
- 0x10A0,
- 0x10C6,
- 0x10C7,
- 0x10C8,
- 0x10CD,
- 0x10CE,
- 0x13A0,
- 0x13F6,
- 0x13F8,
- 0x13FE,
- 0x1D79,
- 0x1D7A,
- 0x1D7D,
- 0x1D7E,
- 0x1E00,
- 0x1E9C,
- 0x1E9E,
- 0x1E9F,
- 0x1EA0,
- 0x1F16,
- 0x1F18,
- 0x1F1E,
- 0x1F20,
- 0x1F46,
- 0x1F48,
- 0x1F4E,
- 0x1F50,
- 0x1F58,
- 0x1F59,
- 0x1F5A,
- 0x1F5B,
- 0x1F5C,
- 0x1F5D,
- 0x1F5E,
- 0x1F5F,
- 0x1F7E,
- 0x1F80,
- 0x1FB5,
- 0x1FB6,
- 0x1FBD,
- 0x1FBE,
- 0x1FBF,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FCD,
- 0x1FD0,
- 0x1FD4,
- 0x1FD6,
- 0x1FDC,
- 0x1FE0,
- 0x1FED,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FFD,
- 0x2126,
- 0x2127,
- 0x212A,
- 0x212C,
- 0x2132,
- 0x2133,
- 0x214E,
- 0x214F,
- 0x2160,
- 0x2180,
- 0x2183,
- 0x2185,
- 0x24B6,
- 0x24EA,
- 0x2C00,
- 0x2C2F,
- 0x2C30,
- 0x2C5F,
- 0x2C60,
- 0x2C71,
- 0x2C72,
- 0x2C74,
- 0x2C75,
- 0x2C77,
- 0x2C7E,
- 0x2CE4,
- 0x2CEB,
- 0x2CEF,
- 0x2CF2,
- 0x2CF4,
- 0x2D00,
- 0x2D26,
- 0x2D27,
- 0x2D28,
- 0x2D2D,
- 0x2D2E,
- 0xA640,
- 0xA66E,
- 0xA680,
- 0xA69C,
- 0xA722,
- 0xA730,
- 0xA732,
- 0xA770,
- 0xA779,
- 0xA788,
- 0xA78B,
- 0xA78E,
- 0xA790,
- 0xA794,
- 0xA796,
- 0xA7AE,
- 0xA7B0,
- 0xA7B8,
- 0xAB53,
- 0xAB54,
- 0xAB70,
- 0xABC0,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18,
- 0xFF21,
- 0xFF3B,
- 0xFF41,
- 0xFF5B,
- 0x10400,
- 0x10450,
- 0x10C80,
- 0x10CB3,
- 0x10CC0,
- 0x10CF3,
- 0x118A0,
- 0x118E0
-};
-
-static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for ASCII/Latin1 */
- 59, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0xDF,
- 0xE0,
- 0x130,
- 0x131,
- 0x149,
- 0x14A,
- 0x1F0,
- 0x1F1,
- 0x390,
- 0x391,
- 0x3B0,
- 0x3B1,
- 0x587,
- 0x588,
- 0x1E96,
- 0x1E9B,
- 0x1E9E,
- 0x1E9F,
- 0x1F50,
- 0x1F51,
- 0x1F52,
- 0x1F53,
- 0x1F54,
- 0x1F55,
- 0x1F56,
- 0x1F57,
- 0x1F80,
- 0x1FB0,
- 0x1FB2,
- 0x1FB5,
- 0x1FB6,
- 0x1FB8,
- 0x1FBC,
- 0x1FBD,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FC8,
- 0x1FCC,
- 0x1FCD,
- 0x1FD2,
- 0x1FD4,
- 0x1FD6,
- 0x1FD8,
- 0x1FE2,
- 0x1FE5,
- 0x1FE6,
- 0x1FE8,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FF8,
- 0x1FFC,
- 0x1FFD,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_UTF8_C)
static const UV _Perl_IDCont_invlist[] = { /* for ASCII/Latin1 */
@@ -6193,7 +6197,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for ASCII/Latin1 */
#if defined(PERL_IN_PERL_C)
-static const UV Sentence_Break_invlist[] = { /* for ASCII/Latin1 */
+static const UV _Perl_SB_invlist[] = { /* for ASCII/Latin1 */
2896, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -9121,7 +9125,7 @@ typedef enum {
SB_Upper = 15
} SB_enum;
-static const SB_enum Sentence_Break_invmap[] = { /* for ASCII/Latin1 */
+static const SB_enum _Perl_SB_invmap[] = { /* for ASCII/Latin1 */
SB_Other,
SB_Sp,
SB_LF,
@@ -12022,37 +12026,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for ASCII/Latin1 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV UpperLatin1_invlist[] = { /* for ASCII/Latin1 */
- 3, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x80,
- 0x100
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_PERL_C)
-static const UV VertSpace_invlist[] = { /* for ASCII/Latin1 */
- 7, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0xA,
- 0xE,
- 0x85,
- 0x86,
- 0x2028,
- 0x202A
-};
-
-static const UV Word_Break_invlist[] = { /* for ASCII/Latin1 */
+static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
1524, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -13611,7 +13587,7 @@ typedef enum {
WB_UNKNOWN = 18
} WB_enum;
-static const WB_enum Word_Break_invmap[] = { /* for ASCII/Latin1 */
+static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
WB_LF,
WB_Newline,
@@ -15140,8 +15116,36 @@ static const WB_enum Word_Break_invmap[] = { /* for ASCII/Latin1 */
#endif /* defined(PERL_IN_REGEXEC_C) */
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV UpperLatin1_invlist[] = { /* for ASCII/Latin1 */
+ 3, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x80,
+ 0x100
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
#if defined(PERL_IN_PERL_C)
+static const UV VertSpace_invlist[] = { /* for ASCII/Latin1 */
+ 7, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0xA,
+ 0xE,
+ 0x85,
+ 0x86,
+ 0x2028,
+ 0x202A
+};
+
static const UV XPerlSpace_invlist[] = { /* for ASCII/Latin1 */
21, /* Number of elements */
148565664, /* Version and data structure type */
@@ -24947,7 +24951,419 @@ static const UV Cased_invlist[] = { /* for EBCDIC 1047 */
0x1F18A
};
-static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC 1047 */
+#endif /* defined(PERL_IN_PERL_C) */
+
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 1047 */
+ 45, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x2BC,
+ 0x2BD,
+ 0x308,
+ 0x309,
+ 0x313,
+ 0x314,
+ 0x342,
+ 0x343,
+ 0x3AC,
+ 0x3AD,
+ 0x3AE,
+ 0x3AF,
+ 0x3B1,
+ 0x3B2,
+ 0x3B7,
+ 0x3B8,
+ 0x3B9,
+ 0x3BA,
+ 0x3C1,
+ 0x3C2,
+ 0x3C5,
+ 0x3C6,
+ 0x3C9,
+ 0x3CA,
+ 0x3CE,
+ 0x3CF,
+ 0x565,
+ 0x566,
+ 0x574,
+ 0x575,
+ 0x57E,
+ 0x57F,
+ 0x1F00,
+ 0x1F08,
+ 0x1F20,
+ 0x1F28,
+ 0x1F60,
+ 0x1F68,
+ 0x1F70,
+ 0x1F71,
+ 0x1F74,
+ 0x1F75,
+ 0x1F7C,
+ 0x1F7D
+};
+
+static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 1047 */
+ 279, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x42,
+ 0x4A,
+ 0x51,
+ 0x5A,
+ 0x62,
+ 0x6A,
+ 0x70,
+ 0x79,
+ 0x80,
+ 0x8A,
+ 0x8C,
+ 0x8F,
+ 0x91,
+ 0x9A,
+ 0x9C,
+ 0x9D,
+ 0x9E,
+ 0x9F,
+ 0xA0,
+ 0xA1,
+ 0xA2,
+ 0xAA,
+ 0xAC,
+ 0xAD,
+ 0xAE,
+ 0xAF,
+ 0xBA,
+ 0xBB,
+ 0xC1,
+ 0xCA,
+ 0xCB,
+ 0xD0,
+ 0xD1,
+ 0xDA,
+ 0xDB,
+ 0xE0,
+ 0xE2,
+ 0xEA,
+ 0xEB,
+ 0xF0,
+ 0xFB,
+ 0xFF,
+ 0x100,
+ 0x131,
+ 0x132,
+ 0x138,
+ 0x139,
+ 0x18D,
+ 0x18E,
+ 0x19B,
+ 0x19C,
+ 0x1AA,
+ 0x1AC,
+ 0x1BA,
+ 0x1BC,
+ 0x1BE,
+ 0x1BF,
+ 0x1C0,
+ 0x1C4,
+ 0x221,
+ 0x222,
+ 0x234,
+ 0x23A,
+ 0x255,
+ 0x256,
+ 0x258,
+ 0x259,
+ 0x25A,
+ 0x25B,
+ 0x25D,
+ 0x260,
+ 0x262,
+ 0x263,
+ 0x264,
+ 0x265,
+ 0x267,
+ 0x268,
+ 0x26A,
+ 0x26B,
+ 0x26D,
+ 0x26F,
+ 0x270,
+ 0x271,
+ 0x273,
+ 0x275,
+ 0x276,
+ 0x27D,
+ 0x27E,
+ 0x280,
+ 0x281,
+ 0x283,
+ 0x284,
+ 0x287,
+ 0x28D,
+ 0x292,
+ 0x293,
+ 0x29D,
+ 0x29F,
+ 0x2BC,
+ 0x2BD,
+ 0x2BE,
+ 0x2BF,
+ 0x300,
+ 0x302,
+ 0x307,
+ 0x309,
+ 0x30A,
+ 0x30B,
+ 0x30C,
+ 0x30D,
+ 0x313,
+ 0x314,
+ 0x331,
+ 0x332,
+ 0x342,
+ 0x343,
+ 0x345,
+ 0x346,
+ 0x370,
+ 0x374,
+ 0x376,
+ 0x378,
+ 0x37B,
+ 0x37E,
+ 0x37F,
+ 0x380,
+ 0x386,
+ 0x387,
+ 0x388,
+ 0x38B,
+ 0x38C,
+ 0x38D,
+ 0x38E,
+ 0x3A2,
+ 0x3A3,
+ 0x3D2,
+ 0x3D5,
+ 0x3F6,
+ 0x3F7,
+ 0x3FC,
+ 0x3FD,
+ 0x482,
+ 0x48A,
+ 0x530,
+ 0x531,
+ 0x557,
+ 0x561,
+ 0x588,
+ 0x10A0,
+ 0x10C6,
+ 0x10C7,
+ 0x10C8,
+ 0x10CD,
+ 0x10CE,
+ 0x13A0,
+ 0x13F6,
+ 0x13F8,
+ 0x13FE,
+ 0x1D79,
+ 0x1D7A,
+ 0x1D7D,
+ 0x1D7E,
+ 0x1E00,
+ 0x1E9C,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1EA0,
+ 0x1F16,
+ 0x1F18,
+ 0x1F1E,
+ 0x1F20,
+ 0x1F46,
+ 0x1F48,
+ 0x1F4E,
+ 0x1F50,
+ 0x1F58,
+ 0x1F59,
+ 0x1F5A,
+ 0x1F5B,
+ 0x1F5C,
+ 0x1F5D,
+ 0x1F5E,
+ 0x1F5F,
+ 0x1F7E,
+ 0x1F80,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FBD,
+ 0x1FBE,
+ 0x1FBF,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FCD,
+ 0x1FD0,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FDC,
+ 0x1FE0,
+ 0x1FED,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FFD,
+ 0x2126,
+ 0x2127,
+ 0x212A,
+ 0x212C,
+ 0x2132,
+ 0x2133,
+ 0x214E,
+ 0x214F,
+ 0x2160,
+ 0x2180,
+ 0x2183,
+ 0x2185,
+ 0x24B6,
+ 0x24EA,
+ 0x2C00,
+ 0x2C2F,
+ 0x2C30,
+ 0x2C5F,
+ 0x2C60,
+ 0x2C71,
+ 0x2C72,
+ 0x2C74,
+ 0x2C75,
+ 0x2C77,
+ 0x2C7E,
+ 0x2CE4,
+ 0x2CEB,
+ 0x2CEF,
+ 0x2CF2,
+ 0x2CF4,
+ 0x2D00,
+ 0x2D26,
+ 0x2D27,
+ 0x2D28,
+ 0x2D2D,
+ 0x2D2E,
+ 0xA640,
+ 0xA66E,
+ 0xA680,
+ 0xA69C,
+ 0xA722,
+ 0xA730,
+ 0xA732,
+ 0xA770,
+ 0xA779,
+ 0xA788,
+ 0xA78B,
+ 0xA78E,
+ 0xA790,
+ 0xA794,
+ 0xA796,
+ 0xA7AE,
+ 0xA7B0,
+ 0xA7B8,
+ 0xAB53,
+ 0xAB54,
+ 0xAB70,
+ 0xABC0,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18,
+ 0xFF21,
+ 0xFF3B,
+ 0xFF41,
+ 0xFF5B,
+ 0x10400,
+ 0x10450,
+ 0x10C80,
+ 0x10CB3,
+ 0x10CC0,
+ 0x10CF3,
+ 0x118A0,
+ 0x118E0
+};
+
+static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 1047 */
+ 59, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x59,
+ 0x5A,
+ 0x130,
+ 0x131,
+ 0x149,
+ 0x14A,
+ 0x1F0,
+ 0x1F1,
+ 0x390,
+ 0x391,
+ 0x3B0,
+ 0x3B1,
+ 0x587,
+ 0x588,
+ 0x1E96,
+ 0x1E9B,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1F50,
+ 0x1F51,
+ 0x1F52,
+ 0x1F53,
+ 0x1F54,
+ 0x1F55,
+ 0x1F56,
+ 0x1F57,
+ 0x1F80,
+ 0x1FB0,
+ 0x1FB2,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FB8,
+ 0x1FBC,
+ 0x1FBD,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FC8,
+ 0x1FCC,
+ 0x1FCD,
+ 0x1FD2,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FD8,
+ 0x1FE2,
+ 0x1FE5,
+ 0x1FE6,
+ 0x1FE8,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FF8,
+ 0x1FFC,
+ 0x1FFD,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
+#if defined(PERL_IN_PERL_C)
+
+static const UV _Perl_GCB_invlist[] = { /* for EBCDIC 1047 */
1502, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -26479,7 +26895,7 @@ typedef enum {
GCB_V = 13
} GCB_enum;
-static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 1047 */
+static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 1047 */
GCB_Control,
GCB_CR,
GCB_Control,
@@ -27986,414 +28402,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 1047 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 1047 */
- 45, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x2BC,
- 0x2BD,
- 0x308,
- 0x309,
- 0x313,
- 0x314,
- 0x342,
- 0x343,
- 0x3AC,
- 0x3AD,
- 0x3AE,
- 0x3AF,
- 0x3B1,
- 0x3B2,
- 0x3B7,
- 0x3B8,
- 0x3B9,
- 0x3BA,
- 0x3C1,
- 0x3C2,
- 0x3C5,
- 0x3C6,
- 0x3C9,
- 0x3CA,
- 0x3CE,
- 0x3CF,
- 0x565,
- 0x566,
- 0x574,
- 0x575,
- 0x57E,
- 0x57F,
- 0x1F00,
- 0x1F08,
- 0x1F20,
- 0x1F28,
- 0x1F60,
- 0x1F68,
- 0x1F70,
- 0x1F71,
- 0x1F74,
- 0x1F75,
- 0x1F7C,
- 0x1F7D
-};
-
-static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 1047 */
- 279, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x42,
- 0x4A,
- 0x51,
- 0x5A,
- 0x62,
- 0x6A,
- 0x70,
- 0x79,
- 0x80,
- 0x8A,
- 0x8C,
- 0x8F,
- 0x91,
- 0x9A,
- 0x9C,
- 0x9D,
- 0x9E,
- 0x9F,
- 0xA0,
- 0xA1,
- 0xA2,
- 0xAA,
- 0xAC,
- 0xAD,
- 0xAE,
- 0xAF,
- 0xBA,
- 0xBB,
- 0xC1,
- 0xCA,
- 0xCB,
- 0xD0,
- 0xD1,
- 0xDA,
- 0xDB,
- 0xE0,
- 0xE2,
- 0xEA,
- 0xEB,
- 0xF0,
- 0xFB,
- 0xFF,
- 0x100,
- 0x131,
- 0x132,
- 0x138,
- 0x139,
- 0x18D,
- 0x18E,
- 0x19B,
- 0x19C,
- 0x1AA,
- 0x1AC,
- 0x1BA,
- 0x1BC,
- 0x1BE,
- 0x1BF,
- 0x1C0,
- 0x1C4,
- 0x221,
- 0x222,
- 0x234,
- 0x23A,
- 0x255,
- 0x256,
- 0x258,
- 0x259,
- 0x25A,
- 0x25B,
- 0x25D,
- 0x260,
- 0x262,
- 0x263,
- 0x264,
- 0x265,
- 0x267,
- 0x268,
- 0x26A,
- 0x26B,
- 0x26D,
- 0x26F,
- 0x270,
- 0x271,
- 0x273,
- 0x275,
- 0x276,
- 0x27D,
- 0x27E,
- 0x280,
- 0x281,
- 0x283,
- 0x284,
- 0x287,
- 0x28D,
- 0x292,
- 0x293,
- 0x29D,
- 0x29F,
- 0x2BC,
- 0x2BD,
- 0x2BE,
- 0x2BF,
- 0x300,
- 0x302,
- 0x307,
- 0x309,
- 0x30A,
- 0x30B,
- 0x30C,
- 0x30D,
- 0x313,
- 0x314,
- 0x331,
- 0x332,
- 0x342,
- 0x343,
- 0x345,
- 0x346,
- 0x370,
- 0x374,
- 0x376,
- 0x378,
- 0x37B,
- 0x37E,
- 0x37F,
- 0x380,
- 0x386,
- 0x387,
- 0x388,
- 0x38B,
- 0x38C,
- 0x38D,
- 0x38E,
- 0x3A2,
- 0x3A3,
- 0x3D2,
- 0x3D5,
- 0x3F6,
- 0x3F7,
- 0x3FC,
- 0x3FD,
- 0x482,
- 0x48A,
- 0x530,
- 0x531,
- 0x557,
- 0x561,
- 0x588,
- 0x10A0,
- 0x10C6,
- 0x10C7,
- 0x10C8,
- 0x10CD,
- 0x10CE,
- 0x13A0,
- 0x13F6,
- 0x13F8,
- 0x13FE,
- 0x1D79,
- 0x1D7A,
- 0x1D7D,
- 0x1D7E,
- 0x1E00,
- 0x1E9C,
- 0x1E9E,
- 0x1E9F,
- 0x1EA0,
- 0x1F16,
- 0x1F18,
- 0x1F1E,
- 0x1F20,
- 0x1F46,
- 0x1F48,
- 0x1F4E,
- 0x1F50,
- 0x1F58,
- 0x1F59,
- 0x1F5A,
- 0x1F5B,
- 0x1F5C,
- 0x1F5D,
- 0x1F5E,
- 0x1F5F,
- 0x1F7E,
- 0x1F80,
- 0x1FB5,
- 0x1FB6,
- 0x1FBD,
- 0x1FBE,
- 0x1FBF,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FCD,
- 0x1FD0,
- 0x1FD4,
- 0x1FD6,
- 0x1FDC,
- 0x1FE0,
- 0x1FED,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FFD,
- 0x2126,
- 0x2127,
- 0x212A,
- 0x212C,
- 0x2132,
- 0x2133,
- 0x214E,
- 0x214F,
- 0x2160,
- 0x2180,
- 0x2183,
- 0x2185,
- 0x24B6,
- 0x24EA,
- 0x2C00,
- 0x2C2F,
- 0x2C30,
- 0x2C5F,
- 0x2C60,
- 0x2C71,
- 0x2C72,
- 0x2C74,
- 0x2C75,
- 0x2C77,
- 0x2C7E,
- 0x2CE4,
- 0x2CEB,
- 0x2CEF,
- 0x2CF2,
- 0x2CF4,
- 0x2D00,
- 0x2D26,
- 0x2D27,
- 0x2D28,
- 0x2D2D,
- 0x2D2E,
- 0xA640,
- 0xA66E,
- 0xA680,
- 0xA69C,
- 0xA722,
- 0xA730,
- 0xA732,
- 0xA770,
- 0xA779,
- 0xA788,
- 0xA78B,
- 0xA78E,
- 0xA790,
- 0xA794,
- 0xA796,
- 0xA7AE,
- 0xA7B0,
- 0xA7B8,
- 0xAB53,
- 0xAB54,
- 0xAB70,
- 0xABC0,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18,
- 0xFF21,
- 0xFF3B,
- 0xFF41,
- 0xFF5B,
- 0x10400,
- 0x10450,
- 0x10C80,
- 0x10CB3,
- 0x10CC0,
- 0x10CF3,
- 0x118A0,
- 0x118E0
-};
-
-static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 1047 */
- 59, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x59,
- 0x5A,
- 0x130,
- 0x131,
- 0x149,
- 0x14A,
- 0x1F0,
- 0x1F1,
- 0x390,
- 0x391,
- 0x3B0,
- 0x3B1,
- 0x587,
- 0x588,
- 0x1E96,
- 0x1E9B,
- 0x1E9E,
- 0x1E9F,
- 0x1F50,
- 0x1F51,
- 0x1F52,
- 0x1F53,
- 0x1F54,
- 0x1F55,
- 0x1F56,
- 0x1F57,
- 0x1F80,
- 0x1FB0,
- 0x1FB2,
- 0x1FB5,
- 0x1FB6,
- 0x1FB8,
- 0x1FBC,
- 0x1FBD,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FC8,
- 0x1FCC,
- 0x1FCD,
- 0x1FD2,
- 0x1FD4,
- 0x1FD6,
- 0x1FD8,
- 0x1FE2,
- 0x1FE5,
- 0x1FE6,
- 0x1FE8,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FF8,
- 0x1FFC,
- 0x1FFD,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_UTF8_C)
static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC 1047 */
@@ -30910,7 +30918,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC 1047 */
#if defined(PERL_IN_PERL_C)
-static const UV Sentence_Break_invlist[] = { /* for EBCDIC 1047 */
+static const UV _Perl_SB_invlist[] = { /* for EBCDIC 1047 */
2920, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -33862,7 +33870,7 @@ typedef enum {
SB_Upper = 15
} SB_enum;
-static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 1047 */
+static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 1047 */
SB_Other,
SB_Sp,
SB_Other,
@@ -36787,91 +36795,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 1047 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV UpperLatin1_invlist[] = { /* for EBCDIC 1047 */
- 55, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x4,
- 0x5,
- 0x6,
- 0x7,
- 0x8,
- 0xB,
- 0x14,
- 0x15,
- 0x17,
- 0x18,
- 0x1A,
- 0x1C,
- 0x20,
- 0x26,
- 0x28,
- 0x2D,
- 0x30,
- 0x32,
- 0x33,
- 0x37,
- 0x38,
- 0x3C,
- 0x3E,
- 0x3F,
- 0x41,
- 0x4B,
- 0x51,
- 0x5A,
- 0x62,
- 0x6B,
- 0x70,
- 0x79,
- 0x80,
- 0x81,
- 0x8A,
- 0x91,
- 0x9A,
- 0xA1,
- 0xAA,
- 0xAD,
- 0xAE,
- 0xBD,
- 0xBE,
- 0xC0,
- 0xCA,
- 0xD0,
- 0xDA,
- 0xE0,
- 0xE1,
- 0xE2,
- 0xEA,
- 0xF0,
- 0xFA,
- 0x100
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_PERL_C)
-static const UV VertSpace_invlist[] = { /* for EBCDIC 1047 */
- 9, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0xB,
- 0xE,
- 0x15,
- 0x16,
- 0x25,
- 0x26,
- 0x2028,
- 0x202A
-};
-
-static const UV Word_Break_invlist[] = { /* for EBCDIC 1047 */
+static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
1549, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -38455,7 +38381,7 @@ typedef enum {
WB_UNKNOWN = 18
} WB_enum;
-static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 1047 */
+static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
WB_Newline,
WB_CR,
@@ -40009,8 +39935,90 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 1047 */
#endif /* defined(PERL_IN_REGEXEC_C) */
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV UpperLatin1_invlist[] = { /* for EBCDIC 1047 */
+ 55, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x4,
+ 0x5,
+ 0x6,
+ 0x7,
+ 0x8,
+ 0xB,
+ 0x14,
+ 0x15,
+ 0x17,
+ 0x18,
+ 0x1A,
+ 0x1C,
+ 0x20,
+ 0x26,
+ 0x28,
+ 0x2D,
+ 0x30,
+ 0x32,
+ 0x33,
+ 0x37,
+ 0x38,
+ 0x3C,
+ 0x3E,
+ 0x3F,
+ 0x41,
+ 0x4B,
+ 0x51,
+ 0x5A,
+ 0x62,
+ 0x6B,
+ 0x70,
+ 0x79,
+ 0x80,
+ 0x81,
+ 0x8A,
+ 0x91,
+ 0x9A,
+ 0xA1,
+ 0xAA,
+ 0xAD,
+ 0xAE,
+ 0xBD,
+ 0xBE,
+ 0xC0,
+ 0xCA,
+ 0xD0,
+ 0xDA,
+ 0xE0,
+ 0xE1,
+ 0xE2,
+ 0xEA,
+ 0xF0,
+ 0xFA,
+ 0x100
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
#if defined(PERL_IN_PERL_C)
+static const UV VertSpace_invlist[] = { /* for EBCDIC 1047 */
+ 9, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0xB,
+ 0xE,
+ 0x15,
+ 0x16,
+ 0x25,
+ 0x26,
+ 0x2028,
+ 0x202A
+};
+
static const UV XPerlSpace_invlist[] = { /* for EBCDIC 1047 */
23, /* Number of elements */
148565664, /* Version and data structure type */
@@ -49924,7 +49932,415 @@ static const UV Cased_invlist[] = { /* for EBCDIC 037 */
0x1F18A
};
-static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC 037 */
+#endif /* defined(PERL_IN_PERL_C) */
+
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 037 */
+ 45, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x2BC,
+ 0x2BD,
+ 0x308,
+ 0x309,
+ 0x313,
+ 0x314,
+ 0x342,
+ 0x343,
+ 0x3AC,
+ 0x3AD,
+ 0x3AE,
+ 0x3AF,
+ 0x3B1,
+ 0x3B2,
+ 0x3B7,
+ 0x3B8,
+ 0x3B9,
+ 0x3BA,
+ 0x3C1,
+ 0x3C2,
+ 0x3C5,
+ 0x3C6,
+ 0x3C9,
+ 0x3CA,
+ 0x3CE,
+ 0x3CF,
+ 0x565,
+ 0x566,
+ 0x574,
+ 0x575,
+ 0x57E,
+ 0x57F,
+ 0x1F00,
+ 0x1F08,
+ 0x1F20,
+ 0x1F28,
+ 0x1F60,
+ 0x1F68,
+ 0x1F70,
+ 0x1F71,
+ 0x1F74,
+ 0x1F75,
+ 0x1F7C,
+ 0x1F7D
+};
+
+static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 037 */
+ 275, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x42,
+ 0x4A,
+ 0x51,
+ 0x5A,
+ 0x62,
+ 0x6A,
+ 0x70,
+ 0x79,
+ 0x80,
+ 0x8A,
+ 0x8C,
+ 0x8F,
+ 0x91,
+ 0x9A,
+ 0x9C,
+ 0x9D,
+ 0x9E,
+ 0x9F,
+ 0xA0,
+ 0xA1,
+ 0xA2,
+ 0xAA,
+ 0xAC,
+ 0xAF,
+ 0xC1,
+ 0xCA,
+ 0xCB,
+ 0xD0,
+ 0xD1,
+ 0xDA,
+ 0xDB,
+ 0xE0,
+ 0xE2,
+ 0xEA,
+ 0xEB,
+ 0xF0,
+ 0xFB,
+ 0xFF,
+ 0x100,
+ 0x131,
+ 0x132,
+ 0x138,
+ 0x139,
+ 0x18D,
+ 0x18E,
+ 0x19B,
+ 0x19C,
+ 0x1AA,
+ 0x1AC,
+ 0x1BA,
+ 0x1BC,
+ 0x1BE,
+ 0x1BF,
+ 0x1C0,
+ 0x1C4,
+ 0x221,
+ 0x222,
+ 0x234,
+ 0x23A,
+ 0x255,
+ 0x256,
+ 0x258,
+ 0x259,
+ 0x25A,
+ 0x25B,
+ 0x25D,
+ 0x260,
+ 0x262,
+ 0x263,
+ 0x264,
+ 0x265,
+ 0x267,
+ 0x268,
+ 0x26A,
+ 0x26B,
+ 0x26D,
+ 0x26F,
+ 0x270,
+ 0x271,
+ 0x273,
+ 0x275,
+ 0x276,
+ 0x27D,
+ 0x27E,
+ 0x280,
+ 0x281,
+ 0x283,
+ 0x284,
+ 0x287,
+ 0x28D,
+ 0x292,
+ 0x293,
+ 0x29D,
+ 0x29F,
+ 0x2BC,
+ 0x2BD,
+ 0x2BE,
+ 0x2BF,
+ 0x300,
+ 0x302,
+ 0x307,
+ 0x309,
+ 0x30A,
+ 0x30B,
+ 0x30C,
+ 0x30D,
+ 0x313,
+ 0x314,
+ 0x331,
+ 0x332,
+ 0x342,
+ 0x343,
+ 0x345,
+ 0x346,
+ 0x370,
+ 0x374,
+ 0x376,
+ 0x378,
+ 0x37B,
+ 0x37E,
+ 0x37F,
+ 0x380,
+ 0x386,
+ 0x387,
+ 0x388,
+ 0x38B,
+ 0x38C,
+ 0x38D,
+ 0x38E,
+ 0x3A2,
+ 0x3A3,
+ 0x3D2,
+ 0x3D5,
+ 0x3F6,
+ 0x3F7,
+ 0x3FC,
+ 0x3FD,
+ 0x482,
+ 0x48A,
+ 0x530,
+ 0x531,
+ 0x557,
+ 0x561,
+ 0x588,
+ 0x10A0,
+ 0x10C6,
+ 0x10C7,
+ 0x10C8,
+ 0x10CD,
+ 0x10CE,
+ 0x13A0,
+ 0x13F6,
+ 0x13F8,
+ 0x13FE,
+ 0x1D79,
+ 0x1D7A,
+ 0x1D7D,
+ 0x1D7E,
+ 0x1E00,
+ 0x1E9C,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1EA0,
+ 0x1F16,
+ 0x1F18,
+ 0x1F1E,
+ 0x1F20,
+ 0x1F46,
+ 0x1F48,
+ 0x1F4E,
+ 0x1F50,
+ 0x1F58,
+ 0x1F59,
+ 0x1F5A,
+ 0x1F5B,
+ 0x1F5C,
+ 0x1F5D,
+ 0x1F5E,
+ 0x1F5F,
+ 0x1F7E,
+ 0x1F80,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FBD,
+ 0x1FBE,
+ 0x1FBF,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FCD,
+ 0x1FD0,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FDC,
+ 0x1FE0,
+ 0x1FED,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FFD,
+ 0x2126,
+ 0x2127,
+ 0x212A,
+ 0x212C,
+ 0x2132,
+ 0x2133,
+ 0x214E,
+ 0x214F,
+ 0x2160,
+ 0x2180,
+ 0x2183,
+ 0x2185,
+ 0x24B6,
+ 0x24EA,
+ 0x2C00,
+ 0x2C2F,
+ 0x2C30,
+ 0x2C5F,
+ 0x2C60,
+ 0x2C71,
+ 0x2C72,
+ 0x2C74,
+ 0x2C75,
+ 0x2C77,
+ 0x2C7E,
+ 0x2CE4,
+ 0x2CEB,
+ 0x2CEF,
+ 0x2CF2,
+ 0x2CF4,
+ 0x2D00,
+ 0x2D26,
+ 0x2D27,
+ 0x2D28,
+ 0x2D2D,
+ 0x2D2E,
+ 0xA640,
+ 0xA66E,
+ 0xA680,
+ 0xA69C,
+ 0xA722,
+ 0xA730,
+ 0xA732,
+ 0xA770,
+ 0xA779,
+ 0xA788,
+ 0xA78B,
+ 0xA78E,
+ 0xA790,
+ 0xA794,
+ 0xA796,
+ 0xA7AE,
+ 0xA7B0,
+ 0xA7B8,
+ 0xAB53,
+ 0xAB54,
+ 0xAB70,
+ 0xABC0,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18,
+ 0xFF21,
+ 0xFF3B,
+ 0xFF41,
+ 0xFF5B,
+ 0x10400,
+ 0x10450,
+ 0x10C80,
+ 0x10CB3,
+ 0x10CC0,
+ 0x10CF3,
+ 0x118A0,
+ 0x118E0
+};
+
+static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 037 */
+ 59, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x59,
+ 0x5A,
+ 0x130,
+ 0x131,
+ 0x149,
+ 0x14A,
+ 0x1F0,
+ 0x1F1,
+ 0x390,
+ 0x391,
+ 0x3B0,
+ 0x3B1,
+ 0x587,
+ 0x588,
+ 0x1E96,
+ 0x1E9B,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1F50,
+ 0x1F51,
+ 0x1F52,
+ 0x1F53,
+ 0x1F54,
+ 0x1F55,
+ 0x1F56,
+ 0x1F57,
+ 0x1F80,
+ 0x1FB0,
+ 0x1FB2,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FB8,
+ 0x1FBC,
+ 0x1FBD,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FC8,
+ 0x1FCC,
+ 0x1FCD,
+ 0x1FD2,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FD8,
+ 0x1FE2,
+ 0x1FE5,
+ 0x1FE6,
+ 0x1FE8,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FF8,
+ 0x1FFC,
+ 0x1FFD,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
+#if defined(PERL_IN_PERL_C)
+
+static const UV _Perl_GCB_invlist[] = { /* for EBCDIC 037 */
1502, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -51456,7 +51872,7 @@ typedef enum {
GCB_V = 13
} GCB_enum;
-static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 037 */
+static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 037 */
GCB_Control,
GCB_CR,
GCB_Control,
@@ -52963,410 +53379,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC 037 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 037 */
- 45, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x2BC,
- 0x2BD,
- 0x308,
- 0x309,
- 0x313,
- 0x314,
- 0x342,
- 0x343,
- 0x3AC,
- 0x3AD,
- 0x3AE,
- 0x3AF,
- 0x3B1,
- 0x3B2,
- 0x3B7,
- 0x3B8,
- 0x3B9,
- 0x3BA,
- 0x3C1,
- 0x3C2,
- 0x3C5,
- 0x3C6,
- 0x3C9,
- 0x3CA,
- 0x3CE,
- 0x3CF,
- 0x565,
- 0x566,
- 0x574,
- 0x575,
- 0x57E,
- 0x57F,
- 0x1F00,
- 0x1F08,
- 0x1F20,
- 0x1F28,
- 0x1F60,
- 0x1F68,
- 0x1F70,
- 0x1F71,
- 0x1F74,
- 0x1F75,
- 0x1F7C,
- 0x1F7D
-};
-
-static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC 037 */
- 275, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x42,
- 0x4A,
- 0x51,
- 0x5A,
- 0x62,
- 0x6A,
- 0x70,
- 0x79,
- 0x80,
- 0x8A,
- 0x8C,
- 0x8F,
- 0x91,
- 0x9A,
- 0x9C,
- 0x9D,
- 0x9E,
- 0x9F,
- 0xA0,
- 0xA1,
- 0xA2,
- 0xAA,
- 0xAC,
- 0xAF,
- 0xC1,
- 0xCA,
- 0xCB,
- 0xD0,
- 0xD1,
- 0xDA,
- 0xDB,
- 0xE0,
- 0xE2,
- 0xEA,
- 0xEB,
- 0xF0,
- 0xFB,
- 0xFF,
- 0x100,
- 0x131,
- 0x132,
- 0x138,
- 0x139,
- 0x18D,
- 0x18E,
- 0x19B,
- 0x19C,
- 0x1AA,
- 0x1AC,
- 0x1BA,
- 0x1BC,
- 0x1BE,
- 0x1BF,
- 0x1C0,
- 0x1C4,
- 0x221,
- 0x222,
- 0x234,
- 0x23A,
- 0x255,
- 0x256,
- 0x258,
- 0x259,
- 0x25A,
- 0x25B,
- 0x25D,
- 0x260,
- 0x262,
- 0x263,
- 0x264,
- 0x265,
- 0x267,
- 0x268,
- 0x26A,
- 0x26B,
- 0x26D,
- 0x26F,
- 0x270,
- 0x271,
- 0x273,
- 0x275,
- 0x276,
- 0x27D,
- 0x27E,
- 0x280,
- 0x281,
- 0x283,
- 0x284,
- 0x287,
- 0x28D,
- 0x292,
- 0x293,
- 0x29D,
- 0x29F,
- 0x2BC,
- 0x2BD,
- 0x2BE,
- 0x2BF,
- 0x300,
- 0x302,
- 0x307,
- 0x309,
- 0x30A,
- 0x30B,
- 0x30C,
- 0x30D,
- 0x313,
- 0x314,
- 0x331,
- 0x332,
- 0x342,
- 0x343,
- 0x345,
- 0x346,
- 0x370,
- 0x374,
- 0x376,
- 0x378,
- 0x37B,
- 0x37E,
- 0x37F,
- 0x380,
- 0x386,
- 0x387,
- 0x388,
- 0x38B,
- 0x38C,
- 0x38D,
- 0x38E,
- 0x3A2,
- 0x3A3,
- 0x3D2,
- 0x3D5,
- 0x3F6,
- 0x3F7,
- 0x3FC,
- 0x3FD,
- 0x482,
- 0x48A,
- 0x530,
- 0x531,
- 0x557,
- 0x561,
- 0x588,
- 0x10A0,
- 0x10C6,
- 0x10C7,
- 0x10C8,
- 0x10CD,
- 0x10CE,
- 0x13A0,
- 0x13F6,
- 0x13F8,
- 0x13FE,
- 0x1D79,
- 0x1D7A,
- 0x1D7D,
- 0x1D7E,
- 0x1E00,
- 0x1E9C,
- 0x1E9E,
- 0x1E9F,
- 0x1EA0,
- 0x1F16,
- 0x1F18,
- 0x1F1E,
- 0x1F20,
- 0x1F46,
- 0x1F48,
- 0x1F4E,
- 0x1F50,
- 0x1F58,
- 0x1F59,
- 0x1F5A,
- 0x1F5B,
- 0x1F5C,
- 0x1F5D,
- 0x1F5E,
- 0x1F5F,
- 0x1F7E,
- 0x1F80,
- 0x1FB5,
- 0x1FB6,
- 0x1FBD,
- 0x1FBE,
- 0x1FBF,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FCD,
- 0x1FD0,
- 0x1FD4,
- 0x1FD6,
- 0x1FDC,
- 0x1FE0,
- 0x1FED,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FFD,
- 0x2126,
- 0x2127,
- 0x212A,
- 0x212C,
- 0x2132,
- 0x2133,
- 0x214E,
- 0x214F,
- 0x2160,
- 0x2180,
- 0x2183,
- 0x2185,
- 0x24B6,
- 0x24EA,
- 0x2C00,
- 0x2C2F,
- 0x2C30,
- 0x2C5F,
- 0x2C60,
- 0x2C71,
- 0x2C72,
- 0x2C74,
- 0x2C75,
- 0x2C77,
- 0x2C7E,
- 0x2CE4,
- 0x2CEB,
- 0x2CEF,
- 0x2CF2,
- 0x2CF4,
- 0x2D00,
- 0x2D26,
- 0x2D27,
- 0x2D28,
- 0x2D2D,
- 0x2D2E,
- 0xA640,
- 0xA66E,
- 0xA680,
- 0xA69C,
- 0xA722,
- 0xA730,
- 0xA732,
- 0xA770,
- 0xA779,
- 0xA788,
- 0xA78B,
- 0xA78E,
- 0xA790,
- 0xA794,
- 0xA796,
- 0xA7AE,
- 0xA7B0,
- 0xA7B8,
- 0xAB53,
- 0xAB54,
- 0xAB70,
- 0xABC0,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18,
- 0xFF21,
- 0xFF3B,
- 0xFF41,
- 0xFF5B,
- 0x10400,
- 0x10450,
- 0x10C80,
- 0x10CB3,
- 0x10CC0,
- 0x10CF3,
- 0x118A0,
- 0x118E0
-};
-
-static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC 037 */
- 59, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x59,
- 0x5A,
- 0x130,
- 0x131,
- 0x149,
- 0x14A,
- 0x1F0,
- 0x1F1,
- 0x390,
- 0x391,
- 0x3B0,
- 0x3B1,
- 0x587,
- 0x588,
- 0x1E96,
- 0x1E9B,
- 0x1E9E,
- 0x1E9F,
- 0x1F50,
- 0x1F51,
- 0x1F52,
- 0x1F53,
- 0x1F54,
- 0x1F55,
- 0x1F56,
- 0x1F57,
- 0x1F80,
- 0x1FB0,
- 0x1FB2,
- 0x1FB5,
- 0x1FB6,
- 0x1FB8,
- 0x1FBC,
- 0x1FBD,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FC8,
- 0x1FCC,
- 0x1FCD,
- 0x1FD2,
- 0x1FD4,
- 0x1FD6,
- 0x1FD8,
- 0x1FE2,
- 0x1FE5,
- 0x1FE6,
- 0x1FE8,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FF8,
- 0x1FFC,
- 0x1FFD,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_UTF8_C)
static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC 037 */
@@ -55875,7 +55887,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC 037 */
#if defined(PERL_IN_PERL_C)
-static const UV Sentence_Break_invlist[] = { /* for EBCDIC 037 */
+static const UV _Perl_SB_invlist[] = { /* for EBCDIC 037 */
2916, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -58823,7 +58835,7 @@ typedef enum {
SB_Upper = 15
} SB_enum;
-static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 037 */
+static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 037 */
SB_Other,
SB_Sp,
SB_Other,
@@ -61744,93 +61756,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC 037 */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV UpperLatin1_invlist[] = { /* for EBCDIC 037 */
- 57, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x4,
- 0x5,
- 0x6,
- 0x7,
- 0x8,
- 0xB,
- 0x14,
- 0x16,
- 0x17,
- 0x18,
- 0x1A,
- 0x1C,
- 0x20,
- 0x25,
- 0x28,
- 0x2D,
- 0x30,
- 0x32,
- 0x33,
- 0x37,
- 0x38,
- 0x3C,
- 0x3E,
- 0x3F,
- 0x41,
- 0x4B,
- 0x51,
- 0x5A,
- 0x5F,
- 0x60,
- 0x62,
- 0x6B,
- 0x70,
- 0x79,
- 0x80,
- 0x81,
- 0x8A,
- 0x91,
- 0x9A,
- 0xA1,
- 0xAA,
- 0xB0,
- 0xB1,
- 0xBA,
- 0xBC,
- 0xC0,
- 0xCA,
- 0xD0,
- 0xDA,
- 0xE0,
- 0xE1,
- 0xE2,
- 0xEA,
- 0xF0,
- 0xFA,
- 0x100
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_PERL_C)
-static const UV VertSpace_invlist[] = { /* for EBCDIC 037 */
- 9, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0xB,
- 0xE,
- 0x15,
- 0x16,
- 0x25,
- 0x26,
- 0x2028,
- 0x202A
-};
-
-static const UV Word_Break_invlist[] = { /* for EBCDIC 037 */
+static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
1545, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -63410,7 +63338,7 @@ typedef enum {
WB_UNKNOWN = 18
} WB_enum;
-static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 037 */
+static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
WB_Newline,
WB_CR,
@@ -64960,8 +64888,92 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC 037 */
#endif /* defined(PERL_IN_REGEXEC_C) */
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV UpperLatin1_invlist[] = { /* for EBCDIC 037 */
+ 57, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x4,
+ 0x5,
+ 0x6,
+ 0x7,
+ 0x8,
+ 0xB,
+ 0x14,
+ 0x16,
+ 0x17,
+ 0x18,
+ 0x1A,
+ 0x1C,
+ 0x20,
+ 0x25,
+ 0x28,
+ 0x2D,
+ 0x30,
+ 0x32,
+ 0x33,
+ 0x37,
+ 0x38,
+ 0x3C,
+ 0x3E,
+ 0x3F,
+ 0x41,
+ 0x4B,
+ 0x51,
+ 0x5A,
+ 0x5F,
+ 0x60,
+ 0x62,
+ 0x6B,
+ 0x70,
+ 0x79,
+ 0x80,
+ 0x81,
+ 0x8A,
+ 0x91,
+ 0x9A,
+ 0xA1,
+ 0xAA,
+ 0xB0,
+ 0xB1,
+ 0xBA,
+ 0xBC,
+ 0xC0,
+ 0xCA,
+ 0xD0,
+ 0xDA,
+ 0xE0,
+ 0xE1,
+ 0xE2,
+ 0xEA,
+ 0xF0,
+ 0xFA,
+ 0x100
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
#if defined(PERL_IN_PERL_C)
+static const UV VertSpace_invlist[] = { /* for EBCDIC 037 */
+ 9, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0xB,
+ 0xE,
+ 0x15,
+ 0x16,
+ 0x25,
+ 0x26,
+ 0x2028,
+ 0x202A
+};
+
static const UV XPerlSpace_invlist[] = { /* for EBCDIC 037 */
23, /* Number of elements */
148565664, /* Version and data structure type */
@@ -74865,7 +74877,417 @@ static const UV Cased_invlist[] = { /* for EBCDIC POSIX-BC */
0x1F18A
};
-static const UV Grapheme_Cluster_Break_invlist[] = { /* for EBCDIC POSIX-BC */
+#endif /* defined(PERL_IN_PERL_C) */
+
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC POSIX-BC */
+ 45, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x2BC,
+ 0x2BD,
+ 0x308,
+ 0x309,
+ 0x313,
+ 0x314,
+ 0x342,
+ 0x343,
+ 0x3AC,
+ 0x3AD,
+ 0x3AE,
+ 0x3AF,
+ 0x3B1,
+ 0x3B2,
+ 0x3B7,
+ 0x3B8,
+ 0x3B9,
+ 0x3BA,
+ 0x3C1,
+ 0x3C2,
+ 0x3C5,
+ 0x3C6,
+ 0x3C9,
+ 0x3CA,
+ 0x3CE,
+ 0x3CF,
+ 0x565,
+ 0x566,
+ 0x574,
+ 0x575,
+ 0x57E,
+ 0x57F,
+ 0x1F00,
+ 0x1F08,
+ 0x1F20,
+ 0x1F28,
+ 0x1F60,
+ 0x1F68,
+ 0x1F70,
+ 0x1F71,
+ 0x1F74,
+ 0x1F75,
+ 0x1F7C,
+ 0x1F7D
+};
+
+static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC POSIX-BC */
+ 277, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x42,
+ 0x4A,
+ 0x51,
+ 0x5A,
+ 0x62,
+ 0x6A,
+ 0x70,
+ 0x79,
+ 0x80,
+ 0x8A,
+ 0x8C,
+ 0x8F,
+ 0x91,
+ 0x9A,
+ 0x9C,
+ 0x9D,
+ 0x9E,
+ 0x9F,
+ 0xA0,
+ 0xA1,
+ 0xA2,
+ 0xAA,
+ 0xAC,
+ 0xAF,
+ 0xC0,
+ 0xCA,
+ 0xCB,
+ 0xD0,
+ 0xD1,
+ 0xDA,
+ 0xDB,
+ 0xE1,
+ 0xE2,
+ 0xEA,
+ 0xEB,
+ 0xF0,
+ 0xFC,
+ 0xFD,
+ 0xFE,
+ 0xFF,
+ 0x100,
+ 0x131,
+ 0x132,
+ 0x138,
+ 0x139,
+ 0x18D,
+ 0x18E,
+ 0x19B,
+ 0x19C,
+ 0x1AA,
+ 0x1AC,
+ 0x1BA,
+ 0x1BC,
+ 0x1BE,
+ 0x1BF,
+ 0x1C0,
+ 0x1C4,
+ 0x221,
+ 0x222,
+ 0x234,
+ 0x23A,
+ 0x255,
+ 0x256,
+ 0x258,
+ 0x259,
+ 0x25A,
+ 0x25B,
+ 0x25D,
+ 0x260,
+ 0x262,
+ 0x263,
+ 0x264,
+ 0x265,
+ 0x267,
+ 0x268,
+ 0x26A,
+ 0x26B,
+ 0x26D,
+ 0x26F,
+ 0x270,
+ 0x271,
+ 0x273,
+ 0x275,
+ 0x276,
+ 0x27D,
+ 0x27E,
+ 0x280,
+ 0x281,
+ 0x283,
+ 0x284,
+ 0x287,
+ 0x28D,
+ 0x292,
+ 0x293,
+ 0x29D,
+ 0x29F,
+ 0x2BC,
+ 0x2BD,
+ 0x2BE,
+ 0x2BF,
+ 0x300,
+ 0x302,
+ 0x307,
+ 0x309,
+ 0x30A,
+ 0x30B,
+ 0x30C,
+ 0x30D,
+ 0x313,
+ 0x314,
+ 0x331,
+ 0x332,
+ 0x342,
+ 0x343,
+ 0x345,
+ 0x346,
+ 0x370,
+ 0x374,
+ 0x376,
+ 0x378,
+ 0x37B,
+ 0x37E,
+ 0x37F,
+ 0x380,
+ 0x386,
+ 0x387,
+ 0x388,
+ 0x38B,
+ 0x38C,
+ 0x38D,
+ 0x38E,
+ 0x3A2,
+ 0x3A3,
+ 0x3D2,
+ 0x3D5,
+ 0x3F6,
+ 0x3F7,
+ 0x3FC,
+ 0x3FD,
+ 0x482,
+ 0x48A,
+ 0x530,
+ 0x531,
+ 0x557,
+ 0x561,
+ 0x588,
+ 0x10A0,
+ 0x10C6,
+ 0x10C7,
+ 0x10C8,
+ 0x10CD,
+ 0x10CE,
+ 0x13A0,
+ 0x13F6,
+ 0x13F8,
+ 0x13FE,
+ 0x1D79,
+ 0x1D7A,
+ 0x1D7D,
+ 0x1D7E,
+ 0x1E00,
+ 0x1E9C,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1EA0,
+ 0x1F16,
+ 0x1F18,
+ 0x1F1E,
+ 0x1F20,
+ 0x1F46,
+ 0x1F48,
+ 0x1F4E,
+ 0x1F50,
+ 0x1F58,
+ 0x1F59,
+ 0x1F5A,
+ 0x1F5B,
+ 0x1F5C,
+ 0x1F5D,
+ 0x1F5E,
+ 0x1F5F,
+ 0x1F7E,
+ 0x1F80,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FBD,
+ 0x1FBE,
+ 0x1FBF,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FCD,
+ 0x1FD0,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FDC,
+ 0x1FE0,
+ 0x1FED,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FFD,
+ 0x2126,
+ 0x2127,
+ 0x212A,
+ 0x212C,
+ 0x2132,
+ 0x2133,
+ 0x214E,
+ 0x214F,
+ 0x2160,
+ 0x2180,
+ 0x2183,
+ 0x2185,
+ 0x24B6,
+ 0x24EA,
+ 0x2C00,
+ 0x2C2F,
+ 0x2C30,
+ 0x2C5F,
+ 0x2C60,
+ 0x2C71,
+ 0x2C72,
+ 0x2C74,
+ 0x2C75,
+ 0x2C77,
+ 0x2C7E,
+ 0x2CE4,
+ 0x2CEB,
+ 0x2CEF,
+ 0x2CF2,
+ 0x2CF4,
+ 0x2D00,
+ 0x2D26,
+ 0x2D27,
+ 0x2D28,
+ 0x2D2D,
+ 0x2D2E,
+ 0xA640,
+ 0xA66E,
+ 0xA680,
+ 0xA69C,
+ 0xA722,
+ 0xA730,
+ 0xA732,
+ 0xA770,
+ 0xA779,
+ 0xA788,
+ 0xA78B,
+ 0xA78E,
+ 0xA790,
+ 0xA794,
+ 0xA796,
+ 0xA7AE,
+ 0xA7B0,
+ 0xA7B8,
+ 0xAB53,
+ 0xAB54,
+ 0xAB70,
+ 0xABC0,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18,
+ 0xFF21,
+ 0xFF3B,
+ 0xFF41,
+ 0xFF5B,
+ 0x10400,
+ 0x10450,
+ 0x10C80,
+ 0x10CB3,
+ 0x10CC0,
+ 0x10CF3,
+ 0x118A0,
+ 0x118E0
+};
+
+static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC POSIX-BC */
+ 59, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x59,
+ 0x5A,
+ 0x130,
+ 0x131,
+ 0x149,
+ 0x14A,
+ 0x1F0,
+ 0x1F1,
+ 0x390,
+ 0x391,
+ 0x3B0,
+ 0x3B1,
+ 0x587,
+ 0x588,
+ 0x1E96,
+ 0x1E9B,
+ 0x1E9E,
+ 0x1E9F,
+ 0x1F50,
+ 0x1F51,
+ 0x1F52,
+ 0x1F53,
+ 0x1F54,
+ 0x1F55,
+ 0x1F56,
+ 0x1F57,
+ 0x1F80,
+ 0x1FB0,
+ 0x1FB2,
+ 0x1FB5,
+ 0x1FB6,
+ 0x1FB8,
+ 0x1FBC,
+ 0x1FBD,
+ 0x1FC2,
+ 0x1FC5,
+ 0x1FC6,
+ 0x1FC8,
+ 0x1FCC,
+ 0x1FCD,
+ 0x1FD2,
+ 0x1FD4,
+ 0x1FD6,
+ 0x1FD8,
+ 0x1FE2,
+ 0x1FE5,
+ 0x1FE6,
+ 0x1FE8,
+ 0x1FF2,
+ 0x1FF5,
+ 0x1FF6,
+ 0x1FF8,
+ 0x1FFC,
+ 0x1FFD,
+ 0xFB00,
+ 0xFB07,
+ 0xFB13,
+ 0xFB18
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
+#if defined(PERL_IN_PERL_C)
+
+static const UV _Perl_GCB_invlist[] = { /* for EBCDIC POSIX-BC */
1502, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -76397,7 +76819,7 @@ typedef enum {
GCB_V = 13
} GCB_enum;
-static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC POSIX-BC */
+static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC POSIX-BC */
GCB_Control,
GCB_CR,
GCB_Control,
@@ -77904,412 +78326,6 @@ static const GCB_enum Grapheme_Cluster_Break_invmap[] = { /* for EBCDIC POSIX-BC
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC POSIX-BC */
- 45, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x2BC,
- 0x2BD,
- 0x308,
- 0x309,
- 0x313,
- 0x314,
- 0x342,
- 0x343,
- 0x3AC,
- 0x3AD,
- 0x3AE,
- 0x3AF,
- 0x3B1,
- 0x3B2,
- 0x3B7,
- 0x3B8,
- 0x3B9,
- 0x3BA,
- 0x3C1,
- 0x3C2,
- 0x3C5,
- 0x3C6,
- 0x3C9,
- 0x3CA,
- 0x3CE,
- 0x3CF,
- 0x565,
- 0x566,
- 0x574,
- 0x575,
- 0x57E,
- 0x57F,
- 0x1F00,
- 0x1F08,
- 0x1F20,
- 0x1F28,
- 0x1F60,
- 0x1F68,
- 0x1F70,
- 0x1F71,
- 0x1F74,
- 0x1F75,
- 0x1F7C,
- 0x1F7D
-};
-
-static const UV _Perl_Any_Folds_invlist[] = { /* for EBCDIC POSIX-BC */
- 277, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x42,
- 0x4A,
- 0x51,
- 0x5A,
- 0x62,
- 0x6A,
- 0x70,
- 0x79,
- 0x80,
- 0x8A,
- 0x8C,
- 0x8F,
- 0x91,
- 0x9A,
- 0x9C,
- 0x9D,
- 0x9E,
- 0x9F,
- 0xA0,
- 0xA1,
- 0xA2,
- 0xAA,
- 0xAC,
- 0xAF,
- 0xC0,
- 0xCA,
- 0xCB,
- 0xD0,
- 0xD1,
- 0xDA,
- 0xDB,
- 0xE1,
- 0xE2,
- 0xEA,
- 0xEB,
- 0xF0,
- 0xFC,
- 0xFD,
- 0xFE,
- 0xFF,
- 0x100,
- 0x131,
- 0x132,
- 0x138,
- 0x139,
- 0x18D,
- 0x18E,
- 0x19B,
- 0x19C,
- 0x1AA,
- 0x1AC,
- 0x1BA,
- 0x1BC,
- 0x1BE,
- 0x1BF,
- 0x1C0,
- 0x1C4,
- 0x221,
- 0x222,
- 0x234,
- 0x23A,
- 0x255,
- 0x256,
- 0x258,
- 0x259,
- 0x25A,
- 0x25B,
- 0x25D,
- 0x260,
- 0x262,
- 0x263,
- 0x264,
- 0x265,
- 0x267,
- 0x268,
- 0x26A,
- 0x26B,
- 0x26D,
- 0x26F,
- 0x270,
- 0x271,
- 0x273,
- 0x275,
- 0x276,
- 0x27D,
- 0x27E,
- 0x280,
- 0x281,
- 0x283,
- 0x284,
- 0x287,
- 0x28D,
- 0x292,
- 0x293,
- 0x29D,
- 0x29F,
- 0x2BC,
- 0x2BD,
- 0x2BE,
- 0x2BF,
- 0x300,
- 0x302,
- 0x307,
- 0x309,
- 0x30A,
- 0x30B,
- 0x30C,
- 0x30D,
- 0x313,
- 0x314,
- 0x331,
- 0x332,
- 0x342,
- 0x343,
- 0x345,
- 0x346,
- 0x370,
- 0x374,
- 0x376,
- 0x378,
- 0x37B,
- 0x37E,
- 0x37F,
- 0x380,
- 0x386,
- 0x387,
- 0x388,
- 0x38B,
- 0x38C,
- 0x38D,
- 0x38E,
- 0x3A2,
- 0x3A3,
- 0x3D2,
- 0x3D5,
- 0x3F6,
- 0x3F7,
- 0x3FC,
- 0x3FD,
- 0x482,
- 0x48A,
- 0x530,
- 0x531,
- 0x557,
- 0x561,
- 0x588,
- 0x10A0,
- 0x10C6,
- 0x10C7,
- 0x10C8,
- 0x10CD,
- 0x10CE,
- 0x13A0,
- 0x13F6,
- 0x13F8,
- 0x13FE,
- 0x1D79,
- 0x1D7A,
- 0x1D7D,
- 0x1D7E,
- 0x1E00,
- 0x1E9C,
- 0x1E9E,
- 0x1E9F,
- 0x1EA0,
- 0x1F16,
- 0x1F18,
- 0x1F1E,
- 0x1F20,
- 0x1F46,
- 0x1F48,
- 0x1F4E,
- 0x1F50,
- 0x1F58,
- 0x1F59,
- 0x1F5A,
- 0x1F5B,
- 0x1F5C,
- 0x1F5D,
- 0x1F5E,
- 0x1F5F,
- 0x1F7E,
- 0x1F80,
- 0x1FB5,
- 0x1FB6,
- 0x1FBD,
- 0x1FBE,
- 0x1FBF,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FCD,
- 0x1FD0,
- 0x1FD4,
- 0x1FD6,
- 0x1FDC,
- 0x1FE0,
- 0x1FED,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FFD,
- 0x2126,
- 0x2127,
- 0x212A,
- 0x212C,
- 0x2132,
- 0x2133,
- 0x214E,
- 0x214F,
- 0x2160,
- 0x2180,
- 0x2183,
- 0x2185,
- 0x24B6,
- 0x24EA,
- 0x2C00,
- 0x2C2F,
- 0x2C30,
- 0x2C5F,
- 0x2C60,
- 0x2C71,
- 0x2C72,
- 0x2C74,
- 0x2C75,
- 0x2C77,
- 0x2C7E,
- 0x2CE4,
- 0x2CEB,
- 0x2CEF,
- 0x2CF2,
- 0x2CF4,
- 0x2D00,
- 0x2D26,
- 0x2D27,
- 0x2D28,
- 0x2D2D,
- 0x2D2E,
- 0xA640,
- 0xA66E,
- 0xA680,
- 0xA69C,
- 0xA722,
- 0xA730,
- 0xA732,
- 0xA770,
- 0xA779,
- 0xA788,
- 0xA78B,
- 0xA78E,
- 0xA790,
- 0xA794,
- 0xA796,
- 0xA7AE,
- 0xA7B0,
- 0xA7B8,
- 0xAB53,
- 0xAB54,
- 0xAB70,
- 0xABC0,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18,
- 0xFF21,
- 0xFF3B,
- 0xFF41,
- 0xFF5B,
- 0x10400,
- 0x10450,
- 0x10C80,
- 0x10CB3,
- 0x10CC0,
- 0x10CF3,
- 0x118A0,
- 0x118E0
-};
-
-static const UV _Perl_Folds_To_Multi_Char_invlist[] = { /* for EBCDIC POSIX-BC */
- 59, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x59,
- 0x5A,
- 0x130,
- 0x131,
- 0x149,
- 0x14A,
- 0x1F0,
- 0x1F1,
- 0x390,
- 0x391,
- 0x3B0,
- 0x3B1,
- 0x587,
- 0x588,
- 0x1E96,
- 0x1E9B,
- 0x1E9E,
- 0x1E9F,
- 0x1F50,
- 0x1F51,
- 0x1F52,
- 0x1F53,
- 0x1F54,
- 0x1F55,
- 0x1F56,
- 0x1F57,
- 0x1F80,
- 0x1FB0,
- 0x1FB2,
- 0x1FB5,
- 0x1FB6,
- 0x1FB8,
- 0x1FBC,
- 0x1FBD,
- 0x1FC2,
- 0x1FC5,
- 0x1FC6,
- 0x1FC8,
- 0x1FCC,
- 0x1FCD,
- 0x1FD2,
- 0x1FD4,
- 0x1FD6,
- 0x1FD8,
- 0x1FE2,
- 0x1FE5,
- 0x1FE6,
- 0x1FE8,
- 0x1FF2,
- 0x1FF5,
- 0x1FF6,
- 0x1FF8,
- 0x1FFC,
- 0x1FFD,
- 0xFB00,
- 0xFB07,
- 0xFB13,
- 0xFB18
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_UTF8_C)
static const UV _Perl_IDCont_invlist[] = { /* for EBCDIC POSIX-BC */
@@ -80822,7 +80838,7 @@ static const UV _Perl_IDStart_invlist[] = { /* for EBCDIC POSIX-BC */
#if defined(PERL_IN_PERL_C)
-static const UV Sentence_Break_invlist[] = { /* for EBCDIC POSIX-BC */
+static const UV _Perl_SB_invlist[] = { /* for EBCDIC POSIX-BC */
2924, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -83778,7 +83794,7 @@ typedef enum {
SB_Upper = 15
} SB_enum;
-static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC POSIX-BC */
+static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC POSIX-BC */
SB_Other,
SB_Sp,
SB_Other,
@@ -86707,93 +86723,9 @@ static const SB_enum Sentence_Break_invmap[] = { /* for EBCDIC POSIX-BC */
#endif /* defined(PERL_IN_REGEXEC_C) */
-#if defined(PERL_IN_REGCOMP_C)
-
-static const UV UpperLatin1_invlist[] = { /* for EBCDIC POSIX-BC */
- 57, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0x4,
- 0x5,
- 0x6,
- 0x7,
- 0x8,
- 0xB,
- 0x14,
- 0x15,
- 0x17,
- 0x18,
- 0x1A,
- 0x1C,
- 0x20,
- 0x26,
- 0x28,
- 0x2D,
- 0x30,
- 0x32,
- 0x33,
- 0x37,
- 0x38,
- 0x3C,
- 0x3E,
- 0x3F,
- 0x41,
- 0x4A,
- 0x51,
- 0x5A,
- 0x5F,
- 0x60,
- 0x62,
- 0x6A,
- 0x70,
- 0x7A,
- 0x80,
- 0x81,
- 0x8A,
- 0x91,
- 0x9A,
- 0xA2,
- 0xAA,
- 0xBB,
- 0xBE,
- 0xC1,
- 0xCA,
- 0xD1,
- 0xDA,
- 0xE2,
- 0xEA,
- 0xF0,
- 0xFA,
- 0xFB,
- 0xFC,
- 0xFD,
- 0xFE,
- 0xFF
-};
-
-#endif /* defined(PERL_IN_REGCOMP_C) */
-
#if defined(PERL_IN_PERL_C)
-static const UV VertSpace_invlist[] = { /* for EBCDIC POSIX-BC */
- 9, /* Number of elements */
- 148565664, /* Version and data structure type */
- 1, /* 0 if the list starts at 0;
- 1 if it starts at the element beyond 0 */
- 0x0,
- 0xB,
- 0xE,
- 0x15,
- 0x16,
- 0x25,
- 0x26,
- 0x2028,
- 0x202A
-};
-
-static const UV Word_Break_invlist[] = { /* for EBCDIC POSIX-BC */
+static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
1547, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
@@ -88375,7 +88307,7 @@ typedef enum {
WB_UNKNOWN = 18
} WB_enum;
-static const WB_enum Word_Break_invmap[] = { /* for EBCDIC POSIX-BC */
+static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
WB_Newline,
WB_CR,
@@ -89927,8 +89859,92 @@ static const WB_enum Word_Break_invmap[] = { /* for EBCDIC POSIX-BC */
#endif /* defined(PERL_IN_REGEXEC_C) */
+#if defined(PERL_IN_REGCOMP_C)
+
+static const UV UpperLatin1_invlist[] = { /* for EBCDIC POSIX-BC */
+ 57, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0x4,
+ 0x5,
+ 0x6,
+ 0x7,
+ 0x8,
+ 0xB,
+ 0x14,
+ 0x15,
+ 0x17,
+ 0x18,
+ 0x1A,
+ 0x1C,
+ 0x20,
+ 0x26,
+ 0x28,
+ 0x2D,
+ 0x30,
+ 0x32,
+ 0x33,
+ 0x37,
+ 0x38,
+ 0x3C,
+ 0x3E,
+ 0x3F,
+ 0x41,
+ 0x4A,
+ 0x51,
+ 0x5A,
+ 0x5F,
+ 0x60,
+ 0x62,
+ 0x6A,
+ 0x70,
+ 0x7A,
+ 0x80,
+ 0x81,
+ 0x8A,
+ 0x91,
+ 0x9A,
+ 0xA2,
+ 0xAA,
+ 0xBB,
+ 0xBE,
+ 0xC1,
+ 0xCA,
+ 0xD1,
+ 0xDA,
+ 0xE2,
+ 0xEA,
+ 0xF0,
+ 0xFA,
+ 0xFB,
+ 0xFC,
+ 0xFD,
+ 0xFE,
+ 0xFF
+};
+
+#endif /* defined(PERL_IN_REGCOMP_C) */
+
#if defined(PERL_IN_PERL_C)
+static const UV VertSpace_invlist[] = { /* for EBCDIC POSIX-BC */
+ 9, /* Number of elements */
+ 148565664, /* Version and data structure type */
+ 1, /* 0 if the list starts at 0;
+ 1 if it starts at the element beyond 0 */
+ 0x0,
+ 0xB,
+ 0xE,
+ 0x15,
+ 0x16,
+ 0x25,
+ 0x26,
+ 0x2028,
+ 0x202A
+};
+
static const UV XPerlSpace_invlist[] = { /* for EBCDIC POSIX-BC */
23, /* Number of elements */
148565664, /* Version and data structure type */
@@ -99479,7 +99495,7 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */
#endif /* EBCDIC POSIX-BC */
/* Generated from:
- * 083180df694deb1fc173361406c1a75619fb8376403db3a76dc585c1e3951eca lib/Unicode/UCD.pm
+ * 0bca60a25eb4ccf2e04f50446db5f882322f50a9c61dc57bb806ccfc9b2e26a4 lib/Unicode/UCD.pm
* ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt
* 1687fe5994eb7e5c0dab8503fc2a1b3b479d91af9d3b8055941c9bd791f7d0b5 lib/unicore/BidiBrackets.txt
* 350d1302116194b0b21def287434b55c5088098fbc726e879f7420a391965643 lib/unicore/BidiMirroring.txt
@@ -99521,8 +99537,8 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * c9326eab8d7861c3543963e555d5b927348f4467c93071db23154dece7619654 lib/unicore/mktables
+ * 46f739fb5c9daf6fb457ed67f821d88d9eadd2df17b098f385b3b50f99c01acf lib/unicore/mktables
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
* c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl
- * f199f92c0b5f87882b0198936ea8ef3dc43627b57a77ac3eb9250bd2664bbd88 regen/mk_invlists.pl
+ * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl
* ex: set ro: */
diff --git a/ext/XS-APItest/t/handy.t b/ext/XS-APItest/t/handy.t
index 9ebe0d3dd8..eea81e8a8f 100644
--- a/ext/XS-APItest/t/handy.t
+++ b/ext/XS-APItest/t/handy.t
@@ -42,7 +42,7 @@ my %properties = (
alnum => 'Word',
wordchar => 'Word',
alphanumeric => 'Alnum',
- alpha => 'Alpha',
+ alpha => 'XPosixAlpha',
ascii => 'ASCII',
blank => 'Blank',
cntrl => 'Control',
@@ -50,14 +50,14 @@ my %properties = (
graph => 'Graph',
idfirst => '_Perl_IDStart',
idcont => '_Perl_IDCont',
- lower => 'Lower',
+ lower => 'XPosixLower',
print => 'Print',
psxspc => 'XPosixSpace',
punct => 'XPosixPunct',
quotemeta => '_Perl_Quotemeta',
space => 'XPerlSpace',
vertws => 'VertSpace',
- upper => 'Upper',
+ upper => 'XPosixUpper',
xdigit => 'XDigit',
);
@@ -69,8 +69,13 @@ foreach my $name (sort keys %properties) {
my $property = $properties{$name};
my @invlist = prop_invlist($property, '_perl_core_internal_ok');
if (! @invlist) {
- fail("No inversion list found for $property");
- next;
+
+ # An empty return could mean an unknown property, or merely that it is
+ # empty. Call in scalar context to differentiate
+ if (! prop_invlist($property, '_perl_core_internal_ok')) {
+ fail("No inversion list found for $property");
+ next;
+ }
}
# Include all the Latin1 code points, plus 0x100.
@@ -270,7 +275,7 @@ foreach my $name (sort keys %to_properties) {
fail("No inversion map found for $property");
next;
}
- if ($format ne "al") {
+ if ($format !~ / ^ a l? $ /x) {
fail("Unexpected inversion map format ('$format') found for $property");
next;
}
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 06fbfd1143..1854982491 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.61';
+our $VERSION = '0.62';
require Exporter;
@@ -775,7 +775,6 @@ sub charprop ($$) {
}
else {
croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this";
- return undef;
}
}
@@ -877,6 +876,10 @@ sub _charblocks {
local $_;
local $/ = "\n";
while (<$BLOCKSFH>) {
+
+ # Old versions used a different syntax to mark the range.
+ $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
+
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
my ($lo, $hi) = (hex($1), hex($2));
my $subrange = [ $lo, $hi, $3 ];
@@ -2652,9 +2655,11 @@ or even better, C<"Gc=LC">).
Many Unicode properties have more than one name (or alias). C<prop_invmap>
understands all of these, including Perl extensions to them. Ambiguities are
-resolved as described above for L</prop_aliases()>. The Perl internal
-property "Perl_Decimal_Digit, described below, is also accepted. An empty
-list is returned if the property name is unknown.
+resolved as described above for L</prop_aliases()> (except if a property has
+both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
+property name prefixed by C<"is"> causes the binary one to be returned). The
+Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
+An empty list is returned if the property name is unknown.
See L<perluniprops/Properties accessible through Unicode::UCD> for the
properties acceptable as inputs to this function.
@@ -3253,8 +3258,8 @@ RETRY:
# we need to also read in that table. Create a hash with the keys
# being the code points, and the values being a list of the
# aliases for the code point key.
- my ($aliases_code_points, $aliases_maps, undef, undef) =
- &prop_invmap('Name_Alias');
+ my ($aliases_code_points, $aliases_maps, undef, undef)
+ = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
my %aliases;
for (my $i = 0; $i < @$aliases_code_points; $i++) {
my $code_point = $aliases_code_points->[$i];
@@ -3545,7 +3550,19 @@ RETRY:
if ($swash->{'LIST'} =~ /^V/) {
@invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
- shift @invlist;
+
+ shift @invlist; # Get rid of 'V';
+
+ # Could need to be inverted: add or subtract a 0 at the beginning of
+ # the list.
+ if ($swash->{'INVERT_IT'}) {
+ if (@invlist && $invlist[0] == 0) {
+ shift @invlist;
+ }
+ else {
+ unshift @invlist, 0;
+ }
+ }
foreach my $i (0 .. @invlist - 1) {
$invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
}
@@ -3558,6 +3575,10 @@ RETRY:
}
}
else {
+ if ($swash->{'INVERT_IT'}) {
+ croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
+ }
+
# The LIST input lines look like:
# ...
# 0374\t\tCommon
@@ -3873,7 +3894,7 @@ RETRY:
map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
$format = 'sl';
}
- elsif ($returned_prop eq 'ToNameAlias') {
+ elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
# This property currently doesn't have any lists, but theoretically
# could
@@ -3888,7 +3909,14 @@ RETRY:
# to indicate that need to add code point to it.
$format = 'ar';
}
- elsif ($format ne 'n' && $format ne 'a') {
+ elsif ($format eq 'ax') {
+
+ # Normally 'ax' properties have overrides, and will have been handled
+ # above, but if not, they still need adjustment, and the hex values
+ # have already been converted to decimal
+ $format = 'a';
+ }
+ elsif ($format ne 'n' && $format !~ / ^ a /x) {
# All others are simple scalars
$format = 's';
@@ -4079,6 +4107,15 @@ for its block using C<charblock>).
Note that starting in Unicode 6.1, many of the block names have shorter
synonyms. These are always given in the new style.
+=head2 Use with older Unicode versions
+
+The functions in this module work as well as can be expected when
+used on earlier Unicode versions. But, obviously, they use the available data
+from that Unicode version. For example, if the Unicode version predates the
+definition of the script property (Unicode 3.1), then any function that deals
+with scripts is going to return C<undef> for the script portion of the return
+value.
+
=head1 AUTHOR
Jarkko Hietaniemi. Now maintained by perl5 porters.
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index a799dd036b..22b2edbc93 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -19,14 +19,22 @@ use Test::More;
use Unicode::UCD qw(charinfo charprop charprops_all);
+my $expected_version = '8.0.0';
+my $current_version = Unicode::UCD::UnicodeVersion;
+my $v_unicode_version = pack "C*", split /\./, $current_version;
+my $unknown_script = ($v_unicode_version lt v5.0.0)
+ ? 'Common'
+ : 'Unknown';
my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by
$/ = $input_record_separator; # setting this.
my $charinfo;
is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef");
-is(charprop(0x110000, 'age'), "Unassigned", "Verify charprop(age) of non-unicode is Unassigned");
-is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works");
+if ($v_unicode_version ge v3.2.0) {
+ is(lc charprop(0x110000, 'age'), lc "Unassigned", "Verify charprop(age) of non-unicode is Unassigned");
+ is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works");
+}
is(charprop(0x110000, 'Any'), undef, "Verify charprop of non-bipartite Perl extension returns undef");
my $cp = 0;
@@ -37,9 +45,10 @@ is($charinfo->{code}, "0000",
is($charinfo->{name}, "<control>");
is(charprop($cp, "name"), "");
-# This gets a sl-type property returning a flattened list
-is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation");
-
+if ($v_unicode_version ge v6.1.0) {
+ # This gets a sl-type property returning a flattened list
+ is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation");
+}
is($charinfo->{category}, "Cc");
is(charprop($cp, "category"), "Control");
is($charinfo->{combining}, "0");
@@ -66,8 +75,8 @@ is($charinfo->{title}, "");
is(charprop($cp, "tc"), "\0");
is($charinfo->{block}, "Basic Latin");
is(charprop($cp, "block"), "Basic_Latin");
-is($charinfo->{script}, "Common");
-is(charprop($cp, "script"), "Common");
+is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, "script"), "Common") if $v_unicode_version gt v3.0.1;
$cp = utf8::unicode_to_native(0x41);
my $A_code = sprintf("%04X", ord("A"));
@@ -103,8 +112,8 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "A");
is($charinfo->{block}, "Basic Latin");
is(charprop($cp, 'block'), "Basic_Latin");
-is($charinfo->{script}, "Latin");
-is(charprop($cp, 'script'), "Latin");
+is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1;
$cp = 0x100;
$charinfo = charinfo($cp);
@@ -138,8 +147,8 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "\x{100}");
is($charinfo->{block}, "Latin Extended-A");
is(charprop($cp, 'block'), "Latin_Extended_A");
-is($charinfo->{script}, "Latin");
-is(charprop($cp, 'script'), "Latin");
+is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1;
$cp = 0x590; # 0x0590 is in the Hebrew block but unused.
$charinfo = charinfo($cp);
@@ -152,7 +161,9 @@ is(charprop($cp, 'gc'), "Unassigned");
is($charinfo->{combining}, undef);
is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, undef);
-is(charprop($cp, 'bc'), "Right_To_Left");
+if ($v_unicode_version gt v3.2.0) {
+ is(charprop($cp, 'bc'), "Right_To_Left");
+}
is($charinfo->{decomposition}, undef);
is(charprop($cp, 'dm'), "\x{590}");
is($charinfo->{decimal}, undef);
@@ -174,7 +185,8 @@ is(charprop($cp, 'tc'), "\x{590}");
is($charinfo->{block}, undef);
is(charprop($cp, 'block'), "Hebrew");
is($charinfo->{script}, undef);
-is(charprop($cp, 'script'), "Unknown");
+is(charprop($cp, 'script'), $unknown_script) if $v_unicode_version gt
+v3.0.1;
# 0x05d0 is in the Hebrew block and used.
@@ -210,8 +222,8 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "\x{5d0}");
is($charinfo->{block}, "Hebrew");
is(charprop($cp, 'block'), "Hebrew");
-is($charinfo->{script}, "Hebrew");
-is(charprop($cp, 'script'), "Hebrew");
+is($charinfo->{script}, "Hebrew") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Hebrew") if $v_unicode_version gt v3.0.1;
# An open syllable in Hangul.
@@ -247,8 +259,8 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "\x{AC00}");
is($charinfo->{block}, "Hangul Syllables");
is(charprop($cp, 'block'), "Hangul_Syllables");
-is($charinfo->{script}, "Hangul");
-is(charprop($cp, 'script'), "Hangul");
+is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1;
# A closed syllable in Hangul.
@@ -284,85 +296,89 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "\x{AE00}");
is($charinfo->{block}, "Hangul Syllables");
is(charprop($cp, 'block'), "Hangul_Syllables");
-is($charinfo->{script}, "Hangul");
-is(charprop($cp, 'script'), "Hangul");
-
-$cp = 0x1D400;
-$charinfo = charinfo($cp);
+is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1;
+
+if ($v_unicode_version gt v3.0.1) {
+ $cp = 0x1D400;
+ $charinfo = charinfo($cp);
+
+ is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A");
+ is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A");
+ is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A");
+ is($charinfo->{category}, "Lu");
+ is(charprop($cp, 'gc'), "Uppercase_Letter");
+ is($charinfo->{combining}, "0");
+ is(charprop($cp, 'ccc'), "Not_Reordered");
+ is($charinfo->{bidi}, "L");
+ is(charprop($cp, 'bc'), "Left_To_Right");
+ is($charinfo->{decomposition}, "<font> $A_code");
+ is(charprop($cp, 'dm'), "A");
+ is($charinfo->{decimal}, "");
+ is($charinfo->{digit}, "");
+ is($charinfo->{numeric}, "");
+ is(charprop($cp, 'nv'), "NaN");
+ is($charinfo->{mirrored}, "N");
+ is(charprop($cp, 'bidim'), "No");
+ is($charinfo->{unicode10}, "");
+ is(charprop($cp, 'na1'), "");
+ is($charinfo->{comment}, "");
+ is(charprop($cp, 'isc'), "");
+ is($charinfo->{upper}, "");
+ is(charprop($cp, 'uc'), "\x{1D400}");
+ is($charinfo->{lower}, "");
+ is(charprop($cp, 'lc'), "\x{1D400}");
+ is($charinfo->{title}, "");
+ is(charprop($cp, 'tc'), "\x{1D400}");
+ is($charinfo->{block}, "Mathematical Alphanumeric Symbols");
+ is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols");
+ is($charinfo->{script}, "Common");
+ is(charprop($cp, 'script'), "Common");
+}
-is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A");
-is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A");
-is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A");
-is($charinfo->{category}, "Lu");
-is(charprop($cp, 'gc'), "Uppercase_Letter");
-is($charinfo->{combining}, "0");
-is(charprop($cp, 'ccc'), "Not_Reordered");
-is($charinfo->{bidi}, "L");
-is(charprop($cp, 'bc'), "Left_To_Right");
-is($charinfo->{decomposition}, "<font> $A_code");
-is(charprop($cp, 'dm'), "A");
-is($charinfo->{decimal}, "");
-is($charinfo->{digit}, "");
-is($charinfo->{numeric}, "");
-is(charprop($cp, 'nv'), "NaN");
-is($charinfo->{mirrored}, "N");
-is(charprop($cp, 'bidim'), "No");
-is($charinfo->{unicode10}, "");
-is(charprop($cp, 'na1'), "");
-is($charinfo->{comment}, "");
-is(charprop($cp, 'isc'), "");
-is($charinfo->{upper}, "");
-is(charprop($cp, 'uc'), "\x{1D400}");
-is($charinfo->{lower}, "");
-is(charprop($cp, 'lc'), "\x{1D400}");
-is($charinfo->{title}, "");
-is(charprop($cp, 'tc'), "\x{1D400}");
-is($charinfo->{block}, "Mathematical Alphanumeric Symbols");
-is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols");
-is($charinfo->{script}, "Common");
-is(charprop($cp, 'script'), "Common");
-
-$cp = 0x9FBA; #Bug 58428
-$charinfo = charinfo(0x9FBA);
-
-is($charinfo->{code}, "9FBA", "U+9FBA");
-is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA");
-is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA");
-is($charinfo->{category}, "Lo");
-is(charprop($cp, 'gc'), "Other_Letter");
-is($charinfo->{combining}, "0");
-is(charprop($cp, 'ccc'), "Not_Reordered");
-is($charinfo->{bidi}, "L");
-is(charprop($cp, 'bc'), "Left_To_Right");
-is($charinfo->{decomposition}, "");
-is(charprop($cp, 'dm'), "\x{9FBA}");
-is($charinfo->{decimal}, "");
-is($charinfo->{digit}, "");
-is($charinfo->{numeric}, "");
-is(charprop($cp, 'nv'), "NaN");
-is($charinfo->{mirrored}, "N");
-is(charprop($cp, 'bidim'), "No");
-is($charinfo->{unicode10}, "");
-is(charprop($cp, 'na1'), "");
-is($charinfo->{comment}, "");
-is(charprop($cp, 'isc'), "");
-is($charinfo->{upper}, "");
-is(charprop($cp, 'uc'), "\x{9FBA}");
-is($charinfo->{lower}, "");
-is(charprop($cp, 'lc'), "\x{9FBA}");
-is($charinfo->{title}, "");
-is(charprop($cp, 'tc'), "\x{9FBA}");
-is($charinfo->{block}, "CJK Unified Ideographs");
-is(charprop($cp, 'block'), "CJK_Unified_Ideographs");
-is($charinfo->{script}, "Han");
-is(charprop($cp, 'script'), "Han");
+if ($v_unicode_version ge v4.1.0) {
+ $cp = 0x9FBA; #Bug 58428
+ $charinfo = charinfo(0x9FBA);
+
+ is($charinfo->{code}, "9FBA", "U+9FBA");
+ is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA");
+ is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA");
+ is($charinfo->{category}, "Lo");
+ is(charprop($cp, 'gc'), "Other_Letter");
+ is($charinfo->{combining}, "0");
+ is(charprop($cp, 'ccc'), "Not_Reordered");
+ is($charinfo->{bidi}, "L");
+ is(charprop($cp, 'bc'), "Left_To_Right");
+ is($charinfo->{decomposition}, "");
+ is(charprop($cp, 'dm'), "\x{9FBA}");
+ is($charinfo->{decimal}, "");
+ is($charinfo->{digit}, "");
+ is($charinfo->{numeric}, "");
+ is(charprop($cp, 'nv'), "NaN");
+ is($charinfo->{mirrored}, "N");
+ is(charprop($cp, 'bidim'), "No");
+ is($charinfo->{unicode10}, "");
+ is(charprop($cp, 'na1'), "");
+ is($charinfo->{comment}, "");
+ is(charprop($cp, 'isc'), "");
+ is($charinfo->{upper}, "");
+ is(charprop($cp, 'uc'), "\x{9FBA}");
+ is($charinfo->{lower}, "");
+ is(charprop($cp, 'lc'), "\x{9FBA}");
+ is($charinfo->{title}, "");
+ is(charprop($cp, 'tc'), "\x{9FBA}");
+ is($charinfo->{block}, "CJK Unified Ideographs");
+ is(charprop($cp, 'block'), "CJK_Unified_Ideographs");
+ is($charinfo->{script}, "Han");
+ is(charprop($cp, 'script'), "Han");
+}
use Unicode::UCD qw(charblock charscript);
# 0x0590 is in the Hebrew block but unused.
is(charblock(0x590), "Hebrew", "0x0590 - Hebrew unused charblock");
-is(charscript(0x590), "Unknown", "0x0590 - Hebrew unused charscript");
+is(charscript(0x590), $unknown_script, "0x0590 - Hebrew unused charscript") if $v_unicode_version gt v3.0.1;
is(charblock(0x1FFFF), "No_Block", "0x1FFFF - unused charblock");
my $fraction_3_4_code = sprintf("%04X", utf8::unicode_to_native(0xbe));
@@ -401,8 +417,8 @@ is($charinfo->{title}, "");
is(charprop($cp, 'tc'), chr hex $cp);
is($charinfo->{block}, "Latin-1 Supplement");
is(charprop($cp, 'block'), "Latin_1_Supplement");
-is($charinfo->{script}, "Common");
-is(charprop($cp, 'script'), "Common");
+is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Common") if $v_unicode_version gt v3.0.1;
# This is to test a case where both simple and full lowercases exist and
# differ
@@ -435,13 +451,13 @@ is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
is(charprop($cp, 'uc'), "\x{130}");
is($charinfo->{lower}, $i_code);
-is(charprop($cp, 'lc'), "i\x{307}");
+is(charprop($cp, 'lc'), "i\x{307}") if $v_unicode_version ge v3.2.0;
is($charinfo->{title}, "");
is(charprop($cp, 'tc'), "\x{130}");
is($charinfo->{block}, "Latin Extended-A");
is(charprop($cp, 'block'), "Latin_Extended_A");
-is($charinfo->{script}, "Latin");
-is(charprop($cp, 'script'), "Latin");
+is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1;
# This is to test a case where both simple and full uppercases exist and
# differ
@@ -478,19 +494,23 @@ is($charinfo->{title}, "1F88");
is(charprop($cp, "tc"), "\x{1F88}");
is($charinfo->{block}, "Greek Extended");
is(charprop($cp, "block"), "Greek_Extended");
-is($charinfo->{script}, "Greek");
-is(charprop($cp, "script"), "Greek");
+is($charinfo->{script}, "Greek") if $v_unicode_version gt v3.0.1;
+is(charprop($cp, "script"), "Greek") if $v_unicode_version gt v3.0.1;
is(charprop(ord("A"), "foo"), undef,
"Verify charprop of unknown property returns <undef>");
# These were created from inspection of the code to exercise the branches
-is(charprop(ord("("), "bpb"), ")",
+if ($v_unicode_version ge v6.3.0) {
+ is(charprop(ord("("), "bpb"), ")",
"Verify charprop figures out that s-type properties can be char");
+}
is(charprop(ord("9"), "nv"), 9,
"Verify charprop can adjust an ar-type property");
-is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "",
+if ($v_unicode_version ge v5.2.0) {
+ is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "",
"Verify charprop can handle an \"\" in ae-type property");
+}
my $mark_props_ref = charprops_all(0x300);
is($mark_props_ref->{'Bidi_Class'}, "Nonspacing_Mark",
@@ -499,9 +519,13 @@ is($mark_props_ref->{'Bidi_Mirrored'}, "No");
is($mark_props_ref->{'Canonical_Combining_Class'}, "Above");
is($mark_props_ref->{'Case_Folding'}, "\x{300}");
is($mark_props_ref->{'Decomposition_Mapping'}, "\x{300}");
-is($mark_props_ref->{'Decomposition_Type'}, "None");
+is($mark_props_ref->{'Decomposition_Type'}, ($v_unicode_version le v4.0.0)
+ ? "none"
+ : "None");
is($mark_props_ref->{'General_Category'}, "Nonspacing_Mark");
-is($mark_props_ref->{'ISO_Comment'}, "");
+if ($v_unicode_version gt v5.1.0) {
+ is($mark_props_ref->{'ISO_Comment'}, "");
+}
is($mark_props_ref->{'Lowercase_Mapping'}, "\x{300}");
is($mark_props_ref->{'Name'}, "COMBINING GRAVE ACCENT");
is($mark_props_ref->{'Numeric_Type'}, "None");
@@ -522,36 +546,40 @@ ok(exists $charblocks->{Thai}, 'Thai charblock exists');
is($charblocks->{Thai}->[0]->[0], hex('0e00'));
ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');
-my $charscripts = charscripts();
+if ($v_unicode_version gt v3.0.1) {
+ my $charscripts = charscripts();
-ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
-is($charscripts->{Armenian}->[0]->[0], hex('0531'));
-ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
+ ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
+ is($charscripts->{Armenian}->[0]->[0], hex('0531'));
+ ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
-my $charscript;
+ my $charscript;
-$charscript = charscript("12ab");
-is($charscript, 'Ethiopic', 'Ethiopic charscript');
+ $charscript = charscript("12ab");
+ is($charscript, 'Ethiopic', 'Ethiopic charscript');
-$charscript = charscript("0x12ab");
-is($charscript, 'Ethiopic');
+ $charscript = charscript("0x12ab");
+ is($charscript, 'Ethiopic');
-$charscript = charscript("U+12ab");
-is($charscript, 'Ethiopic');
+ $charscript = charscript("U+12ab");
+ is($charscript, 'Ethiopic');
-my $ranges;
+ my $ranges;
-$ranges = charscript('Ogham');
-is($ranges->[0]->[0], hex('1680'), 'Ogham charscript');
-is($ranges->[0]->[1], hex('169C'));
+ if ($v_unicode_version gt v4.0.0) {
+ $ranges = charscript('Ogham');
+ is($ranges->[0]->[0], hex('1680'), 'Ogham charscript');
+ is($ranges->[0]->[1], hex('169C'));
+ }
-use Unicode::UCD qw(charinrange);
+ use Unicode::UCD qw(charinrange);
-$ranges = charscript('Cherokee');
-ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
-ok( charinrange($ranges, "13a0"));
-ok( charinrange($ranges, "13f4"));
-ok(!charinrange($ranges, "13ff"));
+ $ranges = charscript('Cherokee');
+ ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
+ ok( charinrange($ranges, "13a0"));
+ ok( charinrange($ranges, "13f4"));
+ ok(!charinrange($ranges, "13ff"));
+}
use Unicode::UCD qw(general_categories);
@@ -571,7 +599,8 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
# If this fails, then maybe one should look at the Unicode changes to see
# what else might need to be updated.
-is(Unicode::UCD::UnicodeVersion, '8.0.0', 'UnicodeVersion');
+ok($current_version le $expected_version,
+ "Verify there isn't a new Unicode version to upgrade to");
use Unicode::UCD qw(compexcl);
@@ -593,66 +622,93 @@ is($casefold->{full}, $a_code, 'casefold native(0x41) full');
is($casefold->{simple}, $a_code, 'casefold native(0x41) simple');
is($casefold->{turkic}, "", 'casefold native(0x41) turkic');
-$casefold = casefold(utf8::unicode_to_native(0xdf));
my $sharp_s_code = sprintf("%04X", utf8::unicode_to_native(0xdf));
my $S_code = sprintf("%04X", ord "S");
my $s_code = sprintf("%04X", ord "s");
-is($casefold->{code}, $sharp_s_code, 'casefold native(0xDF) code');
-is($casefold->{status}, 'F', 'casefold native(0xDF) status');
-is($casefold->{mapping}, "$s_code $s_code", 'casefold native(0xDF) mapping');
-is($casefold->{full}, "$s_code $s_code", 'casefold native(0xDF) full');
-is($casefold->{simple}, "", 'casefold native(0xDF) simple');
-is($casefold->{turkic}, "", 'casefold native(0xDF) turkic');
-
-# Do different tests depending on if version < 3.2, or not.
-my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion();
-if ($v_unicode_version lt v3.2.0) {
- $casefold = casefold(0x130);
-
- is($casefold->{code}, '0130', 'casefold 0x130 code');
- is($casefold->{status}, 'I' , 'casefold 0x130 status');
- is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping');
- is($casefold->{full}, $i_code, 'casefold 0x130 full');
- is($casefold->{simple}, $i_code, 'casefold 0x130 simple');
- is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic');
-
- $casefold = casefold(0x131);
-
- is($casefold->{code}, '0131', 'casefold 0x131 code');
- is($casefold->{status}, 'I' , 'casefold 0x131 status');
- is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping');
- is($casefold->{full}, $i_code, 'casefold 0x131 full');
- is($casefold->{simple}, $i_code, 'casefold 0x131 simple');
- is($casefold->{turkic}, $i_code, 'casefold 0x131 turkic');
-} else {
- $casefold = casefold(utf8::unicode_to_native(0x49));
-
- is($casefold->{code}, $I_code, 'casefold native(0x49) code');
- is($casefold->{status}, 'C' , 'casefold native(0x49) status');
- is($casefold->{mapping}, $i_code, 'casefold native(0x49) mapping');
- is($casefold->{full}, $i_code, 'casefold native(0x49) full');
- is($casefold->{simple}, $i_code, 'casefold native(0x49) simple');
- is($casefold->{turkic}, "0131", 'casefold native(0x49) turkic');
-
- $casefold = casefold(0x130);
-
- is($casefold->{code}, '0130', 'casefold 0x130 code');
- is($casefold->{status}, 'F' , 'casefold 0x130 status');
- is($casefold->{mapping}, "$i_code 0307", 'casefold 0x130 mapping');
- is($casefold->{full}, "$i_code 0307", 'casefold 0x130 full');
- is($casefold->{simple}, "", 'casefold 0x130 simple');
- is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic');
-}
+if ($v_unicode_version gt v3.0.0) { # These special ones don't work on early
+ # perls
+ $casefold = casefold(utf8::unicode_to_native(0xdf));
+
+ is($casefold->{code}, $sharp_s_code, 'casefold native(0xDF) code');
+ is($casefold->{status}, 'F', 'casefold native(0xDF) status');
+ is($casefold->{mapping}, "$s_code $s_code", 'casefold native(0xDF) mapping');
+ is($casefold->{full}, "$s_code $s_code", 'casefold native(0xDF) full');
+ is($casefold->{simple}, "", 'casefold native(0xDF) simple');
+ is($casefold->{turkic}, "", 'casefold native(0xDF) turkic');
+
+ # Do different tests depending on if version < 3.2, or not.
+ if ($v_unicode_version eq v3.0.1) {
+ # In this release, there was no special Turkic values.
+ # Both 0x130 and 0x131 folded to 'i'.
+
+ $casefold = casefold(0x130);
+
+ is($casefold->{code}, '0130', 'casefold 0x130 code');
+ is($casefold->{status}, 'C' , 'casefold 0x130 status');
+ is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping');
+ is($casefold->{full}, $i_code, 'casefold 0x130 full');
+ is($casefold->{simple}, $i_code, 'casefold 0x130 simple');
+ is($casefold->{turkic}, "", 'casefold 0x130 turkic');
+
+ $casefold = casefold(0x131);
+
+ is($casefold->{code}, '0131', 'casefold 0x131 code');
+ is($casefold->{status}, 'C' , 'casefold 0x131 status');
+ is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping');
+ is($casefold->{full}, $i_code, 'casefold 0x131 full');
+ is($casefold->{simple}, $i_code, 'casefold 0x131 simple');
+ is($casefold->{turkic}, "", 'casefold 0x131 turkic');
+ }
+ elsif ($v_unicode_version lt v3.2.0) {
+ $casefold = casefold(0x130);
+
+ is($casefold->{code}, '0130', 'casefold 0x130 code');
+ is($casefold->{status}, 'I' , 'casefold 0x130 status');
+ is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping');
+ is($casefold->{full}, $i_code, 'casefold 0x130 full');
+ is($casefold->{simple}, $i_code, 'casefold 0x130 simple');
+ is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic');
+
+ $casefold = casefold(0x131);
+
+ is($casefold->{code}, '0131', 'casefold 0x131 code');
+ is($casefold->{status}, 'I' , 'casefold 0x131 status');
+ is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping');
+ is($casefold->{full}, $i_code, 'casefold 0x131 full');
+ is($casefold->{simple}, $i_code, 'casefold 0x131 simple');
+ is($casefold->{turkic}, $i_code, 'casefold 0x131 turkic');
+ } else {
+ $casefold = casefold(utf8::unicode_to_native(0x49));
+
+ is($casefold->{code}, $I_code, 'casefold native(0x49) code');
+ is($casefold->{status}, 'C' , 'casefold native(0x49) status');
+ is($casefold->{mapping}, $i_code, 'casefold native(0x49) mapping');
+ is($casefold->{full}, $i_code, 'casefold native(0x49) full');
+ is($casefold->{simple}, $i_code, 'casefold native(0x49) simple');
+ is($casefold->{turkic}, "0131", 'casefold native(0x49) turkic');
+
+ $casefold = casefold(0x130);
+
+ is($casefold->{code}, '0130', 'casefold 0x130 code');
+ is($casefold->{status}, 'F' , 'casefold 0x130 status');
+ is($casefold->{mapping}, "$i_code 0307", 'casefold 0x130 mapping');
+ is($casefold->{full}, "$i_code 0307", 'casefold 0x130 full');
+ is($casefold->{simple}, "", 'casefold 0x130 simple');
+ is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic');
+ }
-$casefold = casefold(0x1F88);
+ if ($v_unicode_version gt v3.0.1) {
+ $casefold = casefold(0x1F88);
-is($casefold->{code}, '1F88', 'casefold 0x1F88 code');
-is($casefold->{status}, 'S' , 'casefold 0x1F88 status');
-is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping');
-is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full');
-is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple');
-is($casefold->{turkic}, "", 'casefold 0x1F88 turkic');
+ is($casefold->{code}, '1F88', 'casefold 0x1F88 code');
+ is($casefold->{status}, 'S' , 'casefold 0x1F88 status');
+ is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping');
+ is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full');
+ is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple');
+ is($casefold->{turkic}, "", 'casefold 0x1F88 turkic');
+ }
+}
ok(!casefold(utf8::unicode_to_native(0x20)));
@@ -672,12 +728,16 @@ ok($casespec->{code} eq $sharp_s_code &&
$casespec = casespec(0x307);
-ok($casespec->{az}->{code} eq '0307' &&
- !defined $casespec->{az}->{lower} &&
- $casespec->{az}->{title} eq '0307' &&
- $casespec->{az}->{upper} eq '0307' &&
- $casespec->{az}->{condition} eq 'az After_I',
- 'casespec 0x307');
+if ($v_unicode_version gt v3.1.0) {
+ ok($casespec->{az}->{code} eq '0307'
+ && !defined $casespec->{az}->{lower}
+ && $casespec->{az}->{title} eq '0307'
+ && $casespec->{az}->{upper} eq '0307'
+ && $casespec->{az}->{condition} eq ($v_unicode_version le v3.2)
+ ? 'az After_Soft_Dotted'
+ : 'az After_I',
+ 'casespec 0x307');
+}
# perl #7305 UnicodeCD::compexcl is weird
@@ -699,11 +759,15 @@ is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)");
is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)");
is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)");
+SKIP:
{
+ skip("Script property not in this release", 3) if $v_unicode_version lt v3.1.0;
my $r1 = charscript('Latin');
if (ok(defined $r1, "Found Latin script")) {
+ skip("Latin range count will be wrong when using older Unicode release",
+ 2) if $v_unicode_version lt $expected_version;
my $n1 = @$r1;
- is($n1, 31, "number of ranges in Latin script (Unicode 7.0.0)") if $::IS_ASCII;
+ is($n1, 31, "number of ranges in Latin script (Unicode $expected_version)") if $::IS_ASCII;
shift @$r1 while @$r1;
my $r2 = charscript('Latin');
is(@$r2, $n1, "modifying results should not mess up internal caches");
@@ -714,38 +778,72 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)");
is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
}
-use Unicode::UCD qw(namedseq);
-
-is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
-is(namedseq("KATAKANA LETTER AINU Q"), undef);
-is(namedseq(), undef);
-is(namedseq(qw(foo bar)), undef);
-my @ns = namedseq("KATAKANA LETTER AINU P");
-is(scalar @ns, 2);
-is($ns[0], 0x31F7);
-is($ns[1], 0x309A);
-my %ns = namedseq();
-is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
-@ns = namedseq(42);
-is(@ns, 0);
+if ($v_unicode_version ge v4.1.0) {
+ use Unicode::UCD qw(namedseq);
+
+ is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
+ is(namedseq("KATAKANA LETTER AINU Q"), undef);
+ is(namedseq(), undef);
+ is(namedseq(qw(foo bar)), undef);
+ my @ns = namedseq("KATAKANA LETTER AINU P");
+ is(scalar @ns, 2);
+ is($ns[0], 0x31F7);
+ is($ns[1], 0x309A);
+ my %ns = namedseq();
+ is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
+ @ns = namedseq(42);
+ is(@ns, 0);
+}
use Unicode::UCD qw(num);
-use charnames ":full";
+use charnames (); # Don't use \N{} on things not in original Unicode
+ # version; else will get a compilation error when this .t
+ # is run on an older version.
is(num("0"), 0, 'Verify num("0") == 0');
is(num("98765"), 98765, 'Verify num("98765") == 98765');
-ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
-is(num("\N{NEW TAI LUE DIGIT TWO}"), 2, 'Verify num("\N{NEW TAI LUE DIGIT TWO}") == 2');
-is(num("\N{NEW TAI LUE DIGIT ONE}"), 1, 'Verify num("\N{NEW TAI LUE DIGIT ONE}") == 1');
-is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21');
-ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
-is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
-ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"),
+ 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+my $tai_lue_2;
+if ($v_unicode_version ge v4.1.0) {
+ my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE");
+ $tai_lue_2 = charnames::string_vianame("NEW TAI LUE DIGIT TWO");
+ is(num($tai_lue_2), 2, 'Verify num("\N{NEW TAI LUE DIGIT TWO}") == 2');
+ is(num($tai_lue_1), 1, 'Verify num("\N{NEW TAI LUE DIGIT ONE}") == 1');
+ is(num($tai_lue_2 . $tai_lue_1), 21,
+ 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21');
+}
+if ($v_unicode_version ge v5.2.0) {
+ ok(! defined num($tai_lue_2
+ . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")),
+ 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+}
+if ($v_unicode_version ge v5.1.0) {
+ my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO");
+ is(num($cham_0 . charnames::string_vianame("CHAM DIGIT THREE")), 3,
+ 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
+ if ($v_unicode_version ge v5.2.0) {
+ ok(! defined num( $cham_0
+ . charnames::string_vianame("JAVANESE DIGIT NINE")),
+ 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+ }
+}
is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
-is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
-is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
-is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');
-is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000');
+if ($v_unicode_version ge v3.0.0) {
+ is(num(charnames::string_vianame("ETHIOPIC NUMBER TEN THOUSAND")), 10000,
+ 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
+}
+if ($v_unicode_version ge v5.2.0) {
+ is(num(charnames::string_vianame("NORTH INDIC FRACTION ONE HALF")),
+ .5,
+ 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
+ is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');
+}
+if ($v_unicode_version gt v3.2.0) { # Is missing from non-Unihan files before
+ # this
+ is(num("\N{U+5146}"), 1000000000000,
+ 'Verify num("\N{U+5146}") == 1000000000000');
+}
# Create a user-defined property
sub InKana {<<'END'}
@@ -979,6 +1077,12 @@ while (<$propvalues>) {
my @fields = split /\s*;\s*/; # Fields are separated by semi-colons
my $prop = shift @fields; # 0th field is the property,
+ # 'qc' is short in early versions of the file for any of the quick check
+ # properties. Choose one of them.
+ if ($prop eq 'qc' && $v_unicode_version le v4.0.0) {
+ $prop = "NFKC_QC";
+ }
+
# When changing properties, we examine the accumulated values for the old
# one to see if our function that returns them matches.
if ($prev_prop ne $prop) {
@@ -986,6 +1090,11 @@ while (<$propvalues>) {
my @ucd_function_values = prop_values($prev_prop);
@ucd_function_values = () unless @ucd_function_values;
+ # The file didn't include strictly numeric values until after this
+ if ($prev_prop eq 'ccc' && $v_unicode_version le v6.0.0) {
+ @ucd_function_values = grep { /\D/ } @ucd_function_values;
+ }
+
# This perl extension doesn't appear in the official file
push @this_prop_values, "Non_Canon" if $prev_prop eq 'dt';
@@ -1008,6 +1117,12 @@ while (<$propvalues>) {
# characters that are ignored under loose matching to test that
my $mod_prop = "$extra_chars$prop";
+ if ($prop eq 'blk' && $v_unicode_version le v5.0.0) {
+ foreach my $element (@fields) {
+ $element =~ s/-/_/g;
+ }
+ }
+
if ($fields[0] eq 'n/a') { # See comments in input file, essentially
# means full name and short name are identical
$fields[0] = $fields[1];
@@ -1190,36 +1305,39 @@ if ($::IS_ASCII) { # On EBCDIC, other things will come first, and can vary
$prop = "lc";
($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
- is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
+ my $lc_format = ($v_unicode_version ge v3.2.0) ? 'al' : 'a';
+ is($format, $lc_format, "prop_invmap() format of '$prop' is '$lc_format");
is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
}
# This property is stable and small, so can test all of it
-$prop = "ASCII_Hex_Digit";
-($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 's', "prop_invmap() format of '$prop' is 's'");
-is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
-if ($::IS_ASCII) {
- is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A,
- 0x0041, 0x0047,
- 0x0061, 0x0067, 0x110000
- ],
- "prop_invmap('$prop') code point list is correct");
-}
-elsif ($::IS_EBCDIC) {
- is_deeply($invlist_ref, [
- utf8::unicode_to_native(0x0000),
- utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1,
- utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1,
- utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1,
- utf8::unicode_to_native(0x110000)
- ],
- "prop_invmap('$prop') code point list is correct");
+if ($v_unicode_version gt v3.1.0) {
+ $prop = "ASCII_Hex_Digit";
+ ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+ is($format, 's', "prop_invmap() format of '$prop' is 's'");
+ is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
+ if ($::IS_ASCII) {
+ is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A,
+ 0x0041, 0x0047,
+ 0x0061, 0x0067, 0x110000
+ ],
+ "prop_invmap('$prop') code point list is correct");
+ }
+ elsif ($::IS_EBCDIC) {
+ is_deeply($invlist_ref, [
+ utf8::unicode_to_native(0x0000),
+ utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1,
+ utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1,
+ utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1,
+ utf8::unicode_to_native(0x110000)
+ ],
+ "prop_invmap('$prop') code point list is correct");
+ }
+ is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] ,
+ "prop_invmap('$prop') map list is correct");
}
-is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] ,
- "prop_invmap('$prop') map list is correct");
is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef");
is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef");
@@ -1240,36 +1358,38 @@ is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns u
# are there in the files. As a small hedge against that, test some
# prop_invlist() tables fully with the known correct result. We choose
# ASCII_Hex_Digit again, as it is stable.
-if ($::IS_ASCII) {
- @invlist = prop_invlist("AHex");
- is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041,
- 0x0047, 0x0061, 0x0067 ],
- "prop_invlist('AHex') is exactly the expected set of points");
- @invlist = prop_invlist("AHex=f");
- is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041,
- 0x0047, 0x0061, 0x0067 ],
- "prop_invlist('AHex=f') is exactly the expected set of points");
-}
-elsif ($::IS_EBCDIC) { # Relies on the ranges 0-9, a-f, and A-F each being
- # contiguous
- @invlist = prop_invlist("AHex");
- is_deeply(\@invlist, [
- utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1,
- utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1,
- utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1,
- ],
- "prop_invlist('AHex') is exactly the expected set of points");
- @invlist = prop_invlist("AHex=f");
- is_deeply(\@invlist, [
- utf8::unicode_to_native(0x0000),
- utf8::unicode_to_native(0x0061),
- utf8::unicode_to_native(0x0066) + 1,
- utf8::unicode_to_native(0x0041),
- utf8::unicode_to_native(0x0046) + 1,
- utf8::unicode_to_native(0x0030),
- utf8::unicode_to_native(0x0039) + 1,
- ],
- "prop_invlist('AHex=f') is exactly the expected set of points");
+if ($v_unicode_version gt v3.1.0) {
+ if ($::IS_ASCII) {
+ @invlist = prop_invlist("AHex");
+ is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041,
+ 0x0047, 0x0061, 0x0067 ],
+ "prop_invlist('AHex') is exactly the expected set of points");
+ @invlist = prop_invlist("AHex=f");
+ is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041,
+ 0x0047, 0x0061, 0x0067 ],
+ "prop_invlist('AHex=f') is exactly the expected set of points");
+ }
+ elsif ($::IS_EBCDIC) { # Relies on the ranges 0-9, a-f, and A-F each being
+ # contiguous
+ @invlist = prop_invlist("AHex");
+ is_deeply(\@invlist, [
+ utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1,
+ utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1,
+ utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1,
+ ],
+ "prop_invlist('AHex') is exactly the expected set of points");
+ @invlist = prop_invlist("AHex=f");
+ is_deeply(\@invlist, [
+ utf8::unicode_to_native(0x0000),
+ utf8::unicode_to_native(0x0061),
+ utf8::unicode_to_native(0x0066) + 1,
+ utf8::unicode_to_native(0x0041),
+ utf8::unicode_to_native(0x0046) + 1,
+ utf8::unicode_to_native(0x0030),
+ utf8::unicode_to_native(0x0039) + 1,
+ ],
+ "prop_invlist('AHex=f') is exactly the expected set of points");
+ }
}
sub fail_with_diff ($$$$) {
@@ -1547,7 +1667,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
fail("prop_invmap('$prop')");
diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
}
- next PROPERTY;
+ next PROPERTY;
}
}
@@ -1555,6 +1675,14 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
# normalized version.
$name = &utf8::_loose_name(lc $name);
+ # In the case of a combination property, both a map table and a match
+ # table are generated. For all the tests except prop_invmap(), this is
+ # irrelevant, but for prop_invmap, having an 'is' prefix forces it to
+ # return the match table; otherwise the map. We thus need to distinguish
+ # between the two forms. The property name is what has this information.
+ $name = &utf8::_loose_name(lc $prop)
+ if exists $Unicode::UCD::combination_property{$name};
+
# Add in the characters that are supposed to be ignored to test loose
# matching, which the tested function applies to all properties
$display_prop = "$extra_chars$prop" unless $display_prop;
@@ -2309,7 +2437,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
# And remove the aliases. We read in the Name_Alias property, and go
# through them one by one.
my ($aliases_code_points, $aliases_maps, undef, undef)
- = &prop_invmap('Name_Alias');
+ = &prop_invmap('_Perl_Name_Alias', '_perl_core_internal_ok');
for (my $i = 0; $i < @$aliases_code_points; $i++) {
my $code_point = $aliases_code_points->[$i];
@@ -2534,11 +2662,13 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
# A few tests of search_invlist
use Unicode::UCD qw(search_invlist);
-my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script");
-my $index = search_invlist($scripts_ranges_ref, 0x390);
-is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek");
-my @alpha_invlist = prop_invlist("Alpha");
-is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list");
+if ($v_unicode_version ge v3.1.0) { # No Script property before this
+ my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script");
+ my $index = search_invlist($scripts_ranges_ref, 0x390);
+ is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek");
+ my @alpha_invlist = prop_invlist("Alpha");
+ is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list");
+}
ok($/ eq $input_record_separator, "The record separator didn't get overridden");
diff --git a/lib/locale.t b/lib/locale.t
index 1ebd0ce39c..1b510d25e6 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -848,7 +848,8 @@ sub disp_str ($) {
}
else {
$result .= " " unless $prev_was_punct;
- $result .= charnames::viacode(ord $char);
+ my $name = charnames::viacode(ord $char);
+ $result .= (defined $name) ? $name : ':unknown:';
$prev_was_punct = 0;
}
}
diff --git a/lib/unicore/README.perl b/lib/unicore/README.perl
index ef5fec6295..f892334d45 100644
--- a/lib/unicore/README.perl
+++ b/lib/unicore/README.perl
@@ -1,15 +1,12 @@
# The goal is for perl to compile and reasonably run any version of Unicode.
-# But in v5.22, the earliest version that this works for is Unicode 5.1.
# Working reasonably well doesn't mean that the test suite will run without
-# showing errors. You may be able to compile an earlier version, and get
-# things to sort-of work. A few of the very-Unicode specific test files have been
+# showing errors. A few of the very-Unicode specific test files have been
# modified to account for different versions, but most have not. For example,
# some tests use characters that aren't encoded in all Unicode versions; others
# have hard-coded the General Categories for a code point that were correct at
# the time the test was written. Perl itself will not compile under Unicode
# releases prior to 3.0 without a simple change to Unicode::Normalize.
-# mktables contains instructions for this, as well as other hints for using
-# older Unicode versions.
+# mktables contains instructions for this.
# The *.txt files were copied from
@@ -17,14 +14,20 @@
# (which always points to the latest version) with subdirectories 'extracted' and
# 'auxiliary'. Older versions are located under Public with an appropriate name.
+# They are also available via http at www.unicode.org/versions/
+#
# The Unihan files were not included due to space considerations. Also NOT
-# included were any *.html files. It is possible to add the Unihan files, and
-# edit mktables (see instructions near its beginning) to look at them.
+# included were any *.html files. It is possible to add the Unihan files and
+# have some properties from them automatically compiled. By editing mktables
+# (see instructions near its beginning) you can add other Unihan properties.
# The file named 'version' should exist and be a single line with the Unicode
# version, like:
+#
# 5.2.0
+#
+# (without the initial '# ')
# To be 8.3 filesystem friendly, the names of some of the input files have been
# changed from the values that are in the Unicode DB. Not all of the Test
@@ -58,6 +61,8 @@ mv extracted/DerivedJoiningType.txt extracted/DJoinType.txt
mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt
mv extracted/DerivedNumericType.txt extracted/DNumType.txt
mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt
+rmdir extracted 2>/dev/null # Will fail if non-empty, but if it is empty
+ # was an early release that didn't have it.
mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt
mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt
@@ -81,14 +86,31 @@ mv Unihan_Variants.txt UnihanVariants.txt
# filesystems.
# mktables is used to generate the tables used by the rest of Perl. It will
-# warn you about any *.txt files in the directory substructure that it doesn't
-# know about. You should remove any so-identified, or edit mktables to add
-# them to its lists to process. You can run
+# warn you about any *.txt and *.html files in the directory substructure that
+# it doesn't know about. You should remove any so-identified, or edit mktables
+# to add them to its lists to process. You can run
#
# mktables -globlist
#
-#to have it try to process these tables generically.
-#
+# to have it try to process these tables generically.
+
+# COMPILING ON OLDER UNICODE VERSIONS
+#
+# To compile perl for use with an older Unicode release, delete everything in
+# the lib/unicore directory except mktables and Makefile. Then download the
+# Unicode-supplied files for the desired version to that directory (A url for
+# these is given earlier in this file). Then create the 'version' file with a
+# single line, like '6.1.0'. Do a 'make test' from the project level. You
+# will get some porting errors for needing to regen. Regenerate what it tells
+# you are needed, and make test again. If you compile an old enough version,
+# you will also have to download a few files from later Unicode versions,
+# following the instructions that will be given if warranted. It should
+# compile in any release without warnings, except for some casing conflicts
+# in Unicode 2.1.8, and some extraneous files will show up in very early
+# releases of the form qr/diff.*\.txt/. If you add Unihan.txt, one line is in error in
+#
+# Other glitches are noted in mktables under 'UNICODE VERSIONS NOTES'
+
# FOR PUMPKINS
#
# The files are inter-related. If you take the latest UnicodeData.txt, for
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 572c299038..27cb45ab9d 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -4,17 +4,9 @@
# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.
-# Needs 'no overloading' to run faster on miniperl. Code commented out at the
-# subroutine objaddr can be used instead to work as far back (untested) as
-# 5.8: needs pack "U". But almost all occurrences of objaddr have been
-# removed in favor of using 'no overloading'. You also would have to go
-# through and replace occurrences like:
-# my $addr = do { no overloading; pack 'J', $self; }
-# with
-# my $addr = main::objaddr $self;
-# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
-# that instituted the change to main::objaddr, and subsequent commits that
-# changed 0+$self to pack 'J', $self.)
+# There was an attempt when this was first rewritten to make it 5.8
+# compatible, but that has now been abandoned, and newer constructs are used
+# as convenient.
my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
@@ -32,6 +24,7 @@ use File::Path;
use File::Spec;
use Text::Tabs;
use re "/aa";
+use feature 'state';
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
@@ -292,8 +285,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
# As mentioned earlier, some properties are given in more than one file. In
# particular, the files in the extracted directory are supposedly just
# reformattings of the others. But they contain information not easily
-# derivable from the other files, including results for Unihan, which this
-# program doesn't ordinarily look at, and for unassigned code points. They
+# derivable from the other files, including results for Unihan (which isn't
+# usually available to this program) and for unassigned code points. They
# also have historically had errors or been incomplete. In an attempt to
# create the best possible data, this program thus processes them first to
# glean information missing from the other files; then processes those other
@@ -411,24 +404,19 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
#
# A NOTE ON UNIHAN
#
-# This program can generate tables from the Unihan database. But it doesn't
-# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
-# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
-# database was split into 8 different files, all beginning with the letters
-# 'Unihan'. This program will read those file(s) if present, but it needs to
-# know which of the many properties in the file(s) should have tables created
-# for them. It will create tables for any properties listed in
-# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
-# @cjk_properties array and the @cjk_property_values array. Thus, if a
-# property you want is not in those files of the release you are building
-# against, you must add it to those two arrays. Starting in 4.0, the
-# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
-# is present in the directory, a table will be generated for that property.
-# In 5.2, several more properties were added. For your convenience, the two
-# arrays are initialized with all the 6.0 listed properties that are also in
-# earlier releases. But these are commented out. You can just uncomment the
-# ones you want, or use them as a template for adding entries for other
-# properties.
+# This program can generate tables from the Unihan database. But that db
+# isn't normally available, so it is marked as optional. Prior to version
+# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
+# was split into 8 different files, all beginning with the letters 'Unihan'.
+# If you plunk those files down into the directory mktables ($0) is in, this
+# program will read them and automatically create tables for the properties
+# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
+# plus any you add to the @cjk_properties array and the @cjk_property_values
+# array, being sure to add necessary '# @missings' lines to the latter. For
+# Unicode versions earlier than 5.2, most of the Unihan properties are not
+# listed at all in PropertyAliases nor PropValueAliases. This program assumes
+# for these early releases that you want the properties that are specified in
+# the 5.2 release.
#
# You may need to adjust the entries to suit your purposes. setup_unihan(),
# and filter_unihan_line() are the functions where this is done. This program
@@ -437,8 +425,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
#
# There is a bug in the 3.2 data file in which some values for the
# kPrimaryNumeric property have commas and an unexpected comment. A filter
-# could be added for these; or for a particular installation, the Unihan.txt
-# file could be edited to fix them.
+# could be added to correct these; or for a particular installation, the
+# Unihan.txt file could be edited to fix them.
#
# HOW TO ADD A FILE TO BE PROCESSED
#
@@ -484,13 +472,13 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
+# cpan/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
# Simply comment it out. It will compile, but will not know about any three
-# character decompositions. If using the .pm version, there is a similar
-# line.
+# character decompositions.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
@@ -513,10 +501,13 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
# name for the class, it would not have been affected, but if it used the
# mnemonic, it would have been.
#
-# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
+# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
# points which eventually came to have this script property value, instead
# mapped to "Unknown". But in the next release all these code points were
# moved to \p{sc=common} instead.
+
+# The tests furnished by Unicode for testing WordBreak and SentenceBreak
+# generate errors in 5.0 and earlier.
#
# The default for missing code points for BidiClass is complicated. Starting
# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
@@ -596,8 +587,8 @@ our $to_trace = 0;
|| $caller_name eq 'trace');
my $output = "";
+ #print STDERR __LINE__, ": ", join ", ", @input, "\n";
foreach my $string (@input) {
- #print STDERR __LINE__, ": ", join ", ", @input, "\n";
if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
$output .= simple_dumper($string);
}
@@ -623,10 +614,11 @@ our $to_trace = 0;
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
-# by the code points introduced in the later version. Change the 0 to a
-# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
-# code points introduced in that release and earlier will be used; later ones
-# are thrown away. You use the version number of the earliest one you want to
+# by the code points introduced in the later version. You probably also want
+# to use the -annotate option when using this. Change the 0 to a string
+# containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only code
+# points introduced in that release and earlier will be used; later ones are
+# thrown away. You use the version number of the earliest one you want to
# compare; then run this program on directory structures containing each
# release, and compare the outputs. These outputs will therefore include only
# the code points common to both releases, and you can see the changes caused
@@ -861,33 +853,8 @@ if ($v_version gt v3.2.0) {
'Canonical_Combining_Class=Attached_Below_Left'
}
-# These are listed in the Property aliases file in 6.0, but Unihan is ignored
-# unless explicitly added.
-if ($v_version ge v5.2.0) {
- my $unihan = 'Unihan; remove from list if using Unihan';
- foreach my $table (qw (
- kAccountingNumeric
- kOtherNumeric
- kPrimaryNumeric
- kCompatibilityVariant
- kIICore
- kIRG_GSource
- kIRG_HSource
- kIRG_JSource
- kIRG_KPSource
- kIRG_MSource
- kIRG_KSource
- kIRG_TSource
- kIRG_USource
- kIRG_VSource
- kRSUnicode
- ))
- {
- $why_suppress_if_empty_warn_if_not{$table} = $unihan;
- }
-}
-
-# Enum values for to_output_map() method in the Map_Table package.
+# Enum values for to_output_map() method in the Map_Table package. (0 is don't
+# output)
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
my $OUTPUT_ADJUSTED = 3;
@@ -913,13 +880,6 @@ my %global_to_output_map = (
Decomposition_Type => 0,
);
-# Properties that this program ignores.
-my @unimplemented_properties;
-
-# With this release, it is automatically handled if the Unihan db is
-# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
-
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
my %why_deprecated; # Generates a deprecated warning message if used.
@@ -959,8 +919,6 @@ my %why_obsolete; # Documentation only
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
- 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional",
-
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
@@ -1046,45 +1004,13 @@ if ($v_version ge v6.0.0) {
my @output_mapped_properties = split "\n", <<END;
END
-# If you are using the Unihan database in a Unicode version before 5.2, you
-# need to add the properties that you want to extract from it to this table.
-# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
-# listed, commented out
+# If you want more Unihan properties than the default, you need to add them to
+# these arrays. Depending on the property type, @missing lines might have to
+# be added to the second array. A sample entry would be (including the '#'):
+# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
my @cjk_properties = split "\n", <<'END';
-#cjkAccountingNumeric; kAccountingNumeric
-#cjkOtherNumeric; kOtherNumeric
-#cjkPrimaryNumeric; kPrimaryNumeric
-#cjkCompatibilityVariant; kCompatibilityVariant
-#cjkIICore ; kIICore
-#cjkIRG_GSource; kIRG_GSource
-#cjkIRG_HSource; kIRG_HSource
-#cjkIRG_JSource; kIRG_JSource
-#cjkIRG_KPSource; kIRG_KPSource
-#cjkIRG_KSource; kIRG_KSource
-#cjkIRG_TSource; kIRG_TSource
-#cjkIRG_USource; kIRG_USource
-#cjkIRG_VSource; kIRG_VSource
-#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
END
-
-# Similarly for the property values. For your convenience, the lines in the
-# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
-# '#' marks (for Unicode versions before 5.2)
my @cjk_property_values = split "\n", <<'END';
-## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
-## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
-## @missing: 0000..10FFFF; cjkIICore; <none>
-## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
-## @missing: 0000..10FFFF; cjkIRG_USource; <none>
-## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
-## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
-## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
-## @missing: 0000..10FFFF; cjkRSUnicode; <none>
END
# The input files don't list every code point. Those not listed are to be
@@ -1109,7 +1035,7 @@ my %default_mapping = (
Decomposition_Type => 'None',
East_Asian_Width => "Neutral",
FC_NFKC_Closure => $CODE_POINT,
- General_Category => 'Cn',
+ General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
Grapheme_Cluster_Break => 'Other',
Hangul_Syllable_Type => 'NA',
ISO_Comment => "",
@@ -1140,39 +1066,6 @@ my %default_mapping = (
Word_Break => 'Other',
);
-# Below are files that Unicode furnishes, but this program ignores, and why.
-# NormalizationCorrections.txt requires some more explanation. It documents
-# the cumulative fixes to erroneous normalizations in earlier Unicode
-# versions. Its main purpose is so that someone running on an earlier version
-# can use this file to override what got published in that earlier release.
-# It would be easy for mktables to read and handle this file. But all the
-# corrections in it should already be in the other files for the release it
-# is. To get it to actually mean something useful, someone would have to be
-# using an earlier Unicode release, and copy it to the files for that release
-# and recomplile. So far there has been no demand to do that, so this hasn't
-# been implemented.
-my %ignored_files = (
- 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
- 'Index.txt' => 'Alphabetical index of Unicode characters',
- 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
- 'NamesList.txt' => 'Annotated list of characters',
- 'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>',
- 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
- 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
- 'ReadMe.txt' => 'Documentation',
- 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
- 'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.',
- 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
- 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
- 'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>',
- 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
- 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
- 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
- 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
-);
-
-my %skipped_files; # List of files that we skip
-
### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
@@ -1199,7 +1092,9 @@ my $DEVELOPMENT_ONLY=<<"EOF";
EOF
-my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
+my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
+ ? "10FFFF"
+ : "FFFF";
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
@@ -1229,8 +1124,7 @@ my $code_point_re = qr/\b$run_on_code_point_re/;
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
-my $missing_defaults_prefix =
- qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
+my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
# Property types. Unicode has more types, but these are sufficient for our
# purposes.
@@ -1307,11 +1201,15 @@ my $OBSOLETE = 'O';
my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
+# Aliases can also have an extra status:
+my $INTERNAL_ALIAS = 'P';
+
my %status_past_participles = (
$DISCOURAGED => 'discouraged',
$STABILIZED => 'stabilized',
$OBSOLETE => 'obsolete',
$DEPRECATED => 'deprecated',
+ $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
);
# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
@@ -1374,18 +1272,21 @@ my %loose_to_file_of; # loosely maps table names to their respective
# files
my %stricter_to_file_of; # same; but for stricter mapping.
my %loose_property_to_file_of; # Maps a loose property name to its map file
+my %strict_property_to_file_of; # Same, but strict
my @inline_definitions = "V0"; # Each element gives a definition of a unique
# inversion list. When a definition is inlined,
# its value in the hash it's in (one of the two
# defined just above) will include an index into
# this array. The 0th element is initialized to
- # the definition for a zero length invwersion list
+ # the definition for a zero length inversion list
my %file_to_swash_name; # Maps the file name to its corresponding key name
# in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
# their rational equivalent
my %loose_property_name_of; # Loosely maps (non_string) property names to
# standard form
+my %strict_property_name_of; # Strictly maps (non_string) property names to
+ # standard form
my %string_property_loose_to_name; # Same, for string properties.
my %loose_defaults; # keys are of form "prop=value", where 'prop' is
# the property name in standard loose form, and
@@ -1395,12 +1296,16 @@ my %loose_to_standard_value; # loosely maps table names to the canonical
# alias for them
my %ambiguous_names; # keys are alias names (in standard form) that
# have more than one possible meaning.
+my %combination_property; # keys are alias names (in standard form) that
+ # have both a map table, and a binary one that
+ # yields true for all non-null maps.
my %prop_aliases; # Keys are standard property name; values are each
# one's aliases
my %prop_value_aliases; # Keys of top level are standard property name;
# values are keys to another hash, Each one is
# one of the property's values, in standard form.
# The values are that prop-val's aliases.
+my %skipped_files; # List of files that we skip
my %ucd_pod; # Holds entries that will go into the UCD section of the pod
# Most properties are immune to caseless matching, otherwise you would get
@@ -1477,6 +1382,8 @@ my @named_sequences; # NamedSequences.txt contents.
my %potential_files; # Generated list of all .txt files in the directory
# structure so we can warn if something is being
# ignored.
+my @missing_early_files; # Generated list of absent files that we need to
+ # proceed in compiling this early Unicode version
my @files_actually_output; # List of files we generated.
my @more_Names; # Some code point names are compound; this is used
# to store the extra components of them.
@@ -1493,6 +1400,7 @@ my $block;
my $perl_charname;
my $print;
my $All;
+my $Assigned; # All assigned characters in this Unicode release
my $script;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
@@ -1616,7 +1524,7 @@ sub populate_char_info ($) {
elsif ($gc-> table('Unassigned')->contains($i)) {
$annotate_char_type[$i] = $UNASSIGNED_TYPE;
$printable[$i] = 0;
- if ($v_version lt v2.0.0) { # No blocks in earliest releases
+ if (defined $block) { # No blocks in earliest releases
$viacode[$i] = 'Unassigned';
$end = $gc-> table('Unassigned')->containing_range($i)->end;
}
@@ -1633,12 +1541,7 @@ sub populate_char_info ($) {
containing_range($i)->end);
}
}
- elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
- $viacode[$i] = $gc->value_of($i);
- $annotate_char_type[$i] = $UNKNOWN_TYPE;
- $printable[$i] = 0;
- }
- elsif ($gc-> table('Surrogate')->contains($i)) {
+ elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
$viacode[$i] = 'Surrogate';
$annotate_char_type[$i] = $SURROGATE_TYPE;
$printable[$i] = 0;
@@ -2110,6 +2013,7 @@ package Input_file;
# while(next_line()) {...} loop.
#
# You can also set up handlers to
+# 0) call during object construction time, after everything else is done
# 1) call before the first line is read, for pre processing
# 2) call to adjust each line of the input before the main handler gets
# them. This can be automatically generated, if appropriately simple
@@ -2121,19 +2025,29 @@ package Input_file;
# each_line_handler()s. So, if the format of the line is not in the desired
# format for the main handler, these are used to do that adjusting. They can
# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next. None of the other
-# handlers are stackable, but could easily be changed to be so.
+# so the $_ output of one is used as the input to the next. The eof handler
+# is also stackable, but none of the others are, but could easily be changed
+# to be so.
+#
+# Some properties are used by the Perl core but aren't defined until later
+# Unicode releases. The perl interpreter would have problems working when
+# compiled with an earlier Unicode version that doesn't have them, so we need
+# to define them somehow for those releases. The 'Early' constructor
+# parameter can be used to automatically handle this. It is essentially
+# ignored if the Unicode version being compiled has a data file for this
+# property. Either code to execute or a file to read can be specified.
+# Details are at the %early definition.
#
# Most of the handlers can call insert_lines() or insert_adjusted_lines()
# which insert the parameters as lines to be processed before the next input
-# file line is read. This allows the EOF handler to flush buffers, for
+# file line is read. This allows the EOF handler(s) to flush buffers, for
# example. The difference between the two routines is that the lines inserted
# by insert_lines() are subjected to the each_line_handler()s. (So if you
-# called it from such a handler, you would get infinite recursion.) Lines
-# inserted by insert_adjusted_lines() go directly to the main handler without
-# any adjustments. If the post-processing handler calls any of these, there
-# will be no effect. Some error checking for these conditions could be added,
-# but it hasn't been done.
+# called it from such a handler, you would get infinite recursion without some
+# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
+# directly to the main handler without any adjustments. If the
+# post-processing handler calls any of these, there will be no effect. Some
+# error checking for these conditions could be added, but it hasn't been done.
#
# carp_bad_line() should be called to warn of bad input lines, which clears $_
# to prevent further processing of the line. This routine will output the
@@ -2169,10 +2083,16 @@ sub trace { return main::trace(@_); }
main::set_access('property', \%property, qw{ c r });
my %optional;
- # If this is true, the file is optional. If not present, no warning is
- # output. If it is present, the string given by this parameter is
- # evaluated, and if false the file is not processed.
- main::set_access('optional', \%optional, 'c', 'r');
+ # This is either an unsigned number, or a list of property names. In the
+ # former case, if it is non-zero, it means the file is optional, so if the
+ # file is absent, no warning about that is output. In the latter case, it
+ # is a list of properties that the file (exclusively) defines. If the
+ # file is present, tables for those properties will be produced; if
+ # absent, none will, even if they are listed elsewhere (namely
+ # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
+ # and no warnings will be raised about them not being available. (And no
+ # warning about the file itself will be raised.)
+ main::set_access('optional', \%optional, qw{ c readable_array } );
my %non_skip;
# This is used for debugging, to skip processing of all but a few input
@@ -2181,16 +2101,19 @@ sub trace { return main::trace(@_); }
main::set_access('non_skip', \%non_skip, 'c');
my %skip;
- # This is used to skip processing of this input file semi-permanently,
- # when it evaluates to true. The value should be the reason the file is
- # being skipped. It is used for files that we aren't planning to process
- # anytime soon, but want to allow to be in the directory and not raise a
- # message that we are not handling. Mostly for test files. This is in
- # contrast to the non_skip element, which is supposed to be used very
- # temporarily for debugging. Sets 'optional' to 1. Also, files that we
- # pretty much will never look at can be placed in the global
- # %ignored_files instead. Ones used here will be added to %skipped files
- main::set_access('skip', \%skip, 'c');
+ # This is used to skip processing of this input file (semi-) permanently.
+ # The value should be the reason the file is being skipped. It is used
+ # for files that we aren't planning to process anytime soon, but want to
+ # allow to be in the directory and be checked for their names not
+ # conflicting with any other files on a DOS 8.3 name filesystem, but to
+ # not otherwise be processed, and to not raise a warning about not being
+ # handled. In the constructor call, any value that evaluates to a numeric
+ # 0 or undef means don't skip. Any other value is a string giving the
+ # reason it is being skippped, and this will appear in generated pod.
+ # However, an empty string reason will suppress the pod entry.
+ # Internally, calls that evaluate to numeric 0 are changed into undef to
+ # distinguish them from an empty string call.
+ main::set_access('skip', \%skip, 'c', 'r');
my %each_line_handler;
# list of subroutines to look at and filter each non-comment line in the
@@ -2221,21 +2144,28 @@ sub trace { return main::trace(@_); }
main::set_access('has_missings_defaults',
\%has_missings_defaults, qw{ c r });
+ my %construction_time_handler;
+ # Subroutine to call at the end of the new method. If undef, no such
+ # handler is called.
+ main::set_access('construction_time_handler',
+ \%construction_time_handler, qw{ c });
+
my %pre_handler;
# Subroutine to call before doing anything else in the file. If undef, no
# such handler is called.
main::set_access('pre_handler', \%pre_handler, qw{ c });
my %eof_handler;
- # Subroutine to call upon getting an EOF on the input file, but before
+ # Subroutines to call upon getting an EOF on the input file, but before
# that is returned to the main handler. This is to allow buffers to be
# flushed. The handler is expected to call insert_lines() or
# insert_adjusted() with the buffered material
- main::set_access('eof_handler', \%eof_handler, qw{ c r });
+ main::set_access('eof_handler', \%eof_handler, qw{ c });
my %post_handler;
# Subroutine to call after all the lines of the file are read in and
- # processed. If undef, no such handler is called.
+ # processed. If undef, no such handler is called. Note that this cannot
+ # add lines to be processed; instead use eof_handler
main::set_access('post_handler', \%post_handler, qw{ c });
my %progress_message;
@@ -2263,6 +2193,69 @@ sub trace { return main::trace(@_); }
# storage of '@missing' defaults lines
main::set_access('missings', \%missings);
+ my %early;
+ # Used for properties that must be defined (for Perl's purposes) on
+ # versions of Unicode earlier than Unicode itself defines them. The
+ # parameter is an array (it would be better to be a hash, but not worth
+ # bothering about due to its rare use).
+ #
+ # The first element is either a code reference to call when in a release
+ # earlier than the Unicode file is available in, or it is an alternate
+ # file to use instead of the non-existent one. This file must have been
+ # plunked down in the same directory as mktables. Should you be compiling
+ # on a release that needs such a file, mktables will abort the
+ # compilation, and tell you where to get the necessary file(s), and what
+ # name(s) to use to store them as.
+ # In the case of specifying an alternate file, the array must contain two
+ # further elements:
+ #
+ # [1] is the name of the property that will be generated by this file.
+ # The class automatically takes the input file and excludes any code
+ # points in it that were not assigned in the Unicode version being
+ # compiled. It then uses this result to define the property in the given
+ # version. Since the property doesn't actually exist in the Unicode
+ # version being compiled, this should be a name accessible only by core
+ # perl. If it is the same name as the regular property, the constructor
+ # will mark the output table as a $PLACEHOLDER so that it doesn't actually
+ # get output, and so will be unusable by non-core code. Otherwise it gets
+ # marked as $INTERNAL_ONLY.
+ #
+ # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
+ # the Hangul syllables in that release (which were ripped out in version
+ # 2) for the given property . (Hence it is ignored except when compiling
+ # version 1. You only get one value that applies to all of them, which
+ # may not be the actual reality, but probably nobody cares anyway for
+ # these obsolete characters.)
+ #
+ # Not all files can be handled in the above way, and so the code ref
+ # alternative is available. It can do whatever it needs to. The other
+ # array elements are optional in this case, and the code is free to use or
+ # ignore them if they are present.
+ #
+ # Internally, the constructor unshifts a 0 or 1 onto this array to
+ # indicate if an early alternative is actually being used or not. This
+ # makes for easier testing later on.
+ main::set_access('early', \%early, 'c');
+
+ my %required_even_in_debug_skip;
+ # debug_skip is used to speed up compilation during debugging by skipping
+ # processing files that are not needed for the task at hand. However,
+ # some files pretty much can never be skipped, and this is used to specify
+ # that this is one of them. In order to skip this file, the call to the
+ # constructor must be edited to comment out this parameter.
+ main::set_access('required_even_in_debug_skip',
+ \%required_even_in_debug_skip, 'c');
+
+ my %withdrawn;
+ # Some files get removed from the Unicode DB. This is a version object
+ # giving the first release without this file.
+ main::set_access('withdrawn', \%withdrawn, 'c');
+
+ my %in_this_release;
+ # Calculated value from %first_released and %withdrawn. Are we compiling
+ # a Unicode release which includes this file?
+ main::set_access('in_this_release', \%in_this_release);
+
sub _next_line;
sub _next_line_with_remapped_range;
@@ -2275,22 +2268,23 @@ sub trace { return main::trace(@_); }
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
$non_skip{$addr} = 0;
- $skip{$addr} = 0;
+ $skip{$addr} = undef;
$has_missings_defaults{$addr} = $NO_DEFAULTS;
$handle{$addr} = undef;
$added_lines{$addr} = [ ];
$remapped_lines{$addr} = [ ];
$each_line_handler{$addr} = [ ];
+ $eof_handler{$addr} = [ ];
$errors{$addr} = { };
$missings{$addr} = [ ];
+ $early{$addr} = [ ];
+ $optional{$addr} = [ ];
# Two positional parameters.
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
- undef $file{$addr} if $first_released{$addr} gt $v_version;
-
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
@@ -2322,30 +2316,206 @@ sub trace { return main::trace(@_); }
delete $args{$key};
};
- # If the file has a property for it, it means that the property is not
- # listed in the file's entries. So add a handler to the list of line
- # handlers to insert the property name into the lines, to provide a
- # uniform interface to the final processing subroutine.
- # the final code doesn't have to worry about that.
- if ($property{$addr}) {
- push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
+ $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
+
+ # Convert 0 (meaning don't skip) to undef
+ undef $skip{$addr} unless $skip{$addr};
+
+ # Handle the case where this file is optional
+ my $pod_message_for_non_existent_optional = "";
+ if ($optional{$addr}->@*) {
+
+ # First element is the pod message
+ $pod_message_for_non_existent_optional
+ = shift $optional{$addr}->@*;
+ # Convert a 0 'Optional' argument to an empty list to make later
+ # code more concise.
+ if ( $optional{$addr}->@*
+ && $optional{$addr}->@* == 1
+ && $optional{$addr}[0] ne ""
+ && $optional{$addr}[0] !~ /\D/
+ && $optional{$addr}[0] == 0)
+ {
+ $optional{$addr} = [ ];
+ }
+ else { # But if the only element doesn't evaluate to 0, make sure
+ # that this file is indeed considered optional below.
+ unshift $optional{$addr}->@*, 1;
+ }
+ }
+
+ my $progress;
+ my $function_instead_of_file = 0;
+
+ # If we are compiling a Unicode release earlier than the file became
+ # available, the constructor may have supplied a substitute
+ if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
+
+ # Yes, we have a substitute, that we will use; mark it so
+ unshift $early{$addr}->@*, 1;
+
+ # See the definition of %early for what the array elements mean.
+ # If we have a property this defines, create a table and default
+ # map for it now (at essentially compile time), so that it will be
+ # available for the whole of run time. (We will want to add this
+ # name as an alias when we are using the official property name;
+ # but this must be deferred until run(), because at construction
+ # time the official names have yet to be defined.)
+ if ($early{$addr}[2]) {
+ my $fate = ($property{$addr}
+ && $property{$addr} eq $early{$addr}[2])
+ ? $PLACEHOLDER
+ : $INTERNAL_ONLY;
+ my $prop_object = Property->new($early{$addr}[2],
+ Fate => $fate,
+ Perl_Extension => 1,
+ );
+
+ # Use the default mapping for the regular property for this
+ # substitute one.
+ if ( defined $property{$addr}
+ && defined $default_mapping{$property{$addr}})
+ {
+ $prop_object
+ ->set_default_map($default_mapping{$property{$addr}});
+ }
+ }
+
+ if (ref $early{$addr}[1] eq 'CODE') {
+ $function_instead_of_file = 1;
+
+ # If the first element of the array is a code ref, the others
+ # are optional.
+ $handler{$addr} = $early{$addr}[1];
+ $property{$addr} = $early{$addr}[2]
+ if defined $early{$addr}[2];
+ $progress = "substitute $file{$addr}";
+
+ undef $file{$addr};
+ }
+ else { # Specifying a substitute file
+
+ if (! main::file_exists($early{$addr}[1])) {
+
+ # If we don't see the substitute file, generate an error
+ # message giving the needed things, and add it to the list
+ # of such to output before actual processing happens
+ # (hence the user finds out all of them in one run).
+ # Instead of creating a general method for NameAliases,
+ # hard-code it here, as there is unlikely to ever be a
+ # second one which needs special handling.
+ my $string_version = ($file{$addr} eq "NameAliases.txt")
+ ? 'at least 6.1 (the later, the better)'
+ : sprintf "%vd", $first_released{$addr};
+ push @missing_early_files, <<END;
+'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
+END
+ ;
+ return;
+ }
+ $progress = $early{$addr}[1];
+ $progress .= ", substituting for $file{$addr}" if $file{$addr};
+ $file{$addr} = $early{$addr}[1];
+ $property{$addr} = $early{$addr}[2];
+
+ # Ignore code points not in the version being compiled
+ push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
+
+ if ( $v_version lt v2.0 # Hanguls in this release ...
+ && defined $early{$addr}[3]) # ... need special treatment
+ {
+ push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
+ }
+ }
+
+ # And this substitute is valid for all releases.
+ $first_released{$addr} = v0;
+ }
+ else { # Normal behavior
+ $progress = $file{$addr};
+ unshift $early{$addr}->@*, 0; # No substitute
}
- if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
- print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
+ my $file = $file{$addr};
+ $progress_message{$addr} = "Processing $progress"
+ unless $progress_message{$addr};
+
+ # A file should be there if it is within the window of versions for
+ # which Unicode supplies it
+ if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
+ $in_this_release{$addr} = 0;
+ $skip{$addr} = "";
}
+ else {
+ $in_this_release{$addr} = $first_released{$addr} le $v_version;
- # If skipping, set to optional, and add to list of ignored files,
- # including its reason
- if ($skip{$addr}) {
- $optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
+ # Check that the file for this object (possibly using a substitute
+ # for early releases) exists or we have a function alternative
+ if ( ! $function_instead_of_file
+ && ! main::file_exists($file))
+ {
+ # Here there is nothing available for this release. This is
+ # fine if we aren't expecting anything in this release.
+ if (! $in_this_release{$addr}) {
+ $skip{$addr} = ""; # Don't remark since we expected
+ # nothing and got nothing
+ }
+ elsif ($optional{$addr}->@*) {
+
+ # Here the file is optional in this release; Use the
+ # passed in text to document this case in the pod.
+ $skip{$addr} = $pod_message_for_non_existent_optional;
+ }
+ elsif ( $in_this_release{$addr}
+ && ! defined $skip{$addr}
+ && defined $file)
+ { # Doesn't exist but should.
+ $skip{$addr} = "'$file' not found. Possibly Big problems";
+ Carp::my_carp($skip{$addr});
+ }
+ }
+ elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
+ {
+
+ # The file exists; if not skipped for another reason, and we are
+ # skipping most everything during debugging builds, use that as
+ # the skip reason.
+ $skip{$addr} = '$debug_skip is on'
+ }
+ }
+
+ if ( ! $debug_skip
+ && $non_skip{$addr}
+ && ! $required_even_in_debug_skip{$addr}
+ && $verbosity)
+ {
+ print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
+ }
+
+ # Here, we have figured out if we will be skipping this file or not.
+ # If so, we add any single property it defines to any passed in
+ # optional property list. These will be dealt with at run time.
+ if (defined $skip{$addr}) {
+ if ($property{$addr}) {
+ push $optional{$addr}->@*, $property{$addr};
+ }
+ } # Otherwise, are going to process the file.
+ elsif ($property{$addr}) {
+
+ # If the file has a property defined in the constructor for it, it
+ # means that the property is not listed in the file's entries. So
+ # add a handler (to the list of line handlers) to insert the
+ # property name into the lines, to provide a uniform interface to
+ # the final processing subroutine.
+ push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
}
elsif ($properties{$addr}) {
- # Add a handler for each line in the input so that it creates a
- # separate input line for each property in those input lines, thus
- # making them suitable for process_generic_property_file().
+ # Similarly, there may be more than one property represented on
+ # each line, with no clue but the constructor input what those
+ # might be. Add a handler for each line in the input so that it
+ # creates a separate input line for each property in those input
+ # lines, thus making them suitable to handle generically.
push @{$each_line_handler{$addr}},
sub {
@@ -2376,7 +2546,7 @@ sub trace { return main::trace(@_); }
};
}
- { # On non-ascii platforms, we use a special handler
+ { # On non-ascii platforms, we use a special pre-handler
no strict;
no warnings 'once';
*next_line = (main::NON_ASCII_PLATFORM)
@@ -2384,6 +2554,9 @@ sub trace { return main::trace(@_); }
: *_next_line;
}
+ &{$construction_time_handler{$addr}}($self)
+ if $construction_time_handler{$addr};
+
return $self;
}
@@ -2401,13 +2574,13 @@ sub trace { return main::trace(@_); }
return __PACKAGE__ . " object for " . $self->file;
}
- # flag to make sure extracted files are processed early
- my $seen_non_extracted_non_age = 0;
-
sub run {
# Process the input object $self. This opens and closes the file and
# calls all the handlers for it. Currently, this can only be called
- # once per file, as it destroy's the EOF handler
+ # once per file, as it destroy's the EOF handlers
+
+ # flag to make sure extracted files are processed early
+ state $seen_non_extracted_non_age = 0;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
@@ -2416,61 +2589,14 @@ sub trace { return main::trace(@_); }
my $file = $file{$addr};
- # Don't process if not expecting this file (because released later
- # than this Unicode version), and isn't there. This means if someone
- # copies it into an earlier version's directory, we will go ahead and
- # process it.
- return if $first_released{$addr} gt $v_version
- && (! defined $file || ! -e $file);
-
- # If in debugging mode and this file doesn't have the non-skip
- # flag set, and isn't one of the critical files, skip it.
- if ($debug_skip
- && $first_released{$addr} ne v0
- && ! $non_skip{$addr})
- {
- print "Skipping $file in debugging\n" if $verbosity;
- return;
- }
-
- # File could be optional
- if ($optional{$addr}) {
- return unless -e $file;
- my $result = eval $optional{$addr};
- if (! defined $result) {
- Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
- return;
- }
- if (! $result) {
- if ($verbosity) {
- print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
- }
- return;
- }
- }
-
- if (! defined $file || ! -e $file) {
-
- # If the file doesn't exist, see if have internal data for it
- # (based on first_released being 0).
- if ($first_released{$addr} eq v0) {
- $handle{$addr} = 'pretend_is_open';
- }
- else {
- if (! $optional{$addr} # File could be optional
- && $v_version ge $first_released{$addr})
- {
- print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
- }
- return;
- }
+ if (! $file) {
+ $handle{$addr} = 'pretend_is_open';
}
else {
-
- # Here, the file exists. Some platforms may change the case of
- # its name
if ($seen_non_extracted_non_age) {
- if ($file =~ /$EXTRACTED/i) {
+ if ($file =~ /$EXTRACTED/i) # Some platforms may change the
+ # case of the file's name
+ {
Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
@@ -2480,7 +2606,10 @@ END
}
}
elsif ($EXTRACTED_DIR
- && $first_released{$addr} ne v0
+
+ # We only do this check for generic property files
+ && $handler{$addr} == \&main::process_generic_property_file
+
&& $file !~ /$EXTRACTED/i
&& lc($file) ne 'dage.txt')
{
@@ -2490,71 +2619,137 @@ END
$seen_non_extracted_non_age = 1;
}
- # And mark the file as having being processed, and warn if it
+ # Mark the file as having being processed, and warn if it
# isn't a file we are expecting. As we process the files,
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
my $fkey = File::Spec->rel2abs($file);
- my $expecting = delete $potential_files{lc($fkey)};
+ my $exists = delete $potential_files{lc($fkey)};
+
+ Carp::my_carp("Was not expecting '$file'.")
+ if $exists && ! $in_this_release{$addr};
+
+ # If there is special handling for compiling Unicode releases
+ # earlier than the first one in which Unicode defines this
+ # property ...
+ if ($early{$addr}->@* > 1) {
+
+ # Mark as processed any substitute file that would be used in
+ # such a release
+ $fkey = File::Spec->rel2abs($early{$addr}[1]);
+ delete $potential_files{lc($fkey)};
+
+ # As commented in the constructor code, when using the
+ # official property, we still have to allow the publicly
+ # inaccessible early name so that the core code which uses it
+ # will work regardless.
+ if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
+ my $early_property_name = $early{$addr}[2];
+ if ($property{$addr} ne $early_property_name) {
+ main::property_ref($property{$addr})
+ ->add_alias($early_property_name);
+ }
+ }
+ }
+
+ # We may be skipping this file ...
+ if (defined $skip{$addr}) {
- Carp::my_carp("Was not expecting '$file'.") if
- ! $expecting
- && ! defined $handle{$addr};
+ # If the file isn't supposed to be in this release, there is
+ # nothing to do
+ if ($in_this_release{$addr}) {
+
+ # But otherwise, we may print a message
+ if ($debug_skip) {
+ print STDERR "Skipping input file '$file'",
+ " because '$skip{$addr}'\n";
+ }
+
+ # And add it to the list of skipped files, which is later
+ # used to make the pod
+ $skipped_files{$file} = $skip{$addr};
+
+ # The 'optional' list contains properties that are also to
+ # be skipped along with the file. (There may also be
+ # digits which are just placeholders to make sure it isn't
+ # an empty list
+ foreach my $property ($optional{$addr}->@*) {
+ next unless $property =~ /\D/;
+ my $prop_object = main::property_ref($property);
+ next unless defined $prop_object;
+ $prop_object->set_fate($SUPPRESSED, $skip{$addr});
+ }
+ }
- # Having deleted from expected files, we can quit if not to do
- # anything. Don't print progress unless really want verbosity
- if ($skip{$addr}) {
- print "Skipping $file.\n" if $verbosity >= $VERBOSE;
return;
}
- # Open the file, converting the slashes used in this program
- # into the proper form for the OS
+ # Here, we are going to process the file. Open it, converting the
+ # slashes used in this program into the proper form for the OS
my $file_handle;
if (not open $file_handle, "<", $file) {
Carp::my_carp("Can't open $file. Skipping: $!");
- return 0;
+ return;
}
$handle{$addr} = $file_handle; # Cache the open file handle
- if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+ # If possible, make sure that the file is the correct version.
+ # (This data isn't available on early Unicode releases or in
+ # UnicodeData.txt.) We don't do this check if we are using a
+ # substitute file instead of the official one (though the code
+ # could be extended to do so).
+ if ($in_this_release{$addr}
+ && ! $early{$addr}[0]
+ && lc($file) ne 'unicodedata.txt')
+ {
if ($file !~ /^Unihan/i) {
- $_ = <$file_handle>;
- if ($_ !~ / - $string_version \. /x) {
- chomp;
- $_ =~ s/^#\s*//;
- die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+
+ # The non-Unihan files started getting version numbers in
+ # 3.2, but some files in 4.0 are unchanged from 3.2, and
+ # marked as 3.2. 4.0.1 is the first version where there
+ # are no files marked as being from less than 4.0, though
+ # some are marked as 4.0. In versions after that, the
+ # numbers are correct.
+ if ($v_version ge v4.0.1) {
+ $_ = <$file_handle>; # The version number is in the
+ # very first line
+ if ($_ !~ / - $string_version \. /x) {
+ chomp;
+ $_ =~ s/^#\s*//;
+
+ # 4.0.1 had some valid files that weren't updated.
+ if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
+ die Carp::my_carp("File '$file' is version "
+ . "'$_'. It should be "
+ . "version $string_version");
+ }
+ }
}
}
- else {
+ elsif ($v_version ge v6.0.0) { # Unihan
+
+ # Unihan files didn't get accurate version numbers until
+ # 6.0. The version is somewhere in the first comment
+ # block
while (<$file_handle>) {
if ($_ !~ /^#/) {
- Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+ Carp::my_carp_bug("Could not find the expected "
+ . "version info in file '$file'");
last;
}
chomp;
$_ =~ s/^#\s*//;
next if $_ !~ / version: /x;
last if $_ =~ /$string_version/;
- die Carp::my_carp("File '$file' is '$_'. It should be version $string_version");
+ die Carp::my_carp("File '$file' is version "
+ . "'$_'. It should be "
+ . "version $string_version");
}
}
}
}
- if ($verbosity >= $PROGRESS) {
- if ($progress_message{$addr}) {
- print "$progress_message{$addr}\n";
- }
- else {
- # If using a virtual file, say so.
- print "Processing ", (-e $file)
- ? $file
- : "substitute $file",
- "\n";
- }
- }
-
+ print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
# Call any special handler for before the file.
&{$pre_handler{$addr}}($self) if $pre_handler{$addr};
@@ -2742,11 +2937,11 @@ END
return 1;
} # End of looping through lines.
- # If there is an EOF handler, call it (only once) and if it generates
+ # If there are EOF handlers, call each (only once) and if it generates
# more lines to process go back in the loop to handle them.
- if ($eof_handler{$addr}) {
- &{$eof_handler{$addr}}($self);
- $eof_handler{$addr} = ""; # Currently only get one shot at it.
+ while ($eof_handler{$addr}->@*) {
+ &{$eof_handler{$addr}[0]}($self);
+ shift $eof_handler{$addr}->@*; # Currently only get one shot at it.
goto LINE if $added_lines{$addr};
}
@@ -2943,6 +3138,82 @@ END
return @return;
}
+ sub _exclude_unassigned {
+
+ # Takes the range in $_ and excludes code points that aren't assigned
+ # in this release
+
+ state $skip_inserted_count = 0;
+
+ # Ignore recursive calls.
+ if ($skip_inserted_count) {
+ $skip_inserted_count--;
+ return;
+ }
+
+ # Find what code points are assigned in this release
+ main::calculate_Assigned() if ! defined $Assigned;
+
+ my $self = shift;
+ my $addr = do { no overloading; pack 'J', $self; };
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my ($range, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # Examine the range.
+ if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
+ {
+ my $low = hex $1;
+ my $high = (defined $2) ? hex $2 : $low;
+
+ # Split the range into subranges of just those code points in it
+ # that are assigned.
+ my @ranges = (Range_List->new(Initialize
+ => Range->new($low, $high)) & $Assigned)->ranges;
+
+ # Do nothing if nothing in the original range is assigned in this
+ # release; handle normally if everything is in this release.
+ if (! @ranges) {
+ $_ = "";
+ }
+ elsif (@ranges != 1) {
+
+ # Here, some code points in the original range aren't in this
+ # release; @ranges gives the ones that are. Create fake input
+ # lines for each of the ranges, and set things up so that when
+ # this routine is called on that fake input, it will do
+ # nothing.
+ $skip_inserted_count = @ranges;
+ my $remainder = join ";", @remainder;
+ for my $range (@ranges) {
+ $self->insert_lines(sprintf("%04X..%04X;%s",
+ $range->start, $range->end, $remainder));
+ }
+ $_ = ""; # The original range is now defunct.
+ }
+ }
+
+ return;
+ }
+
+ sub _fixup_obsolete_hanguls {
+
+ # This is called only when compiling Unicode version 1. All Unicode
+ # data for subsequent releases assumes that the code points that were
+ # Hangul syllables in this release only are something else, so if
+ # using such data, we have to override it
+
+ my $self = shift;
+ my $addr = do { no overloading; pack 'J', $self; };
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $object = main::property_ref($property{$addr});
+ $object->add_map(0x3400, 0x4DFF,
+ $early{$addr}[3], # Passed-in value for these
+ Replace => $UNCONDITIONALLY);
+ }
+
sub _insert_property_into_line {
# Add a property field to $_, if this file requires it.
@@ -3027,6 +3298,8 @@ package Multi_Default;
# .
# .
# 'U'));
+ # It is best to leave the final value be the one that matches the
+ # above-Unicode code points.
my $class = shift;
@@ -3732,7 +4005,7 @@ sub trace { return main::trace(@_); }
# => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
- # this, one so that the same range can occur
+ # this one so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
@@ -3747,6 +4020,7 @@ sub trace { return main::trace(@_); }
# existing range, this one is discarded
# (leaving the existing one in its original,
# higher priority position
+ # => $CROAK Die with an error if is already there
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
@@ -3835,7 +4109,7 @@ sub trace { return main::trace(@_); }
# Here, the new range starts just after the current highest in
# the range list, and they have the same type and value.
- # Extend the current range to incorporate the new one.
+ # Extend the existing range to incorporate the new one.
@{$r}[-1]->set_end($end);
}
@@ -4228,7 +4502,7 @@ sub trace { return main::trace(@_); }
# In other words,
# r[$i-1]->end < $start <= r[$i]->end
# And:
- # r[$i-1]->end < $start <= $end <= r[$j+1]->start
+ # r[$i-1]->end < $start <= $end < r[$j+1]->start
#
# Also:
# $clean_insert is a boolean which is set true if and only if
@@ -5093,6 +5367,7 @@ sub trace { return main::trace(@_); }
my $note = delete $args{'Note'};
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
my $perl_extension = delete $args{'Perl_Extension'};
+ my $suppression_reason = delete $args{'Suppression_Reason'};
# Shouldn't have any left over
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
@@ -5134,11 +5409,12 @@ END
{
$fate{$addr} = $SUPPRESSED;
}
- elsif ($fate{$addr} == $SUPPRESSED
- && ! exists $why_suppressed{$property{$addr}->complete_name})
- {
- Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
- # perhaps Fate => [ $SUPPRESSED, "reason" ]
+ elsif ($fate{$addr} == $SUPPRESSED) {
+ Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
+ # Though currently unused
+ }
+ elsif ($suppression_reason) {
+ Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
}
# If hasn't set its status already, see if it is on one of the
@@ -5266,17 +5542,18 @@ END
my %args = @_;
my $loose_match = delete $args{'Fuzzy'};
- my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
- $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
-
my $ok_as_filename = delete $args{'OK_as_Filename'};
$ok_as_filename = 1 unless defined $ok_as_filename;
- my $status = delete $args{'Status'};
- $status = $NORMAL unless defined $status;
-
# An internal name does not get documented, unless overridden by the
- # input.
+ # input; same for making tests for it.
+ my $status = delete $args{'Status'} || (($name =~ /^_/)
+ ? $INTERNAL_ALIAS
+ : $NORMAL);
+ my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
+ // (($status ne $INTERNAL_ALIAS)
+ ? (($name =~ /^_/) ? $NO : $YES)
+ : $NO);
my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
@@ -5346,7 +5623,7 @@ END
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $ok_as_filename, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
@@ -6267,7 +6544,8 @@ END
}
# Save the reason for suppression for output
- if ($fate == $SUPPRESSED && defined $reason) {
+ if ($fate >= $SUPPRESSED) {
+ $reason = "" unless defined $reason;
$why_suppressed{$complete_name{$addr}} = $reason;
}
@@ -6763,7 +7041,7 @@ END
# The ranges that map to the default aren't output, so subtract that
# to get those actually output. A property with matching tables
# already has the information calculated.
- if ($property->type != $STRING) {
+ if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
$count -= $property->table($default_map)->count;
}
elsif (defined $default_map) {
@@ -6839,9 +7117,11 @@ END
$comment .= "This file returns the $mapping:\n";
my $ucd_accessible_name = "";
+ my $has_underscore_name = 0;
my $full_name = $self->property->full_name;
for my $i (0 .. @property_aliases - 1) {
my $name = $property_aliases[$i]->name;
+ $has_underscore_name = 1 if $name =~ /^_/;
$comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
if ($property_aliases[$i]->ucd) {
if ($name eq $full_name) {
@@ -6854,7 +7134,12 @@ END
}
$comment .= "\nwhere 'cp' is $cp.";
if ($ucd_accessible_name) {
- $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
+ $comment .= " Note that $these_mappings";
+ if ($has_underscore_name) {
+ $comment .= " (except for the one(s) that begin with an underscore)";
+ }
+ $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
+
}
# And append any commentary already set from the actual property.
@@ -6888,7 +7173,7 @@ END
# There are tables which end up only having one element per
# range, but it is not worth keeping track of for making just
# this comment a little better.
- $comment.= <<END;
+ $comment .= <<END;
non-comment portions of the main body of lines of this file is:
START\\tSTOP\\tMAPPING where START is the starting code point of the
range, in hex; STOP is the ending point, or if omitted, the range has just one
@@ -7848,7 +8133,6 @@ END
main::uniques($leader, @{$equivalents{$addr}});
my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
# tables
-
for my $parent (@parents) {
my $property = $parent->property;
@@ -7906,7 +8190,7 @@ END
# commentary that the other combinations are possible.
# Because regular expressions don't recognize things like
# \p{jsn=}, only look at non-null right-hand-sides
- my @property_aliases = $table_property->aliases;
+ my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
my @table_aliases = grep { $_->name ne "" } $table->aliases;
# The alias lists above are already ordered in the order we
@@ -7918,8 +8202,7 @@ END
? main::max(scalar @table_aliases,
scalar @property_aliases)
: 0;
- trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
-
+ trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
my $property_had_compound_name = 0;
@@ -8070,8 +8353,14 @@ END
foreach my $flag (sort keys %flags) {
$comment .= <<END;
'$flag' below means that this form is $flags{$flag}.
-Consult $pod_file.pod
END
+ if ($flag eq $INTERNAL_ALIAS) {
+ $comment .= "DO NOT USE!!!";
+ }
+ else {
+ $comment .= "Consult $pod_file.pod";
+ }
+ $comment .= "\n";
}
$comment .= "\n";
}
@@ -8436,18 +8725,27 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
$perl_extension
= $self->perl_extension if ! defined $perl_extension;
+ my $fate;
+ my $suppression_reason = "";
+ if ($self->name =~ /^_/) {
+ $fate = $SUPPRESSED;
+ $suppression_reason = "Parent property is internal only";
+ }
+ elsif ($self->fate >= $SUPPRESSED) {
+ $fate = $self->fate;
+ $suppression_reason = $why_suppressed{$self->complete_name};
+
+ }
+ elsif ($name =~ /^_/) {
+ $fate = $INTERNAL_ONLY;
+ }
$table = Match_Table->new(
Name => $name,
Perl_Extension => $perl_extension,
_Alias_Hash => $table_ref{$addr},
_Property => $self,
-
- # gets property's fate and status by default,
- # except if the name begind with an
- # underscore, default it to internal
- Fate => ($name =~ /^_/)
- ? $INTERNAL_ONLY
- : $self->fate,
+ Fate => $fate,
+ Suppression_Reason => $suppression_reason,
Status => $self->status,
_Status_Info => $self->status_info,
%args);
@@ -8562,10 +8860,13 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
# Swash names are used only on either
# 1) legacy-only properties, because the formats for these are
# unchangeable, and they have had these lines in them; or
- # 2) regular map tables; otherwise there should be no access to the
+ # 2) regular or internal-only map tables
+ # 3) otherwise there should be no access to the
# property map table from other parts of Perl.
return if $map{$addr}->fate != $ORDINARY
- && $map{$addr}->fate != $LEGACY_ONLY;
+ && $map{$addr}->fate != $LEGACY_ONLY
+ && ! ($map{$addr}->name =~ /^_/
+ && $map{$addr}->fate == $INTERNAL_ONLY);
return $file{$addr} if defined $file{$addr};
return $map{$addr}->external_name;
@@ -8589,9 +8890,6 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
# to it.
return 0 if $type{$addr} == $STRING;
- # Don't generate anything for unimplemented properties.
- return 0 if grep { $self->complete_name eq $_ }
- @unimplemented_properties;
# Otherwise, do.
return 1;
}
@@ -8808,7 +9106,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $self; };
- if ($fate == $SUPPRESSED) {
+ if ($fate >= $SUPPRESSED) {
$why_suppressed{$self->complete_name} = $reason;
}
@@ -8894,15 +9192,15 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
package main;
- sub display_chr {
- # Converts an ordinal printable character value to a displayable
- # string, using a dotted circle to hold combining characters.
+sub display_chr {
+ # Converts an ordinal printable character value to a displayable string,
+ # using a dotted circle to hold combining characters.
- my $ord = shift;
- my $chr = chr $ord;
- return $chr if $ccc->table(0)->contains($ord);
- return "\x{25CC}$chr";
- }
+ my $ord = shift;
+ my $chr = chr $ord;
+ return $chr if $ccc->table(0)->contains($ord);
+ return "\x{25CC}$chr";
+}
sub join_lines($) {
# Returns lines of the input joined together, so that they can be folded
@@ -9443,7 +9741,6 @@ sub dump_inside_out {
my $object = shift;
my $fields_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $object; };
@@ -9520,6 +9817,17 @@ sub _operator_not_equal {
return ! _operator_equal($self, $other);
}
+sub substitute_PropertyAliases($) {
+ # Deal with early releases that don't have the crucial PropertyAliases.txt
+ # file.
+
+ my $file_object = shift;
+ $file_object->insert_lines(get_old_property_aliases());
+
+ process_PropertyAliases($file_object);
+}
+
+
sub process_PropertyAliases($) {
# This reads in the PropertyAliases.txt file, which contains almost all
# the character properties in Unicode and their equivalent aliases:
@@ -9532,11 +9840,6 @@ sub process_PropertyAliases($) {
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- # This whole file was non-existent in early releases, so use our own
- # internal one.
- $file->insert_lines(get_old_property_aliases())
- if ! -e 'PropertyAliases.txt';
-
# Add any cjk properties that may have been defined.
$file->insert_lines(@cjk_properties);
@@ -9546,8 +9849,17 @@ sub process_PropertyAliases($) {
my $full = $data[1];
+ # This line is defective in early Perls. The property in Unihan.txt
+ # is kRSUnicode.
+ if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
+ push @data, qw(cjkRSUnicode kRSUnicode);
+ }
+
my $this = Property->new($data[0], Full_Name => $full);
+ $this->set_fate($SUPPRESSED, $why_suppressed{$full})
+ if $why_suppressed{$full};
+
# Start looking for more aliases after these two.
for my $i (2 .. @data - 1) {
$this->add_alias($data[$i]);
@@ -9573,18 +9885,6 @@ sub finish_property_setup {
Property->new('JSN', Full_Name => 'Jamo_Short_Name');
}
- # These two properties must be defined in all releases so we can generate
- # the tables from them to make regex \X work, but suppress their output so
- # aren't application visible prior to releases where they should be
- if (! defined property_ref('GCB')) {
- Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
- Fate => $PLACEHOLDER);
- }
- if (! defined property_ref('hst')) {
- Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
- Fate => $PLACEHOLDER);
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
@@ -9701,22 +10001,15 @@ sub finish_property_setup {
# for non-assigned code points; 'AL' for assigned.
if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
my $lb = property_ref('Line_Break');
- if ($v_version gt 3.2.0) {
+ if (file_exists("${EXTRACTED}DLineBreak.txt")) {
$lb->set_default_map('Unknown');
}
else {
- my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
- 'AL');
+ my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
+ 'Unknown',
+ );
$lb->set_default_map($default);
}
-
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
}
# For backwards compatibility with applications that may read the mapping
@@ -9874,6 +10167,16 @@ END
return @return;
}
+sub substitute_PropValueAliases($) {
+ # Deal with early releases that don't have the crucial
+ # PropValueAliases.txt file.
+
+ my $file_object = shift;
+ $file_object->insert_lines(get_old_property_value_aliases());
+
+ process_PropValueAliases($file_object);
+}
+
sub process_PropValueAliases {
# This file contains values that properties look like:
# bc ; AL ; Arabic_Letter
@@ -9899,35 +10202,29 @@ sub process_PropValueAliases {
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- # This whole file was non-existent in early releases, so use our own
- # internal one if necessary.
- if (! -e 'PropValueAliases.txt') {
- $file->insert_lines(get_old_property_value_aliases());
- }
-
if ($v_version lt 4.0.0) {
$file->insert_lines(split /\n/, <<'END'
-hst; L ; Leading_Jamo
-hst; LV ; LV_Syllable
-hst; LVT ; LVT_Syllable
-hst; NA ; Not_Applicable
-hst; T ; Trailing_Jamo
-hst; V ; Vowel_Jamo
+Hangul_Syllable_Type; L ; Leading_Jamo
+Hangul_Syllable_Type; LV ; LV_Syllable
+Hangul_Syllable_Type; LVT ; LVT_Syllable
+Hangul_Syllable_Type; NA ; Not_Applicable
+Hangul_Syllable_Type; T ; Trailing_Jamo
+Hangul_Syllable_Type; V ; Vowel_Jamo
END
);
}
if ($v_version lt 4.1.0) {
$file->insert_lines(split /\n/, <<'END'
-GCB; CN ; Control
-GCB; CR ; CR
-GCB; EX ; Extend
-GCB; L ; L
-GCB; LF ; LF
-GCB; LV ; LV
-GCB; LVT ; LVT
-GCB; T ; T
-GCB; V ; V
-GCB; XX ; Other
+_Perl_GCB; CN ; Control
+_Perl_GCB; CR ; CR
+_Perl_GCB; EX ; Extend
+_Perl_GCB; L ; L
+_Perl_GCB; LF ; LF
+_Perl_GCB; LV ; LV
+_Perl_GCB; LVT ; LVT
+_Perl_GCB; T ; T
+_Perl_GCB; V ; V
+_Perl_GCB; XX ; Other
END
);
}
@@ -9942,7 +10239,6 @@ END
# program generates for this block property value
#$file->insert_lines('blk; n/a; Herited');
-
# Process each line of the file ...
while ($file->next_line) {
@@ -9959,6 +10255,11 @@ END
# thus shifting the former field 0 to after them.)
splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
+ if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
+ my $new_style = $data[1] =~ s/-/_/gr;
+ splice @data, 1, 0, $new_style;
+ }
+
# Field 0 is a short name unless "n/a"; field 1 is the full name. If
# there is no short name, use the full one in element 1
if ($data[0] eq "n/a") {
@@ -10552,7 +10853,8 @@ sub output_perl_charnames_line ($$) {
$line));
}
- # And process the first range, like any other.
+ # And set things up so that the below will process this first
+ # range, like any other.
$low = $this_range->start;
$high = $this_range->end;
}
@@ -10963,11 +11265,12 @@ END
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- # Create a new property specially located that is a combination of the
+ # Create a new property specially located that is a combination of
# various Name properties: Name, Unicode_1_Name, Named Sequences, and
- # Name_Alias properties. (The final duplicates elements of the
- # first.) A comment for it will later be constructed based on the
- # actual properties present and used
+ # _Perl_Name_Alias properties. (The final one duplicates elements of the
+ # first, and starting in v6.1, is the same as the 'Name_Alias
+ # property.) A comment for the new property will later be constructed
+ # based on the actual properties present and used
$perl_charname = Property->new('Perl_Charnames',
Default_Map => "",
Directory => File::Spec->curdir(),
@@ -12028,10 +12331,8 @@ sub filter_old_style_case_folding {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my @fields = split /\s*;\s*/;
- if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
- $fields[1] = 'I';
- }
- elsif ($fields[1] eq 'L') {
+
+ if ($fields[1] eq 'L') {
$fields[1] = 'C'; # L => C always
}
elsif ($fields[1] eq 'E') {
@@ -12339,6 +12640,68 @@ sub filter_numeric_value_line {
{ # Closure
my %unihan_properties;
+ sub construct_unihan {
+
+ my $file_object = shift;
+
+ return unless file_exists($file_object->file);
+
+ if ($v_version lt v4.0.0) {
+ push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
+ push @cjk_property_values, split "\n", <<'END';
+# @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
+END
+ }
+
+ if ($v_version ge v3.0.0) {
+ push @cjk_properties, split "\n", <<'END';
+cjkIRG_GSource; kIRG_GSource
+cjkIRG_JSource; kIRG_JSource
+cjkIRG_KSource; kIRG_KSource
+cjkIRG_TSource; kIRG_TSource
+cjkIRG_VSource; kIRG_VSource
+END
+ push @cjk_property_values, split "\n", <<'END';
+# @missing: 0000..10FFFF; cjkIRG_GSource; <none>
+# @missing: 0000..10FFFF; cjkIRG_JSource; <none>
+# @missing: 0000..10FFFF; cjkIRG_KSource; <none>
+# @missing: 0000..10FFFF; cjkIRG_TSource; <none>
+# @missing: 0000..10FFFF; cjkIRG_VSource; <none>
+END
+ }
+ if ($v_version ge v3.1.0) {
+ push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
+ push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
+ }
+ if ($v_version ge v3.1.1) {
+ push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
+ push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
+ }
+ if ($v_version ge v3.2.0) {
+ push @cjk_properties, split "\n", <<'END';
+cjkAccountingNumeric; kAccountingNumeric
+cjkCompatibilityVariant; kCompatibilityVariant
+cjkOtherNumeric; kOtherNumeric
+cjkPrimaryNumeric; kPrimaryNumeric
+END
+ push @cjk_property_values, split "\n", <<'END';
+# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
+# @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
+# @missing: 0000..10FFFF; cjkOtherNumeric; NaN
+# @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
+END
+ }
+ if ($v_version gt v4.0.0) {
+ push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
+ push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
+ }
+
+ if ($v_version ge v4.1.0) {
+ push @cjk_properties, 'cjkIICore ; kIICore';
+ push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
+ }
+ }
+
sub setup_unihan {
# Do any special setup for Unihan properties.
@@ -12351,16 +12714,16 @@ sub filter_numeric_value_line {
my $iicore = property_ref('kIICore');
if (defined $iicore) {
$iicore->set_type($FORCED_BINARY);
- $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
+ $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
# Unicode doesn't include the maps for this property, so don't
# warn that they are missing.
$iicore->set_pre_declared_maps(0);
$iicore->add_comment(join_lines( <<END
-This property contains enum values, but Unicode UAX #38 says it should be
-interpreted as binary, so Perl creates tables for both 1) its enum values,
-plus 2) true/false tables in which it is considered true for all code points
-that have a non-null value
+This property contains string values, but any non-empty ones are considered to
+be 'core', so Perl creates tables for both: 1) its string values, plus 2)
+tables so that \\p{kIICore} matches any code point which has a non-empty
+value for this property.
END
));
}
@@ -12439,7 +12802,7 @@ sub filter_blocks_lines {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
@@ -12657,12 +13020,22 @@ sub generate_hst {
END
);
- # The Hangul syllables in version 1 are completely different than what came
- # after, so just ignore them there.
+ # The Hangul syllables in version 1 are at different code points than
+ # those that came along starting in version 2, and have different names;
+ # they comprise about 60% of the code points of the later version.
+ # From my (khw) research on them (see <558493EB.4000807@att.net>), the
+ # initial set is a subset of the later version, with different English
+ # transliterations. I did not see an easy mapping between them. The
+ # later set includes essentially all possibilities, even ones that aren't
+ # in modern use (if they ever were), and over 96% of the new ones are type
+ # LVT. Mathematically, the early set must also contain a preponderance of
+ # LVT values. In lieu of doing nothing, we just set them all to LVT, and
+ # expect that this will be right most of the time, which is better than
+ # not being right at all.
if ($v_version lt v2.0.0) {
my $property = property_ref($file->property);
+ $file->insert_lines("3400..4DFF; LVT\n");
push @tables_that_may_be_empty, $property->table('LV')->complete_name;
- push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
return;
}
@@ -12712,7 +13085,6 @@ sub generate_GCB {
# Also from http://www.unicode.org/reports/tr29/tr29-3.html.
foreach my $code_point ( qw{
- 40000
09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
}
@@ -12735,275 +13107,50 @@ sub generate_GCB {
generate_hst($file);
}
- return;
+ main::process_generic_property_file($file);
}
-sub setup_early_name_alias {
- my $file= shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- # This has the effect of pretending that the Name_Alias property was
- # available in all Unicode releases. Strictly speaking, this property
- # should not be availabe in early releases, but doing this allows
- # charnames.pm to work on older releases without change. Prior to v5.16
- # it had these names hard-coded inside it. Unicode 6.1 came along and
- # created these names, and so they were removed from charnames.
+sub fixup_early_perl_name_alias {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias', Default_Map => "");
- }
+ # Different versions of Unicode have varying support for the name synonyms
+ # below. Just include everything. As of 6.1, all these are correct in
+ # the Unicode-supplied file.
- $file->insert_lines(get_old_name_aliases());
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return;
-}
-sub get_old_name_aliases () {
+ # ALERT did not come along until 6.0, at which point it became preferred
+ # over BELL. By inserting it last in early releases, BELL is preferred
+ # over it; and vice-vers in 6.0
+ my $type_for_bell = ($v_version lt v6.0.0)
+ ? 'correction'
+ : 'alternate';
+ $file->insert_lines(split /\n/, <<END
+0007;BELL; $type_for_bell
+000A;LINE FEED (LF);alternate
+000C;FORM FEED (FF);alternate
+000D;CARRIAGE RETURN (CR);alternate
+0085;NEXT LINE (NEL);alternate
+END
+
+ );
- # The Unicode_1_Name field, contains most of these names. One would
- # expect, given the field's name, that its values would be fixed across
- # versions, giving the true Unicode version 1 name for the character.
- # Sadly, this is not the case. Actually Version 1.1.5 had no names for
- # any of the controls; Version 2.0 introduced names for the C0 controls,
- # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
- # changed some names: it
+ # One might think that the the 'Unicode_1_Name' field, could work for most
+ # of the above names, but sadly that field varies depending on the
+ # release. Version 1.1.5 had no names for any of the controls; Version
+ # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
+ # 3.0.1 removed the name INDEX; and 3.2 changed some names:
# changed to parenthesized versions like "NEXT LINE" to
# "NEXT LINE (NEL)";
# changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
# changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
# changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
- # This list contains all the names that were defined so that
- # charnames::vianame(), etc. understand them all EVEN if this version of
- # Unicode didn't specify them (this could be construed as a bug).
- # mktables elsewhere gives preference to the Unicode_1_Name field over
- # these names, so that viacode() will return the correct value for that
- # version of Unicode, except when that version doesn't define a name,
- # viacode() will return one anyway (this also could be construed as a
- # bug). But these potential "bugs" allow for the smooth working of code
- # on earlier Unicode releases.
-
- my @return = split /\n/, <<'END';
-0000;NULL;control
-0000;NUL;abbreviation
-0001;START OF HEADING;control
-0001;SOH;abbreviation
-0002;START OF TEXT;control
-0002;STX;abbreviation
-0003;END OF TEXT;control
-0003;ETX;abbreviation
-0004;END OF TRANSMISSION;control
-0004;EOT;abbreviation
-0005;ENQUIRY;control
-0005;ENQ;abbreviation
-0006;ACKNOWLEDGE;control
-0006;ACK;abbreviation
-0007;BELL;control
-0007;BEL;abbreviation
-0008;BACKSPACE;control
-0008;BS;abbreviation
-0009;CHARACTER TABULATION;control
-0009;HORIZONTAL TABULATION;control
-0009;HT;abbreviation
-0009;TAB;abbreviation
-000A;LINE FEED;control
-000A;LINE FEED (LF);control
-000A;NEW LINE;control
-000A;END OF LINE;control
-000A;LF;abbreviation
-000A;NL;abbreviation
-000A;EOL;abbreviation
-000B;LINE TABULATION;control
-000B;VERTICAL TABULATION;control
-000B;VT;abbreviation
-000C;FORM FEED;control
-000C;FORM FEED (FF);control
-000C;FF;abbreviation
-000D;CARRIAGE RETURN;control
-000D;CARRIAGE RETURN (CR);control
-000D;CR;abbreviation
-000E;SHIFT OUT;control
-000E;LOCKING-SHIFT ONE;control
-000E;SO;abbreviation
-000F;SHIFT IN;control
-000F;LOCKING-SHIFT ZERO;control
-000F;SI;abbreviation
-0010;DATA LINK ESCAPE;control
-0010;DLE;abbreviation
-0011;DEVICE CONTROL ONE;control
-0011;DC1;abbreviation
-0012;DEVICE CONTROL TWO;control
-0012;DC2;abbreviation
-0013;DEVICE CONTROL THREE;control
-0013;DC3;abbreviation
-0014;DEVICE CONTROL FOUR;control
-0014;DC4;abbreviation
-0015;NEGATIVE ACKNOWLEDGE;control
-0015;NAK;abbreviation
-0016;SYNCHRONOUS IDLE;control
-0016;SYN;abbreviation
-0017;END OF TRANSMISSION BLOCK;control
-0017;ETB;abbreviation
-0018;CANCEL;control
-0018;CAN;abbreviation
-0019;END OF MEDIUM;control
-0019;EOM;abbreviation
-001A;SUBSTITUTE;control
-001A;SUB;abbreviation
-001B;ESCAPE;control
-001B;ESC;abbreviation
-001C;INFORMATION SEPARATOR FOUR;control
-001C;FILE SEPARATOR;control
-001C;FS;abbreviation
-001D;INFORMATION SEPARATOR THREE;control
-001D;GROUP SEPARATOR;control
-001D;GS;abbreviation
-001E;INFORMATION SEPARATOR TWO;control
-001E;RECORD SEPARATOR;control
-001E;RS;abbreviation
-001F;INFORMATION SEPARATOR ONE;control
-001F;UNIT SEPARATOR;control
-001F;US;abbreviation
-0020;SP;abbreviation
-007F;DELETE;control
-007F;DEL;abbreviation
-0080;PADDING CHARACTER;figment
-0080;PAD;abbreviation
-0081;HIGH OCTET PRESET;figment
-0081;HOP;abbreviation
-0082;BREAK PERMITTED HERE;control
-0082;BPH;abbreviation
-0083;NO BREAK HERE;control
-0083;NBH;abbreviation
-0084;INDEX;control
-0084;IND;abbreviation
-0085;NEXT LINE;control
-0085;NEXT LINE (NEL);control
-0085;NEL;abbreviation
-0086;START OF SELECTED AREA;control
-0086;SSA;abbreviation
-0087;END OF SELECTED AREA;control
-0087;ESA;abbreviation
-0088;CHARACTER TABULATION SET;control
-0088;HORIZONTAL TABULATION SET;control
-0088;HTS;abbreviation
-0089;CHARACTER TABULATION WITH JUSTIFICATION;control
-0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
-0089;HTJ;abbreviation
-008A;LINE TABULATION SET;control
-008A;VERTICAL TABULATION SET;control
-008A;VTS;abbreviation
-008B;PARTIAL LINE FORWARD;control
-008B;PARTIAL LINE DOWN;control
-008B;PLD;abbreviation
-008C;PARTIAL LINE BACKWARD;control
-008C;PARTIAL LINE UP;control
-008C;PLU;abbreviation
-008D;REVERSE LINE FEED;control
-008D;REVERSE INDEX;control
-008D;RI;abbreviation
-008E;SINGLE SHIFT TWO;control
-008E;SINGLE-SHIFT-2;control
-008E;SS2;abbreviation
-008F;SINGLE SHIFT THREE;control
-008F;SINGLE-SHIFT-3;control
-008F;SS3;abbreviation
-0090;DEVICE CONTROL STRING;control
-0090;DCS;abbreviation
-0091;PRIVATE USE ONE;control
-0091;PRIVATE USE-1;control
-0091;PU1;abbreviation
-0092;PRIVATE USE TWO;control
-0092;PRIVATE USE-2;control
-0092;PU2;abbreviation
-0093;SET TRANSMIT STATE;control
-0093;STS;abbreviation
-0094;CANCEL CHARACTER;control
-0094;CCH;abbreviation
-0095;MESSAGE WAITING;control
-0095;MW;abbreviation
-0096;START OF GUARDED AREA;control
-0096;START OF PROTECTED AREA;control
-0096;SPA;abbreviation
-0097;END OF GUARDED AREA;control
-0097;END OF PROTECTED AREA;control
-0097;EPA;abbreviation
-0098;START OF STRING;control
-0098;SOS;abbreviation
-0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
-0099;SGC;abbreviation
-009A;SINGLE CHARACTER INTRODUCER;control
-009A;SCI;abbreviation
-009B;CONTROL SEQUENCE INTRODUCER;control
-009B;CSI;abbreviation
-009C;STRING TERMINATOR;control
-009C;ST;abbreviation
-009D;OPERATING SYSTEM COMMAND;control
-009D;OSC;abbreviation
-009E;PRIVACY MESSAGE;control
-009E;PM;abbreviation
-009F;APPLICATION PROGRAM COMMAND;control
-009F;APC;abbreviation
-00A0;NBSP;abbreviation
-00AD;SHY;abbreviation
-200B;ZWSP;abbreviation
-200C;ZWNJ;abbreviation
-200D;ZWJ;abbreviation
-200E;LRM;abbreviation
-200F;RLM;abbreviation
-202A;LRE;abbreviation
-202B;RLE;abbreviation
-202C;PDF;abbreviation
-202D;LRO;abbreviation
-202E;RLO;abbreviation
-FEFF;BYTE ORDER MARK;alternate
-FEFF;BOM;abbreviation
-FEFF;ZWNBSP;abbreviation
-END
-
- if ($v_version ge v3.0.0) {
- push @return, split /\n/, <<'END';
-180B; FVS1; abbreviation
-180C; FVS2; abbreviation
-180D; FVS3; abbreviation
-180E; MVS; abbreviation
-202F; NNBSP; abbreviation
-END
- }
-
- if ($v_version ge v3.2.0) {
- push @return, split /\n/, <<'END';
-034F; CGJ; abbreviation
-205F; MMSP; abbreviation
-2060; WJ; abbreviation
-END
- # Add in VS1..VS16
- my $cp = 0xFE00 - 1;
- for my $i (1..16) {
- push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
- }
- }
- if ($v_version ge v4.0.0) { # Add in VS17..VS256
- my $cp = 0xE0100 - 17;
- for my $i (17..256) {
- push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
- }
- }
-
- # ALERT did not come along until 6.0, at which point it became preferred
- # over BELL, and was never in the Unicode_1_Name field. For the same
- # reasons, that the other names are made known to all releases by this
- # function, we make ALERT known too. By inserting it
- # last in early releases, BELL is preferred over it; and vice-vers in 6.0
- my $alert = '0007; ALERT; control';
- if ($v_version lt v6.0.0) {
- push @return, $alert;
- }
- else {
- unshift @return, $alert;
- }
+ #
+ # All these are present in the 6.1 NameAliases.txt
- return @return;
+ return;
}
sub filter_later_version_name_alias_line {
@@ -13030,14 +13177,29 @@ sub filter_later_version_name_alias_line {
sub filter_early_version_name_alias_line {
# Early versions did not have the trailing alias type field; implicitly it
- # was 'correction'. But our synthetic lines we add in this program do
- # have it, so test for the type field.
- $_ .= "; correction" if $_ !~ /;.*;/;
+ # was 'correction'.
+ $_ .= "; correction";
filter_later_version_name_alias_line;
return;
}
+sub filter_all_caps_script_names {
+
+ # Some early Unicode releases had the script names in all CAPS. This
+ # converts them to just the first letter of each word being capital.
+
+ my ($range, $script, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+ my @words = split "_", $script;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $script = join "_", @words;
+ $_ = join ";", $range, $script, @remainder;
+}
+
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
@@ -13188,72 +13350,81 @@ END
# Add any remaining code points to the mapping, using the default for
# missing code points.
my $default_table;
- if (defined (my $default_map = $property->default_map)) {
+ my $default_map = $property->default_map;
+ if ($property_type == $FORCED_BINARY) {
- # Make sure there is a match table for the default
- if (! defined ($default_table = $property->table($default_map))) {
- $default_table = $property->add_match_table($default_map);
+ # A forced binary property creates a 'Y' table that matches all
+ # non-default values. The actual string values are also written out
+ # as a map table. (The default value will almost certainly be the
+ # empty string, so the pod glosses over the distinction, and just
+ # talks about empty vs non-empty.)
+ my $yes = $property->table("Y");
+ foreach my $range ($property->ranges) {
+ next if $range->value eq $default_map;
+ $yes->add_range($range->start, $range->end);
}
+ $property->table("N")->set_complement($yes);
+ }
+ else {
+ if (defined $default_map) {
- # And, if the property is binary, the default table will just
- # be the complement of the other table.
- if ($property_type == $BINARY) {
- my $non_default_table;
-
- # Find the non-default table.
- for my $table ($property->tables) {
- next if $table == $default_table;
- $non_default_table = $table;
+ # Make sure there is a match table for the default
+ if (! defined ($default_table = $property->table($default_map)))
+ {
+ $default_table = $property->add_match_table($default_map);
}
- $default_table->set_complement($non_default_table);
- }
- else {
- # This fills in any missing values with the default. It's not
- # necessary to do this with binary properties, as the default
- # is defined completely in terms of the Y table.
- $property->add_map(0, $MAX_WORKING_CODEPOINT,
- $default_map, Replace => $NO);
- }
- }
+ # And, if the property is binary, the default table will just
+ # be the complement of the other table.
+ if ($property_type == $BINARY) {
+ my $non_default_table;
- # Have all we need to populate the match tables.
- my $maps_should_be_defined = $property->pre_declared_maps;
- foreach my $range ($property->ranges) {
- my $map = $range->value;
- my $table = $property->table($map);
- if (! defined $table) {
+ # Find the non-default table.
+ for my $table ($property->tables) {
+ if ($table == $default_table) {
+ if ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(N No F False);
+ }
+ next;
+ } elsif ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(Y Yes T True);
+ }
+ $non_default_table = $table;
+ }
+ $default_table->set_complement($non_default_table);
+ }
+ else {
- # Integral and rational property values are not necessarily
- # defined in PropValueAliases, but whether all the other ones
- # should be depends on the property.
- if ($maps_should_be_defined
- && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
- {
- Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
+ # This fills in any missing values with the default. It's
+ # not necessary to do this with binary properties, as the
+ # default is defined completely in terms of the Y table.
+ $property->add_map(0, $MAX_WORKING_CODEPOINT,
+ $default_map, Replace => $NO);
}
- $table = $property->add_match_table($map);
}
- next if $table->complement != 0; # Don't need to populate these
- $table->add_range($range->start, $range->end);
- }
+ # Have all we need to populate the match tables.
+ my $maps_should_be_defined = $property->pre_declared_maps;
+ foreach my $range ($property->ranges) {
+ my $map = $range->value;
+ my $table = $property->table($map);
+ if (! defined $table) {
- # A forced binary property has additional true/false tables which
- # should have been set up when it was forced into binary. The false
- # table matches exactly the same set as the property's default table.
- # The true table matches the complement of that. The false table is
- # not the same as an additional set of aliases on top of the default
- # table, so use 'set_equivalent_to'. If it were implemented as
- # additional aliases, various things would have to be adjusted, but
- # especially, if the user wants to get a list of names for the table
- # using Unicode::UCD::prop_value_aliases(), s/he should get a
- # different set depending on whether they want the default table or
- # the false table.
- if ($property_type == $FORCED_BINARY) {
- $property->table('N')->set_equivalent_to($default_table,
- Related => 1);
- $property->table('Y')->set_complement($default_table);
+ # Integral and rational property values are not
+ # necessarily defined in PropValueAliases, but whether all
+ # the other ones should be depends on the property.
+ if ($maps_should_be_defined
+ && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
+ {
+ Carp::my_carp("Table '$property_name=$map' should "
+ . "have been defined. Defining it now.")
+ }
+ $table = $property->add_match_table($map);
+ }
+
+ next if $table->complement != 0; # Don't need to populate these
+ $table->add_range($range->start, $range->end);
+ }
}
# For Perl 5.6 compatibility, all properties matchable in regexes can
@@ -13321,8 +13492,6 @@ END
$gc->table('Ll')->set_caseless_equivalent($LC);
$gc->table('Lu')->set_caseless_equivalent($LC);
- my $Cs = $gc->table('Cs');
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
@@ -13408,6 +13577,26 @@ sub pre_3_dot_1_Nl () {
return $Nl;
}
+sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+ # called before the Cn's are completely filled.
+ # Works on Unicodes earlier than ones that
+ # explicitly specify Cn.
+ return if defined $Assigned;
+
+ if (! defined $gc || $gc->is_empty()) {
+ Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+ }
+
+ $Assigned = $perl->add_match_table('Assigned',
+ Description => "All assigned code points",
+ );
+ while (defined (my $range = $gc->each_range())) {
+ my $standard_value = standardize($range->value);
+ next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+ $Assigned->add_range($range->start, $range->end);
+ }
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
@@ -13446,16 +13635,12 @@ sub compile_perl() {
}
my $Any = $perl->add_match_table('Any',
- Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
+ Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
);
- $Any->add_range(0, 0x10FFFF);
+ $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
- # Assigned is the opposite of gc=unassigned
- my $Assigned = $perl->add_match_table('Assigned',
- Description => "All assigned code points",
- Initialize => ~ $gc->table('Unassigned'),
- );
+ calculate_Assigned();
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
@@ -13501,32 +13686,27 @@ sub compile_perl() {
# There are quite a few code points in Lower, that aren't in gc=lc,
# and not all are in all releases.
- foreach my $code_point ( utf8::unicode_to_native(0xAA),
- utf8::unicode_to_native(0xBA),
- 0x02B0 .. 0x02B8,
- 0x02C0 .. 0x02C1,
- 0x02E0 .. 0x02E4,
- 0x0345,
- 0x037A,
- 0x1D2C .. 0x1D6A,
- 0x1D78,
- 0x1D9B .. 0x1DBF,
- 0x2071,
- 0x207F,
- 0x2090 .. 0x209C,
- 0x2170 .. 0x217F,
- 0x24D0 .. 0x24E9,
- 0x2C7C .. 0x2C7D,
- 0xA770,
- 0xA7F8 .. 0xA7F9,
- ) {
- # Don't include the code point unless it is assigned in this
- # release
- my $category = $gc->value_of(hex $code_point);
- next if ! defined $category || $category eq 'Cn';
-
- $Lower += $code_point;
- }
+ my $temp = Range_List->new(Initialize => [
+ utf8::unicode_to_native(0xAA),
+ utf8::unicode_to_native(0xBA),
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ]);
+ $Lower += $temp & $Assigned;
}
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
@@ -13832,6 +14012,7 @@ sub compile_perl() {
);
$Space->add_alias('XPerlSpace'); # Pre-existing synonyms
$Space->add_alias('SpacePerl');
+ $Space->add_alias('Space') if $v_version lt v4.1.0;
my $Posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
@@ -13847,11 +14028,18 @@ sub compile_perl() {
Initialize => $Cntrl & $ASCII,
);
+ my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
+ if (defined (my $Cs = $gc->table('Cs'))) {
+ $perl_surrogate += $Cs;
+ }
+ else {
+ push @tables_that_may_be_empty, '_Perl_Surrogate';
+ }
+
# $controls is a temporary used to construct Graph.
my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
- + $gc->table('Control'));
- # Cs not in release 1
- $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
+ + $gc->table('Control')
+ + $perl_surrogate);
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
@@ -13957,14 +14145,21 @@ sub compile_perl() {
# 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
# of the MU issue.
foreach my $range ($loc_problem_folds->ranges) {
- foreach my $code_point($range->start .. $range->end) {
+ foreach my $code_point ($range->start .. $range->end) {
my $fold_range = $cf->containing_range($code_point);
next unless defined $fold_range;
+ # Skip if folds to itself
+ next if $fold_range->value eq $CODE_POINT;
+
my @hex_folds = split " ", $fold_range->value;
- my $start_cp = hex $hex_folds[0];
+ my $start_cp = $hex_folds[0];
+ next if $start_cp eq $CODE_POINT;
+ $start_cp = hex $start_cp;
foreach my $i (0 .. @hex_folds - 1) {
- my $cp = hex $hex_folds[$i];
+ my $cp = $hex_folds[$i];
+ next if $cp eq $CODE_POINT;
+ $cp = hex $cp;
next unless $cp > 255; # Already have the < 256 ones
$loc_problem_folds->add_range($cp, $cp);
@@ -13978,9 +14173,13 @@ sub compile_perl() {
Description =>
"Code points whose fold is a string of more than one character",
);
+ if ($v_version lt v3.0.1) {
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ }
# Look through all the known folds to populate these tables.
foreach my $range ($cf->ranges) {
+ next if $range->value eq $CODE_POINT;
my $start = $range->start;
my $end = $range->end;
$any_folds->add_range($start, $end);
@@ -14190,7 +14389,7 @@ sub compile_perl() {
+ utf8::unicode_to_native(0xA0) # NBSP
);
- my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
+ my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
if (@named_sequences) {
push @composition, 'Named_Sequence';
@@ -14201,15 +14400,15 @@ sub compile_perl() {
my $alias_sentence = "";
my %abbreviations;
- my $alias = property_ref('Name_Alias');
- $perl_charname->set_proxy_for('Name_Alias');
-
- # Add each entry in Name_Alias to Perl_Charnames. Where these go with
- # respect to any existing entry depends on the entry type. Corrections go
- # before said entry, as they should be returned in preference over the
- # existing entry. (A correction to a correction should be later in the
- # Name_Alias table, so it will correctly precede the erroneous correction
- # in Perl_Charnames.)
+ my $alias = property_ref('_Perl_Name_Alias');
+ $perl_charname->set_proxy_for('_Perl_Name_Alias');
+
+ # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go
+ # with respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the _Perl_Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
#
# Abbreviations go after everything else, so they are saved temporarily in
# a hash for later.
@@ -14244,7 +14443,7 @@ sub compile_perl() {
$perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries that are
+The _Perl_Name_Alias property adds duplicate code point entries that are
alternatives to the original name. If an addition is a corrected
name, it will be physically first in the table. The original (less correct,
but still valid) name will be next; then any alternatives, in no particular
@@ -14252,8 +14451,9 @@ order; and finally any abbreviations, again in no particular order.
END
# Now add the Unicode_1 names for the controls. The Unicode_1 names had
- # precedence before 6.1, so should be first in the file; the other names
- # have precedence starting in 6.1,
+ # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
+ # so should be first in the file; the other names have precedence starting
+ # in 6.1,
my $before_or_after = ($v_version lt v6.1.0)
? $MULTIPLE_BEFORE
: $MULTIPLE_AFTER;
@@ -14283,12 +14483,6 @@ END
Replace => $before_or_after);
}
- # But in this version only, the ALERT has precedence over BELL, the
- # Unicode_1_Name that would otherwise have precedence.
- if ($v_version eq v6.0.0) {
- $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
- }
-
# Now that have everything added, add in abbreviations after
# everything else. Sort so results don't change between runs of this
# program
@@ -14398,6 +14592,25 @@ END
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ my $patws = $perl->add_match_table('_Perl_PatWS',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
+ $patws->initialize($off_patws->table('Y'));
+ }
+ else {
+ $patws->initialize([ ord("\t"),
+ ord("\n"),
+ utf8::unicode_to_native(0x0B), # VT
+ ord("\f"),
+ ord("\r"),
+ ord(" "),
+ utf8::unicode_to_native(0x85), # NEL
+ 0x200E..0x200F, # Left, Right marks
+ 0x2028..0x2029 # Line, Paragraph seps
+ ] );
+ }
+
# See L<perlfunc/quotemeta>
my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
Perl_Extension => 1,
@@ -14406,22 +14619,78 @@ END
# Initialize to what's common in
# all Unicode releases.
Initialize =>
- $Space
- + $gc->table('Control')
+ $gc->table('Control')
+ + $Space
+ + $patws
+ + ((~ $Word) & $ASCII)
);
- # In early releases without the proper Unicode properties, just set to \W.
- if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
- || ! defined (my $patws = property_ref('Pattern_White_Space'))
- || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
- {
- $quotemeta += ~ $Word;
+ if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
+ $quotemeta += $patsyn->table('Y');
}
else {
- $quotemeta += $patsyn->table('Y')
- + $patws->table('Y')
- + $di->table('Y')
- + ((~ $Word) & $ASCII);
+ $quotemeta += ((~ $Word) & Range->new(0, 255))
+ - utf8::unicode_to_native(0xA8)
+ - utf8::unicode_to_native(0xAF)
+ - utf8::unicode_to_native(0xB2)
+ - utf8::unicode_to_native(0xB3)
+ - utf8::unicode_to_native(0xB4)
+ - utf8::unicode_to_native(0xB7)
+ - utf8::unicode_to_native(0xB8)
+ - utf8::unicode_to_native(0xB9)
+ - utf8::unicode_to_native(0xBC)
+ - utf8::unicode_to_native(0xBD)
+ - utf8::unicode_to_native(0xBE);
+ $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
+ # same in all releases
+ 0x2010 .. 0x2027,
+ 0x2030 .. 0x203E,
+ 0x2041 .. 0x2053,
+ 0x2055 .. 0x205E,
+ 0x2190 .. 0x245F,
+ 0x2500 .. 0x2775,
+ 0x2794 .. 0x2BFF,
+ 0x2E00 .. 0x2E7F,
+ 0x3001 .. 0x3003,
+ 0x3008 .. 0x3020,
+ 0x3030 .. 0x3030,
+ 0xFD3E .. 0xFD3F,
+ 0xFE45 .. 0xFE46
+ ];
+ }
+
+ if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
+ $quotemeta += $di->table('Y')
+ }
+ else {
+ if ($v_version ge v2.0) {
+ $quotemeta += $gc->table('Cf')
+ + $gc->table('Cs');
+ }
+ $quotemeta += $gc->table('Cc')
+ - $Space;
+ my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
+ 0x2060 .. 0x206F,
+ 0xFE00 .. 0xFE0F,
+ 0xFFF0 .. 0xFFFB,
+ 0xE0000 .. 0xE0FFF,
+ ]);
+ $quotemeta += $temp & $Assigned;
+ }
+
+ my $nchar = $perl->add_match_table('_Perl_Nchar',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_nchar = property_ref('Nchar'))) {
+ $nchar->initialize($off_nchar->table('Y'));
+ }
+ else {
+ $nchar->initialize([ 0xFFFE .. 0xFFFF ]);
+ if ($v_version ge v2.0) { # First release with these nchars
+ for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
+ $nchar += [ $i .. $i+1 ];
+ }
+ }
}
# Finished creating all the perl properties. All non-internal non-string
@@ -14743,14 +15012,15 @@ sub register_file_for_name($$$) {
my $file = shift; # The file name in the final directory.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
+ trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
if ($table->isa('Property')) {
$table->set_file_path(@$directory_ref, $file);
push @map_properties, $table;
# No swash means don't do the rest of this.
- return if $table->fate != $ORDINARY;
+ return if $table->fate != $ORDINARY
+ && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
# Get the path to the file
my @path = $table->file_path;
@@ -14764,7 +15034,12 @@ sub register_file_for_name($$$) {
# property's map table
foreach my $alias ($table->aliases) {
my $name = $alias->name;
- $loose_property_to_file_of{standardize($name)} = $file;
+ if ($name =~ /^_/) {
+ $strict_property_to_file_of{lc $name} = $file;
+ }
+ else {
+ $loose_property_to_file_of{standardize($name)} = $file;
+ }
}
# And a way for utf8_heavy to find the proper key in the SwashInfo
@@ -14972,7 +15247,22 @@ sub register_file_for_name($$$) {
# Remove interior underscores.
(my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
- # Change any non-word character into an underscore, and truncate to 8.
+ # Convert the dot in floating point numbers to an underscore
+ $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
+
+ my $suffix = "";
+
+ # Extract any suffix, delete any non-word character, and truncate to 3
+ # after the dot
+ if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
+ $filename = $1;
+ $suffix = $2;
+ $suffix =~ s/\W+//g;
+ substr($suffix, 4) = "" if length($suffix) > 4;
+ }
+
+ # Change any non-word character outside the suffix into an underscore,
+ # and truncate to 8.
$filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
substr($filename, 8) = "" if length($filename) > 8;
@@ -14984,7 +15274,7 @@ sub register_file_for_name($$$) {
# InGreekE
# InGreek2
my $warned = 0;
- while (my $num = $base_names{$path}{lc $filename}++) {
+ while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
$num++; # so basenames with numbers start with '2', which
# just looks more natural.
@@ -15491,9 +15781,9 @@ sub make_ucd_table_pod_entries {
$$info_ref .= $full_name;
}
- # And the full-name entry includes the short name, if different
+ # And the full-name entry includes the short name, if shorter
if ($info_ref == \$full_info
- && $standard_short_name ne $standard_full_name)
+ && length $standard_short_name < length $standard_full_name)
{
$full_info =~ s/\.\Z//;
$full_info .= " " if $full_info;
@@ -15517,6 +15807,17 @@ sub make_ucd_table_pod_entries {
$full_info .= ". " if $full_info;
$full_info .= $more_info;
}
+ if ($table->property->type == $FORCED_BINARY) {
+ if ($full_info) {
+ $full_info =~ s/\.\Z//;
+ $full_info .= ". ";
+ }
+ $full_info .= "This is a combination property which has both:"
+ . " 1) a map to various string values; and"
+ . " 2) a map to boolean Y/N, where 'Y' means the"
+ . " string value is non-empty. Add the prefix 'is'"
+ . " to the prop_invmap() call to get the latter";
+ }
# These keep track if have created full and short name pod entries for the
# property
@@ -15548,6 +15849,9 @@ sub make_ucd_table_pod_entries {
$info = $other_info;
}
+ $combination_property{$standard} = 1
+ if $table->property->type == $FORCED_BINARY;
+
# Here, we have set up the two columns for this entry. But if an
# entry already exists for this name, we have to decide which one
# we're going to later output.
@@ -15614,9 +15918,9 @@ sub pod_alphanumeric_sort {
# The first few character columns are filler, plus the '\p{'; and get rid
# of all the trailing stuff, starting with the trailing '}', so as to sort
# on just 'Name=Value'
- (my $a = lc $a) =~ s/^ .*? { //x;
+ (my $a = lc $a) =~ s/^ .*? \{ //x;
$a =~ s/}.*//;
- (my $b = lc $b) =~ s/^ .*? { //x;
+ (my $b = lc $b) =~ s/^ .*? \{ //x;
$b =~ s/}.*//;
# Determine if the two operands are both internal only or both not.
@@ -15761,6 +16065,7 @@ END
# The sort will cause the alphabetically first properties to be added to
# each list first, so each list will be sorted.
foreach my $property (sort keys %why_suppressed) {
+ next unless $why_suppressed{$property};
push @{$why_list{$why_suppressed{$property}}}, $property;
}
@@ -15823,13 +16128,12 @@ END
}
# Similiarly, generate a list of files that we don't use, grouped by the
- # reasons why. First, create a hash whose keys are the reasons, and whose
- # values are anonymous arrays of all the files that share that reason.
+ # reasons why (Don't output if the reason is empty). First, create a hash
+ # whose keys are the reasons, and whose values are anonymous arrays of all
+ # the files that share that reason.
my %grouped_by_reason;
- foreach my $file (keys %ignored_files) {
- push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
- }
foreach my $file (keys %skipped_files) {
+ next unless $skipped_files{$file};
push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
}
@@ -16284,6 +16588,10 @@ sub make_Heavy () {
= simple_dumper(\%loose_property_name_of, ' ' x 4);
chomp $loose_property_name_of;
+ my $strict_property_name_of
+ = simple_dumper(\%strict_property_name_of, ' ' x 4);
+ chomp $strict_property_name_of;
+
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
@@ -16322,6 +16630,10 @@ sub make_Heavy () {
= simple_dumper(\%loose_property_to_file_of, ' ' x 4);
chomp $loose_property_to_file_of;
+ my $strict_property_to_file_of
+ = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
+ chomp $strict_property_to_file_of;
+
my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
chomp $file_to_swash_name;
@@ -16337,6 +16649,11 @@ $INTERNAL_ONLY_HEADER
$loose_property_name_of
);
+# Same, but strict names
+\%utf8::strict_property_name_of = (
+$strict_property_name_of
+);
+
# Gives the definitions (in the form of inversion lists) for those properties
# whose definitions aren't kept in files
\@utf8::inline_definitions = (
@@ -16385,6 +16702,11 @@ $caseless_equivalent_to
$loose_property_to_file_of
);
+# Property names to mapping files
+\%utf8::strict_property_to_file_of = (
+$strict_property_to_file_of
+);
+
# Files to the swash names within them.
\%utf8::file_to_swash_name = (
$file_to_swash_name
@@ -16756,8 +17078,8 @@ sub make_UCD () {
# an element for the Hangul syllables in the appropriate place, and
# otherwise changes the name to include the "-<code point>" suffix.
my @algorithm_names;
- my $done_hangul = 0;
-
+ my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
+ # along in this version
# Copy it linearly.
for my $i (0 .. @code_points_ending_in_code_point - 1) {
@@ -16808,6 +17130,9 @@ sub make_UCD () {
my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
chomp $ambiguous_names;
+ my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
+ chomp $combination_property;
+
my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
chomp $loose_defaults;
@@ -16872,6 +17197,13 @@ $ambiguous_names
$loose_defaults
);
+# The properties that are combinations, in that they have both a map table and
+# a match table. This is actually for UCD.t, so it knows how to test for
+# these.
+\%Unicode::UCD::combination_property = (
+$combination_property
+);
+
# All combinations of names that are suppressed.
# This is actually for UCD.t, so it knows which properties shouldn't have
# entries. If it got any bigger, would probably want to put it in its own
@@ -16972,14 +17304,8 @@ sub write_all_tables() {
# with it or not.
my $expected_empty =
- # $perl should be empty, as well as properties that we just
- # don't do anything with
- ($is_property
- && ($table == $perl
- || grep { $complete_name eq $_ }
- @unimplemented_properties
- )
- )
+ # $perl should be empty
+ ($is_property && ($table == $perl))
# Match tables in properties we skipped populating should be
# empty
@@ -17045,7 +17371,7 @@ sub write_all_tables() {
: ($is_property)
? # All these types of map tables will be full because
# they will have been populated with defaults
- ($type == $ENUM || $type == $FORCED_BINARY)
+ ($type == $ENUM)
: # A match table should match everything if its method
# shows it should
@@ -17206,12 +17532,14 @@ sub write_all_tables() {
}
}
else {
- if (exists ($loose_property_name_of{$alias_standard}))
- {
- Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
+ my $hash_ref = ($alias_standard =~ /^_/)
+ ? \%strict_property_name_of
+ : \%loose_property_name_of;
+ if (exists $hash_ref->{$alias_standard}) {
+ Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
}
else {
- $loose_property_name_of{$alias_standard}
+ $hash_ref->{$alias_standard}
= $standard_property_name;
}
@@ -17462,7 +17790,7 @@ sub generate_tests($$$$$) {
my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(1, $valid_code, '\\p{$name}', $warning);
Expect(0, $valid_code, '\\p{^$name}', $warning);
Expect(0, $valid_code, '\\P{$name}', $warning);
@@ -17470,7 +17798,7 @@ Expect(1, $valid_code, '\\P{^$name}', $warning);
EOC
}
if (defined $invalid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(0, $invalid_code, '\\p{$name}', $warning);
Expect(1, $invalid_code, '\\p{^$name}', $warning);
Expect(1, $invalid_code, '\\P{$name}', $warning);
@@ -17708,8 +18036,10 @@ sub make_property_test_script() {
# Test each possible combination of the property's aliases with
# the table's. If this gets to be too many, could do what is done
# in the set_final_comment() for Tables
- my @table_aliases = $table->aliases;
- my @property_aliases = $table->property->aliases;
+ my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
+ next unless @table_aliases;
+ my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
+ next unless @property_aliases;
# Every property can be optionally be prefixed by 'Is_', so test
# that those work, by creating such a new alias for each
@@ -17941,84 +18271,97 @@ END
return;
}
+# Skip reasons, so will be exact same text and hence the files with each
+# reason will get grouped together in perluniprops.
+my $Documentation = "Documentation";
+my $Indic_Skip
+ = "Provisional; for the analysis and processing of Indic scripts";
+my $Validation = "Validation Tests";
+my $Validation_Documentation = "Documentation of validation Tests";
+
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
-# desired, but the v0 files should be first, and the extracted before the
-# others except DAge.txt (as data in an extracted file can be over-ridden by
-# the non-extracted. Some other files depend on data derived from an earlier
-# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it is safest to order by first
-# version releases in (except the Jamo). DAge.txt is read before the
-# extracted ones because of the rarely used feature $compare_versions. In the
-# unlikely event that there were ever an extracted file that contained the Age
-# property information, it would have to go in front of DAge.
+# desired, but the PropertyAliases and PropValueAliases files should be first,
+# and the extracted before the others except DAge.txt (as data in an extracted
+# file can be over-ridden by the non-extracted. Some other files depend on
+# data derived from an earlier file, like UnicodeData requires data from Jamo,
+# and the case changing and folding requires data from Unicode. Mostly, it is
+# safest to order by first version releases in (except the Jamo). DAge.txt is
+# read before the extracted ones because of the rarely used feature
+# $compare_versions. In the unlikely event that there were ever an extracted
+# file that contained the Age property information, it would have to go in
+# front of DAge.
#
# The version strings allow the program to know whether to expect a file or
# not, but if a file exists in the directory, it will be processed, even if it
# is in a version earlier than expected, so you can copy files from a later
# release into an earlier release's directory.
my @input_file_objects = (
- Input_file->new('PropertyAliases.txt', v0,
+ Input_file->new('PropertyAliases.txt', v3.2,
Handler => \&process_PropertyAliases,
- ),
+ Early => [ \&substitute_PropertyAliases ],
+ Required_Even_in_Debug_Skip => 1,
+ ),
Input_file->new(undef, v0, # No file associated with this
Progress_Message => 'Finishing property setup',
Handler => \&finish_property_setup,
- ),
- Input_file->new('PropValueAliases.txt', v0,
+ ),
+ Input_file->new('PropValueAliases.txt', v3.2,
Handler => \&process_PropValueAliases,
+ Early => [ \&substitute_PropValueAliases ],
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ Required_Even_in_Debug_Skip => 1,
+ ),
Input_file->new('DAge.txt', v3.2.0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Age'
- ),
+ ),
Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
Property => 'General_Category',
- ),
+ ),
Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
Property => 'Canonical_Combining_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
Property => 'Numeric_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
Property => 'Line_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
Property => 'Bidi_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
Property => 'Decomposition_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
Property => 'Numeric_Value',
Each_Line_Handler => \&filter_numeric_value_line,
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
Property => 'Joining_Group',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
- ),
+ ),
Input_file->new('UnicodeData.txt', v1.1.5,
Pre_Handler => \&setup_UnicodeData,
@@ -18053,7 +18396,12 @@ my @input_file_objects = (
\&filter_UnicodeData_line,
],
EOF_Handler => \&EOF_UnicodeData,
- ),
+ ),
+ Input_file->new('CJKXREF.TXT', v1.1.5,
+ Withdrawn => v2.0.0,
+ Skip => 'Gives the mapping of CJK code points '
+ . 'between Unicode and various other standards',
+ ),
Input_file->new('ArabicShaping.txt', v2.0.0,
Each_Line_Handler =>
($v_version lt 4.1.0)
@@ -18063,29 +18411,46 @@ my @input_file_objects = (
# not used by Perl
Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Blocks.txt', v2.0.0,
Property => 'Block',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_blocks_lines
- ),
+ ),
+ Input_file->new('Index.txt', v2.0.0,
+ Skip => 'Alphabetical index of Unicode characters',
+ ),
+ Input_file->new('NamesList.txt', v2.0.0,
+ Skip => 'Annotated list of characters',
+ ),
Input_file->new('PropList.txt', v2.0.0,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_old_style_proplist
: undef),
- ),
+ ),
+ Input_file->new('Props.txt', v2.0.0,
+ Withdrawn => v3.0.0,
+ Skip => 'A subset of F<PropList.txt> (which is used instead)',
+ ),
+ Input_file->new('ReadMe.txt', v2.0.0,
+ Skip => $Documentation,
+ ),
Input_file->new('Unihan.txt', v2.0.0,
+ Withdrawn => v5.2.0,
+ Construction_Time_Handler => \&construct_unihan,
Pre_Handler => \&setup_unihan,
- Optional => 1,
+ Optional => [ "",
+ 'Unicode_Radical_Stroke'
+ ],
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => ($v_version eq 2.1.8)
? \&filter_2_1_8_special_casing_line
: \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
Input_file->new(
'LineBreak.txt', v3.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
@@ -18094,7 +18459,7 @@ my @input_file_objects = (
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
@@ -18102,10 +18467,14 @@ my @input_file_objects = (
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('CompositionExclusions.txt', v3.0.0,
Property => 'Composition_Exclusion',
- ),
+ ),
+ Input_file->new('UnicodeData.html', v3.0.0,
+ Withdrawn => v4.0.1,
+ Skip => $Documentation,
+ ),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
Has_Missings_Defaults => ($v_version lt v6.2.0)
@@ -18114,12 +18483,15 @@ my @input_file_objects = (
# anything to us, we will use the
# null string
: $IGNORED,
-
- ),
- Input_file->new("NormTest.txt", v3.0.0,
- Handler => \&process_NormalizationsTest,
- Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
- ),
+ ),
+ Input_file->new('NamesList.html', v3.0.0,
+ Skip => 'Describes the format and contents of '
+ . 'F<NamesList.txt>',
+ ),
+ Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
+ Withdrawn => v5.1,
+ Skip => $Documentation,
+ ),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
@@ -18129,105 +18501,222 @@ my @input_file_objects = (
\&filter_case_folding_line
],
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
+ Input_file->new("NormTest.txt", v3.0.1,
+ Handler => \&process_NormalizationsTest,
+ Skip => ($make_norm_test_script) ? 0 : $Validation,
+ ),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
Has_Missings_Defaults => (($v_version ge v5.2.0)
? $NOT_IGNORED
: $NO_DEFAULTS),
- ),
+ ),
+ Input_file->new('DProperties.html', v3.1.0,
+ Withdrawn => v3.2.0,
+ Skip => $Documentation,
+ ),
+ Input_file->new('PropList.html', v3.1.0,
+ Withdrawn => v5.1,
+ Skip => $Documentation,
+ ),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
+ Each_Line_Handler => (($v_version le v4.0.0)
+ ? \&filter_all_caps_script_names
+ : undef),
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('DNormalizationProps.txt', v3.1.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v4.0.1)
? \&filter_old_style_normalization_lines
: undef),
- ),
- Input_file->new('HangulSyllableType.txt', v0,
+ ),
+ Input_file->new('DerivedProperties.html', v3.1.1,
+ Withdrawn => v5.1,
+ Skip => $Documentation,
+ ),
+ Input_file->new('HangulSyllableType.txt', v4.0,
Has_Missings_Defaults => $NOT_IGNORED,
- Property => 'Hangul_Syllable_Type',
- Pre_Handler => ($v_version lt v4.0.0)
- ? \&generate_hst
- : undef,
- ),
+ Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
+ Property => 'Hangul_Syllable_Type'
+ ),
+ Input_file->new('NormalizationCorrections.txt', v3.2.0,
+ # This documents the cumulative fixes to erroneous
+ # normalizations in earlier Unicode versions. Its main
+ # purpose is so that someone running on an earlier
+ # version can use this file to override what got
+ # published in that earlier release. It would be easy
+ # for mktables to handle this file. But all the
+ # corrections in it should already be in the other files
+ # for the release it is. To get it to actually mean
+ # something useful, someone would have to be using an
+ # earlier Unicode release, and copy it into the directory
+ # for that release and recomplile. So far there has been
+ # no demand to do that, so this hasn't been implemented.
+ Skip => 'Documentation of corrections already '
+ . 'incorporated into the Unicode data base',
+ ),
+ Input_file->new('StandardizedVariants.html', v3.2.0,
+ Skip => 'Provides a visual display of the standard '
+ . 'variant sequences derived from '
+ . 'F<StandardizedVariants.txt>.',
+ # I don't know why the html came earlier than the
+ # .txt, but both are skipped anyway, so it doesn't
+ # matter.
+ ),
+ Input_file->new('StandardizedVariants.txt', v4.0.0,
+ Skip => 'Certain glyph variations for character display '
+ . 'are standardized. This lists the non-Unihan '
+ . 'ones; the Unihan ones are also not used by '
+ . 'Perl, and are in a separate Unicode data base '
+ . 'L<http://www.unicode.org/ivd>',
+ ),
+ Input_file->new('UCD.html', v4.0.0,
+ Withdrawn => v5.2,
+ Skip => $Documentation,
+ ),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
- Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
+ ),
+ Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
+ Early => [ \&generate_GCB, '_Perl_GCB' ],
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- Pre_Handler => ($v_version lt v4.1.0)
- ? \&generate_GCB
- : undef,
- ),
+ ),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
- ),
- Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
- ),
+ ),
+ Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
+ Skip => $Validation_Documentation,
+ ),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
Handler => \&process_SB_test,
- ),
+ ),
+ Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
+ Skip => $Validation_Documentation,
+ ),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
Handler => \&process_WB_test,
- ),
+ ),
+ Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
+ Skip => $Validation_Documentation,
+ ),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
+ Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
- ),
- Input_file->new('NameAliases.txt', v0,
+ ),
+ Input_file->new('Unihan.html', v4.1.0,
+ Withdrawn => v5.2,
+ Skip => $Documentation,
+ ),
+ Input_file->new('NameAliases.txt', v5.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version le v6.0.0)
- ? \&setup_early_name_alias
- : undef,
Each_Line_Handler => ($v_version le v6.0.0)
? \&filter_early_version_name_alias_line
: \&filter_later_version_name_alias_line,
- ),
+ ),
+ # NameAliases.txt came along in v5.0. The above constructor handles
+ # this. But until 6.1, it was lacking some information needed by core
+ # perl. The constructor below handles that. It is either a kludge or
+ # clever, depending on your point of view. The 'Withdrawn' parameter
+ # indicates not to use it at all starting in 6.1 (so the above
+ # constructor applies), and the 'v6.1' parameter indicates to use the
+ # Early parameter before 6.1. Therefore 'Early" is always used,
+ # yielding the internal-only property '_Perl_Name_Alias', which it
+ # gets from a NameAliases.txt from 6.1 or later stored in
+ # N_Asubst.txt. In combination with the above constructor,
+ # 'Name_Alias' is publicly accessible starting with v5.0, and the
+ # better 6.1 version is accessible to perl core in all releases.
+ Input_file->new("NameAliases.txt", v6.1,
+ Withdrawn => v6.1,
+ Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
+ Property => 'Name_Alias',
+ EOF_Handler => \&fixup_early_perl_name_alias,
+ Each_Line_Handler =>
+ \&filter_later_version_name_alias_line,
+ ),
+ Input_file->new('NamedSqProv.txt', v5.0.0,
+ Skip => 'Named sequences proposed for inclusion in a '
+ . 'later version of the Unicode Standard; if you '
+ . 'need them now, you can append this file to '
+ . 'F<NamedSequences.txt> and recompile perl',
+ ),
+ Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
+ Skip => $Validation,
+ ),
+ Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
+ Skip => $Validation_Documentation,
+ ),
Input_file->new("BidiTest.txt", v5.2.0,
- Skip => 'Validation Tests',
- ),
+ Skip => $Validation,
+ ),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
- Optional => 1,
+ Optional => "",
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
- Optional => 1,
+ Optional => "",
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanIRGSources.txt', v5.2.0,
- Optional => 1,
+ Optional => [ "",
+ 'kCompatibilityVariant',
+ 'kIICore',
+ 'kIRG_GSource',
+ 'kIRG_HSource',
+ 'kIRG_JSource',
+ 'kIRG_KPSource',
+ 'kIRG_MSource',
+ 'kIRG_KSource',
+ 'kIRG_TSource',
+ 'kIRG_USource',
+ 'kIRG_VSource',
+ ],
Pre_Handler => \&setup_unihan,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanNumericValues.txt', v5.2.0,
- Optional => 1,
+ Optional => [ "",
+ 'kAccountingNumeric',
+ 'kOtherNumeric',
+ 'kPrimaryNumeric',
+ ],
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanOtherMappings.txt', v5.2.0,
- Optional => 1,
+ Optional => "",
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
- Optional => 1,
+ Optional => [ "",
+ 'Unicode_Radical_Stroke'
+ ],
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanReadings.txt', v5.2.0,
- Optional => 1,
+ Optional => "",
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanVariants.txt', v5.2.0,
- Optional => 1,
+ Optional => "",
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
+ Input_file->new('CJKRadicals.txt', v5.2.0,
+ Skip => 'Maps the kRSUnicode property values to '
+ . 'corresponding code points',
+ ),
+ Input_file->new('EmojiSources.txt', v6.0.0,
+ Skip => 'Maps certain Unicode code points to their '
+ . 'legacy Japanese cell-phone values',
+ ),
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
@@ -18235,39 +18724,74 @@ my @input_file_objects = (
Has_Missings_Defaults => (($v_version le v6.0.0)
? $NO_DEFAULTS
: $IGNORED),
- ),
- # The two Indic files are actually available starting in v6.0.0, but their
- # property values are missing from PropValueAliases.txt in that release,
- # so that further work would have to be done to get them to work properly
- # for that release.
- Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ ),
+ # These two Indic files are actually not usable as-is until 6.1.0,
+ # because their property values are missing from PropValueAliases.txt
+ # until that release, so that further work would have to be done to get
+ # them to work properly, which isn't worth it because of them being
+ # provisional.
+ Input_file->new('IndicMatraCategory.txt', v6.0.0,
+ Withdrawn => v8.0.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Withdrawn by Unicode while still provisional",
- ),
- Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Skip => $Indic_Skip,
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
Skip => (($v_version lt v8.0.0)
- ? "Provisional; for the analysis and processing of Indic scripts"
+ ? $Indic_Skip
: 0),
- ),
+ ),
+ Input_file->new('USourceData.txt', v6.2.0,
+ Skip => 'Documentation of status and cross reference of '
+ . 'proposals for encoding by Unicode of Unihan '
+ . 'characters',
+ ),
+ Input_file->new('USourceGlyphs.pdf', v6.2.0,
+ Skip => 'Pictures of the characters in F<USourceData.txt>',
+ ),
Input_file->new('BidiBrackets.txt', v6.3.0,
- Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
+ Properties => [ 'Bidi_Paired_Bracket',
+ 'Bidi_Paired_Bracket_Type'
+ ],
Has_Missings_Defaults => $NO_DEFAULTS,
- ),
+ ),
Input_file->new("BidiCharacterTest.txt", v6.3.0,
- Skip => 'Validation Tests',
- ),
+ Skip => $Validation,
+ ),
Input_file->new('IndicPositionalCategory.txt', v8.0.0,
Property => 'Indic_Positional_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
);
# End of all the preliminaries.
# Do it...
+if (@missing_early_files) {
+ print simple_fold(join_lines(<<END
+
+The compilation cannot be completed because one or more required input files,
+listed below, are missing. This is because you are compiling Unicode version
+$string_version, which predates the existence of these file(s). To fully
+function, perl needs the data that these files would have contained if they
+had been in this release. To work around this, create copies of later
+versions of the missing files in the directory containing '$0'. (Perl will
+make the necessary adjustments to the data to compensate for it not being the
+same version as is being compiled.) The files are available from unicode.org,
+via either ftp or http. If using http, they will be under
+www.unicode.org/versions/. Below are listed the source file name of each
+missing file, the Unicode version to copy it from, and the name to store it
+as. (Note that the listed source file name may not be exactly the one that
+Unicode calls it. If you don't find it, you can look it up in 'README.perl'
+to get the correct name.)
+END
+ ));
+ print simple_fold(join_lines("\n$_")) for @missing_early_files;
+ exit 2;
+}
+
if ($compare_versions) {
Carp::my_carp(<<END
Warning. \$compare_versions is set. Output is not suitable for production
@@ -18276,17 +18800,13 @@ END
}
# Put into %potential_files a list of all the files in the directory structure
-# that could be inputs to this program, excluding those that we should ignore.
-# Use absolute file names because it makes it easier across machine types.
-my @ignored_files_full_names = map { File::Spec->rel2abs(
- internal_file_to_platform($_))
- } keys %ignored_files;
+# that could be inputs to this program
File::Find::find({
wanted=>sub {
- return unless /\.txt$/i; # Some platforms change the name's case
+ return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the
+ # name's case
my $full = lc(File::Spec->rel2abs($_));
- $potential_files{$full} = 1
- if ! grep { $full eq lc($_) } @ignored_files_full_names;
+ $potential_files{$full} = 1;
return;
}
}, File::Spec->curdir());
@@ -18344,8 +18864,7 @@ else {
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
- $potential_files{lc $full} = 1
- if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
+ $potential_files{lc $full} = 1;
}
}
@@ -18422,7 +18941,7 @@ my @input_files = qw(version Makefile);
foreach my $object (@input_file_objects) {
my $file = $object->file;
next if ! defined $file; # Not all objects have files
- next if $object->optional && ! -e $file;
+ next if defined $object->skip;;
push @input_files, $file;
}
@@ -18444,7 +18963,6 @@ foreach my $in (@input_files) {
my ($volume, $directories, $file ) = File::Spec->splitpath($in);
$directories =~ s;/$;;; # Can have extraneous trailing '/'
my @directories = File::Spec->splitdir($directories);
- my $base = $file =~ s/\.txt$//;
construct_filename($file, 'mutable', \@directories);
}
}
@@ -18593,6 +19111,16 @@ use warnings;
my $Tests = 0;
my $Fails = 0;
+# loc_tools.pl requires this function to be defined
+sub ok($@) {
+ my ($pass, @msg) = @_;
+ print "not " unless $pass;
+ print "ok ";
+ print ++$Tests;
+ print " - ", join "", @msg if @msg;
+ print "\n";
+}
+
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 1ba73b2874..51137b5623 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -404,7 +404,11 @@ sub _loose_name ($) {
# If didn't find it, try again with looser matching by editing
# out the applicable characters on the rhs and looking up
# again.
+ my $strict_property_and_table;
if (! defined $file) {
+
+ # This isn't used unless the name begins with 'to'
+ $strict_property_and_table = $property_and_table =~ s/^to//r;
$table = _loose_name($table);
$property_and_table = "$prefix$table";
print STDERR __LINE__, ": $property_and_table\n" if DEBUG;
@@ -443,61 +447,51 @@ sub _loose_name ($) {
## is to use Unicode::UCD.
##
# Only check if caller wants non-binary
- my $retried = 0;
- if ($minbits != 1 && $property_and_table =~ s/^to//) {{
+ if ($minbits != 1) {
+ if ($property_and_table =~ s/^to//) {
# Look input up in list of properties for which we have
- # mapping files.
- if (defined ($file =
+ # mapping files. First do it with the strict approach
+ if (defined ($file = $utf8::strict_property_to_file_of{
+ $strict_property_and_table}))
+ {
+ $type = $utf8::file_to_swash_name{$file};
+ print STDERR __LINE__, ": type set to $type\n"
+ if DEBUG;
+ $file = "$unicore_dir/$file.pl";
+ last GETFILE;
+ }
+ elsif (defined ($file =
$utf8::loose_property_to_file_of{$property_and_table}))
- {
- $type = $utf8::file_to_swash_name{$file};
- print STDERR __LINE__, ": type set to $type\n" if DEBUG;
- $file = "$unicore_dir/$file.pl";
- last GETFILE;
- } # If that fails see if there is a corresponding binary
- # property file
- elsif (defined ($file =
- $utf8::loose_to_file_of{$property_and_table}))
- {
+ {
+ $type = $utf8::file_to_swash_name{$file};
+ print STDERR __LINE__, ": type set to $type\n"
+ if DEBUG;
+ $file = "$unicore_dir/$file.pl";
+ last GETFILE;
+ } # If that fails see if there is a corresponding binary
+ # property file
+ elsif (defined ($file =
+ $utf8::loose_to_file_of{$property_and_table}))
+ {
- # Here, there is no map file for the property we are
- # trying to get the map of, but this is a binary
- # property, and there is a file for it that can easily
- # be translated to a mapping.
-
- # In the case of properties that are forced to binary,
- # they are a combination. We return the actual
- # mapping instead of the binary. If the input is
- # something like 'Tocjkkiicore', it will be found in
- # %loose_property_to_file_of above as => 'To/kIICore'.
- # But the form like ToIskiicore won't be. To fix
- # this, it was easiest to do it here. These
- # properties are the complements of the default
- # property, so there is an entry in %loose_to_file_of
- # that is 'iskiicore' => '!kIICore/N', If we find such
- # an entry, strip off things and try again, which
- # should find the entry in %loose_property_to_file_of.
- # Actual binary properties that are of this form, such
- # as this entry: 'ishrkt' => '!Perl/Any' will also be
- # retried, but won't be in %loose_property_to_file_of,
- # and instead the next time through, it will find
- # 'hrkt' => '!Perl/Any' and proceed.
- redo if ! $retried
- && $file =~ /^!/
- && $property_and_table =~ s/^is//;
-
- # This is a binary property. Setting this here causes
- # it to be stored as such in the cache, so if someone
- # comes along later looking for just a binary, they
- # get it.
- $minbits = 1;
-
- # The 0+ makes sure is numeric
- $invert_it = 0 + $file =~ s/!//;
- $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!;
- last GETFILE;
+ # Here, there is no map file for the property we
+ # are trying to get the map of, but this is a
+ # binary property, and there is a file for it that
+ # can easily be translated to a mapping, so use
+ # that, treating this as a binary property.
+ # Setting 'minbits' here causes it to be stored as
+ # such in the cache, so if someone comes along
+ # later looking for just a binary, they get it.
+ $minbits = 1;
+
+ # The 0+ makes sure is numeric
+ $invert_it = 0 + $file =~ s/!//;
+ $file = "$unicore_dir/lib/$file.pl"
+ unless $file =~ m!^#/!;
+ last GETFILE;
+ }
}
- } }
+ }
##
## If we reach this line, it's because we couldn't figure
diff --git a/perl.c b/perl.c
index cbb66e0730..3cdae043f1 100644
--- a/perl.c
+++ b/perl.c
@@ -389,9 +389,9 @@ perl_construct(pTHXx)
PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist);
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
- PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist);
- PL_SB_invlist = _new_invlist_C_array(Sentence_Break_invlist);
- PL_WB_invlist = _new_invlist_C_array(Word_Break_invlist);
+ PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
+ PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
+ PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
ENTER;
}
diff --git a/perl.h b/perl.h
index 9d57450cba..bb4dac4184 100644
--- a/perl.h
+++ b/perl.h
@@ -4779,7 +4779,14 @@ EXTCONST unsigned char PL_mod_latin1_uc[] = {
192, 193, 194, 195, 196, 197, 198, 199,
200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215,
- 216, 217, 218, 219, 220, 221, 222, 255 /*sharp s*/,
+ 216, 217, 218, 219, 220, 221, 222,
+#if UNICODE_MAJOR_VERSION > 2 \
+ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
+ && UNICODE_DOT_DOT_VERSION >= 8)
+ 255 /*sharp s*/,
+#else /* uc() is itself in early unicode */
+ 223,
+#endif
224-32, 225-32, 226-32, 227-32, 228-32, 229-32, 230-32, 231-32,
232-32, 233-32, 234-32, 235-32, 236-32, 237-32, 238-32, 239-32,
240-32, 241-32, 242-32, 243-32, 244-32, 245-32, 246-32, 247,
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 06f2ee271f..1580b43471 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -338,7 +338,10 @@ files in F<ext/> and F<lib/> are best summarized in L</Modules and Pragmata>.
=item *
-XXX
+Perl can again be compiled with any Unicode version. This used to
+(mostly) work, but was lost in v5.18 through v5.20. The property
+C<Name_Alias> did not exist prior to Unicode 5.0. L<Unicode::UCD>
+incorrectly said it did. This has been fixed.
=back
diff --git a/pp.c b/pp.c
index f750fce304..a963478cec 100644
--- a/pp.c
+++ b/pp.c
@@ -4096,6 +4096,9 @@ PP(pp_uc)
* just above.
* Use the source to distinguish between the three cases */
+#if UNICODE_MAJOR_VERSION > 2 \
+ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
+ && UNICODE_DOT_DOT_VERSION >= 8)
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
/* uc() of this requires 2 characters, but they are
@@ -4108,6 +4111,7 @@ PP(pp_uc)
*d++ = 'S'; *d = 'S'; /* upper case is 'SS' */
continue; /* Back to the tight loop; still in ASCII */
}
+#endif
/* The other two special handling characters have their
* upper cases outside the latin1 range, hence need to be
@@ -4401,8 +4405,14 @@ PP(pp_fc)
const U8 *send;
U8 *d;
U8 tmpbuf[UTF8_MAXBYTES_CASE + 1];
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
const bool full_folding = TRUE; /* This variable is here so we can easily
move to more generality later */
+#else
+ const bool full_folding = FALSE;
+#endif
const U8 flags = ( full_folding ? FOLD_FLAGS_FULL : 0 )
#ifdef USE_LOCALE_CTYPE
| ( IN_LC_RUNTIME(LC_CTYPE) ? FOLD_FLAGS_LOCALE : 0 )
diff --git a/regcharclass.h b/regcharclass.h
index c59ea78b3e..f7cd2fde5b 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -168,7 +168,7 @@
/*
NONCHAR: Non character code points
- \p{Nchar}
+ \p{_Perl_Nchar}
*/
/*** GENERATED CODE ***/
#define is_NONCHAR_utf8(s) \
@@ -185,7 +185,7 @@
/*
SURROGATE: Surrogate characters
- \p{Gc=Cs}
+ \p{_Perl_Surrogate}
*/
/*** GENERATED CODE ***/
#define is_SURROGATE_utf8(s) \
@@ -591,7 +591,7 @@
/*
PATWS: pattern white space
- \p{PatWS}
+ \p{_Perl_PatWS}
*/
/*** GENERATED CODE ***/
#define is_PATWS_safe(s,e,is_utf8) \
@@ -761,7 +761,7 @@
/*
NONCHAR: Non character code points
- \p{Nchar}
+ \p{_Perl_Nchar}
*/
/*** GENERATED CODE ***/
#define is_NONCHAR_utf8(s) \
@@ -782,7 +782,7 @@
/*
SURROGATE: Surrogate characters
- \p{Gc=Cs}
+ \p{_Perl_Surrogate}
*/
/*** GENERATED CODE ***/
#define is_SURROGATE_utf8(s) \
@@ -1205,7 +1205,7 @@
/*
PATWS: pattern white space
- \p{PatWS}
+ \p{_Perl_PatWS}
*/
/*** GENERATED CODE ***/
#define is_PATWS_safe(s,e,is_utf8) \
@@ -1373,7 +1373,7 @@
/*
NONCHAR: Non character code points
- \p{Nchar}
+ \p{_Perl_Nchar}
*/
/*** GENERATED CODE ***/
#define is_NONCHAR_utf8(s) \
@@ -1394,7 +1394,7 @@
/*
SURROGATE: Surrogate characters
- \p{Gc=Cs}
+ \p{_Perl_Surrogate}
*/
/*** GENERATED CODE ***/
#define is_SURROGATE_utf8(s) \
@@ -1825,7 +1825,7 @@
/*
PATWS: pattern white space
- \p{PatWS}
+ \p{_Perl_PatWS}
*/
/*** GENERATED CODE ***/
#define is_PATWS_safe(s,e,is_utf8) \
@@ -1993,7 +1993,7 @@
/*
NONCHAR: Non character code points
- \p{Nchar}
+ \p{_Perl_Nchar}
*/
/*** GENERATED CODE ***/
#define is_NONCHAR_utf8(s) \
@@ -2014,7 +2014,7 @@
/*
SURROGATE: Surrogate characters
- \p{Gc=Cs}
+ \p{_Perl_Surrogate}
*/
/*** GENERATED CODE ***/
#define is_SURROGATE_utf8(s) \
@@ -2445,7 +2445,7 @@
/*
PATWS: pattern white space
- \p{PatWS}
+ \p{_Perl_PatWS}
*/
/*** GENERATED CODE ***/
#define is_PATWS_safe(s,e,is_utf8) \
@@ -2472,7 +2472,7 @@
#endif /* H_REGCHARCLASS */
/* Generated from:
- * 083180df694deb1fc173361406c1a75619fb8376403db3a76dc585c1e3951eca lib/Unicode/UCD.pm
+ * 0bca60a25eb4ccf2e04f50446db5f882322f50a9c61dc57bb806ccfc9b2e26a4 lib/Unicode/UCD.pm
* ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt
* 1687fe5994eb7e5c0dab8503fc2a1b3b479d91af9d3b8055941c9bd791f7d0b5 lib/unicore/BidiBrackets.txt
* 350d1302116194b0b21def287434b55c5088098fbc726e879f7420a391965643 lib/unicore/BidiMirroring.txt
@@ -2514,9 +2514,9 @@
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * c9326eab8d7861c3543963e555d5b927348f4467c93071db23154dece7619654 lib/unicore/mktables
+ * 46f739fb5c9daf6fb457ed67f821d88d9eadd2df17b098f385b3b50f99c01acf lib/unicore/mktables
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
* c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl
- * 5e47f645eac3a918246254e19c06b604c8ea088cf62da5be84dcb953ef2bf16c regen/regcharclass.pl
- * 206b60035ff0cec9f7d1701937ecf9226a943faa42dfc4827c37306be64ff18e regen/regcharclass_multi_char_folds.pl
+ * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl
+ * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
* ex: set ro: */
diff --git a/regcomp.c b/regcomp.c
index fcaf153ef6..901e2c0846 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3652,6 +3652,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
* this function, we need to flag any occurrences of the sharp s.
* This character forbids trie formation (because of added
* complexity) */
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
while (s < s_end) {
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
OP(scan) = EXACTFA_NO_TRIE;
@@ -3659,7 +3662,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
break;
}
s++;
- continue;
}
}
else {
@@ -3705,6 +3707,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
*min_subtract += len - 1;
s += len;
}
+#endif
}
}
@@ -11505,8 +11508,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
*character = (U8) code_point;
len = 1;
} /* Else is folded non-UTF8 */
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-
+#else
+ else if (1) {
+#endif
/* We don't fold any non-UTF8 except possibly the Sharp s (see
* comments at join_exact()); */
*character = (U8) code_point;
@@ -11550,9 +11558,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
/* A single character node is SIMPLE, except for the special-cased SHARP S
* under /di. */
if ((len == 1 || (UTF && len == UNISKIP(code_point)))
- && (code_point != LATIN_SMALL_LETTER_SHARP_S
- || ! FOLD || ! DEPENDS_SEMANTICS))
- {
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
+ && ( code_point != LATIN_SMALL_LETTER_SHARP_S
+ || ! FOLD || ! DEPENDS_SEMANTICS)
+#endif
+ ) {
*flagp |= SIMPLE;
}
@@ -12650,11 +12662,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
}
else /* A regular FOLD code point */
if (! ( UTF
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S)))
- {
+ && ender == LATIN_SMALL_LETTER_SHARP_S)
+#endif
+ )) {
/* Here, are folding and are not UTF-8 encoded; therefore
* the character must be in the range 0-255, and is not /l
* (Not /l because we already handled these under /l in
@@ -12667,11 +12683,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* 'ss' */
if (maybe_exactfu
&& (PL_fold[ender] != PL_fold_latin1[ender]
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
|| ender == LATIN_SMALL_LETTER_SHARP_S
|| (len > 0
&& isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))))
- {
+ && isALPHA_FOLD_EQ(*(s-1), 's'))
+#endif
+ )) {
maybe_exactfu = FALSE;
}
}
@@ -14014,9 +14034,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
*invlist = add_cp_to_invlist(*invlist,
LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
break;
+
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+
case LATIN_SMALL_LETTER_SHARP_S:
*invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
break;
+
+#endif
+
+#if UNICODE_MAJOR_VERSION < 3 \
+ || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+
+ /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
+ * U+0131. */
+ case 'i':
+ case 'I':
+ *invlist =
+ add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+# if UNICODE_DOT_DOT_VERSION == 1
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+# endif
+ break;
+#endif
+
default:
/* Use deprecated warning to increase the chances of this being
* output */
@@ -14209,6 +14250,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
DEBUG_PARSE("clas");
+#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 0)
+ allow_multi_folds = FALSE;
+#endif
+
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state,
(LOC)
diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl
index 4b46bd0542..1cabfc4de9 100644
--- a/regen/mk_PL_charclass.pl
+++ b/regen/mk_PL_charclass.pl
@@ -51,6 +51,7 @@ my @properties = qw(
# Read in the case fold mappings.
my %folded_closure;
+my %simple_folded_closure;
my @hex_non_final_folds;
my @non_latin1_simple_folds;
my @folds;
@@ -118,8 +119,14 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property
for my $i (0 .. @folded - 1) {
my $hex_fold = $folded[$i];
my $fold = hex $hex_fold;
- push @{$folded_closure{$fold}}, $from if $fold < 256;
- push @{$folded_closure{$from}}, $fold if $from < 256;
+ if ($fold < 256) {
+ push @{$folded_closure{$fold}}, $from;
+ push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F';
+ }
+ if ($from < 256) {
+ push @{$folded_closure{$from}}, $fold;
+ push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F';
+ }
if (($fold_type eq 'C' || $fold_type eq 'S')
&& ($fold < 256 != $from < 256))
@@ -153,11 +160,16 @@ BEGIN { # Have to do this at compile time because using user-defined \p{property
push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
}
}
+ foreach my $folded (keys %simple_folded_closure) {
+ foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) {
+ push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}};
+ }
+ }
# We have the single-character folds that cross the 255/256, like KELVIN
# SIGN => 'k', but we need the closure, so add like 'K' to it
foreach my $folded (@non_latin1_simple_folds) {
- foreach my $fold (@{$folded_closure{$folded}}) {
+ foreach my $fold (@{$simple_folded_closure{$folded}}) {
if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) {
push @non_latin1_simple_folds, $fold;
}
@@ -217,12 +229,18 @@ for my $ord (0..255) {
} elsif ($name eq 'SPACE') {;
$re = qr/\p{XPerlSpace}/;
} elsif ($name eq 'IDFIRST') {
- $re = qr/[_\p{Alpha}]/;
+ $re = qr/[_\p{XPosixAlpha}]/;
} elsif ($name eq 'WORDCHAR') {
$re = qr/\p{XPosixWord}/;
+ } elsif ($name eq 'LOWER') {
+ $re = qr/\p{XPosixLower}/;
+ } elsif ($name eq 'UPPER') {
+ $re = qr/\p{XPosixUpper}/;
} elsif ($name eq 'ALPHANUMERIC') {
# Like \w, but no underscore
$re = qr/\p{Alnum}/;
+ } elsif ($name eq 'ALPHA') {
+ $re = qr/\p{XPosixAlpha}/;
} elsif ($name eq 'QUOTEMETA') {
$re = qr/\p{_Perl_Quotemeta}/;
} elsif ($name eq 'NONLATIN1_FOLD') {
@@ -241,7 +259,7 @@ for my $ord (0..255) {
use Carp;
carp $@ if ! defined $re;
}
- #print "$ord, $name $property, $re\n";
+ #print STDERR __LINE__, ": $ord, $name $property, $re\n";
if ($char =~ $re) { # Add this property if matches
$bits[$ord] .= '|' if $bits[$ord];
$bits[$ord] .= "(1U<<_CC_$property)";
@@ -295,10 +313,11 @@ foreach my $charset (get_supported_code_pages()) {
}
else {
use Unicode::UCD qw(prop_invmap);
- my ($list_ref, $map_ref, $format) = prop_invmap("Name_Alias");
+ my ($list_ref, $map_ref, $format)
+ = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
if ($format !~ /^s/) {
use Carp;
- carp "Unexpected format '$format' for 'Name_Alias";
+ carp "Unexpected format '$format' for '_Perl_Name_Alias";
last;
}
my $which = Unicode::UCD::search_invlist($list_ref, $ord);
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index 42f5b1c925..bc27d0f208 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -176,15 +176,14 @@ sub output_invlist ($$;$) {
my $charset = shift // ""; # name of character set for comment
die "No inversion list for $name" unless defined $invlist
- && ref $invlist eq 'ARRAY'
- && @$invlist;
+ && ref $invlist eq 'ARRAY';
# Output the inversion list $invlist using the name $name for it.
# It is output in the exact internal form for inversion lists.
# Is the last element of the header 0, or 1 ?
my $zero_or_one = 0;
- if ($invlist->[0] != 0) {
+ if (@$invlist && $invlist->[0] != 0) {
unshift @$invlist, 0;
$zero_or_one = 1;
}
@@ -235,8 +234,8 @@ sub output_invmap ($$$$$$$) {
my $name_prefix;
if ($input_format eq 's') {
- $prop_name = (prop_aliases($prop_name))[1]; # Get full name
- my $short_name = (prop_aliases($prop_name))[0];
+ $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
+ my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
my @enums = prop_values($prop_name);
if (! @enums) {
die "Only enum properties are currently handled; '$prop_name' isn't one";
@@ -267,7 +266,8 @@ sub output_invmap ($$$$$$$) {
# Assign a value to each element of the enum. The default
# value always gets 0; the others are arbitrarily assigned.
my $enum_val = 0;
- $default = prop_value_aliases($prop_name, $default);
+ my $canonical_default = prop_value_aliases($prop_name, $default);
+ $default = $canonical_default if defined $canonical_default;
$enums{$default} = $enum_val++;
for my $enum (@enums) {
$enums{$enum} = $enum_val++ unless exists $enums{$enum};
@@ -315,7 +315,9 @@ sub output_invmap ($$$$$$$) {
# The main body are the scalars passed in to this routine.
for my $i (0 .. $count - 1) {
my $element = $invmap->[$i];
- $element = $name_prefix . prop_value_aliases($prop_name, $element);
+ my $full_element_name = prop_value_aliases($prop_name, $element);
+ $element = $full_element_name if defined $full_element_name;
+ $element = $name_prefix . $element;
print $out_fh "\t$element";
print $out_fh "," if $i < $count - 1;
print $out_fh "\n";
@@ -353,7 +355,8 @@ sub mk_invlist_from_sorted_cp_list {
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
- unless $format eq 'al';
+ unless $format eq 'al'
+ || $format eq 'a';
my @has_multi_char_fold;
my @is_non_final_fold;
@@ -447,9 +450,9 @@ for my $charset (get_supported_code_pages()) {
&UpperLatin1
_Perl_IDStart
_Perl_IDCont
- Grapheme_Cluster_Break,EDGE
- Word_Break,EDGE,UNKNOWN
- Sentence_Break,EDGE
+ _Perl_GCB,EDGE
+ _Perl_WB,EDGE,UNKNOWN
+ _Perl_SB,EDGE
)
) {
@@ -489,20 +492,37 @@ for my $charset (get_supported_code_pages()) {
else {
@invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
if (! @invlist) {
- my ($list_ref, $map_ref, $format, $default);
- ($list_ref, $map_ref, $format, $default)
+ # If couldn't find a non-empty inversion list, see if it is
+ # instead an inversion map
+ my ($list_ref, $map_ref, $format, $default)
= prop_invmap($lookup_prop, '_perl_core_internal_ok');
- die "Could not find inversion list for '$lookup_prop'" unless $list_ref;
- @invlist = @$list_ref;
- @invmap = @$map_ref;
- $map_format = $format;
- $map_default = $default;
- $maps_to_code_point = $map_format =~ /x/;
- $to_adjust = $map_format =~ /a/;
+ if (! $list_ref) {
+ # An empty return here could mean an unknown property, or
+ # merely that the original inversion list is empty. Call
+ # in scalar context to differentiate
+ my $count = prop_invlist($lookup_prop,
+ '_perl_core_internal_ok');
+ die "Could not find inversion list for '$lookup_prop'"
+ unless defined $count;
+ }
+ else {
+ @invlist = @$list_ref;
+ @invmap = @$map_ref;
+ $map_format = $format;
+ $map_default = $default;
+ $maps_to_code_point = $map_format =~ /x/;
+ $to_adjust = $map_format =~ /a/;
+ }
}
}
- die "Could not find inversion list for '$lookup_prop'" unless @invlist;
+
+
+ # Short-circuit an empty inversion list.
+ if (! @invlist) {
+ output_invlist($prop_name, \@invlist, $charset);
+ next;
+ }
# Re-order the Unicode code points to native ones for this platform.
# This is only needed for code points below 256, because native code
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 184437e88f..9115eafeb6 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -610,6 +610,8 @@ sub length_optree {
my $else= ( $opt{else} ||= 0 );
+ return $else if $self->{count} == 0;
+
my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
@@ -1629,11 +1631,11 @@ REPLACEMENT: Unicode REPLACEMENT CHARACTER
NONCHAR: Non character code points
=> UTF8 :fast
-\p{Nchar}
+\p{_Perl_Nchar}
SURROGATE: Surrogate characters
=> UTF8 :fast
-\p{Gc=Cs}
+\p{_Perl_Surrogate}
# This program was run with this enabled, and the results copied to utf8.h;
# then this was commented out because it takes so long to figure out these 2
@@ -1695,4 +1697,4 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are
PATWS: pattern white space
=> generic cp : safe
-\p{PatWS}
+\p{_Perl_PatWS}
diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl
index dfc8f9f3e5..5ea9d33a25 100644
--- a/regen/regcharclass_multi_char_folds.pl
+++ b/regen/regcharclass_multi_char_folds.pl
@@ -63,6 +63,8 @@ sub multi_char_folds ($) {
# multi-char folds; false if just the ones that
# are all ascii
+ return () if pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) lt v3.0.1;
+
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl
index 936c1a8a6c..5004e1d50e 100644
--- a/regen/unicode_constants.pl
+++ b/regen/unicode_constants.pl
@@ -3,6 +3,7 @@ use strict;
use warnings;
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';
+use Unicode::UCD;
use charnames qw(:loose);
my $out_fh = open_new('unicode_constants.h', '>',
@@ -14,10 +15,11 @@ print $out_fh <<END;
#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS 1
-/* This file contains #defines for various Unicode code points. The values
- * the macros expand to are the native Unicode code point, or all or portions
- * of the UTF-8 encoding for the code point. In the former case, the macro
- * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
+/* This file contains #defines for the version of Unicode being used and
+ * various Unicode code points. The values the code point macros expand to
+ * are the native Unicode code point, or all or portions of the UTF-8 encoding
+ * for the code point. In the former case, the macro name has the suffix
+ * "_NATIVE"; otherwise, the suffix "_UTF8".
*
* The macros that have the suffix "_UTF8" may have further suffixes, as
* follows:
@@ -28,34 +30,18 @@ print $out_fh <<END;
END
-# The data are at the end of this file. A blank line is output as-is.
-# Comments (lines whose first non-blank is a '#') are converted to C-style,
-# though empty comments are converted to blank lines. Otherwise, each line
-# represents one #define, and begins with either a Unicode character name with
-# the blanks and dashes in it squeezed out or replaced by underscores; or it
-# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
-# case, the name will be looked-up to use as the name of the macro. In either
-# case, the macro name will have suffixes as listed above, and all blanks and
-# dashes will be replaced by underscores.
-#
-# Each line may optionally have one of the following flags on it, separated by
-# white space from the initial token.
-# string indicates that the output is to be of the string form
-# described in the comments above that are placed in the file.
-# string_skip_ifundef is the same as 'string', but instead of dying if the
-# code point doesn't exist, the line is just skipped: no output is
-# generated for it
-# first indicates that the output is to be of the FIRST_BYTE form.
-# tail indicates that the output is of the _TAIL form.
-# native indicates that the output is the code point, converted to the
-# platform's native character set if applicable
-#
-# If the code point has no official name, the desired name may be appended
-# after the flag, which will be ignored if there is an official name.
-#
-# This program is used to make it convenient to create compile time constants
-# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
-# having to figure things out.
+my $version = Unicode::UCD::UnicodeVersion();
+my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
+$dotdot = 0 unless defined $dotdot;
+
+print $out_fh <<END;
+#define UNICODE_MAJOR_VERSION $major
+#define UNICODE_DOT_VERSION $dot
+#define UNICODE_DOT_DOT_VERSION $dotdot
+
+END
+
+# The data are at __DATA__ in this file.
my @data = <DATA>;
@@ -95,13 +81,13 @@ foreach my $charset (get_supported_code_pages()) {
my $name;
my $cp;
my $U_cp; # code point in Unicode (not-native) terms
- my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
if ($name_or_cp =~ /^U\+(.*)/) {
$U_cp = hex $1;
$name = charnames::viacode($name_or_cp);
if (! defined $name) {
- die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
+ next if $flag =~ /skip_if_undef/;
+ die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
$name = "";
}
}
@@ -176,6 +162,37 @@ print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
read_only_bottom_close_and_rename($out_fh);
+# DATA FORMAT
+#
+# A blank line is output as-is.
+# Comments (lines whose first non-blank is a '#') are converted to C-style,
+# though empty comments are converted to blank lines. Otherwise, each line
+# represents one #define, and begins with either a Unicode character name with
+# the blanks and dashes in it squeezed out or replaced by underscores; or it
+# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
+# case, the name will be looked-up to use as the name of the macro. In either
+# case, the macro name will have suffixes as listed above, and all blanks and
+# dashes will be replaced by underscores.
+#
+# Each line may optionally have one of the following flags on it, separated by
+# white space from the initial token.
+# string indicates that the output is to be of the string form
+# described in the comments above that are placed in the file.
+# string_skip_ifundef is the same as 'string', but instead of dying if the
+# code point doesn't exist, the line is just skipped: no output is
+# generated for it
+# first indicates that the output is to be of the FIRST_BYTE form.
+# tail indicates that the output is of the _TAIL form.
+# native indicates that the output is the code point, converted to the
+# platform's native character set if applicable
+#
+# If the code point has no official name, the desired name may be appended
+# after the flag, which will be ignored if there is an official name.
+#
+# This program is used to make it convenient to create compile time constants
+# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
+# having to figure things out.
+
__DATA__
U+017F string
@@ -184,10 +201,12 @@ U+0300 string
U+0399 string
U+03BC string
-U+1E9E string
+U+1E9E string_skip_if_undef
U+FB05 string
U+FB06 string
+U+0130 string
+U+0131 string
U+2010 string
U+D800 first FIRST_SURROGATE
diff --git a/regexec.c b/regexec.c
index ec4ed861ab..f2517e5022 100644
--- a/regexec.c
+++ b/regexec.c
@@ -192,7 +192,7 @@ static const char* const non_utf8_target_but_utf8_required
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -1753,7 +1753,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
#define getGCB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_GCB_invlist, \
- Grapheme_Cluster_Break_invmap, \
+ _Perl_GCB_invmap, \
(cp))
/* Returns the GCB value for the first code point in the UTF-8 encoded string
@@ -1766,7 +1766,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
#define getSB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_SB_invlist, \
- Sentence_Break_invmap, \
+ _Perl_SB_invmap, \
(cp))
/* Returns the SB value for the first code point in the UTF-8 encoded string
@@ -1778,7 +1778,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
#define getWB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_WB_invlist, \
- Word_Break_invmap, \
+ _Perl_WB_invmap, \
(cp))
/* Returns the WB value for the first code point in the UTF-8 encoded string
diff --git a/t/uni/variables.t b/t/uni/variables.t
index 33f057a645..5601b9767f 100644
--- a/t/uni/variables.t
+++ b/t/uni/variables.t
@@ -231,7 +231,7 @@ for ( 0x0 .. 0xff ) {
if ($chr =~ /[#*]/) {
# Length-1 variables with these two characters used to be used by
- # Perl, but now it generates a warning that they're gone.
+ # Perl, but now a warning is generated that they're gone.
# Ignore such warnings.
for (my $i = @warnings - 1; $i >= 0; $i--) {
splice @warnings, $i, 1 if $warnings[$i] =~ /is no longer supported/;
diff --git a/unicode_constants.h b/unicode_constants.h
index 7da4eb3a48..be7a38e92a 100644
--- a/unicode_constants.h
+++ b/unicode_constants.h
@@ -8,10 +8,11 @@
#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS 1
-/* This file contains #defines for various Unicode code points. The values
- * the macros expand to are the native Unicode code point, or all or portions
- * of the UTF-8 encoding for the code point. In the former case, the macro
- * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
+/* This file contains #defines for the version of Unicode being used and
+ * various Unicode code points. The values the code point macros expand to
+ * are the native Unicode code point, or all or portions of the UTF-8 encoding
+ * for the code point. In the former case, the macro name has the suffix
+ * "_NATIVE"; otherwise, the suffix "_UTF8".
*
* The macros that have the suffix "_UTF8" may have further suffixes, as
* follows:
@@ -20,6 +21,10 @@
* "_TAIL" if instead it represents all but the first byte. This, and
* with no additional suffix are both string constants */
+#define UNICODE_MAJOR_VERSION 8
+#define UNICODE_DOT_VERSION 0
+#define UNICODE_DOT_DOT_VERSION 0
+
#if 'A' == 65 /* ASCII/Latin1 */
# define LATIN_SMALL_LETTER_LONG_S_UTF8 "\xC5\xBF" /* U+017F */
@@ -33,6 +38,8 @@
# define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xEF\xAC\x85" /* U+FB05 */
# define LATIN_SMALL_LIGATURE_ST_UTF8 "\xEF\xAC\x86" /* U+FB06 */
+# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\xC4\xB0" /* U+0130 */
+# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\xC4\xB1" /* U+0131 */
# define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */
# define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xED /* U+D800 */
@@ -71,6 +78,8 @@
# define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDD\x72\x67\x46" /* U+FB05 */
# define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDD\x72\x67\x47" /* U+FB06 */
+# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8D\x57" /* U+0130 */
+# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8D\x58" /* U+0131 */
# define HYPHEN_UTF8 "\xCA\x41\x57" /* U+2010 */
# define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDD /* U+D800 */
@@ -109,6 +118,8 @@
# define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDD\x71\x66\x46" /* U+FB05 */
# define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDD\x71\x66\x47" /* U+FB06 */
+# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8C\x57" /* U+0130 */
+# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8C\x58" /* U+0131 */
# define HYPHEN_UTF8 "\xCA\x41\x57" /* U+2010 */
# define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDD /* U+D800 */
@@ -147,6 +158,8 @@
# define LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 "\xDC\x74\x68\x46" /* U+FB05 */
# define LATIN_SMALL_LIGATURE_ST_UTF8 "\xDC\x74\x68\x47" /* U+FB06 */
+# define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 "\x8E\x58" /* U+0130 */
+# define LATIN_SMALL_LETTER_DOTLESS_I_UTF8 "\x8E\x59" /* U+0131 */
# define HYPHEN_UTF8 "\xCA\x41\x58" /* U+2010 */
# define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xDC /* U+D800 */
diff --git a/utf8.c b/utf8.c
index cbff7a7888..695daac9db 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1413,11 +1413,15 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
case MICRO_SIGN:
converted = GREEK_CAPITAL_LETTER_MU;
break;
+#if UNICODE_MAJOR_VERSION > 2 \
+ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
+ && UNICODE_DOT_DOT_VERSION >= 8)
case LATIN_SMALL_LETTER_SHARP_S:
*(p)++ = 'S';
*p = S_or_s;
*lenp = 2;
return 'S';
+#endif
default:
Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
NOT_REACHED; /* NOTREACHED */
@@ -1540,6 +1544,9 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
if (c == MICRO_SIGN) {
converted = GREEK_SMALL_LETTER_MU;
}
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
/* If can't cross 127/128 boundary, can't return "ss"; instead return
@@ -1558,6 +1565,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
return 's';
}
}
+#endif
else { /* In this range the fold of all other characters is their lower
case */
converted = toLOWER_LATIN1(c);
@@ -2200,11 +2208,13 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
if (flags & FOLD_FLAGS_LOCALE) {
-# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
+ const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
+
+# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
- const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
@@ -2217,7 +2227,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
"resolved to \"\\x{17F}\\x{17F}\".");
goto return_long_s;
}
- else if (UTF8SKIP(p) == long_s_t_len
+ else
+#endif
+ if (UTF8SKIP(p) == long_s_t_len
&& memEQ((char *) p, LONG_S_T, long_s_t_len))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
@@ -2226,6 +2238,28 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
"resolved to \"\\x{FB06}\".");
goto return_ligature_st;
}
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
+
+ /* And special case this on this Unicode version only, for the same
+ * reaons the other two are special cased. They would cross the
+ * 255/256 boundary which is forbidden under /l, and so the code
+ * wouldn't catch that they are equivalent (which they are only in
+ * this release) */
+ else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
+ && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
+ {
+ /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
+ "resolved to \"\\x{0131}\".");
+ goto return_dotless_i;
+ }
+#endif
+
return check_locale_boundary_crossing(p, result, ustrp, lenp);
}
else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2249,14 +2283,24 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
/* But in these instances, there is an alternative we can
* return that is valid */
- if (original == LATIN_CAPITAL_LETTER_SHARP_S
- || original == LATIN_SMALL_LETTER_SHARP_S)
- {
+ if (original == LATIN_SMALL_LETTER_SHARP_S
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+ || original == LATIN_CAPITAL_LETTER_SHARP_S
+#endif
+ ) {
goto return_long_s;
}
else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
goto return_ligature_st;
}
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ goto return_dotless_i;
+ }
+#endif
Copy(p, ustrp, *lenp, char);
return original;
}
@@ -2300,6 +2344,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
return LATIN_SMALL_LIGATURE_ST;
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ return_dotless_i:
+ *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
+ Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
+ return LATIN_SMALL_LETTER_DOTLESS_I;
+
+#endif
+
}
/* Note:
@@ -3252,7 +3308,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
* currently handle. But it also means that FB05 and FB06 are equivalent in
* a 1-1 mapping which we should handle, and this relationship may not be in
* the main table. Therefore this function examines all the multi-char
- * sequences and adds the 1-1 mappings that come out of that. */
+ * sequences and adds the 1-1 mappings that come out of that.
+ *
+ * XXX This function was originally intended to be multipurpose, but its
+ * only use is quite likely to remain for constructing the inversion of
+ * the CaseFolding (//i) property. If it were more general purpose for
+ * regex patterns, it would have to do the FB05/FB06 game for simple folds,
+ * because certain folds are prohibited under /iaa and /il. As an example,
+ * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
+ * equivalent under /i. But under /iaa and /il, the folds to 'i' are
+ * prohibited, so we would not figure out that they fold to each other.
+ * Code could be written to automatically figure this out, similar to the
+ * code that does this for multi-character folds, but this is the only case
+ * where something like this is ever likely to happen, as all the single
+ * char folds to The 0-255 range are now quite settled. Instead there is a
+ * little special code that is compiled only for this Unicode version. This
+ * is smaller and didn't require much coding time to do. But this makes
+ * this routine strongly tied to being used just for CaseFolding. If ever
+ * it should be generalized, this would have to be fixed */
U8 *l, *lend;
STRLEN lcur;
@@ -3395,7 +3468,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
} /* End of specials */
/* read $swash->{LIST} */
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
+ * rule so that things work under /iaa and /il */
+
+ SV * mod_listsv = sv_mortalcopy(*listsvp);
+ sv_catpv(mod_listsv, "130\t130\t131\n");
+ l = (U8*)SvPV(mod_listsv, lcur);
+
+#else
+
l = (U8*)SvPV(*listsvp, lcur);
+
+#endif
+
lend = l + lcur;
/* Go through each input line */
diff --git a/utf8.h b/utf8.h
index 3e15707ff8..191a6b2a5c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -611,7 +611,11 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case
of MICRON */
#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
-#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#endif
+#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x130
+#define LATIN_SMALL_LETTER_DOTLESS_I 0x131
#define LATIN_SMALL_LETTER_LONG_S 0x017F
#define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05
#define LATIN_SMALL_LIGATURE_ST 0xFB06