summary | refs | log | tree | commit | diff
path: root/charclass_invlists.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2016-01-05 16:12:55 -0700
committer: Karl Williamson <khw@cpan.org> 2016-01-08 14:17:11 -0700
commit: f1f6961f5a6fd77a3e3c36f242f1b72ce5dfe205 (patch)
tree: 52365bdb2759341217eb979be04a61f5b351eb2f /charclass_invlists.h
parent: cbdbe9d466e0d26852ca1ace0825220c8ca7d215 (diff)
download: perl-f1f6961f5a6fd77a3e3c36f242f1b72ce5dfe205.tar.gz
Tailor \b{wb} for Perl
The Unicode \b{wb} matches the boundary between space characters in a span of them. This is the opposite of what \b does, and is counterintuitive to Perl expectations. This commit tailors \b{wb} to not split up spans of white space. I have submitted a request to Unicode to re-examine their algorithm, and the request has been assigned to a subcommittee to look at, but the result won't be available until after 5.24 is done. In any event, Unicode encourages tailoring for local conditions.
Diffstat (limited to 'charclass_invlists.h')
-rw-r--r-- charclass_invlists.h 132
1 files changed, 106 insertions, 26 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index c33a505f9b..8b784cf64b 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -12029,15 +12029,18 @@ static const SB_enum _Perl_SB_invmap[] = { /* for ASCII/Latin1 */
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
- 1524, /* Number of elements */
+ 1535, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x9,
0xA,
0xB,
0xD,
0xE,
+ 0x20,
+ 0x21,
0x22,
0x23,
0x27,
@@ -12058,6 +12061,8 @@ static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
0x7B,
0x85,
0x86,
+ 0xA0,
+ 0xA1,
0xAA,
0xAB,
0xAD,
@@ -12745,6 +12750,8 @@ static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
@@ -12756,12 +12763,14 @@ static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
@@ -12847,6 +12856,8 @@ static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
@@ -13563,7 +13574,7 @@ static const UV _Perl_WB_invlist[] = { /* for ASCII/Latin1 */
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
@@ -13582,17 +13593,21 @@ typedef enum {
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_LF,
WB_Newline,
WB_CR,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Double_Quote,
WB_Other,
WB_Single_Quote,
@@ -13613,6 +13628,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Format,
@@ -14154,7 +14171,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
@@ -14300,6 +14317,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
@@ -14310,6 +14329,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
@@ -14317,6 +14337,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
@@ -14402,6 +14423,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for ASCII/Latin1 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
@@ -36798,11 +36821,13 @@ static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 1047 */
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
- 1549, /* Number of elements */
+ 1558, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
@@ -36810,6 +36835,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
@@ -37539,6 +37565,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
@@ -37550,12 +37578,14 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
@@ -37641,6 +37671,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
@@ -38357,7 +38389,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 1047 */
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
@@ -38376,13 +38408,16 @@ typedef enum {
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
@@ -38390,6 +38425,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
@@ -38973,7 +39009,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
@@ -39119,6 +39155,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
@@ -39129,6 +39167,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
@@ -39136,6 +39175,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
@@ -39221,6 +39261,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 1047 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
@@ -61759,11 +61801,13 @@ static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC 037 */
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
- 1545, /* Number of elements */
+ 1554, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
@@ -61771,6 +61815,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
@@ -62496,6 +62541,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
@@ -62507,12 +62554,14 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
@@ -62598,6 +62647,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
@@ -63314,7 +63365,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC 037 */
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
@@ -63333,13 +63384,16 @@ typedef enum {
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
@@ -63347,6 +63401,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
WB_LF,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
@@ -63926,7 +63981,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
@@ -64072,6 +64127,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
@@ -64082,6 +64139,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
@@ -64089,6 +64147,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
@@ -64174,6 +64233,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC 037 */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
@@ -86726,11 +86787,13 @@ static const SB_enum _Perl_SB_invmap[] = { /* for EBCDIC POSIX-BC */
#if defined(PERL_IN_PERL_C)
static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
- 1547, /* Number of elements */
+ 1556, /* Number of elements */
148565664, /* Version and data structure type */
0, /* 0 if the list starts at 0;
1 if it starts at the element beyond 0 */
0x0,
+ 0x5,
+ 0x6,
0xB,
0xD,
0xE,
@@ -86738,6 +86801,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
0x16,
0x25,
0x26,
+ 0x40,
0x42,
0x4A,
0x4B,
@@ -87465,6 +87529,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
0x1FF5,
0x1FF6,
0x1FFD,
+ 0x2000,
+ 0x200B,
0x200C,
0x200E,
0x2010,
@@ -87476,12 +87542,14 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
0x2028,
0x202A,
0x202F,
+ 0x2030,
0x203F,
0x2041,
0x2044,
0x2045,
0x2054,
0x2055,
+ 0x205F,
0x2060,
0x2065,
0x2066,
@@ -87567,6 +87635,8 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
0x2E00,
0x2E2F,
0x2E30,
+ 0x3000,
+ 0x3001,
0x3005,
0x3006,
0x302A,
@@ -88283,7 +88353,7 @@ static const UV _Perl_WB_invlist[] = { /* for EBCDIC POSIX-BC */
#if defined(PERL_IN_REGEXEC_C)
-#define WB_ENUM_COUNT 19
+#define WB_ENUM_COUNT 20
typedef enum {
WB_Other = 0,
@@ -88302,13 +88372,16 @@ typedef enum {
WB_MidNumLet = 13,
WB_Newline = 14,
WB_Numeric = 15,
- WB_Regional_Indicator = 16,
- WB_Single_Quote = 17,
- WB_UNKNOWN = 18
+ WB_Perl_Tailored_HSpace = 16,
+ WB_Regional_Indicator = 17,
+ WB_Single_Quote = 18,
+ WB_UNKNOWN = 19
} WB_enum;
static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Newline,
WB_CR,
WB_Other,
@@ -88316,6 +88389,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
WB_Newline,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_MidNumLet,
@@ -88897,7 +88971,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_ALetter,
WB_Other,
WB_ALetter,
- WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_ALetter,
WB_Other,
WB_ALetter,
@@ -89043,6 +89117,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_Extend,
WB_Format,
WB_Other,
@@ -89053,6 +89129,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_MidLetter,
WB_Newline,
WB_Format,
+ WB_Perl_Tailored_HSpace,
WB_Other,
WB_ExtendNumLet,
WB_Other,
@@ -89060,6 +89137,7 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
WB_ExtendNumLet,
WB_Other,
+ WB_Perl_Tailored_HSpace,
WB_Format,
WB_Other,
WB_Format,
@@ -89145,6 +89223,8 @@ static const WB_enum _Perl_WB_invmap[] = { /* for EBCDIC POSIX-BC */
WB_Other,
WB_ALetter,
WB_Other,
+ WB_Perl_Tailored_HSpace,
+ WB_Other,
WB_ALetter,
WB_Other,
WB_Extend,
@@ -99537,8 +99617,8 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 21f614a12bfde0478588228d46f1b594bf7e23c7d1f51492c70b13f7c9b8de09 lib/unicore/mktables
+ * 2b18fcfeafc8e8a26ff1124ad4ca94020f287bc4651be7ea199d69ecd5dcf9c5 lib/unicore/mktables
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
* 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb regen/charset_translations.pl
- * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl
+ * 214ab3909a11fcc57cb6ee0611897342109b5a895b2b42d5227b80d948744a0a regen/mk_invlists.pl
* ex: set ro: */