summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- regcharclass.h | 59
-rwxr-xr-x regen/regcharclass.pl | 4
-rw-r--r-- regexec.c | 105
3 files changed, 142 insertions, 26 deletions
diff --git a/regcharclass.h b/regcharclass.h
index 3bdaffa1ca..64e4453e58 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -358,6 +358,65 @@
( ( 0xFF21 <= cp && cp <= 0xFF26 ) || ( 0xFF41 <= cp && cp <= 0xFF46 ) ) ) )
/*
+ XPERLSPACE: \p{XPerlSpace}
+
+ \p{XPerlSpace}
+*/
+/*** GENERATED CODE ***/ /* is_XPERLSPACE(s,is_utf8): 0 when the character at s is not matched; otherwise the number of bytes matched (1, 2 or 3 when is_utf8 is true, 1 when it is false). Inspects at most s[0..2], each read as U8; s and is_utf8 are expanded more than once. */
+#define is_XPERLSPACE(s,is_utf8) \
+( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1 /* bytes 0x09..0x0D and 0x20 are accepted before is_utf8 is consulted */\
+: ( is_utf8 ) ? \
+ ( ( 0xC2 == ((U8*)s)[0] ) ? /* two-byte forms: C2 85 and C2 A0 */ \
+ ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xE1 == ((U8*)s)[0] ) ? /* three-byte forms: E1 9A 80 and E1 A0 8E */ \
+ ( ( 0x9A == ((U8*)s)[1] ) ? \
+ ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+ : ( 0xE2 == ((U8*)s)[0] ) ? /* E2 80 {80..8A, A8, A9, AF} and E2 81 9F; the & 0xFE test accepts both A8 and A9 */ \
+ ( ( 0x80 == ((U8*)s)[1] ) ? \
+ ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\
+ : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+ : ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 ) /* last UTF-8 form: E3 80 80 */\
+: ( 0x85 == ((U8*)s)[0] || 0xA0 == ((U8*)s)[0] ) ) /* is_utf8 false: single bytes 0x85 and 0xA0 give 1, anything else 0 */
+
+/*** GENERATED CODE ***/ /* is_XPERLSPACE_utf8(s): 0 when the bytes at s are not matched; otherwise the number of bytes matched (1, 2 or 3). Inspects at most s[0..2], each read as U8; s is expanded more than once. */
+#define is_XPERLSPACE_utf8(s) \
+( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1 /* single bytes 0x09..0x0D and 0x20 */\
+: ( 0xC2 == ((U8*)s)[0] ) ? /* two-byte forms: C2 85 and C2 A0 */ \
+ ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 ) \
+: ( 0xE1 == ((U8*)s)[0] ) ? /* three-byte forms: E1 9A 80 and E1 A0 8E */ \
+ ( ( 0x9A == ((U8*)s)[1] ) ? \
+ ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+: ( 0xE2 == ((U8*)s)[0] ) ? /* E2 80 {..8A, A8, A9, AF} and E2 81 9F */ \
+ ( ( 0x80 == ((U8*)s)[1] ) ? \
+ ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 ) /* NOTE(review): no 0x80 lower bound on s[2] here, unlike is_XPERLSPACE; presumably relies on s being well-formed UTF-8 -- confirm */\
+ : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 ) /* last form: E3 80 80 */
+
+/*** GENERATED CODE ***/ /* is_XPERLSPACE_high(s): tests only the three-byte sequences (lead byte 0xE1, 0xE2 or 0xE3); yields 3 on a match and 0 otherwise, so one- and two-byte forms are never matched here. s is expanded more than once. */
+#define is_XPERLSPACE_high(s) \
+( ( 0xE1 == ((U8*)s)[0] ) ? /* E1 9A 80 and E1 A0 8E */ \
+ ( ( 0x9A == ((U8*)s)[1] ) ? \
+ ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \
+ : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+: ( 0xE2 == ((U8*)s)[0] ) ? /* E2 80 {..8A, A8, A9, AF} and E2 81 9F */ \
+ ( ( 0x80 == ((U8*)s)[1] ) ? \
+ ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 ) /* NOTE(review): s[2] has no 0x80 lower bound; presumably relies on s being well-formed UTF-8 -- confirm */\
+ : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \
+: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 ) /* last form: E3 80 80 */
+
+/*** GENERATED CODE ***/ /* is_XPERLSPACE_cp_high(cp): non-zero when cp is one of 0x1680, 0x180E, 0x2000..0x200A, 0x2028, 0x2029, 0x202F, 0x205F or 0x3000, else 0. cp is expanded many times and is not parenthesized, so pass a simple side-effect-free expression. */
+#define is_XPERLSPACE_cp_high(cp) \
+( 0x1680 == cp || ( 0x1680 < cp && /* each level tests one value or range, then descends only if cp is larger */ \
+( 0x180E == cp || ( 0x180E < cp && \
+( ( 0x2000 <= cp && cp <= 0x200A ) || ( 0x200A < cp && \
+( 0x2028 == cp || ( 0x2028 < cp && \
+( 0x2029 == cp || ( 0x2029 < cp && \
+( 0x202F == cp || ( 0x202F < cp && \
+( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) ) ) ) )
+
+/*
REPLACEMENT: Unicode REPLACEMENT CHARACTER
0xFFFD
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 46425e4965..0bab57086a 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1400,6 +1400,10 @@ XDIGIT: Hexadecimal digits
=> UTF8 high cp_high :fast
\p{XDigit}
+XPERLSPACE: \p{XPerlSpace}
+=> generic UTF8 high cp_high :fast
+\p{XPerlSpace}
+
REPLACEMENT: Unicode REPLACEMENT CHARACTER
=> UTF8 :safe
0xFFFD
diff --git a/regexec.c b/regexec.c
index 69bda15de3..d0560cec4e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -164,7 +164,6 @@ static const char* const non_utf8_target_but_utf8_required
#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
-#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
/* No asserts are done for some of these, in case called on a */ \
@@ -1713,16 +1712,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
);
break;
case SPACEU:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_SPACE(),
- *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
+ REXEC_FBC_CSCAN(
+ is_XPERLSPACE_utf8(s),
isSPACE_L1((U8) *s)
);
break;
case SPACE:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_SPACE(),
- *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
+ REXEC_FBC_CSCAN(
+ is_XPERLSPACE_utf8(s),
isSPACE((U8) *s)
);
break;
@@ -1738,16 +1735,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
);
break;
case NSPACEU:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_SPACE(),
- !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
+ REXEC_FBC_CSCAN(
+ ! is_XPERLSPACE_utf8(s),
! isSPACE_L1((U8) *s)
);
break;
case NSPACE:
- REXEC_FBC_CSCAN_PRELOAD(
- LOAD_UTF8_CHARCLASS_SPACE(),
- !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
+ REXEC_FBC_CSCAN(
+ ! is_XPERLSPACE_utf8(s),
! isSPACE((U8) *s)
);
break;
@@ -4331,11 +4326,73 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
ALNUMA, NALNUMA, isWORDCHAR_A,
alnum, "a");
- CCC_TRY_U(SPACE, NSPACE, isSPACE,
- SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
- SPACEU, NSPACEU, isSPACE_L1,
- SPACEA, NSPACEA, isSPACE_A,
- space, " ");
+ case SPACEL:
+ PL_reg_flags |= RF_tainted;
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+ if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
+ if (! isSPACE_LC_utf8((U8 *) locinput)) {
+ sayNO;
+ }
+ }
+ else if (! isSPACE_LC((U8) nextchr)) {
+ sayNO;
+ }
+ goto increment_locinput;
+
+ case NSPACEL:
+ PL_reg_flags |= RF_tainted;
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+ if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
+ if (isSPACE_LC_utf8((U8 *) locinput)) {
+ sayNO;
+ }
+ }
+ else if (isSPACE_LC(nextchr)) {
+ sayNO;
+ }
+ goto increment_locinput;
+
+ case SPACE:
+ if (utf8_target) {
+ goto utf8_space;
+ }
+ /* FALL THROUGH */
+ case SPACEA:
+ if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
+ sayNO;
+ }
+ /* Matched a utf8-invariant, so don't have to worry about utf8 */
+ locinput++;
+ break;
+
+ case NSPACE:
+ if (utf8_target) {
+ goto utf8_nspace;
+ }
+ /* FALL THROUGH */
+ case NSPACEA:
+ if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
+ sayNO;
+ }
+ goto increment_locinput;
+
+ case SPACEU:
+ utf8_space:
+ if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
+ sayNO;
+ }
+ goto increment_locinput;
+
+ case NSPACEU:
+ utf8_nspace:
+ if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
+ sayNO;
+ }
+ goto increment_locinput;
CCC_TRY(DIGIT, NDIGIT, isDIGIT,
DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
@@ -6902,10 +6959,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
utf8_space:
- LOAD_UTF8_CHARCLASS_SPACE();
- while (hardcount < max && scan < loceol &&
- (*scan == ' ' ||
- swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+ while (hardcount < max && scan < loceol
+ && is_XPERLSPACE_utf8((U8*)scan))
{
scan += UTF8SKIP(scan);
hardcount++;
@@ -6955,10 +7010,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
utf8_Nspace:
- LOAD_UTF8_CHARCLASS_SPACE();
- while (hardcount < max && scan < loceol &&
- ! (*scan == ' ' ||
- swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+ while (hardcount < max && scan < loceol
+ && ! is_XPERLSPACE_utf8((U8*)scan))
{
scan += UTF8SKIP(scan);
hardcount++;