diff options
-rw-r--r-- | regcharclass.h | 59 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 4 | ||||
-rw-r--r-- | regexec.c | 105 |
3 files changed, 142 insertions, 26 deletions
diff --git a/regcharclass.h b/regcharclass.h index 3bdaffa1ca..64e4453e58 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -358,6 +358,65 @@ ( ( 0xFF21 <= cp && cp <= 0xFF26 ) || ( 0xFF41 <= cp && cp <= 0xFF46 ) ) ) ) /* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE(s,is_utf8) \ +( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\ +: ( is_utf8 ) ? \ + ( ( 0xC2 == ((U8*)s)[0] ) ? \ + ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x9A == ((U8*)s)[1] ) ? \ + ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ + : ( 0xE2 == ((U8*)s)[0] ) ? \ + ( ( 0x80 == ((U8*)s)[1] ) ? \ + ( ( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\ + : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \ + : ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 )\ +: ( 0x85 == ((U8*)s)[0] || 0xA0 == ((U8*)s)[0] ) ) + +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8(s) \ +( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\ +: ( 0xC2 == ((U8*)s)[0] ) ? \ + ( ( 0x85 == ((U8*)s)[1] || 0xA0 == ((U8*)s)[1] ) ? 2 : 0 ) \ +: ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x9A == ((U8*)s)[1] ) ? \ + ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ +: ( 0xE2 == ((U8*)s)[0] ) ? \ + ( ( 0x80 == ((U8*)s)[1] ) ? \ + ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\ + : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \ +: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 ) + +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_high(s) \ +( ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0x9A == ((U8*)s)[1] ) ? \ + ( ( 0x80 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( ( 0xA0 == ((U8*)s)[1] ) && ( 0x8E == ((U8*)s)[2] ) ) ? 3 : 0 ) \ +: ( 0xE2 == ((U8*)s)[0] ) ? \ + ( ( 0x80 == ((U8*)s)[1] ) ? \ + ( ( ( ((U8*)s)[2] <= 0x8A ) || ( ((U8*)s)[2] & 0xFE ) == 0xA8 || 0xAF == ((U8*)s)[2] ) ? 3 : 0 )\ + : ( ( 0x81 == ((U8*)s)[1] ) && ( 0x9F == ((U8*)s)[2] ) ) ? 3 : 0 ) \ +: ( ( ( 0xE3 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( 0x80 == ((U8*)s)[2] ) ) ? 3 : 0 ) + +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_cp_high(cp) \ +( 0x1680 == cp || ( 0x1680 < cp && \ +( 0x180E == cp || ( 0x180E < cp && \ +( ( 0x2000 <= cp && cp <= 0x200A ) || ( 0x200A < cp && \ +( 0x2028 == cp || ( 0x2028 < cp && \ +( 0x2029 == cp || ( 0x2029 < cp && \ +( 0x202F == cp || ( 0x202F < cp && \ +( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) ) ) ) ) + +/* REPLACEMENT: Unicode REPLACEMENT CHARACTER 0xFFFD diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 46425e4965..0bab57086a 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1400,6 +1400,10 @@ XDIGIT: Hexadecimal digits => UTF8 high cp_high :fast \p{XDigit} +XPERLSPACE: \p{XPerlSpace} +=> generic UTF8 high cp_high :fast +\p{XPerlSpace} + REPLACEMENT: Unicode REPLACEMENT CHARACTER => UTF8 :safe 0xFFFD @@ -164,7 +164,6 @@ static const char* const non_utf8_target_but_utf8_required #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") -#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ /* No asserts are done for some of these, in case called on a */ \ @@ -1713,16 +1712,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; case SPACEU: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target), + REXEC_FBC_CSCAN( + is_XPERLSPACE_utf8(s), isSPACE_L1((U8) *s) ); break; case SPACE: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target), + REXEC_FBC_CSCAN( + is_XPERLSPACE_utf8(s), isSPACE((U8) *s) ); break; @@ -1738,16 +1735,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; case NSPACEU: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)), + REXEC_FBC_CSCAN( + ! is_XPERLSPACE_utf8(s), ! isSPACE_L1((U8) *s) ); break; case NSPACE: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_SPACE(), - !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)), + REXEC_FBC_CSCAN( + ! is_XPERLSPACE_utf8(s), ! isSPACE((U8) *s) ); break; @@ -4331,11 +4326,73 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ALNUMA, NALNUMA, isWORDCHAR_A, alnum, "a"); - CCC_TRY_U(SPACE, NSPACE, isSPACE, - SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8, - SPACEU, NSPACEU, isSPACE_L1, - SPACEA, NSPACEA, isSPACE_A, - space, " "); + case SPACEL: + PL_reg_flags |= RF_tainted; + if (NEXTCHR_IS_EOS) { + sayNO; + } + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { + if (! isSPACE_LC_utf8((U8 *) locinput)) { + sayNO; + } + } + else if (! isSPACE_LC((U8) nextchr)) { + sayNO; + } + goto increment_locinput; + + case NSPACEL: + PL_reg_flags |= RF_tainted; + if (NEXTCHR_IS_EOS) { + sayNO; + } + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { + if (isSPACE_LC_utf8((U8 *) locinput)) { + sayNO; + } + } + else if (isSPACE_LC(nextchr)) { + sayNO; + } + goto increment_locinput; + + case SPACE: + if (utf8_target) { + goto utf8_space; + } + /* FALL THROUGH */ + case SPACEA: + if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) { + sayNO; + } + /* Matched a utf8-invariant, so don't have to worry about utf8 */ + locinput++; + break; + + case NSPACE: + if (utf8_target) { + goto utf8_nspace; + } + /* FALL THROUGH */ + case NSPACEA: + if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) { + sayNO; + } + goto increment_locinput; + + case SPACEU: + utf8_space: + if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) { + sayNO; + } + goto increment_locinput; + + case NSPACEU: + utf8_nspace: + if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) { + sayNO; + } + goto increment_locinput; CCC_TRY(DIGIT, NDIGIT, isDIGIT, DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8, @@ -6902,10 +6959,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_space: - LOAD_UTF8_CHARCLASS_SPACE(); - while (hardcount < max && scan < loceol && - (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + while (hardcount < max && scan < loceol + && is_XPERLSPACE_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; @@ -6955,10 +7010,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_Nspace: - LOAD_UTF8_CHARCLASS_SPACE(); - while (hardcount < max && scan < loceol && - ! (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + while (hardcount < max && scan < loceol + && ! is_XPERLSPACE_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; |