summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c8
-rw-r--r--embed.h2
-rwxr-xr-xembed.pl2
-rw-r--r--objXSUB.h12
-rw-r--r--proto.h2
-rw-r--r--regcomp.c103
-rw-r--r--regexec.c65
-rwxr-xr-xt/op/pat.t114
-rw-r--r--utf8.c71
9 files changed, 242 insertions, 137 deletions
diff --git a/doop.c b/doop.c
index d7baecc792..755cbfd16a 100644
--- a/doop.c
+++ b/doop.c
@@ -344,7 +344,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
}
while (s < send) {
- if ((uv = swash_fetch(rv, s)) < none) {
+ if ((uv = swash_fetch(rv, s, TRUE)) < none) {
s += UTF8SKIP(s);
matches++;
d = uvuni_to_utf8(d, uv);
@@ -423,7 +423,7 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
send = s + len;
while (s < send) {
- if ((uv = swash_fetch(rv, s)) < none || uv == extra)
+ if ((uv = swash_fetch(rv, s, TRUE)) < none || uv == extra)
matches++;
s += UTF8SKIP(s);
}
@@ -491,7 +491,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
if (squash) {
UV puv = 0xfeedface;
while (s < send) {
- uv = swash_fetch(rv, s);
+ uv = swash_fetch(rv, s, TRUE);
if (d > dend) {
STRLEN clen = d - dstart;
@@ -546,7 +546,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else {
while (s < send) {
- uv = swash_fetch(rv, s);
+ uv = swash_fetch(rv, s, TRUE);
if (d > dend) {
STRLEN clen = d - dstart;
STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
diff --git a/embed.h b/embed.h
index 887e9eb66d..dd5d658969 100644
--- a/embed.h
+++ b/embed.h
@@ -2216,7 +2216,7 @@
#define sv_vsetpvfn(a,b,c,d,e,f,g) Perl_sv_vsetpvfn(aTHX_ a,b,c,d,e,f,g)
#define str_to_version(a) Perl_str_to_version(aTHX_ a)
#define swash_init(a,b,c,d,e) Perl_swash_init(aTHX_ a,b,c,d,e)
-#define swash_fetch(a,b) Perl_swash_fetch(aTHX_ a,b)
+#define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c)
#define taint_env() Perl_taint_env(aTHX)
#define taint_proper(a,b) Perl_taint_proper(aTHX_ a,b)
#define to_utf8_lower(a) Perl_to_utf8_lower(aTHX_ a)
diff --git a/embed.pl b/embed.pl
index 9e272b8572..fcaaaed90a 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2089,7 +2089,7 @@ Apd |void |sv_vsetpvfn |SV* sv|const char* pat|STRLEN patlen \
Ap |NV |str_to_version |SV *sv
Ap |SV* |swash_init |char* pkg|char* name|SV* listsv \
|I32 minbits|I32 none
-Ap |UV |swash_fetch |SV *sv|U8 *ptr
+Ap |UV |swash_fetch |SV *sv|U8 *ptr|bool do_utf8
Ap |void |taint_env
Ap |void |taint_proper |const char* f|const char* s
Ap |UV |to_utf8_lower |U8 *p
diff --git a/objXSUB.h b/objXSUB.h
index 99d9a3ea26..28bed782f5 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -579,6 +579,10 @@
#define Perl_init_stacks pPerl->Perl_init_stacks
#undef init_stacks
#define init_stacks Perl_init_stacks
+#undef Perl_init_tm
+#define Perl_init_tm pPerl->Perl_init_tm
+#undef init_tm
+#define init_tm Perl_init_tm
#undef Perl_instr
#define Perl_instr pPerl->Perl_instr
#undef instr
@@ -857,6 +861,10 @@
#define Perl_mg_size pPerl->Perl_mg_size
#undef mg_size
#define mg_size Perl_mg_size
+#undef Perl_mini_mktime
+#define Perl_mini_mktime pPerl->Perl_mini_mktime
+#undef mini_mktime
+#define mini_mktime Perl_mini_mktime
#undef Perl_moreswitches
#define Perl_moreswitches pPerl->Perl_moreswitches
#undef moreswitches
@@ -927,6 +935,10 @@
#define Perl_my_stat pPerl->Perl_my_stat
#undef my_stat
#define my_stat Perl_my_stat
+#undef Perl_my_strftime
+#define Perl_my_strftime pPerl->Perl_my_strftime
+#undef my_strftime
+#define my_strftime Perl_my_strftime
#if defined(MYSWAP)
#undef Perl_my_swap
#define Perl_my_swap pPerl->Perl_my_swap
diff --git a/proto.h b/proto.h
index 63fc5187f2..cc4050d86e 100644
--- a/proto.h
+++ b/proto.h
@@ -808,7 +808,7 @@ PERL_CALLCONV void Perl_sv_vcatpvfn(pTHX_ SV* sv, const char* pat, STRLEN patlen
PERL_CALLCONV void Perl_sv_vsetpvfn(pTHX_ SV* sv, const char* pat, STRLEN patlen, va_list* args, SV** svargs, I32 svmax, bool *maybe_tainted);
PERL_CALLCONV NV Perl_str_to_version(pTHX_ SV *sv);
PERL_CALLCONV SV* Perl_swash_init(pTHX_ char* pkg, char* name, SV* listsv, I32 minbits, I32 none);
-PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr);
+PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8);
PERL_CALLCONV void Perl_taint_env(pTHX);
PERL_CALLCONV void Perl_taint_proper(pTHX_ const char* f, const char* s);
PERL_CALLCONV UV Perl_to_utf8_lower(pTHX_ U8 *p);
diff --git a/regcomp.c b/regcomp.c
index 1cc3a984e1..20388f1350 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2799,11 +2799,12 @@ tryagain:
break;
case 'p':
case 'P':
- { /* a lovely hack--pretend we saw [\pX] instead */
+ {
char* oldregxend = RExC_end;
char* parse_start = RExC_parse;
if (RExC_parse[1] == '{') {
+ /* a lovely hack--pretend we saw [\pX] instead */
RExC_end = strchr(RExC_parse, '}');
if (!RExC_end) {
RExC_parse += 2;
@@ -3259,7 +3260,7 @@ STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
- register IV lastvalue = OOB_UNICODE;
+ register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
STRLEN numlen;
@@ -3270,7 +3271,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
register char *e;
char *parse_start = RExC_parse; /* MJD */
UV n;
- bool dont_optimize_invert = FALSE;
+ bool optimize_invert = TRUE;
ret = reganode(pRExC_state, ANYOF, 0);
@@ -3312,8 +3313,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
rangebegin = RExC_parse;
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
+ RExC_end - RExC_parse,
+ &numlen, 0);
RExC_parse += numlen;
}
else
@@ -3423,14 +3424,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
- if (lastvalue < 256) {
- ANYOF_BITMAP_SET(ret, lastvalue);
+ if (prevvalue < 256) {
+ ANYOF_BITMAP_SET(ret, prevvalue);
ANYOF_BITMAP_SET(ret, '-');
}
else {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
Perl_sv_catpvf(aTHX_ listsv,
- "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-');
+ "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
}
}
@@ -3438,6 +3439,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
if (!SIZE_ONLY) {
+ if (namedclass > OOB_NAMEDCLASS)
+ optimize_invert = FALSE;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
@@ -3451,7 +3454,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
@@ -3462,7 +3464,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
case ANYOF_ALNUMC:
@@ -3473,7 +3474,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
@@ -3484,7 +3484,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
@@ -3495,7 +3494,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
@@ -3506,7 +3504,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
@@ -3529,7 +3526,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
#endif /* EBCDIC */
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
@@ -3552,7 +3548,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
#endif /* EBCDIC */
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
@@ -3563,7 +3558,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
@@ -3574,7 +3568,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
@@ -3585,7 +3578,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
@@ -3596,7 +3588,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
break;
case ANYOF_DIGIT:
@@ -3607,7 +3598,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = '0'; value <= '9'; value++)
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
break;
case ANYOF_NDIGIT:
@@ -3620,7 +3610,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = '9' + 1; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
@@ -3631,7 +3620,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
@@ -3642,7 +3630,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
@@ -3653,7 +3640,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
@@ -3664,7 +3650,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
@@ -3675,7 +3660,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
@@ -3686,7 +3670,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
@@ -3697,7 +3680,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
@@ -3708,7 +3690,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
@@ -3719,7 +3700,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
@@ -3730,7 +3710,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
break;
case ANYOF_SPACE:
@@ -3741,7 +3720,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
break;
case ANYOF_NSPACE:
@@ -3752,7 +3730,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
@@ -3763,7 +3740,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
@@ -3774,7 +3750,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
@@ -3785,7 +3760,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
@@ -3796,7 +3770,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
@@ -3810,17 +3783,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
} /* end of namedclass \blah */
if (range) {
- if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
- ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
+ if (((prevvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+ ((NATIVE_TO_UNI(prevvalue) > NATIVE_TO_UNI(value)) &&
+ (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
+ range = 0; /* not a valid range */
}
- range = 0; /* not a true range */
}
else {
- lastvalue = value; /* save the beginning of the range */
+ prevvalue = value; /* save the beginning of the range */
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
@@ -3843,42 +3817,45 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
/* now is the next time */
if (!SIZE_ONLY) {
- if (lastvalue < 256 && value < 256) {
-#ifdef EBCDIC /* EBCDIC, for example. */
- if (PL_hints & HINT_RE_ASCIIR) {
- IV i;
+ IV i;
+
+ if (prevvalue < 256) {
+ IV ceilvalue = value < 256 ? value : 255;
+
+#ifdef EBCDIC
/* New style scheme for ranges:
- * after :
* use re 'asciir';
* do ranges in ASCII/Unicode space
*/
- for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
- ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+ for (i = NATIVE_TO_ASCII(prevvalue);
+ i <= NATIVE_TO_ASCII(ceilvalue);
+ i++)
+ ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
}
- else if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
+ else if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
+ (isUPPER(prevvalue) && isUPPER(ceilvalue)))
{
- IV i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
+ if (isLOWER(prevvalue)) {
+ for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i))
ANYOF_BITMAP_SET(ret, i);
} else {
- for (i = lastvalue; i <= value; i++)
+ for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i))
ANYOF_BITMAP_SET(ret, i);
}
}
else
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
- } else {
+ for (i = prevvalue; i <= ceilvalue; i++)
+ ANYOF_BITMAP_SET(ret, i);
+ }
+ if (value > 255) {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
- if (lastvalue < value)
+ if (prevvalue < value)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- else
+ (UV)prevvalue, (UV)value);
+ else if (prevvalue == value)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
(UV)value);
}
@@ -3912,7 +3889,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && !dont_optimize_invert &&
+ if (!SIZE_ONLY && optimize_invert &&
/* If the only flag is inversion. */
(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
@@ -4448,7 +4425,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
for (i = 0; i <= 256; i++) { /* just the first 256 */
U8 *e = uvchr_to_utf8(s, i);
- if (i < 256 && swash_fetch(sw, s)) {
+ if (i < 256 && swash_fetch(sw, s, TRUE)) {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
diff --git a/regexec.c b/regexec.c
index e358d63d5f..c9096f0597 100644
--- a/regexec.c
+++ b/regexec.c
@@ -958,7 +958,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (tmp == !(OP(c) == BOUND ?
- swash_fetch(PL_utf8_alnum, (U8*)s) :
+ swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
{
tmp = !tmp;
@@ -1001,7 +1001,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (tmp == !(OP(c) == NBOUND ?
- swash_fetch(PL_utf8_alnum, (U8*)s) :
+ swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
tmp = !tmp;
else if ((norun || regtry(prog, s)))
@@ -1029,7 +1029,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
- if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
+ if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -1087,7 +1087,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
- if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
+ if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -1145,7 +1145,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(space," ");
while (s < strend) {
- if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
+ if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -1203,7 +1203,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(space," ");
while (s < strend) {
- if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
+ if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -1261,7 +1261,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
while (s < strend) {
- if (swash_fetch(PL_utf8_digit,(U8*)s)) {
+ if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -1319,7 +1319,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
while (s < strend) {
- if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
+ if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -2214,7 +2214,7 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
if (do_utf8) {
if (!(OP(scan) == ALNUM
- ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
+ ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
: isALNUM_LC_utf8((U8*)locinput)))
{
sayNO;
@@ -2237,7 +2237,7 @@ S_regmatch(pTHX_ regnode *prog)
if (do_utf8) {
LOAD_UTF8_CHARCLASS(alnum,"a");
if (OP(scan) == NALNUM
- ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
+ ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
: isALNUM_LC_utf8((U8*)locinput))
{
sayNO;
@@ -2269,7 +2269,7 @@ S_regmatch(pTHX_ regnode *prog)
if (OP(scan) == BOUND || OP(scan) == NBOUND) {
ln = isALNUM_uni(ln);
LOAD_UTF8_CHARCLASS(alnum,"a");
- n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+ n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
}
else {
ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
@@ -2302,7 +2302,7 @@ S_regmatch(pTHX_ regnode *prog)
if (UTF8_IS_CONTINUED(nextchr)) {
LOAD_UTF8_CHARCLASS(space," ");
if (!(OP(scan) == SPACE
- ? swash_fetch(PL_utf8_space, (U8*)locinput)
+ ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
: isSPACE_LC_utf8((U8*)locinput)))
{
sayNO;
@@ -2332,7 +2332,7 @@ S_regmatch(pTHX_ regnode *prog)
if (do_utf8) {
LOAD_UTF8_CHARCLASS(space," ");
if (OP(scan) == NSPACE
- ? swash_fetch(PL_utf8_space, (U8*)locinput)
+ ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
: isSPACE_LC_utf8((U8*)locinput))
{
sayNO;
@@ -2355,7 +2355,7 @@ S_regmatch(pTHX_ regnode *prog)
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
if (!(OP(scan) == DIGIT
- ? swash_fetch(PL_utf8_digit, (U8*)locinput)
+ ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
: isDIGIT_LC_utf8((U8*)locinput)))
{
sayNO;
@@ -2378,7 +2378,7 @@ S_regmatch(pTHX_ regnode *prog)
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
if (OP(scan) == NDIGIT
- ? swash_fetch(PL_utf8_digit, (U8*)locinput)
+ ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
: isDIGIT_LC_utf8((U8*)locinput))
{
sayNO;
@@ -2394,10 +2394,12 @@ S_regmatch(pTHX_ regnode *prog)
break;
case CLUMP:
LOAD_UTF8_CHARCLASS(mark,"~");
- if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
+ if (locinput >= PL_regeol ||
+ swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
sayNO;
locinput += PL_utf8skip[nextchr];
- while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
+ while (locinput < PL_regeol &&
+ swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
locinput += UTF8SKIP(locinput);
if (locinput > PL_regeol)
sayNO;
@@ -3623,7 +3625,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(alnum,"a");
while (hardcount < max && scan < loceol &&
- swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3651,7 +3653,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(alnum,"a");
while (hardcount < max && scan < loceol &&
- !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3679,7 +3681,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(space," ");
while (hardcount < max && scan < loceol &&
- (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+ (*scan == ' ' ||
+ swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3707,7 +3710,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(space," ");
while (hardcount < max && scan < loceol &&
- !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+ !(*scan == ' ' ||
+ swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3735,7 +3739,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(digit,"0");
while (hardcount < max && scan < loceol &&
- swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3749,7 +3753,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = PL_regeol;
LOAD_UTF8_CHARCLASS(digit,"0");
while (hardcount < max && scan < loceol &&
- !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3879,25 +3883,22 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
char flags = ANYOF_FLAGS(n);
bool match = FALSE;
UV c;
- STRLEN len;
+ STRLEN len = 0;
- if (do_utf8)
- c = utf8_to_uvchr(p, &len);
- else
- c = *p;
+ c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
if (do_utf8 || (flags & ANYOF_UNICODE)) {
if (do_utf8 && !ANYOF_RUNTIME(n)) {
if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
match = TRUE;
}
- if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
+ if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
match = TRUE;
if (!match) {
SV *sw = regclass_swash(n, TRUE, 0);
if (sw) {
- if (swash_fetch(sw, p))
+ if (swash_fetch(sw, p, do_utf8))
match = TRUE;
else if (flags & ANYOF_FOLD) {
U8 tmpbuf[UTF8_MAXLEN+1];
@@ -3908,7 +3909,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
}
else
uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
- if (swash_fetch(sw, tmpbuf))
+ if (swash_fetch(sw, tmpbuf, do_utf8))
match = TRUE;
}
}
@@ -3918,7 +3919,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if (flags & ANYOF_FOLD) {
- I32 f;
+ I32 f;
if (flags & ANYOF_LOCALE) {
PL_reg_flags |= RF_tainted;
diff --git a/t/op/pat.t b/t/op/pat.t
index 9130454dcb..1be72346f8 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -5,7 +5,8 @@
# that does fit that format, add it to op/re_tests, not here.
$| = 1;
-print "1..587\n";
+
+print "1..615\n";
BEGIN {
chdir 't' if -d 't';
@@ -1302,6 +1303,7 @@ print "ok 247\n";
{
# the second half of 20001028.003
+ my $X = '';
$X =~ s/^/chr(1488)/e;
print "not " unless length $X == 1 && ord($X) == 1488;
print "ok 260\n";
@@ -1353,10 +1355,11 @@ print "ok 247\n";
"\0" => 'Cc',
);
- for my $char (keys %s) {
+ for my $char (map { s/^\S+ //; $_ }
+ sort map { sprintf("%06x", ord($_))." $_" } keys %s) {
my $class = $s{$char};
- my $code = sprintf("%04x", ord($char));
- printf "# 0x$code\n";
+ my $code = sprintf("%06x", ord($char));
+ printf "#\n# 0x$code\n#\n";
print "# IsAlpha\n";
if ($class =~ /^[LM]/) {
print "not " unless $char =~ /\p{IsAlpha}/;
@@ -1382,7 +1385,7 @@ print "ok 247\n";
print "ok $test\n"; $test++;
}
print "# IsASCII\n";
- if ($code <= 127) {
+ if ($code le '00007f') {
print "not " unless $char =~ /\p{IsASCII}/;
print "ok $test\n"; $test++;
print "not " if $char =~ /\P{IsASCII}/;
@@ -1583,3 +1586,104 @@ EOT
print "not " unless ord($x) == 0x12345678 && length($x) == 1;
print "ok 587\n";
}
+
+{
+ my $x = "\x7f";
+
+ print "not " if $x =~ /[\x80-\xff]/;
+ print "ok 588\n";
+
+ print "not " if $x =~ /[\x80-\x{100}]/;
+ print "ok 589\n";
+
+ print "not " if $x =~ /[\x{100}]/;
+ print "ok 590\n";
+
+ print "not " if $x =~ /\p{InLatin1Supplement}/;
+ print "ok 591\n";
+
+ print "not " unless $x =~ /\P{InLatin1Supplement}/;
+ print "ok 592\n";
+
+ print "not " if $x =~ /\p{InLatinExtendedA}/;
+ print "ok 593\n";
+
+ print "not " unless $x =~ /\P{InLatinExtendedA}/;
+ print "ok 594\n";
+}
+
+{
+ my $x = "\x80";
+
+ print "not " unless $x =~ /[\x80-\xff]/;
+ print "ok 595\n";
+
+ print "not " unless $x =~ /[\x80-\x{100}]/;
+ print "ok 596\n";
+
+ print "not " if $x =~ /[\x{100}]/;
+ print "ok 597\n";
+
+ print "not " unless $x =~ /\p{InLatin1Supplement}/;
+ print "ok 598\n";
+
+ print "not " if $x =~ /\P{InLatin1Supplement}/;
+ print "ok 599\n";
+
+ print "not " if $x =~ /\p{InLatinExtendedA}/;
+ print "ok 600\n";
+
+ print "not " unless $x =~ /\P{InLatinExtendedA}/;
+ print "ok 601\n";
+}
+
+{
+ my $x = "\xff";
+
+ print "not " unless $x =~ /[\x80-\xff]/;
+ print "ok 602\n";
+
+ print "not " unless $x =~ /[\x80-\x{100}]/;
+ print "ok 603\n";
+
+ print "not " if $x =~ /[\x{100}]/;
+ print "ok 604\n";
+
+ print "not " unless $x =~ /\p{InLatin1Supplement}/;
+ print "ok 605\n";
+
+ print "not " if $x =~ /\P{InLatin1Supplement}/;
+ print "ok 606\n";
+
+ print "not " if $x =~ /\p{InLatinExtendedA}/;
+ print "ok 607\n";
+
+ print "not " unless $x =~ /\P{InLatinExtendedA}/;
+ print "ok 608\n";
+}
+
+{
+ my $x = "\x{100}";
+
+ print "not " if $x =~ /[\x80-\xff]/;
+ print "ok 609\n";
+
+ print "not " unless $x =~ /[\x80-\x{100}]/;
+ print "ok 610\n";
+
+ print "not " unless $x =~ /[\x{100}]/;
+ print "ok 611\n";
+
+ print "not " if $x =~ /\p{InLatin1Supplement}/;
+ print "ok 612\n";
+
+ print "not " unless $x =~ /\P{InLatin1Supplement}/;
+ print "ok 613\n";
+
+ print "not " unless $x =~ /\p{InLatinExtendedA}/;
+ print "ok 614\n";
+
+ print "not " if $x =~ /\P{InLatinExtendedA}/;
+ print "ok 615\n";
+}
+
diff --git a/utf8.c b/utf8.c
index fda9920933..b682cf65ca 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1045,13 +1045,13 @@ Perl_is_utf8_alnum(pTHX_ U8 *p)
* descendant of isalnum(3), in other words, it doesn't
* contain the '_'. --jhi */
PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_alnum, p);
+ return swash_fetch(PL_utf8_alnum, p, TRUE);
/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "",
sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
- return swash_fetch(PL_utf8_alnum, p);
+ return swash_fetch(PL_utf8_alnum, p, TRUE);
#endif
}
@@ -1062,13 +1062,13 @@ Perl_is_utf8_alnumc(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_alnum, p);
+ return swash_fetch(PL_utf8_alnum, p, TRUE);
/* return is_utf8_alpha(p) || is_utf8_digit(p); */
#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
if (!PL_utf8_alnum)
PL_utf8_alnum = swash_init("utf8", "",
sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
- return swash_fetch(PL_utf8_alnum, p);
+ return swash_fetch(PL_utf8_alnum, p, TRUE);
#endif
}
@@ -1085,7 +1085,7 @@ Perl_is_utf8_alpha(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_alpha)
PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_alpha, p);
+ return swash_fetch(PL_utf8_alpha, p, TRUE);
}
bool
@@ -1095,7 +1095,7 @@ Perl_is_utf8_ascii(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_ascii)
PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_ascii, p);
+ return swash_fetch(PL_utf8_ascii, p, TRUE);
}
bool
@@ -1105,7 +1105,7 @@ Perl_is_utf8_space(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_space)
PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_space, p);
+ return swash_fetch(PL_utf8_space, p, TRUE);
}
bool
@@ -1115,7 +1115,7 @@ Perl_is_utf8_digit(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_digit)
PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_digit, p);
+ return swash_fetch(PL_utf8_digit, p, TRUE);
}
bool
@@ -1125,7 +1125,7 @@ Perl_is_utf8_upper(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_upper)
PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_upper, p);
+ return swash_fetch(PL_utf8_upper, p, TRUE);
}
bool
@@ -1135,7 +1135,7 @@ Perl_is_utf8_lower(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_lower)
PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_lower, p);
+ return swash_fetch(PL_utf8_lower, p, TRUE);
}
bool
@@ -1145,7 +1145,7 @@ Perl_is_utf8_cntrl(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_cntrl)
PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_cntrl, p);
+ return swash_fetch(PL_utf8_cntrl, p, TRUE);
}
bool
@@ -1155,7 +1155,7 @@ Perl_is_utf8_graph(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_graph)
PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_graph, p);
+ return swash_fetch(PL_utf8_graph, p, TRUE);
}
bool
@@ -1165,7 +1165,7 @@ Perl_is_utf8_print(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_print)
PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_print, p);
+ return swash_fetch(PL_utf8_print, p, TRUE);
}
bool
@@ -1175,7 +1175,7 @@ Perl_is_utf8_punct(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_punct)
PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_punct, p);
+ return swash_fetch(PL_utf8_punct, p, TRUE);
}
bool
@@ -1185,7 +1185,7 @@ Perl_is_utf8_xdigit(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_xdigit)
PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_xdigit, p);
+ return swash_fetch(PL_utf8_xdigit, p, TRUE);
}
bool
@@ -1195,7 +1195,7 @@ Perl_is_utf8_mark(pTHX_ U8 *p)
return FALSE;
if (!PL_utf8_mark)
PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
- return swash_fetch(PL_utf8_mark, p);
+ return swash_fetch(PL_utf8_mark, p, TRUE);
}
UV
@@ -1205,7 +1205,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
if (!PL_utf8_toupper)
PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_toupper, p);
+ uv = swash_fetch(PL_utf8_toupper, p, TRUE);
return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
@@ -1216,7 +1216,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
if (!PL_utf8_totitle)
PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_totitle, p);
+ uv = swash_fetch(PL_utf8_totitle, p, TRUE);
return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
@@ -1227,7 +1227,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
if (!PL_utf8_tolower)
PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
- uv = swash_fetch(PL_utf8_tolower, p);
+ uv = swash_fetch(PL_utf8_tolower, p, TRUE);
return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
}
@@ -1282,21 +1282,31 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
}
UV
-Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
+Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
{
HV* hv = (HV*)SvRV(sv);
- /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
- then the "swatch" is a vec() for al the chars which start
- with 0xAA..0xYY
- So the key in the hash is length of encoded char -1
- */
- U32 klen = UTF8SKIP(ptr) - 1;
- U32 off = ptr[klen];
+ U32 klen;
+ U32 off;
STRLEN slen;
STRLEN needents;
U8 *tmps;
U32 bit;
SV *retval;
+ U8 tmputf8[2];
+ UV c = NATIVE_TO_ASCII(*ptr);
+
+ if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
+ tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
+ tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
+ ptr = tmputf8;
+ }
+ /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
+ * then the "swatch" is a vec() for al the chars which start
+ * with 0xAA..0xYY
+ * So the key in the hash (klen) is length of encoded char -1
+ */
+ klen = UTF8SKIP(ptr) - 1;
+ off = ptr[klen];
if (klen == 0)
{
@@ -1322,9 +1332,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
* NB: this code assumes that swatches are never modified, once generated!
*/
- if (hv == PL_last_swash_hv &&
+ if (hv == PL_last_swash_hv &&
klen == PL_last_swash_klen &&
- (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
+ (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
{
tmps = PL_last_swash_tmps;
slen = PL_last_swash_slen;
@@ -1348,7 +1358,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
EXTEND(SP,3);
PUSHs((SV*)sv);
/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
- PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
+ PUSHs(sv_2mortal(newSViv((klen) ?
+ (code_point & ~(needents - 1)) : 0)));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))