summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-03-10 21:38:30 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-03-10 21:38:30 +0000
commit: 8269fa76d2972b02e844f46a88d03e7d25fb51d7 (patch)
tree: 65fc9c94b96296d6632f2220d37b85b4cd420012
parent: 59ff52dd9c25cab6bcb0d8154a334ce53f7385af (diff)
download: perl-8269fa76d2972b02e844f46a88d03e7d25fb51d7.tar.gz
Fix for ID 20010306.008, UTF-8 and \w without 'use utf8' coredump.
p4raw-id: //depot/perl@9098
-rw-r--r--  regcomp.c   18
-rw-r--r--  regexec.c   24
-rwxr-xr-x  t/op/pat.t  14
3 files changed, 34 insertions, 22 deletions
diff --git a/regcomp.c b/regcomp.c
index 2e5aaf33f0..227737cfb5 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2648,22 +2648,16 @@ tryagain:
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_mark)
- is_utf8_mark((U8*)"~"); /* preload table */
break;
case 'w':
ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'W':
ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'b':
RExC_seen_zerolen++;
@@ -2671,8 +2665,6 @@ tryagain:
ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'B':
RExC_seen_zerolen++;
@@ -2680,36 +2672,26 @@ tryagain:
ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 's':
ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'S':
ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'd':
ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'D':
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'p':
case 'P':
diff --git a/regexec.c b/regexec.c
index 1fa26c9d96..a7b6411777 100644
--- a/regexec.c
+++ b/regexec.c
@@ -123,8 +123,9 @@
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
-static void restore_pos(pTHXo_ void *arg);
+#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
+static void restore_pos(pTHXo_ void *arg);
STATIC CHECKPOINT
S_regcppush(pTHX_ I32 parenfloor)
@@ -953,6 +954,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (tmp == !(OP(c) == BOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s) :
@@ -995,6 +997,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (tmp == !(OP(c) == NBOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s) :
@@ -1023,6 +1026,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case ALNUM:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
@@ -1080,6 +1084,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case NALNUM:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (s < strend) {
if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
@@ -1137,6 +1142,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case SPACE:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(space," ");
while (s < strend) {
if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
@@ -1194,6 +1200,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case NSPACE:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(space," ");
while (s < strend) {
if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
if (tmp && (norun || regtry(prog, s)))
@@ -1251,6 +1258,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case DIGIT:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(digit,"0");
while (s < strend) {
if (swash_fetch(PL_utf8_digit,(U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
@@ -1308,6 +1316,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
break;
case NDIGIT:
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(digit,"0");
while (s < strend) {
if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
@@ -2225,6 +2234,7 @@ S_regmatch(pTHX_ regnode *prog)
if (!nextchr && locinput >= PL_regeol)
sayNO;
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(alnum,"a");
if (OP(scan) == NALNUM
? swash_fetch(PL_utf8_alnum, (U8*)locinput)
: isALNUM_LC_utf8((U8*)locinput))
@@ -2257,6 +2267,7 @@ S_regmatch(pTHX_ regnode *prog)
}
if (OP(scan) == BOUND || OP(scan) == NBOUND) {
ln = isALNUM_uni(ln);
+ LOAD_UTF8_CHARCLASS(alnum,"a");
n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
}
else {
@@ -2288,6 +2299,7 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
if (do_utf8) {
if (UTF8_IS_CONTINUED(nextchr)) {
+ LOAD_UTF8_CHARCLASS(space," ");
if (!(OP(scan) == SPACE
? swash_fetch(PL_utf8_space, (U8*)locinput)
: isSPACE_LC_utf8((U8*)locinput)))
@@ -2317,6 +2329,7 @@ S_regmatch(pTHX_ regnode *prog)
if (!nextchr && locinput >= PL_regeol)
sayNO;
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(space," ");
if (OP(scan) == NSPACE
? swash_fetch(PL_utf8_space, (U8*)locinput)
: isSPACE_LC_utf8((U8*)locinput))
@@ -2339,6 +2352,7 @@ S_regmatch(pTHX_ regnode *prog)
if (!nextchr)
sayNO;
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(digit,"0");
if (!(OP(scan) == DIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput)))
@@ -2361,6 +2375,7 @@ S_regmatch(pTHX_ regnode *prog)
if (!nextchr && locinput >= PL_regeol)
sayNO;
if (do_utf8) {
+ LOAD_UTF8_CHARCLASS(digit,"0");
if (OP(scan) == NDIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput))
@@ -2377,6 +2392,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(++locinput);
break;
case CLUMP:
+ LOAD_UTF8_CHARCLASS(mark,"~");
if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
sayNO;
locinput += PL_utf8skip[nextchr];
@@ -3598,6 +3614,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case ALNUM:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (hardcount < max && scan < loceol &&
swash_fetch(PL_utf8_alnum, (U8*)scan)) {
scan += UTF8SKIP(scan);
@@ -3625,6 +3642,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case NALNUM:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(alnum,"a");
while (hardcount < max && scan < loceol &&
!swash_fetch(PL_utf8_alnum, (U8*)scan)) {
scan += UTF8SKIP(scan);
@@ -3652,6 +3670,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case SPACE:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(space," ");
while (hardcount < max && scan < loceol &&
(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
scan += UTF8SKIP(scan);
@@ -3679,6 +3698,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case NSPACE:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(space," ");
while (hardcount < max && scan < loceol &&
!(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
scan += UTF8SKIP(scan);
@@ -3706,6 +3726,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case DIGIT:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(digit,"0");
while (hardcount < max && scan < loceol &&
swash_fetch(PL_utf8_digit,(U8*)scan)) {
scan += UTF8SKIP(scan);
@@ -3719,6 +3740,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case NDIGIT:
if (do_utf8) {
loceol = PL_regeol;
+ LOAD_UTF8_CHARCLASS(digit,"0");
while (hardcount < max && scan < loceol &&
!swash_fetch(PL_utf8_digit,(U8*)scan)) {
scan += UTF8SKIP(scan);
diff --git a/t/op/pat.t b/t/op/pat.t
index 2d862732fe..a82da60e72 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
# the format supported by op/regexp.t. If you want to add a test
# that does fit that format, add it to op/re_tests, not here.
-print "1..580\n";
+print "1..581\n";
BEGIN {
chdir 't' if -d 't';
@@ -1238,8 +1238,6 @@ print "ok 247\n";
{
# bug id 20001008.001
- use utf8; # BUG - should not be needed, but is, otherwise core dump
-
my $test = 248;
my @x = ("stra\337e 138","stra\337e 138");
for (@x) {
@@ -1537,3 +1535,13 @@ print "ok 247\n";
for (576..580) { print "not ok $_\n" }
}
}
+
+{
+ # bug id 20010306.008
+
+ $a = "a\x{1234}";
+ # The original bug report had 'no utf8' here but that was irrelevant.
+ $a =~ m/\w/; # used to core dump
+
+ print "ok 581\n";
+}