diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2000-10-24 02:55:33 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-10-24 02:55:33 +0000 |
commit | ba210ebec161cde003bc967e8e460c72f71fb70c (patch) | |
tree | 7eefd78e8e365cbf64ddf49314681d17b83c3025 /regexec.c | |
parent | 177b92d2814bfc842f28f277e0a2f353c652a5e3 (diff) | |
download | perl-ba210ebec161cde003bc967e8e460c72f71fb70c.tar.gz |
Make the UTF-8 decoding stricter and more verbose when
a malformation is encountered. This involved adding an argument
to utf8_to_uv_chk(), which meant changing its prototype,
and preferring STRLEN over I32 for the UTF-8 length, which as
a domino effect necessitated changing the prototypes of
scan_bin(), scan_oct(), scan_hex(), and reg_uni().
The stricter UTF-8 decoding check uses Markus Kuhn's
UTF-8 Decode Stress Tester from
http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
p4raw-id: //depot/perl@7416
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 13 |
1 files changed, 9 insertions, 4 deletions
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + strend - s, + 0, 0) : '\n'; tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUNDUTF8 ? @@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + strend - s, + 0, 0) : '\n'; tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUNDUTF8 ? @@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ? + if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) { @@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog) case NBOUNDUTF8: /* was last char in word? */ ln = (locinput != PL_regbol) - ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev; + ? utf8_to_uv_chk(reghop((U8*)locinput, -1), + PL_regeol - locinput, 0, 0) : PL_regprev; if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { ln = isALNUM_uni(ln); n = swash_fetch(PL_utf8_alnum, (U8*)locinput); |