diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2000-10-24 02:55:33 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-10-24 02:55:33 +0000 |
commit | ba210ebec161cde003bc967e8e460c72f71fb70c (patch) | |
tree | 7eefd78e8e365cbf64ddf49314681d17b83c3025 /regcomp.c | |
parent | 177b92d2814bfc842f28f277e0a2f353c652a5e3 (diff) | |
download | perl-ba210ebec161cde003bc967e8e460c72f71fb70c.tar.gz |
Make the UTF-8 decoding stricter, and more verbose when
malformed input is encountered. This involved adding an argument
to utf8_to_uv_chk(), which meant changing its prototype,
and preferring STRLEN over I32 for the UTF-8 length, which as
a domino effect necessitated changing the prototypes of
scan_bin(), scan_oct(), scan_hex(), and reguni().
The stricter UTF-8 decoding checks use Markus Kuhn's
UTF-8 Decode Stress Tester from
http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
p4raw-id: //depot/perl@7416
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 21 |
1 files changed, 13 insertions, 8 deletions
@@ -2742,11 +2742,11 @@ tryagain: /* FALL THROUGH */ default: { - register I32 len; + register STRLEN len; register UV ender; register char *p; char *oldp, *s; - I32 numlen; + STRLEN numlen; PL_regcomp_parse++; @@ -2884,7 +2884,8 @@ tryagain: default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv_chk((U8*)p, &numlen, 0); + ender = utf8_to_uv_chk((U8*)p, PL_regxend - p, + &numlen, 0); p += numlen; } else @@ -3128,7 +3129,7 @@ S_regclass(pTHX) register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 namedclass; char *rangebegin; bool need_class = 0; @@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX) register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 n; SV *listsv; U8 flags = 0; @@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); + value = utf8_to_uv_chk((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); + value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; /* Some compilers cannot handle switching on 64-bit integer * values, therefore value cannot be an UV. Yes, this will @@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg) - reguni - emit (if appropriate) a Unicode character */ STATIC void -S_reguni(pTHX_ UV uv, char* s, I32* lenp) +S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp) { dTHR; if (SIZE_ONLY) { |