summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
commitba210ebec161cde003bc967e8e460c72f71fb70c (patch)
tree7eefd78e8e365cbf64ddf49314681d17b83c3025 /regcomp.c
parent177b92d2814bfc842f28f277e0a2f353c652a5e3 (diff)
downloadperl-ba210ebec161cde003bc967e8e460c72f71fb70c.tar.gz
Make the UTF-8 decoding stricter and more verbose when
malformation happens. This involved adding an argument to utf8_to_uv_chk(), which involved changing its prototype, and prefer STRLEN over I32 for the UTF-8 length, which as a domino effect necessitated changing the prototypes of scan_bin(), scan_oct(), scan_hex(), and reg_uni(). The stricter UTF-8 decoding checking uses Markus Kuhn's UTF-8 Decode Stress Tester from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt p4raw-id: //depot/perl@7416
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c21
1 files changed, 13 insertions, 8 deletions
diff --git a/regcomp.c b/regcomp.c
index e7042eaa4e..3f2b10c8fe 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
/* FALL THROUGH */
default: {
- register I32 len;
+ register STRLEN len;
register UV ender;
register char *p;
char *oldp, *s;
- I32 numlen;
+ STRLEN numlen;
PL_regcomp_parse++;
@@ -2884,7 +2884,8 @@ tryagain:
default:
normal_default:
if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+ ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+ &numlen, 0);
p += numlen;
}
else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
register I32 lastvalue = OOB_CHAR8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 namedclass;
char *rangebegin;
bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
register U32 lastvalue = OOB_UTF8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 n;
SV *listsv;
U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
namedclass = OOB_NAMEDCLASS;
if (!range)
rangebegin = PL_regcomp_parse;
- value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
if (value == '[')
namedclass = regpposixcc(value);
else if (value == '\\') {
- value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
- reguni - emit (if appropriate) a Unicode character
*/
STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
{
dTHR;
if (SIZE_ONLY) {