diff options
author | David Mitchell <davem@iabyn.com> | 2012-09-21 10:29:04 +0100 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2012-09-26 09:41:10 +0100 |
commit | 7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8 (patch) | |
tree | 880deaa8b0e09d4369797bf9deec676802d10442 /ext | |
parent | 895cc420d0398ff184560679b40f5f2c0af72366 (diff) | |
download | perl-7016d6ebb4afd4eb7b71b00f15b7515b5e45fee8.tar.gz |
stop regex engine reading beyond end of string
Historically the regex engine has assumed that any string passed to it
will have a trailing null char. This isn't normally an issue in perl code,
since perl strings *are* null terminated; but it could cause problems with
strings returned by XS code, or with someone calling the regex engine
directly from XS, with strend not pointing at a null char.
The engine currently relies on there being a null char in the following
ways.
First, when at the end of string, the main loop of regmatch() still reads
in the 'next' character (i.e. the character following the end of string)
even if it doesn't make any use of it. This precludes using memory mapped
files as strings for example, since the read off the end would SEGV.
Second, the matching algorithm often required the trailing character to be
\0 to work correctly: the test for 'EOF' was "if next char is null *and*
locinput >= PL_regeol, then stop". So a random non-null trailing char
could cause an overshoot.
Thirdly, some match ops require the trailing char to be null to operate
correctly; for example, \b applied at the end of the string only happens
to work because the trailing char (\0) happens to match \W.
Also, some utf8 ops will try to extract the code point at the end, which
can result in multiple bytes past the end of string being read, and
possible problems if they don't correspond to well-formed utf8.
The main fix is in S_regmatch, where the 'read next char' code has been
updated to set it to a special value, NEXTCHR_EOS instead, if we would be
reading past the end of the string.
Lots of other random bits in the regex engine needed to be fixed up too.
To track these down, I temporarily hacked regexec_flags() to make a copy
of the string but without trailing \0, then ran all the t/re/*.t tests
under valgrind to flush out all buffer overruns. So I think I've removed
most of the bad code, but by no means all of it. The code within the
various functions in regexec.c is far too complex to be able to visually
audit the code with any confidence.
Diffstat (limited to 'ext')
-rw-r--r-- | ext/XS-APItest/APItest.pm | 2 | ||||
-rw-r--r-- | ext/XS-APItest/APItest.xs | 24 | ||||
-rw-r--r-- | ext/XS-APItest/t/callregexec.t | 66 |
3 files changed, 91 insertions, 1 deletions
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm index 749af95b8d..f33b80b656 100644 --- a/ext/XS-APItest/APItest.pm +++ b/ext/XS-APItest/APItest.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.43'; +our $VERSION = '0.44'; require XSLoader; diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 08694e6c3c..357b033f7f 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -3407,6 +3407,30 @@ CODE: OUTPUT: RETVAL + # provide access to CALLREGEXEC, except replace pointers within the + # string with offsets from the start of the string + +I32 +callregexec(SV *prog, STRLEN stringarg, STRLEN strend, I32 minend, SV *sv, U32 nosave) +CODE: + { + STRLEN len; + char *strbeg; + if (SvROK(prog)) + prog = SvRV(prog); + strbeg = SvPV_force(sv, len); + RETVAL = CALLREGEXEC((REGEXP *)prog, + strbeg + stringarg, + strbeg + strend, + strbeg, + minend, + sv, + NULL, /* data */ + nosave); + } +OUTPUT: + RETVAL + MODULE = XS::APItest PACKAGE = XS::APItest::AUTOLOADtest diff --git a/ext/XS-APItest/t/callregexec.t b/ext/XS-APItest/t/callregexec.t new file mode 100644 index 0000000000..3111390686 --- /dev/null +++ b/ext/XS-APItest/t/callregexec.t @@ -0,0 +1,66 @@ +#!perl + +# test CALLREGEXEC() +# (currently it just checks that it handles non-\0 terminated strings; +# full tests haven't been added yet) + +use warnings; +use strict; + +use XS::APItest; +*callregexec = *XS::APItest::callregexec; + +use Test::More tests => 50; + +# Test that the regex engine can handle strings without terminating \0 +# XXX This is by no means comprehensive; it doesn't test all ops, nor all +# code paths within those ops (especially not utf8). + + +# this sub takes a string that has an extraneous char at the end. +# First see if the string (less the last char) matches the regex; +# then see if that string (including the last char) matches when +# calling callregexec(), but with the length arg set to 1 char less than +# the length of the string. +# In theory the result should be the same for both matches, since +# they should both not 'see' the final char. + +sub try { + my ($str, $re, $exp, $desc) = @_; + + my $str1 = substr($str, 0, -1); + ok !!$exp == !!($str1 =~ $re), "$desc str =~ qr"; + + my $bytes = do { use bytes; length $str1 }; + ok !!$exp == !!callregexec($re, 0, $bytes, 0, $str, 0), + "$desc callregexec"; +} + + +{ + try "\nx", qr/\n^/m, 0, 'MBOL'; + try "ax", qr/a$/m, 1, 'MEOL'; + try "ax", qr/a$/s, 1, 'SEOL'; + try "abx", qr/^(ab|X)./s, 0, 'SANY'; + try "abx", qr/^(ab|X)\C/, 0, 'CANY'; + try "abx", qr/^(ab|X)./, 0, 'REG_ANY'; + try "abx", qr/^ab(c|d|e|x)/, 0, 'TRIE/TRIEC'; + try "abx", qr/^abx/, 0, 'EXACT'; + try "abx", qr/^ABX/i, 0, 'EXACTF'; + try "abx", qr/^ab\b/, 1, 'BOUND'; + try "ab-", qr/^ab\B/, 0, 'NBOUND'; + try "aas", qr/a[st]/, 0, 'ANYOF'; + try "aas", qr/a[s\xDF]/i, 0, 'ANYOFV'; + try "ab1", qr/ab\d/, 0, 'DIGIT'; + try "ab\n", qr/ab[[:ascii:]]/, 0, 'POSIX'; + try "aP\x{307}", qr/^a\X/, 1, 'CLUMP 1'; + try "aP\x{307}x", qr/^a\X/, 1, 'CLUMP 2'; + try "\x{100}\r\n", qr/^\x{100}\X/, 1, 'CLUMP 3'; + try "abb", qr/^a(b)\1/, 0, 'REF'; + try "ab\n", qr/^.+\R/, 0, 'LNBREAK'; + try "ab\n", qr/^.+\v/, 0, 'VERTWS'; + try "abx", qr/^.+\V/, 1, 'NVERTWS'; + try "ab\t", qr/^.+\h/, 0, 'HORIZWS'; + try "abx", qr/^.+\H/, 1, 'NHORIZWS'; + try "abx", qr/a.*x/, 0, 'CURLY'; +} |