summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-07 16:14:50 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-07 16:14:50 +0000
commit: 48f3c76588362811dfc43674ba54066f2f31b045 (patch)
tree: c87bbd2572ac3c6d6e5fa40d4699a39fca61d665
parent: d04625dfed2986d95e3d39df9ed653d5d6de7ab3 (diff)
download: pcre-48f3c76588362811dfc43674ba54066f2f31b045.tar.gz
Add PCRE_ERROR_SHORTUTF8 to PCRE_PARTIAL_HARD processing.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@569 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 3
-rw-r--r-- doc/pcreapi.3 35
-rw-r--r-- doc/pcrepartial.3 8
-rw-r--r-- pcre.h.in 1
-rw-r--r-- pcre_dfa_exec.c 8
-rw-r--r-- pcre_exec.c 8
-rw-r--r-- pcre_valid_utf8.c 17
-rw-r--r-- testdata/testinput5 3
-rw-r--r-- testdata/testinput8 3
-rw-r--r-- testdata/testoutput5 8
-rw-r--r-- testdata/testoutput8 6
11 files changed, 81 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog
index ace825b..29f0dba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -92,6 +92,9 @@ Version 8.11 10-Oct-2010
15. In both pcre_exec() and pcre_dfa_exec() the code for checking that the
starting offset points to the beginning of a UTF-8 character was
unnecessarily clumsy. I tidied it up.
+
+16. Added PCRE_ERROR_SHORTUTF8 to make it possible to distinguish between a
+    bad UTF-8 sequence and one that is incomplete.
Version 8.10 25-Jun-2010
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 025e41b..a158ac0 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -435,12 +435,16 @@ If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
error message. This is a static string that is part of the library. You must
-not try to free it. The byte offset from the start of the pattern to the
-character that was being processed when the error was discovered is placed in
-the variable pointed to by \fIerroffset\fP, which must not be NULL. If it is,
-an immediate error is given. Some errors are not detected until checks are
-carried out when the whole pattern has been scanned; in this case the offset is
-set to the end of the pattern.
+not try to free it. The offset from the start of the pattern to the byte that
+was being processed when the error was discovered is placed in the variable
+pointed to by \fIerroffset\fP, which must not be NULL. If it is, an immediate
+error is given. Some errors are not detected until checks are carried out when
+the whole pattern has been scanned; in this case the offset is set to the end
+of the pattern.
+.P
+Note that the offset is in bytes, not characters, even in UTF-8 mode. It may
+point into the middle of a UTF-8 character (for example, when
+PCRE_ERROR_BADUTF8 is returned for an invalid UTF-8 string).
.P
If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
\fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
@@ -1515,9 +1519,11 @@ in the main
\fBpcre\fP
.\"
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
-the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains a value that does
-not point to the start of a UTF-8 character (or to the end of the subject),
-PCRE_ERROR_BADUTF8_OFFSET is returned.
+the error PCRE_ERROR_BADUTF8 or, if PCRE_PARTIAL_HARD is set and the problem is
+a truncated UTF-8 character at the end of the subject, PCRE_ERROR_SHORTUTF8. If
+\fIstartoffset\fP contains a value that does not point to the start of a UTF-8
+character (or to the end of the subject), PCRE_ERROR_BADUTF8_OFFSET is
+returned.
.P
If you already know that your subject is valid, and you want to skip these
checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
@@ -1756,11 +1762,14 @@ documentation for details.
PCRE_ERROR_BADUTF8 (-10)
.sp
A string that contains an invalid UTF-8 byte sequence was passed as a subject.
+However, if PCRE_PARTIAL_HARD is set and the problem is a truncated UTF-8
+character at the end of the subject, PCRE_ERROR_SHORTUTF8 is used instead.
.sp
PCRE_ERROR_BADUTF8_OFFSET (-11)
.sp
The UTF-8 byte sequence that was passed as a subject was valid, but the value
-of \fIstartoffset\fP did not point to the beginning of a UTF-8 character.
+of \fIstartoffset\fP did not point to the beginning of a UTF-8 character or the
+end of the subject.
.sp
PCRE_ERROR_PARTIAL (-12)
.sp
@@ -1800,6 +1809,12 @@ An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
.sp
The value of \fIstartoffset\fP was negative or greater than the length of the
subject, that is, the value in \fIlength\fP.
+.sp
+  PCRE_ERROR_SHORTUTF8 (-25)
+.sp
+The subject string ended with an incomplete (truncated) UTF-8 character, and
+the PCRE_PARTIAL_HARD option was set. Without this option, PCRE_ERROR_BADUTF8
+is returned in this situation.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.
diff --git a/doc/pcrepartial.3 b/doc/pcrepartial.3
index eeb7d85..116b56d 100644
--- a/doc/pcrepartial.3
+++ b/doc/pcrepartial.3
@@ -113,6 +113,12 @@ an earlier partial match over a later complete match. For this reason, the
assumption is made that the end of the supplied subject string may not be the
true end of the available data, and so, if \ez, \eZ, \eb, \eB, or $ are
encountered at the end of the subject, the result is PCRE_ERROR_PARTIAL.
+.P
+Setting PCRE_PARTIAL_HARD also affects the way \fBpcre_exec()\fP checks UTF-8
+subject strings for validity. Normally, an invalid UTF-8 sequence causes the
+error PCRE_ERROR_BADUTF8. However, in the special case of a truncated UTF-8
+character at the end of the subject, PCRE_ERROR_SHORTUTF8 is returned when
+PCRE_PARTIAL_HARD is set.
.
.
.SS "Comparing hard and soft partial matching"
@@ -406,6 +412,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 22 October 2010
+Last updated: 07 November 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/pcre.h.in b/pcre.h.in
index deecc7d..fda971c 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -162,6 +162,7 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
#define PCRE_ERROR_BADOFFSET (-24)
+#define PCRE_ERROR_SHORTUTF8 (-25)
/* Request types for pcre_fullinfo() */
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index d7fbbbf..e4c635b 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2963,11 +2963,13 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
+ return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
+   PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((USPTR)subject)[start_offset] & 0xc0;
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
diff --git a/pcre_exec.c b/pcre_exec.c
index c4618a6..08443a8 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -5801,11 +5801,13 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
+ int tb;
+ if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
+ return (tb == length && md->partial > 1)?
+   PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
- int tb = ((USPTR)subject)[start_offset] & 0xc0;
+ tb = ((USPTR)subject)[start_offset] & 0xc0;
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
diff --git a/pcre_valid_utf8.c b/pcre_valid_utf8.c
index d381ad6..971d92d 100644
--- a/pcre_valid_utf8.c
+++ b/pcre_valid_utf8.c
@@ -72,6 +72,20 @@ Arguments:
Returns: < 0 if the string is a valid UTF-8 string
>= 0 otherwise; the value is the offset of the bad byte
+
+Bad bytes can be:
+
+  . An isolated byte whose most significant bits are 0x80, because this
+    can only correctly appear within a UTF-8 character;
+
+  . A byte whose most significant bits are 0xc0, but whose other bits indicate
+    that there are more than 3 additional bytes (i.e. an RFC 2279 starting
+    byte, which is no longer valid under RFC 3629);
+
+ .
+
+The returned offset may also be equal to the length of the string; this means
+that one or more bytes are missing from the final UTF-8 character.
*/
int
@@ -93,7 +107,8 @@ for (p = string; length-- > 0; p++)
if (c < 128) continue;
if (c < 0xc0) return p - string;
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
- if (length < ab || ab > 3) return p - string;
+ if (ab > 3) return p - string; /* Too many for RFC 3629 */
+ if (length < ab) return p + 1 + length - string; /* Missing bytes */
length -= ab;
/* Check top bits in the second byte */
diff --git a/testdata/testinput5 b/testdata/testinput5
index aca97d1..ba11836 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -205,6 +205,9 @@ correctly, but that messes up comparisons). --/
Ã
ÃÃÃ
ÃÃÃ\?
+    \xe1\x88
+    \P\xe1\x88
+    \P\P\xe1\x88
/anything/8
\xc0\x80
diff --git a/testdata/testinput8 b/testdata/testinput8
index bae2f57..55d2fd3 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -63,6 +63,9 @@
Ã
ÃÃÃ
ÃÃÃ\?
+    \xe1\x88
+    \P\xe1\x88
+    \P\P\xe1\x88
/a.b/8
acb
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index fc2409a..9d2c439 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -797,7 +797,7 @@ No need char
Failed: invalid UTF-8 string at offset 2
/Ã/8
-Failed: invalid UTF-8 string at offset 0
+Failed: invalid UTF-8 string at offset 1
/ÃÃÃxxx/8
Failed: invalid UTF-8 string at offset 1
@@ -823,6 +823,12 @@ Error -10
Error -10
ÃÃÃ\?
No match
+    \xe1\x88
+Error -10
+    \P\xe1\x88
+Error -10
+    \P\P\xe1\x88
+Error -25
/anything/8
\xc0\x80
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index d663025..f4f5343 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -105,6 +105,12 @@ Error -10
Error -10
ÃÃÃ\?
No match
+    \xe1\x88
+Error -10
+    \P\xe1\x88
+Error -10
+    \P\P\xe1\x88
+Error -25
/a.b/8
acb