summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-24 15:39:56 +0000
committer ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-24 15:39:56 +0000
commit 9af350af12899021537ce50c25ba98bdd7c1e5ee (patch)
tree ca0449e427ce508d05415a7776b05688f49ae465
parent d1125270b0e74ff196824dfb2d3a4e9793e6d6d4 (diff)
download pcre2-9af350af12899021537ce50c25ba98bdd7c1e5ee.tar.gz
Fix bug in UTF-16 checker returning wrong offset for missing low surrogate.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1226 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r-- ChangeLog 6
-rw-r--r-- src/pcre2_valid_utf.c 4
-rw-r--r-- testdata/testinput12 6
-rw-r--r-- testdata/testoutput12-16 11
-rw-r--r-- testdata/testoutput12-32 9
-rw-r--r-- testdata/testoutput14-16 2
6 files changed, 34 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index ab4a5e6..9f493be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -71,6 +71,12 @@ PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
upper/lower case computations on characters whose code points are greater than
127. Documentation is not yet updated. JIT is not yet updated.
+19. The function for checking UTF-16 validity was returning an incorrect offset
+for the start of the error when a high surrogate was not followed by a valid
+low surrogate. This caused incorrect behaviour, for example when
+PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the
+invalid high surrogate, such as /aa/ matching "\x{d800}aa".
+
Version 10.34 21-November-2019
------------------------------
diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c
index 96e8bff..e47ea78 100644
--- a/src/pcre2_valid_utf.c
+++ b/src/pcre2_valid_utf.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2017 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -347,7 +347,7 @@ for (p = string; length > 0; p++)
length--;
if ((*p & 0xfc00) != 0xdc00)
{
- *erroroffset = p - string;
+ *erroroffset = p - string - 1;
return PCRE2_ERROR_UTF16_ERR2;
}
}
diff --git a/testdata/testinput12 b/testdata/testinput12
index 32e97b5..beaf643 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -444,6 +444,12 @@
\= Expect no match
A\x{d800}B
A\x{110000}B
+
+/aa/utf,ucp,match_invalid_utf,global
+ aa\x{d800}aa
+
+/aa/utf,ucp,match_invalid_utf,global
+ \x{d800}aa
# ----------------------------------------------------
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index b944311..6e545c3 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -533,7 +533,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
XX\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
XX\x{d800}\x{1234}
-Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
+Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
\= Expect no match
XX\x{d800}\=offset=3
No match
@@ -1576,6 +1576,15 @@ No match
No match
A\x{110000}B
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
+
+/aa/utf,ucp,match_invalid_utf,global
+ aa\x{d800}aa
+ 0: aa
+ 0: aa
+
+/aa/utf,ucp,match_invalid_utf,global
+ \x{d800}aa
+ 0: aa
# ----------------------------------------------------
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 74ccac8..1a0783a 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1574,6 +1574,15 @@ No match
No match
A\x{110000}B
No match
+
+/aa/utf,ucp,match_invalid_utf,global
+ aa\x{d800}aa
+ 0: aa
+ 0: aa
+
+/aa/utf,ucp,match_invalid_utf,global
+ \x{d800}aa
+ 0: aa
# ----------------------------------------------------
diff --git a/testdata/testoutput14-16 b/testdata/testoutput14-16
index 2d58f1c..61541f6 100644
--- a/testdata/testoutput14-16
+++ b/testdata/testoutput14-16
@@ -33,7 +33,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
XX\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
XX\x{d800}\x{1234}
-Failed: error -25: UTF-16 error: invalid low surrogate at offset 3
+Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
/badutf/utf
X\xdf