From 9af350af12899021537ce50c25ba98bdd7c1e5ee Mon Sep 17 00:00:00 2001 From: ph10 Date: Mon, 24 Feb 2020 15:39:56 +0000 Subject: Fix bug in UTF-16 checker returning wrong offset for missing low surrogate. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1226 6239d852-aaf2-0410-a92c-79f79f948069 --- ChangeLog | 6 ++++++ src/pcre2_valid_utf.c | 4 ++-- testdata/testinput12 | 6 ++++++ testdata/testoutput12-16 | 11 ++++++++++- testdata/testoutput12-32 | 9 +++++++++ testdata/testoutput14-16 | 2 +- 6 files changed, 34 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index ab4a5e6..9f493be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -71,6 +71,12 @@ PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for upper/lower case computations on characters whose code points are greater than 127. Documentation is not yet updated. JIT is not yet updated. +19. The function for checking UTF-16 validity was returning an incorrect offset +for the start of the error when a high surrogate was not followed by a valid +low surrogate. This caused incorrect behaviour, for example when +PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the +invalid high surrogate, such as /aa/ matching "\x{d800}aa". + Version 10.34 21-November-2019 ------------------------------ diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c index 96e8bff..e47ea78 100644 --- a/src/pcre2_valid_utf.c +++ b/src/pcre2_valid_utf.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2017 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -347,7 +347,7 @@ for (p = string; length > 0; p++) length--; if ((*p & 0xfc00) != 0xdc00) { - *erroroffset = p - string; + *erroroffset = p - string - 1; return PCRE2_ERROR_UTF16_ERR2; } } diff --git a/testdata/testinput12 b/testdata/testinput12 index 32e97b5..beaf643 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -444,6 +444,12 @@ \= Expect no match A\x{d800}B A\x{110000}B + +/aa/utf,ucp,match_invalid_utf,global + aa\x{d800}aa + +/aa/utf,ucp,match_invalid_utf,global + \x{d800}aa # ---------------------------------------------------- diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index b944311..6e545c3 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -533,7 +533,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2 XX\x{110000} ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 XX\x{d800}\x{1234} -Failed: error -25: UTF-16 error: invalid low surrogate at offset 3 +Failed: error -25: UTF-16 error: invalid low surrogate at offset 2 \= Expect no match XX\x{d800}\=offset=3 No match @@ -1576,6 +1576,15 @@ No match No match A\x{110000}B ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 + +/aa/utf,ucp,match_invalid_utf,global + aa\x{d800}aa + 0: aa + 0: aa + +/aa/utf,ucp,match_invalid_utf,global + \x{d800}aa + 0: aa # ---------------------------------------------------- diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 74ccac8..1a0783a 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1574,6 +1574,15 @@ No match No match A\x{110000}B No match + +/aa/utf,ucp,match_invalid_utf,global + aa\x{d800}aa + 0: aa + 0: aa + +/aa/utf,ucp,match_invalid_utf,global + \x{d800}aa + 0: aa # ---------------------------------------------------- diff --git a/testdata/testoutput14-16 b/testdata/testoutput14-16 index 2d58f1c..61541f6 100644 --- a/testdata/testoutput14-16 +++ b/testdata/testoutput14-16 @@ -33,7 +33,7 @@ Failed: error -26: UTF-16 error: isolated low surrogate at offset 2 XX\x{110000} ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16 XX\x{d800}\x{1234} -Failed: error -25: UTF-16 error: invalid low surrogate at offset 3 +Failed: error -25: UTF-16 error: invalid low surrogate at offset 2 /badutf/utf X\xdf -- cgit v1.2.1