From 23e25f3319db021298310fb97cf537bcef4095ad Mon Sep 17 00:00:00 2001
From: "Christoph M. Becker" <cmbecker69@gmx.de>
Date: Fri, 5 Jun 2015 14:40:03 +0200
Subject: Fixed Bug #53823 (preg_replace: * qualifier on unicode replace
 garbles the string)

When advancing after empty matches, php_pcre_match_impl() as well as
php_pcre_replace_impl() always have to advance to the next code point when the
u modifier is given, instead of to the next byte.
---
 ext/pcre/php_pcre.c          | 31 +++++++++++++++++++++++++----
 ext/pcre/tests/bug53823.phpt | 13 ++++++++++++
 ext/pcre/tests/bug66121.phpt | 47 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 ext/pcre/tests/bug53823.phpt
 create mode 100644 ext/pcre/tests/bug66121.phpt

diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
index e7274b841d..7cc16ca6e6 100644
--- a/ext/pcre/php_pcre.c
+++ b/ext/pcre/php_pcre.c
@@ -225,6 +225,25 @@ static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_D
 }
 /* }}} */
 
+/* {{{ static calculate_unit_length */
+/* Returns the byte length of the character beginning at start: always 1 unless PCRE_UTF8 is set in pce->compile_options, in which case the first byte plus every directly following UTF-8 continuation byte is counted. Assumes valid UTF-8 for PCRE_UTF8. */
+static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
+{
+	int unit_len;
+
+	if (pce->compile_options & PCRE_UTF8) {
+		char *end = start;
+
+		/* step past the first byte, then skip continuation bytes (bit pattern 10xxxxxx); the scan has no length bound and ends only at the first non-continuation byte - presumably the terminating NUL at the latest, confirm against callers */
+		while ((*++end & 0xC0) == 0x80);
+		unit_len = end - start;
+	} else {
+		unit_len = 1;
+	}
+	return unit_len;
+}
+/* }}} */
+
 /* {{{ pcre_get_compiled_regex_cache
  */
 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
@@ -758,8 +777,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subjec
 			   the start offset, and continue. Fudge the offset values
 			   to achieve this, unless we're already at the end of the string. */
 			if (g_notempty != 0 && start_offset < subject_len) {
+				int unit_len = calculate_unit_length(pce, subject + start_offset);
+				
 				offsets[0] = start_offset;
-				offsets[1] = start_offset + 1;
+				offsets[1] = start_offset + unit_len;
 			} else
 				break;
 		} else {
@@ -1206,10 +1227,12 @@ PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int sub
 			   the start offset, and continue. Fudge the offset values
 			   to achieve this, unless we're already at the end of the string. */
 			if (g_notempty != 0 && start_offset < subject_len) {
+				int unit_len = calculate_unit_length(pce, piece);
+
 				offsets[0] = start_offset;
-				offsets[1] = start_offset + 1;
-				memcpy(&result[*result_len], piece, 1);
-				(*result_len)++;
+				offsets[1] = start_offset + unit_len;
+				memcpy(&result[*result_len], piece, unit_len);
+				*result_len += unit_len;
 			} else {
 				new_len = *result_len + subject_len - start_offset;
 				if (new_len + 1 > alloc_len) {
diff --git a/ext/pcre/tests/bug53823.phpt b/ext/pcre/tests/bug53823.phpt
new file mode 100644
index 0000000000..c1d8f999e0
--- /dev/null
+++ b/ext/pcre/tests/bug53823.phpt
@@ -0,0 +1,13 @@
+--TEST--
+Bug #53823 - preg_replace: * qualifier on unicode replace garbles the string
+--FILE--
+<?php
+var_dump(preg_replace('/[^\pL\pM]*/iu', '', 'áéíóú')); // valid UTF-8: expected to come back unchanged
+// invalid UTF-8 (stray 0xFC byte before / after the text): expected result is NULL
+var_dump(preg_replace('/[^\pL\pM]*/iu', '', "\xFCáéíóú"));
+var_dump(preg_replace('/[^\pL\pM]*/iu', '', "áéíóú\xFC"));
+?>
+--EXPECT--
+string(10) "áéíóú"
+NULL
+NULL
diff --git a/ext/pcre/tests/bug66121.phpt b/ext/pcre/tests/bug66121.phpt
new file mode 100644
index 0000000000..89c2f2d5d8
--- /dev/null
+++ b/ext/pcre/tests/bug66121.phpt
@@ -0,0 +1,47 @@
+--TEST--
+Bug #66121 - UTF-8 lookbehinds match bytes instead of characters
+--FILE--
+<?php
+// Sinhala characters (three bytes each in UTF-8)
+var_dump(preg_replace('/(?<!ක)/u', '*', 'ක'));
+var_dump(preg_replace('/(?<!ක)/u', '*', 'ම'));
+// English characters (single-byte) for comparison
+var_dump(preg_replace('/(?<!k)/u', '*', 'k'));
+var_dump(preg_replace('/(?<!k)/u', '*', 'm'));
+// Sinhala characters: the second empty match must be reported at byte offset 3, after the whole character
+preg_match_all('/(?<!ක)/u', 'ම', $matches, PREG_OFFSET_CAPTURE);
+var_dump($matches);
+// invalid UTF-8: stray 0xFC byte before and after the character, for both preg_replace and preg_match_all
+var_dump(preg_replace('/(?<!ක)/u', '*', "\xFCක"));
+var_dump(preg_replace('/(?<!ක)/u', '*', "ක\xFC"));
+var_dump(preg_match_all('/(?<!ක)/u', "\xFCම", $matches, PREG_OFFSET_CAPTURE));
+var_dump(preg_match_all('/(?<!ක)/u', "ම\xFC", $matches, PREG_OFFSET_CAPTURE));
+?>
+--EXPECT--
+string(4) "*ක"
+string(5) "*ම*"
+string(2) "*k"
+string(3) "*m*"
+array(1) {
+  [0]=>
+  array(2) {
+    [0]=>
+    array(2) {
+      [0]=>
+      string(0) ""
+      [1]=>
+      int(0)
+    }
+    [1]=>
+    array(2) {
+      [0]=>
+      string(0) ""
+      [1]=>
+      int(3)
+    }
+  }
+}
+NULL
+NULL
+bool(false)
+bool(false)
-- 
cgit v1.2.1


From 1cbcbcbc219908e32d96d4d1fefbbb9a0ee7a3cf Mon Sep 17 00:00:00 2001
From: "Christoph M. Becker" <cmb@php.net>
Date: Tue, 23 Jun 2015 19:32:18 +0200
Subject: updated NEWS

---
 NEWS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS b/NEWS
index da06e7fcfd..2ac1aff9f5 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,8 @@
   . Fixed bug #61221 (imagegammacorrect function loses alpha channel). (cmb)
   
 - PCRE:
+  . Fixed bug #53823 (preg_replace: * qualifier on unicode replace garbles the
+    string). (cmb)
   . Fixed bug #69864 (Segfault in preg_replace_callback) (cmb, ab)
 
 - PDO_pgsql:
-- 
cgit v1.2.1