summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSATO Kentaro <kentaro@ranvis.com>2016-07-01 22:41:53 +0900
committerSATO Kentaro <kentaro@ranvis.com>2016-07-01 22:45:37 +0900
commitdf683fa3b0a66d7626d6720858f309da9e7880e5 (patch)
treee02acbc47e38c61a0b69b6525af62c15b0bfd436
parentb8487b6a7dcf102dd2015105b54279dad701b97b (diff)
downloadphp-git-df683fa3b0a66d7626d6720858f309da9e7880e5.tar.gz
Fix #68447: grapheme_extract take an extra trailing character
grapheme_extract() converts UTF-8 string in the argument to UTF-16 to iterate through graphemes, and count each UTF-16 character as one Unicode character, which is not correct for UTF-16 surrogate pairs. The patch removes the conversion and counts UTF-8 directly if needed.
-rw-r--r--ext/intl/grapheme/grapheme_string.c77
-rw-r--r--ext/intl/tests/bug68447.phpt28
2 files changed, 56 insertions, 49 deletions
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c
index 3ba9b51524..8453f83f9f 100644
--- a/ext/intl/grapheme/grapheme_string.c
+++ b/ext/intl/grapheme/grapheme_string.c
@@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
+ int break_pos, prev_break_pos;
+ int count = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
break;
}
- /* if we are beyond our limit, then the loop is done */
- if ( pos > csize ) {
- break;
- }
-
- /* update our pointer in the original UTF-8 buffer by as many characters
- as ubrk_next iterated over */
+ for ( break_pos = ret_pos; break_pos < pos; ) {
+ count++;
+ prev_break_pos = break_pos;
+ U8_FWD_1(pstr, break_pos, str_len);
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
+ if ( prev_break_pos == break_pos ) {
+ /* something wrong - malformed utf8? */
+ csize = 0;
+ break;
+ }
+ }
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ /* if we are beyond our limit, then the loop is done */
+ if ( count > csize ) {
break;
}
- prev_pos = pos;
+ ret_pos = break_pos;
}
return ret_pos;
@@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
break;
}
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
-
- if ( ret_pos > bsize ) {
- ret_pos = prev_ret_pos;
- break;
- }
-
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ if ( pos > bsize ) {
break;
}
- prev_pos = pos;
+ ret_pos = pos;
}
return ret_pos;
@@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, next_pos = 0;
+ int next_pos = 0;
int ret_pos = 0;
while ( size ) {
@@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
if ( UBRK_DONE == next_pos ) {
break;
}
- pos = next_pos;
+ ret_pos = next_pos;
size--;
}
- /* pos is one past the last UChar - and represent the number of code units to
- advance in the utf-8 buffer
- */
-
- U8_FWD_N(pstr, ret_pos, str_len, pos);
-
return ret_pos;
}
/* }}} */
@@ -811,8 +799,8 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
PHP_FUNCTION(grapheme_extract)
{
unsigned char *str, *pstr;
- UChar *ustr;
- int str_len, ustr_len;
+ UText ut = UTEXT_INITIALIZER;
+ int str_len;
long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
long lstart = 0; /* starting position in str in bytes */
int32_t start = 0;
@@ -900,11 +888,8 @@ PHP_FUNCTION(grapheme_extract)
RETURN_STRINGL(((char *)pstr), nsize, 1);
}
- /* convert the strings to UTF-16. */
- ustr = NULL;
- ustr_len = 0;
status = U_ZERO_ERROR;
- intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
+ utext_openUTF8(&ut, pstr, str_len, &status);
if ( U_FAILURE( status ) ) {
/* Set global error code. */
@@ -913,9 +898,6 @@ PHP_FUNCTION(grapheme_extract)
/* Set error messages. */
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
- if ( NULL != ustr )
- efree( ustr );
-
RETURN_FALSE;
}
@@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract)
status = U_ZERO_ERROR;
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
- ubrk_setText(bi, ustr, ustr_len, &status);
-
+ ubrk_setUText(bi, &ut, &status);
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
can't back up. So, we will not do anything. */
@@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract)
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
- if (ustr) {
- efree(ustr);
- }
+ utext_close(&ut);
ubrk_close(bi);
if ( NULL != next ) {
diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt
new file mode 100644
index 0000000000..f320276df2
--- /dev/null
+++ b/ext/intl/tests/bug68447.phpt
@@ -0,0 +1,28 @@
+--TEST--
+Bug #68447: grapheme_extract take an extra trailing character
+--SKIPIF--
+<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
+--FILE--
+<?php
+$katsushikaku = "葛󠄁飾区";
+echo grapheme_extract($katsushikaku, 1) . "\n";
+
+$haiyore = "這󠄀いよれ";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
+--EXPECT--
+葛󠄁
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い