From df683fa3b0a66d7626d6720858f309da9e7880e5 Mon Sep 17 00:00:00 2001 From: SATO Kentaro Date: Fri, 1 Jul 2016 22:41:53 +0900 Subject: Fix #68447: grapheme_extract take an extra trailing character grapheme_extract() converts UTF-8 string in the argument to UTF-16 to iterate through graphemes, and count each UTF-16 character as one Unicode character, which is not correct for UTF-16 surrogate pairs. The patch removes the conversion and counts UTF-8 directly if needed. --- ext/intl/grapheme/grapheme_string.c | 77 ++++++++++++++----------------------- ext/intl/tests/bug68447.phpt | 28 ++++++++++++++ 2 files changed, 56 insertions(+), 49 deletions(-) create mode 100644 ext/intl/tests/bug68447.phpt diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 3ba9b51524..8453f83f9f 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr) static inline int32_t grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; + int break_pos, prev_break_pos; + int count = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char break; } - /* if we are beyond our limit, then the loop is done */ - if ( pos > csize ) { - break; - } - - /* update our pointer in the original UTF-8 buffer by as many characters - as ubrk_next iterated over */ + for ( break_pos = ret_pos; break_pos < pos; ) { + count++; + prev_break_pos = break_pos; + U8_FWD_1(pstr, break_pos, str_len); - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); + if ( prev_break_pos == break_pos ) { + /* something wrong - malformed utf8? */ + csize = 0; + break; + } + } - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? */ + /* if we are beyond our limit, then the loop is done */ + if ( count > csize ) { break; } - prev_pos = pos; + ret_pos = break_pos; } return ret_pos; @@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char static inline int32_t grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char break; } - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); - - if ( ret_pos > bsize ) { - ret_pos = prev_ret_pos; - break; - } - - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? */ + if ( pos > bsize ) { break; } - prev_pos = pos; + ret_pos = pos; } return ret_pos; @@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char static inline int32_t grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) { - int pos = 0, next_pos = 0; + int next_pos = 0; int ret_pos = 0; while ( size ) { @@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst if ( UBRK_DONE == next_pos ) { break; } - pos = next_pos; + ret_pos = next_pos; size--; } - /* pos is one past the last UChar - and represent the number of code units to - advance in the utf-8 buffer - */ - - U8_FWD_N(pstr, ret_pos, str_len, pos); - return ret_pos; } /* }}} */ @@ -811,8 +799,8 @@ static grapheme_extract_iter grapheme_extract_iters[] = { PHP_FUNCTION(grapheme_extract) { unsigned char *str, *pstr; - UChar *ustr; - int str_len, ustr_len; + UText ut = UTEXT_INITIALIZER; + int str_len; long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ long lstart = 0; /* starting position in str in bytes */ int32_t start = 0; @@ -900,11 +888,8 @@ PHP_FUNCTION(grapheme_extract) RETURN_STRINGL(((char *)pstr), nsize, 1); } - /* convert the strings to UTF-16. */ - ustr = NULL; - ustr_len = 0; status = U_ZERO_ERROR; - intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status ); + utext_openUTF8(&ut, pstr, str_len, &status); if ( U_FAILURE( status ) ) { /* Set global error code. */ @@ -913,9 +898,6 @@ PHP_FUNCTION(grapheme_extract) /* Set error messages. */ intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); - if ( NULL != ustr ) - efree( ustr ); - RETURN_FALSE; } @@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract) status = U_ZERO_ERROR; bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); - ubrk_setText(bi, ustr, ustr_len, &status); - + ubrk_setUText(bi, &ut, &status); /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we can't back up. So, we will not do anything. */ @@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract) ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len); - if (ustr) { - efree(ustr); - } + utext_close(&ut); ubrk_close(bi); if ( NULL != next ) { diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt new file mode 100644 index 0000000000..f320276df2 --- /dev/null +++ b/ext/intl/tests/bug68447.phpt @@ -0,0 +1,28 @@ +--TEST-- +Bug #68447: grapheme_extract take an extra trailing character +--SKIPIF-- + +--FILE-- +