diff options
author | Stanislav Malyshev <stas@php.net> | 2008-07-07 22:51:04 +0000 |
---|---|---|
committer | Stanislav Malyshev <stas@php.net> | 2008-07-07 22:51:04 +0000 |
commit | 0d16b1516b6b9ef0c2696bc19069e4cda5aee0ea (patch) | |
tree | df4e6a46dea0afafdbc912919b32c2806841a4eb /ext/intl/grapheme | |
parent | 3bab7c18ac205863af3df740144be23c18cf7a72 (diff) | |
download | php-git-0d16b1516b6b9ef0c2696bc19069e4cda5aee0ea.tar.gz |
Merge intl extension into core
Diffstat (limited to 'ext/intl/grapheme')
-rwxr-xr-x | ext/intl/grapheme/grapheme.h | 37 | ||||
-rwxr-xr-x | ext/intl/grapheme/grapheme_string.c | 913 | ||||
-rwxr-xr-x | ext/intl/grapheme/grapheme_util.c | 619 | ||||
-rwxr-xr-x | ext/intl/grapheme/grapheme_util.h | 59 |
4 files changed, 1628 insertions, 0 deletions
diff --git a/ext/intl/grapheme/grapheme.h b/ext/intl/grapheme/grapheme.h new file mode 100755 index 0000000000..c0e697ac1e --- /dev/null +++ b/ext/intl/grapheme/grapheme.h @@ -0,0 +1,37 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Ed Batutis <ed@batutis.com> | + +----------------------------------------------------------------------+ + */ + +#ifndef GRAPHEME_GRAPHEME_H +#define GRAPHEME_GRAPHEME_H + +#include <php.h> +#include <unicode/utypes.h> +#include <unicode/ubrk.h> + +PHP_FUNCTION(grapheme_strlen); +PHP_FUNCTION(grapheme_strpos); +PHP_FUNCTION(grapheme_stripos); +PHP_FUNCTION(grapheme_strrpos); +PHP_FUNCTION(grapheme_strripos); +PHP_FUNCTION(grapheme_substr); +PHP_FUNCTION(grapheme_strstr); +PHP_FUNCTION(grapheme_stristr); +PHP_FUNCTION(grapheme_extract); + +void grapheme_register_constants( INIT_FUNC_ARGS ); +void grapheme_close_global_iterator( TSRMLS_D ); + +#endif // GRAPHEME_GRAPHEME_H diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c new file mode 100755 index 0000000000..9b4ba82a25 --- /dev/null +++ b/ext/intl/grapheme/grapheme_string.c @@ -0,0 +1,913 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Ed Batutis <ed@batutis.com> | + +----------------------------------------------------------------------+ + */ + +/* {{{ includes */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <php.h> +#include "grapheme.h" +#include "grapheme_util.h" + +#include <unicode/utypes.h> +#include <unicode/ucol.h> +#include <unicode/ustring.h> +#include <unicode/ubrk.h> + +#include "ext/standard/php_string.h" + +/* }}} */ + +#define GRAPHEME_EXTRACT_TYPE_COUNT 0 +#define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1 +#define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2 +#define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT +#define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS + + +/* {{{ grapheme_register_constants + * Register API constants + */ +void grapheme_register_constants( INIT_FUNC_ARGS ) +{ + REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT); +} +/* }}} */ + +/* {{{ proto int grapheme_strlen(string str) + Get number of graphemes in a string */ +PHP_FUNCTION(grapheme_strlen) +{ + unsigned char* string; + int string_len; + UChar* ustring = NULL; + int ustring_len = 0; + int ret_len; + UErrorCode status; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + ret_len = grapheme_ascii_check(string, string_len); + + if ( ret_len >= 0 ) + RETURN_LONG(ret_len); + + /* convert the string to UTF-16. */ + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( ustring ); + RETURN_NULL(); + } + + ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC ); + + efree( ustring ); + + if (ret_len >= 0) { + RETVAL_LONG(ret_len); + } else { + RETVAL_FALSE; + } +} +/* }}} */ + +/* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ]) + Find position of first occurrence of a string within another */ +PHP_FUNCTION(grapheme_strpos) +{ + unsigned char *haystack, *needle; + int haystack_len, needle_len; + unsigned char *found; + long loffset = 0; + int32_t offset = 0; + int ret_pos, uchar_pos; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( OUTSIDE_STRING(loffset, haystack_len) ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + offset = (int32_t) loffset; + + /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ + + if (needle_len == 0) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + + /* quick check to see if the string might be there + * I realize that 'offset' is 'grapheme count offset' but will work in spite of that + */ + found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len); + + /* if it isn't there the we are done */ + if (!found) { + RETURN_FALSE; + } + + /* if it is there, and if the haystack is ascii, we are all done */ + if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { + + RETURN_LONG(found - haystack); + } + + /* do utf16 part of the strpos */ + ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC ); + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos + offset); + } else { + RETURN_FALSE; + } + +} +/* }}} */ + +/* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ]) + Find position of first occurrence of a string within another, ignoring case differences */ +PHP_FUNCTION(grapheme_stripos) +{ + unsigned char *haystack, *needle, *haystack_dup, *needle_dup; + int haystack_len, needle_len; + unsigned char *found; + long loffset = 0; + int32_t offset = 0; + int ret_pos, uchar_pos; + int is_ascii; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( OUTSIDE_STRING(loffset, haystack_len) ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + offset = (int32_t) loffset; + + /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ + + if (needle_len == 0) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + + is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 ); + + if ( is_ascii ) { + needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); + php_strtolower((char *)needle_dup, needle_len); + haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); + php_strtolower((char *)haystack_dup, haystack_len); + + found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len); + + efree(haystack_dup); + efree(needle_dup); + + if (found) { + RETURN_LONG(found - haystack_dup); + } + + /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */ + if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { + RETURN_FALSE; + } + } + + /* do utf16 part of the strpos */ + ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC ); + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos + offset); + } else { + RETURN_FALSE; + } + +} +/* }}} */ + +/* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset]) + Find position of last occurrence of a string within another */ +PHP_FUNCTION(grapheme_strrpos) +{ + unsigned char *haystack, *needle; + int haystack_len, needle_len; + long loffset = 0; + int32_t offset = 0; + int32_t ret_pos; + int is_ascii; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( OUTSIDE_STRING(loffset, haystack_len) ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + offset = (int32_t) loffset; + + /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ + + if (needle_len == 0) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; + + if ( is_ascii ) { + + ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset); + + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos); + } + + /* if the needle was ascii too, we are done */ + + if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { + RETURN_FALSE; + } + + /* else we need to continue via utf16 */ + } + + ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC); + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos); + } else { + RETURN_FALSE; + } + + +} +/* }}} */ + +/* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset]) + Find position of last occurrence of a string within another, ignoring case */ +PHP_FUNCTION(grapheme_strripos) +{ + unsigned char *haystack, *needle; + int haystack_len, needle_len; + long loffset = 0; + int32_t offset = 0; + int32_t ret_pos; + int is_ascii; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( OUTSIDE_STRING(loffset, haystack_len) ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + offset = (int32_t) loffset; + + /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ + + if (needle_len == 0) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0; + + if ( is_ascii ) { + unsigned char *needle_dup, *haystack_dup; + + needle_dup = (unsigned char *)estrndup((char *)needle, needle_len); + php_strtolower((char *)needle_dup, needle_len); + haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len); + php_strtolower((char *)haystack_dup, haystack_len); + + ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset); + + efree(haystack_dup); + efree(needle_dup); + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos); + } + + /* if the needle was ascii too, we are done */ + + if ( grapheme_ascii_check(needle, needle_len) >= 0 ) { + RETURN_FALSE; + } + + /* else we need to continue via utf16 */ + } + + ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC); + + if ( ret_pos >= 0 ) { + RETURN_LONG(ret_pos); + } else { + RETURN_FALSE; + } + + +} +/* }}} */ + +/* {{{ proto string grapheme_substr(string str, int start [, int length]) + Returns part of a string */ +PHP_FUNCTION(grapheme_substr) +{ + unsigned char *str, *sub_str; + UChar *ustr; + int str_len, sub_str_len, ustr_len; + long lstart = 0, length = 0; + int32_t start = 0; + int iter_val; + UErrorCode status; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi = NULL; + int sub_str_start_pos, sub_str_end_pos; + int32_t (*iter_func)(UBreakIterator *); + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_substr: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( OUTSIDE_STRING(lstart, str_len) ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + start = (int32_t) lstart; + + /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */ + + if ( grapheme_ascii_check(str, str_len) >= 0 ) { + grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len); + + if ( NULL == sub_str ) { + RETURN_FALSE; + } + + RETURN_STRINGL(((char *)sub_str), sub_str_len, 1); + } + + ustr = NULL; + ustr_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( ustr ); + RETURN_FALSE; + } + + bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC ); + + if( U_FAILURE(status) ) { + RETURN_FALSE; + } + + ubrk_setText(bi, ustr, ustr_len, &status); + + if ( start < 0 ) { + iter_func = ubrk_previous; + ubrk_last(bi); + iter_val = 1; + } + else { + iter_func = ubrk_next; + iter_val = -1; + } + + sub_str_start_pos = 0; + + while ( start ) { + sub_str_start_pos = iter_func(bi); + + if ( UBRK_DONE == sub_str_start_pos ) { + break; + } + + start += iter_val; + } + + if ( 0 != start || sub_str_start_pos >= ustr_len ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC ); + + efree(ustr); + ubrk_close(bi); + RETURN_FALSE; + } + + if (ZEND_NUM_ARGS() <= 2) { + + /* no length supplied, return the rest of the string */ + + sub_str = NULL; + sub_str_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status); + + efree( ustr ); + ubrk_close( bi ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC ); + + efree( sub_str ); + + RETURN_FALSE; + } + + /* return the allocated string, not a duplicate */ + RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); + } + + /* find the end point of the string to return */ + + if ( length < 0 ) { + iter_func = ubrk_previous; + ubrk_last(bi); + iter_val = 1; + } + else { + iter_func = ubrk_next; + iter_val = -1; + } + + sub_str_end_pos = 0; + + while ( length ) { + sub_str_end_pos = iter_func(bi); + + if ( UBRK_DONE == sub_str_end_pos ) { + break; + } + + length += iter_val; + } + + if ( UBRK_DONE == sub_str_end_pos ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC ); + + efree(ustr); + ubrk_close(bi); + RETURN_FALSE; + } + + sub_str = NULL; + status = U_ZERO_ERROR; + intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status); + + efree( ustr ); + ubrk_close( bi ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 1 TSRMLS_CC ); + + if ( NULL != sub_str ) + efree( sub_str ); + + RETURN_FALSE; + } + + /* return the allocated string, not a duplicate */ + RETURN_STRINGL(((char *)sub_str), sub_str_len, 0); + +} +/* }}} */ + +/* {{{ strstr_common_handler */ +static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case) +{ + unsigned char *haystack, *needle, *found; + int haystack_len, needle_len; + int ret_pos, uchar_pos; + zend_bool part = 0; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if (needle_len == 0) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + + if ( !f_ignore_case ) { + + /* ASCII optimization: quick check to see if the string might be there + * I realize that 'offset' is 'grapheme count offset' but will work in spite of that + */ + found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len); + + /* if it isn't there the we are done */ + if ( !found ) { + RETURN_FALSE; + } + + /* if it is there, and if the haystack is ascii, we are all done */ + if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) { + size_t found_offset = found - haystack; + + if (part) { + RETURN_STRINGL(((char *)haystack) , found_offset, 1); + } else { + RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1); + } + } + + } + + /* need to work in utf16 */ + ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC ); + + if ( ret_pos < 0 ) { + RETURN_FALSE; + } + + /* uchar_pos is the 'nth' Unicode character position of the needle */ + + ret_pos = 0; + U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos); + + if (part) { + RETURN_STRINGL(((char *)haystack), ret_pos, 1); + } + else { + RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1); + } + +} +/* }}} */ + +/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part]) + Finds first occurrence of a string within another */ +PHP_FUNCTION(grapheme_strstr) +{ + strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */); +} +/* }}} */ + +/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part]) + Finds first occurrence of a string within another */ +PHP_FUNCTION(grapheme_stristr) +{ + strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */); +} +/* }}} */ + +/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */ +inline int32_t +grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) +{ + int pos = 0, prev_pos = 0; + int ret_pos = 0, prev_ret_pos = 0; + + while ( 1 ) { + pos = ubrk_next(bi); + + if ( UBRK_DONE == pos ) { + break; + } + + /* if we are beyond our limit, then the loop is done */ + if ( pos > csize ) { + break; + } + + /* update our pointer in the original UTF-8 buffer by as many characters + as ubrk_next iterated over */ + + prev_ret_pos = ret_pos; + U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); + + if ( prev_ret_pos == ret_pos ) { + /* something wrong - malformed utf8? */ + break; + } + + prev_pos = pos; + } + + return ret_pos; +} +/* }}} */ + +/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */ +inline int32_t +grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) +{ + int pos = 0, prev_pos = 0; + int ret_pos = 0, prev_ret_pos = 0; + + while ( 1 ) { + pos = ubrk_next(bi); + + if ( UBRK_DONE == pos ) { + break; + } + + prev_ret_pos = ret_pos; + U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); + + if ( ret_pos > bsize ) { + ret_pos = prev_ret_pos; + break; + } + + if ( prev_ret_pos == ret_pos ) { + /* something wrong - malformed utf8? */ + break; + } + + prev_pos = pos; + } + + return ret_pos; +} +/* }}} */ + +/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */ +inline int32_t +grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) +{ + int pos = 0, next_pos = 0; + int ret_pos = 0; + + while ( size ) { + next_pos = ubrk_next(bi); + + if ( UBRK_DONE == next_pos ) { + break; + } + pos = next_pos; + size--; + } + + /* pos is one past the last UChar - and represent the number of code units to + advance in the utf-8 buffer + */ + + U8_FWD_N(pstr, ret_pos, str_len, pos); + + return ret_pos; +} +/* }}} */ + +/* {{{ grapheme extract iter function pointer array */ +typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/); + +static grapheme_extract_iter grapheme_extract_iters[] = { + &grapheme_extract_count_iter, + &grapheme_extract_bytecount_iter, + &grapheme_extract_charcount_iter, +}; +/* }}} */ + +/* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]]) + Function to extract a sequence of default grapheme clusters */ +PHP_FUNCTION(grapheme_extract) +{ + unsigned char *str, *pstr; + UChar *ustr; + int str_len, ustr_len; + long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ + long lstart = 0; /* starting position in str in bytes */ + int32_t start = 0; + long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT; + UErrorCode status; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi = NULL; + int ret_pos; + zval *next = NULL; // return offset of next part of the string + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_extract: unable to parse input param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( NULL != next ) { + if ( !PZVAL_IS_REF(next) ) { + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + else { + /* initialize next */ + ZVAL_LONG(next, start); + } + } + + if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_extract: unknown extract type param", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + + if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 1 TSRMLS_CC ); + + RETURN_FALSE; + } + + /* we checked that it will fit: */ + start = (int32_t) lstart; + + pstr = str + start; + + /* just in case pstr points in the middle of a character, move forward to the start of the next char */ + if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) { + unsigned char *str_end = str + str_len; + + while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) { + pstr++; + if ( pstr >= str_end ) { + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, + "grapheme_extract: invalid input string", 0 TSRMLS_CC ); + + RETURN_FALSE; + } + } + } + + str_len -= (pstr - str); + + /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done. + (size + 1 because the size-th character might be the beginning of a grapheme cluster) + */ + + if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) { + if ( NULL != next ) { + ZVAL_LONG(next, start+size); + } + RETURN_STRINGL(((char *)pstr), size, 1); + } + + /* convert the strings to UTF-16. */ + ustr = NULL; + ustr_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + + if ( NULL != ustr ) + efree( ustr ); + + RETURN_FALSE; + } + + bi = NULL; + status = U_ZERO_ERROR; + bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); + + ubrk_setText(bi, ustr, ustr_len, &status); + + /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we + can't back up. So, we will not do anything. */ + + /* now we need to find the end of the chunk the user wants us to return */ + + ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len); + + efree(ustr); + ubrk_close(bi); + + if ( NULL != next ) { + ZVAL_LONG(next, start+ret_pos); + } + + RETURN_STRINGL(((char *)pstr), ret_pos, 1); +} + +/* }}} */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: fdm=marker + * vim: noet sw=4 ts=4 + */ + diff --git a/ext/intl/grapheme/grapheme_util.c b/ext/intl/grapheme/grapheme_util.c new file mode 100755 index 0000000000..375c695b7d --- /dev/null +++ b/ext/intl/grapheme/grapheme_util.c @@ -0,0 +1,619 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Ed Batutis <ed@batutis.com> | + +----------------------------------------------------------------------+ + */ + +/* {{{ includes */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <php.h> +#include "grapheme.h" +#include "grapheme_util.h" +#include "intl_common.h" + +#include <unicode/utypes.h> +#include <unicode/ucol.h> +#include <unicode/ustring.h> +#include <unicode/ubrk.h> + +#include "ext/standard/php_string.h" + +ZEND_EXTERN_MODULE_GLOBALS( intl ) + +/* }}} */ + +/* {{{ grapheme_close_global_iterator - clean up */ +void +grapheme_close_global_iterator( TSRMLS_D ) +{ + UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator ); + + if ( NULL != global_break_iterator ) { + ubrk_close(global_break_iterator); + } +} +/* }}} */ + +/* {{{ grapheme_intl_case_fold: convert string to lowercase */ +void +grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus ) +{ + UChar *dest; + int32_t dest_len, size_required; + + /* allocate a destination string that is a bit larger than the src, hoping that is enough */ + dest_len = (*str_len) + ( *str_len / 10 ); + dest = (UChar*) eumalloc(dest_len); + + *pstatus = U_ZERO_ERROR; + size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus); + + dest_len = size_required; + + if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) { + + dest = (UChar*) eurealloc(dest, dest_len); + + *pstatus = U_ZERO_ERROR; + size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus); + } + + if ( U_FAILURE(*pstatus) ) { + return; + } + + if ( NULL != ptr_to_free) { + efree(*ptr_to_free); + *ptr_to_free = dest; + } + + *str = dest; + *str_len = dest_len; + + return; +} +/* }}} */ + +/* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */ +void +grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len) +{ + *sub_str = NULL; + + if (argc > 2) { + if ((l < 0 && -l > str_len)) { + return; + } else if (l > str_len) { + l = str_len; + } + } else { + l = str_len; + } + + if (f > str_len || (f < 0 && -f > str_len)) { + return; + } + + if (l < 0 && (l + str_len - f) < 0) { + return; + } + + /* if "from" position is negative, count start position from the end + * of the string + */ + if (f < 0) { + f = str_len + f; + if (f < 0) { + f = 0; + } + } + + + /* if "length" position is negative, set it to the length + * needed to stop that many chars from the end of the string + */ + if (l < 0) { + l = (str_len - f) + l; + if (l < 0) { + l = 0; + } + } + + if (f >= str_len) { + return; + } + + if ((f + l) > str_len) { + l = str_len - f; + } + + *sub_str = str + f; + *sub_str_len = l; + + return; +} +/* }}} */ + +/* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */ +int +grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC) +{ + UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle; + int32_t uhaystack_len, uneedle_len; + UErrorCode status; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi = NULL; + int ret_pos, pos; + + /* convert the strings to UTF-16. */ + uhaystack = NULL; + uhaystack_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( uhaystack ); + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status ); + } + + /* get a pointer to the haystack taking into account the offset */ + bi = NULL; + status = U_ZERO_ERROR; + bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); + + puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset); + + if ( NULL == puhaystack ) { + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + efree( uhaystack ); + ubrk_close (bi); + return -1; + } + + uneedle = NULL; + uneedle_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( uhaystack ); + efree( uneedle ); + ubrk_close (bi); + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status ); + } + + ret_pos = -1; /* -1 represents 'not found' */ + + /* back up until there's needle_len characters to compare */ + + uhaystack_end = uhaystack + uhaystack_len; + pos = ubrk_last(bi); + puhaystack = uhaystack + pos; + + while ( uhaystack_end - puhaystack < uneedle_len ) { + + pos = ubrk_previous(bi); + + if ( UBRK_DONE == pos ) { + break; + } + + puhaystack = uhaystack + pos; + } + + /* is there enough haystack left to hold the needle? */ + if ( ( uhaystack_end - puhaystack ) < uneedle_len ) { + /* not enough, not found */ + goto exit; + } + + while ( UBRK_DONE != pos ) { + + if (!u_memcmp(uneedle, puhaystack, uneedle_len)) { /* needle_len - 1 in zend memnstr? */ + + /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */ + + if ( ubrk_isBoundary(bi, pos + uneedle_len) ) { + + /* found it, get grapheme count offset */ + ret_pos = grapheme_count_graphemes(bi, uhaystack, pos); + break; + } + + /* set position back */ + ubrk_isBoundary(bi, pos); + } + + pos = ubrk_previous(bi); + puhaystack = uhaystack + pos; + } + +exit: + efree( uhaystack ); + efree( uneedle ); + ubrk_close (bi); + + return ret_pos; +} + +/* }}} */ + +/* {{{ grapheme_strpos_utf16 - strrpos using utf16*/ +int +grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC) +{ + UChar *uhaystack, *puhaystack, *uneedle; + int32_t uhaystack_len, uneedle_len; + int ret_pos; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi; + UErrorCode status; + + *puchar_pos = -1; + + /* convert the strings to UTF-16. */ + + uhaystack = NULL; + uhaystack_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( uhaystack ); + return -1; + } + + /* get a pointer to the haystack taking into account the offset */ + bi = NULL; + status = U_ZERO_ERROR; + bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); + + puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset); + uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack)); + + if ( NULL == puhaystack ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + + efree( uhaystack ); + ubrk_close (bi); + + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status ); + } + + uneedle = NULL; + uneedle_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 1 TSRMLS_CC ); + efree( uhaystack ); + efree( uneedle ); + ubrk_close (bi); + + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status ); + } + + ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len ); + + *puchar_pos = ubrk_current(bi); + + efree( uhaystack ); + efree( uneedle ); + ubrk_close (bi); + + return ret_pos; +} + +/* }}} */ + +/* {{{ grapheme_ascii_check: ASCII check */ +int grapheme_ascii_check(const unsigned char *day, int32_t len) +{ + int ret_len = len; + while ( len-- ) { + if ( *day++ > 0x7f ) + return -1; + } + + return ret_len; +} + +/* }}} */ + +/* {{{ grapheme_split_string: find and optionally return grapheme boundaries */ +int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC ) +{ + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UErrorCode status = U_ZERO_ERROR; + int ret_len, pos; + UBreakIterator* bi; + + bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC ); + + if( U_FAILURE(status) ) { + return -1; + } + + ubrk_setText(bi, text, text_length, &status); + + pos = 0; + + for ( ret_len = 0; pos != UBRK_DONE; ) { + + pos = ubrk_next(bi); + + if ( pos != UBRK_DONE ) { + + if ( NULL != boundary_array && ret_len < boundary_array_len ) { + boundary_array[ret_len] = pos; + } + + ret_len++; + } + } + + ubrk_close(bi); + + return ret_len; +} +/* }}} */ + +/* {{{ grapheme_count_graphemes */ +inline int32_t +grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len) +{ + int ret_len = 0; + int pos = 0; + UErrorCode status = U_ZERO_ERROR; + + ubrk_setText(bi, string, string_len, &status); + + do { + + pos = ubrk_next(bi); + + if ( UBRK_DONE != pos ) { + ret_len++; + } + + } while ( UBRK_DONE != pos ); + + return ret_len; +} +/* }}} */ + +/* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */ +inline int32_t +grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end) +{ + UChar *p = haystack; + UChar ne = needle[needle_len-1]; + UErrorCode status; + int32_t grapheme_offset; + + end -= needle_len; + + while (p <= end) { + + if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) { + + if (!u_memcmp(needle, p, needle_len - 1)) { /* needle_len - 1 works because if needle_len is 1, we've already tested the char */ + + /* does the grapheme end here? */ + + status = U_ZERO_ERROR; + ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status); + + if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) { + + /* found it, get grapheme count offset */ + grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack)); + + return grapheme_offset; + } + } + } + + if (p == NULL) { + return -1; + } + + p++; + } + + return -1; +} + +/* }}} */ + +/* {{{ grapheme_memrstr_grapheme: reverse find needle in haystack using grapheme boundaries */ +inline void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n) +{ + register unsigned char *e; + + if (n <= 0) { + return NULL; + } + + for (e = (unsigned char *)s + n - 1; e >= (unsigned char *)s; e--) { + if (*e == (unsigned char)c) { + return (void *)e; + } + } + + return NULL; +} +/* }}} */ + +/* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */ +UChar * +grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset) +{ + UErrorCode status; + int32_t pos; + int32_t (*iter_op)(UBreakIterator* bi); + int iter_incr; + + if ( NULL != bi ) { + status = U_ZERO_ERROR; + ubrk_setText (bi, uhaystack, uhaystack_len, &status); + } + + if ( 0 == offset ) { + return uhaystack; + } + + if ( offset < 0 ) { + iter_op = ubrk_previous; + ubrk_last(bi); /* one past the end */ + iter_incr = 1; + } + else { + iter_op = ubrk_next; + iter_incr = -1; + } + + pos = 0; + + while ( pos != UBRK_DONE && offset != 0 ) { + + pos = iter_op(bi); + + if ( UBRK_DONE != pos ) { + offset += iter_incr; + } + } + + if ( offset != 0 ) { + return NULL; + } + + return uhaystack + pos; +} +/* }}} */ + +/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */ + int32_t +grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset) +{ + unsigned char *p, *e; + + if (offset >= 0) { + p = haystack + offset; + e = haystack + haystack_len - needle_len; + } else { + p = haystack; + if (needle_len > -offset) { + e = haystack + haystack_len - needle_len; + } else { + e = haystack + haystack_len + offset; + } + } + + if (needle_len == 1) { + /* Single character search can shortcut memcmps */ + while (e >= p) { + if (*e == *needle) { + return (e - p + (offset > 0 ? offset : 0)); + } + e--; + } + return -1; + } + + while (e >= p) { + if (memcmp(e, needle, needle_len) == 0) { + return (e - p + (offset > 0 ? offset : 0)); + } + e--; + } + + return -1; +} + +/* }}} */ + +/* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */ +UBreakIterator* +grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC ) +{ + int32_t buffer_size; + + UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator ); + + if ( NULL == global_break_iterator ) { + + global_break_iterator = ubrk_open(UBRK_CHARACTER, + NULL, /* icu default locale - locale has no effect on this iterator */ + NULL, /* text not set in global iterator */ + 0, /* text length = 0 */ + status); + + INTL_G(grapheme_iterator) = global_break_iterator; + } + + buffer_size = U_BRK_SAFECLONE_BUFFERSIZE; + + return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status); +} +/* }}} */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: fdm=marker + * vim: noet sw=4 ts=4 + */ + diff --git a/ext/intl/grapheme/grapheme_util.h b/ext/intl/grapheme/grapheme_util.h new file mode 100755 index 0000000000..f8207cac52 --- /dev/null +++ b/ext/intl/grapheme/grapheme_util.h @@ -0,0 +1,59 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Ed Batutis <ed@batutis.com> | + +----------------------------------------------------------------------+ + */ + +#ifndef GRAPHEME_GRAPHEME_UTIL_H +#define GRAPHEME_GRAPHEME_UTIL_H + +#include "php_intl.h" +#include "intl_convert.h" + +/* get_break_interator: get a break iterator from the global structure */ +UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC ); + +void +grapheme_substr_ascii(char *str, int32_t str_len, int32_t f, int32_t l, int argc, char **sub_str, int *sub_str_len); + +int +grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC); + +int +grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int *puchar_pos, int f_ignore_case TSRMLS_DC); + +int grapheme_ascii_check(const unsigned char *day, int32_t len); + +int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC ); + +inline int32_t +grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len); + +inline int32_t +grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end); + +inline void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n); + +UChar * +grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset); + +int32_t +grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset); + +UBreakIterator* +grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC ); + +/* OUTSIDE_STRING: check if (possibly negative) long offset is outside the string with int32_t length */ +#define OUTSIDE_STRING(offset, max_len) ( offset < INT32_MIN || offset > INT32_MAX || (offset < 0 ? -offset > (long) max_len : offset >= (long) max_len) ) + +#endif // GRAPHEME_GRAPHEME_UTIL_H |