diff options
author | Gustavo Lopes <glopes@nebm.ist.utl.pt> | 2013-01-15 21:05:21 +0100 |
---|---|---|
committer | Gustavo Lopes <glopes@nebm.ist.utl.pt> | 2013-01-15 21:05:21 +0100 |
commit | 93e35137aaba98c0a000ed442320e3173bb0a3f2 (patch) | |
tree | 6ddcca5078aab6eaaa29c6631a25be3eaefd2d7c | |
parent | 83864b470b030a7d1bd0a1b46d3be75ce301304c (diff) | |
parent | 930ef9ddd663dcda5726b5d33c54c49a2f4f97d6 (diff) | |
download | php-git-93e35137aaba98c0a000ed442320e3173bb0a3f2.tar.gz |
Merge remote-tracking branch 'remotes/cataphract/strtr_wu94_54' into PHP-5.4
* remotes/cataphract/strtr_wu94_54:
Fixed inconsequential bug in strtr()
Remove _GNU_SOURCE, add local heap sort
The compiler can figure this out
Remove unused block
strtr() with 2nd param array - optimization
Refactoring, bugs & leaks
Optimize strtr w/ 2nd arg array
-rw-r--r-- | ext/standard/string.c | 447 |
1 files changed, 360 insertions, 87 deletions
diff --git a/ext/standard/string.c b/ext/standard/string.c index 29115fea7a..58b5483651 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -23,6 +23,7 @@ /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */ #include <stdio.h> +#include <stdint.h> #include "php.h" #include "php_rand.h" #include "php_string.h" @@ -57,6 +58,7 @@ #include "php_globals.h" #include "basic_functions.h" #include "php_smart_str.h" +#include <Zend/zend_exceptions.h> #ifdef ZTS #include "TSRM.h" #endif @@ -132,7 +134,7 @@ static char *php_bin2hex(const unsigned char *old, const size_t oldlen, size_t * size_t i, j; result = (unsigned char *) safe_emalloc(oldlen, 2 * sizeof(char), 1); - + for (i = j = 0; i < oldlen; i++) { result[j++] = hexconvtab[old[i] >> 4]; result[j++] = hexconvtab[old[i] & 15]; @@ -2772,112 +2774,383 @@ PHPAPI char *php_strtr(char *str, int len, char *str_from, char *str_to, int trl } /* }}} */ -/* {{{ php_strtr_array - */ -static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *hash) +/* {{{ Definitions for php_strtr_array */ +typedef size_t STRLEN; /* STRLEN should be unsigned */ +typedef uint16_t HASH; +typedef struct { + HASH table_mask; + STRLEN entries[1]; +} SHIFT_TAB; +typedef struct { + HASH table_mask; + int entries[1]; +} HASH_TAB; +typedef struct { + const char *s; + STRLEN l; +} STR; +typedef struct _pat_and_repl { + STR pat; + STR repl; +} PATNREPL; + +#define S(a) ((a)->s) +#define L(a) ((a)->l) + +#define SHIFT_TAB_BITS 13 +#define HASH_TAB_BITS 10 /* should be less than sizeof(HASH) * 8 */ +#define SHIFT_TAB_SIZE (1U << SHIFT_TAB_BITS) +#define HASH_TAB_SIZE (1U << HASH_TAB_BITS) + +typedef struct { + int B; /* size of suffixes */ + int Bp; /* size of prefixes */ + STRLEN m; /* minimum pattern length */ + int patnum; /* number of patterns */ + SHIFT_TAB *shift; /* table mapping hash to allowed shift */ + HASH_TAB *hash; /* table mapping hash to int (pair of pointers) */ + HASH *prefix; /* array of hashes of prefixes by pattern suffix hash order */ + PATNREPL *patterns; /* array of prefixes by pattern suffix hash order */ +} PPRES; +/* }}} */ + +/* {{{ php_strtr_hash */ +static inline HASH php_strtr_hash(const char *str, int len) { - zval **entry; - char *string_key; - uint string_key_len; - zval **trans; - zval ctmp; - ulong num_key; - int minlen = 128*1024; - int maxlen = 0, pos, len, found; - char *key; - HashPosition hpos; - smart_str result = {0}; - HashTable tmp_hash; - - zend_hash_init(&tmp_hash, zend_hash_num_elements(hash), NULL, NULL, 0); - zend_hash_internal_pointer_reset_ex(hash, &hpos); - while (zend_hash_get_current_data_ex(hash, (void **)&entry, &hpos) == SUCCESS) { - switch (zend_hash_get_current_key_ex(hash, &string_key, &string_key_len, &num_key, 0, &hpos)) { - case HASH_KEY_IS_STRING: - len = string_key_len-1; - if (len < 1) { - zend_hash_destroy(&tmp_hash); - RETURN_FALSE; - } - zend_hash_add(&tmp_hash, string_key, string_key_len, entry, sizeof(zval*), NULL); - if (len > maxlen) { - maxlen = len; - } - if (len < minlen) { - minlen = len; - } - break; + HASH res = 0; + int i; + for (i = 0; i < len; i++) { + res = res * 33 + (unsigned char)str[i]; + } + + return res; +} +/* }}} */ +/* {{{ php_strtr_populate_shift */ +static inline void php_strtr_populate_shift(PATNREPL *patterns, int patnum, int B, STRLEN m, SHIFT_TAB *shift) +{ + int i; + STRLEN j, + max_shift; + + max_shift = m - B + 1; + for (i = 0; i < SHIFT_TAB_SIZE; i++) { + shift->entries[i] = max_shift; + } + for (i = 0; i < patnum; i++) { + for (j = 0; j < m - B + 1; j++) { + HASH h = php_strtr_hash(&S(&patterns[i].pat)[j], B) & shift->table_mask; + assert((long long) m - (long long) j - B >= 0); + shift->entries[h] = MIN(shift->entries[h], m - j - B); + } + } +} +/* }}} */ +/* {{{ php_strtr_compare_hash_suffix */ +static int php_strtr_compare_hash_suffix(const void *a, const void *b, void *ctx_g) +{ + const PPRES *res = ctx_g; + const PATNREPL *pnr_a = a, + *pnr_b = b; + HASH hash_a = php_strtr_hash(&S(&pnr_a->pat)[res->m - res->B], res->B) + & res->hash->table_mask, + hash_b = php_strtr_hash(&S(&pnr_b->pat)[res->m - res->B], res->B) + & res->hash->table_mask; + /* TODO: don't recalculate the hashes all the time */ + if (hash_a > hash_b) { + return 1; + } else if (hash_a < hash_b) { + return -1; + } else { + /* longer patterns must be sorted first */ + if (L(&pnr_a->pat) > L(&pnr_b->pat)) { + return -1; + } else if (L(&pnr_a->pat) < L(&pnr_b->pat)) { + return 1; + } else { + return 0; + } + } +} +/* }}} */ +/* {{{ Sorting (no zend_qsort_r in this PHP version) */ +#define HS_LEFT(i) ((i) * 2 + 1) +#define HS_RIGHT(i) ((i) * 2 + 2) +#define HS_PARENT(i) (((i) - 1) / 2); +#define HS_OFF(data, i) ((void *)(&((data)->arr)[i])) +#define HS_CMP_CALL(data, i1, i2) \ + (php_strtr_compare_hash_suffix(HS_OFF((data), (i1)), HS_OFF((data), (i2)), (data)->res)) +struct hs_data { + PATNREPL *arr; + size_t nel; + size_t heapel; + PPRES *res; +}; +static inline void php_strtr_swap(PATNREPL *a, PATNREPL *b) +{ + PATNREPL tmp = *a; + *a = *b; + *b = tmp; +} +static inline void php_strtr_fix_heap(struct hs_data *data, size_t i) +{ + size_t li = HS_LEFT(i), + ri = HS_RIGHT(i), + largei; + if (li < data->heapel && HS_CMP_CALL(data, li, i) > 0) { + largei = li; + } else { + largei = i; + } + if (ri < data->heapel && HS_CMP_CALL(data, ri, largei) > 0) { + largei = ri; + } + if (largei != i) { + php_strtr_swap(HS_OFF(data, i), HS_OFF(data, largei)); + php_strtr_fix_heap(data, largei); + } +} +static inline void php_strtr_build_heap(struct hs_data *data) +{ + size_t i; + for (i = data->nel / 2; i > 0; i--) { + php_strtr_fix_heap(data, i - 1); + } +} +static inline void php_strtr_heapsort(PATNREPL *arr, size_t nel, PPRES *res) +{ + struct hs_data data = { arr, nel, nel, res }; + size_t i; + php_strtr_build_heap(&data); + for (i = nel; i > 1; i--) { + php_strtr_swap(arr, HS_OFF(&data, i - 1)); + data.heapel--; + php_strtr_fix_heap(&data, 0); + } +} +/* }}} */ +/* {{{ php_strtr_free_strp */ +static void php_strtr_free_strp(void *strp) +{ + STR_FREE(*(char**)strp); +} +/* }}} */ +/* {{{ php_strtr_array_prepare_repls */ +static PATNREPL *php_strtr_array_prepare_repls(int slen, HashTable *pats, zend_llist **allocs, int *outsize) +{ + PATNREPL *patterns; + HashPosition hpos; + zval **entry; + int num_pats = zend_hash_num_elements(pats), + i; + + patterns = safe_emalloc(num_pats, sizeof(*patterns), 0); + *allocs = emalloc(sizeof **allocs); + zend_llist_init(*allocs, sizeof(void*), &php_strtr_free_strp, 0); + + for (i = 0, zend_hash_internal_pointer_reset_ex(pats, &hpos); + zend_hash_get_current_data_ex(pats, (void **)&entry, &hpos) == SUCCESS; + zend_hash_move_forward_ex(pats, &hpos)) { + char *string_key; + uint string_key_len; + ulong num_key; + zval *tzv = NULL; + + switch (zend_hash_get_current_key_ex(pats, &string_key, &string_key_len, &num_key, 0, &hpos)) { + case HASH_KEY_IS_LONG: + string_key_len = 1 + zend_spprintf(&string_key, 0, "%ld", (long)num_key); + zend_llist_add_element(*allocs, &string_key); + /* break missing intentionally */ + + case HASH_KEY_IS_STRING: + string_key_len--; /* exclude final '\0' */ + if (string_key_len == 0) { /* empty string given as pattern */ + efree(patterns); + zend_llist_destroy(*allocs); + efree(*allocs); + *allocs = NULL; + return NULL; + } + if (string_key_len > slen) { /* this pattern can never match */ + continue; + } - case HASH_KEY_IS_LONG: - Z_TYPE(ctmp) = IS_LONG; - Z_LVAL(ctmp) = num_key; + if (Z_TYPE_PP(entry) != IS_STRING) { + tzv = *entry; + zval_addref_p(tzv); + SEPARATE_ZVAL(&tzv); + convert_to_string(tzv); + entry = &tzv; + zend_llist_add_element(*allocs, &Z_STRVAL_PP(entry)); + } - convert_to_string(&ctmp); - len = Z_STRLEN(ctmp); - zend_hash_add(&tmp_hash, Z_STRVAL(ctmp), len+1, entry, sizeof(zval*), NULL); - zval_dtor(&ctmp); + S(&patterns[i].pat) = string_key; + L(&patterns[i].pat) = string_key_len; + S(&patterns[i].repl) = Z_STRVAL_PP(entry); + L(&patterns[i].repl) = Z_STRLEN_PP(entry); + i++; - if (len > maxlen) { - maxlen = len; - } - if (len < minlen) { - minlen = len; - } - break; + if (tzv) { + efree(tzv); + } } - zend_hash_move_forward_ex(hash, &hpos); } - key = emalloc(maxlen+1); - pos = 0; + *outsize = i; + return patterns; +} +/* }}} */ - while (pos < slen) { - if ((pos + maxlen) > slen) { - maxlen = slen - pos; - } +/* {{{ PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp) */ +static PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp) +{ + int i; + PPRES *res = emalloc(sizeof *res); - found = 0; - memcpy(key, str+pos, maxlen); + res->m = (STRLEN)-1; + for (i = 0; i < patnum; i++) { + if (L(&patterns[i].pat) < res->m) { + res->m = L(&patterns[i].pat); + } + } + assert(res->m > 0); + res->B = B = MIN(B, res->m); + res->Bp = Bp = MIN(Bp, res->m); - for (len = maxlen; len >= minlen; len--) { - key[len] = 0; + res->shift = safe_emalloc(SHIFT_TAB_SIZE, sizeof(*res->shift->entries), sizeof(*res->shift)); + res->shift->table_mask = SHIFT_TAB_SIZE - 1; + php_strtr_populate_shift(patterns, patnum, B, res->m, res->shift); - if (zend_hash_find(&tmp_hash, key, len+1, (void**)&trans) == SUCCESS) { - char *tval; - int tlen; - zval tmp; + res->hash = safe_emalloc(HASH_TAB_SIZE, sizeof(*res->hash->entries), sizeof(*res->hash)); + res->hash->table_mask = HASH_TAB_SIZE - 1; - if (Z_TYPE_PP(trans) != IS_STRING) { - tmp = **trans; - zval_copy_ctor(&tmp); - convert_to_string(&tmp); - tval = Z_STRVAL(tmp); - tlen = Z_STRLEN(tmp); - } else { - tval = Z_STRVAL_PP(trans); - tlen = Z_STRLEN_PP(trans); - } + res->patterns = safe_emalloc(patnum, sizeof(*res->patterns), 0); + memcpy(res->patterns, patterns, sizeof(*patterns) * patnum); + php_strtr_heapsort(res->patterns, patnum, res); - smart_str_appendl(&result, tval, tlen); - pos += len; - found = 1; + res->prefix = safe_emalloc(patnum, sizeof(*res->prefix), 0); + for (i = 0; i < patnum; i++) { + res->prefix[i] = php_strtr_hash(S(&res->patterns[i].pat), Bp); + } - if (Z_TYPE_PP(trans) != IS_STRING) { - zval_dtor(&tmp); - } - break; + /* Initialize the rest of ->hash */ + for (i = 0; i < HASH_TAB_SIZE; i++) { + res->hash->entries[i] = -1; + } + { + HASH last_h = -1; /* assumes not all bits are used in res->hash */ + /* res->patterns is already ordered by hash. + * Make res->hash->entries[h] de index of the first pattern in + * res->patterns that has hash h */ + for (i = 0; i < patnum; i++) { + HASH h = php_strtr_hash(&S(&res->patterns[i].pat)[res->m - res->B], res->B) + & res->hash->table_mask; + if (h != last_h) { + res->hash->entries[h] = i; + last_h = h; } } + } + res->hash->entries[HASH_TAB_SIZE] = patnum; /* OK, we effectively allocated SIZE+1 */ + for (i = HASH_TAB_SIZE - 1; i >= 0; i--) { + if (res->hash->entries[i] == -1) { + res->hash->entries[i] = res->hash->entries[i + 1]; + } + } + + res->patnum = patnum; - if (! found) { - smart_str_appendc(&result, str[pos++]); + return res; +} +/* }}} */ +/* {{{ php_strtr_array_destroy_ppres(PPRES *d) */ +static void php_strtr_array_destroy_ppres(PPRES *d) +{ + efree(d->shift); + efree(d->hash); + efree(d->prefix); + efree(d->patterns); + efree(d); +} +/* }}} */ + +/* {{{ php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) */ +static void php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) +{ + STRLEN pos = 0, + nextwpos = 0, + lastpos = L(text) - d->m; + smart_str result = {0}; + + while (pos <= lastpos) { + HASH h = php_strtr_hash(&S(text)[pos + d->m - d->B], d->B) & d->shift->table_mask; + STRLEN shift = d->shift->entries[h]; + + if (shift > 0) { + pos += shift; + } else { + HASH h2 = h & d->hash->table_mask, + prefix_h = php_strtr_hash(&S(text)[pos], d->Bp); + + int offset_start = d->hash->entries[h2], + offset_end = d->hash->entries[h2 + 1], /* exclusive */ + i = 0; + + for (i = offset_start; i < offset_end; i++) { + PATNREPL *pnr; + if (d->prefix[i] != prefix_h) + continue; + + pnr = &d->patterns[i]; + if (L(&pnr->pat) > L(text) - pos || + memcmp(S(&pnr->pat), &S(text)[pos], L(&pnr->pat)) != 0) + continue; + + smart_str_appendl(&result, &S(text)[nextwpos], pos - nextwpos); + smart_str_appendl(&result, S(&pnr->repl), L(&pnr->repl)); + pos += L(&pnr->pat); + nextwpos = pos; + goto end_outer_loop; + } + + pos++; +end_outer_loop: ; } } - efree(key); - zend_hash_destroy(&tmp_hash); - smart_str_0(&result); - RETVAL_STRINGL(result.c, result.len, 0); + smart_str_appendl(&result, &S(text)[nextwpos], L(text) - nextwpos); + + if (result.c != NULL) { + smart_str_0(&result); + RETVAL_STRINGL(result.c, result.len, 0); + } else { + RETURN_EMPTY_STRING(); + } +} +/* }}} */ + +/* {{{ php_strtr_array */ +static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *pats) +{ + PPRES *data; + STR text; + PATNREPL *patterns; + int patterns_len; + zend_llist *allocs; + + S(&text) = str; + L(&text) = slen; + + patterns = php_strtr_array_prepare_repls(slen, pats, &allocs, &patterns_len); + if (patterns == NULL) { + RETURN_FALSE; + } + data = php_strtr_array_prepare(&text, patterns, patterns_len, 2, 2); + efree(patterns); + php_strtr_array_do_repl(&text, data, return_value); + php_strtr_array_destroy_ppres(data); + zend_llist_destroy(allocs); + efree(allocs); } /* }}} */ |