summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGustavo Lopes <glopes@nebm.ist.utl.pt>2013-01-07 03:13:11 +0100
committerGustavo Lopes <gustavo@icemobile.com>2013-01-14 12:22:41 +0100
commitccf15cf2dc92d11f92ee30c97e2d86b07f81e030 (patch)
tree0ac45c524a94341a161419f0cd86099eb0ad1dff
parent1a96fe0b3260b4b63627cf69d71a5b350ad3163f (diff)
downloadphp-git-ccf15cf2dc92d11f92ee30c97e2d86b07f81e030.tar.gz
Optimize strtr w/ 2nd arg array
Fixes bug #63893: poor efficiency of strtr() using array with keys of very different length. The implementation is basically all new, which carries some risk with it. The algorithm is described in "A Fast Algorithm For Multi-Pattern Searching" (1994) by Sun Wu and Udi Manber.
-rw-r--r--ext/standard/string.c351
1 files changed, 265 insertions, 86 deletions
diff --git a/ext/standard/string.c b/ext/standard/string.c
index 29115fea7a..dc92e8e085 100644
--- a/ext/standard/string.c
+++ b/ext/standard/string.c
@@ -22,7 +22,9 @@
/* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */
+#define _GNU_SOURCE 1
#include <stdio.h>
+#include <stdint.h>
#include "php.h"
#include "php_rand.h"
#include "php_string.h"
@@ -57,6 +59,7 @@
#include "php_globals.h"
#include "basic_functions.h"
#include "php_smart_str.h"
+#include <Zend/zend_exceptions.h>
#ifdef ZTS
#include "TSRM.h"
#endif
@@ -2772,112 +2775,288 @@ PHPAPI char *php_strtr(char *str, int len, char *str_from, char *str_to, int trl
}
/* }}} */
-/* {{{ php_strtr_array
- */
-static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *hash)
+/* {{{ Definitions for php_strtr_array */
+typedef size_t STRLEN; /* STRLEN should be unsigned */
+typedef uint16_t HASH;
+typedef struct {
+ HASH table_mask;
+ STRLEN entries[1];
+} SHIFT_TAB;
+typedef struct {
+ HASH table_mask;
+ int entries[1];
+} HASH_TAB;
+typedef struct {
+ const char *s;
+ STRLEN l;
+} STR;
+typedef struct _match_node MATCH_NODE;
+struct _match_node {
+ STRLEN pos;
+ MATCH_NODE *next;
+};
+typedef struct _pat_and_repl {
+ STR pat;
+ STR repl;
+} PATNREPL;
+
+#define S(a) ((a)->s)
+#define L(a) ((a)->l)
+
+#define SHIFT_TAB_BITS 13
+#define HASH_TAB_BITS 10 /* should be less than sizeof(HASH) * 8 */
+#define SHIFT_TAB_SIZE (1U << SHIFT_TAB_BITS)
+#define HASH_TAB_SIZE (1U << HASH_TAB_BITS)
+
+typedef struct {
+ int B; /* size of suffixes */
+ int Bp; /* size of prefixes */
+ STRLEN m; /* minimum pattern length */
+ int patnum; /* number of patterns */
+ SHIFT_TAB *shift; /* table mapping hash to allowed shift */
+ HASH_TAB *hash; /* table mapping hash to int (pair of pointers) */
+ HASH *prefix; /* array of hashes of prefixes by pattern suffix hash order */
+ PATNREPL *patterns; /* array of prefixes by pattern suffix hash order */
+} PPRES;
+/* }}} */
+
+/* {{{ php_strtr_hash */
+static inline HASH php_strtr_hash(const char *str, int len)
{
- zval **entry;
- char *string_key;
- uint string_key_len;
- zval **trans;
- zval ctmp;
- ulong num_key;
- int minlen = 128*1024;
- int maxlen = 0, pos, len, found;
- char *key;
- HashPosition hpos;
- smart_str result = {0};
- HashTable tmp_hash;
-
- zend_hash_init(&tmp_hash, zend_hash_num_elements(hash), NULL, NULL, 0);
- zend_hash_internal_pointer_reset_ex(hash, &hpos);
- while (zend_hash_get_current_data_ex(hash, (void **)&entry, &hpos) == SUCCESS) {
- switch (zend_hash_get_current_key_ex(hash, &string_key, &string_key_len, &num_key, 0, &hpos)) {
- case HASH_KEY_IS_STRING:
- len = string_key_len-1;
- if (len < 1) {
- zend_hash_destroy(&tmp_hash);
- RETURN_FALSE;
- }
- zend_hash_add(&tmp_hash, string_key, string_key_len, entry, sizeof(zval*), NULL);
- if (len > maxlen) {
- maxlen = len;
- }
- if (len < minlen) {
- minlen = len;
- }
- break;
+ HASH res = 0;
+ int i;
+ for (i = 0; i < len; i++) {
+ res = (res << 5) + res + (unsigned char)str[i];
+ }
- case HASH_KEY_IS_LONG:
- Z_TYPE(ctmp) = IS_LONG;
- Z_LVAL(ctmp) = num_key;
+ return res;
+}
+/* }}} */
+/* {{{ php_strtr_populate_shift */
+static inline void php_strtr_populate_shift(PATNREPL *patterns, int patnum, int B, STRLEN m, SHIFT_TAB *shift)
+{
+ int i;
+ STRLEN j,
+ max_shift;
+
+ max_shift = m - B + 1;
+ for (i = 0; i < SHIFT_TAB_SIZE; i++) {
+ shift->entries[i] = max_shift;
+ }
+ for (i = 0; i < patnum; i++) {
+ for (j = 0; j < m - B + 1; j++) {
+ HASH h = php_strtr_hash(&S(&patterns[i].pat)[j], B) & shift->table_mask;
+ assert((long long) m - (long long) j - B >= 0);
+ shift->entries[h] = MIN(shift->entries[h], m - j - B);
+ }
+ }
+}
+/* }}} */
+/* {{{ php_strtr_compare_hash_suffix */
+static int php_strtr_compare_hash_suffix(const void *a, const void *b, void *ctx_g)
+{
+ const PPRES *res = ctx_g;
+ const PATNREPL *pnr_a = a,
+ *pnr_b = b;
+ HASH hash_a = php_strtr_hash(&S(&pnr_a->pat)[res->m - res->B], res->B)
+ & res->hash->table_mask,
+ hash_b = php_strtr_hash(&S(&pnr_b->pat)[res->m - res->B], res->B)
+ & res->hash->table_mask;
+ /* TODO: don't recalculate the hashes all the time */
+ return hash_a - hash_b;
+}
+/* }}} */
- convert_to_string(&ctmp);
- len = Z_STRLEN(ctmp);
- zend_hash_add(&tmp_hash, Z_STRVAL(ctmp), len+1, entry, sizeof(zval*), NULL);
- zval_dtor(&ctmp);
+/* {{{ PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp) */
+static PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp)
+{
+ int i;
+ PPRES *res = emalloc(sizeof *res);
- if (len > maxlen) {
- maxlen = len;
- }
- if (len < minlen) {
- minlen = len;
- }
- break;
+ res->m = (STRLEN)-1;
+ for (i = 0; i < patnum; i++) {
+ if (L(&patterns[i].pat) < res->m) {
+ res->m = L(&patterns[i].pat);
}
- zend_hash_move_forward_ex(hash, &hpos);
}
+ assert(res->m > 0);
+ res->B = B = MIN(B, res->m);
+ res->Bp = Bp = MIN(Bp, res->m);
- key = emalloc(maxlen+1);
- pos = 0;
+ res->shift = safe_emalloc(SHIFT_TAB_SIZE, sizeof(*res->shift->entries), sizeof(*res->shift));
+ res->shift->table_mask = SHIFT_TAB_SIZE - 1;
+ php_strtr_populate_shift(patterns, patnum, B, res->m, res->shift);
- while (pos < slen) {
- if ((pos + maxlen) > slen) {
- maxlen = slen - pos;
- }
+ res->hash = safe_emalloc(HASH_TAB_SIZE, sizeof(*res->hash->entries), sizeof(*res->shift));
+ res->hash->table_mask = HASH_TAB_SIZE - 1;
+
+ res->patterns = safe_emalloc(patnum, sizeof(*res->patterns), 0);
+ memcpy(res->patterns, patterns, sizeof(*patterns) * patnum);
+ qsort_r(res->patterns, patnum, sizeof(*res->patterns), php_strtr_compare_hash_suffix, res);
+
+ res->prefix = safe_emalloc(patnum, sizeof(*res->prefix), 0);
+ for (i = 0; i < patnum; i++) {
+ res->prefix[i] = php_strtr_hash(S(&res->patterns[i].pat), Bp);
+ }
- found = 0;
- memcpy(key, str+pos, maxlen);
+ /* Initialize the rest of ->hash */
+ for (i = 0; i < HASH_TAB_SIZE; i++) {
+ res->hash->entries[i] = -1;
+ }
+ {
+ HASH last_h = -1; /* assumes not all bits are used in res->hash */
+ /* res->patterns is already ordered by hash.
+ * Make res->hash->entries[h] de index of the first pattern in
+ * res->patterns that has hash h */
+ for (i = 0; i < patnum; i++) {
+ HASH h = php_strtr_hash(&S(&res->patterns[i].pat)[res->m - res->B], res->B)
+ & res->hash->table_mask;
+ if (h != last_h) {
+ res->hash->entries[h] = i;
+ last_h = h;
+ }
+ }
+ }
+ res->hash->entries[HASH_TAB_SIZE] = patnum;
+ for (i = HASH_TAB_SIZE - 1; i >= 0; i--) {
+ if (res->hash->entries[i] == -1) {
+ res->hash->entries[i] = res->hash->entries[i + 1];
+ }
+ }
- for (len = maxlen; len >= minlen; len--) {
- key[len] = 0;
+ res->patnum = patnum;
- if (zend_hash_find(&tmp_hash, key, len+1, (void**)&trans) == SUCCESS) {
- char *tval;
- int tlen;
- zval tmp;
+ return res;
+}
+/* }}} */
+/* {{{ php_strtr_array_destroy_ppres(PPRES *d) */
+static void php_strtr_array_destroy_ppres(PPRES *d)
+{
+ efree(d->shift);
+ efree(d->hash);
+ efree(d->prefix);
+ efree(d->patterns);
+ efree(d);
+}
+/* }}} */
- if (Z_TYPE_PP(trans) != IS_STRING) {
- tmp = **trans;
- zval_copy_ctor(&tmp);
- convert_to_string(&tmp);
- tval = Z_STRVAL(tmp);
- tlen = Z_STRLEN(tmp);
- } else {
- tval = Z_STRVAL_PP(trans);
- tlen = Z_STRLEN_PP(trans);
- }
+/* {{{ php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) */
+static void php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value)
+{
+ STRLEN pos = 0,
+ lastpos = L(text) - d->m;
+ smart_str result = {0};
- smart_str_appendl(&result, tval, tlen);
- pos += len;
- found = 1;
+ while (pos <= lastpos) {
+ HASH h = php_strtr_hash(&S(text)[pos + d->m - d->B], d->B) & d->shift->table_mask;
+ STRLEN shift = d->shift->entries[h];
- if (Z_TYPE_PP(trans) != IS_STRING) {
- zval_dtor(&tmp);
- }
- break;
+ if (shift > 0) {
+ smart_str_appendl(&result, &S(text)[pos], shift);
+ pos += shift;
+ } else {
+ HASH h2 = h & d->hash->table_mask,
+ prefix_h = php_strtr_hash(&S(text)[pos], d->Bp);
+
+ int offset_start = d->hash->entries[h2],
+ offset_end = d->hash->entries[h2 + 1], /* exclusive */
+ i = 0;
+
+ for (i = offset_start; i < offset_end; i++) {
+ PATNREPL *pnr;
+ if (d->prefix[i] != prefix_h)
+ continue;
+
+ pnr = &d->patterns[i];
+ if (L(&pnr->pat) > L(text) - pos ||
+ memcmp(S(&pnr->pat), &S(text)[pos], L(&pnr->pat)) != 0)
+ continue;
+
+ smart_str_appendl(&result, S(&pnr->repl), (int)L(&pnr->repl));
+ pos += L(&pnr->pat);
+ goto end_outer_loop;
}
+
+ smart_str_appendc(&result, S(text)[pos]);
+ pos++;
+end_outer_loop: ;
}
+ }
+
+ if (pos < L(text)) {
+ smart_str_appendl(&result, &S(text)[pos], (int)(L(text) - pos));
+ }
+
+ if (result.c != NULL) {
+ smart_str_0(&result);
+ RETVAL_STRINGL(result.c, result.len, 0);
+ } else {
+ RETURN_EMPTY_STRING();
+ }
+}
+/* }}} */
+
+/* {{{ php_strtr_array */
+static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *pats)
+{
+ PPRES *data;
+ STR text;
+ PATNREPL *patterns;
+ HashPosition hpos;
+ zval **entry;
+ int num_pats = zend_hash_num_elements(pats),
+ i;
+
+ S(&text) = str;
+ L(&text) = slen;
+ patterns = safe_emalloc(num_pats, sizeof(*patterns), 0);
+
+ for (i = 0, zend_hash_internal_pointer_reset_ex(pats, &hpos);
+ zend_hash_get_current_data_ex(pats, (void **)&entry, &hpos) == SUCCESS;
+ i++, zend_hash_move_forward_ex(pats, &hpos)) {
+ char *string_key;
+ uint string_key_len;
+ ulong num_key;
+ int free_str = 0,
+ free_repl = 0;
+ zval *tzv;
+
+ switch (zend_hash_get_current_key_ex(pats, &string_key, &string_key_len, &num_key, 0, &hpos)) {
+ case HASH_KEY_IS_LONG:
+ string_key_len = 1 + zend_spprintf(&string_key, 0, "%ld", (long)num_key);
+ free_str = 1;
+ /* break missing intentionally */
+
+ case HASH_KEY_IS_STRING:
+ string_key_len--; /* exclude final '\0' */
+ if (string_key_len == 0) { /* empty string given as pattern */
+ efree(patterns);
+ RETURN_FALSE;
+ }
+ if (string_key_len > slen) { /* this pattern can never match */
+ continue;
+ }
+
+ if (Z_TYPE_PP(entry) != IS_STRING) {
+ tzv = *entry;
+ zval_addref_p(tzv);
+ SEPARATE_ZVAL(&tzv);
+ convert_to_string(tzv);
+ entry = &tzv;
+ free_repl = 1;
+ }
- if (! found) {
- smart_str_appendc(&result, str[pos++]);
+ S(&patterns[i].pat) = string_key;
+ L(&patterns[i].pat) = string_key_len;
+ S(&patterns[i].repl) = Z_STRVAL_PP(entry);
+ L(&patterns[i].repl) = Z_STRLEN_PP(entry);
}
}
- efree(key);
- zend_hash_destroy(&tmp_hash);
- smart_str_0(&result);
- RETVAL_STRINGL(result.c, result.len, 0);
+ data = php_strtr_array_prepare(&text, patterns, i, 2, 2);
+ efree(patterns);
+ php_strtr_array_do_repl(&text, data, return_value);
+ php_strtr_array_destroy_ppres(data);
}
/* }}} */