/* +----------------------------------------------------------------------+ | PHP Version 5 | +----------------------------------------------------------------------+ | Copyright (c) 1997-2014 The PHP Group | +----------------------------------------------------------------------+ | This source file is subject to version 3.01 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | http://www.php.net/license/3_01.txt | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@php.net so we can mail you a copy immediately. | +----------------------------------------------------------------------+ | Authors: Rasmus Lerdorf | | Jim Winstead | | Jaakko Hyvätti | +----------------------------------------------------------------------+ */ /* $Id$ */ #include #include #include "php.h" #include "ext/standard/php_string.h" #include "php_ereg.h" #include "ext/standard/info.h" /* {{{ arginfo */ ZEND_BEGIN_ARG_INFO_EX(arginfo_ereg, 0, 0, 2) ZEND_ARG_INFO(0, pattern) ZEND_ARG_INFO(0, string) ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_INFO(arginfo_ereg_replace, 0) ZEND_ARG_INFO(0, pattern) ZEND_ARG_INFO(0, replacement) ZEND_ARG_INFO(0, string) ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_INFO_EX(arginfo_split, 0, 0, 2) ZEND_ARG_INFO(0, pattern) ZEND_ARG_INFO(0, string) ZEND_ARG_INFO(0, limit) ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_INFO(arginfo_sql_regcase, 0) ZEND_ARG_INFO(0, string) ZEND_END_ARG_INFO() /* }}} */ /* {{{ Function table */ const zend_function_entry ereg_functions[] = { PHP_DEP_FE(ereg, arginfo_ereg) PHP_DEP_FE(ereg_replace, arginfo_ereg_replace) PHP_DEP_FE(eregi, arginfo_ereg) PHP_DEP_FE(eregi_replace, arginfo_ereg_replace) PHP_DEP_FE(split, arginfo_split) PHP_DEP_FE(spliti, arginfo_split) PHP_DEP_FE(sql_regcase, arginfo_sql_regcase) PHP_FE_END }; /* }}} */ /* {{{ reg_cache */ typedef struct { regex_t preg; int cflags; unsigned long lastuse; } reg_cache; static int reg_magic = 0; #define EREG_CACHE_SIZE 4096 /* }}} */ ZEND_DECLARE_MODULE_GLOBALS(ereg) static PHP_GINIT_FUNCTION(ereg); static PHP_GSHUTDOWN_FUNCTION(ereg); /* {{{ Module entry */ zend_module_entry ereg_module_entry = { STANDARD_MODULE_HEADER, "ereg", ereg_functions, NULL, NULL, NULL, NULL, PHP_MINFO(ereg), NO_VERSION_YET, PHP_MODULE_GLOBALS(ereg), PHP_GINIT(ereg), PHP_GSHUTDOWN(ereg), NULL, STANDARD_MODULE_PROPERTIES_EX }; /* }}} */ /* {{{ COMPILE_DL_EREG */ #ifdef COMPILE_DL_EREG ZEND_GET_MODULE(ereg) #endif /* }}} */ /* {{{ ereg_lru_cmp */ static int ereg_lru_cmp(const void *a, const void *b TSRMLS_DC) { Bucket *f = (Bucket *) a; Bucket *s = (Bucket *) b; if (((reg_cache *)Z_PTR(f->val))->lastuse < ((reg_cache *)Z_PTR(s->val))->lastuse) { return -1; } else if (((reg_cache *)Z_PTR(f->val))->lastuse == ((reg_cache *)Z_PTR(s->val))->lastuse) { return 0; } else { return 1; } } /* }}} */ /* {{{ static ereg_clean_cache */ static int ereg_clean_cache(zval *data, void *arg TSRMLS_DC) { int *num_clean = (int *)arg; if (*num_clean > 0) { (*num_clean)--; return ZEND_HASH_APPLY_REMOVE; } else { return ZEND_HASH_APPLY_STOP; } } /* }}} */ /* {{{ _php_regcomp */ static int _php_regcomp(regex_t *preg, const char *pattern, int cflags TSRMLS_DC) { int r = 0; int patlen = strlen(pattern); reg_cache *rc = NULL; if (zend_hash_num_elements(&EREG(ht_rc)) >= EREG_CACHE_SIZE) { /* easier than dealing with overflow as it happens */ if (EREG(lru_counter) >= (1 << 31) || zend_hash_sort(&EREG(ht_rc), zend_qsort, ereg_lru_cmp, 0 TSRMLS_CC) == FAILURE) { zend_hash_clean(&EREG(ht_rc)); EREG(lru_counter) = 0; } else { int num_clean = EREG_CACHE_SIZE / 4; zend_hash_apply_with_argument(&EREG(ht_rc), ereg_clean_cache, &num_clean TSRMLS_CC); } } rc = zend_hash_str_find_ptr(&EREG(ht_rc), pattern, patlen); if (rc && rc->cflags == cflags) { #ifdef HAVE_REGEX_T_RE_MAGIC /* * We use a saved magic number to see whether cache is corrupted, and if it * is, we flush it and compile the pattern from scratch. */ if (rc->preg.re_magic != reg_magic) { zend_hash_clean(&EREG(ht_rc)); EREG(lru_counter) = 0; } else { memcpy(preg, &rc->preg, sizeof(*preg)); return r; } } r = regcomp(preg, pattern, cflags); if(!r) { reg_cache rcp; rcp.cflags = cflags; rcp.lastuse = ++(EREG(lru_counter)); memcpy(&rcp.preg, preg, sizeof(*preg)); /* * Since we don't have access to the actual MAGIC1 definition in the private * header file, we save the magic value immediately after compilation. Hopefully, * it's good. */ if (!reg_magic) reg_magic = preg->re_magic; zend_hash_str_update_mem(&EREG(ht_rc), pattern, patlen, &rcp, sizeof(rcp)); } #else memcpy(preg, &rc->preg, sizeof(*preg)); } else { r = regcomp(preg, pattern, cflags); if(!r) { reg_cache rcp; rcp.cflags = cflags; rcp.lastuse = ++(EREG(lru_counter)); memcpy(&rcp.preg, preg, sizeof(*preg)); zend_hash_str_update_mem(&EREG(ht_rc), pattern, patlen, &rcp, sizeof(rcp)); } } #endif return r; } /* }}} */ static void _free_ereg_cache(zval *zv) { reg_cache *rc = Z_PTR_P(zv); regfree(&rc->preg); free(rc); } #undef regfree #define regfree(a); #undef regcomp #define regcomp(a, b, c) _php_regcomp(a, b, c TSRMLS_CC) /* {{{ PHP_GINIT_FUNCTION */ static PHP_GINIT_FUNCTION(ereg) { zend_hash_init(&ereg_globals->ht_rc, 0, NULL, _free_ereg_cache, 1); ereg_globals->lru_counter = 0; } /* }}} */ /* {{{ PHP_GSHUTDOWN_FUNCTION */ static PHP_GSHUTDOWN_FUNCTION(ereg) { zend_hash_destroy(&ereg_globals->ht_rc); } /* }}} */ PHP_MINFO_FUNCTION(ereg) { php_info_print_table_start(); #if HSREGEX php_info_print_table_row(2, "Regex Library", "Bundled library enabled"); #else php_info_print_table_row(2, "Regex Library", "System library enabled"); #endif php_info_print_table_end(); } /* {{{ php_ereg_eprint * php_ereg_eprint - convert error number to name */ static void php_ereg_eprint(int err, regex_t *re TSRMLS_DC) { char *buf = NULL, *message = NULL; size_t len; size_t buf_len; #ifdef REG_ITOA /* get the length of the message */ buf_len = regerror(REG_ITOA | err, re, NULL, 0); if (buf_len) { buf = (char *)safe_emalloc(buf_len, sizeof(char), 0); if (!buf) return; /* fail silently */ /* finally, get the error message */ regerror(REG_ITOA | err, re, buf, buf_len); } #else buf_len = 0; #endif len = regerror(err, re, NULL, 0); if (len) { message = (char *)safe_emalloc((buf_len + len + 2), sizeof(char), 0); if (!message) { return; /* fail silently */ } if (buf_len) { snprintf(message, buf_len, "%s: ", buf); buf_len += 1; /* so pointer math below works */ } /* drop the message into place */ regerror(err, re, message + buf_len, len); php_error_docref(NULL TSRMLS_CC, E_WARNING, "%s", message); } if (buf) efree(buf); if (message) efree(message); } /* }}} */ /* {{{ php_ereg */ static void php_ereg(INTERNAL_FUNCTION_PARAMETERS, int icase) { zval *regex, /* Regular expression */ *array = NULL; /* Optional register array */ char *findin; /* String to apply expression to */ int findin_len; regex_t re; regmatch_t *subs; int err, match_len, string_len; uint i; int copts = 0; off_t start, end; char *buf = NULL; char *string = NULL; int argc = ZEND_NUM_ARGS(); if (zend_parse_parameters(argc TSRMLS_CC, "zs|z/", ®ex, &findin, &findin_len, &array) == FAILURE) { return; } if (icase) { copts |= REG_ICASE; } if (argc == 2) { copts |= REG_NOSUB; } /* compile the regular expression from the supplied regex */ if (Z_TYPE_P(regex) == IS_STRING) { err = regcomp(&re, Z_STRVAL_P(regex), REG_EXTENDED | copts); } else { /* we convert numbers to integers and treat them as a string */ if (Z_TYPE_P(regex) == IS_DOUBLE) { convert_to_long_ex(regex); /* get rid of decimal places */ } convert_to_string_ex(regex); /* don't bother doing an extended regex with just a number */ err = regcomp(&re, Z_STRVAL_P(regex), copts); } if (err) { php_ereg_eprint(err, &re TSRMLS_CC); RETURN_FALSE; } /* make a copy of the string we're looking in */ string = estrndup(findin, findin_len); /* allocate storage for (sub-)expression-matches */ subs = (regmatch_t *)ecalloc(sizeof(regmatch_t),re.re_nsub+1); /* actually execute the regular expression */ err = regexec(&re, string, re.re_nsub+1, subs, 0); if (err && err != REG_NOMATCH) { php_ereg_eprint(err, &re TSRMLS_CC); regfree(&re); efree(subs); RETURN_FALSE; } match_len = 1; if (array && err != REG_NOMATCH) { match_len = (int) (subs[0].rm_eo - subs[0].rm_so); string_len = findin_len + 1; buf = emalloc(string_len); zval_dtor(array); /* start with clean array */ array_init(array); for (i = 0; i <= re.re_nsub; i++) { start = subs[i].rm_so; end = subs[i].rm_eo; if (start != -1 && end > 0 && start < string_len && end < string_len && start < end) { add_index_stringl(array, i, string+start, end-start); } else { add_index_bool(array, i, 0); } } efree(buf); } efree(subs); efree(string); if (err == REG_NOMATCH) { RETVAL_FALSE; } else { if (match_len == 0) match_len = 1; RETVAL_LONG(match_len); } regfree(&re); } /* }}} */ /* {{{ proto int ereg(string pattern, string string [, array registers]) Regular expression match */ PHP_FUNCTION(ereg) { php_ereg(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); } /* }}} */ /* {{{ proto int eregi(string pattern, string string [, array registers]) Case-insensitive regular expression match */ PHP_FUNCTION(eregi) { php_ereg(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); } /* }}} */ /* {{{ php_ereg_replace * this is the meat and potatoes of regex replacement! */ PHP_EREG_API char *php_ereg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended TSRMLS_DC) { regex_t re; regmatch_t *subs; char *buf, /* buf is where we build the replaced string */ *nbuf, /* nbuf is used when we grow the buffer */ *walkbuf; /* used to walk buf when replacing backrefs */ const char *walk; /* used to walk replacement string for backrefs */ int buf_len; int pos, tmp, string_len, new_l; int err, copts = 0; string_len = strlen(string); if (icase) { copts = REG_ICASE; } if (extended) { copts |= REG_EXTENDED; } err = regcomp(&re, pattern, copts); if (err) { php_ereg_eprint(err, &re TSRMLS_CC); return ((char *) -1); } /* allocate storage for (sub-)expression-matches */ subs = (regmatch_t *)ecalloc(sizeof(regmatch_t),re.re_nsub+1); /* start with a buffer that is twice the size of the stringo we're doing replacements in */ buf_len = 2 * string_len + 1; buf = safe_emalloc(buf_len, sizeof(char), 0); err = pos = 0; buf[0] = '\0'; while (!err) { err = regexec(&re, &string[pos], re.re_nsub+1, subs, (pos ? REG_NOTBOL : 0)); if (err && err != REG_NOMATCH) { php_ereg_eprint(err, &re TSRMLS_CC); efree(subs); efree(buf); regfree(&re); return ((char *) -1); } if (!err) { /* backref replacement is done in two passes: 1) find out how long the string will be, and allocate buf 2) copy the part before match, replacement and backrefs to buf Jaakko Hyvätti */ new_l = strlen(buf) + subs[0].rm_so; /* part before the match */ walk = replace; while (*walk) { if ('\\' == *walk && isdigit((unsigned char)walk[1]) && ((unsigned char)walk[1]) - '0' <= (int)re.re_nsub) { if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1) { new_l += subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so; } walk += 2; } else { new_l++; walk++; } } if (new_l + 1 > buf_len) { buf_len = 1 + buf_len + 2 * new_l; nbuf = emalloc(buf_len); strncpy(nbuf, buf, buf_len - 1); nbuf[buf_len - 1] = '\0'; efree(buf); buf = nbuf; } tmp = strlen(buf); /* copy the part of the string before the match */ strncat(buf, &string[pos], subs[0].rm_so); /* copy replacement and backrefs */ walkbuf = &buf[tmp + subs[0].rm_so]; walk = replace; while (*walk) { if ('\\' == *walk && isdigit((unsigned char)walk[1]) && (unsigned char)walk[1] - '0' <= (int)re.re_nsub) { if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1 /* this next case shouldn't happen. it does. */ && subs[walk[1] - '0'].rm_so <= subs[walk[1] - '0'].rm_eo) { tmp = subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so; memcpy (walkbuf, &string[pos + subs[walk[1] - '0'].rm_so], tmp); walkbuf += tmp; } walk += 2; } else { *walkbuf++ = *walk++; } } *walkbuf = '\0'; /* and get ready to keep looking for replacements */ if (subs[0].rm_so == subs[0].rm_eo) { if (subs[0].rm_so + pos >= string_len) { break; } new_l = strlen (buf) + 1; if (new_l + 1 > buf_len) { buf_len = 1 + buf_len + 2 * new_l; nbuf = safe_emalloc(buf_len, sizeof(char), 0); strncpy(nbuf, buf, buf_len-1); efree(buf); buf = nbuf; } pos += subs[0].rm_eo + 1; buf [new_l-1] = string [pos-1]; buf [new_l] = '\0'; } else { pos += subs[0].rm_eo; } } else { /* REG_NOMATCH */ new_l = strlen(buf) + strlen(&string[pos]); if (new_l + 1 > buf_len) { buf_len = new_l + 1; /* now we know exactly how long it is */ nbuf = safe_emalloc(buf_len, sizeof(char), 0); strncpy(nbuf, buf, buf_len-1); efree(buf); buf = nbuf; } /* stick that last bit of string on our output */ strlcat(buf, &string[pos], buf_len); } } /* don't want to leak memory .. */ efree(subs); regfree(&re); /* whew. */ return (buf); } /* }}} */ /* {{{ php_do_ereg_replace */ static void php_do_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase) { zval *arg_pattern, *arg_replace; zend_string *pattern, *arg_string; zend_string *string; zend_string *replace; char *ret; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "zzS", &arg_pattern, &arg_replace, &arg_string) == FAILURE) { return; } if (Z_TYPE_P(arg_pattern) == IS_STRING) { if (Z_STRVAL_P(arg_pattern) && Z_STRLEN_P(arg_pattern)) { pattern = STR_COPY(Z_STR_P(arg_pattern)); } else { pattern = STR_EMPTY_ALLOC(); } } else { convert_to_long_ex(arg_pattern); pattern = STR_ALLOC(1, 0); pattern->val[0] = (char) Z_LVAL_P(arg_pattern); pattern->val[1] = '\0'; } if (Z_TYPE_P(arg_replace) == IS_STRING) { if (Z_STRVAL_P(arg_replace) && Z_STRLEN_P(arg_replace)) { replace = STR_COPY(Z_STR_P(arg_replace)); } else { replace = STR_EMPTY_ALLOC(); } } else { convert_to_long_ex(arg_replace); replace = STR_ALLOC(1, 0); replace->val[0] = (char) Z_LVAL_P(arg_replace); replace->val[1] = '\0'; } if (arg_string) { string = STR_COPY(arg_string); } else { string = STR_EMPTY_ALLOC(); } /* do the actual work */ ret = php_ereg_replace(pattern->val, replace->val, string->val, icase, 1 TSRMLS_CC); if (ret == (char *) -1) { RETVAL_FALSE; } else { RETVAL_STRING(ret); efree(ret); } STR_RELEASE(string); STR_RELEASE(replace); STR_RELEASE(pattern); } /* }}} */ /* {{{ proto string ereg_replace(string pattern, string replacement, string string) Replace regular expression */ PHP_FUNCTION(ereg_replace) { php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); } /* }}} */ /* {{{ proto string eregi_replace(string pattern, string replacement, string string) Case insensitive replace regular expression */ PHP_FUNCTION(eregi_replace) { php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); } /* }}} */ /* {{{ php_split */ static void php_split(INTERNAL_FUNCTION_PARAMETERS, int icase) { long count = -1; regex_t re; regmatch_t subs[1]; char *spliton, *str, *strp, *endp; int spliton_len, str_len; int err, size, copts = 0; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", &spliton, &spliton_len, &str, &str_len, &count) == FAILURE) { return; } if (icase) { copts = REG_ICASE; } strp = str; endp = strp + str_len; err = regcomp(&re, spliton, REG_EXTENDED | copts); if (err) { php_ereg_eprint(err, &re TSRMLS_CC); RETURN_FALSE; } array_init(return_value); /* churn through str, generating array entries as we go */ while ((count == -1 || count > 1) && !(err = regexec(&re, strp, 1, subs, 0))) { if (subs[0].rm_so == 0 && subs[0].rm_eo) { /* match is at start of string, return empty string */ add_next_index_stringl(return_value, "", 0); /* skip ahead the length of the regex match */ strp += subs[0].rm_eo; } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) { /* No more matches */ regfree(&re); php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid Regular Expression"); zend_hash_destroy(Z_ARRVAL_P(return_value)); efree(Z_ARR_P(return_value)); RETURN_FALSE; } else { /* On a real match */ /* make a copy of the substring */ size = subs[0].rm_so; /* add it to the array */ add_next_index_stringl(return_value, strp, size); /* point at our new starting point */ strp = strp + subs[0].rm_eo; } /* if we're only looking for a certain number of points, stop looking once we hit it */ if (count != -1) { count--; } } /* see if we encountered an error */ if (err && err != REG_NOMATCH) { php_ereg_eprint(err, &re TSRMLS_CC); regfree(&re); zend_hash_destroy(Z_ARRVAL_P(return_value)); efree(Z_ARR_P(return_value)); RETURN_FALSE; } /* otherwise we just have one last element to add to the array */ size = endp - strp; add_next_index_stringl(return_value, strp, size); regfree(&re); } /* }}} */ /* {{{ proto array split(string pattern, string string [, int limit]) Split string into array by regular expression */ PHP_FUNCTION(split) { php_split(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); } /* }}} */ /* {{{ proto array spliti(string pattern, string string [, int limit]) Split string into array by regular expression case-insensitive */ PHP_FUNCTION(spliti) { php_split(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); } /* }}} */ /* {{{ proto string sql_regcase(string string) Make regular expression for case insensitive match */ PHP_EREG_API PHP_FUNCTION(sql_regcase) { char *string, *tmp; int string_len; unsigned char c; register int i, j; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &string, &string_len) == FAILURE) { return; } tmp = safe_emalloc(string_len, 4, 1); for (i = j = 0; i < string_len; i++) { c = (unsigned char) string[i]; if (isalpha(c)) { tmp[j++] = '['; tmp[j++] = toupper(c); tmp[j++] = tolower(c); tmp[j++] = ']'; } else { tmp[j++] = c; } } tmp[j] = 0; RETVAL_STRINGL(tmp, j); efree(tmp); } /* }}} */ /* * Local variables: * tab-width: 4 * c-basic-offset: 4 * End: * vim600: noet sw=4 ts=4 fdm=marker * vim<600: noet sw=4 ts=4 */