diff options
Diffstat (limited to 'ext/ereg/ereg.c')
-rw-r--r-- | ext/ereg/ereg.c | 748 |
1 files changed, 748 insertions, 0 deletions
diff --git a/ext/ereg/ereg.c b/ext/ereg/ereg.c new file mode 100644 index 0000000000..d21d1b0c78 --- /dev/null +++ b/ext/ereg/ereg.c @@ -0,0 +1,748 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997-2007 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Rasmus Lerdorf <rasmus@php.net> | + | Jim Winstead <jimw@php.net> | + | Jaakko Hyvätti <jaakko@hyvatti.iki.fi> | + +----------------------------------------------------------------------+ + */ +/* $Id$ */ + +#include <stdio.h> +#include <ctype.h> +#include "php.h" +#include "ext/standard/php_string.h" +#include "php_ereg.h" +#include "ext/standard/info.h" + +/* {{{ arginfo */ +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_ereg, 0, 0, 2) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, string) + ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_eregi, 0, 0, 2) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, string) + ZEND_ARG_INFO(1, registers) /* ARRAY_INFO(1, registers, 1) */ +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_ereg_replace, 0) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, replacement) + ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_eregi_replace, 0) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, replacement) + ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_split, 0, 0, 2) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, string) + ZEND_ARG_INFO(0, limit) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO_EX(arginfo_spliti, 0, 0, 2) + ZEND_ARG_INFO(0, pattern) + ZEND_ARG_INFO(0, string) + ZEND_ARG_INFO(0, limit) +ZEND_END_ARG_INFO() + +static +ZEND_BEGIN_ARG_INFO(arginfo_sql_regcase, 0) + ZEND_ARG_INFO(0, string) +ZEND_END_ARG_INFO() +/* }}} */ + +/* {{{ Function table */ +const zend_function_entry ereg_functions[] = { + PHP_FE(ereg, arginfo_ereg) + PHP_FE(ereg_replace, arginfo_ereg_replace) + PHP_FE(eregi, arginfo_eregi) + PHP_FE(eregi_replace, arginfo_eregi_replace) + PHP_FE(split, arginfo_split) + PHP_FE(spliti, arginfo_spliti) + PHP_FE(sql_regcase, arginfo_sql_regcase) + {NULL, NULL, NULL} +}; +/* }}} */ + +/* {{{ reg_cache */ +typedef struct { + regex_t preg; + int cflags; +} reg_cache; +static int reg_magic = 0; +/* }}} */ + +ZEND_DECLARE_MODULE_GLOBALS(ereg) + +/* {{{ Module entry */ +zend_module_entry ereg_module_entry = { + STANDARD_MODULE_HEADER, + "ereg", + ereg_functions, + PHP_MINIT(ereg), + PHP_MSHUTDOWN(ereg), + NULL, + NULL, + PHP_MINFO(ereg), + NO_VERSION_YET, + STANDARD_MODULE_PROPERTIES +}; +/* }}} */ + +/* {{{ _php_regcomp + */ +static int _php_regcomp(regex_t *preg, const char *pattern, int cflags) +{ + int r = 0; + int patlen = strlen(pattern); + reg_cache *rc = NULL; + TSRMLS_FETCH(); + + if(zend_hash_find(&EREG(ht_rc), (char *) pattern, patlen+1, (void **) &rc) == SUCCESS + && rc->cflags == cflags) { +#ifdef HAVE_REGEX_T_RE_MAGIC + /* + * We use a saved magic number to see whether cache is corrupted, and if it + * is, we flush it and compile the pattern from scratch. + */ + if (rc->preg.re_magic != reg_magic) { + zend_hash_clean(&EREG(ht_rc)); + } else { + memcpy(preg, &rc->preg, sizeof(*preg)); + return r; + } + } + + r = regcomp(preg, pattern, cflags); + if(!r) { + reg_cache rcp; + + rcp.cflags = cflags; + memcpy(&rcp.preg, preg, sizeof(*preg)); + /* + * Since we don't have access to the actual MAGIC1 definition in the private + * header file, we save the magic value immediately after compilation. Hopefully, + * it's good. + */ + if (!reg_magic) reg_magic = preg->re_magic; + zend_hash_update(&EREG(ht_rc), (char *) pattern, patlen+1, + (void *) &rcp, sizeof(rcp), NULL); + } +#else + memcpy(preg, &rc->preg, sizeof(*preg)); + } else { + r = regcomp(preg, pattern, cflags); + if(!r) { + reg_cache rcp; + + rcp.cflags = cflags; + memcpy(&rcp.preg, preg, sizeof(*preg)); + zend_hash_update(&EREG(ht_rc), (char *) pattern, patlen+1, + (void *) &rcp, sizeof(rcp), NULL); + } + } +#endif + return r; +} +/* }}} */ + +static void _free_ereg_cache(reg_cache *rc) +{ + regfree(&rc->preg); +} + +#undef regfree +#define regfree(a); +#undef regcomp +#define regcomp(a, b, c) _php_regcomp(a, b, c) + +static void php_ereg_init_globals(zend_ereg_globals *ereg_globals TSRMLS_DC) +{ + zend_hash_init(&ereg_globals->ht_rc, 0, NULL, (void (*)(void *)) _free_ereg_cache, 1); +} + +static void php_ereg_destroy_globals(zend_ereg_globals *ereg_globals TSRMLS_DC) +{ + zend_hash_destroy(&ereg_globals->ht_rc); +} + +PHP_MINIT_FUNCTION(ereg) +{ + ZEND_INIT_MODULE_GLOBALS(ereg, php_ereg_init_globals, php_ereg_destroy_globals); + return SUCCESS; +} + +PHP_MSHUTDOWN_FUNCTION(ereg) +{ +#ifndef ZTS + php_ereg_destroy_globals(&ereg_globals TSRMLS_CC); +#endif + + return SUCCESS; +} + +PHP_MINFO_FUNCTION(ereg) +{ + php_info_print_table_start(); +#if HSREGEX + php_info_print_table_row(2, "Regex Library", "Bundled library enabled"); +#else + php_info_print_table_row(2, "Regex Library", "System library enabled"); +#endif + php_info_print_table_end(); +} + + +/* {{{ php_ereg_eprint + * php_ereg_eprint - convert error number to name + */ +static void php_ereg_eprint(int err, regex_t *re) { + char *buf = NULL, *message = NULL; + size_t len; + size_t buf_len; + +#ifdef REG_ITOA + /* get the length of the message */ + buf_len = regerror(REG_ITOA | err, re, NULL, 0); + if (buf_len) { + buf = (char *)safe_emalloc(buf_len, sizeof(char), 0); + if (!buf) return; /* fail silently */ + /* finally, get the error message */ + regerror(REG_ITOA | err, re, buf, buf_len); + } +#else + buf_len = 0; +#endif + len = regerror(err, re, NULL, 0); + if (len) { + TSRMLS_FETCH(); + + message = (char *)safe_emalloc((buf_len + len + 2), sizeof(char), 0); + if (!message) { + return; /* fail silently */ + } + if (buf_len) { + snprintf(message, buf_len, "%s: ", buf); + buf_len += 1; /* so pointer math below works */ + } + /* drop the message into place */ + regerror(err, re, message + buf_len, len); + + php_error_docref(NULL TSRMLS_CC, E_WARNING, "%s", message); + } + + STR_FREE(buf); + STR_FREE(message); +} +/* }}} */ + +/* {{{ php_ereg + */ +static void php_ereg(INTERNAL_FUNCTION_PARAMETERS, int icase) +{ + zval **regex, /* Regular expression */ + **findin, /* String to apply expression to */ + **array = NULL; /* Optional register array */ + regex_t re; + regmatch_t *subs; + int err, match_len, string_len; + uint i; + int copts = 0; + off_t start, end; + char *buf = NULL; + char *string = NULL; + int argc = ZEND_NUM_ARGS(); + + if (argc < 2 || argc > 3 || + zend_get_parameters_ex(argc, ®ex, &findin, &array) == FAILURE) { + WRONG_PARAM_COUNT; + } + + if (icase) + copts |= REG_ICASE; + + if (argc == 2) + copts |= REG_NOSUB; + + /* compile the regular expression from the supplied regex */ + if (Z_TYPE_PP(regex) == IS_STRING) { + err = regcomp(&re, Z_STRVAL_PP(regex), REG_EXTENDED | copts); + } else { + /* we convert numbers to integers and treat them as a string */ + if (Z_TYPE_PP(regex) == IS_DOUBLE) + convert_to_long_ex(regex); /* get rid of decimal places */ + convert_to_string_ex(regex); + /* don't bother doing an extended regex with just a number */ + err = regcomp(&re, Z_STRVAL_PP(regex), copts); + } + + if (err) { + php_ereg_eprint(err, &re); + RETURN_FALSE; + } + + /* make a copy of the string we're looking in */ + convert_to_string_ex(findin); + string = estrndup(Z_STRVAL_PP(findin), Z_STRLEN_PP(findin)); + + /* allocate storage for (sub-)expression-matches */ + subs = (regmatch_t *)ecalloc(sizeof(regmatch_t),re.re_nsub+1); + + /* actually execute the regular expression */ + err = regexec(&re, string, re.re_nsub+1, subs, 0); + if (err && err != REG_NOMATCH) { + php_ereg_eprint(err, &re); + regfree(&re); + efree(subs); + RETURN_FALSE; + } + match_len = 1; + + if (array && err != REG_NOMATCH) { + match_len = (int) (subs[0].rm_eo - subs[0].rm_so); + string_len = Z_STRLEN_PP(findin) + 1; + + buf = emalloc(string_len); + + zval_dtor(*array); /* start with clean array */ + array_init(*array); + + for (i = 0; i <= re.re_nsub; i++) { + start = subs[i].rm_so; + end = subs[i].rm_eo; + if (start != -1 && end > 0 && start < string_len && end < string_len && start < end) { + add_index_stringl(*array, i, string+start, end-start, 1); + } else { + add_index_bool(*array, i, 0); + } + } + efree(buf); + } + + efree(subs); + efree(string); + if (err == REG_NOMATCH) { + RETVAL_FALSE; + } else { + if (match_len == 0) + match_len = 1; + RETVAL_LONG(match_len); + } + regfree(&re); +} +/* }}} */ + +/* {{{ proto int ereg(string pattern, string string [, array registers]) + Regular expression match */ +PHP_FUNCTION(ereg) +{ + php_ereg(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +} +/* }}} */ + +/* {{{ proto int eregi(string pattern, string string [, array registers]) + Case-insensitive regular expression match */ +PHP_FUNCTION(eregi) +{ + php_ereg(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +} +/* }}} */ + +/* {{{ php_ereg_replace + * this is the meat and potatoes of regex replacement! */ +PHPAPI char *php_ereg_replace(const char *pattern, const char *replace, const char *string, int icase, int extended) +{ + regex_t re; + regmatch_t *subs; + + char *buf, /* buf is where we build the replaced string */ + *nbuf, /* nbuf is used when we grow the buffer */ + *walkbuf; /* used to walk buf when replacing backrefs */ + const char *walk; /* used to walk replacement string for backrefs */ + int buf_len; + int pos, tmp, string_len, new_l; + int err, copts = 0; + + string_len = strlen(string); + + if (icase) { + copts = REG_ICASE; + } + if (extended) { + copts |= REG_EXTENDED; + } + + err = regcomp(&re, pattern, copts); + if (err) { + php_ereg_eprint(err, &re); + return ((char *) -1); + } + + + /* allocate storage for (sub-)expression-matches */ + subs = (regmatch_t *)ecalloc(sizeof(regmatch_t),re.re_nsub+1); + + /* start with a buffer that is twice the size of the stringo + we're doing replacements in */ + buf_len = 2 * string_len + 1; + buf = safe_emalloc(buf_len, sizeof(char), 0); + + err = pos = 0; + buf[0] = '\0'; + while (!err) { + err = regexec(&re, &string[pos], re.re_nsub+1, subs, (pos ? REG_NOTBOL : 0)); + + if (err && err != REG_NOMATCH) { + php_ereg_eprint(err, &re); + efree(subs); + efree(buf); + regfree(&re); + return ((char *) -1); + } + + if (!err) { + /* backref replacement is done in two passes: + 1) find out how long the string will be, and allocate buf + 2) copy the part before match, replacement and backrefs to buf + + Jaakko Hyvätti <Jaakko.Hyvatti@iki.fi> + */ + + new_l = strlen(buf) + subs[0].rm_so; /* part before the match */ + walk = replace; + while (*walk) { + if ('\\' == *walk && isdigit((unsigned char)walk[1]) && ((unsigned char)walk[1]) - '0' <= (int)re.re_nsub) { + if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1) { + new_l += subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so; + } + walk += 2; + } else { + new_l++; + walk++; + } + } + if (new_l + 1 > buf_len) { + buf_len = 1 + buf_len + 2 * new_l; + nbuf = emalloc(buf_len); + strcpy(nbuf, buf); + efree(buf); + buf = nbuf; + } + tmp = strlen(buf); + /* copy the part of the string before the match */ + strncat(buf, &string[pos], subs[0].rm_so); + + /* copy replacement and backrefs */ + walkbuf = &buf[tmp + subs[0].rm_so]; + walk = replace; + while (*walk) { + if ('\\' == *walk && isdigit(walk[1]) && walk[1] - '0' <= (int)re.re_nsub) { + if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1 + /* this next case shouldn't happen. it does. */ + && subs[walk[1] - '0'].rm_so <= subs[walk[1] - '0'].rm_eo) { + + tmp = subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so; + memcpy (walkbuf, &string[pos + subs[walk[1] - '0'].rm_so], tmp); + walkbuf += tmp; + } + walk += 2; + } else { + *walkbuf++ = *walk++; + } + } + *walkbuf = '\0'; + + /* and get ready to keep looking for replacements */ + if (subs[0].rm_so == subs[0].rm_eo) { + if (subs[0].rm_so + pos >= string_len) { + break; + } + new_l = strlen (buf) + 1; + if (new_l + 1 > buf_len) { + buf_len = 1 + buf_len + 2 * new_l; + nbuf = safe_emalloc(buf_len, sizeof(char), 0); + strcpy(nbuf, buf); + efree(buf); + buf = nbuf; + } + pos += subs[0].rm_eo + 1; + buf [new_l-1] = string [pos-1]; + buf [new_l] = '\0'; + } else { + pos += subs[0].rm_eo; + } + } else { /* REG_NOMATCH */ + new_l = strlen(buf) + strlen(&string[pos]); + if (new_l + 1 > buf_len) { + buf_len = new_l + 1; /* now we know exactly how long it is */ + nbuf = safe_emalloc(buf_len, sizeof(char), 0); + strcpy(nbuf, buf); + efree(buf); + buf = nbuf; + } + /* stick that last bit of string on our output */ + strlcat(buf, &string[pos], buf_len); + } + } + + /* don't want to leak memory .. */ + efree(subs); + regfree(&re); + + /* whew. */ + return (buf); +} +/* }}} */ + +/* {{{ php_do_ereg_replace + */ +static void php_do_ereg_replace(INTERNAL_FUNCTION_PARAMETERS, int icase) +{ + zval **arg_pattern, + **arg_replace, + **arg_string; + char *pattern; + char *string; + char *replace; + char *ret; + + if (ZEND_NUM_ARGS() != 3 || + zend_get_parameters_ex(3, &arg_pattern, &arg_replace, &arg_string) == FAILURE) { + WRONG_PARAM_COUNT; + } + + if (Z_TYPE_PP(arg_pattern) == IS_STRING) { + if (Z_STRVAL_PP(arg_pattern) && Z_STRLEN_PP(arg_pattern)) + pattern = estrndup(Z_STRVAL_PP(arg_pattern), Z_STRLEN_PP(arg_pattern)); + else + pattern = STR_EMPTY_ALLOC(); + } else { + convert_to_long_ex(arg_pattern); + pattern = emalloc(2); + pattern[0] = (char) Z_LVAL_PP(arg_pattern); + pattern[1] = '\0'; + } + + if (Z_TYPE_PP(arg_replace) == IS_STRING) { + if (Z_STRVAL_PP(arg_replace) && Z_STRLEN_PP(arg_replace)) + replace = estrndup(Z_STRVAL_PP(arg_replace), Z_STRLEN_PP(arg_replace)); + else + replace = STR_EMPTY_ALLOC(); + } else { + convert_to_long_ex(arg_replace); + replace = emalloc(2); + replace[0] = (char) Z_LVAL_PP(arg_replace); + replace[1] = '\0'; + } + + convert_to_string_ex(arg_string); + if (Z_STRVAL_PP(arg_string) && Z_STRLEN_PP(arg_string)) + string = estrndup(Z_STRVAL_PP(arg_string), Z_STRLEN_PP(arg_string)); + else + string = STR_EMPTY_ALLOC(); + + /* do the actual work */ + ret = php_ereg_replace(pattern, replace, string, icase, 1); + if (ret == (char *) -1) { + RETVAL_FALSE; + } else { + RETVAL_STRING(ret, 1); + STR_FREE(ret); + } + + STR_FREE(string); + STR_FREE(replace); + STR_FREE(pattern); +} +/* }}} */ + +/* {{{ proto string ereg_replace(string pattern, string replacement, string string) + Replace regular expression */ +PHP_FUNCTION(ereg_replace) +{ + php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +} +/* }}} */ + +/* {{{ proto string eregi_replace(string pattern, string replacement, string string) + Case insensitive replace regular expression */ +PHP_FUNCTION(eregi_replace) +{ + php_do_ereg_replace(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +} +/* }}} */ + +/* {{{ php_split + */ +static void php_split(INTERNAL_FUNCTION_PARAMETERS, int icase) +{ + zval **spliton, **str, **arg_count = NULL; + regex_t re; + regmatch_t subs[1]; + char *strp, *endp; + int err, size, count = -1, copts = 0; + int argc = ZEND_NUM_ARGS(); + + if (argc < 2 || argc > 3 || + zend_get_parameters_ex(argc, &spliton, &str, &arg_count) == FAILURE) { + WRONG_PARAM_COUNT; + } + + if (argc > 2) { + convert_to_long_ex(arg_count); + count = Z_LVAL_PP(arg_count); + } + + if (icase) + copts = REG_ICASE; + + convert_to_string_ex(spliton); + convert_to_string_ex(str); + + strp = Z_STRVAL_PP(str); + endp = strp + Z_STRLEN_PP(str); + + err = regcomp(&re, Z_STRVAL_PP(spliton), REG_EXTENDED | copts); + if (err) { + php_ereg_eprint(err, &re); + RETURN_FALSE; + } + + array_init(return_value); + + /* churn through str, generating array entries as we go */ + while ((count == -1 || count > 1) && !(err = regexec(&re, strp, 1, subs, 0))) { + if (subs[0].rm_so == 0 && subs[0].rm_eo) { + /* match is at start of string, return empty string */ + add_next_index_stringl(return_value, "", 0, 1); + /* skip ahead the length of the regex match */ + strp += subs[0].rm_eo; + } else if (subs[0].rm_so == 0 && subs[0].rm_eo == 0) { + /* No more matches */ + regfree(&re); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid Regular Expression to split()"); + zend_hash_destroy(Z_ARRVAL_P(return_value)); + efree(Z_ARRVAL_P(return_value)); + RETURN_FALSE; + } else { + /* On a real match */ + + /* make a copy of the substring */ + size = subs[0].rm_so; + + /* add it to the array */ + add_next_index_stringl(return_value, strp, size, 1); + + /* point at our new starting point */ + strp = strp + subs[0].rm_eo; + } + + /* if we're only looking for a certain number of points, + stop looking once we hit it */ + if (count != -1) { + count--; + } + } + + /* see if we encountered an error */ + if (err && err != REG_NOMATCH) { + php_ereg_eprint(err, &re); + regfree(&re); + zend_hash_destroy(Z_ARRVAL_P(return_value)); + efree(Z_ARRVAL_P(return_value)); + RETURN_FALSE; + } + + /* otherwise we just have one last element to add to the array */ + size = endp - strp; + + add_next_index_stringl(return_value, strp, size, 1); + + regfree(&re); +} +/* }}} */ + +/* {{{ proto array split(string pattern, string string [, int limit]) + Split string into array by regular expression */ +PHP_FUNCTION(split) +{ + php_split(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); +} +/* }}} */ + +/* {{{ proto array spliti(string pattern, string string [, int limit]) + Split string into array by regular expression case-insensitive */ + +PHP_FUNCTION(spliti) +{ + php_split(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); +} + +/* }}} */ + +/* {{{ proto string sql_regcase(string string) + Make regular expression for case insensitive match */ +PHPAPI PHP_FUNCTION(sql_regcase) +{ + zval **string; + char *tmp; + unsigned char c; + register int i, j; + + if (ZEND_NUM_ARGS()!=1 || zend_get_parameters_ex(1, &string)==FAILURE) { + WRONG_PARAM_COUNT; + } + convert_to_string_ex(string); + + tmp = safe_emalloc(Z_STRLEN_PP(string), 4, 1); + + for (i = j = 0; i < Z_STRLEN_PP(string); i++) { + c = (unsigned char) Z_STRVAL_PP(string)[i]; + if(isalpha(c)) { + tmp[j++] = '['; + tmp[j++] = toupper(c); + tmp[j++] = tolower(c); + tmp[j++] = ']'; + } else { + tmp[j++] = c; + } + } + tmp[j] = 0; + + RETVAL_STRINGL(tmp, j, 1); + efree(tmp); +} +/* }}} */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: noet sw=4 ts=4 fdm=marker + * vim<600: noet sw=4 ts=4 + */ |