diff options
author | Stig Bakken <ssb@php.net> | 2002-02-08 18:49:40 +0000 |
---|---|---|
committer | Stig Bakken <ssb@php.net> | 2002-02-08 18:49:40 +0000 |
commit | cf5ba0feec63f4fc60c0703100163cf567e611c8 (patch) | |
tree | 36b6edae54dc54a361eccbb022639981c9621afc /ext/tokenizer | |
parent | e8285d6bba352af5b0a1a2bb33dbb6cbf284fb8e (diff) | |
download | php-git-cf5ba0feec63f4fc60c0703100163cf567e611c8.tar.gz |
@Added Andrei's tokenizer extension (Stig)
Diffstat (limited to 'ext/tokenizer')
-rw-r--r-- | ext/tokenizer/CREDITS | 2 | ||||
-rw-r--r-- | ext/tokenizer/EXPERIMENTAL | 0 | ||||
-rw-r--r-- | ext/tokenizer/Makefile.in | 6 | ||||
-rw-r--r-- | ext/tokenizer/config.m4 | 48 | ||||
-rw-r--r-- | ext/tokenizer/php_tokenizer.h | 81 | ||||
-rw-r--r-- | ext/tokenizer/tokenizer.c | 519 | ||||
-rw-r--r-- | ext/tokenizer/tokenizer.php | 35 |
7 files changed, 691 insertions, 0 deletions
diff --git a/ext/tokenizer/CREDITS b/ext/tokenizer/CREDITS new file mode 100644 index 0000000000..919aeb8149 --- /dev/null +++ b/ext/tokenizer/CREDITS @@ -0,0 +1,2 @@ +Andrei Zmievski +tokenizer diff --git a/ext/tokenizer/EXPERIMENTAL b/ext/tokenizer/EXPERIMENTAL new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/ext/tokenizer/EXPERIMENTAL diff --git a/ext/tokenizer/Makefile.in b/ext/tokenizer/Makefile.in new file mode 100644 index 0000000000..f6893c0879 --- /dev/null +++ b/ext/tokenizer/Makefile.in @@ -0,0 +1,6 @@ +LTLIBRARY_NAME = libtokenizer.la +LTLIBRARY_SOURCES = tokenizer.c +LTLIBRARY_SHARED_NAME = tokenizer.la +LTLIBRARY_SHARED_LIBADD = $(TOKENIZER_SHARED_LIBADD) + +include $(top_srcdir)/build/dynlib.mk diff --git a/ext/tokenizer/config.m4 b/ext/tokenizer/config.m4 new file mode 100644 index 0000000000..3779291688 --- /dev/null +++ b/ext/tokenizer/config.m4 @@ -0,0 +1,48 @@ +dnl $Id$ +dnl config.m4 for extension tokenizer + +dnl Otherwise use enable: + +PHP_ARG_ENABLE(tokenizer, whether to enable tokenizer support, +[ --enable-tokenizer Enable tokenizer support]) + +if test "$PHP_TOKENIZER" != "no"; then + dnl Write more examples of tests here... + + dnl # --with-tokenizer -> check with-path + dnl SEARCH_PATH="/usr/local /usr" # you might want to change this + dnl SEARCH_FOR="/include/tokenizer.h" # you most likely want to change this + dnl if test -r $PHP_TOKENIZER/; then # path given as parameter + dnl TOKENIZER_DIR=$PHP_TOKENIZER + dnl else # search default path list + dnl AC_MSG_CHECKING([for tokenizer files in default path]) + dnl for i in $SEARCH_PATH ; do + dnl if test -r $i/$SEARCH_FOR; then + dnl TOKENIZER_DIR=$i + dnl AC_MSG_RESULT(found in $i) + dnl fi + dnl done + dnl fi + dnl + dnl if test -z "$TOKENIZER_DIR"; then + dnl AC_MSG_RESULT([not found]) + dnl AC_MSG_ERROR([Please reinstall the tokenizer distribution]) + dnl fi + + dnl # --with-tokenizer -> add include path + dnl PHP_ADD_INCLUDE($TOKENIZER_DIR/include) + + dnl # --with-tokenizer -> chech for lib and symbol presence + dnl LIBNAME=tokenizer # you may want to change this + dnl LIBSYMBOL=tokenizer # you most likely want to change this + dnl old_LIBS=$LIBS + dnl LIBS="$LIBS -L$TOKENIZER_DIR/lib -lm -ldl" + dnl AC_CHECK_LIB($LIBNAME, $LIBSYMBOL, [AC_DEFINE(HAVE_TOKENIZERLIB,1,[ ])], + dnl [AC_MSG_ERROR([wrong tokenizer lib version or lib not found])]) + dnl LIBS=$old_LIBS + dnl + dnl PHP_SUBST(TOKENIZER_SHARED_LIBADD) + dnl PHP_ADD_LIBRARY_WITH_PATH($LIBNAME, $TOKENIZER_DIR/lib, TOKENIZER_SHARED_LIBADD) + + PHP_EXTENSION(tokenizer, $ext_shared) +fi diff --git a/ext/tokenizer/php_tokenizer.h b/ext/tokenizer/php_tokenizer.h new file mode 100644 index 0000000000..589b8c4179 --- /dev/null +++ b/ext/tokenizer/php_tokenizer.h @@ -0,0 +1,81 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 4 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997, 1998, 1999, 2000, 2001 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 2.02 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available at through the world-wide-web at | + | http://www.php.net/license/2_02.txt. | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: | + | | + +----------------------------------------------------------------------+ + */ + +#ifndef PHP_TOKENIZER_H +#define PHP_TOKENIZER_H + +extern zend_module_entry tokenizer_module_entry; +#define phpext_tokenizer_ptr &tokenizer_module_entry + +#ifdef PHP_WIN32 +#define PHP_TOKENIZER_API __declspec(dllexport) +#else +#define PHP_TOKENIZER_API +#endif + +#ifdef ZTS +#include "TSRM.h" +#endif + +PHP_MINIT_FUNCTION(tokenizer); +PHP_MSHUTDOWN_FUNCTION(tokenizer); +PHP_RINIT_FUNCTION(tokenizer); +PHP_RSHUTDOWN_FUNCTION(tokenizer); +PHP_MINFO_FUNCTION(tokenizer); + +PHP_FUNCTION(confirm_tokenizer_compiled); /* For testing, remove later. */ +PHP_FUNCTION(token_get_all); +PHP_FUNCTION(token_name); + +/* + Declare any global variables you may need between the BEGIN + and END macros here: + +ZEND_BEGIN_MODULE_GLOBALS(tokenizer) + int global_value; + char *global_string; +ZEND_END_MODULE_GLOBALS(tokenizer) +*/ + +/* In every utility function you add that needs to use variables + in php_tokenizer_globals, call TSRM_FETCH(); after declaring other + variables used by that function, or better yet, pass in TSRMG_CC + after the last function argument and declare your utility function + with TSRMG_DC after the last declared argument. Always refer to + the globals in your function as TOKENIZER_G(variable). You are + encouraged to rename these macros something shorter, see + examples in any other php module directory. +*/ + +#ifdef ZTS +#define TOKENIZER_G(v) TSRMG(tokenizer_globals_id, zend_tokenizer_globals *, v) +#else +#define TOKENIZER_G(v) (tokenizer_globals.v) +#endif + +#endif /* PHP_TOKENIZER_H */ + + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c new file mode 100644 index 0000000000..0e0f5ffdff --- /dev/null +++ b/ext/tokenizer/tokenizer.c @@ -0,0 +1,519 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 4 | + +----------------------------------------------------------------------+ + | Copyright (c) 1997, 1998, 1999, 2000, 2001 The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 2.02 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available at through the world-wide-web at | + | http://www.php.net/license/2_02.txt. | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: | + | | + +----------------------------------------------------------------------+ + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "php.h" +#include "php_ini.h" +#include "ext/standard/info.h" +#include "php_tokenizer.h" + +typedef struct yy_buffer_state *YY_BUFFER_STATE; +typedef unsigned int yy_size_t; +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + }; + +#include "zend.h" +#include "zend_language_scanner.h" +#include "zend_language_parser.h" + +#define zendtext LANG_SCNG(yy_text) +#define zendleng LANG_SCNG(yy_leng) +/* If you declare any globals in php_tokenizer.h uncomment this: +ZEND_DECLARE_MODULE_GLOBALS(tokenizer) +*/ + +/* True global resources - no need for thread safety here */ +static int le_tokenizer; + +/* {{{ tokenizer_functions[] + * + * Every user visible function must have an entry in tokenizer_functions[]. + */ +function_entry tokenizer_functions[] = { + PHP_FE(token_get_all, NULL) + PHP_FE(token_name, NULL) + {NULL, NULL, NULL} /* Must be the last line in tokenizer_functions[] */ +}; +/* }}} */ + +/* {{{ tokenizer_module_entry + */ +zend_module_entry tokenizer_module_entry = { +#if ZEND_MODULE_API_NO >= 20010901 + STANDARD_MODULE_HEADER, +#endif + "tokenizer", + tokenizer_functions, + PHP_MINIT(tokenizer), + PHP_MSHUTDOWN(tokenizer), + PHP_RINIT(tokenizer), /* Replace with NULL if there's nothing to do at request start */ + PHP_RSHUTDOWN(tokenizer), /* Replace with NULL if there's nothing to do at request end */ + PHP_MINFO(tokenizer), +#if ZEND_MODULE_API_NO >= 20010901 + "0.1", /* Replace with version number for your extension */ +#endif + STANDARD_MODULE_PROPERTIES +}; +/* }}} */ + +#ifdef COMPILE_DL_TOKENIZER +ZEND_GET_MODULE(tokenizer) +#endif + +/* {{{ PHP_INI + */ +/* Remove comments and fill if you need to have entries in php.ini +PHP_INI_BEGIN() + STD_PHP_INI_ENTRY("tokenizer.global_value", "42", PHP_INI_ALL, OnUpdateInt, global_value, zend_tokenizer_globals, tokenizer_globals) + STD_PHP_INI_ENTRY("tokenizer.global_string", "foobar", PHP_INI_ALL, OnUpdateString, global_string, zend_tokenizer_globals, tokenizer_globals) +PHP_INI_END() +*/ +/* }}} */ + +/* {{{ php_tokenizer_init_globals + */ +/* Uncomment this function if you have INI entries +static void php_tokenizer_init_globals(zend_tokenizer_globals *tokenizer_globals) +{ + tokenizer_globals->global_value = 0; + tokenizer_globals->global_string = NULL; +} +*/ +/* }}} */ + +/* {{{ PHP_MINIT_FUNCTION + */ +PHP_MINIT_FUNCTION(tokenizer) +{ + /* If you have INI entries, uncomment these lines + ZEND_INIT_MODULE_GLOBALS(tokenizer, php_tokenizer_init_globals, NULL); + REGISTER_INI_ENTRIES(); + */ + + REGISTER_LONG_CONSTANT("T_INCLUDE", T_INCLUDE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_INCLUDE_ONCE", T_INCLUDE_ONCE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_EVAL", T_EVAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_REQUIRE", T_REQUIRE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_REQUIRE_ONCE", T_REQUIRE_ONCE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LOGICAL_OR", T_LOGICAL_OR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LOGICAL_XOR", T_LOGICAL_XOR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LOGICAL_AND", T_LOGICAL_AND, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_PRINT", T_PRINT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_PLUS_EQUAL", T_PLUS_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_MINUS_EQUAL", T_MINUS_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_MUL_EQUAL", T_MUL_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DIV_EQUAL", T_DIV_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CONCAT_EQUAL", T_CONCAT_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_MOD_EQUAL", T_MOD_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_AND_EQUAL", T_AND_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OR_EQUAL", T_OR_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_XOR_EQUAL", T_XOR_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_SL_EQUAL", T_SL_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_SR_EQUAL", T_SR_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BOOLEAN_OR", T_BOOLEAN_OR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BOOLEAN_AND", T_BOOLEAN_AND, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_EQUAL", T_IS_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_NOT_EQUAL", T_IS_NOT_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_IDENTICAL", T_IS_IDENTICAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_NOT_IDENTICAL", T_IS_NOT_IDENTICAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_SMALLER_OR_EQUAL", T_IS_SMALLER_OR_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IS_GREATER_OR_EQUAL", T_IS_GREATER_OR_EQUAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_SL", T_SL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_SR", T_SR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_INC", T_INC, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DEC", T_DEC, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_INT_CAST", T_INT_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DOUBLE_CAST", T_DOUBLE_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_STRING_CAST", T_STRING_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ARRAY_CAST", T_ARRAY_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OBJECT_CAST", T_OBJECT_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BOOL_CAST", T_BOOL_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_UNSET_CAST", T_UNSET_CAST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_NEW", T_NEW, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_EXIT", T_EXIT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_IF", T_IF, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ELSEIF", T_ELSEIF, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ELSE", T_ELSE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDIF", T_ENDIF, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LNUMBER", T_LNUMBER, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DNUMBER", T_DNUMBER, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_STRING", T_STRING, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_STRING_VARNAME", T_STRING_VARNAME, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_VARIABLE", T_VARIABLE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_NUM_STRING", T_NUM_STRING, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_INLINE_HTML", T_INLINE_HTML, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CHARACTER", T_CHARACTER, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BAD_CHARACTER", T_BAD_CHARACTER, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENCAPSED_AND_WHITESPACE", T_ENCAPSED_AND_WHITESPACE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CONSTANT_ENCAPSED_STRING", T_CONSTANT_ENCAPSED_STRING, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ECHO", T_ECHO, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DO", T_DO, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_WHILE", T_WHILE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDWHILE", T_ENDWHILE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_FOR", T_FOR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDFOR", T_ENDFOR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_FOREACH", T_FOREACH, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDFOREACH", T_ENDFOREACH, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DECLARE", T_DECLARE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDDECLARE", T_ENDDECLARE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_AS", T_AS, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_SWITCH", T_SWITCH, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ENDSWITCH", T_ENDSWITCH, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CASE", T_CASE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DEFAULT", T_DEFAULT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BREAK", T_BREAK, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CONTINUE", T_CONTINUE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OLD_FUNCTION", T_OLD_FUNCTION, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_FUNCTION", T_FUNCTION, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CONST", T_CONST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_RETURN", T_RETURN, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_USE", T_USE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_GLOBAL", T_GLOBAL, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_STATIC", T_STATIC, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_VAR", T_VAR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_UNSET", T_UNSET, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ISSET", T_ISSET, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_EMPTY", T_EMPTY, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CLASS", T_CLASS, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_EXTENDS", T_EXTENDS, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OBJECT_OPERATOR", T_OBJECT_OPERATOR, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DOUBLE_ARROW", T_DOUBLE_ARROW, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LIST", T_LIST, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ARRAY", T_ARRAY, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_LINE", T_LINE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_FILE", T_FILE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_COMMENT", T_COMMENT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_ML_COMMENT", T_ML_COMMENT, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OPEN_TAG", T_OPEN_TAG, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_OPEN_TAG_WITH_ECHO", T_OPEN_TAG_WITH_ECHO, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CLOSE_TAG", T_CLOSE_TAG, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_WHITESPACE", T_WHITESPACE, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_START_HEREDOC", T_START_HEREDOC, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_END_HEREDOC", T_END_HEREDOC, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DOLLAR_OPEN_CURLY_BRACES", T_DOLLAR_OPEN_CURLY_BRACES, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_CURLY_OPEN", T_CURLY_OPEN, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_PAAMAYIM_NEKUDOTAYIM", T_PAAMAYIM_NEKUDOTAYIM, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_DOUBLE_COLON", T_PAAMAYIM_NEKUDOTAYIM, CONST_CS | CONST_PERSISTENT); + + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MSHUTDOWN_FUNCTION + */ +PHP_MSHUTDOWN_FUNCTION(tokenizer) +{ + /* uncomment this line if you have INI entries + UNREGISTER_INI_ENTRIES(); + */ + return SUCCESS; +} +/* }}} */ + +/* Remove if there's nothing to do at request start */ +/* {{{ PHP_RINIT_FUNCTION + */ +PHP_RINIT_FUNCTION(tokenizer) +{ + return SUCCESS; +} +/* }}} */ + +/* Remove if there's nothing to do at request end */ +/* {{{ PHP_RSHUTDOWN_FUNCTION + */ +PHP_RSHUTDOWN_FUNCTION(tokenizer) +{ + return SUCCESS; +} +/* }}} */ + +/* {{{ PHP_MINFO_FUNCTION + */ +PHP_MINFO_FUNCTION(tokenizer) +{ + php_info_print_table_start(); + php_info_print_table_header(2, "tokenizer support", "enabled"); + php_info_print_table_end(); + + /* Remove comments if you have entries in php.ini + DISPLAY_INI_ENTRIES(); + */ +} +/* }}} */ + +static void tokenize(zval *return_value) +{ + zval token; + zval *keyword; + int token_type; + zend_bool destroy; + + array_init(return_value); + + ZVAL_NULL(&token); + while ((token_type = lex_scan(&token CLS_CC))) { + destroy = 1; + switch (token_type) { + case T_OPEN_TAG: + case T_OPEN_TAG_WITH_ECHO: + case T_WHITESPACE: + case T_COMMENT: + case T_CLOSE_TAG: + destroy = 0; + break; + } + + if (token_type >= 256) { + MAKE_STD_ZVAL(keyword); + array_init(keyword); + add_next_index_long(keyword, token_type); + add_next_index_stringl(keyword, zendtext, zendleng, 1); + add_next_index_zval(return_value, keyword); + } else { + add_next_index_stringl(return_value, zendtext, zendleng, 1); + } + if (destroy && Z_TYPE(token) != IS_NULL) { + zval_dtor(&token); + } + ZVAL_NULL(&token); + } +} + +static char * +get_token_type_name(int token_type) +{ + switch (token_type) { + case T_INCLUDE: return "T_INCLUDE"; + case T_INCLUDE_ONCE: return "T_INCLUDE_ONCE"; + case T_EVAL: return "T_EVAL"; + case T_REQUIRE: return "T_REQUIRE"; + case T_REQUIRE_ONCE: return "T_REQUIRE_ONCE"; + case T_LOGICAL_OR: return "T_LOGICAL_OR"; + case T_LOGICAL_XOR: return "T_LOGICAL_XOR"; + case T_LOGICAL_AND: return "T_LOGICAL_AND"; + case T_PRINT: return "T_PRINT"; + case T_PLUS_EQUAL: return "T_PLUS_EQUAL"; + case T_MINUS_EQUAL: return "T_MINUS_EQUAL"; + case T_MUL_EQUAL: return "T_MUL_EQUAL"; + case T_DIV_EQUAL: return "T_DIV_EQUAL"; + case T_CONCAT_EQUAL: return "T_CONCAT_EQUAL"; + case T_MOD_EQUAL: return "T_MOD_EQUAL"; + case T_AND_EQUAL: return "T_AND_EQUAL"; + case T_OR_EQUAL: return "T_OR_EQUAL"; + case T_XOR_EQUAL: return "T_XOR_EQUAL"; + case T_SL_EQUAL: return "T_SL_EQUAL"; + case T_SR_EQUAL: return "T_SR_EQUAL"; + case T_BOOLEAN_OR: return "T_BOOLEAN_OR"; + case T_BOOLEAN_AND: return "T_BOOLEAN_AND"; + case T_IS_EQUAL: return "T_IS_EQUAL"; + case T_IS_NOT_EQUAL: return "T_IS_NOT_EQUAL"; + case T_IS_IDENTICAL: return "T_IS_IDENTICAL"; + case T_IS_NOT_IDENTICAL: return "T_IS_NOT_IDENTICAL"; + case T_IS_SMALLER_OR_EQUAL: return "T_IS_SMALLER_OR_EQUAL"; + case T_IS_GREATER_OR_EQUAL: return "T_IS_GREATER_OR_EQUAL"; + case T_SL: return "T_SL"; + case T_SR: return "T_SR"; + case T_INC: return "T_INC"; + case T_DEC: return "T_DEC"; + case T_INT_CAST: return "T_INT_CAST"; + case T_DOUBLE_CAST: return "T_DOUBLE_CAST"; + case T_STRING_CAST: return "T_STRING_CAST"; + case T_ARRAY_CAST: return "T_ARRAY_CAST"; + case T_OBJECT_CAST: return "T_OBJECT_CAST"; + case T_BOOL_CAST: return "T_BOOL_CAST"; + case T_UNSET_CAST: return "T_UNSET_CAST"; + case T_NEW: return "T_NEW"; + case T_EXIT: return "T_EXIT"; + case T_IF: return "T_IF"; + case T_ELSEIF: return "T_ELSEIF"; + case T_ELSE: return "T_ELSE"; + case T_ENDIF: return "T_ENDIF"; + case T_LNUMBER: return "T_LNUMBER"; + case T_DNUMBER: return "T_DNUMBER"; + case T_STRING: return "T_STRING"; + case T_STRING_VARNAME: return "T_STRING_VARNAME"; + case T_VARIABLE: return "T_VARIABLE"; + case T_NUM_STRING: return "T_NUM_STRING"; + case T_INLINE_HTML: return "T_INLINE_HTML"; + case T_CHARACTER: return "T_CHARACTER"; + case T_BAD_CHARACTER: return "T_BAD_CHARACTER"; + case T_ENCAPSED_AND_WHITESPACE: return "T_ENCAPSED_AND_WHITESPACE"; + case T_CONSTANT_ENCAPSED_STRING: return "T_CONSTANT_ENCAPSED_STRING"; + case T_ECHO: return "T_ECHO"; + case T_DO: return "T_DO"; + case T_WHILE: return "T_WHILE"; + case T_ENDWHILE: return "T_ENDWHILE"; + case T_FOR: return "T_FOR"; + case T_ENDFOR: return "T_ENDFOR"; + case T_FOREACH: return "T_FOREACH"; + case T_ENDFOREACH: return "T_ENDFOREACH"; + case T_DECLARE: return "T_DECLARE"; + case T_ENDDECLARE: return "T_ENDDECLARE"; + case T_AS: return "T_AS"; + case T_SWITCH: return "T_SWITCH"; + case T_ENDSWITCH: return "T_ENDSWITCH"; + case T_CASE: return "T_CASE"; + case T_DEFAULT: return "T_DEFAULT"; + case T_BREAK: return "T_BREAK"; + case T_CONTINUE: return "T_CONTINUE"; + case T_OLD_FUNCTION: return "T_OLD_FUNCTION"; + case T_FUNCTION: return "T_FUNCTION"; + case T_CONST: return "T_CONST"; + case T_RETURN: return "T_RETURN"; + case T_USE: return "T_USE"; + case T_GLOBAL: return "T_GLOBAL"; + case T_STATIC: return "T_STATIC"; + case T_VAR: return "T_VAR"; + case T_UNSET: return "T_UNSET"; + case T_ISSET: return "T_ISSET"; + case T_EMPTY: return "T_EMPTY"; + case T_CLASS: return "T_CLASS"; + case T_EXTENDS: return "T_EXTENDS"; + case T_OBJECT_OPERATOR: return "T_OBJECT_OPERATOR"; + case T_DOUBLE_ARROW: return "T_DOUBLE_ARROW"; + case T_LIST: return "T_LIST"; + case T_ARRAY: return "T_ARRAY"; + case T_LINE: return "T_LINE"; + case T_FILE: return "T_FILE"; + case T_COMMENT: return "T_COMMENT"; + case T_ML_COMMENT: return "T_ML_COMMENT"; + case T_OPEN_TAG: return "T_OPEN_TAG"; + case T_OPEN_TAG_WITH_ECHO: return "T_OPEN_TAG_WITH_ECHO"; + case T_CLOSE_TAG: return "T_CLOSE_TAG"; + case T_WHITESPACE: return "T_WHITESPACE"; + case T_START_HEREDOC: return "T_START_HEREDOC"; + case T_END_HEREDOC: return "T_END_HEREDOC"; + case T_DOLLAR_OPEN_CURLY_BRACES: return "T_DOLLAR_OPEN_CURLY_BRACES"; + case T_CURLY_OPEN: return "T_CURLY_OPEN"; + case T_PAAMAYIM_NEKUDOTAYIM: return "T_DOUBLE_COLON"; + } + return "UNKNOWN"; +} + + + +/* {{{ proto array token_get_all(string source) + */ +PHP_FUNCTION(token_get_all) +{ + char *source = NULL; + int argc = ZEND_NUM_ARGS(); + int source_len; + zval source_z; + zend_lex_state original_lex_state; + + if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == FAILURE) + return; + + ZVAL_STRINGL(&source_z, source, source_len, 0); + zend_save_lexical_state(&original_lex_state CLS_CC); + + if (zend_prepare_string_for_scanning(&source_z, "") == FAILURE) { + RETURN_EMPTY_STRING(); + } + + tokenize(return_value); + + zend_restore_lexical_state(&original_lex_state CLS_CC); +} +/* }}} */ + +/* {{{ proto string token_name(int type) + */ + +PHP_FUNCTION(token_name) +{ + int argc = ZEND_NUM_ARGS(); + long type; + + if (zend_parse_parameters(argc TSRMLS_CC, "l", &type) == FAILURE) { + return; + } + RETVAL_STRING(get_token_type_name(type), 1); +} + +/* }}} */ + + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: noet sw=4 ts=4 fdm=marker + * vim<600: noet sw=4 ts=4 + */ diff --git a/ext/tokenizer/tokenizer.php b/ext/tokenizer/tokenizer.php new file mode 100644 index 0000000000..c13063c628 --- /dev/null +++ b/ext/tokenizer/tokenizer.php @@ -0,0 +1,35 @@ +<?php + +if(!extension_loaded('tokenizer')) { + dl('tokenizer.so'); +} + +$fp = fopen('php://stdin', 'r'); +while (!feof($fp)) { + $content .= fread($fp, 4096); +} +fclose($fp); + +$tokens = token_get_all($content); + +$count = count($tokens); +$state = 0; +for ($i = 0; $i < $count; $i++) { + $token = $tokens[$i]; + if (is_array($token)) { + if ($state == 1 && $token[0] == T_STRING) { + $token[1] = preg_replace('!([a-z])([A-Z])!e', '"$1_".strtolower("$2")', $token[1]); + $state = 0; + } else if ($token[0] == T_FUNCTION) { + $state = 1; + } + $chunk = $token[1]; + } else { + $chunk = $token; + } + $output .= $chunk; +} + +print $output; + +?> |