summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNikita Popov <nikita.ppv@gmail.com>2017-03-23 16:14:39 +0100
committerNikita Popov <nikita.ppv@gmail.com>2020-03-26 11:09:18 +0100
commit5a09b9fb0f9aa432843673887cde40bfc8737020 (patch)
tree7e0aca57d25200b7cb8d4304ba636c720547bdb3
parentf74e30c07c2a94921fbfb7b8936324707505bd75 (diff)
downloadphp-git-5a09b9fb0f9aa432843673887cde40bfc8737020.tar.gz
Add PhpToken class
RFC: https://wiki.php.net/rfc/token_as_object Relative to the RFC, this also adds a __toString() method, as discussed on list. Closes GH-5176.
-rw-r--r--UPGRADING6
-rw-r--r--ext/tokenizer/tests/PhpToken_constructor.phpt46
-rw-r--r--ext/tokenizer/tests/PhpToken_extension.phpt36
-rw-r--r--ext/tokenizer/tests/PhpToken_extension_errors.phpt30
-rw-r--r--ext/tokenizer/tests/PhpToken_final_constructor.phpt15
-rw-r--r--ext/tokenizer/tests/PhpToken_getAll.phpt358
-rw-r--r--ext/tokenizer/tests/PhpToken_methods.phpt119
-rw-r--r--ext/tokenizer/tests/PhpToken_toString.phpt18
-rw-r--r--ext/tokenizer/tokenizer.c378
-rw-r--r--ext/tokenizer/tokenizer.stub.php16
-rw-r--r--ext/tokenizer/tokenizer_arginfo.h25
-rw-r--r--ext/tokenizer/tokenizer_data.c2
-rwxr-xr-xext/tokenizer/tokenizer_data_gen.sh2
13 files changed, 1001 insertions, 50 deletions
diff --git a/UPGRADING b/UPGRADING
index 4bd913396a..ab0999282d 100644
--- a/UPGRADING
+++ b/UPGRADING
@@ -533,6 +533,12 @@ PHP 8.0 UPGRADE NOTES
7. New Classes and Interfaces
========================================
+- Tokenizer:
+ . The new PhpToken class adds an object-based interface to the tokenizer.
+ It provides a more uniform and ergonomic representation, while being more
+ memory efficient and faster.
+ RFC: https://wiki.php.net/rfc/token_as_object
+
========================================
8. Removed Extensions and SAPIs
========================================
diff --git a/ext/tokenizer/tests/PhpToken_constructor.phpt b/ext/tokenizer/tests/PhpToken_constructor.phpt
new file mode 100644
index 0000000000..fb167ac684
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_constructor.phpt
@@ -0,0 +1,46 @@
+--TEST--
+PhpToken constructor
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$token = new PhpToken(300, 'function');
+var_dump($token);
+$token = new PhpToken(300, 'function', 10);
+var_dump($token);
+$token = new PhpToken(300, 'function', 10, 100);
+var_dump($token);
+
+?>
+--EXPECT--
+object(PhpToken)#1 (4) {
+ ["id"]=>
+ int(300)
+ ["text"]=>
+ string(8) "function"
+ ["line"]=>
+ int(-1)
+ ["pos"]=>
+ int(-1)
+}
+object(PhpToken)#2 (4) {
+ ["id"]=>
+ int(300)
+ ["text"]=>
+ string(8) "function"
+ ["line"]=>
+ int(10)
+ ["pos"]=>
+ int(-1)
+}
+object(PhpToken)#1 (4) {
+ ["id"]=>
+ int(300)
+ ["text"]=>
+ string(8) "function"
+ ["line"]=>
+ int(10)
+ ["pos"]=>
+ int(100)
+}
diff --git a/ext/tokenizer/tests/PhpToken_extension.phpt b/ext/tokenizer/tests/PhpToken_extension.phpt
new file mode 100644
index 0000000000..ef1a4f1272
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_extension.phpt
@@ -0,0 +1,36 @@
+--TEST--
+Extending the PhpToken class
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?PHP
+FUNCTION FOO() {
+ ECHO "bar";
+}
+PHP;
+
+class MyPhpToken extends PhpToken {
+ public int $extra = 123;
+
+ public function getLoweredText(): string {
+ return strtolower($this->text);
+ }
+}
+
+foreach (MyPhpToken::getAll($code) as $token) {
+ echo $token->getLoweredText();
+
+ if ($token->extra !== 123) {
+ echo "Missing property!\n";
+ }
+}
+
+?>
+--EXPECT--
+<?php
+function foo() {
+ echo "bar";
+}
diff --git a/ext/tokenizer/tests/PhpToken_extension_errors.phpt b/ext/tokenizer/tests/PhpToken_extension_errors.phpt
new file mode 100644
index 0000000000..89604a9051
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_extension_errors.phpt
@@ -0,0 +1,30 @@
+--TEST--
+PhpToken extensions that throw during construction
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+class MyPhpToken1 extends PhpToken {
+ public $extra = UNKNOWN;
+}
+
+try {
+ var_dump(MyPhpToken1::getAll("<?php foo"));
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+
+abstract class MyPhpToken2 extends PhpToken {
+}
+
+try {
+ var_dump(MyPhpToken2::getAll("<?php foo"));
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+
+?>
+--EXPECT--
+Undefined constant 'UNKNOWN'
+Cannot instantiate abstract class MyPhpToken2
diff --git a/ext/tokenizer/tests/PhpToken_final_constructor.phpt b/ext/tokenizer/tests/PhpToken_final_constructor.phpt
new file mode 100644
index 0000000000..7f4061dbe8
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_final_constructor.phpt
@@ -0,0 +1,15 @@
+--TEST--
+Check that the PhpToken constructor is final
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+class MyPhpToken extends PhpToken {
+ public function __construct() {
+ }
+}
+
+?>
+--EXPECTF--
+Fatal error: Cannot override final method PhpToken::__construct() in %s on line %d
diff --git a/ext/tokenizer/tests/PhpToken_getAll.phpt b/ext/tokenizer/tests/PhpToken_getAll.phpt
new file mode 100644
index 0000000000..604a979023
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_getAll.phpt
@@ -0,0 +1,358 @@
+--TEST--
+PhpToken::getAll() method
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?php
+function foo() {
+ echo "bar";
+}
+PHP;
+var_dump(PhpToken::getAll($code));
+var_dump(PhpToken::getAll($code, TOKEN_PARSE));
+
+?>
+--EXPECT--
+array(15) {
+ [0]=>
+ object(PhpToken)#1 (4) {
+ ["id"]=>
+ int(382)
+ ["text"]=>
+ string(6) "<?php
+"
+ ["line"]=>
+ int(1)
+ ["pos"]=>
+ int(0)
+ }
+ [1]=>
+ object(PhpToken)#2 (4) {
+ ["id"]=>
+ int(342)
+ ["text"]=>
+ string(8) "function"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(6)
+ }
+ [2]=>
+ object(PhpToken)#3 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(14)
+ }
+ [3]=>
+ object(PhpToken)#4 (4) {
+ ["id"]=>
+ int(310)
+ ["text"]=>
+ string(3) "foo"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(15)
+ }
+ [4]=>
+ object(PhpToken)#5 (4) {
+ ["id"]=>
+ int(40)
+ ["text"]=>
+ string(1) "("
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(18)
+ }
+ [5]=>
+ object(PhpToken)#6 (4) {
+ ["id"]=>
+ int(41)
+ ["text"]=>
+ string(1) ")"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(19)
+ }
+ [6]=>
+ object(PhpToken)#7 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(20)
+ }
+ [7]=>
+ object(PhpToken)#8 (4) {
+ ["id"]=>
+ int(123)
+ ["text"]=>
+ string(1) "{"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(21)
+ }
+ [8]=>
+ object(PhpToken)#9 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(5) "
+ "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(22)
+ }
+ [9]=>
+ object(PhpToken)#10 (4) {
+ ["id"]=>
+ int(324)
+ ["text"]=>
+ string(4) "echo"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(27)
+ }
+ [10]=>
+ object(PhpToken)#11 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(31)
+ }
+ [11]=>
+ object(PhpToken)#12 (4) {
+ ["id"]=>
+ int(314)
+ ["text"]=>
+ string(5) ""bar""
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(32)
+ }
+ [12]=>
+ object(PhpToken)#13 (4) {
+ ["id"]=>
+ int(59)
+ ["text"]=>
+ string(1) ";"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(37)
+ }
+ [13]=>
+ object(PhpToken)#14 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) "
+"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(38)
+ }
+ [14]=>
+ object(PhpToken)#15 (4) {
+ ["id"]=>
+ int(125)
+ ["text"]=>
+ string(1) "}"
+ ["line"]=>
+ int(4)
+ ["pos"]=>
+ int(39)
+ }
+}
+array(15) {
+ [0]=>
+ object(PhpToken)#15 (4) {
+ ["id"]=>
+ int(382)
+ ["text"]=>
+ string(6) "<?php
+"
+ ["line"]=>
+ int(1)
+ ["pos"]=>
+ int(0)
+ }
+ [1]=>
+ object(PhpToken)#14 (4) {
+ ["id"]=>
+ int(342)
+ ["text"]=>
+ string(8) "function"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(6)
+ }
+ [2]=>
+ object(PhpToken)#13 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(14)
+ }
+ [3]=>
+ object(PhpToken)#12 (4) {
+ ["id"]=>
+ int(310)
+ ["text"]=>
+ string(3) "foo"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(15)
+ }
+ [4]=>
+ object(PhpToken)#11 (4) {
+ ["id"]=>
+ int(40)
+ ["text"]=>
+ string(1) "("
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(18)
+ }
+ [5]=>
+ object(PhpToken)#10 (4) {
+ ["id"]=>
+ int(41)
+ ["text"]=>
+ string(1) ")"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(19)
+ }
+ [6]=>
+ object(PhpToken)#9 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(20)
+ }
+ [7]=>
+ object(PhpToken)#8 (4) {
+ ["id"]=>
+ int(123)
+ ["text"]=>
+ string(1) "{"
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(21)
+ }
+ [8]=>
+ object(PhpToken)#7 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(5) "
+ "
+ ["line"]=>
+ int(2)
+ ["pos"]=>
+ int(22)
+ }
+ [9]=>
+ object(PhpToken)#6 (4) {
+ ["id"]=>
+ int(324)
+ ["text"]=>
+ string(4) "echo"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(27)
+ }
+ [10]=>
+ object(PhpToken)#5 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) " "
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(31)
+ }
+ [11]=>
+ object(PhpToken)#4 (4) {
+ ["id"]=>
+ int(314)
+ ["text"]=>
+ string(5) ""bar""
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(32)
+ }
+ [12]=>
+ object(PhpToken)#3 (4) {
+ ["id"]=>
+ int(59)
+ ["text"]=>
+ string(1) ";"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(37)
+ }
+ [13]=>
+ object(PhpToken)#2 (4) {
+ ["id"]=>
+ int(385)
+ ["text"]=>
+ string(1) "
+"
+ ["line"]=>
+ int(3)
+ ["pos"]=>
+ int(38)
+ }
+ [14]=>
+ object(PhpToken)#1 (4) {
+ ["id"]=>
+ int(125)
+ ["text"]=>
+ string(1) "}"
+ ["line"]=>
+ int(4)
+ ["pos"]=>
+ int(39)
+ }
+}
diff --git a/ext/tokenizer/tests/PhpToken_methods.phpt b/ext/tokenizer/tests/PhpToken_methods.phpt
new file mode 100644
index 0000000000..9429cea7ed
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_methods.phpt
@@ -0,0 +1,119 @@
+--TEST--
+PhpToken instance methods
+--SKIPIF--
+<?php if (!extension_loaded("tokenizer")) print "skip tokenizer extension not enabled"; ?>
+--FILE--
+<?php
+
+$code = <<<'PHP'
+<?php
+// comment
+/** comment */
+function foo() {
+ echo "bar";
+}
+PHP;
+
+// Token names and ignorability.
+$tokens = PhpToken::getAll($code);
+foreach ($tokens as $i => $token) {
+ printf("[%2d] %-26s %s\n", $i, $token->getTokenName(),
+ $token->isIgnorable() ? "ignorable" : "meaningful");
+}
+
+// is() variations
+
+echo "\nSuccess:\n";
+var_dump($tokens[4]->is(T_FUNCTION));
+var_dump($tokens[4]->is('function'));
+var_dump($tokens[4]->is(['class', T_FUNCTION]));
+var_dump($tokens[4]->is([T_CLASS, 'function']));
+
+echo "\nFailure:\n";
+var_dump($tokens[4]->is(T_CLASS));
+var_dump($tokens[4]->is('class'));
+var_dump($tokens[4]->is(['class', T_TRAIT]));
+var_dump($tokens[4]->is([T_CLASS, 'trait']));
+
+echo "\nError:\n";
+try {
+ $tokens[4]->is(3.141);
+} catch (TypeError $e) {
+ echo $e->getMessage(), "\n";
+}
+try {
+ $tokens[4]->is([3.141]);
+} catch (TypeError $e) {
+ echo $e->getMessage(), "\n";
+}
+
+unset($tokens[4]->id);
+unset($tokens[4]->text);
+try {
+ $tokens[4]->is(T_FUNCTION);
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+try {
+ $tokens[4]->is('function');
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+try {
+ $tokens[4]->is([T_FUNCTION]);
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+try {
+ $tokens[4]->is(['function']);
+} catch (Error $e) {
+ echo $e->getMessage(), "\n";
+}
+
+echo "\nName of unknown token:\n";
+$token = new PhpToken(100000, "foo");
+var_dump($token->getTokenName());
+
+?>
+--EXPECT--
+[ 0] T_OPEN_TAG ignorable
+[ 1] T_COMMENT ignorable
+[ 2] T_DOC_COMMENT ignorable
+[ 3] T_WHITESPACE ignorable
+[ 4] T_FUNCTION meaningful
+[ 5] T_WHITESPACE ignorable
+[ 6] T_STRING meaningful
+[ 7] ( meaningful
+[ 8] ) meaningful
+[ 9] T_WHITESPACE ignorable
+[10] { meaningful
+[11] T_WHITESPACE ignorable
+[12] T_ECHO meaningful
+[13] T_WHITESPACE ignorable
+[14] T_CONSTANT_ENCAPSED_STRING meaningful
+[15] ; meaningful
+[16] T_WHITESPACE ignorable
+[17] } meaningful
+
+Success:
+bool(true)
+bool(true)
+bool(true)
+bool(true)
+
+Failure:
+bool(false)
+bool(false)
+bool(false)
+bool(false)
+
+Error:
+Kind must be of type int, string or array
+Kind array must have elements of type int or string
+Typed property PhpToken::$id must not be accessed before initialization
+Typed property PhpToken::$text must not be accessed before initialization
+Typed property PhpToken::$id must not be accessed before initialization
+Typed property PhpToken::$text must not be accessed before initialization
+
+Name of unknown token:
+NULL
diff --git a/ext/tokenizer/tests/PhpToken_toString.phpt b/ext/tokenizer/tests/PhpToken_toString.phpt
new file mode 100644
index 0000000000..17dbfa84a7
--- /dev/null
+++ b/ext/tokenizer/tests/PhpToken_toString.phpt
@@ -0,0 +1,18 @@
+--TEST--
+PhpToken implements __toString()
+--FILE--
+<?php
+
+$tokens = PhpToken::getAll('<?php echo "Hello ". $what;');
+var_dump(implode($tokens));
+
+var_dump($tokens[0] instanceof Stringable);
+var_dump((string) $tokens[0]);
+var_dump($tokens[0]->__toString());
+
+?>
+--EXPECT--
+string(27) "<?php echo "Hello ". $what;"
+bool(true)
+string(6) "<?php "
+string(6) "<?php "
diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c
index 1ac5275ff0..222c3e96a3 100644
--- a/ext/tokenizer/tokenizer.c
+++ b/ext/tokenizer/tokenizer.c
@@ -29,13 +29,16 @@
#include "zend_language_scanner.h"
#include "zend_language_scanner_defs.h"
#include <zend_language_parser.h>
+#include "zend_interfaces.h"
#define zendtext LANG_SCNG(yy_text)
#define zendleng LANG_SCNG(yy_leng)
#define zendcursor LANG_SCNG(yy_cursor)
#define zendlimit LANG_SCNG(yy_limit)
-#define TOKEN_PARSE 1
+#define TOKEN_PARSE (1 << 0)
+
+zend_class_entry *php_token_ce;
void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) {
REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT);
@@ -72,12 +75,237 @@ zend_module_entry tokenizer_module_entry = {
ZEND_GET_MODULE(tokenizer)
#endif
+/* Return the zval holding the token's $id, read from property slot 0.
+ * If the slot is undefined (for example after unset($token->id)), an Error is
+ * thrown and NULL is returned, so callers must check for NULL. */
+static zval *php_token_get_id(zval *obj) {
+ zval *id = OBJ_PROP_NUM(Z_OBJ_P(obj), 0);
+ if (Z_ISUNDEF_P(id)) {
+ zend_throw_error(NULL,
+ "Typed property PhpToken::$id must not be accessed before initialization");
+ return NULL;
+ }
+
+ /* Unwrap a reference, if the slot holds one, before inspecting the value. */
+ ZVAL_DEREF(id);
+ ZEND_ASSERT(Z_TYPE_P(id) == IS_LONG);
+ return id;
+}
+
+/* Return the token's $text string, read from property slot 1.
+ * If the slot is undefined (for example after unset($token->text)), an Error
+ * is thrown and NULL is returned, so callers must check for NULL.
+ * The returned string is borrowed: no reference is added here. */
+static zend_string *php_token_get_text(zval *obj) {
+ zval *text_zval = OBJ_PROP_NUM(Z_OBJ_P(obj), 1);
+ if (Z_ISUNDEF_P(text_zval)) {
+ zend_throw_error(NULL,
+ "Typed property PhpToken::$text must not be accessed before initialization");
+ return NULL;
+ }
+
+ /* Unwrap a reference, if the slot holds one, before inspecting the value. */
+ ZVAL_DEREF(text_zval);
+ ZEND_ASSERT(Z_TYPE_P(text_zval) == IS_STRING);
+ return Z_STR_P(text_zval);
+}
+
+static zend_bool tokenize_common(
+ zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class);
+
+/* PhpToken::getAll(string $code, int $flags = 0): array
+ * Tokenizes the given source and fills return_value with token objects.
+ * The objects are created from the called scope, so invoking this through a
+ * subclass yields instances of that subclass. */
+PHP_METHOD(PhpToken, getAll)
+{
+ zend_string *source;
+ zend_long flags = 0;
+ zend_class_entry *token_class;
+
+ ZEND_PARSE_PARAMETERS_START(1, 2)
+ Z_PARAM_STR(source)
+ Z_PARAM_OPTIONAL
+ Z_PARAM_LONG(flags)
+ ZEND_PARSE_PARAMETERS_END();
+
+ /* Class the static method was called on (PhpToken or a subclass). */
+ token_class = zend_get_called_scope(execute_data);
+
+ /* Check construction preconditions in advance, so these are not repeated for each token. */
+ if (token_class->ce_flags & ZEND_ACC_EXPLICIT_ABSTRACT_CLASS) {
+ zend_throw_error(NULL, "Cannot instantiate abstract class %s", ZSTR_VAL(token_class->name));
+ RETURN_THROWS();
+ }
+ /* Resolve constant expressions used by the class up front; a failure here
+ * (e.g. an undefined constant in a property default) aborts before tokenizing. */
+ if (zend_update_class_constants(token_class) == FAILURE) {
+ RETURN_THROWS();
+ }
+
+ if (!tokenize_common(return_value, source, flags, token_class)) {
+ RETURN_THROWS();
+ }
+}
+
+/* PhpToken::__construct(int $id, string $text, int $line = -1, int $pos = -1)
+ * Stores the four arguments directly into property slots 0..3
+ * (id, text, line, pos). line and pos stay -1 when not passed. */
+PHP_METHOD(PhpToken, __construct)
+{
+ zend_long id;
+ zend_string *text;
+ zend_long line = -1;
+ zend_long pos = -1;
+ zend_object *obj = Z_OBJ_P(ZEND_THIS);
+
+ ZEND_PARSE_PARAMETERS_START(2, 4)
+ Z_PARAM_LONG(id)
+ Z_PARAM_STR(text)
+ Z_PARAM_OPTIONAL
+ Z_PARAM_LONG(line)
+ Z_PARAM_LONG(pos)
+ ZEND_PARSE_PARAMETERS_END();
+
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 0), id);
+ /* Release whatever the text slot currently holds before overwriting it. */
+ zval_ptr_dtor(OBJ_PROP_NUM(obj, 1));
+ /* ZVAL_STR_COPY takes its own reference to the argument string. */
+ ZVAL_STR_COPY(OBJ_PROP_NUM(obj, 1), text);
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 2), line);
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 3), pos);
+}
+
+/* PhpToken::is(int|string|array $kind): bool
+ * - int: true if it equals the token's id.
+ * - string: true if it equals the token's text.
+ * - array: true if any int element equals the id or any string element
+ * equals the text; elements of any other type raise a TypeError.
+ * Any other argument type raises a TypeError. Reading an uninitialized id or
+ * text property raises an Error (see php_token_get_id/php_token_get_text). */
+PHP_METHOD(PhpToken, is)
+{
+ zval *kind;
+
+ ZEND_PARSE_PARAMETERS_START(1, 1)
+ Z_PARAM_ZVAL(kind)
+ ZEND_PARSE_PARAMETERS_END();
+
+ if (Z_TYPE_P(kind) == IS_LONG) {
+ zval *id_zval = php_token_get_id(ZEND_THIS);
+ if (!id_zval) {
+ RETURN_THROWS();
+ }
+
+ RETURN_BOOL(Z_LVAL_P(id_zval) == Z_LVAL_P(kind));
+ } else if (Z_TYPE_P(kind) == IS_STRING) {
+ zend_string *text = php_token_get_text(ZEND_THIS);
+ if (!text) {
+ RETURN_THROWS();
+ }
+
+ RETURN_BOOL(zend_string_equals(text, Z_STR_P(kind)));
+ } else if (Z_TYPE_P(kind) == IS_ARRAY) {
+ /* id and text are fetched lazily, at most once each, the first time an
+ * element of the corresponding type is encountered. */
+ zval *id_zval = NULL, *entry;
+ zend_string *text = NULL;
+ ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(kind), entry) {
+ ZVAL_DEREF(entry);
+ if (Z_TYPE_P(entry) == IS_LONG) {
+ if (!id_zval) {
+ id_zval = php_token_get_id(ZEND_THIS);
+ if (!id_zval) {
+ RETURN_THROWS();
+ }
+ }
+ if (Z_LVAL_P(id_zval) == Z_LVAL_P(entry)) {
+ RETURN_TRUE;
+ }
+ } else if (Z_TYPE_P(entry) == IS_STRING) {
+ if (!text) {
+ text = php_token_get_text(ZEND_THIS);
+ if (!text) {
+ RETURN_THROWS();
+ }
+ }
+ if (zend_string_equals(text, Z_STR_P(entry))) {
+ RETURN_TRUE;
+ }
+ } else {
+ zend_type_error("Kind array must have elements of type int or string");
+ RETURN_THROWS();
+ }
+ } ZEND_HASH_FOREACH_END();
+ /* No element matched (this is also the result for an empty array). */
+ RETURN_FALSE;
+ } else {
+ zend_type_error("Kind must be of type int, string or array");
+ RETURN_THROWS();
+ }
+}
+
+/* PhpToken::isIgnorable(): bool
+ * True when the token id is T_WHITESPACE, T_COMMENT, T_DOC_COMMENT or
+ * T_OPEN_TAG; false for every other id. Raises an Error if the id property
+ * is uninitialized. */
+PHP_METHOD(PhpToken, isIgnorable)
+{
+ ZEND_PARSE_PARAMETERS_NONE();
+
+ zval *id_zval = php_token_get_id(ZEND_THIS);
+ if (!id_zval) {
+ RETURN_THROWS();
+ }
+
+ zend_long id = Z_LVAL_P(id_zval);
+ RETURN_BOOL(id == T_WHITESPACE || id == T_COMMENT || id == T_DOC_COMMENT || id == T_OPEN_TAG);
+}
+
+/* PhpToken::getTokenName(): ?string
+ * For ids in 0..255 the name is the one-character string for that byte value.
+ * For any other id the name comes from get_token_type_name(); NULL is returned
+ * when that lookup does not know the id. Raises an Error if the id property is
+ * uninitialized. */
+PHP_METHOD(PhpToken, getTokenName)
+{
+ ZEND_PARSE_PARAMETERS_NONE();
+
+ zval *id_zval = php_token_get_id(ZEND_THIS);
+ if (!id_zval) {
+ RETURN_THROWS();
+ }
+
+ /* The id is user-controllable via the constructor, so it may be negative.
+ * Only 0..255 may be used as a ZSTR_CHAR() index; anything else goes
+ * through the name lookup, which yields NULL for unknown ids. */
+ if (Z_LVAL_P(id_zval) >= 0 && Z_LVAL_P(id_zval) < 256) {
+ RETURN_INTERNED_STR(ZSTR_CHAR(Z_LVAL_P(id_zval)));
+ } else {
+ const char *token_name = get_token_type_name(Z_LVAL_P(id_zval));
+ if (!token_name) {
+ RETURN_NULL();
+ }
+
+ RETURN_STRING(token_name);
+ }
+}
+
+/* PhpToken::__toString(): string
+ * Returns the token's text. Raises an Error if the text property is
+ * uninitialized. */
+PHP_METHOD(PhpToken, __toString)
+{
+ ZEND_PARSE_PARAMETERS_NONE();
+
+ zend_string *text = php_token_get_text(ZEND_THIS);
+ if (!text) {
+ RETURN_THROWS();
+ }
+
+ /* The getter returns a borrowed string; the _COPY variant adds a reference for the return value. */
+ RETURN_STR_COPY(text);
+}
+
+/* Method table for the PhpToken class. getAll() is static; the constructor is
+ * registered as final, so subclasses cannot override it. */
+static const zend_function_entry php_token_methods[] = {
+ PHP_ME(PhpToken, getAll, arginfo_class_PhpToken_getAll, ZEND_ACC_PUBLIC|ZEND_ACC_STATIC)
+ PHP_ME(PhpToken, __construct, arginfo_class_PhpToken___construct, ZEND_ACC_PUBLIC|ZEND_ACC_FINAL)
+ PHP_ME(PhpToken, is, arginfo_class_PhpToken_is, ZEND_ACC_PUBLIC)
+ PHP_ME(PhpToken, isIgnorable, arginfo_class_PhpToken_isIgnorable, ZEND_ACC_PUBLIC)
+ PHP_ME(PhpToken, getTokenName, arginfo_class_PhpToken_getTokenName, ZEND_ACC_PUBLIC)
+ PHP_ME(PhpToken, __toString, arginfo_class_PhpToken___toString, ZEND_ACC_PUBLIC)
+ PHP_FE_END
+};
+
/* {{{ PHP_MINIT_FUNCTION
*/
PHP_MINIT_FUNCTION(tokenizer)
{
+ zend_class_entry ce;
+ zend_string *name;
+ zval default_val;
+ /* All four PhpToken properties are declared with an UNDEF default, i.e.
+ * they start out uninitialized until the constructor or add_token() fills them. */
+ ZVAL_UNDEF(&default_val);
+
tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU);
tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU);
+
+ /* Register PhpToken and mark it as implementing Stringable. */
+ INIT_CLASS_ENTRY(ce, "PhpToken", php_token_methods);
+ php_token_ce = zend_register_internal_class(&ce);
+ zend_class_implements(php_token_ce, 1, zend_ce_stringable);
+
+ /* NOTE: the rest of this file addresses these properties by slot number
+ * through OBJ_PROP_NUM(obj, 0..3) in the order id, text, line, pos.
+ * Keep the declaration order below in sync with those slot numbers. */
+ name = zend_string_init("id", sizeof("id") - 1, 1);
+ zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+ (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+ zend_string_release(name);
+
+ name = zend_string_init("text", sizeof("text") - 1, 1);
+ zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+ (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_STRING));
+ zend_string_release(name);
+
+ name = zend_string_init("line", sizeof("line") - 1, 1);
+ zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+ (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+ zend_string_release(name);
+
+ name = zend_string_init("pos", sizeof("pos") - 1, 1);
+ zend_declare_typed_property(php_token_ce, name, &default_val, ZEND_ACC_PUBLIC, NULL,
+ (zend_type) ZEND_TYPE_INIT_MASK(MAY_BE_LONG));
+ zend_string_release(name);
+
return SUCCESS;
}
/* }}} */
@@ -92,29 +320,56 @@ PHP_MINFO_FUNCTION(tokenizer)
}
/* }}} */
-static void add_token(zval *return_value, int token_type,
- unsigned char *text, size_t leng, int lineno) {
- if (token_type >= 256) {
- zval keyword;
- array_init(&keyword);
- add_next_index_long(&keyword, token_type);
- if (leng == 1) {
- add_next_index_str(&keyword, ZSTR_CHAR(text[0]));
- } else {
- add_next_index_stringl(&keyword, (char *) text, leng);
+/* Build the zend_string for a token's text.
+ * - One-byte text reuses the shared ZSTR_CHAR() string.
+ * - Otherwise, when an interned_strings table is supplied, identical texts are
+ * shared through it: a hit returns a new reference to the stored string, a
+ * miss allocates the string and records it in the table.
+ * - With interned_strings == NULL a fresh string is allocated every time. */
+static zend_string *make_str(unsigned char *text, size_t leng, HashTable *interned_strings) {
+ if (leng == 1) {
+ return ZSTR_CHAR(text[0]);
+ } else if (interned_strings) {
+ zend_string *interned_str = zend_hash_str_find_ptr(interned_strings, (char *) text, leng);
+ if (interned_str) {
+ return zend_string_copy(interned_str);
}
- add_next_index_long(&keyword, lineno);
- add_next_index_zval(return_value, &keyword);
+ interned_str = zend_string_init((char *) text, leng, 0);
+ /* The string serves as both key and value of the table entry. */
+ zend_hash_add_new_ptr(interned_strings, interned_str, interned_str);
+ return interned_str;
} else {
- if (leng == 1) {
- add_next_index_str(return_value, ZSTR_CHAR(text[0]));
- } else {
- add_next_index_stringl(return_value, (char *) text, leng);
+ return zend_string_init((char *) text, leng, 0);
+ }
+}
+
+/* Append one token to the return_value array.
+ * - token_class != NULL: the token is an object of that class with id, text,
+ * line and pos written straight into property slots 0..3. The constructor
+ * is not invoked.
+ * - token_class == NULL and token_type >= 256: the token is the array
+ * [token_type, text, lineno].
+ * - token_class == NULL and token_type < 256: the token is the bare text string.
+ * interned_strings is passed through to make_str() and may be NULL. */
+static void add_token(
+ zval *return_value, int token_type, unsigned char *text, size_t leng, int lineno,
+ zend_class_entry *token_class, HashTable *interned_strings) {
+ zval token;
+ if (token_class) {
+ zend_object *obj = zend_objects_new(token_class);
+ ZVAL_OBJ(&token, obj);
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 0), token_type);
+ ZVAL_STR(OBJ_PROP_NUM(obj, 1), make_str(text, leng, interned_strings));
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 2), lineno);
+ /* pos is the offset of the token text from the scanner's yy_start. */
+ ZVAL_LONG(OBJ_PROP_NUM(obj, 3), text - LANG_SCNG(yy_start));
+
+ /* If the class is extended with additional properties, initialize them as well. */
+ if (UNEXPECTED(token_class->default_properties_count > 4)) {
+ zval *dst = OBJ_PROP_NUM(obj, 4);
+ zval *src = &token_class->default_properties_table[4];
+ zval *end = token_class->default_properties_table
+ + token_class->default_properties_count;
+ for (; src < end; src++, dst++) {
+ ZVAL_COPY_PROP(dst, src);
+ }
}
+ } else if (token_type >= 256) {
+ array_init(&token);
+ add_next_index_long(&token, token_type);
+ add_next_index_str(&token, make_str(text, leng, interned_strings));
+ add_next_index_long(&token, lineno);
+ } else {
+ ZVAL_STR(&token, make_str(text, leng, interned_strings));
}
+ zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &token);
}
-static zend_bool tokenize(zval *return_value, zend_string *source)
+static zend_bool tokenize(zval *return_value, zend_string *source, zend_class_entry *token_class)
{
zval source_zval;
zend_lex_state original_lex_state;
@@ -122,6 +377,7 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
int token_type;
int token_line = 1;
int need_tokens = -1; /* for __halt_compiler lexing. -1 = disabled */
+ HashTable interned_strings;
ZVAL_STR_COPY(&source_zval, source);
zend_save_lexical_state(&original_lex_state);
@@ -132,10 +388,13 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
}
LANG_SCNG(yy_state) = yycINITIAL;
+ zend_hash_init(&interned_strings, 0, NULL, NULL, 0);
array_init(return_value);
while ((token_type = lex_scan(&token, NULL))) {
- add_token(return_value, token_type, zendtext, zendleng, token_line);
+ add_token(
+ return_value, token_type, zendtext, zendleng, token_line,
+ token_class, &interned_strings);
if (Z_TYPE(token) != IS_UNDEF) {
zval_ptr_dtor_nogc(&token);
@@ -150,8 +409,9 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
) {
/* fetch the rest into a T_INLINE_HTML */
if (zendcursor != zendlimit) {
- add_token(return_value, T_INLINE_HTML,
- zendcursor, zendlimit - zendcursor, token_line);
+ add_token(
+ return_value, T_INLINE_HTML, zendcursor, zendlimit - zendcursor,
+ token_line, token_class, &interned_strings);
}
break;
}
@@ -169,46 +429,56 @@ static zend_bool tokenize(zval *return_value, zend_string *source)
zval_ptr_dtor_str(&source_zval);
zend_restore_lexical_state(&original_lex_state);
+ zend_hash_destroy(&interned_strings);
return 1;
}
+/* Context handed to on_event() through LANG_SCNG(on_event_context). */
+struct event_context {
+ /* Array zval that collects the emitted tokens. */
+ zval *tokens;
+ /* Class used for token objects; passed on to add_token(). */
+ zend_class_entry *token_class;
+};
+
+/* Scanner event callback used while tokenizing in TOKEN_PARSE mode.
+ * context points to a struct event_context.
+ * - ON_TOKEN: append the token (END is skipped; two special cases are
+ * remapped to T_CLOSE_TAG / T_OPEN_TAG_WITH_ECHO first).
+ * - ON_FEEDBACK: overwrite the id of the most recently added token with `token`.
+ * - ON_STOP: append any unscanned remainder as a single T_INLINE_HTML token. */
void on_event(zend_php_scanner_event event, int token, int line, void *context)
{
- zval *token_stream = (zval *) context;
+ struct event_context *ctx = context;
HashTable *tokens_ht;
zval *token_zv;
switch (event) {
case ON_TOKEN:
- {
- if (token == END) break;
- /* Special cases */
- if (token == ';' && LANG_SCNG(yy_leng) > 1) { /* ?> or ?>\n or ?>\r\n */
- token = T_CLOSE_TAG;
- } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("<?=") - 1) {
- token = T_OPEN_TAG_WITH_ECHO;
- }
- add_token(token_stream, token, LANG_SCNG(yy_text), LANG_SCNG(yy_leng), line);
+ if (token == END) break;
+ /* Special cases */
+ if (token == ';' && LANG_SCNG(yy_leng) > 1) { /* ?> or ?>\n or ?>\r\n */
+ token = T_CLOSE_TAG;
+ } else if (token == T_ECHO && LANG_SCNG(yy_leng) == sizeof("<?=") - 1) {
+ token = T_OPEN_TAG_WITH_ECHO;
}
+ add_token(ctx->tokens, token,
+ LANG_SCNG(yy_text), LANG_SCNG(yy_leng), line, ctx->token_class, NULL);
break;
case ON_FEEDBACK:
- tokens_ht = Z_ARRVAL_P(token_stream);
+ tokens_ht = Z_ARRVAL_P(ctx->tokens);
token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1);
- if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) {
+ ZEND_ASSERT(token_zv);
+ if (Z_TYPE_P(token_zv) == IS_ARRAY) {
ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token);
+ } else {
+ /* The token id lives in the "id" property; PhpToken declares no
+ * "type" property, so writing "type" would leave $id unchanged. */
+ zend_update_property_long(php_token_ce, token_zv, "id", sizeof("id")-1, token);
}
break;
case ON_STOP:
if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) {
- add_token(token_stream, T_INLINE_HTML, LANG_SCNG(yy_cursor),
- LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno));
+ add_token(ctx->tokens, T_INLINE_HTML, LANG_SCNG(yy_cursor),
+ LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor), CG(zend_lineno),
+ ctx->token_class, NULL);
}
break;
}
}
-static zend_bool tokenize_parse(zval *return_value, zend_string *source)
+static zend_bool tokenize_parse(
+ zval *return_value, zend_string *source, zend_class_entry *token_class)
{
zval source_zval;
zend_lex_state original_lex_state;
@@ -222,14 +492,18 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source)
zend_save_lexical_state(&original_lex_state);
if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) {
+ struct event_context ctx;
zval token_stream;
array_init(&token_stream);
+ ctx.tokens = &token_stream;
+ ctx.token_class = token_class;
+
CG(ast) = NULL;
CG(ast_arena) = zend_arena_create(1024 * 32);
LANG_SCNG(yy_state) = yycINITIAL;
LANG_SCNG(on_event) = on_event;
- LANG_SCNG(on_event_context) = &token_stream;
+ LANG_SCNG(on_event_context) = &ctx;
if((success = (zendparse() == SUCCESS))) {
ZVAL_COPY_VALUE(return_value, &token_stream);
@@ -250,6 +524,19 @@ static zend_bool tokenize_parse(zval *return_value, zend_string *source)
return success;
}
+/* Shared entry point for token_get_all() and PhpToken::getAll().
+ * With TOKEN_PARSE set in flags the work is done by tokenize_parse(); otherwise
+ * by tokenize(), after which any pending exception is cleared.
+ * token_class is forwarded unchanged (callers pass NULL for token_get_all()).
+ * Returns the success flag of the chosen tokenizer. */
+static zend_bool tokenize_common(
+ zval *return_value, zend_string *source, zend_long flags, zend_class_entry *token_class)
+{
+ if (flags & TOKEN_PARSE) {
+ return tokenize_parse(return_value, source, token_class);
+ } else {
+ int success = tokenize(return_value, source, token_class);
+ /* Normal token_get_all() should not throw. */
+ zend_clear_exception();
+ return success;
+ }
+}
+
/* }}} */
/* {{{ proto array token_get_all(string source [, int flags])
@@ -258,7 +545,6 @@ PHP_FUNCTION(token_get_all)
{
zend_string *source;
zend_long flags = 0;
- zend_bool success;
ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STR(source)
@@ -266,15 +552,7 @@ PHP_FUNCTION(token_get_all)
Z_PARAM_LONG(flags)
ZEND_PARSE_PARAMETERS_END();
- if (flags & TOKEN_PARSE) {
- success = tokenize_parse(return_value, source);
- } else {
- success = tokenize(return_value, source);
- /* Normal token_get_all() should not throw. */
- zend_clear_exception();
- }
-
- if (!success) {
+ if (!tokenize_common(return_value, source, flags, /* token_class */ NULL)) {
RETURN_THROWS();
}
}
@@ -290,6 +568,10 @@ PHP_FUNCTION(token_name)
Z_PARAM_LONG(type)
ZEND_PARSE_PARAMETERS_END();
- RETVAL_STRING(get_token_type_name(type));
+ const char *token_name = get_token_type_name(type);
+ if (!token_name) {
+ token_name = "UNKNOWN";
+ }
+ RETURN_STRING(token_name);
}
/* }}} */
diff --git a/ext/tokenizer/tokenizer.stub.php b/ext/tokenizer/tokenizer.stub.php
index 63a6c2e72c..801c1c8504 100644
--- a/ext/tokenizer/tokenizer.stub.php
+++ b/ext/tokenizer/tokenizer.stub.php
@@ -3,3 +3,19 @@
function token_get_all(string $source, int $flags = 0): array {}
function token_name(int $token): string {}
+
+/** Object representation of a single token, with properties id, text, line and pos. */
+class PhpToken implements Stringable {
+ /**
+ * Tokenize $code into instances of the class this method is called on.
+ *
+ * @return static[]
+ */
+ public static function getAll(string $code, int $flags = 0): array;
+
+ /** $line and $pos are stored as -1 when not given. */
+ public final function __construct(int $id, string $text, int $line = -1, int $pos = -1);
+
+ /**
+ * An int is compared against the token id, a string against the token
+ * text; an array matches if any of its int/string elements matches.
+ *
+ * @param int|string|array $kind
+ */
+ public function is($kind): bool;
+
+ /** True for T_WHITESPACE, T_COMMENT, T_DOC_COMMENT and T_OPEN_TAG tokens. */
+ public function isIgnorable(): bool;
+
+ /** Null when the id has no known token name. */
+ public function getTokenName(): ?string;
+
+ /** Returns the token text. */
+ public function __toString(): string;
+}
diff --git a/ext/tokenizer/tokenizer_arginfo.h b/ext/tokenizer/tokenizer_arginfo.h
index d777535a48..d927c8d0e6 100644
--- a/ext/tokenizer/tokenizer_arginfo.h
+++ b/ext/tokenizer/tokenizer_arginfo.h
@@ -8,3 +8,28 @@ ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_token_name, 0, 1, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, token, IS_LONG, 0)
ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getAll, 0, 1, IS_ARRAY, 0)
+ ZEND_ARG_TYPE_INFO(0, code, IS_STRING, 0)
+ ZEND_ARG_TYPE_INFO(0, flags, IS_LONG, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_INFO_EX(arginfo_class_PhpToken___construct, 0, 0, 2)
+ ZEND_ARG_TYPE_INFO(0, id, IS_LONG, 0)
+ ZEND_ARG_TYPE_INFO(0, text, IS_STRING, 0)
+ ZEND_ARG_TYPE_INFO(0, line, IS_LONG, 0)
+ ZEND_ARG_TYPE_INFO(0, pos, IS_LONG, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_is, 0, 1, _IS_BOOL, 0)
+ ZEND_ARG_INFO(0, kind)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_isIgnorable, 0, 0, _IS_BOOL, 0)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken_getTokenName, 0, 0, IS_STRING, 1)
+ZEND_END_ARG_INFO()
+
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_class_PhpToken___toString, 0, 0, IS_STRING, 0)
+ZEND_END_ARG_INFO()
diff --git a/ext/tokenizer/tokenizer_data.c b/ext/tokenizer/tokenizer_data.c
index 7e580dd844..3ddf89521a 100644
--- a/ext/tokenizer/tokenizer_data.c
+++ b/ext/tokenizer/tokenizer_data.c
@@ -306,6 +306,6 @@ char *get_token_type_name(int token_type)
case T_BAD_CHARACTER: return "T_BAD_CHARACTER";
}
- return "UNKNOWN";
+ return NULL;
}
diff --git a/ext/tokenizer/tokenizer_data_gen.sh b/ext/tokenizer/tokenizer_data_gen.sh
index 4d5e97ddde..1dbe77d2e7 100755
--- a/ext/tokenizer/tokenizer_data_gen.sh
+++ b/ext/tokenizer/tokenizer_data_gen.sh
@@ -71,7 +71,7 @@ awk '
echo '
}
- return "UNKNOWN";
+ return NULL;
}
' >> $outfile