diff options
Diffstat (limited to 'src/xmlpatterns/qtokenautomaton')
-rw-r--r-- | src/xmlpatterns/qtokenautomaton/README | 66 | ||||
-rw-r--r-- | src/xmlpatterns/qtokenautomaton/exampleFile.xml | 65 | ||||
-rw-r--r-- | src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl | 298 | ||||
-rw-r--r-- | src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd | 89 |
4 files changed, 518 insertions, 0 deletions
diff --git a/src/xmlpatterns/qtokenautomaton/README b/src/xmlpatterns/qtokenautomaton/README new file mode 100644 index 0000000..8c5e552 --- /dev/null +++ b/src/xmlpatterns/qtokenautomaton/README @@ -0,0 +1,66 @@ + +qtokenautomaton is a token generator that generates a simple, Unicode-aware +tokenizer for C++ that uses the Qt API. + +Introduction +===================== +QTokenAutomaton generates a C++ class that essentially has this interface: + + class YourTokenizer + { + protected: + enum Token + { + A, + B, + C, + NoKeyword + }; + + static inline Token toToken(const QString &string); + static inline Token toToken(const QStringRef &string); + static Token toToken(const QChar *data, int length); + static QString toString(Token token); + }; + +When calling toToken(), the tokenizer returns the enum value corresponding to +the string. This is done with O(N) time complexity, where N is the length of +the string. The returned value can then subsequently be efficiently switched +over. The alternatives, either a long chain of if statements comparing one +QString to several other QStrings, or inserting all strings first into a hash, +are less efficient. + +For instance, the latter case of using a hash would involve, when excluding the +initial population of the hash, O(N) + O(1), where O(1) is assumed to be a +non-conflicting hash lookup. + +toString(), which returns the string for the token that an enum value +represents, is implemented to store the strings in an efficient manner. + +A typical usage scenario is in combination with QXmlStreamReader. When parsing +a certain format, for instance XHTML, each element name (body, span, table and +so forth) typically needs special treatment. QTokenAutomaton conceptually cuts +the string comparisons down to one. + +Beyond efficiency, QTokenAutomaton also increases type safety, since C++ +identifiers are used instead of string literals. + +Usage +===================== +To use it, follow these steps: + +1. Create a token file. 
Use exampleFile.xml as a template. + +2. Make sure it is valid by validating against qtokenautomaton.xsd. On + Linux, this can be achieved by running `xmllint --noout + --schema qtokenautomaton.xsd yourFile.xml` + +3. Produce the C++ files by invoking the stylesheet with an XSL-T 2.0 + processor[1]. For instance, with the implementation Saxon, this would be: + `java net.sf.saxon.Transform -xsl:qautomaton2cpp.xsl yourFile.xml` + +4. Include the produced C++ files with your build system. + + +1. +In Qt there is as of 4.4 no support for XSL-T. diff --git a/src/xmlpatterns/qtokenautomaton/exampleFile.xml b/src/xmlpatterns/qtokenautomaton/exampleFile.xml new file mode 100644 index 0000000..1274443 --- /dev/null +++ b/src/xmlpatterns/qtokenautomaton/exampleFile.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<tokenAutomaton scope="public" + className="ExampleFile" + headerFile="exampleFile.h" + sourceFile="exampleFile.cpp" + defaultToken="NoKeyword" + tokenEnum="Token" + hasToString="true" + includeGuardName="exampleFile_h"> + <tokens> + <token>html</token> + <token>body</token> + <token>p</token> + <token>table</token> + <token name="WeWantADifferentNameForSpan">span</token> + </tokens> + + <boilerplate> + + <prolog>/**************************************************************************** +** +** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the QtXmlPatterns module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. 
+** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +</prolog> + + </boilerplate> + +</tokenAutomaton> diff --git a/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl b/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl new file mode 100644 index 0000000..16084a0 --- /dev/null +++ b/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl @@ -0,0 +1,298 @@ +<?xml version='1.0' encoding="UTF-8"?> +<xsl:stylesheet version="2.0" + xml:lang="en" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:local="http://www.w3.org/2005/xquery-local-functions" + xmlns:xs="http://www.w3.org/2001/XMLSchema"> + + <xsl:variable name="className" as="xs:string" select="tokenAutomaton/@className"/> + <xsl:variable name="defaultToken" as="xs:string" select="tokenAutomaton/@defaultToken"/> + <xsl:variable name="tokens" as="element(token)+" select="tokenAutomaton/tokens/token"/> + <xsl:variable name="tokenEnum" as="xs:string" select="tokenAutomaton/@tokenEnum"/> + + <xsl:variable name="warningGenerated" as="xs:string">/* NOTE: This file is AUTO GENERATED by qautomaton2cpp.xsl. */
</xsl:variable> + + <!-- Entry point. For the tokenAutomaton root element, writes two text documents: the C++ header named by @headerFile and the C++ source named by @sourceFile. --> + <xsl:template match="tokenAutomaton"> + + <!-- The distinct token lengths. One classifierN() function is emitted per length N. --> + <xsl:variable name="uniqueLengths" as="xs:integer+" select="distinct-values(tokens/token/string-length())"/> + + <!-- Header file: include guard, class declaration and the inline toToken() overloads. --> + <xsl:result-document method="text" href="{@headerFile}"> + + <xsl:variable name="includeGuardName" select="string(@includeGuardName)"/> + + <xsl:value-of select="boilerplate/prolog"/> + + <xsl:value-of select="$warningGenerated"/> + + <xsl:text>
#ifndef </xsl:text> + <xsl:value-of select="$includeGuardName"/> + <xsl:text>
#define </xsl:text> + <xsl:value-of select="$includeGuardName"/> + <xsl:text>

</xsl:text> + + <!-- The angle brackets must be escaped; a raw '<' would start an element inside xsl:text. --> + <xsl:text>#include &lt;QtCore/QString&gt;
</xsl:text> + <xsl:text>
</xsl:text> + <xsl:text>QT_BEGIN_NAMESPACE
</xsl:text> + <xsl:text>
</xsl:text> + + <xsl:if test="@namespace"> + <xsl:text>namespace </xsl:text> + <xsl:value-of select="@namespace"/> + { + </xsl:if> + + <xsl:text>class </xsl:text> + <xsl:value-of select="@className"/> + { + <xsl:value-of select="@scope"/>: + <xsl:text>enum </xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text>
</xsl:text> + { + <!-- The default token comes first, followed by the sorted enum names. The line feed in the separator is written as a character reference because a literal line break in an attribute value is normalised to a space. --> + <xsl:value-of separator=",&#xA;"> + <xsl:sequence select="@defaultToken"/> + <xsl:perform-sort select="tokens/token/local:tokenToEnumName(.)"> + <xsl:sort select="."/> + </xsl:perform-sort> + </xsl:value-of> + }; + + <xsl:text>static inline </xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text> toToken(const QString &amp;value);
</xsl:text> + <xsl:text>static inline </xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text> toToken(const QStringRef &amp;value);
</xsl:text> + <xsl:text>static </xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text> toToken(const QChar *data, int length);
</xsl:text> + <xsl:if test="xs:boolean(@hasToString)"> + <xsl:text>static QString toString(</xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text> token);
</xsl:text> + </xsl:if> + + private: + <xsl:for-each select="$uniqueLengths"> + <xsl:sort select="."/> + <xsl:text>static inline </xsl:text> + <xsl:value-of select="$tokenEnum"/> + <xsl:text> classifier</xsl:text> + <xsl:value-of select="."/> + <xsl:text>(const QChar *data);
</xsl:text> + </xsl:for-each> + }; + + <xsl:text>inline </xsl:text> + <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/> + <xsl:text> </xsl:text> + <xsl:value-of select="@className"/>::toToken(const QString &amp;value) + { + return toToken(value.constData(), value.length()); + } + + <xsl:text>inline </xsl:text> + <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/> + <xsl:text> </xsl:text> + <xsl:value-of select="@className"/>::toToken(const QStringRef &amp;value) + { + return toToken(value.constData(), value.length()); + } + + <xsl:if test="@namespace"> + <xsl:text>}
</xsl:text> + </xsl:if> + + <xsl:text>
</xsl:text> + <xsl:text>QT_END_NAMESPACE
</xsl:text> + <xsl:text>
</xsl:text> + + <xsl:text>#endif
</xsl:text> + </xsl:result-document> + + <!-- Source file: the classifierN() definitions, toToken(const QChar *, int) and, when @hasToString is true, toString(). --> + <xsl:result-document method="text" href="{@sourceFile}"> + <xsl:value-of select="boilerplate/prolog"/> + + <xsl:value-of select="$warningGenerated"/> + + <xsl:text>
#include "</xsl:text> + <xsl:value-of select="@headerFile"/> + <xsl:text>"
</xsl:text> + <xsl:text>
</xsl:text> + <xsl:text>QT_BEGIN_NAMESPACE
</xsl:text> + + <xsl:if test="@namespace"> + <xsl:text>
</xsl:text> + <xsl:text>using namespace </xsl:text> + <xsl:value-of select="@namespace"/> + <xsl:text>;
</xsl:text> + </xsl:if> + + <xsl:text>
</xsl:text> + <xsl:variable name="tokens" select="tokens/token"/> + + <!-- One classifier per distinct length; each receives the tokens of exactly that length. --> + <xsl:for-each select="$uniqueLengths"> + <xsl:sort select="."/> + <xsl:call-template name="generate-classifier"> + <xsl:with-param name="strings" select="$tokens[string-length() eq current()]"/> + </xsl:call-template> + </xsl:for-each> + + <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/> + <xsl:text> </xsl:text> + <xsl:value-of select="@className"/>::toToken(const QChar *data, int length) + { + switch(length) + { + <xsl:for-each select="$uniqueLengths"> + <xsl:sort data-type="number" select="."/> + case <xsl:value-of select="."/>: + return classifier<xsl:value-of select="."/>(data); + + </xsl:for-each> + default: + return <xsl:value-of select="@defaultToken"/>; + } + } + + <xsl:if test="xs:boolean(@hasToString)"> + QString <xsl:value-of select="@className"/>::toString(<xsl:value-of select="$tokenEnum"/> token) + { + const unsigned short *data = 0; + int length = 0; + + switch(token) + { + <xsl:for-each select="tokens/token"> + <xsl:sort select="local:tokenToEnumName(.)"/> + case <xsl:sequence select="local:tokenToEnumName(.)"/>: + {<!-- Without these braces, the code doesn't compile on MSVC 2008. --> + static const unsigned short staticallyStored<xsl:value-of select="local:tokenToEnumName(.)"/>[] = + { + <xsl:value-of separator=", " select="string-to-codepoints(.), 0"/> + }; + data = staticallyStored<xsl:value-of select="local:tokenToEnumName(.)"/>; + length = <xsl:value-of select="string-length(.)"/>; + break; + } + </xsl:for-each> + default: + /* It's either the default token, or an undefined enum + * value. We silence a compiler warning, and return the + * empty string. */ + ; + } + + union + { + const unsigned short *data; + const QChar *asQChar; + } converter; + converter.data = data; + + return QString::fromRawData(converter.asQChar, length); + } + </xsl:if> + + <xsl:text>
</xsl:text> + <xsl:text>QT_END_NAMESPACE
</xsl:text> + <xsl:text>
</xsl:text> + </xsl:result-document> + + </xsl:template> + + <!-- Emits the definition of one classifierN() member function for the tokens in $strings. N is the context item supplied by the caller's for-each. --> + <xsl:template name="generate-classifier"> + <xsl:param name="strings" as="xs:string+"/> + + <xsl:value-of select="$className"/>::<xsl:value-of select="$tokenEnum"/> + <xsl:text> </xsl:text> + <xsl:value-of select="$className"/>::classifier<xsl:value-of select="."/>(const QChar *data) + + { + <xsl:sequence select="local:generateBranching($strings, 1, 1)"/> + + return <xsl:value-of select="$defaultToken"/>; + } + </xsl:template> + + <!-- Recursively emits the C++ decision code for $strings. While several candidates remain, it emits an if/else-if chain on data[$depth - 1] with one branch per distinct character at $currentPos. Once a single candidate is left, it emits a comparison of the remaining characters (one compare, or memcmp for several) followed by a return of that token's enum name. --> + <xsl:function name="local:generateBranching" ><!--as="xs:string+">--> + <xsl:param name="strings" as="xs:string+"/> + <xsl:param name="depth" as="xs:integer"/> + <xsl:param name="currentPos" as="xs:integer"/> + + <xsl:choose> + <xsl:when test="count($strings) eq 1"> + <xsl:variable name="remainingLength" as="xs:integer" select="(string-length($strings) - $currentPos) + 1"/> + <!-- Empty when the branching above has already consumed every character (e.g. tokens "ab" and "ac"), hence xs:integer* rather than xs:integer+. --> + <xsl:variable name="toMatch" as="xs:integer*" select="string-to-codepoints(substring($strings, $currentPos))"/> + + <xsl:if test="$remainingLength ne 0"> + <xsl:choose> + <xsl:when test="$remainingLength eq 1"> + if(data[<xsl:sequence select="$depth - 1"/>] == <xsl:sequence select="$toMatch"/>) + </xsl:when> + <xsl:when test="$remainingLength > 1"> + static const unsigned short string[] = + { + <xsl:value-of separator=", " select="string-to-codepoints(substring($strings, $currentPos))"/> + }; + if(memcmp(&amp;data[<xsl:sequence select="$depth - 1"/>], &amp;string, sizeof(QChar) * <xsl:value-of select="$remainingLength"/>) == 0) + </xsl:when> + </xsl:choose> + </xsl:if> + + return <xsl:value-of select="local:tokenToEnumName($strings)"/>; + </xsl:when> + <xsl:otherwise> + <xsl:for-each select="distinct-values(for $i in $strings return substring($i, $currentPos, 1))"> + <xsl:if test="position() > 1"> + <xsl:text>else </xsl:text> + </xsl:if> + + <xsl:text>if (data[</xsl:text> + <xsl:sequence select="string($depth - 1)"/> + <xsl:text>] == </xsl:text> + <xsl:sequence select="string-to-codepoints(.)"/> + <xsl:text>)
</xsl:text> + + { + <xsl:sequence select="local:generateBranching($strings[substring(., $currentPos, 1) eq current()], $depth + 1, $currentPos + 1)"/> + } + + </xsl:for-each> + </xsl:otherwise> + </xsl:choose> + </xsl:function> + + <!-- Splits $arg at runs of ':' or '-' and joins the pieces, upper-casing the first character of each piece. --> + <xsl:function name="local:toCamelCase" as="xs:string"> + <xsl:param name="arg" as="xs:string"/> + + <xsl:sequence select="string-join((for $word in tokenize($arg,'[:-]+') + return concat(upper-case(substring($word,1,1)), + substring($word, 2))) ,'')"/> + + </xsl:function> + + <!-- Returns the C++ enum name for the token whose string value is $string: its @name attribute when present, otherwise a name derived from the string value. --> + <xsl:function name="local:tokenToEnumName" as="xs:string"> + <xsl:param name="string" as="xs:string"/> + + <xsl:variable name="token" select="$tokens[. eq $string]"/> + + <xsl:choose> + <xsl:when test="$token/@name"> + <xsl:sequence select="$token/@name"/> + </xsl:when> + <xsl:otherwise> + <!-- We take the token's string value and coerce it into a C++ name: drop every character that is not a letter, digit or underscore, then do basic camel casing. ':' and '-' are kept here because local:toCamelCase() uses them as word separators and removes them itself. --> + <xsl:variable name="normalized" select="replace($string, '[^A-Za-z0-9_:\-]', '')"/> + <xsl:value-of select="local:toCamelCase($normalized)"/> + </xsl:otherwise> + </xsl:choose> + + </xsl:function> + +</xsl:stylesheet> + +<!-- +vim: et:ts=4:sw=4:sts=4 +--> diff --git a/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd b/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd new file mode 100644 index 0000000..322c50e --- /dev/null +++ b/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd @@ -0,0 +1,89 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- +TODO docs +--> + +<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" + elementFormDefault="qualified"> + + <xs:element name="tokenAutomaton" type="tokenAutomatonElementType"/> + + <xs:simpleType name="cppIdentifierType"> + <xs:restriction base="xs:string"> + <xs:pattern value="[a-zA-Z_][a-zA-Z0-9_]*"/> + </xs:restriction> + </xs:simpleType> + + <xs:simpleType name="filenameType"> + <xs:restriction 
base="xs:string"> + <!-- At least one character. --> + <xs:pattern value=".+"/> + </xs:restriction> + </xs:simpleType> + + <!-- Access specifier written in front of the generated enum and functions. --> + <xs:simpleType name="scopeType"> + <xs:restriction base="xs:string"> + <xs:enumeration value="protected"/> + <xs:enumeration value="public"/> + </xs:restriction> + </xs:simpleType> + + <!-- The root element: the token list, optional boilerplate, and the attributes that name the generated class and files. --> + <xs:complexType name="tokenAutomatonElementType"> + <xs:sequence> + <xs:element name="tokens" minOccurs="1" maxOccurs="1" type="tokensElementType"> + <!-- Each token name (the enum name), must be unique. --> + <xs:unique name="tokenNames"> + <xs:selector xpath="token"/> + <xs:field xpath="@name"/> + </xs:unique> + <!-- Each string must be unique, otherwise one string can map to two or + more enums. --> + <xs:unique name="tokenValues"> + <xs:selector xpath="token"/> + <xs:field xpath="."/> + </xs:unique> + </xs:element> + <xs:element name="boilerplate" minOccurs="0" maxOccurs="1" type="boilerplateElementType"/> + + </xs:sequence> + <!-- className and includeGuardName are required: qautomaton2cpp.xsl binds @className to a variable typed xs:string and writes @includeGuardName directly after #ifndef and #define. --> + <xs:attribute name="className" type="cppIdentifierType" use="required"/> + <xs:attribute name="includeGuardName" type="cppIdentifierType" use="required"/> + <xs:attribute name="headerFile" type="filenameType" use="required"/> + <xs:attribute name="namespace" type="cppIdentifierType" use="optional"/> + <xs:attribute name="sourceFile" type="filenameType" use="required"/> + <xs:attribute name="scope" type="scopeType" use="required"/> + <xs:attribute name="defaultToken" type="cppIdentifierType" use="required"/> + <xs:attribute name="hasToString" type="xs:boolean" use="required"/> + <xs:attribute name="tokenEnum" type="cppIdentifierType" use="required"/> + </xs:complexType> + + <!-- Container holding one or more token elements. --> + <xs:complexType name="tokensElementType"> + <xs:sequence> + <xs:element name="token" maxOccurs="unbounded" type="tokenElementType" minOccurs="1"/> + </xs:sequence> + </xs:complexType> + + <!-- A token: the text content is the string to recognise; the optional name attribute overrides the enum name. --> + <xs:complexType name="tokenElementType"> + <xs:simpleContent> + <xs:extension base="xs:string"> + <xs:attribute name="name" use="optional" type="cppIdentifierType"/> + </xs:extension> + </xs:simpleContent> + 
</xs:complexType> + + <xs:complexType name="boilerplateElementType"> + <xs:sequence> + <xs:element name="prolog" maxOccurs="1" type="prologElementType" minOccurs="1"/> + </xs:sequence> + </xs:complexType> + + <!-- Free text that the stylesheet copies to the top of both generated files. --> + <xs:complexType name="prologElementType"> + <xs:simpleContent> + <xs:extension base="xs:string"/> + </xs:simpleContent> + </xs:complexType> + +</xs:schema> +<!-- +vim: et:ts=4:sw=4:sts=4 +--> |