summaryrefslogtreecommitdiff
path: root/src/xmlpatterns/qtokenautomaton
diff options
context:
space:
mode:
Diffstat (limited to 'src/xmlpatterns/qtokenautomaton')
-rw-r--r--src/xmlpatterns/qtokenautomaton/README66
-rw-r--r--src/xmlpatterns/qtokenautomaton/exampleFile.xml65
-rw-r--r--src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl298
-rw-r--r--src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd89
4 files changed, 518 insertions, 0 deletions
diff --git a/src/xmlpatterns/qtokenautomaton/README b/src/xmlpatterns/qtokenautomaton/README
new file mode 100644
index 0000000..8c5e552
--- /dev/null
+++ b/src/xmlpatterns/qtokenautomaton/README
@@ -0,0 +1,66 @@
+
+qtokenautomaton is a tokenizer generator: it generates a simple, Unicode-aware
+tokenizer for C++ that uses the Qt API.
+
+Introduction
+=====================
+QTokenAutomaton generates a C++ class that essentially has this interface:
+
+ class YourTokenizer
+ {
+ protected:
+ enum Token
+ {
+ A,
+ B,
+ C,
+ NoKeyword
+ };
+
+ static inline Token toToken(const QString &string);
+ static inline Token toToken(const QStringRef &string);
+ static Token toToken(const QChar *data, int length);
+ static QString toString(Token token);
+ };
+
+When calling toToken(), the tokenizer returns the enum value corresponding to
+the string. This is done with O(N) time complexity, where N is the length of
+the string. The returned value can then subsequently be efficiently switched
+over. The alternatives, either a long chain of if statements comparing one
+QString to several other QStrings; or inserting all strings first into a hash,
+are less efficient.
+
+For instance, the latter case of using a hash would involve, when excluding
+the initial populating of the hash, O(N) + O(1), where O(1) is assumed to be a
+non-conflicting hash lookup.
+
+toString(), which returns the string for the token that an enum value
+represents, is implemented to store the strings in an efficient manner.
+
+A typical usage scenario is in combination with QXmlStreamReader. When parsing
+a certain format, for instance XHTML, each element name, body, span, table and
+so forth, typically needs special treatment. QTokenAutomaton conceptually cuts
+the string comparisons down to one.
+
+Beyond efficiency, QTokenAutomaton also increases type safety, since C++
+identifiers are used instead of string literals.
+
+Usage
+=====================
+Using it is approached as follows:
+
+1. Create a token file. Use exampleFile.xml as a template.
+
+2. Make sure it is valid by validating against qtokenautomaton.xsd. On
+ Linux, this can be achieved by running `xmllint --noout
+ --schema qtokenautomaton.xsd yourFile.xml`
+
+3. Produce the C++ files by invoking the stylesheet with an XSL-T 2.0
+ processor[1]. For instance, with the implementation Saxon, this would be:
+ `java net.sf.saxon.Transform -xsl:qautomaton2cpp.xsl yourFile.xml`
+
+4. Include the produced C++ files with your build system.
+
+
+[1] As of version 4.4, Qt itself has no support for XSL-T, so an external
+processor such as Saxon is needed.
diff --git a/src/xmlpatterns/qtokenautomaton/exampleFile.xml b/src/xmlpatterns/qtokenautomaton/exampleFile.xml
new file mode 100644
index 0000000..1274443
--- /dev/null
+++ b/src/xmlpatterns/qtokenautomaton/exampleFile.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tokenAutomaton scope="public"
+ className="ExampleFile"
+ headerFile="exampleFile.h"
+ sourceFile="exampleFile.cpp"
+ defaultToken="NoKeyword"
+ tokenEnum="Token"
+ hasToString="true"
+ includeGuardName="exampleFile_h">
+ <tokens>
+ <token>html</token>
+ <token>body</token>
+ <token>p</token>
+ <token>table</token>
+ <token name="WeWantADifferentNameForSpan">span</token>
+ </tokens>
+
+ <boilerplate>
+
+ <prolog>/****************************************************************************
+**
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtXmlPatterns module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights. These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+</prolog>
+
+ </boilerplate>
+
+</tokenAutomaton>
diff --git a/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl b/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl
new file mode 100644
index 0000000..16084a0
--- /dev/null
+++ b/src/xmlpatterns/qtokenautomaton/qautomaton2cpp.xsl
@@ -0,0 +1,298 @@
+<?xml version='1.0' encoding="UTF-8"?>
+<xsl:stylesheet version="2.0"
+ xml:lang="en"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:local="http://www.w3.org/2005/xquery-local-functions"
+ xmlns:xs="http://www.w3.org/2001/XMLSchema">
+ <!-- Stylesheet-wide values taken from the attributes and token children of the input's tokenAutomaton element. -->
+ <xsl:variable name="className" as="xs:string" select="tokenAutomaton/@className"/>
+ <xsl:variable name="defaultToken" as="xs:string" select="tokenAutomaton/@defaultToken"/>
+ <xsl:variable name="tokens" as="element(token)+" select="tokenAutomaton/tokens/token"/>
+ <xsl:variable name="tokenEnum" as="xs:string" select="tokenAutomaton/@tokenEnum"/>
+ <!-- C comment that is written right after the prolog at the top of both generated files. -->
+ <xsl:variable name="warningGenerated" as="xs:string">/* NOTE: This file is AUTO GENERATED by qautomaton2cpp.xsl. */&#xA;</xsl:variable>
+ <!-- Entry point: for the tokenAutomaton element, writes the C++ header (@headerFile) and source (@sourceFile) as text result documents. -->
+ <xsl:template match="tokenAutomaton">
+ <!-- One classifierN() function is declared and defined per distinct token length N; these are the lengths. -->
+ <xsl:variable name="uniqueLengths" as="xs:integer+" select="distinct-values(tokens/token/string-length())"/>
+ <!-- Header file: include guard, class with the enum and the function declarations, and the two inline toToken() overloads. -->
+ <xsl:result-document method="text" href="{@headerFile}">
+
+ <xsl:variable name="includeGuardName" select="string(@includeGuardName)"/>
+
+ <xsl:value-of select="boilerplate/prolog"/>
+
+ <xsl:value-of select="$warningGenerated"/>
+
+ <xsl:text>&#xA;#ifndef </xsl:text>
+ <xsl:value-of select="$includeGuardName"/>
+ <xsl:text>&#xA;#define </xsl:text>
+ <xsl:value-of select="$includeGuardName"/>
+ <xsl:text>&#xA;&#xA;</xsl:text>
+
+ <xsl:text>#include &lt;QtCore/QString>&#xA;</xsl:text>
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:text>QT_BEGIN_NAMESPACE&#xA;</xsl:text>
+ <xsl:text>&#xA;</xsl:text>
+
+ <xsl:if test="@namespace">
+ <xsl:text>namespace </xsl:text>
+ <xsl:value-of select="@namespace"/>
+ {
+ </xsl:if>
+
+ <xsl:text>class </xsl:text>
+ <xsl:value-of select="@className"/>
+ {
+ <xsl:value-of select="@scope"/>:
+ <xsl:text>enum </xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text>&#xA;</xsl:text>
+ {
+ <xsl:value-of separator=",&#xA;">
+ <xsl:sequence select="@defaultToken"/>
+ <xsl:perform-sort select="tokens/token/local:tokenToEnumName(.)">
+ <xsl:sort select="."/>
+ </xsl:perform-sort>
+ </xsl:value-of>
+ };
+
+ <xsl:text>static inline </xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text> toToken(const QString &amp;value);&#xA;</xsl:text>
+ <xsl:text>static inline </xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text> toToken(const QStringRef &amp;value);&#xA;</xsl:text>
+ <xsl:text>static </xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text> toToken(const QChar *data, int length);&#xA;</xsl:text>
+ <xsl:if test="xs:boolean(@hasToString)">
+ <xsl:text>static QString toString(</xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text> token);&#xA;</xsl:text>
+ </xsl:if>
+
+ private:
+ <xsl:for-each select="$uniqueLengths">
+ <xsl:sort select="."/>
+ <xsl:text>static inline </xsl:text>
+ <xsl:value-of select="$tokenEnum"/>
+ <xsl:text> classifier</xsl:text>
+ <xsl:value-of select="."/>
+ <xsl:text>(const QChar *data);&#xA;</xsl:text>
+ </xsl:for-each>
+ };
+
+ <xsl:text>inline </xsl:text>
+ <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="@className"/>::toToken(const QString &amp;value)
+ {
+ return toToken(value.constData(), value.length());
+ }
+
+ <xsl:text>inline </xsl:text>
+ <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="@className"/>::toToken(const QStringRef &amp;value)
+ {
+ return toToken(value.constData(), value.length());
+ }
+
+ <xsl:if test="@namespace">
+ <xsl:text>}&#xA;</xsl:text>
+ </xsl:if>
+
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:text>QT_END_NAMESPACE&#xA;</xsl:text>
+ <xsl:text>&#xA;</xsl:text>
+
+ <xsl:text>#endif&#xA;</xsl:text>
+ </xsl:result-document>
+ <!-- Source file: the classifierN() definitions, toToken(const QChar *, int) and, when @hasToString is true, toString(). -->
+ <xsl:result-document method="text" href="{@sourceFile}">
+ <xsl:value-of select="boilerplate/prolog"/>
+
+ <xsl:value-of select="$warningGenerated"/>
+
+ <xsl:text>&#xA;#include "</xsl:text>
+ <xsl:value-of select="@headerFile"/>
+ <xsl:text>"&#xA;</xsl:text>
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:text>QT_BEGIN_NAMESPACE&#xA;</xsl:text>
+
+ <xsl:if test="@namespace">
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:text>using namespace </xsl:text>
+ <xsl:value-of select="@namespace"/>
+ <xsl:text>;&#xA;</xsl:text>
+ </xsl:if>
+
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:variable name="tokens" select="tokens/token"/>
+ <!-- Inside the loop the context item is a length (an integer), hence the variable above for reaching the token elements. -->
+ <xsl:for-each select="$uniqueLengths">
+ <xsl:sort select="."/>
+ <xsl:call-template name="generate-classifier">
+ <xsl:with-param name="strings" select="$tokens[string-length() eq current()]"/>
+ </xsl:call-template>
+ </xsl:for-each>
+ <!-- toToken(const QChar *, int): switches on the length and hands over to the classifierN() for that length. -->
+ <xsl:value-of select="@className"/>::<xsl:value-of select="$tokenEnum"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="@className"/>::toToken(const QChar *data, int length)
+ {
+ switch(length)
+ {
+ <xsl:for-each select="$uniqueLengths">
+ <xsl:sort data-type="number" select="."/>
+ case <xsl:value-of select="."/>:
+ return classifier<xsl:value-of select="."/>(data);
+
+ </xsl:for-each>
+ default:
+ return <xsl:value-of select="@defaultToken"/>;
+ }
+ }
+
+ <xsl:if test="xs:boolean(@hasToString)">
+ QString <xsl:value-of select="@className"/>::toString(<xsl:value-of select="$tokenEnum"/> token)
+ {
+ const unsigned short *data = 0;
+ int length = 0;
+
+ switch(token)
+ {
+ <xsl:for-each select="tokens/token">
+ <xsl:sort select="local:tokenToEnumName(.)"/>
+ case <xsl:sequence select="local:tokenToEnumName(.)"/>:
+ {<!-- Without these braces, the code doesn't compile on MSVC 2008. -->
+ static const unsigned short staticallyStored<xsl:value-of select="local:tokenToEnumName(.)"/>[] =
+ {
+ <xsl:value-of separator=", " select="string-to-codepoints(.), 0"/>
+ };
+ data = staticallyStored<xsl:value-of select="local:tokenToEnumName(.)"/>;
+ length = <xsl:value-of select="string-length(.)"/>;
+ break;
+ }
+ </xsl:for-each>
+ default:
+ /* It's either the default token, or an undefined enum
+ * value. We silence a compiler warning, and return the
+ * empty string. */
+ ;
+ }
+
+ union
+ {
+ const unsigned short *data;
+ const QChar *asQChar;
+ } converter;
+ converter.data = data;
+
+ return QString::fromRawData(converter.asQChar, length);
+ }
+ </xsl:if>
+
+ <xsl:text>&#xA;</xsl:text>
+ <xsl:text>QT_END_NAMESPACE&#xA;</xsl:text>
+ <xsl:text>&#xA;</xsl:text>
+ </xsl:result-document>
+
+ </xsl:template>
+ <!-- Emits the definition of classifierN() for one token length N. N is the caller's context item; $strings are the tokens passed by the caller. -->
+ <xsl:template name="generate-classifier">
+ <xsl:param name="strings" as="xs:string+"/>
+ <!-- The generated function returns the default token when none of the branches from local:generateBranching() returned. -->
+ <xsl:value-of select="$className"/>::<xsl:value-of select="$tokenEnum"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="$className"/>::classifier<xsl:value-of select="."/>(const QChar *data)
+
+ {
+ <xsl:sequence select="local:generateBranching($strings, 1, 1)"/>
+
+ return <xsl:value-of select="$defaultToken"/>;
+ }
+ </xsl:template>
+ <!-- Recursively emits the C++ if/else cascade that compares data[] with $strings and returns the enum value of the token that matches. -->
+ <xsl:function name="local:generateBranching" ><!--as="xs:string+">-->
+ <xsl:param name="strings" as="xs:string+"/>
+ <xsl:param name="depth" as="xs:integer"/>
+ <xsl:param name="currentPos" as="xs:integer"/>
+ <!-- $strings: tokens still possible at this point; $depth - 1: index into data[] tested next; $currentPos: 1-based position in the tokens. -->
+ <xsl:choose>
+ <xsl:when test="count($strings) eq 1">
+ <xsl:variable name="remainingLength" as="xs:integer" select="(string-length($strings) - $currentPos) + 1"/>
+ <xsl:variable name="toMatch" as="xs:integer*" select="string-to-codepoints(substring($strings, $currentPos))"/>
+ <!-- $toMatch is empty when the enclosing branches already tested every character, so its type must be xs:integer*, not xs:integer+. -->
+ <xsl:if test="$remainingLength ne 0">
+ <xsl:choose>
+ <xsl:when test="$remainingLength eq 1">
+ if(data[<xsl:sequence select="$depth - 1"/>] == <xsl:sequence select="$toMatch"/>)
+ </xsl:when>
+ <xsl:when test="$remainingLength &gt; 1">
+ static const unsigned short string[] =
+ {
+ <xsl:value-of separator=", " select="$toMatch"/>
+ };
+ if(memcmp(&amp;data[<xsl:sequence select="$depth - 1"/>], &amp;string, sizeof(QChar) * <xsl:value-of select="$remainingLength"/>) == 0)
+ </xsl:when>
+ </xsl:choose>
+ </xsl:if>
+
+ return <xsl:value-of select="local:tokenToEnumName($strings)"/>;
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:for-each select="distinct-values(for $i in $strings return substring($i, $currentPos, 1))">
+ <xsl:if test="position() &gt; 1">
+ <xsl:text>else </xsl:text>
+ </xsl:if>
+
+ <xsl:text>if (data[</xsl:text>
+ <xsl:sequence select="string($depth - 1)"/>
+ <xsl:text>] == </xsl:text>
+ <xsl:sequence select="string-to-codepoints(.)"/>
+ <xsl:text>)&#xA;</xsl:text>
+
+ {
+ <xsl:sequence select="local:generateBranching($strings[substring(., $currentPos, 1) eq current()], $depth + 1, $currentPos + 1)"/>
+ }
+
+ </xsl:for-each>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:function>
+ <!-- Splits $arg at runs of ':' and '-', upper-cases the first character of each piece and concatenates the pieces. -->
+ <xsl:function name="local:toCamelCase" as="xs:string">
+ <xsl:param name="arg" as="xs:string"/>
+
+ <xsl:variable name="capitalizedPieces" as="xs:string*"
+ select="for $piece in tokenize($arg, '[:-]+')
+ return concat(upper-case(substring($piece, 1, 1)), substring($piece, 2))"/>
+ <xsl:sequence select="string-join($capitalizedPieces, '')"/>
+ </xsl:function>
+ <!-- Maps a token's string value to its C++ enum identifier: the token's name attribute if it has one, otherwise a camel-cased form of the string. -->
+ <xsl:function name="local:tokenToEnumName" as="xs:string">
+ <xsl:param name="string" as="xs:string"/>
+ <!-- Find the token element whose string value equals $string, to see whether it carries an explicit name. -->
+ <xsl:variable name="token" select="$tokens[. eq $string]"/>
+
+ <xsl:choose>
+ <xsl:when test="$token/@name">
+ <xsl:sequence select="$token/@name"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- Drop every character that is neither an ASCII letter, digit or underscore nor one of the word
+ separators (':' and '-') that local:toCamelCase() splits on, then camel-case what is left. -->
+ <xsl:variable name="normalized" select="replace($string, '[^A-Za-z0-9_:\-]', '')"/>
+ <xsl:value-of select="local:toCamelCase($normalized)"/>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:function>
+
+</xsl:stylesheet>
+
+<!--
+vim: et:ts=4:sw=4:sts=4
+-->
diff --git a/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd b/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd
new file mode 100644
index 0000000..322c50e
--- /dev/null
+++ b/src/xmlpatterns/qtokenautomaton/qtokenautomaton.xsd
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+Schema for the token files that qautomaton2cpp.xsl turns into C++. TODO: document the types in detail.
+-->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ elementFormDefault="qualified">
+ <!-- The only global element declaration: the document element of a token file. -->
+ <xs:element name="tokenAutomaton" type="tokenAutomatonElementType"/>
+ <!-- An ASCII letter or underscore followed by any number of ASCII letters, digits or underscores. -->
+ <xs:simpleType name="cppIdentifierType">
+ <xs:restriction base="xs:string">
+ <xs:pattern value="[a-zA-Z_][a-zA-Z0-9_]*"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <!-- Type of the headerFile and sourceFile attributes. -->
+ <xs:simpleType name="filenameType">
+ <xs:restriction base="xs:string">
+ <!-- At least one character. -->
+ <xs:pattern value=".+"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <!-- Access specifier that qautomaton2cpp.xsl writes in front of the generated enum and functions; only these two values are allowed. -->
+ <xs:simpleType name="scopeType">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="protected"/>
+ <xs:enumeration value="public"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <!-- Type of the tokenAutomaton document element: the token list, an optional boilerplate, and the settings read by qautomaton2cpp.xsl. -->
+ <xs:complexType name="tokenAutomatonElementType">
+ <xs:sequence>
+ <xs:element name="tokens" minOccurs="1" maxOccurs="1" type="tokensElementType">
+ <!-- Each token name (the enum name), must be unique. -->
+ <xs:unique name="tokenNames">
+ <xs:selector xpath="token"/>
+ <xs:field xpath="@name"/>
+ </xs:unique>
+ <!-- Each string must be unique, otherwise one string can map to two or
+ more enums. -->
+ <xs:unique name="tokenValues">
+ <xs:selector xpath="token"/>
+ <xs:field xpath="."/>
+ </xs:unique>
+ </xs:element>
+ <xs:element name="boilerplate" minOccurs="0" maxOccurs="1" type="boilerplateElementType"/>
+ <!-- Every attribute except namespace is required; qautomaton2cpp.xsl reads className and includeGuardName unconditionally. -->
+ </xs:sequence>
+ <xs:attribute name="className" type="cppIdentifierType" use="required"/>
+ <xs:attribute name="includeGuardName" type="cppIdentifierType" use="required"/>
+ <xs:attribute name="headerFile" type="filenameType" use="required"/>
+ <xs:attribute name="namespace" type="cppIdentifierType" use="optional"/>
+ <xs:attribute name="sourceFile" type="filenameType" use="required"/>
+ <xs:attribute name="scope" type="scopeType" use="required"/>
+ <xs:attribute name="defaultToken" type="cppIdentifierType" use="required"/>
+ <xs:attribute name="hasToString" type="xs:boolean" use="required"/>
+ <xs:attribute name="tokenEnum" type="cppIdentifierType" use="required"/>
+ </xs:complexType>
+ <!-- Container for one or more token elements. -->
+ <xs:complexType name="tokensElementType">
+ <xs:sequence>
+ <xs:element name="token" maxOccurs="unbounded" type="tokenElementType" minOccurs="1"/>
+ </xs:sequence>
+ </xs:complexType>
+ <!-- A token: its text is the string to recognise; the optional name attribute is used by qautomaton2cpp.xsl as the enum name instead of a derived one. -->
+ <xs:complexType name="tokenElementType">
+ <xs:simpleContent>
+ <xs:extension base="xs:string">
+ <xs:attribute name="name" use="optional" type="cppIdentifierType"/>
+ </xs:extension>
+ </xs:simpleContent>
+ </xs:complexType>
+ <!-- Wrapper holding exactly one prolog element. -->
+ <xs:complexType name="boilerplateElementType">
+ <xs:sequence>
+ <xs:element name="prolog" maxOccurs="1" type="prologElementType" minOccurs="1"/>
+ </xs:sequence>
+ </xs:complexType>
+ <!-- Free-form text; qautomaton2cpp.xsl writes it at the very top of both generated files. -->
+ <xs:complexType name="prologElementType">
+ <xs:simpleContent>
+ <xs:extension base="xs:string"/>
+ </xs:simpleContent>
+ </xs:complexType>
+
+</xs:schema>
+<!--
+vim: et:ts=4:sw=4:sts=4
+-->