summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Wicking <paul.wicking@qt.io>2023-05-03 11:00:25 +0200
committerPaul Wicking <paul.wicking@qt.io>2023-05-13 22:01:10 +0200
commit7057d01fbb9f8f37c707b33e3b92c10a78919ddc (patch)
treea919ab62d892885bde7d9049011b3b658b920dce
parent941a9b5e5963f8c0798415e3cb69f031da1f4109 (diff)
downloadqttools-7057d01fbb9f8f37c707b33e3b92c10a78919ddc.tar.gz
QDoc: Append hash to canonical titles with non-alnum characters
When generating fragment identifiers from a title, QDoc normalizes the string that's used as fragment identifier. This normalization is done by `Doc::canonicalTitle()`. This method returns a string that is stripped of non-alphanumeric characters, has space(s) replaced by one hyphen, and any repeating or trailing hyphens removed. This causes the removal of certain characters, such as 'ß', '大', etc. For documentation written in languages that contain mostly non-latin1 characters, such as Chinese, this means fragment identifiers may be empty, such that links to these anchors (e.g. from a table of contents) lead to nowhere. This patch adds test data to QDoc's generated output test to reproduce the issue. The Chinese test data is courtesy of the bug reporter. The test data also contains other characters from Latin scripts, as during investigation of a solution to the bug, these appeared as separate triggers of the misbehavior. The modified test also serves to catch possible future regressions. The patch modifies `Doc::canonicalTitle` such that it appends a hash to "canonical" titles that contain characters that are not considered legal entities in a canonical title. In this context, legal characters are lowercase a-z, digits 0-9, and the dash (`-`). Other symbols and characters are removed. When encountering any character outside the printable ascii subset (ascii decimal 32-126, inclusive), QDoc will append a hash of the original string to the fragment identifier it generates. This means that the canonical title for a string that contains, for example, a mix of allowed and disallowed characters, will consist of the allowed characters and a hash of the original string appended to the final string. The patch changes the loop in `canonicalTitle` to a ranged for loop over a const-ref, and adds precision to a code comment (precision based on timing the execution of the two implementations of this method one million times). 
Finally, the patch adds documentation for `Doc::canonicalTitle`, as that didn't exist previously. [ChangeLog][QDoc] QDoc now appends a hash of the original title to the fragment identifier generated for that title if the title contains non-ascii characters. This means QDoc now generates fragment identifiers for titles that are written in non-latin characters. Fixes: QTBUG-64506 Change-Id: Idc62677b9950becea662d8ff5ead1f631ec26bc3 Reviewed-by: Topi Reiniö <topi.reinio@qt.io>
-rw-r--r--src/qdoc/qdoc/doc.cpp57
-rw-r--r--tests/auto/qdoc/generatedoutput/expected_output/adventures-with-non-ascii-characters.html46
-rw-r--r--tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index14
-rw-r--r--tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc54
-rw-r--r--tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/non_ascii_character_input.qdocconf13
-rw-r--r--tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp9
6 files changed, 186 insertions, 7 deletions
diff --git a/src/qdoc/qdoc/doc.cpp b/src/qdoc/qdoc/doc.cpp
index 762af2ebd..a4d196e36 100644
--- a/src/qdoc/qdoc/doc.cpp
+++ b/src/qdoc/qdoc/doc.cpp
@@ -13,6 +13,8 @@
#include "quoter.h"
#include "text.h"
+#include <qcryptographichash.h>
+
QT_BEGIN_NAMESPACE
using namespace Qt::StringLiterals;
@@ -407,10 +409,37 @@ CodeMarker *Doc::quoteFromFile(const Location &location, Quoter &quoter, Resolve
return marker;
}
+/*!
+ \brief Generates a url-friendly string representation from \a title.
+
+ "Url-friendly" in this context is a string that contains only a subset of
+ printable ascii characters.
+
+ The subset is the printable ascii range (decimal 32-126, inclusive): alphanumeric
+ (alnum) characters ([a-zA-Z0-9]), space, punctuation characters, and common symbols.
+ Non-alnum characters in this subset are replaced by a single dash. Leading
+ and trailing dashes are removed, such that the resulting string does not
+ start or end with a dash. Any capital character is replaced by its lowercase
+ counterpart.
+
+ If any character in \a title is non-latin, or latin and not found in the
+ aforementioned subset (e.g. 'ß', 'å', or 'ö'), a hash of \a title is
+ appended to the final string.
+
+ Returns a string that is normalized for the purpose of generating fragment
+ identifiers for \a title in URLs.
+ */
QString Doc::canonicalTitle(const QString &title)
{
- // The code below is equivalent to the following chunk, but _much_
- // faster (accounts for ~10% of total running time)
+ auto legal_ascii = [](const uint value) {
+ const uint start_ascii_subset{ 32 };
+ const uint end_ascii_subset{ 126 };
+
+ return value >= start_ascii_subset && value <= end_ascii_subset;
+ };
+
+ // The code below is equivalent to the following chunk, but
+ // has been measured to be approximately 4 times faster.
//
// QRegularExpression attributeExpr("[^A-Za-z0-9]+");
// QString result = title.toLower();
@@ -421,11 +450,16 @@ QString Doc::canonicalTitle(const QString &title)
QString result;
result.reserve(title.size());
- bool dashAppended = false;
- bool begun = false;
- qsizetype lastAlnum = 0;
- for (int i = 0; i != title.size(); ++i) {
- uint c = title.at(i).unicode();
+ bool dashAppended{false};
+ bool begun{false};
+ qsizetype lastAlnum{0};
+ bool has_non_alnum_content{false};
+
+ for (const auto &i : title) {
+ uint c = i.unicode();
+
+ if (!legal_ascii(c))
+ has_non_alnum_content = true;
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
bool alnum = (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
@@ -441,6 +475,15 @@ QString Doc::canonicalTitle(const QString &title)
}
}
result.truncate(lastAlnum);
+
+ if (has_non_alnum_content) {
+ auto title_hash = QString::fromLocal8Bit(
+ QCryptographicHash::hash(title.toUtf8(), QCryptographicHash::Md5).toHex());
+ title_hash.truncate(8);
+ if (!result.isEmpty())
+ result.append(QLatin1Char('-'));
+ result.append(title_hash);
+ }
return result;
}
diff --git a/tests/auto/qdoc/generatedoutput/expected_output/adventures-with-non-ascii-characters.html b/tests/auto/qdoc/generatedoutput/expected_output/adventures-with-non-ascii-characters.html
new file mode 100644
index 000000000..5d7532e88
--- /dev/null
+++ b/tests/auto/qdoc/generatedoutput/expected_output/adventures-with-non-ascii-characters.html
@@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="utf-8">
+<!-- adventures_with_non_ascii_characters.qdoc -->
+ <meta name="description" content="Test that non-ascii characters work as input to QDoc commands.">
+ <title>Adventures with non-ascii characters in QDoc | NonAsciiCharacterInput</title>
+</head>
+<body>
+<div class="sidebar">
+<div class="toc">
+<h3 id="toc">Contents</h3>
+<ul>
+<li class="level1"><a href="#a-713da3e8">A 大纲视图</a></li>
+<li class="level1"><a href="#3d-42faee45">3D场景视图</a></li>
+<li class="level1"><a href="#3d-c57e864e">这就是3D场景视图</a></li>
+<li class="level1"><a href="#662952c1">属性视图</a></li>
+<li class="level1"><a href="#further-details">Further details</a></li>
+<li class="level2"><a href="#ascii-characters-that-are-non-printable-ascii-such-as-or-521d09f0">Ascii characters that are non-printable ascii, such as ß, ü, or ø</a></li>
+</ul>
+</div>
+<div class="sidebar-content" id="sidebar-content"></div></div>
+<h1 class="title">Adventures with non-ascii characters in QDoc</h1>
+<!-- $$$adventures_with_non_ascii_characters.html-description -->
+<div class="descr" id="details">
+<p>The purpose of this test data is to provide a regression mechanism as part of QDoc's end-to-end test, tst_generatedOutput, for an issue (QTBUG-64506) that was reported against QDoc's \section1 command. The issue, as experienced by the reporter of the bug, is that if the \section1 command is followed by a non-ascii character (for example Chinese characters), navigation links aren't generated for the section title. The bug was filed against Qt 5.9&#x2e;2&#x2e; This test case aims at reproducing the issue as filed by the reporter.</p>
+<p>Such this document snippet:</p>
+<h2 id="a-713da3e8">A 大纲视图</h2>
+<p>The reporter states that this link works, presumably because it begins with the ascii character &quot;A&quot;.</p>
+<p>这就是大纲视图</p>
+<h2 id="3d-42faee45">3D场景视图</h2>
+<p>The reporter states that this link works, presumably because it begins with the digit &quot;3&quot;.</p>
+<p>这就是3D场景视图</p>
+<h2 id="3d-c57e864e">这就是3D场景视图</h2>
+<p>If this section generates a duplicate anchor, &quot;3D&quot;, it's because 3D is the only part of the section title QDoc recognizes. This is an error caused by the same bug, and the link should somehow reflect that QDoc encountered the Chinese (or any non-ascii) characters instead.</p>
+<h2 id="662952c1">属性视图</h2>
+<p>The reporter states that this link doesn't work, presumably because it begins with the Chinese character &quot;属&quot;.</p>
+<p>这就是属性视图</p>
+<h2 id="further-details">Further details</h2>
+<p>The bug report is at <a href="https://bugreports.qt.io/browse/QTBUG-64506" translate="no">https://bugreports.qt.io/browse/QTBUG-64506</a>. It contains the content used to trigger the behavior in this test case. The Chinese characters are copied verbatim from the report.</p>
+<h3 id="ascii-characters-that-are-non-printable-ascii-such-as-or-521d09f0">Ascii characters that are non-printable ascii, such as ß, ü, or ø</h3>
+<p>A whole range of ascii characters are not printable ascii characters. These could also cause issues for QDoc. This section is here to confirm linking to such section titles works as expected. It's made a section2 to exercise the behavior for other section levels than 1.</p>
+</div>
+<!-- @@@adventures_with_non_ascii_characters.html -->
+</body>
+</html>
diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index
new file mode 100644
index 000000000..613bb05a4
--- /dev/null
+++ b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE QDOCINDEX>
+<INDEX url="" title="NonAsciiCharacterInput Reference Documentation" version="" project="NonAsciiCharacterInput">
+ <namespace name="" status="active" access="public" module="nonasciicharacterinput">
+ <page name="adventures_with_non_ascii_characters.html" href="adventures-with-non-ascii-characters.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="Adventures with non-ascii characters in QDoc" fulltitle="Adventures with non-ascii characters in QDoc" subtitle="" brief="Test that non-ascii characters work as input to QDoc commands">
+ <contents name="a-713da3e8" title="A 大纲视图" level="1"/>
+ <contents name="3d-42faee45" title="3D场景视图" level="1"/>
+ <contents name="3d-c57e864e" title="这就是3D场景视图" level="1"/>
+ <contents name="662952c1" title="属性视图" level="1"/>
+ <contents name="further-details" title="Further details" level="1"/>
+ <contents name="ascii-characters-that-are-non-printable-ascii-such-as-or-521d09f0" title="Ascii characters that are non-printable ascii, such as ß, ü, or ø" level="2"/>
+ </page>
+ </namespace>
+</INDEX>
diff --git a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc
new file mode 100644
index 000000000..b1f23adf7
--- /dev/null
+++ b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc
@@ -0,0 +1,54 @@
+// Copyright (C) 2023 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GFDL-1.3-no-invariants-only
+
+/*!
+ \page adventures_with_non_ascii_characters.html
+ \title Adventures with non-ascii characters in QDoc
+ \brief Test that non-ascii characters work as input to QDoc commands.
+
+ The purpose of this test data is to provide a regression mechanism as part
+ of QDoc's end-to-end test, tst_generatedOutput, for an issue (QTBUG-64506)
+ that was reported against QDoc's \\section1 command. The issue, as
+ experienced by the reporter of the bug, is that if the \\section1 command
+ is followed by a non-ascii character (for example Chinese characters),
+ navigation links aren't generated for the section title. The bug was filed
+ against Qt 5.9.2. This test case aims at reproducing the issue as filed by
+ the reporter.
+
+ Such this document snippet:
+
+ \section1 A 大纲视图
+ The reporter states that this link works, presumably because it begins with
+ the ascii character "A".
+
+ 这就是大纲视图
+
+ \section1 3D场景视图
+ The reporter states that this link works, presumably because it begins with
+ the digit "3".
+
+ 这就是3D场景视图
+
+ \section1 这就是3D场景视图
+ If this section generates a duplicate anchor, "3D", it's because 3D
+ is the only part of the section title QDoc recognizes. This is an error
+ caused by the same bug, and the link should somehow reflect that QDoc
+ encountered the Chinese (or any non-ascii) characters instead.
+
+ \section1 属性视图
+ The reporter states that this link doesn't work, presumably because it
+ begins with the Chinese character "属".
+
+ 这就是属性视图
+
+ \section1 Further details
+ The bug report is at \l https://bugreports.qt.io/browse/QTBUG-64506. It
+ contains the content used to trigger the behavior in this test case. The
+ Chinese characters are copied verbatim from the report.
+
+ \section2 Ascii characters that are non-printable ascii, such as ß, ü, or ø
+ A whole range of ascii characters are not printable ascii characters. These
+ could also cause issues for QDoc. This section is here to confirm linking
+ to such section titles works as expected. It's made a section2 to exercise
+ the behavior for other section levels than 1.
+*/
diff --git a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/non_ascii_character_input.qdocconf b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/non_ascii_character_input.qdocconf
new file mode 100644
index 000000000..28235c037
--- /dev/null
+++ b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/non_ascii_character_input.qdocconf
@@ -0,0 +1,13 @@
+# Copyright (C) 2023 The Qt Company Ltd.
+# SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GFDL-1.3-no-invariants-only
+
+include(../configs/config.qdocconf)
+project = NonAsciiCharacterInput
+
+headerdirs = .
+sourcedirs = .
+exampledirs = .
+
+outputformats = WebXML HTML
+WebXML.quotinginformation = true
+WebXML.nosubdirs = true
diff --git a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp
index f110b9e45..269739242 100644
--- a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp
+++ b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp
@@ -60,6 +60,7 @@ private slots:
void testTagFile();
void testGlobalFunctions();
void proxyPage();
+ void nonAsciiCharacterInput();
private:
QScopedPointer<QTemporaryDir> m_outputDir;
@@ -588,6 +589,14 @@ void tst_generatedOutput::proxyPage()
"proxypage-docbook/stdpair-proxy.xml");
}
+void tst_generatedOutput::nonAsciiCharacterInput()
+{
+ testAndCompare("testdata/non_ascii_character_input/non_ascii_character_input.qdocconf",
+ "html/nonasciicharacterinput.index "
+ "adventures-with-non-ascii-characters.html");
+}
+
+
int main(int argc, char *argv[])
{
tst_generatedOutput tc;