diff options
author | Paul Wicking <paul.wicking@qt.io> | 2023-05-14 07:25:48 +0200 |
---|---|---|
committer | Paul Wicking <paul.wicking@qt.io> | 2023-05-15 14:56:51 +0200 |
commit | 52c8f1dbdd52f6c6e5e5419a55c456c832d8cdf4 (patch) | |
tree | ede3ff0369d178865c2d4afb22d3fe2969f61164 | |
parent | ac01635f461bd9f211d89f6eea833d9d928fd113 (diff) | |
download | qttools-52c8f1dbdd52f6c6e5e5419a55c456c832d8cdf4.tar.gz |
QDoc: Append hash when normalizing non-ascii file names
When generating files, QDoc normalizes the string that's used for the
file's name. This normalization is done by
`Utilities::canonicalizeFileName`. The method returns a string stripped
of non-alphanumeric characters, with spaces replaced by hyphens, and
any repeating or trailing hyphens removed.
This causes the removal of non-ascii-printable characters, such as
a range of latin characters (e.g. 'ß', 'ø', etc), and any non-latin
script (Arabic, Chinese, etc). If the file name, for example defined by
the `\page` command, contains nothing but disallowed characters, the
file simply isn't generated. However, QDoc doesn't warn the user in this
case.
This patch extends QDoc's generated output test to reproduce this issue.
The test serves as proof of the misbehavior and as a regression discovery
mechanism.
The patch modifies `Utilities::canonicalizeFileName` such that it
appends a hash to "canonical" titles that contain characters not
considered legal in file names. For the purpose of compatibility across
file systems, legal characters are considered lowercase a-z, the digits
0-9, and the hyphen character. Other symbols and characters are still
removed. However, when encountering characters that are not part of a
subset of ascii-printable characters (ascii decimal 32-126, inclusive),
QDoc now appends a hash of the original file name string to the string
it returns as "canonicalized".
[ChangeLog][QDoc] QDoc now appends a hash of the original file name to
the file name(s) of files where the name contains non-ascii characters.
This means QDoc now generates files for pages with names written in
non-latin characters.
Fixes: QTBUG-113585
Change-Id: Icb0f8a094ed8eea38fb3ac954af318bb78f3a755
Reviewed-by: Topi Reiniö <topi.reinio@qt.io>
17 files changed, 254 insertions, 3 deletions
diff --git a/src/qdoc/qdoc/utilities.cpp b/src/qdoc/qdoc/utilities.cpp index d0f18338b..d4e1f8c07 100644 --- a/src/qdoc/qdoc/utilities.cpp +++ b/src/qdoc/qdoc/utilities.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 #include <QtCore/qprocess.h> +#include <QCryptographicHash> +#include "location.h" #include "utilities.h" QT_BEGIN_NAMESPACE @@ -96,11 +98,22 @@ QString comma(qsizetype wordPosition, qsizetype numberOfWords) */ QString canonicalizeFileName(const QString &name) { + auto legal_ascii = [](const uint value) { + const uint start_ascii_subset{ 32 }; + const uint end_ascii_subset{ 126 }; + + return value >= start_ascii_subset && value <= end_ascii_subset; + }; + QString result; bool begun = false; + bool has_non_alnum_content{ false }; const auto *data{name.constData()}; for (qsizetype i = 0; i < name.size(); ++i) { char16_t u{data[i].unicode()}; + if (!legal_ascii(u)) + has_non_alnum_content = true; + if (u >= 'A' && u <= 'Z') u += 'a' - 'A'; if ((u >= 'a' && u <= 'z') || (u >= '0' && u <= '9')) { @@ -114,6 +127,15 @@ QString canonicalizeFileName(const QString &name) if (result.endsWith(QLatin1Char('-'))) result.chop(1); + if (has_non_alnum_content) { + auto title_hash = QString::fromLocal8Bit( + QCryptographicHash::hash(name.toUtf8(), QCryptographicHash::Md5).toHex()); + title_hash.truncate(8); + if (!result.isEmpty()) + result.append(QLatin1Char('-')); + result.append(title_hash); + } + return result; } diff --git a/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html b/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html new file mode 100644 index 000000000..f52b65618 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div 
class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$圣马苏里拉.html-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page with non-latin characters in its name.</p> +</div> +<!-- @@@圣马苏里拉.html --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html b/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html new file mode 100644 index 000000000..ea52873e7 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$موزاريلا-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page with right-to-left script in its name.</p> +</div> +<!-- @@@موزاريلا --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml new file mode 100644 index 000000000..375c43732 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<WebXML> + <document> + <page name="圣马苏里拉.html" href="8b5c72eb.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""> + <description> + <para>This page exists solely to understand how QDoc will generate the file name for a page with non-latin characters in its name.</para> + </description> + </page> + </document> +</WebXML> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml 
b/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml new file mode 100644 index 000000000..beb4df518 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<WebXML> + <document> + <page name="موزاريلا" href="e85685de.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""> + <description> + <para>This page exists solely to understand how QDoc will generate the file name for a page with right-to-left script in its name.</para> + </description> + </page> + </document> +</WebXML> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml new file mode 100644 index 000000000..13ce91b72 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<WebXML> + <document> + <page name="桑塔mozzarella.html" href="mozzarella-7c883eff.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""> + <description> + <para>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name.</para> + </description> + </page> + </document> +</WebXML> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index index 613bb05a4..aa306d7eb 100644 --- a/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index @@ -10,5 +10,10 @@ <contents name="further-details" title="Further details" level="1"/> 
<contents name="ascii-characters-that-are-non-printable-ascii-such-as-or-521d09f0" title="Ascii characters that are non-printable ascii, such as ß, ü, or ø" level="2"/> </page> + <page name="SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm" href="seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""/> + <page name="santaموزاريلا.html" href="santa-14209312.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""/> + <page name="موزاريلا" href="e85685de.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""/> + <page name="圣马苏里拉.html" href="8b5c72eb.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""/> + <page name="桑塔mozzarella.html" href="mozzarella-7c883eff.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""/> </namespace> </INDEX> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml new file mode 100644 index 000000000..9d07c9da8 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<WebXML> + <document> + <page name="santaموزاريلا.html" href="santa-14209312.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""> + <description> + <para>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with 
right-to-left script in its name.</para> + </description> + </page> + </document> +</WebXML> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml new file mode 100644 index 000000000..8fe3c93ee --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="UTF-8"?> +<WebXML> + <document> + <page name="SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm" href="seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html" status="active" location="adventures_with_non_ascii_characters.qdoc" documented="true" subtype="page" title="" fulltitle="" subtitle=""> + <description> + <para>This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name.</para> + </description> + </page> + </document> +</WebXML> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html new file mode 100644 index 000000000..bb4b3651d --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$桑塔mozzarella.html-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name.</p> 
+</div> +<!-- @@@桑塔mozzarella.html --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html new file mode 100644 index 000000000..bb4b3651d --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$桑塔mozzarella.html-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name.</p> +</div> +<!-- @@@桑塔mozzarella.html --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html b/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html new file mode 100644 index 000000000..f40feed36 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$santaموزاريلا.html-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with right-to-left script in its name.</p> +</div> +<!-- @@@santaموزاريلا.html --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/santa.html b/tests/auto/qdoc/generatedoutput/expected_output/santa.html new file mode 100644 index 000000000..f40feed36 --- /dev/null +++ 
b/tests/auto/qdoc/generatedoutput/expected_output/santa.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$santaموزاريلا.html-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with right-to-left script in its name.</p> +</div> +<!-- @@@santaموزاريلا.html --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html new file mode 100644 index 000000000..16df49755 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name.</p> +</div> +<!-- @@@SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html 
b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html new file mode 100644 index 000000000..16df49755 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> +<!-- adventures_with_non_ascii_characters.qdoc --> + <title>NonAsciiCharacterInput</title> +</head> +<body> +<div class="sidebar"><div class="sidebar-content" id="sidebar-content"></div></div> +<!-- $$$SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm-description --> +<div class="descr" id="details"> +<p>This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name.</p> +</div> +<!-- @@@SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm --> +</body> +</html> diff --git a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc index b1f23adf7..c5f09cb1c 100644 --- a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc +++ b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc @@ -52,3 +52,38 @@ to such section titles works as expected. It's made a section2 to exercise the behavior for other section levels than 1. */ + +/*! + \page SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm + + This page exists solely to understand how QDoc will generate the file name + for a page with non-ascii-printable latin characters in its name. +*/ + +/*! + \page موزاريلا سانتا.html + + This page exists solely to understand how QDoc will generate the file name + for a page with right-to-left script in its name. 
+*/ + +/*! + \page 圣马苏里拉.html + + This page exists solely to understand how QDoc will generate the file name + for a page with non-latin characters in its name. +*/ + +/*! + \page santaموزاريلا.html + + This page exists solely to understand how QDoc will generate the file name + for a page that mixes printable ascii with right-to-left script in its name. +*/ + +/*! + \page 桑塔mozzarella.html + + This page exists solely to understand how QDoc will generate the file name + for a page that mixes printable ascii with non-latin characters in its name. +*/ diff --git a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp index 269739242..0e3b5a4ee 100644 --- a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp +++ b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp @@ -591,9 +591,20 @@ void tst_generatedOutput::proxyPage() void tst_generatedOutput::nonAsciiCharacterInput() { - testAndCompare("testdata/non_ascii_character_input/non_ascii_character_input.qdocconf", - "html/nonasciicharacterinput.index " - "adventures-with-non-ascii-characters.html"); + testAndCompare( + "testdata/non_ascii_character_input/non_ascii_character_input.qdocconf", + "html/nonasciicharacterinput.index " + "html/mozzarella-7c883eff.webxml " + "html/santa-14209312.webxml " + "html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml " + "html/8b5c72eb.webxml " + "html/e85685de.webxml " + "seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html " + "mozzarella-7c883eff.html " + "santa-14209312.html " + "8b5c72eb.html " + "e85685de.html " + "adventures-with-non-ascii-characters.html"); } |