From 52c8f1dbdd52f6c6e5e5419a55c456c832d8cdf4 Mon Sep 17 00:00:00 2001 From: Paul Wicking Date: Sun, 14 May 2023 07:25:48 +0200 Subject: QDoc: Append hash when normalizing non-ascii file names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When generating files, QDoc normalizes the string that's used for the file's name. This normalization is done by `Utilities::canonicalizeFileName`. The method returns a string stripped of non-alphanumeric characters, with spaces replaced by hyphens, and any repeating or trailing hyphens removed. This causes the removal of non-ascii-printable characters, such as a range of latin characters (e.g. 'ß', 'ø', etc), and any non-latin script (Arabic, Chinese, etc). If the file name, for example defined by the `\page` command, contains nothing but disallowed characters, the file simply isn't generated. However, QDoc doesn't warn the user in this case. This patch extends QDoc's generated output test to reproduce this issue. The test serves as proof of the misbehavior and as a regression discovery mechanism. The patch modifies `Utilities::canonicalizeFileName` such that it appends a hash to "canonical" titles that contain characters not considered legal in file names. For the purpose of compatibility across file systems, legal characters are considered lowercase a-z, the digits 0-9, and the hyphen character. Other symbols and characters are still removed. However, when encountering characters that are not part of a subset of ascii-printable characters (ascii decimal 32-126, inclusive), QDoc now appends a hash of the original file name string to the string it returns as "canonicalized". [ChangeLog][QDoc] QDoc now appends a hash of the original file name to the file name(s) of files where the name contains non-ascii characters. This means QDoc now generates files for pages with names written in non-latin characters. 
Fixes: QTBUG-113585 Change-Id: Icb0f8a094ed8eea38fb3ac954af318bb78f3a755 Reviewed-by: Topi Reiniö --- src/qdoc/qdoc/utilities.cpp | 22 ++++++++++++++ .../generatedoutput/expected_output/8b5c72eb.html | 16 ++++++++++ .../generatedoutput/expected_output/e85685de.html | 16 ++++++++++ .../expected_output/html/8b5c72eb.webxml | 10 +++++++ .../expected_output/html/e85685de.webxml | 10 +++++++ .../html/mozzarella-7c883eff.webxml | 10 +++++++ .../html/nonasciicharacterinput.index | 5 ++++ .../expected_output/html/santa-14209312.webxml | 10 +++++++ ...hstaben-im-titel-berschrift-htm-bfa91582.webxml | 10 +++++++ .../expected_output/mozzarella-7c883eff.html | 16 ++++++++++ .../expected_output/mozzarella.html | 16 ++++++++++ .../expected_output/santa-14209312.html | 16 ++++++++++ .../generatedoutput/expected_output/santa.html | 16 ++++++++++ ...uchstaben-im-titel-berschrift-htm-bfa91582.html | 16 ++++++++++ ...ich-gro-buchstaben-im-titel-berschrift-htm.html | 16 ++++++++++ .../adventures_with_non_ascii_characters.qdoc | 35 ++++++++++++++++++++++ .../qdoc/generatedoutput/tst_generatedoutput.cpp | 17 +++++++++-- 17 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/e85685de.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html create mode 100644 
tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/santa.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html create mode 100644 tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html diff --git a/src/qdoc/qdoc/utilities.cpp b/src/qdoc/qdoc/utilities.cpp index d0f18338b..d4e1f8c07 100644 --- a/src/qdoc/qdoc/utilities.cpp +++ b/src/qdoc/qdoc/utilities.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 #include +#include +#include "location.h" #include "utilities.h" QT_BEGIN_NAMESPACE @@ -96,11 +98,22 @@ QString comma(qsizetype wordPosition, qsizetype numberOfWords) */ QString canonicalizeFileName(const QString &name) { + auto legal_ascii = [](const uint value) { + const uint start_ascii_subset{ 32 }; + const uint end_ascii_subset{ 126 }; + + return value >= start_ascii_subset && value <= end_ascii_subset; + }; + QString result; bool begun = false; + bool has_non_alnum_content{ false }; const auto *data{name.constData()}; for (qsizetype i = 0; i < name.size(); ++i) { char16_t u{data[i].unicode()}; + if (!legal_ascii(u)) + has_non_alnum_content = true; + if (u >= 'A' && u <= 'Z') u += 'a' - 'A'; if ((u >= 'a' && u <= 'z') || (u >= '0' && u <= '9')) { @@ -114,6 +127,15 @@ QString canonicalizeFileName(const QString &name) if (result.endsWith(QLatin1Char('-'))) result.chop(1); + if (has_non_alnum_content) { + auto title_hash = QString::fromLocal8Bit( + QCryptographicHash::hash(name.toUtf8(), QCryptographicHash::Md5).toHex()); + title_hash.truncate(8); + if (!result.isEmpty()) + result.append(QLatin1Char('-')); + result.append(title_hash); + } + return result; } diff --git 
a/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html b/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html new file mode 100644 index 000000000..f52b65618 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/8b5c72eb.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page with non-latin characters in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html b/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html new file mode 100644 index 000000000..ea52873e7 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/e85685de.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page with right-to-left script in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml new file mode 100644 index 000000000..375c43732 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/8b5c72eb.webxml @@ -0,0 +1,10 @@ + + + + + + This page exists solely to understand how QDoc will generate the file name for a page with non-latin characters in its name. + + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml new file mode 100644 index 000000000..beb4df518 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/e85685de.webxml @@ -0,0 +1,10 @@ + + + + + + This page exists solely to understand how QDoc will generate the file name for a page with right-to-left script in its name. + + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml new file mode 100644 index 000000000..13ce91b72 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/mozzarella-7c883eff.webxml @@ -0,0 +1,10 @@ + + + + + + This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name. 
+ + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index index 613bb05a4..aa306d7eb 100644 --- a/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/nonasciicharacterinput.index @@ -10,5 +10,10 @@ + + + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml new file mode 100644 index 000000000..9d07c9da8 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/santa-14209312.webxml @@ -0,0 +1,10 @@ + + + + + + This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with right-to-left script in its name. + + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml b/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml new file mode 100644 index 000000000..8fe3c93ee --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml @@ -0,0 +1,10 @@ + + + + + + This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name. + + + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html new file mode 100644 index 000000000..bb4b3651d --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella-7c883eff.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html new file mode 100644 index 000000000..bb4b3651d --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/mozzarella.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with non-latin characters in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html b/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html new file mode 100644 index 000000000..f40feed36 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/santa-14209312.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with right-to-left script in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/santa.html b/tests/auto/qdoc/generatedoutput/expected_output/santa.html new file mode 100644 index 000000000..f40feed36 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/santa.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page that mixes printable ascii with right-to-left script in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html new file mode 100644 index 000000000..16df49755 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html new file mode 100644 index 000000000..16df49755 --- /dev/null +++ b/tests/auto/qdoc/generatedoutput/expected_output/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm.html @@ -0,0 +1,16 @@ + + + + + + NonAsciiCharacterInput + + + + +
+

This page exists solely to understand how QDoc will generate the file name for a page with non-ascii-printable latin characters in its name.

+
+ + + diff --git a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc index b1f23adf7..c5f09cb1c 100644 --- a/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc +++ b/tests/auto/qdoc/generatedoutput/testdata/non_ascii_character_input/adventures_with_non_ascii_characters.qdoc @@ -52,3 +52,38 @@ to such section titles works as expected. It's made a section2 to exercise the behavior for other section levels than 1. */ + +/*! + \page SEITE_MIT_AUSSCHLIEßLICH_GROßBUCHSTABEN_IM_TITEL_ÜBERSCHRIFT.htm + + This page exists solely to understand how QDoc will generate the file name + for a page with non-ascii-printable latin characters in its name. +*/ + +/*! + \page موزاريلا سانتا.html + + This page exists solely to understand how QDoc will generate the file name + for a page with right-to-left script in its name. +*/ + +/*! + \page 圣马苏里拉.html + + This page exists solely to understand how QDoc will generate the file name + for a page with non-latin characters in its name. +*/ + +/*! + \page santaموزاريلا.html + + This page exists solely to understand how QDoc will generate the file name + for a page that mixes printable ascii with right-to-left script in its name. +*/ + +/*! + \page 桑塔mozzarella.html + + This page exists solely to understand how QDoc will generate the file name + for a page that mixes printable ascii with non-latin characters in its name. 
+*/ diff --git a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp index 269739242..0e3b5a4ee 100644 --- a/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp +++ b/tests/auto/qdoc/generatedoutput/tst_generatedoutput.cpp @@ -591,9 +591,20 @@ void tst_generatedOutput::proxyPage() void tst_generatedOutput::nonAsciiCharacterInput() { - testAndCompare("testdata/non_ascii_character_input/non_ascii_character_input.qdocconf", - "html/nonasciicharacterinput.index " - "adventures-with-non-ascii-characters.html"); + testAndCompare( + "testdata/non_ascii_character_input/non_ascii_character_input.qdocconf", + "html/nonasciicharacterinput.index " + "html/mozzarella-7c883eff.webxml " + "html/santa-14209312.webxml " + "html/seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.webxml " + "html/8b5c72eb.webxml " + "html/e85685de.webxml " + "seite-mit-ausschlie-lich-gro-buchstaben-im-titel-berschrift-htm-bfa91582.html " + "mozzarella-7c883eff.html " + "santa-14209312.html " + "8b5c72eb.html " + "e85685de.html " + "adventures-with-non-ascii-characters.html"); } -- cgit v1.2.1