diff options
author | Antoine <me@atelierhsn.com> | 2020-10-02 03:54:37 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-01 14:54:37 -0400 |
commit | 2e0962ef1412b0cd566331954aff82711ce93acf (patch) | |
tree | 45d6b44ffe98317d4b2b3960b71717d67e2e8e8a | |
parent | b701c34ebd7b2d0eb319517b9a275ddf0c89608d (diff) | |
download | python-markdown-2e0962ef1412b0cd566331954aff82711ce93acf.tar.gz |
Support unicode ids in toc (#970)
A second function, `slugify_unicode`, was added rather than changing the existing function so as to maintain backward compatibility. While an `encoding` parameter was added to the `slugify` function, we can't expect existing third-party functions to accept a third parameter. Therefore, the two-parameter API was preserved with this change.
-rw-r--r-- | docs/change_log/release-3.3.md | 7 | ||||
-rw-r--r-- | docs/extensions/toc.md | 3 | ||||
-rw-r--r-- | markdown/extensions/toc.py | 13 | ||||
-rw-r--r-- | tests/test_syntax/extensions/test_toc.py | 22 |
4 files changed, 41 insertions, 4 deletions
diff --git a/docs/change_log/release-3.3.md b/docs/change_log/release-3.3.md index ab7a7c6..cf4cce5 100644 --- a/docs/change_log/release-3.3.md +++ b/docs/change_log/release-3.3.md @@ -81,6 +81,13 @@ The following new features have been included in the 3.3 release: maintain the current behavior in the rebuilt Markdown in HTML extension. A few random edge-case bugs (see the included tests) were resolved in the process (#803). +* An alternate function `markdown.extensions.toc.slugify_unicode` has been included + with the [Table of Contents](../extensions/toc.md) extension which supports Unicode + characters in table of contents slugs. The old `markdown.extensions.toc.slugify` + method which removes non-ASCII characters remains the default. Import and pass + `markdown.extensions.toc.slugify_unicode` to the `slugify` configuration option + to use the new behavior. + ## Bug fixes The following bug fixes are included in the 3.3 release: diff --git a/docs/extensions/toc.md b/docs/extensions/toc.md index b068b29..8dce335 100644 --- a/docs/extensions/toc.md +++ b/docs/extensions/toc.md @@ -202,6 +202,9 @@ The following options are provided to configure the output: The callable must return a string appropriate for use in HTML `id` attributes. + An alternate version of the default callable supporting Unicode strings is also + provided as `markdown.extensions.toc.slugify_unicode`. + * **`separator`**: Word separator. Character which replaces white space in id. Defaults to "`-`". diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index b6cdc73..b2564c9 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -23,11 +23,16 @@ import unicodedata import xml.etree.ElementTree as etree -def slugify(value, separator): +def slugify(value, separator, encoding='ascii'): """ Slugify a string, to make it URL friendly. 
""" - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore') - value = re.sub(r'[^\w\s-]', '', value.decode('ascii')).strip().lower() - return re.sub(r'[%s\s]+' % separator, separator, value) + value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore') + value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower() + return re.sub(r'[{}\s]+'.format(separator), separator, value) + + +def slugify_unicode(value, separator): + """ Slugify a string, to make it URL friendly while preserving Unicode characters. """ + return slugify(value, separator, 'utf-8') IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$') diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 3fc9780..a3d050c 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -141,3 +141,25 @@ class TestTOC(TestCase): '</h1>', # noqa extensions=[TocExtension(permalink=True, permalink_title="")] ) + + def testPermalinkWithUnicodeInID(self): + from markdown.extensions.toc import slugify_unicode + self.assertMarkdownRenders( + '# Unicode ヘッダー', + '<h1 id="unicode-ヘッター">' # noqa + 'Unicode ヘッダー' # noqa + '<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">¶</a>' # noqa + '</h1>', # noqa + extensions=[TocExtension(permalink=True, slugify=slugify_unicode)] + ) + + def testPermalinkWithUnicodeTitle(self): + from markdown.extensions.toc import slugify_unicode + self.assertMarkdownRenders( + '# Unicode ヘッダー', + '<h1 id="unicode-ヘッター">' # noqa + 'Unicode ヘッダー' # noqa + '<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">¶</a>' # noqa + '</h1>', # noqa + extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)] + ) |