summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Reza Moradi <mrezzamoradi@gmail.com> 2022-02-22 20:05:08 +0100
committer GitHub <noreply@github.com> 2022-02-22 14:05:08 -0500
commit d968ca7419e6f4e40685888c56d03bea50fd39d7 (patch)
tree c26388f34c2e91260fed09a0369f88c8791f94b6
parent 07b87da81140cf51e4a585e43d4fe9113f4c2ad5 (diff)
download python-slugify-d968ca7419e6f4e40685888c56d03bea50fd39d7.tar.gz
allow unicode (#111) (tag: v6.1.0)
* initial commit to allow unicode
* update version and changelog
* add the flag to the CLI
* update README.md
-rw-r--r-- CHANGELOG.md 4
-rw-r--r-- README.md 16
-rw-r--r-- slugify/__main__.py 5
-rw-r--r-- slugify/__version__.py 2
-rw-r--r-- slugify/slugify.py 20
-rw-r--r-- test.py 288
6 files changed, 328 insertions, 7 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95ad243..49f88dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 6.1.0
+
+- Add `allow_unicode` flag to allow unicode characters in the slug
+
## 6.0.1
- Rework regex_pattern to mean the opposite (disallowed chars instead of allowed)
diff --git a/README.md b/README.md
index 11e20da..f93afee 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,8 @@ def slugify(
stopwords=(),
regex_pattern=None,
lowercase=True,
- replacements=()
+ replacements=(),
+ allow_unicode=False
):
"""
Make a slug from the given text.
@@ -58,6 +59,7 @@ def slugify(
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+ :param allow_unicode (bool): allow unicode characters
:return (str): slugify text
"""
```
@@ -75,6 +77,10 @@ txt = '影師嗎'
r = slugify(txt)
self.assertEqual(r, "ying-shi-ma")
+txt = '影師嗎'
+r = slugify(txt, allow_unicode=True)
+self.assertEqual(r, "影師嗎")
+
txt = 'C\'est déjà l\'été.'
r = slugify(txt)
self.assertEqual(r, "c-est-deja-l-ete")
@@ -133,6 +139,14 @@ txt = 'ÜBER Über German Umlaut'
r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']])
self.assertEqual(r, "ueber-ueber-german-umlaut")
+txt = 'i love 🦄'
+r = slugify(txt, allow_unicode=True)
+self.assertEqual(r, "i-love")
+
+txt = 'i love 🦄'
+r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
+self.assertEqual(r, "🦄")
+
```
For more examples, have a look at the [test.py](test.py) file.
diff --git a/slugify/__main__.py b/slugify/__main__.py
index 5a888fe..7dd6b01 100644
--- a/slugify/__main__.py
+++ b/slugify/__main__.py
@@ -36,6 +36,8 @@ def parse_args(argv):
help="Activate case sensitivity")
parser.add_argument("--replacements", nargs='+',
help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
+ parser.add_argument("--allow-unicode", action='store_true', default=False,
+ help="Allow unicode characters")
args = parser.parse_args(argv[1:])
@@ -73,7 +75,8 @@ def slugify_params(args):
separator=args.separator,
stopwords=args.stopwords,
lowercase=args.lowercase,
- replacements=args.replacements
+ replacements=args.replacements,
+ allow_unicode=args.allow_unicode
)
diff --git a/slugify/__version__.py b/slugify/__version__.py
index 1eedf44..e14e887 100644
--- a/slugify/__version__.py
+++ b/slugify/__version__.py
@@ -5,4 +5,4 @@ __description__ = 'A Python slugify application that also handles Unicode'
__url__ = 'https://github.com/un33k/python-slugify'
__license__ = 'MIT'
__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
-__version__ = '6.0.1'
+__version__ = '6.1.0'
diff --git a/slugify/slugify.py b/slugify/slugify.py
index 190ea92..ae6c9b6 100644
--- a/slugify/slugify.py
+++ b/slugify/slugify.py
@@ -17,6 +17,7 @@ DECIMAL_PATTERN = re.compile(r'&#(\d+);')
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
QUOTE_PATTERN = re.compile(r'[\']+')
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
+DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
DEFAULT_SEPARATOR = '-'
@@ -66,7 +67,8 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
- replacements: typing.Iterable[typing.Iterable[str]] = ()):
+ replacements: typing.Iterable[typing.Iterable[str]] = (),
+ allow_unicode=False):
"""
Make a slug from the given text.
:param text (str): initial text
@@ -81,6 +83,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+ :param allow_unicode (bool): allow unicode characters
:return (str):
"""
@@ -97,7 +100,8 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
# decode unicode
- text = unidecode.unidecode(text)
+ if not allow_unicode:
+ text = unidecode.unidecode(text)
# ensure text is still in unicode
if not isinstance(text, str):
@@ -122,7 +126,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
pass
# translate
- text = unicodedata.normalize('NFKD', text)
+ if allow_unicode:
+ text = unicodedata.normalize('NFKC', text)
+ else:
+ text = unicodedata.normalize('NFKD', text)
+
if sys.version_info < (3,):
text = text.encode('ascii', 'ignore')
@@ -137,7 +145,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
text = NUMBERS_PATTERN.sub('', text)
# replace all other unwanted characters
- pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+ if allow_unicode:
+ pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
+ else:
+ pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
text = re.sub(pattern, DEFAULT_SEPARATOR, text)
# remove redundant
diff --git a/test.py b/test.py
index 752c499..931f38f 100644
--- a/test.py
+++ b/test.py
@@ -233,6 +233,294 @@ class TestSlugify(unittest.TestCase):
self.assertEqual(r, "ueber-ueber-german-umlaut")
+class TestSlugifyUnicode(unittest.TestCase):
+
+ def test_extraneous_seperators(self):
+
+ txt = "This is a test ---"
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "this-is-a-test")
+
+ txt = "___This is a test ---"
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "this-is-a-test")
+
+ txt = "___This is a test___"
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "this-is-a-test")
+
+ def test_non_word_characters(self):
+ txt = "This -- is a ## test ---"
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "this-is-a-test")
+
+ def test_phonetic_conversion_of_eastern_scripts(self):
+ txt = '影師嗎'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, txt)
+
+ def test_accented_text(self):
+ txt = 'C\'est déjà l\'été.'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "c-est-déjà-l-été")
+
+ txt = 'Nín hǎo. Wǒ shì zhōng guó rén'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "nín-hǎo-wǒ-shì-zhōng-guó-rén")
+
+ def test_accented_text_with_non_word_characters(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "jaja-lol-méméméoo-a")
+
+ def test_cyrillic_text(self):
+ txt = 'Компьютер'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "компьютер")
+
+ def test_max_length(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=9)
+ self.assertEqual(r, "jaja-lol")
+
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=15)
+ self.assertEqual(r, "jaja-lol-mémémé")
+
+ def test_max_length_cutoff_not_required(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=50)
+ self.assertEqual(r, "jaja-lol-méméméoo-a")
+
+ def test_word_boundary(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=15, word_boundary=True)
+ self.assertEqual(r, "jaja-lol-a")
+
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=17, word_boundary=True)
+ self.assertEqual(r, "jaja-lol-méméméoo")
+
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=18, word_boundary=True)
+ self.assertEqual(r, "jaja-lol-méméméoo")
+
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=19, word_boundary=True)
+ self.assertEqual(r, "jaja-lol-méméméoo-a")
+
+ def test_custom_separator(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator=".")
+ self.assertEqual(r, "jaja.lol.méméméoo.a")
+
+ def test_multi_character_separator(self):
+ txt = 'jaja---lol-méméméoo--a'
+ r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator="ZZZZZZ")
+ self.assertEqual(r, "jajaZZZZZZlolZZZZZZméméméooZZZZZZa")
+
+ def test_save_order(self):
+ txt = 'one two three four five'
+ r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=True)
+ self.assertEqual(r, "one-two-three")
+
+ txt = 'one two three four five'
+ r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=False)
+ self.assertEqual(r, "one-two-three")
+
+ txt = 'one two three four five'
+ r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=False)
+ self.assertEqual(r, "one-two-four")
+
+ txt = 'one two three four five'
+ r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=True)
+ self.assertEqual(r, "one-two")
+
+ def test_save_order_rtl(self):
+ """For right-to-left unicode languages"""
+ txt = 'دو سه چهار پنج'
+ r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=True)
+ self.assertEqual(r, "دو-سه-چهار")
+
+ txt = 'دو سه چهار پنج'
+ r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=False)
+ self.assertEqual(r, "دو-سه-چهار")
+
+ txt = 'دو سه چهار پنج'
+ r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=False)
+ self.assertEqual(r, "دو-سه-پنج")
+
+ txt = 'دو سه چهار پنج'
+ r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=True)
+ self.assertEqual(r, "دو-سه")
+
+ def test_stopword_removal(self):
+ txt = 'this has a stopword'
+ r = slugify(txt, allow_unicode=True, stopwords=['stopword'])
+ self.assertEqual(r, 'this-has-a')
+
+ txt = 'this has a Öländ'
+ r = slugify(txt, allow_unicode=True, stopwords=['Öländ'])
+ self.assertEqual(r, 'this-has-a')
+
+ def test_stopword_removal_casesensitive(self):
+ txt = 'thIs Has a stopword Stopword'
+ r = slugify(txt, allow_unicode=True, stopwords=['Stopword'], lowercase=False)
+ self.assertEqual(r, 'thIs-Has-a-stopword')
+
+ txt = 'thIs Has a öländ Öländ'
+ r = slugify(txt, allow_unicode=True, stopwords=['Öländ'], lowercase=False)
+ self.assertEqual(r, 'thIs-Has-a-öländ')
+
+ def test_multiple_stopword_occurances(self):
+ txt = 'the quick brown fox jumps over the lazy dog'
+ r = slugify(txt, allow_unicode=True, stopwords=['the'])
+ self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog')
+
+ def test_differently_cased_stopword_match(self):
+ txt = 'Foo A FOO B foo C'
+ r = slugify(txt, allow_unicode=True, stopwords=['foo'])
+ self.assertEqual(r, 'a-b-c')
+
+ txt = 'Foo A FOO B foo C'
+ r = slugify(txt, allow_unicode=True, stopwords=['FOO'])
+ self.assertEqual(r, 'a-b-c')
+
+ def test_multiple_stopwords(self):
+ txt = 'the quick brown fox jumps over the lazy dog in a hurry'
+ r = slugify(txt, allow_unicode=True, stopwords=['the', 'in', 'a', 'hurry'])
+ self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog')
+
+ def test_stopwords_with_different_separator(self):
+ txt = 'the quick brown fox jumps over the lazy dog'
+ r = slugify(txt, allow_unicode=True, stopwords=['the'], separator=' ')
+ self.assertEqual(r, 'quick brown fox jumps over lazy dog')
+
+ def test_html_entities_on(self):
+ txt = 'foo &amp; bar'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, 'foo-bar')
+
+ def test_html_entities_off(self):
+ txt = 'foo &amp; bår'
+ r = slugify(txt, allow_unicode=True, entities=False)
+ self.assertEqual(r, 'foo-amp-bår')
+
+ def test_html_decimal_on(self):
+ txt = '&#381;'
+ r = slugify(txt, allow_unicode=True, decimal=True)
+ self.assertEqual(r, 'ž')
+
+ def test_html_decimal_off(self):
+ txt = '&#381;'
+ r = slugify(txt, allow_unicode=True, entities=False, decimal=False)
+ self.assertEqual(r, '381')
+
+ def test_html_hexadecimal_on(self):
+ txt = '&#x17D;'
+ r = slugify(txt, allow_unicode=True, hexadecimal=True)
+ self.assertEqual(r, 'ž')
+
+ def test_html_hexadecimal_off(self):
+ txt = '&#x17D;'
+ r = slugify(txt, allow_unicode=True, hexadecimal=False)
+ self.assertEqual(r, 'x17d')
+
+ def test_starts_with_number(self):
+ txt = '10 amazing secrets'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, '10-amazing-secrets')
+
+ def test_contains_numbers(self):
+ txt = 'buildings with 1000 windows'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, 'buildings-with-1000-windows')
+
+ def test_ends_with_number(self):
+ txt = 'recipe number 3'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, 'recipe-number-3')
+
+ def test_numbers_only(self):
+ txt = '404'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, '404')
+
+ def test_numbers_and_symbols(self):
+ txt = '1,000 reasons you are #1'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, '1000-reasons-you-are-1')
+
+ txt = '۱,۰۰۰ reasons you are #۱'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, '۱۰۰۰-reasons-you-are-۱')
+
+ def test_regex_pattern_keep_underscore(self):
+ """allowing unicode should not overrule the passed regex_pattern"""
+ txt = "___This is a test___"
+ regex_pattern = r'[^-a-z0-9_]+'
+ r = slugify(txt, allow_unicode=True, regex_pattern=regex_pattern)
+ self.assertEqual(r, "___this-is-a-test___")
+
+ def test_regex_pattern_keep_underscore_with_underscore_as_separator(self):
+ """
+ The regex_pattern turns the power to the caller.
+ Hence, the caller must ensure that a custom separator doesn't clash
+ with the regex_pattern.
+ """
+ txt = "___This is a test___"
+ regex_pattern = r'[^-a-z0-9_]+'
+ r = slugify(txt, allow_unicode=True, separator='_', regex_pattern=regex_pattern)
+ self.assertNotEqual(r, "_this_is_a_test_")
+
+ def test_replacements(self):
+ txt = '10 | 20 %'
+ r = slugify(txt, allow_unicode=True, replacements=[['|', 'or'], ['%', 'percent']])
+ self.assertEqual(r, "10-or-20-percent")
+
+ txt = 'I ♥ 🦄'
+ r = slugify(txt, allow_unicode=True, replacements=[['♥', 'amour'], ['🦄', 'licorne']])
+ self.assertEqual(r, "i-amour-licorne")
+
+ txt = 'I ♥ 🦄'
+ r = slugify(txt, allow_unicode=True, replacements=[['♥', 'სიყვარული'], ['🦄', 'licorne']])
+ self.assertEqual(r, "i-სიყვარული-licorne")
+
+ def test_replacements_german_umlaut_custom(self):
+ txt = 'ÜBER Über German Umlaut'
+ r = slugify(txt, allow_unicode=True, replacements=[['Ü', 'UE'], ['ü', 'ue']])
+ self.assertEqual(r, "ueber-ueber-german-umlaut")
+
+ def test_emojis(self):
+ """
+ allowing unicode shouldn't allow emojis, even in replacements.
+ the only exception is when it is allowed by the regex_pattern. regex_pattern overrules all
+ """
+ txt = 'i love 🦄'
+ r = slugify(txt, allow_unicode=True)
+ self.assertEqual(r, "i-love")
+
+ txt = 'i love 🦄'
+ r = slugify(txt, allow_unicode=True, decimal=True)
+ self.assertEqual(r, "i-love")
+
+ txt = 'i love 🦄'
+ r = slugify(txt, allow_unicode=True, hexadecimal=True)
+ self.assertEqual(r, "i-love")
+
+ txt = 'i love 🦄'
+ r = slugify(txt, allow_unicode=True, entities=True)
+ self.assertEqual(r, "i-love")
+
+ txt = 'i love you'
+ r = slugify(txt, allow_unicode=True, replacements=[['you', '🦄']])
+ self.assertEqual(r, "i-love")
+
+ txt = 'i love 🦄'
+ r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
+ self.assertEqual(r, "🦄")
+
+
class TestUtils(unittest.TestCase):
def test_smart_truncate_no_max_length(self):