cmdline options: add --allow-unicode flag and correct --regex-pattern help text

author: Val Neekman (AvidCoder) <un33kvu@gmail.com> 2022-04-27 12:07:55 -0400
committer: Val Neekman (AvidCoder) <un33kvu@gmail.com> 2022-04-27 12:07:55 -0400
commit: c094c8a50371d6da08b782424ace5eca20943c8b (patch)
tree: e3d529d4956ed162cd3545f6170db08fcfba8199 /slugify
parent: 937779c77420f4acb8acd775bc2c35ed94f1393d (diff)
download: python-slugify-c094c8a50371d6da08b782424ace5eca20943c8b.tar.gz
5 files changed, 42 insertions, 39 deletions
diff --git a/slugify/__init__.py b/slugify/__init__.py
index 6c59f4e..ac21492 100644
--- a/slugify/__init__.py
+++ b/slugify/__init__.py
@@ -1,7 +1,2 @@
 from .special import *
 from .slugify import *
-
-
-__author__ = 'Val Neekman @ Neekware Inc. [@vneekman]'
-__description__ = 'A Python slugify application that also handles Unicode'
-__version__ = '5.0.2'
diff --git a/slugify/__main__.py b/slugify/__main__.py
index f815206..7dd6b01 100644
--- a/slugify/__main__.py
+++ b/slugify/__main__.py
@@ -31,11 +31,13 @@ def parse_args(argv):
     parser.add_argument("--stopwords", nargs='+',
                         help="Words to discount")
     parser.add_argument("--regex-pattern",
-                        help="Python regex pattern for allowed characters")
+                        help="Python regex pattern for disallowed characters")
     parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True,
                         help="Activate case sensitivity")
     parser.add_argument("--replacements", nargs='+',
                         help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
+    parser.add_argument("--allow-unicode", action='store_true', default=False,
+                        help="Allow unicode characters")
 
     args = parser.parse_args(argv[1:])
 
@@ -73,11 +75,12 @@ def slugify_params(args):
         separator=args.separator,
         stopwords=args.stopwords,
         lowercase=args.lowercase,
-        replacements=args.replacements
+        replacements=args.replacements,
+        allow_unicode=args.allow_unicode
     )
 
 
-def main(argv=None): # pragma: no cover
+def main(argv=None):  # pragma: no cover
     """ Run this program """
     if argv is None:
         argv = sys.argv
@@ -89,5 +92,5 @@ def main(argv=None): # pragma: no cover
         sys.exit(-1)
 
 
-if __name__ == '__main__': # pragma: no cover
+if __name__ == '__main__':  # pragma: no cover
     main()
diff --git a/slugify/__version__.py b/slugify/__version__.py
new file mode 100644
index 0000000..55abc97
--- /dev/null
+++ b/slugify/__version__.py
@@ -0,0 +1,8 @@
+__title__ = 'python-slugify'
+__author__ = 'Val Neekman'
+__author_email__ = 'info@neekware.com'
+__description__ = 'A Python slugify application that also handles Unicode'
+__url__ = 'https://github.com/un33k/python-slugify'
+__license__ = 'MIT'
+__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
+__version__ = '6.1.2'
diff --git a/slugify/slugify.py b/slugify/slugify.py
index bb3aa95..b8c02ad 100644
--- a/slugify/slugify.py
+++ b/slugify/slugify.py
@@ -1,17 +1,7 @@
 import re
-import unicodedata
-import types
 import sys
-
-try:
-    from htmlentitydefs import name2codepoint
-    _unicode = unicode
-    _unicode_type = types.UnicodeType
-except ImportError:
-    from html.entities import name2codepoint
-    _unicode = str
-    _unicode_type = str
-    unichr = chr
+import unicodedata
+from html.entities import name2codepoint
 
 try:
     import text_unidecode as unidecode
@@ -25,8 +15,8 @@ CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
 DECIMAL_PATTERN = re.compile(r'&#(\d+);')
 HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
 QUOTE_PATTERN = re.compile(r'[\']+')
-ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+')
-ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+')
+DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
+DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
 DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
 NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
 DEFAULT_SEPARATOR = '-'
@@ -69,14 +59,14 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav
             else:
                 if save_order:
                     break
-    if not truncated: # pragma: no cover
+    if not truncated:  # pragma: no cover
         truncated = string[:max_length]
     return truncated.strip(separator)
 
 
 def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
             separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
-            replacements=()):
+            replacements=(), allow_unicode=False):
     """
     Make a slug from the given text.
     :param text (str): initial text
@@ -88,9 +78,10 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
     :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
     :param separator (str): separator between words
     :param stopwords (iterable): words to discount
-    :param regex_pattern (str): regex pattern for allowed characters
+    :param regex_pattern (str): regex pattern for disallowed characters
     :param lowercase (bool): activate case sensitivity by setting it to False
     :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :param allow_unicode (bool): allow unicode characters
     :return (str):
     """
 
@@ -100,39 +91,44 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
             text = text.replace(old, new)
 
     # ensure text is unicode
-    if not isinstance(text, _unicode_type):
-        text = _unicode(text, 'utf-8', 'ignore')
+    if not isinstance(text, str):
+        text = str(text, 'utf-8', 'ignore')
 
     # replace quotes with dashes - pre-process
     text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
 
     # decode unicode
-    text = unidecode.unidecode(text)
+    if not allow_unicode:
+        text = unidecode.unidecode(text)
 
     # ensure text is still in unicode
-    if not isinstance(text, _unicode_type):
-        text = _unicode(text, 'utf-8', 'ignore')
+    if not isinstance(text, str):
+        text = str(text, 'utf-8', 'ignore')
 
     # character entity reference
     if entities:
-        text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)
+        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
 
     # decimal character reference
     if decimal:
         try:
-            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
+            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
         except Exception:
             pass
 
     # hexadecimal character reference
     if hexadecimal:
         try:
-            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
+            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
         except Exception:
             pass
 
     # translate
-    text = unicodedata.normalize('NFKD', text)
+    if allow_unicode:
+        text = unicodedata.normalize('NFKC', text)
+    else:
+        text = unicodedata.normalize('NFKD', text)
+
     if sys.version_info < (3,):
         text = text.encode('ascii', 'ignore')
 
@@ -147,10 +143,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
     text = NUMBERS_PATTERN.sub('', text)
 
     # replace all other unwanted characters
-    if lowercase:
-        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
+    if allow_unicode:
+        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
     else:
-        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
+        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
     text = re.sub(pattern, DEFAULT_SEPARATOR, text)
 
     # remove redundant
diff --git a/slugify/special.py b/slugify/special.py
index d3478d5..54eb85c 100644
--- a/slugify/special.py
+++ b/slugify/special.py
@@ -20,7 +20,7 @@ _CYRILLIC = [      # package defaults:
     (u'я', u'ya'),   # ia
     (u'х', u'h'),    # kh
     (u'у', u'y'),    # u
-    (u'щ', u'sch'),  # shch
+    (u'щ', u'sch'),  # sch
     (u'ю', u'u'),    # iu / yu
 ]
 CYRILLIC = add_uppercase_char(_CYRILLIC)
author	Val Neekman (AvidCoder) <un33kvu@gmail.com>	2022-04-27 12:07:55 -0400
committer	Val Neekman (AvidCoder) <un33kvu@gmail.com>	2022-04-27 12:07:55 -0400
commit	c094c8a50371d6da08b782424ace5eca20943c8b (patch)
tree	e3d529d4956ed162cd3545f6170db08fcfba8199 /slugify
parent	937779c77420f4acb8acd775bc2c35ed94f1393d (diff)
download	python-slugify-c094c8a50371d6da08b782424ace5eca20943c8b.tar.gz