diff options
| author | Val Neekman (AvidCoder) <un33kvu@gmail.com> | 2022-04-27 12:07:55 -0400 |
|---|---|---|
| committer | Val Neekman (AvidCoder) <un33kvu@gmail.com> | 2022-04-27 12:07:55 -0400 |
| commit | c094c8a50371d6da08b782424ace5eca20943c8b (patch) | |
| tree | e3d529d4956ed162cd3545f6170db08fcfba8199 /slugify | |
| parent | 937779c77420f4acb8acd775bc2c35ed94f1393d (diff) | |
| download | python-slugify-c094c8a50371d6da08b782424ace5eca20943c8b.tar.gz | |
cmdline options
Diffstat (limited to 'slugify')
| -rw-r--r-- | slugify/__init__.py | 5 | ||||
| -rw-r--r-- | slugify/__main__.py | 11 | ||||
| -rw-r--r-- | slugify/__version__.py | 8 | ||||
| -rw-r--r-- | slugify/slugify.py | 55 | ||||
| -rw-r--r-- | slugify/special.py | 2 |
5 files changed, 42 insertions, 39 deletions
diff --git a/slugify/__init__.py b/slugify/__init__.py index 6c59f4e..ac21492 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -1,7 +1,2 @@ from .special import * from .slugify import * - - -__author__ = 'Val Neekman @ Neekware Inc. [@vneekman]' -__description__ = 'A Python slugify application that also handles Unicode' -__version__ = '5.0.2' diff --git a/slugify/__main__.py b/slugify/__main__.py index f815206..7dd6b01 100644 --- a/slugify/__main__.py +++ b/slugify/__main__.py @@ -31,11 +31,13 @@ def parse_args(argv): parser.add_argument("--stopwords", nargs='+', help="Words to discount") parser.add_argument("--regex-pattern", - help="Python regex pattern for allowed characters") + help="Python regex pattern for disallowed characters") parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True, help="Activate case sensitivity") parser.add_argument("--replacements", nargs='+', help="""Additional replacement rules e.g. "|->or", "%%->percent".""") + parser.add_argument("--allow-unicode", action='store_true', default=False, + help="Allow unicode characters") args = parser.parse_args(argv[1:]) @@ -73,11 +75,12 @@ def slugify_params(args): separator=args.separator, stopwords=args.stopwords, lowercase=args.lowercase, - replacements=args.replacements + replacements=args.replacements, + allow_unicode=args.allow_unicode ) -def main(argv=None): # pragma: no cover +def main(argv=None): # pragma: no cover """ Run this program """ if argv is None: argv = sys.argv @@ -89,5 +92,5 @@ def main(argv=None): # pragma: no cover sys.exit(-1) -if __name__ == '__main__': # pragma: no cover +if __name__ == '__main__': # pragma: no cover main() diff --git a/slugify/__version__.py b/slugify/__version__.py new file mode 100644 index 0000000..55abc97 --- /dev/null +++ b/slugify/__version__.py @@ -0,0 +1,8 @@ +__title__ = 'python-slugify' +__author__ = 'Val Neekman' +__author_email__ = 'info@neekware.com' +__description__ = 'A Python slugify application that also handles Unicode' +__url__ = 'https://github.com/un33k/python-slugify' +__license__ = 'MIT' +__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.' +__version__ = '6.1.2' diff --git a/slugify/slugify.py b/slugify/slugify.py index bb3aa95..b8c02ad 100644 --- a/slugify/slugify.py +++ b/slugify/slugify.py @@ -1,17 +1,7 @@ import re -import unicodedata -import types import sys - -try: - from htmlentitydefs import name2codepoint - _unicode = unicode - _unicode_type = types.UnicodeType -except ImportError: - from html.entities import name2codepoint - _unicode = str - _unicode_type = str - unichr = chr +import unicodedata +from html.entities import name2codepoint try: import text_unidecode as unidecode @@ -25,8 +15,8 @@ CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint)) DECIMAL_PATTERN = re.compile(r'&#(\d+);') HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') QUOTE_PATTERN = re.compile(r'[\']+') -ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+') -ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+') +DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+') +DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+') DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') DEFAULT_SEPARATOR = '-' @@ -69,14 +59,14 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav else: if save_order: break - if not truncated: # pragma: no cover + if not truncated: # pragma: no cover truncated = string[:max_length] return truncated.strip(separator) def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True, - replacements=()): + replacements=(), allow_unicode=False): """ Make a slug from the given text. :param text (str): initial text @@ -88,9 +78,10 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order :param separator (str): separator between words :param stopwords (iterable): words to discount - :param regex_pattern (str): regex pattern for allowed characters + :param regex_pattern (str): regex pattern for disallowed characters :param lowercase (bool): activate case sensitivity by setting it to False :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']] + :param allow_unicode (bool): allow unicode characters :return (str): """ @@ -100,39 +91,44 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = text.replace(old, new) # ensure text is unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') + if not isinstance(text, str): + text = str(text, 'utf-8', 'ignore') # replace quotes with dashes - pre-process text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) # decode unicode - text = unidecode.unidecode(text) + if not allow_unicode: + text = unidecode.unidecode(text) # ensure text is still in unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') + if not isinstance(text, str): + text = str(text, 'utf-8', 'ignore') # character entity reference if entities: - text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) + text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text) # decimal character reference if decimal: try: - text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text) + text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text) except Exception: pass # hexadecimal character reference if hexadecimal: try: - text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text) + text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text) except Exception: pass # translate - text = unicodedata.normalize('NFKD', text) + if allow_unicode: + text = unicodedata.normalize('NFKC', text) + else: + text = unicodedata.normalize('NFKD', text) + if sys.version_info < (3,): text = text.encode('ascii', 'ignore') @@ -147,10 +143,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = NUMBERS_PATTERN.sub('', text) # replace all other unwanted characters - if lowercase: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN + if allow_unicode: + pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN else: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE + pattern = regex_pattern or DISALLOWED_CHARS_PATTERN + text = re.sub(pattern, DEFAULT_SEPARATOR, text) # remove redundant diff --git a/slugify/special.py b/slugify/special.py index d3478d5..54eb85c 100644 --- a/slugify/special.py +++ b/slugify/special.py @@ -20,7 +20,7 @@ _CYRILLIC = [ # package defaults: (u'я', u'ya'), # ia (u'х', u'h'), # kh (u'у', u'y'), # u - (u'щ', u'sch'), # shch + (u'щ', u'sch'), # sch (u'ю', u'u'), # iu / yu ] CYRILLIC = add_uppercase_char(_CYRILLIC) |
