diff options
Diffstat (limited to 'logilab/common/textutils.py')
-rw-r--r-- | logilab/common/textutils.py | 224 |
1 files changed, 122 insertions, 102 deletions
diff --git a/logilab/common/textutils.py b/logilab/common/textutils.py index 4b6ea98..b988c7a 100644 --- a/logilab/common/textutils.py +++ b/logilab/common/textutils.py @@ -50,33 +50,37 @@ from re import Pattern, Match from warnings import warn from unicodedata import normalize as _uninormalize from typing import Any, Optional, Tuple, List, Callable, Dict, Union + try: from os import linesep except ImportError: - linesep = '\n' # gae + linesep = "\n" # gae from logilab.common.deprecation import deprecated MANUAL_UNICODE_MAP = { - u'\xa1': u'!', # INVERTED EXCLAMATION MARK - u'\u0142': u'l', # LATIN SMALL LETTER L WITH STROKE - u'\u2044': u'/', # FRACTION SLASH - u'\xc6': u'AE', # LATIN CAPITAL LETTER AE - u'\xa9': u'(c)', # COPYRIGHT SIGN - u'\xab': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - u'\xe6': u'ae', # LATIN SMALL LETTER AE - u'\xae': u'(r)', # REGISTERED SIGN - u'\u0153': u'oe', # LATIN SMALL LIGATURE OE - u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE - u'\xd8': u'O', # LATIN CAPITAL LETTER O WITH STROKE - u'\xf8': u'o', # LATIN SMALL LETTER O WITH STROKE - u'\xbb': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - u'\xdf': u'ss', # LATIN SMALL LETTER SHARP S - u'\u2013': u'-', # HYPHEN - u'\u2019': u"'", # SIMPLE QUOTE - } - -def unormalize(ustring: str, ignorenonascii: Optional[Any] = None, substitute: Optional[str] = None) -> str: + "\xa1": "!", # INVERTED EXCLAMATION MARK + "\u0142": "l", # LATIN SMALL LETTER L WITH STROKE + "\u2044": "/", # FRACTION SLASH + "\xc6": "AE", # LATIN CAPITAL LETTER AE + "\xa9": "(c)", # COPYRIGHT SIGN + "\xab": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + "\xe6": "ae", # LATIN SMALL LETTER AE + "\xae": "(r)", # REGISTERED SIGN + "\u0153": "oe", # LATIN SMALL LIGATURE OE + "\u0152": "OE", # LATIN CAPITAL LIGATURE OE + "\xd8": "O", # LATIN CAPITAL LETTER O WITH STROKE + "\xf8": "o", # LATIN SMALL LETTER O WITH STROKE + "\xbb": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + "\xdf": "ss", # LATIN SMALL LETTER SHARP S + "\u2013": "-", # HYPHEN + "\u2019": "'", # SIMPLE QUOTE +} + + +def unormalize( + ustring: str, ignorenonascii: Optional[Any] = None, substitute: Optional[str] = None +) -> str: """replace diacritical characters with their corresponding ascii characters Convert the unicode string to its long normalized form (unicode character @@ -92,22 +96,26 @@ def unormalize(ustring: str, ignorenonascii: Optional[Any] = None, substitute: O """ # backward compatibility, ignorenonascii was a boolean if ignorenonascii is not None: - warn("ignorenonascii is deprecated, use substitute named parameter instead", - DeprecationWarning, stacklevel=2) + warn( + "ignorenonascii is deprecated, use substitute named parameter instead", + DeprecationWarning, + stacklevel=2, + ) if ignorenonascii: - substitute = '' + substitute = "" res = [] for letter in ustring[:]: try: replacement = MANUAL_UNICODE_MAP[letter] except KeyError: - replacement = _uninormalize('NFKD', letter)[0] + replacement = _uninormalize("NFKD", letter)[0] if ord(replacement) >= 2 ** 7: if substitute is None: raise ValueError("can't deal with non-ascii based characters") replacement = substitute res.append(replacement) - return u''.join(res) + return "".join(res) + def unquote(string: str) -> str: """remove optional quotes (simple or double) from the string @@ -120,17 +128,18 @@ def unquote(string: str) -> str: """ if not string: return string - if string[0] in '"\'': + if string[0] in "\"'": string = string[1:] - if string[-1] in '"\'': + if string[-1] in "\"'": string = string[:-1] return string -_BLANKLINES_RGX = re.compile('\r?\n\r?\n') -_NORM_SPACES_RGX = re.compile('\s+') +_BLANKLINES_RGX = re.compile("\r?\n\r?\n") +_NORM_SPACES_RGX = re.compile("\s+") + -def normalize_text(text: str, line_len: int = 80, indent: str = '', rest: bool = False) -> str: +def normalize_text(text: str, line_len: int = 80, indent: str = "", rest: bool = False) -> str: """normalize a text to display it with a maximum line size and optionally arbitrary indentation. Line jumps are normalized but blank lines are kept. The indentation string may be used to insert a @@ -158,10 +167,10 @@ def normalize_text(text: str, line_len: int = 80, indent: str = '', rest: bool = result = [] for text in _BLANKLINES_RGX.split(text): result.append(normp(text, line_len, indent)) - return ('%s%s%s' % (linesep, indent, linesep)).join(result) + return ("%s%s%s" % (linesep, indent, linesep)).join(result) -def normalize_paragraph(text: str, line_len: int = 80, indent: str = '') -> str: +def normalize_paragraph(text: str, line_len: int = 80, indent: str = "") -> str: """normalize a text to display it with a maximum line size and optionally arbitrary indentation. Line jumps are normalized. The indentation string may be used top insert a comment mark for @@ -182,7 +191,7 @@ def normalize_paragraph(text: str, line_len: int = 80, indent: str = '') -> str: inferior to `line_len`, and optionally prefixed by an indentation string """ - text = _NORM_SPACES_RGX.sub(' ', text) + text = _NORM_SPACES_RGX.sub(" ", text) line_len = line_len - len(indent) lines = [] while text: @@ -190,7 +199,8 @@ def normalize_paragraph(text: str, line_len: int = 80, indent: str = '') -> str: lines.append(indent + aline) return linesep.join(lines) -def normalize_rest_paragraph(text: str, line_len: int = 80, indent: str = '') -> str: + +def normalize_rest_paragraph(text: str, line_len: int = 80, indent: str = "") -> str: """normalize a ReST text to display it with a maximum line size and optionally arbitrary indentation. Line jumps are normalized. The indentation string may be used top insert a comment mark for @@ -211,21 +221,21 @@ def normalize_rest_paragraph(text: str, line_len: int = 80, indent: str = '') -> inferior to `line_len`, and optionally prefixed by an indentation string """ - toreport = '' + toreport = "" lines = [] line_len = line_len - len(indent) for line in text.splitlines(): - line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip()) - toreport = '' + line = toreport + _NORM_SPACES_RGX.sub(" ", line.strip()) + toreport = "" while len(line) > line_len: # too long line, need split line, toreport = splittext(line, line_len) lines.append(indent + line) if toreport: - line = toreport + ' ' - toreport = '' + line = toreport + " " + toreport = "" else: - line = '' + line = "" if line: lines.append(indent + line.strip()) return linesep.join(lines) @@ -239,18 +249,18 @@ def splittext(text: str, line_len: int) -> Tuple[str, str]: * the rest of the text which has to be reported on another line """ if len(text) <= line_len: - return text, '' - pos = min(len(text)-1, line_len) - while pos > 0 and text[pos] != ' ': + return text, "" + pos = min(len(text) - 1, line_len) + while pos > 0 and text[pos] != " ": pos -= 1 if pos == 0: pos = min(len(text), line_len) - while len(text) > pos and text[pos] != ' ': + while len(text) > pos and text[pos] != " ": pos += 1 - return text[:pos], text[pos+1:].strip() + return text[:pos], text[pos + 1 :].strip() -def splitstrip(string: str, sep: str = ',') -> List[str]: +def splitstrip(string: str, sep: str = ",") -> List[str]: """return a list of stripped string by splitting the string given as argument on `sep` (',' by default). Empty string are discarded. @@ -271,15 +281,16 @@ def splitstrip(string: str, sep: str = ',') -> List[str]: """ return [word.strip() for word in string.split(sep) if word.strip()] -get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip) + +get_csv = deprecated("get_csv is deprecated, use splitstrip")(splitstrip) def split_url_or_path(url_or_path): """return the latest component of a string containing either an url of the form <scheme>://<path> or a local file system path """ - if '://' in url_or_path: - return url_or_path.rstrip('/').rsplit('/', 1) + if "://" in url_or_path: + return url_or_path.rstrip("/").rsplit("/", 1) return osp.split(url_or_path.rstrip(osp.sep)) @@ -303,8 +314,8 @@ def text_to_dict(text): return res for line in text.splitlines(): line = line.strip() - if line and not line.startswith('#'): - key, value = [w.strip() for w in line.split('=', 1)] + if line and not line.startswith("#"): + key, value = [w.strip() for w in line.split("=", 1)] if key in res: try: res[key].append(value) @@ -315,13 +326,12 @@ def text_to_dict(text): return res -_BLANK_URE = r'(\s|,)+' +_BLANK_URE = r"(\s|,)+" _BLANK_RE = re.compile(_BLANK_URE) -__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))' -__UNITS_URE = r'[a-zA-Z]+' -_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE)) -_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE, - __VALUE_URE)) +__VALUE_URE = r"-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))" +__UNITS_URE = r"[a-zA-Z]+" +_VALUE_RE = re.compile(r"(?P<value>%s)(?P<unit>%s)?" % (__VALUE_URE, __UNITS_URE)) +_VALIDATION_RE = re.compile(r"^((%s)(%s))*(%s)?$" % (__VALUE_URE, __UNITS_URE, __VALUE_URE)) BYTE_UNITS = { "b": 1, @@ -336,11 +346,18 @@ TIME_UNITS = { "s": 1, "min": 60, "h": 60 * 60, - "d": 60 * 60 *24, + "d": 60 * 60 * 24, } -def apply_units(string: str, units: Dict[str, int], inter: Union[Callable, None, type] = None, final: type = float, blank_reg: Pattern = _BLANK_RE, - value_reg: Pattern = _VALUE_RE) -> Union[float, int]: + +def apply_units( + string: str, + units: Dict[str, int], + inter: Union[Callable, None, type] = None, + final: type = float, + blank_reg: Pattern = _BLANK_RE, + value_reg: Pattern = _VALUE_RE, +) -> Union[float, int]: """Parse the string applying the units defined in units (e.g.: "1.5m",{'m',60} -> 80). @@ -361,7 +378,7 @@ def apply_units(string: str, units: Dict[str, int], inter: Union[Callable, None, """ if inter is None: inter = final - fstring = _BLANK_RE.sub('', string) + fstring = _BLANK_RE.sub("", string) if not (fstring and _VALIDATION_RE.match(fstring)): raise ValueError("Invalid unit string: %r." % string) values = [] @@ -373,15 +390,15 @@ def apply_units(string: str, units: Dict[str, int], inter: Union[Callable, None, try: value *= units[unit.lower()] except KeyError: - raise ValueError('invalid unit %s. valid units are %s' % - (unit, list(units.keys()))) + raise ValueError("invalid unit %s. valid units are %s" % (unit, list(units.keys()))) values.append(value) return final(sum(values)) -_LINE_RGX = re.compile('\r\n|\r+|\n') +_LINE_RGX = re.compile("\r\n|\r+|\n") + -def pretty_match(match: Match, string: str, underline_char: str = '^') -> str: +def pretty_match(match: Match, string: str, underline_char: str = "^") -> str: """return a string with the match location underlined: >>> import re @@ -419,7 +436,7 @@ def pretty_match(match: Match, string: str, underline_char: str = '^') -> str: result = [string[:start_line_pos]] start_line_pos += len(linesep) offset = start - start_line_pos - underline = ' ' * offset + underline_char * (end - start) + underline = " " * offset + underline_char * (end - start) end_line_pos = string.find(linesep, end) if end_line_pos == -1: string = string[start_line_pos:] @@ -429,7 +446,7 @@ def pretty_match(match: Match, string: str, underline_char: str = '^') -> str: # mypy: Incompatible types in assignment (expression has type "str", # mypy: variable has type "int") # but it's a str :| - end = string[end_line_pos + len(linesep):] # type: ignore + end = string[end_line_pos + len(linesep) :] # type: ignore string = string[start_line_pos:end_line_pos] result.append(string) result.append(underline) @@ -439,30 +456,31 @@ def pretty_match(match: Match, string: str, underline_char: str = '^') -> str: # Ansi colorization ########################################################### -ANSI_PREFIX = '\033[' -ANSI_END = 'm' -ANSI_RESET = '\033[0m' +ANSI_PREFIX = "\033[" +ANSI_END = "m" +ANSI_RESET = "\033[0m" ANSI_STYLES = { - 'reset': "0", - 'bold': "1", - 'italic': "3", - 'underline': "4", - 'blink': "5", - 'inverse': "7", - 'strike': "9", + "reset": "0", + "bold": "1", + "italic": "3", + "underline": "4", + "blink": "5", + "inverse": "7", + "strike": "9", } ANSI_COLORS = { - 'reset': "0", - 'black': "30", - 'red': "31", - 'green': "32", - 'yellow': "33", - 'blue': "34", - 'magenta': "35", - 'cyan': "36", - 'white': "37", + "reset": "0", + "black": "30", + "red": "31", + "green": "32", + "yellow": "33", + "blue": "34", + "magenta": "35", + "cyan": "36", + "white": "37", } + def _get_ansi_code(color: Optional[str] = None, style: Optional[str] = None) -> str: """return ansi escape code corresponding to color and style @@ -488,13 +506,14 @@ def _get_ansi_code(color: Optional[str] = None, style: Optional[str] = None) -> ansi_code.append(ANSI_STYLES[effect]) if color: if color.isdigit(): - ansi_code.extend(['38', '5']) + ansi_code.extend(["38", "5"]) ansi_code.append(color) else: ansi_code.append(ANSI_COLORS[color]) if ansi_code: - return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END - return '' + return ANSI_PREFIX + ";".join(ansi_code) + ANSI_END + return "" + def colorize_ansi(msg: str, color: Optional[str] = None, style: Optional[str] = None) -> str: """colorize message by wrapping it with ansi escape codes @@ -522,23 +541,24 @@ def colorize_ansi(msg: str, color: Optional[str] = None, style: Optional[str] = escape_code = _get_ansi_code(color, style) # If invalid (or unknown) color, don't wrap msg with ansi codes if escape_code: - return '%s%s%s' % (escape_code, msg, ANSI_RESET) + return "%s%s%s" % (escape_code, msg, ANSI_RESET) return msg -DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'} + +DIFF_STYLE = {"separator": "cyan", "remove": "red", "add": "green"} + def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE): for line in lines: - if line[:4] in ('--- ', '+++ '): - out.write(colorize_ansi(line, style['separator'])) - elif line[0] == '-': - out.write(colorize_ansi(line, style['remove'])) - elif line[0] == '+': - out.write(colorize_ansi(line, style['add'])) - elif line[:4] == '--- ': - out.write(colorize_ansi(line, style['separator'])) - elif line[:4] == '+++ ': - out.write(colorize_ansi(line, style['separator'])) + if line[:4] in ("--- ", "+++ "): + out.write(colorize_ansi(line, style["separator"])) + elif line[0] == "-": + out.write(colorize_ansi(line, style["remove"])) + elif line[0] == "+": + out.write(colorize_ansi(line, style["add"])) + elif line[:4] == "--- ": + out.write(colorize_ansi(line, style["separator"])) + elif line[:4] == "+++ ": + out.write(colorize_ansi(line, style["separator"])) else: out.write(line) - |