# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-common.
#
# logilab-common is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option) any
# later version.
#
# logilab-common is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-common.  If not, see <http://www.gnu.org/licenses/>.
"""Some text manipulation utility functions.


:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
unquote, colorize_ansi
:group text manipulation: searchall, splitstrip
:sort: text formatting, text manipulation

:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifying the start of an ANSI escape sequence

:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifying the end of an ANSI escape sequence

:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code resetting format defined by a previous ANSI escape sequence
"""
__docformat__ = "restructuredtext en"

import sys
import re
import os.path as osp
from unicodedata import normalize as _uninormalize
try:
    from os import linesep
except ImportError:
    linesep = '\n' # gae

from logilab.common.deprecation import deprecated

# Characters for which the unicode decomposition does not yield a usable ascii
# equivalent; `unormalize` looks here first.
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
    }


def unormalize(ustring, ignorenonascii=False):
    """replace diacritical characters with their corresponding ascii characters

    :type ustring: unicode
    :param ustring: the string to convert

    :type ignorenonascii: bool
    :param ignorenonascii:
      when true, characters with a code point >= 256 that are not in
      `MANUAL_UNICODE_MAP` are silently dropped instead of raising

    :raise ValueError:
      if such a character is found and `ignorenonascii` is false

    :rtype: unicode
    :return: the converted string
    """
    res = []
    for letter in ustring:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            if ord(letter) >= 2**8:
                if ignorenonascii:
                    continue
                raise ValueError("can't deal with non-ascii based characters")
            # first character of the canonical decomposition, i.e. the base
            # letter without its combining marks
            replacement = _uninormalize('NFD', letter)[0]
        res.append(replacement)
    return u''.join(res)


def unquote(string):
    """remove optional quotes (simple or double) from the string

    The leading and the trailing quote are removed independently: they do not
    have to match, and a string quoted on one side only is accepted.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string may be empty now (input was a lone quote character)
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string


_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
_NORM_SPACES_RGX = re.compile(r'\s+')


def normalize_text(text, line_len=80, indent='', rest=False):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark  for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are normalized with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    if rest:
        normp = normalize_rest_paragraph
    else:
        normp = normalize_paragraph
    result = []
    for paragraph in _BLANKLINES_RGX.split(text):
        result.append(normp(paragraph, line_len, indent))
    # paragraphs are separated by a line holding only the indentation string
    return ('%s%s%s' % (linesep, indent, linesep)).join(result)


def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    # width left for the text itself; clamped since a negative width would
    # make `splittext` return the whole text as "rest" and loop forever
    line_len = max(line_len - len(indent), 0)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)


def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Unlike `normalize_paragraph`, the original line breaks are kept: only
    lines longer than the allowed width are split, the overflow being
    prepended to the following line.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    toreport = ''
    lines = []
    # clamped: with a negative width the splitting loop below never ends
    line_len = max(line_len - len(indent), 0)
    for line in text.splitlines():
        line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
        toreport = ''
        while len(line) > line_len:
            # too long line, need split
            line, toreport = splittext(line, line_len)
            lines.append(indent + line)
            if toreport:
                # the overflow becomes the beginning of the next line
                line = toreport + ' '
                toreport = ''
            else:
                line = ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)


def splittext(text, line_len):
    """split the given text on space according to the given max line size

    return a 2-uple:
    * a line <= line_len if possible
    * the rest of the text which has to be reported on another line

    When the first word alone is longer than `line_len`, the split is done on
    the first space following it (or not at all if there is none).
    """
    # a negative size would be interpreted as an index from the end of the text
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    # look backward for the last space fitting in the line
    pos = min(len(text)-1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no space found: look forward for the first available one
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()


def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return: the non empty fields, stripped from surrounding whitespaces
    """
    return [word.strip() for word in string.split(sep) if word.strip()]

get_csv = deprecated()(splitstrip)


def split_url_or_path(url_or_path):
    """return the latest component of a string containing either an url of the
    form <scheme>://<path> or a local file system path

    :type url_or_path: str or unicode
    :param url_or_path: the url or path to split

    :return:
      the part before the last component and the last component itself, as
      returned by `rsplit` (a list) for an url and by `os.path.split` (a
      tuple) for a file system path
    """
    if '://' in url_or_path:
        return url_or_path.rstrip('/').rsplit('/', 1)
    return osp.split(url_or_path.rstrip(osp.sep))


def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''') == {'single': '3', 'multiple': ['1', '2']}
    True

    :type text: str or unicode
    :param text: the text to parse (blank lines are ignored)

    :raise ValueError: if a non blank line doesn't contain a '=' character

    :rtype: dict
    :return: the parsed keys and values
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if line:
            key, value = [w.strip() for w in line.split('=', 1)]
            if key in res:
                try:
                    res[key].append(value)
                except AttributeError:
                    # second occurrence of the key: the stored value is still
                    # a plain string, turn it into a list
                    res[key] = [res[key], value]
            else:
                res[key] = value
    return res


_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
# the group names are the ones `apply_units` reads from the match
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# values are expressed in seconds
TIME_UNITS = {
    "ms": 0.001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}

def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90).

    Every "value + optional unit" found in the string is converted and all
    the results are summed, hence "1h 30min" with `TIME_UNITS` gives 5400.
    Units are looked up lower-cased.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (the results are added up with
      `sum`), default to `final`

    :type final: type
    :param final: used to convert the sum of all values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the "value" and "unit"
      named groups

    :raise KeyError: if a unit found in the string is not a key of `units`

    :return: the sum of the parsed values, converted using `final`
    """
    if inter is None:
        inter = final
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, list(units.keys())))
        values.append(value)
    return final(sum(values))


_LINE_RGX = re.compile(r'\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    string = _LINE_RGX.sub(linesep, string)
    # beginning of the line holding the start of the match
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    # end of the line holding the end of the match
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        result.append(string[start_line_pos:])
        result.append(underline)
    else:
        tail = string[end_line_pos + len(linesep):]
        result.append(string[start_line_pos:end_line_pos])
        result.append(underline)
        result.append(tail)
    return linesep.join(result).rstrip()


# Ansi colorization ###########################################################

ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
ANSI_STYLES = {
    'reset'     : "0",
    'bold'      : "1",
    'italic'    : "3",
    'underline' : "4",
    'blink'     : "5",
    'inverse'   : "7",
    'strike'    : "9",
}
ANSI_COLORS = {
    'reset'   : "0",
    'black'   : "30",
    'red'     : "31",
    'green'   : "32",
    'yellow'  : "33",
    'blue'    : "34",
    'magenta' : "35",
    'cyan'    : "36",
    'white'   : "37",
}

def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return: the built escape code (empty if no color nor style is given)
    """
    ansi_code = []
    if style:
        for effect in splitstrip(style):
            ansi_code.append(ANSI_STYLES[effect])
    if color:
        if color.isdigit():
            # '38;5;<n>' selects the foreground among the 256 colors palette
            ansi_code.extend(['38', '5'])
            ansi_code.append(color)
        else:
            ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''

def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # The escape code is empty when color and style are empty without being
    # None: don't wrap msg with ansi codes in such a case
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg

DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the given diff lines to `out`, colorized with ansi escape codes

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff, written as is (nothing is appended)

    :type out: file-like object
    :param out: the stream to write to, default to `sys.stdout`

    :type style: dict
    :param style:
      dictionary giving the color to use for the 'separator' ('--- ' and
      '+++ ' lines), 'remove' ('-' lines) and 'add' ('+' lines) keys
    """
    for line in lines:
        # file headers have to be checked before single '-' / '+' lines
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        # slices are used instead of line[0] so empty lines fall through
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)