summaryrefslogtreecommitdiff
path: root/textutils.py
blob: f83b05c12588029a420102a729dded0707c53ae5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-common.
#
# logilab-common is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option) any
# later version.
#
# logilab-common is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-common.  If not, see <http://www.gnu.org/licenses/>.
"""Some text manipulation utility functions.


:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
unquote, colorize_ansi
:group text manipulation: searchall, splitstrip
:sort: text formatting, text manipulation

:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifying the start of an ANSI escape sequence

:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifying the end of an ANSI escape sequence

:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code resetting format defined by a previous ANSI escape sequence
"""
__docformat__ = "restructuredtext en"

import sys
import re
import os.path as osp
from unicodedata import normalize as _uninormalize
try:
    from os import linesep
except ImportError:
    linesep = '\n' # gae

from logilab.common.deprecation import deprecated

MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
    }

def unormalize(ustring, ignorenonascii=False):
    """replace diacritical characters with their corresponding ascii characters

    Convert the unicode string to its long normalized form (unicode character
    will be transform into several characters) and keep the first one only.
    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
    replace all compatibility characters with their equivalents.

    :see: Another project about ASCII transliterations of Unicode text
          http://pypi.python.org/pypi/Unidecode
    """
    res = []
    for letter in ustring[:]:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            if ord(letter) >= 2**8:
                if ignorenonascii:
                    continue
                raise ValueError("can't deal with non-ascii based characters")
            replacement = _uninormalize('NFKD', letter)[0]
        res.append(replacement)
    return u''.join(res)

def unquote(string):
    """remove optional quotes (simple or double) from the string

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    if string[-1] in '"\'':
        string = string[:-1]
    return string


_BLANKLINES_RGX = re.compile('\r?\n\r?\n')
_NORM_SPACES_RGX = re.compile('\s+')

def normalize_text(text, line_len=80, indent='', rest=False):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark  for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    if rest:
        normp = normalize_rest_paragraph
    else:
        normp = normalize_paragraph
    result = []
    for text in _BLANKLINES_RGX.split(text):
        result.append(normp(text, line_len, indent))
    return ('%s%s%s' % (linesep, indent, linesep)).join(result)


def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used top insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    line_len = line_len - len(indent)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)

def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used top insert a comment mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    toreport = ''
    lines = []
    line_len = line_len - len(indent)
    for line in text.splitlines():
        line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
        toreport = ''
        while len(line) > line_len:
            # too long line, need split
            line, toreport = splittext(line, line_len)
            lines.append(indent + line)
            if toreport:
                line = toreport + ' '
                toreport = ''
            else:
                line = ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)


def splittext(text, line_len):
    """split the given text on space according to the given max line size

    return a 2-uple:
    * a line <= line_len if possible
    * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    pos = min(len(text)-1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()


def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    return [word.strip() for word in string.split(sep) if word.strip()]

get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)


def split_url_or_path(url_or_path):
    """return the latest component of a string containing either an url of the
    form <scheme>://<path> or a local file system path
    """
    if '://' in url_or_path:
        return url_or_path.rstrip('/').rsplit('/', 1)
    return osp.split(url_or_path.rstrip(osp.sep))


def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    {'single': '3', 'multiple': ['1', '2']}

    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if line:
            key, value = [w.strip() for w in line.split('=', 1)]
            if key in res:
                try:
                    res[key].append(value)
                except AttributeError:
                    res[key] = [res[key], value]
            else:
                res[key] = value
    return res


_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

TIME_UNITS = {
    "ms": 0.0001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}

def apply_units( string, units, inter=None, final=float, blank_reg=_BLANK_RE,
    value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m",{'m',60} -> 80).

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter: used to parse every intermediate value (need __sum__)

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and it's unit into the
    """
    if inter is None:
        inter = final
    string = _BLANK_RE.sub('',string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        #import sys
        #print >> sys.stderr, dic
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))


_LINE_RGX = re.compile('\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon')
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      carret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    string = _LINE_RGX.sub(linesep, string)
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        string = string[start_line_pos:]
        result.append(string)
        result.append(underline)
    else:
        end = string[end_line_pos + len(linesep):]
        string = string[start_line_pos:end_line_pos]
        result.append(string)
        result.append(underline)
        result.append(end)
    return linesep.join(result).rstrip()


# Ansi colorization ###########################################################

ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
ANSI_STYLES = {
    'reset'     : "0",
    'bold'      : "1",
    'italic'    : "3",
    'underline' : "4",
    'blink'     : "5",
    'inverse'   : "7",
    'strike'    : "9",
}
ANSI_COLORS = {
    'reset'   : "0",
    'black'   : "30",
    'red'     : "31",
    'green'   : "32",
    'yellow'  : "33",
    'blue'    : "34",
    'magenta' : "35",
    'cyan'    : "36",
    'white'   : "37",
}

def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_COLORS` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return: the built escape code
    """
    ansi_code = []
    if style:
        style_attrs = splitstrip(style)
        for effect in style_attrs:
            ansi_code.append(ANSI_STYLES[effect])
    if color:
        if color.isdigit():
            ansi_code.extend(['38','5'])
            ansi_code.append(color)
        else:
            ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''

def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_COLORS` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # If invalid (or unknown) color, don't wrap msg with ansi codes
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg

DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    for line in lines:
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        elif line[0] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[0] == '+':
            out.write(colorize_ansi(line, style['add']))
        elif line[:4] == '--- ':
            out.write(colorize_ansi(line, style['separator']))
        elif line[:4] == '+++ ':
            out.write(colorize_ansi(line, style['separator']))
        else:
            out.write(line)