textutils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

# Copyright (c) 2003-2006 LOGILAB S.A. (Paris, FRANCE).
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
"""Some text manipulation utility functions.

:version:   $Revision: 1.25 $  
:author:    Logilab
:copyright: 2003-2005 LOGILAB S.A. (Paris, FRANCE)
:contact:   http://www.logilab.fr/ -- mailto:python-projects@logilab.org

:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
unquote, colorize_ansi
:group text manipulation: searchall, get_csv
:sort: text formatting, text manipulation


:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifing the start of an ANSI escape sequence
  
:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifing the end of an ANSI escape sequence
  
:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code reseting format defined by a previous ANSI escape sequence
"""

__revision__ = "$Id: textutils.py,v 1.25 2005-09-06 08:51:01 alf Exp $"
__docformat__ = "restructuredtext en"

import re
from os import linesep
from warnings import warn


def searchall(rgx, data):
    """apply a regexp using "search" until no more match is found

    This function is deprecated, use re.finditer() instead.
    """
    warn('logilab.common.textutils.searchall() is deprecated, use '
         're.finditer() instead', DeprecationWarning)
    result = []
    match = rgx.search(data)
    while match is not None:
        result.append(match)
        match = rgx.search(data, match.end())        
    return result


def unquote(string):
    """remove optional quotes (simple or double) from the string

    :type string: str or unicode
    :param string: an optionaly quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    if string[-1] in '"\'':
        string = string[:-1]
    return string


_BLANKLINES_RGX = re.compile('\r?\n\r?\n')
_NORM_SPACES_RGX = re.compile('\s+')

def normalize_text(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized but blank
    lines are kept. The indentation string may be used to insert a
    comment (#) or a quoting (>) mark  for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    result = []
    for text in _BLANKLINES_RGX.split(text):
        result.append(normalize_paragraph(text, line_len, indent))
##     return ('%s%s%s' % (linesep, indent, linesep)).join(result)
    return ('%s%s' % (linesep, linesep)).join(result)

def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionaly arbitrary indentation. Line jumps are normalized. The
    indentation string may be used top insert a comment mark for
    instance.


    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    #text = text.replace(linesep, ' ')
    text = _NORM_SPACES_RGX.sub(' ', text)
    lines = []
    while text:
        text = text.strip()
        pos = min(len(indent) + len(text), line_len)
        if pos == line_len and len(text) > line_len:
            pos = pos - len(indent)
            while pos > 0 and text[pos] != ' ':
                pos -= 1
            if pos == 0:
                pos = min(len(indent) + len(text), line_len)
                pos = pos - len(indent)
                while text[pos] != ' ':
                    pos += 1
        lines.append((indent + text[:pos]))
        text = text[pos+1:]
    return linesep.join(lines)


def get_csv(string, sep=','):
    """return a list of string in from a csv formatted line

    >>> get_csv('a, b, c   ,  4')
    ['a', 'b', 'c', '4']
    >>> get_csv('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)    
    """
    return [word.strip() for word in string.split(sep) if word.strip()]


_LINE_RGX = re.compile('\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon')
    il mange du bacon
       ^^^^^
    >>>
    
    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      carret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start = match.start()
    end = match.end()
    string = _LINE_RGX.sub(linesep, string)
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        string = string[start_line_pos:]
        result.append(string)
        result.append(underline)
    else:
        end = string[end_line_pos + len(linesep):]
        string = string[start_line_pos:end_line_pos]
        result.append(string)
        result.append(underline)
        result.append(end)
    return linesep.join(result).rstrip()


# Ansi colorization ###########################################################

ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
ANSI_STYLES = {
    'reset'     : "0",
    'bold'      : "1",
    'italic'    : "3",
    'underline' : "4",
    'blink'     : "5",
    'inverse'   : "7",
    'strike'    : "9",
}
ANSI_COLORS = {
    'reset'   : "0",
    'black'   : "30",
    'red'     : "31",
    'green'   : "32",
    'yellow'  : "33",
    'blue'    : "34",
    'magenta' : "35",
    'cyan'    : "36",
    'white'   : "37",
}


def _get_ansi_code(color=None, style=None):
    """return ansi escape code corresponding to color and style
    
    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_COLORS` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistant color or style identifier is given
    
    :rtype: str
    :return: the built escape code
    """
    ansi_code = []
    if style:
        style_attrs = get_csv(style)
        for effect in style_attrs:
            ansi_code.append(ANSI_STYLES[effect])
    if color:
        ansi_code.append(ANSI_COLORS[color])
    if ansi_code:
        return ANSI_PREFIX + ';'.join(ansi_code) + ANSI_END
    return ''

def colorize_ansi(msg, color=None, style=None):
    """colorize message by wrapping it with ansi escape codes

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_COLORS` for available values). To get
      several style effects at the same time, use a coma as separator.

    :raise KeyError: if an unexistant color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    # If both color and style are not defined, then leave the text as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # If invalid (or unknown) color, don't wrap msg with ansi codes
    if escape_code:
        return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg