# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-common.
#
# logilab-common is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option) any
# later version.
#
# logilab-common is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-common.  If not, see <http://www.gnu.org/licenses/>.
"""Some text manipulation utility functions.


:group text formatting: normalize_text, normalize_paragraph, pretty_match,\
unquote, colorize_ansi
:group text manipulation: searchall, splitstrip
:sort: text formatting, text manipulation

:type ANSI_STYLES: dict(str)
:var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code

:type ANSI_COLORS: dict(str)
:var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code

:type ANSI_PREFIX: str
:var ANSI_PREFIX:
  ANSI terminal code notifying the start of an ANSI escape sequence

:type ANSI_END: str
:var ANSI_END:
  ANSI terminal code notifying the end of an ANSI escape sequence

:type ANSI_RESET: str
:var ANSI_RESET:
  ANSI terminal code resetting format defined by a previous ANSI escape sequence
"""
__docformat__ = "restructuredtext en"

import sys
import re
import os.path as osp
from unicodedata import normalize as _uninormalize
try:
    from os import linesep
except ImportError:
    linesep = '\n' # gae

from logilab.common.deprecation import deprecated

# characters whose ascii replacement can't be obtained by unicode
# decomposition and is therefore given explicitly
MANUAL_UNICODE_MAP = {
    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
    }

def unormalize(ustring, ignorenonascii=False):
    """replace diacritical characters with their corresponding ascii characters

    Characters listed in `MANUAL_UNICODE_MAP` are replaced by their mapped
    value. Any other character is replaced by the first character of its
    canonical decomposition (NFD), that is to say its base letter.

    Characters below U+0100 which have no decomposition are returned
    unchanged.

    :type ustring: unicode
    :param ustring: the string to convert

    :type ignorenonascii: bool
    :param ignorenonascii:
      if true, characters from U+0100 upward which can't be reduced to an
      ascii character are dropped from the result instead of raising an
      error

    :raise ValueError:
      if a character from U+0100 upward can't be reduced to an ascii
      character and `ignorenonascii` is false

    :rtype: unicode
    :return: the converted string
    """
    res = []
    for letter in ustring:
        try:
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            # decompose *before* deciding the character is unsupported:
            # many letters above U+00FF have a plain ascii base letter
            replacement = _uninormalize('NFD', letter)[0]
            if ord(letter) >= 2**8 and ord(replacement) >= 2**7:
                if ignorenonascii:
                    continue
                raise ValueError("can't deal with non-ascii based characters")
        res.append(replacement)
    return u''.join(res)

def unquote(string):
    """remove optional quotes (simple or double) from the string

    The leading and the trailing character are handled independently: a
    quote is removed from either end when present, whether or not the other
    end is quoted and whether or not both quotes are of the same kind.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # removing the leading quote may have emptied the string (input made of
    # a lone quote character): there is no last character to look at then
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string


# paragraph separator: two consecutive line breaks, each optionally preceded
# by a carriage return
_BLANKLINES_RGX = re.compile('\r?\n\r?\n')
# any run of white space characters; written as a raw string since `\s` is
# not a valid escape sequence in a regular string literal
_NORM_SPACES_RGX = re.compile(r'\s+')

def normalize_text(text, line_len=80, indent='', rest=False):
    """reformat a text made of several paragraphs so that it fits in a
    maximum line size, with an optional arbitrary indentation.

    The text is split into paragraphs on blank lines. Each paragraph is
    normalized on its own and paragraphs are joined back with a line holding
    only the indentation string, so blank lines are kept. The indentation
    string may be used to insert a comment (#) or a quoting (>) mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      if true, paragraphs are formatted using `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    format_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [format_paragraph(chunk, line_len, indent)
                  for chunk in _BLANKLINES_RGX.split(text)]
    # paragraphs are separated by a line containing the indentation only
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)


def normalize_paragraph(text, line_len=80, indent=''):
    """normalize a text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Every run of white space (line breaks included) is first replaced by a
    single space, then the text is cut into lines using `splittext`.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    text = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation counts in the line length. Never go below 0: a
    # negative width (indentation longer than `line_len`) would be used as
    # an index relative to the end of the text when splitting, losing
    # characters and preventing the loop below from terminating
    line_len = max(line_len - len(indent), 0)
    lines = []
    while text:
        aline, text = splittext(text.strip(), line_len)
        lines.append(indent + aline)
    return linesep.join(lines)

def normalize_rest_paragraph(text, line_len=80, indent=''):
    """normalize a ReST text to display it with a maximum line size and
    optionally arbitrary indentation. Line jumps are normalized. The
    indentation string may be used to insert a comment mark for
    instance.

    Unlike `normalize_paragraph`, input lines are processed one by one: a
    too long line is split into several lines, but the remainder of a line
    is never merged with the following input line.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    # the indentation counts in the line length. Never go below 0: with a
    # negative width (indentation longer than `line_len`) even an empty
    # line would be "too long" and the splitting loop would never end
    line_len = max(line_len - len(indent), 0)
    lines = []
    for line in text.splitlines():
        line = _NORM_SPACES_RGX.sub(' ', line.strip())
        while len(line) > line_len:
            # too long line, need split
            head, remainder = splittext(line, line_len)
            lines.append(indent + head)
            # go on with what is left of this input line, if anything (the
            # extra trailing space is removed by the strip() call below)
            line = (remainder + ' ') if remainder else ''
        if line:
            lines.append(indent + line.strip())
    return linesep.join(lines)


def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The text is cut on the last space found at or before `line_len`. If
    there is no such space, it is cut on the first space found after
    `line_len`, or not at all if no space is left. A negative `line_len`
    is handled as 0.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: expected maximum line's length

    :rtype: tuple
    :return:
      a 2-uple:
      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    # a negative size would otherwise be used below as an index relative to
    # the end of the text, giving meaningless splits
    line_len = max(line_len, 0)
    # look backward for the last space fitting in the line
    pos = min(len(text)-1, line_len)
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos <= 0:
        # no space found: look forward for the first possible cut
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos+1:].strip()


def splitstrip(string, sep=','):
    """split the string given as argument on `sep` (',' by default) and
    return the list of resulting items with surrounding white spaces
    removed. Items which are empty once stripped are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return: the non empty stripped items, in their order of appearance
    """
    stripped_items = (item.strip() for item in string.split(sep))
    return [item for item in stripped_items if item]

# `get_csv` is another name for `splitstrip`, wrapped using the `deprecated`
# decorator of logilab.common.deprecation (presumably emitting a deprecation
# warning on call -- to be confirmed against that module)
get_csv = deprecated()(splitstrip)


def split_url_or_path(url_or_path):
    """split a string containing either an url of the form
    <scheme>://<path> or a local file system path on its latest component

    Trailing separators are ignored ('/' for an url, `os.path.sep` for a
    path).

    :type url_or_path: str or unicode
    :param url_or_path: the url or file system path to split

    :rtype: list or tuple
    :return:
      the leading part and the latest component, as a list for an url (result
      of `rsplit`) or as a 2-uple for a path (result of `os.path.split`)
    """
    if '://' not in url_or_path:
        # local file system path
        path = url_or_path.rstrip(osp.sep)
        return osp.split(path)
    url = url_or_path.rstrip('/')
    return url.rsplit('/', 1)


def text_to_dict(text):
    """parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple time,
    value is turned into a list containing all values.

    Keys and values are stripped, empty lines are ignored and only the first
    '=' of a line is used as separator. An empty (or None) text gives an
    empty dictionary.

    >>> text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    {'single': '3', 'multiple': ['1', '2']}

    :type text: str or unicode
    :param text: the text to parse

    :raise ValueError: if a non empty line doesn't contain any '='

    :rtype: dict
    :return: the dictionary of parsed values
    """
    parsed = {}
    if not text:
        return parsed
    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry:
            continue
        key, value = [part.strip() for part in entry.split('=', 1)]
        if key not in parsed:
            parsed[key] = value
        elif isinstance(parsed[key], list):
            # third occurrence or more: values are already in a list
            parsed[key].append(value)
        else:
            # second occurrence: turn the single value into a list
            parsed[key] = [parsed[key], value]
    return parsed


# separators ignored between values: any run of white spaces and commas
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# an optionally negative number: either digits followed by a dot and optional
# decimals, or digits optionally prefixed by '0' or '0x'
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# a unit is any run of ascii letters
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

# size units, expressed in bytes
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# duration units, expressed in seconds
TIME_UNITS = {
    "ms": 0.001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}

def apply_units( string, units, inter=None, final=float, blank_reg=_BLANK_RE,
    value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Blanks are first removed from the string, then every value found is
    converted using `inter` and multiplied by the value of its unit, if any
    (unit lookup is done on the lower cased unit). The result is the sum of
    all values, converted using `final`.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value, which must support
      multiplication and addition (default to `final`)

    :type final: type
    :param final: used to convert the sum of all values, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the string

    :raise KeyError: if a unit found in the string isn't defined in `units`

    :rtype: type given by `final`
    :return: the sum of all values found in the string
    """
    if inter is None:
        inter = final
    # use the regexp given by the caller, not the module default one
    string = blank_reg.sub('',string)
    values = []
    for match in value_reg.finditer(string):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))


# any kind of line break: CR+LF, a run of CR, or a single LF
_LINE_RGX = re.compile('\r\n|\r+|\n')

def pretty_match(match, string, underline_char='^'):
    """return the line(s) of the string holding the match together with the
    surrounding text, with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    match_start, match_end = match.start(), match.end()
    # NOTE(review): match positions refer to the string as given while they
    # are used below on the string with normalized line breaks; they may be
    # shifted when a line break doesn't have the same length as `linesep`
    normalized = _LINE_RGX.sub(linesep, string)
    parts = []
    # beginning of the line holding the start of the match
    line_start = normalized.rfind(linesep, 0, match_start)
    if line_start < 0:
        line_start = 0
    else:
        # keep the text preceding that line, and skip the line break
        parts.append(normalized[:line_start])
        line_start += len(linesep)
    marker = (' ' * (match_start - line_start)
              + underline_char * (match_end - match_start))
    # end of the line holding the end of the match
    line_end = normalized.find(linesep, match_end)
    if line_end < 0:
        parts.extend([normalized[line_start:], marker])
    else:
        following = normalized[line_end + len(linesep):]
        parts.extend([normalized[line_start:line_end], marker, following])
    return linesep.join(parts).rstrip()


# Ansi colorization ###########################################################

# start and end of an ANSI escape sequence, and the sequence resetting any
# previously defined format
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'
# style identifier -> ANSI terminal code
ANSI_STYLES = dict(
    reset="0",
    bold="1",
    italic="3",
    underline="4",
    blink="5",
    inverse="7",
    strike="9",
)
# color identifier -> ANSI terminal code
ANSI_COLORS = dict(
    reset="0",
    black="30",
    red="31",
    green="32",
    yellow="33",
    blue="34",
    magenta="35",
    cyan="36",
    white="37",
)

def _get_ansi_code(color=None, style=None):
    """build the ansi escape code corresponding to a color and a style

    Style codes come first, followed by the color code. A color made of
    digits only is emitted as a 256 colors code (38;5;<number>).

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string if neither a color nor a
      style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END

def colorize_ansi(msg, color=None, style=None):
    """wrap a message with ansi escape codes to colorize it

    The message is prefixed by the escape code built from the color and
    style, and followed by `ANSI_RESET`. It is returned unchanged when no
    escape code is obtained.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return: the ansi escaped string
    """
    # nothing to do unless at least one of color and style is defined
    if color is not None or style is not None:
        prefix = _get_ansi_code(color, style)
        # an empty code (e.g. empty color and style) leaves the text as is
        if prefix:
            return '%s%s%s' % (prefix, msg, ANSI_RESET)
    return msg

# default colors used by `diff_colorize_ansi` for each kind of diff line
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}

def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the lines of a diff to a stream, colorizing them according to
    their first characters

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting
    with '+' the 'add' color. Any other line (empty lines included) is
    written unchanged. Lines are written as given: no line break is added.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file like object
    :param out: the stream to write to, default to the standard output

    :type style: dict
    :param style:
      dictionary giving the color to use for the 'separator', 'remove' and
      'add' keys, default to `DIFF_STYLE`
    """
    for line in lines:
        # slices are used instead of indexes so that an empty line doesn't
        # raise an IndexError
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)