#
#   Cython -- encoding related tools
#

import re

class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        if isinstance(characters, str):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, unicode), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        self.chars.append( unichr(char_number) )

    def getstring(self):
        return EncodedString(u''.join(self.chars))


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        if isinstance(characters, unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, str), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        self.chars.append( chr(char_number) )

    def getstring(self):
        # this *must* return a byte string! => fix it in Py3k!!
        s = BytesLiteral(''.join(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        # this *must* return a byte string! => fix it in Py3k!!
        return self.getstring()

class EncodedString(unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def byteencode(self):
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        assert self.encoding is None
        return self.encode("UTF-8")

    def is_unicode(self):
        return self.encoding is None
    is_unicode = property(is_unicode)

class BytesLiteral(str):
    # str subclass that is compatible with EncodedString
    encoding = None

    def byteencode(self):
        return str(self)

    def utf8encode(self):
        assert False, "this is not a unicode string: %r" % self

    is_unicode = False

char_from_escape_sequence = {
    r'\a' : u'\a',
    r'\b' : u'\b',
    r'\f' : u'\f',
    r'\n' : u'\n',
    r'\r' : u'\r',
    r'\t' : u'\t',
    r'\v' : u'\v',
    }.get

def _to_escape_sequence(s):
    if s in '\n\r\t':
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])

_c_special = ('\0', '\n', '\r', '\t', '??', '"')
_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))

def _build_specials_test():
    subexps = []
    for special in _c_special:
        regexp = ''.join(['[%s]' % c for c in special])
        subexps.append(regexp)
    return re.compile('|'.join(subexps)).search

_has_specials = _build_specials_test()

def escape_character(c):
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c

def escape_byte_string(s):
    s = s.replace('\\', '\\\\')
    if _has_specials(s):
        for special, replacement in _c_special_replacements:
            s = s.replace(special, replacement)
    try:
        s.decode("ASCII")
        return s
    except UnicodeDecodeError:
        pass
    l = []
    append = l.append
    for c in s:
        o = ord(c)
        if o >= 128:
            append('\\%3o' % o)
        else:
            append(c)
    return ''.join(l)

def split_docstring(s):
    if len(s) < 2047:
        return s
    return '\\n\"\"'.join(s.split(r'\n'))