summaryrefslogtreecommitdiff
path: root/c/wchar_helper_3.h
blob: f15464ef609eb0ba16ea5e07f90b0860d1a1ba11 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/*
 * wchar_t helpers, version CPython >= 3.3.
 *
 * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
 * platforms, even ones with wchar_t limited to 2 bytes.  As such,
 * this code here works from the outside like wchar_helper.h in the
 * case Py_UNICODE_SIZE == 4, but the implementation is very different.
 */

typedef uint16_t cffi_char16_t;
typedef uint32_t cffi_char32_t;


static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
{
    return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
}

static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
{
    /* are there any surrogate pairs, and if so, how many? */
    Py_ssize_t i, count_surrogates = 0;
    for (i = 0; i < size - 1; i++) {
        if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
                0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
            count_surrogates++;
    }
    if (count_surrogates == 0) {
        /* no, fast path */
        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
    }
    else
    {
        PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
        Py_UCS4 *data;
        assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
        data = PyUnicode_4BYTE_DATA(result);

        for (i = 0; i < size; i++)
        {
            cffi_char32_t ch = w[i];
            if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
                cffi_char32_t ch2 = w[i + 1];
                if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
                    i++;
                }
            }
            *data++ = ch;
        }
        return result;
    }
}

static int
_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
                             char *err_got)
{
    cffi_char32_t ch;
    if (PyUnicode_GET_LENGTH(unicode) != 1) {
        sprintf(err_got, "unicode string of length %zd",
                PyUnicode_GET_LENGTH(unicode));
        return -1;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);

    if (ch > 0xFFFF)
    {
        sprintf(err_got, "larger-than-0xFFFF character");
        return -1;
    }
    *result = (cffi_char16_t)ch;
    return 0;
}

static int
_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
                             char *err_got)
{
    if (PyUnicode_GET_LENGTH(unicode) != 1) {
        sprintf(err_got, "unicode string of length %zd",
                PyUnicode_GET_LENGTH(unicode));
        return -1;
    }
    *result = PyUnicode_READ_CHAR(unicode, 0);
    return 0;
}

static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
{
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    Py_ssize_t result = length;
    unsigned int kind = PyUnicode_KIND(unicode);

    if (kind == PyUnicode_4BYTE_KIND)
    {
        Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
        Py_ssize_t i;
        for (i = 0; i < length; i++) {
            if (data[i] > 0xFFFF)
                result++;
        }
    }
    return result;
}

static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
{
    return PyUnicode_GET_LENGTH(unicode);
}

static int _my_PyUnicode_AsChar16(PyObject *unicode,
                                  cffi_char16_t *result,
                                  Py_ssize_t resultlen)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    unsigned int kind = PyUnicode_KIND(unicode);
    void *data = PyUnicode_DATA(unicode);
    Py_ssize_t i;

    for (i = 0; i < len; i++) {
        cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
        if (ordinal > 0xFFFF) {
            if (ordinal > 0x10FFFF) {
                PyErr_Format(PyExc_ValueError,
                             "unicode character out of range for "
                             "conversion to char16_t: 0x%x", (int)ordinal);
                return -1;
            }
            ordinal -= 0x10000;
            *result++ = 0xD800 | (ordinal >> 10);
            *result++ = 0xDC00 | (ordinal & 0x3FF);
        }
        else
            *result++ = ordinal;
    }
    return 0;
}

static int _my_PyUnicode_AsChar32(PyObject *unicode,
                                  cffi_char32_t *result,
                                  Py_ssize_t resultlen)
{
    if (PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result, resultlen, 0) == NULL)
        return -1;
    return 0;
}