summaryrefslogtreecommitdiff
path: root/misc/win32/utf8.c
blob: 9fceea362f41782fd96a24d2c1a5db2d7abb867f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "apr.h"
#include "apr_private.h"
#include "apr_errno.h"
#include "apr_arch_utf8.h"

/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
 * with particular attention to canonical translation forms (see section 10
 * "Security Considerations" of the RFC for more info).
 *
 * Since several architectures including Windows support unicode, with utf-16
 * used as the actual storage conventions by that architecture, these functions
 * exist to transform or validate utf-16 strings into APR's 'char' type
 * convention.  It is left up to the operating system to determine the
 * validity of the string, e.g. normative forms, in the context of
 * its native language support.  Other file systems which support filename
 * characters of 0x80-0xff but have no explicit requirement for Unicode
 * will find this function useful only for validating the character sequences
 * and rejecting poorly encoded utf-8 sequences.
 *
 * len  ucs-4 range (hex)  utf-8 octet sequence (binary)
 * 1:2  00000000-0000007F  0xxxxxxx
 * 2:2  00000080-000007FF  110XXXXx 10xxxxxx
 * 3:2  00000800-0000FFFF  1110XXXX 10Xxxxxx 10xxxxxx
 * 4:4  00010000-001FFFFF  11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
 *      00200000-03FFFFFF  111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *      04000000-7FFFFFFF  1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * One of the X bits must be 1 to avoid overlong representation in utf-8.
 *
 * For conversion into utf-16, the 4th form is limited in range to 0010 FFFF,
 * and the final two forms are used only by full utf-32, per RFC 3629;
 *
 *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
 *   Unicode parlance), being actually UCS-4 characters transformed
 *   through UTF-16, need special treatment: the UTF-16 transformation
 *   must be undone, yielding a UCS-4 character that is then transformed
 *   as above."
 *
 * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
 *
 *   U' = U - 0x10000
 *   U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
 *                     W1 = 110110yy yyyyyyyy
 *                     W2 = 110111xx xxxxxxxx
 *   Max U' = 0000 00001111 11111111 11111111
 *   Max U  = 0000 00010000 11111111 11111111
 *
 * Also note ISO/IEC 10646:2014 Clause 9.4: "Because surrogate code points
 * are not UCS scalar values, UTF-32 code units in the range
 * 0000 D800-0000 DFFF are ill-formed" for future reference in adding any
 * utf-32 accessor functions.
 *
 * Len in the table above is a mapping of bytes used for utf-8:utf-16 values,
 * which results in these conclusions of maximum allocations;
 *
 * apr_conv_utf8_to_utf16 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
 * apr_conv_utf16_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
 */

/* Convert a run of utf-8 octets into utf-16 words.
 *
 * in        utf-8 input octets
 * inbytes   on entry, the number of octets available at 'in'; on return,
 *           the number of octets that were not consumed
 * out       destination for the utf-16 words
 * outwords  on entry, the number of words available at 'out'; on return,
 *           the number of words that were not filled
 *
 * Returns APR_EINVAL for an ill-formed sequence (stray continuation octet,
 * overlong form, encoded surrogate D800-DFFF, value above 10FFFF, or a
 * 5/6 octet lead), APR_INCOMPLETE when the input ends inside a multibyte
 * sequence, and APR_SUCCESS otherwise.  On the error returns neither
 * counter has been adjusted for the offending sequence.  Running out of
 * output space is not an error; the caller must inspect both counters.
 */
APR_DECLARE(apr_status_t) apr_conv_utf8_to_utf16(const char *in,
                                                 apr_size_t *inbytes,
                                                 apr_wchar_t *out,
                                                 apr_size_t *outwords)
{
    apr_int64_t cp;
    apr_size_t trail, words, i;
    unsigned int lead, second, octet;

    while (*inbytes && *outwords)
    {
        lead = (unsigned char)in[0];

        if (lead < 0x80) {
            /* 0xxxxxxx: plain US-ASCII, passed through unchanged */
            *(out++) = (apr_wchar_t)lead;
            ++in;
            --*inbytes;
            --*outwords;
            continue;
        }
        if (lead < 0xC0) {
            /* 10xxxxxx: a continuation octet cannot start a sequence */
            return APR_EINVAL;
        }

        /* Classify the lead octet: how many continuation octets must
         * follow, and which payload bits the lead itself contributes.
         */
        if (lead < 0xE0) {
            trail = 1;
            cp = lead & 0x1F;
        }
        else if (lead < 0xF0) {
            trail = 2;
            cp = lead & 0x0F;
        }
        else if (lead < 0xF8) {
            trail = 3;
            cp = lead & 0x07;
        }
        else {
            /* 5 and 6 octet forms only exist for full utf-32 values */
            return APR_EINVAL;
        }

        /* The whole sequence must be present before it is examined */
        if (*inbytes <= trail)
            return APR_INCOMPLETE;

        /* The lead plus the high payload bits of the second octet are
         * enough to spot overlong forms, surrogates and out-of-range
         * values.  The second octet is only validated as a continuation
         * further below.
         */
        second = (unsigned char)in[1];
        if (trail == 1) {
            /* Lead payload of 0 or 1 encodes a value below 0x80 */
            if (cp < 2)
                return APR_EINVAL;
        }
        else if (trail == 2) {
            /* Overlong: value would be below 0x800 */
            if (cp == 0 && (second & 0x20) == 0)
                return APR_EINVAL;
            /* D800-DFFF must not appear as utf-8 encoded values */
            if (cp == 0x0D && (second & 0x20) != 0)
                return APR_EINVAL;
        }
        else {
            /* Overlong: value would be below 0x10000 */
            if (cp == 0 && (second & 0x30) == 0)
                return APR_EINVAL;
            /* Value would be 0x110000 or above */
            if (cp > 4 || (cp == 4 && (second & 0x30) != 0))
                return APR_EINVAL;
        }

        /* A four octet sequence becomes a surrogate pair, so it needs
         * two output words; everything else needs one.
         */
        words = (trail == 3) ? 2 : 1;
        if (*outwords < words)
            break; /* buffer full */

        /* Fold in the continuation octets, each of which must be 10xxxxxx */
        for (i = 1; i <= trail; ++i) {
            octet = (unsigned char)in[i];
            if ((octet & 0xC0) != 0x80)
                return APR_EINVAL;
            cp = (cp << 6) | (octet & 0x3F);
        }
        in += trail + 1;
        *inbytes -= trail + 1;

        /* cp now holds the decoded code point; emit it as utf-16 */
        if (cp < 0x10000) {
            *(out++) = (apr_wchar_t)cp;
            --*outwords;
        }
        else {
            cp -= 0x10000;
            *(out++) = (apr_wchar_t)(0xD800 | (cp >> 10));
            *(out++) = (apr_wchar_t)(0xDC00 | (cp & 0x03FF));
            *outwords -= 2;
        }
    }

    /* Leaving the loop with input remaining only means the output buffer
     * filled up; that is reported through the counters, not the status.
     */
    return APR_SUCCESS;
}

/* Convert a run of utf-16 words into utf-8 octets.
 *
 * in        utf-16 input words
 * inwords   on entry, the number of words available at 'in'; on return,
 *           the number of words that were not consumed
 * out       destination for the utf-8 octets
 * outbytes  on entry, the number of octets available at 'out'; on return,
 *           the number of octets that were not filled
 *
 * Returns APR_EINVAL for a low surrogate with no preceding high surrogate
 * or a high surrogate not followed by a low surrogate, APR_INCOMPLETE when
 * the input ends right after a high surrogate, and APR_SUCCESS otherwise.
 * On the error returns neither counter has been adjusted for the offending
 * word.  Running out of output space is not an error; the caller must
 * inspect both counters.
 */
APR_DECLARE(apr_status_t) apr_conv_utf16_to_utf8(const apr_wchar_t *in,
                                                 apr_size_t *inwords,
                                                 char *out,
                                                 apr_size_t *outbytes)
{
    apr_int64_t cp;
    apr_size_t trail, used;
    unsigned int unit, low, prefix, shift;

    while (*inwords && *outbytes)
    {
        unit = (unsigned short)in[0];

        if (unit < 0x80) {
            /* US-ASCII maps to a single identical octet */
            *(out++) = (unsigned char)unit;
            ++in;
            --*inwords;
            --*outbytes;
            continue;
        }
        if ((unit & 0xFC00) == 0xDC00) {
            /* A low surrogate may only follow a high surrogate */
            return APR_EINVAL;
        }

        if ((unit & 0xFC00) == 0xD800) {
            /* High surrogate: the low surrogate must come next */
            if (*inwords < 2)
                return APR_INCOMPLETE;
            low = (unsigned short)in[1];
            if ((low & 0xFC00) != 0xDC00)
                return APR_EINVAL;
            /* Undo the utf-16 transformation to recover the code point */
            cp = (apr_int64_t)(((unit & 0x03FF) << 10) | (low & 0x03FF));
            cp += 0x10000;
            used = 2;
        }
        else {
            /* Any other word is a complete code point by itself */
            cp = unit;
            used = 1;
        }

        /* Pick the shortest utf-8 form: number of continuation octets
         * and the length-marker bits of the lead octet.
         */
        if (cp < 0x800) {
            trail = 1;
            prefix = 0xC0;
        }
        else if (cp < 0x10000) {
            trail = 2;
            prefix = 0xE0;
        }
        else {
            trail = 3;
            prefix = 0xF0;
        }

        /* The lead octet plus 'trail' continuation octets must fit */
        if (*outbytes <= trail)
            break; /* Insufficient buffer */

        /* Emit the lead octet, then six payload bits per continuation
         * octet from the most significant group down.
         */
        shift = (unsigned int)(6 * trail);
        *(out++) = (unsigned char)(prefix | (cp >> shift));
        while (shift) {
            shift -= 6;
            *(out++) = (unsigned char)(0x80 | ((cp >> shift) & 0x3F));
        }

        in += used;
        *inwords -= used;
        *outbytes -= trail + 1;
    }

    /* Leaving the loop with input remaining only means the output buffer
     * filled up; that is reported through the counters, not the status.
     */
    return APR_SUCCESS;
}