subversion/include/private/svn_utf_private.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261

/**
 * @copyright
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 * @endcopyright
 *
 * @file svn_utf_private.h
 * @brief UTF validation and normalization routines
 */

#ifndef SVN_UTF_PRIVATE_H
#define SVN_UTF_PRIVATE_H

#include <apr.h>
#include <apr_pools.h>

#include "svn_types.h"
#include "svn_string.h"
#include "svn_string_private.h"

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */


/* Return TRUE if the string SRC of length LEN is a valid UTF-8 encoding
 * according to the rules laid down by the Unicode 4.0 standard, FALSE
 * otherwise.  This function is faster than svn_utf__last_valid().
 */
svn_boolean_t
svn_utf__is_valid(const char *src, apr_size_t len);

/* As for svn_utf__is_valid but SRC is NULL terminated. */
svn_boolean_t
svn_utf__cstring_is_valid(const char *src);

/* Return a pointer to the first character after the last valid UTF-8
 * potentially multi-byte character in the string SRC of length LEN.
 * Validity of bytes from SRC to SRC+LEN-1, inclusively, is checked.
 * If SRC is a valid UTF-8, the return value will point to the byte SRC+LEN,
 * otherwise it will point to the start of the first invalid character.
 * In either case all the characters between SRC and the return pointer - 1,
 * inclusively, are valid UTF-8.
 *
 * See also svn_utf__is_valid().
 */
const char *
svn_utf__last_valid(const char *src, apr_size_t len);

/* As for svn_utf__last_valid but uses a different implementation without
   lookup tables.  It avoids the table memory use (about 400 bytes) but the
   function is longer (about 200 bytes extra) and likely to be slower when
   the string is valid.  If the string is invalid this function may be
   faster since it returns immediately rather than continuing to the end of
   the string.  The main reason this function exists is to test the table
   driven implementation.  */
const char *
svn_utf__last_valid2(const char *src, apr_size_t len);

/* Copy LENGTH bytes of SRC, converting characters as follows:
    - Pass characters from the ASCII subset to the result
    - Strip all combining marks from the string
    - Represent other valid Unicode chars as {U+XXXX}
    - Replace invalid Unicode chars with {U?XXXX}
    - Represent chars that are not valid UTF-8 as ?\XX
    - Replace codes outside the Unicode range with a sequence of ?\XX
    - Represent the null byte as \0
   Allocate the result in POOL. */
const char *
svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool);

const char *
svn_utf__cstring_from_utf8_fuzzy(const char *src,
                                 apr_pool_t *pool,
                                 svn_error_t *(*convert_from_utf8)
                                              (const char **,
                                               const char *,
                                               apr_pool_t *));


#if defined(WIN32)
/* On Windows: Convert the UTF-8 string SRC to UTF-16.
   If PREFIX is not NULL, prepend it to the converted result.
   The result, if not empty, will be allocated in RESULT_POOL. */
svn_error_t *
svn_utf__win32_utf8_to_utf16(const WCHAR **result,
                             const char *src,
                             const WCHAR *prefix,
                             apr_pool_t *result_pool);

/* On Windows: Convert the UTF-16 string SRC to UTF-8.
   If PREFIX is not NULL, prepend it to the converted result.
   The result, if not empty, will be allocated in RESULT_POOL. */
svn_error_t *
svn_utf__win32_utf16_to_utf8(const char **result,
                             const WCHAR *src,
                             const char *prefix,
                             apr_pool_t *result_pool);
#endif /* WIN32*/


/* A constant used for many length parameters in the utf8proc wrappers
 * to indicate that the length of a string is unknonw. */
#define SVN_UTF__UNKNOWN_LENGTH ((apr_size_t) -1)


/* Compare two UTF-8 strings, ignoring normalization, using buffers
 * BUF1 and BUF2 for temporary storage. If either of LEN1 or LEN2 is
 * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is
 * null-terminated; otherwise, consider the string only up to the
 * given length.
 *
 * Return compare value in *RESULT.
 */
svn_error_t *
svn_utf__normcmp(int *result,
                 const char *str1, apr_size_t len1,
                 const char *str2, apr_size_t len2,
                 svn_membuf_t *buf1, svn_membuf_t *buf2);

/* Normalize the UTF-8 string STR to form C, using BUF for temporary
 * storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR is
 * null-terminated; otherwise, consider the string only up to the
 * given length.
 *
 * Return the normalized string in *RESULT, which shares storage with
 * BUF and is valid only until the next time BUF is modified.
 *
 * A returned error may indicate that STRING contains invalid UTF-8 or
 * invalid Unicode codepoints.
 */
svn_error_t*
svn_utf__normalize(const char **result,
                   const char *str, apr_size_t len,
                   svn_membuf_t *buf);

/* Check if STRING is a valid, NFC-normalized UTF-8 string.  Note that
 * a FALSE return value may indicate that STRING is not valid UTF-8 at
 * all.
 *
 * Use SCRATCH_POOL for temporary allocations.
 */
svn_boolean_t
svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool);

/* Encode an UCS-4 string to UTF-8, placing the result into BUFFER.
 * While utf8proc does have a similar function, it does more checking
 * and processing than we want here; this function does not attempt
 * any normalizations but just encodes the individual code points.
 * The encoded string will always be NUL-terminated.
 *
 * Return the length of the result (excluding the NUL terminator) in
 * *result_length.
 *
 * A returned error indicates that a codepoint is invalid.
 */
svn_error_t *
svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
                            const apr_int32_t *ucs4str,
                            apr_size_t length,
                            apr_size_t *result_length);

/* Pattern matching similar to the the SQLite LIKE and GLOB
 * operators. PATTERN, KEY and ESCAPE must all point to UTF-8
 * strings. Furthermore, ESCAPE, if provided, must be a character from
 * the ASCII subset.
 *
 * If any of PATTERN_LEN, STRING_LEN or ESCAPE_LEN are
 * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is
 * null-terminated; otherwise, consider the string only up to the
 * given length.
 *
 * Use buffers PATTERN_BUF, STRING_BUF and TEMP_BUF for temporary storage.
 *
 * If SQL_LIKE is true, interpret PATTERN as a pattern used by the SQL
 * LIKE operator and notice ESCAPE. Otherwise it's a Unix fileglob
 * pattern, and ESCAPE must be NULL.
 *
 * Set *MATCH to the result of the comparison.
*/
svn_error_t *
svn_utf__glob(svn_boolean_t *match,
              const char *pattern, apr_size_t pattern_len,
              const char *string, apr_size_t string_len,
              const char *escape, apr_size_t escape_len,
              svn_boolean_t sql_like,
              svn_membuf_t *pattern_buf,
              svn_membuf_t *string_buf,
              svn_membuf_t *temp_buf);

/* Return the compiled version of the wrapped utf8proc library. */
const char *
svn_utf__utf8proc_compiled_version(void);

/* Return the runtime version of the wrapped utf8proc library. */
const char *
svn_utf__utf8proc_runtime_version(void);

/* Convert an UTF-16 (or UCS-2) string to UTF-8, returning the pointer
 * in RESULT. If BIG_ENDIAN is set, then UTF16STR is big-endian;
 * otherwise, it's little-endian.
 *
 * If UTF16LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF16STR must be
 * terminated with a zero; otherwise, it is the number of 16-bit codes
 * to convert, and the source string may contain NUL values.
 *
 * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for
 * intermediate allocation.
 *
 * This function combines UTF-16 surrogate pairs into single code
 * points, but will leave single lead or trail surrogates unchanged.
 */
svn_error_t *
svn_utf__utf16_to_utf8(const svn_string_t **result,
                       const apr_uint16_t *utf16str,
                       apr_size_t utf16len,
                       svn_boolean_t big_endian,
                       apr_pool_t *result_pool,
                       apr_pool_t *scratch_pool);

/* Convert an UTF-32 string to UTF-8, returning the pointer in
 * RESULT. If BIG_ENDIAN is set, then UTF32STR is big-endian;
 * otherwise, it's little-endian.
 *
 * If UTF32LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF32STR must be
 * terminated with a zero; otherwise, it is the number of 32-bit codes
 * to convert, and the source string may contain NUL values.
 *
 * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for
 * intermediate allocation.
 */
svn_error_t *
svn_utf__utf32_to_utf8(const svn_string_t **result,
                       const apr_int32_t *utf32str,
                       apr_size_t utf32len,
                       svn_boolean_t big_endian,
                       apr_pool_t *result_pool,
                       apr_pool_t *scratch_pool);


#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* SVN_UTF_PRIVATE_H */