summaryrefslogtreecommitdiff
path: root/Zend/zend_unicode.h
blob: 7a2166d8c8608bd2c14bb4fdafa6c7b6bfd23a73 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/*
   +----------------------------------------------------------------------+
   | Zend Engine                                                          |
   +----------------------------------------------------------------------+
   | Copyright (c) 1998-2009 Zend Technologies Ltd. (http://www.zend.com) |
   +----------------------------------------------------------------------+
   | This source file is subject to version 2.00 of the Zend license,     |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at                              |
   | http://www.zend.com/license/2_00.txt.                                |
   | If you did not receive a copy of the Zend license and are unable to  |
   | obtain it through the world-wide-web, please send a note to          |
   | license@zend.com so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
   | Authors: Andrei Zmievski <andrei@php.net>                            |
   +----------------------------------------------------------------------+
*/

#ifndef ZEND_UNICODE_H
#define ZEND_UNICODE_H

#include "zend.h"
#include <unicode/utypes.h>
#include <unicode/uclean.h>
#include <unicode/ustring.h>
#include <unicode/ucnv.h>
#include <unicode/uchar.h>
#include <unicode/uloc.h>
#include <unicode/ucol.h>

/*
 * ZEND_CONV_ERROR_* constants (presumably the error_mode values accepted by
 * zend_set_converter_error_mode() -- confirm against the implementation).
 *
 * The first group is numbered consecutively from 0 and ZEND_CONV_ERROR_LAST_ENUM
 * closes that sequence. ZEND_CONV_ERROR_EXCEPTION lies outside of it at 0x100.
 * NOTE(review): that looks like a flag bit meant to be combined with one of the
 * sequential values -- confirm.
 */
enum {
	ZEND_CONV_ERROR_STOP           = 0,
	ZEND_CONV_ERROR_SKIP           = 1,
	ZEND_CONV_ERROR_SUBST          = 2,
	ZEND_CONV_ERROR_ESCAPE_UNICODE = 3,
	ZEND_CONV_ERROR_ESCAPE_ICU     = 4,
	ZEND_CONV_ERROR_ESCAPE_JAVA    = 5,
	ZEND_CONV_ERROR_ESCAPE_XML_DEC = 6,
	ZEND_CONV_ERROR_ESCAPE_XML_HEX = 7,
	ZEND_CONV_ERROR_LAST_ENUM      = 8,

	ZEND_CONV_ERROR_EXCEPTION      = 0x100
};

/*
 * Direction of a conversion, as taken by zend_set_converter_error_mode() and
 * zend_raise_conversion_error_ex().
 *
 * No comma after the last enumerator: a trailing comma is only valid from C99 on
 * and is rejected or warned about by stricter C89 compilers.
 */
typedef enum {
	ZEND_FROM_UNICODE,
	ZEND_TO_UNICODE
} zend_conv_direction;


/*
 * Pairs an ICU collator handle with a counter.
 * NOTE(review): refcount is presumably maintained by zend_collator_create() /
 * zend_collator_destroy() -- confirm in the implementation.
 */
typedef struct _zend_collator {
	UCollator *coll;     /* ICU collator handle */
	int        refcount; /* counter kept alongside the handle */
} zend_collator;


/*
 * Class entry pointer for the Unicode conversion exception; only declared here.
 * NOTE(review): presumably initialised by zend_register_unicode_exceptions() -- confirm.
 */
extern ZEND_API zend_class_entry *unicodeConversionException;


/*
 * internal functions
 *
 * Unlike the "API functions" group in this header, these prototypes carry no
 * ZEND_API marker.
 */

/* Takes the converter by address (UConverter **) together with an encoding name; returns an int status. */
int zend_set_converter_encoding(UConverter **converter, const char *encoding);
/* error_mode is a uint16_t; presumably built from the ZEND_CONV_ERROR_* constants -- confirm. */
void zend_set_converter_error_mode(UConverter *conv, zend_conv_direction dir, uint16_t error_mode);
/* subst_char is a UChar string; NOTE(review): not modified according to its name, yet not const-qualified. */
void zend_set_converter_subst_char(UConverter *conv, UChar *subst_char);
/* The next two take only the TSRM context (TSRMLS_D). */
void zend_register_unicode_exceptions(TSRMLS_D);
void zend_update_converters_error_behavior(TSRMLS_D);
/* Create / destroy a zend_collator around a UCollator; ownership of coll is not visible from this header. */
zend_collator* zend_collator_create(UCollator *coll);
void zend_collator_destroy(zend_collator *zcoll);


/*
 * API functions
 *
 * All prototypes below are marked ZEND_API. Lengths are passed as int (with one
 * exception, see zend_cmp_unicode_and_string()).
 */

/*
 * Converter-to-converter and Unicode-to-ASCII helpers.
 * zend_convert_encodings() hands its result back through target / target_len and
 * reports through *status. NOTE(review): who frees *target, and the returned
 * char * of zend_unicode_to_ascii(), is not visible here -- confirm.
 */
ZEND_API void zend_convert_encodings(UConverter *target_conv, UConverter *source_conv, char **target, int *target_len, const char *source, int source_len, UErrorCode *status);
ZEND_API char* zend_unicode_to_ascii(const UChar *us, int us_len TSRMLS_DC);

/*
 * char string <-> UChar string conversion through a converter.
 * The _ex variants report through a caller-supplied UErrorCode *status; the
 * non-_ex variants take the TSRM context (TSRMLS_DC) instead.
 * NOTE(review): zend_string_to_unicode() takes a non-const char *s while its
 * _ex counterpart takes const char *source.
 */
ZEND_API int zend_string_to_unicode_ex(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status);
ZEND_API int zend_string_to_unicode(UConverter *conv, UChar **u, int *u_len, char *s, int s_len TSRMLS_DC);
ZEND_API int zend_unicode_to_string_ex(UConverter *conv, char **s, int *s_len, const UChar *u, int u_len, UErrorCode *status);
ZEND_API int zend_unicode_to_string(UConverter *conv, char **s, int *s_len, const UChar *u, int u_len TSRMLS_DC);

/*
 * Same conversions operating on a zval. The _ex variants take an explicit
 * converter; the plain variants take none.
 */
ZEND_API int zval_string_to_unicode_ex(zval *string, UConverter *conv TSRMLS_DC);
ZEND_API int zval_string_to_unicode(zval *string TSRMLS_DC);
ZEND_API int zval_unicode_to_string_ex(zval *string, UConverter *conv TSRMLS_DC);
ZEND_API int zval_unicode_to_string(zval *string TSRMLS_DC);

/*
 * Comparison of a UChar string against a char string.
 * NOTE(review): the first takes a single `uint len`, the second separate int
 * lengths for each side; the meaning of the int result is not visible here.
 */
ZEND_API int zend_cmp_unicode_and_string(UChar *ustr, char* str, uint len);
ZEND_API int zend_cmp_unicode_and_literal(UChar *ustr, int ulen, char* str, int slen);

/* Case folding of src into *dest / *dest_len, with ICU-style options and status. */
ZEND_API void zend_case_fold_string(UChar **dest, int *dest_len, UChar *src, int src_len, uint32_t options, UErrorCode *status);

/* Identifier validation and normalization; fold_case is a zend_bool switch. */
ZEND_API int zend_is_valid_identifier(UChar *ident, int ident_len);
ZEND_API int zend_normalize_identifier(UChar **dest, int *dest_len, UChar *ident, int ident_len, zend_bool fold_case);

/* Raises a conversion error for the given converter and direction; error_char_offset is an int offset. */
ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset TSRMLS_DC);

/*
 * Return the code point with index n in the UTF-16 string str of the given length.
 *
 * The string is walked from its start, one code point at a time, so the cost grows
 * with n. No check is made that n stays within the string: the caller has to make
 * sure of that.
 */
static inline UChar32 zend_get_codepoint_at(UChar *str, int length, int n)
{
	UChar32 cp = 0;
	int32_t pos = 0;

	/* Step over the n code points in front of the requested one. */
	if (n > 0) {
		U16_FWD_N(str, pos, length, n);
	}

	/* Decode the code point that starts at the offset reached. */
	U16_NEXT(str, pos, length, cp);
	return cp;
}

/*
 * Convert a single codepoint to its UChar sequence.
 *
 * codepoint - value to encode
 * buf       - destination; assumed to be large enough (up to 2 UChars), no
 *             bounds check is done
 *
 * Returns the number of UChars stored in buf: 1 when U_IS_BMP() holds, 2 (lead and
 * trail surrogate) for larger values up to UCHAR_MAX_VALUE, and 0 with buf left
 * untouched for anything else, negative values included.
 */
static inline int zend_codepoint_to_uchar(UChar32 codepoint, UChar *buf)
{
	if (U_IS_BMP(codepoint)) {
		buf[0] = (UChar) codepoint;
		return 1;
	}

	/*
	 * UChar32 is a signed type, so the range check is done on the unsigned value:
	 * a plain `codepoint <= UCHAR_MAX_VALUE` also holds for negative input, which
	 * would then be split into a meaningless surrogate pair.
	 */
	if ((uint32_t) codepoint <= (uint32_t) UCHAR_MAX_VALUE) {
		buf[0] = (UChar) U16_LEAD(codepoint);
		buf[1] = (UChar) U16_TRAIL(codepoint);
		return 2;
	}

	return 0;
}

/*
 * Length of the NUL-terminated string held in str: u_strlen() of its .u member when
 * type is IS_UNICODE, strlen() of its .s member otherwise.
 */
#define ZSTR_LEN(type, str) (((type) == IS_UNICODE) ? u_strlen((str).u) : strlen((str).s))

/* Size in bytes of len characters: len UChars for IS_UNICODE, len itself otherwise. */
#define ZBYTES(type, len) (((type) == IS_UNICODE) ? UBYTES((len)) : (len))

/* Yields c, or UG(fallback_encoding_conv) when c is NULL. Evaluates c twice. */
#define ZEND_U_CONVERTER(c) ((c)?(c):UG(fallback_encoding_conv))

/*
 * efree() ustr unless it is NULL.
 * NOTE(review): the expansion already ends in ';', so this macro cannot be the
 * unbraced body of an `if` that has an `else`. The semicolon is kept because
 * removing it would break any caller that does not add its own.
 */
#define USTR_FREE(ustr) do { if (ustr) { efree(ustr); } } while (0);
/* Size in bytes of len UChars. */
#define UBYTES(len) ((len) * sizeof(UChar))
/* Accessors for the .u member of str. */
#define USTR_LEN(str) u_strlen((str).u)
#define USTR_VAL(str) (str).u

/* Same computation as ZBYTES(): byte size of length characters of the given type. */
#define USTR_BYTES(type, length) \
	(((type) == IS_UNICODE) ? UBYTES(length) : (length))

/* Number of bytes taken by chars_len UChars. */
#define TEXT_BYTES(chars_len) \
	UBYTES(chars_len)

/* Number of whole UChars that fit in bytes_len bytes. */
#define TEXT_CHARS(bytes_len) \
	((bytes_len) / sizeof(UChar))

/*
 * USTR_MAKE / USTR_MAKE_REL: build a UChar copy of cs through zend_ascii_to_unicode().
 * The size passed on is sizeof(cs), so cs has to be a string literal or a char
 * array (for a pointer, sizeof would give the size of the pointer); that size
 * includes the terminating NUL.
 */
#define USTR_MAKE(cs) zend_ascii_to_unicode(cs, sizeof(cs) ZEND_FILE_LINE_CC)
#define USTR_MAKE_REL(cs) zend_ascii_to_unicode(cs, sizeof(cs) ZEND_FILE_LINE_RELAY_CC)

/*
 * Convert cs_size chars of cs into a buffer obtained from eumalloc_rel(cs_size)
 * and return that buffer.
 * NOTE(review): the buffer presumably has to be released by the caller (see
 * USTR_FREE) -- confirm.
 */
static inline UChar* zend_ascii_to_unicode(const char *cs, size_t cs_size ZEND_FILE_LINE_DC)
{
	UChar *ustr = eumalloc_rel(cs_size);

	/* When called through USTR_MAKE, cs_size counts the trailing NUL, so it is converted along with the text. */
	u_charsToUChars(cs, ustr, cs_size);

	return ustr;
}

#endif /* ZEND_UNICODE_H */