summaryrefslogtreecommitdiff
path: root/misc
diff options
context:
space:
mode:
authorWilliam A. Rowe Jr <wrowe@apache.org>2008-08-07 17:53:39 +0000
committerWilliam A. Rowe Jr <wrowe@apache.org>2008-08-07 17:53:39 +0000
commit86f8ed8e194d641bc712764eacff97a551d3d5dd (patch)
treef8e100f5b7d2dec39b8e338370aa2fd71472e4ad /misc
parenta933a827982e603e9c2c56b471073ac425ba4af1 (diff)
downloadapr-86f8ed8e194d641bc712764eacff97a551d3d5dd.tar.gz
Improve explanations, reference appropriate RFC's and add some
exploratory math for the limits. git-svn-id: https://svn.apache.org/repos/asf/apr/apr/trunk@683665 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'misc')
-rw-r--r--misc/win32/utf8.c59
1 files changed, 32 insertions, 27 deletions
diff --git a/misc/win32/utf8.c b/misc/win32/utf8.c
index b37dba44d..280f40647 100644
--- a/misc/win32/utf8.c
+++ b/misc/win32/utf8.c
@@ -19,31 +19,32 @@
#include "apr_errno.h"
#include "apr_arch_utf8.h"
-/* Implement the design principal specified by RFC 2718 2.2.5
- * Guidelines for new URL Schemes - within the APR.
+/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
+ * with particular attention to canonical translation forms (see section 10
+ * "Security Considerations" of the RFC for more info).
*
- * Since many architectures support unicode, and UCS2 is the most
- * efficient storage used by those archictures, these functions
- * exist to validate a UCS string. It is up to the operating system
- * to determine the validitity of the string in the context of it's
- * native language support. File systems that support filename
- * characters of 0x80-0xff but have no support of Unicode will find
- * this function useful only for validating the character sequences
- * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
- * desired.
+ * Since several architectures including Windows support unicode, with UCS2
+ * used as the actual storage conventions by that architecture, these functions
+ * exist to transform or validate UCS2 strings into APR's 'char' type
+ * convention. It is left up to the operating system to determine the
+ * validity of the string, e.g. normative forms, in the context of
+ * its native language support. Other file systems which support filename
+ * characters of 0x80-0xff but have no explicit requirement for Unicode
+ * will find this function useful only for validating the character sequences
+ * and rejecting poorly encoded UTF8 sequences.
*
- * from RFC 2279 UTF-8, a transformation format of ISO 10646
+ * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F 0xxxxxxx
+ * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
+ * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 1:2 0000 0000-0000 007F 0xxxxxxx
- * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx
- * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx
- * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
- * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
*
- * One of the X values must be one for the encoding length to be legit.
- * Neither the z bit, nor the final two forms, are used for ucs-2
+ * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs4, per RFC 3629;
*
* "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
* Unicode parlance), being actually UCS-4 characters transformed
@@ -51,16 +52,20 @@
* must be undone, yielding a UCS-4 character that is then transformed
* as above."
*
- * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
*
* U' = U - 0x10000
- * U' = 000000000000yyyyyyyyyyxxxxxxxxxx
- * W1 = 110110yyyyyyyyyy
- * W2 = 110111xxxxxxxxxx
+ * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ * W1 = 110110yy yyyyyyyy
+ * W2 = 110111xx xxxxxxxx
+ * Max U' = 0000 00001111 11111111 11111111
+ * Max U = 0000 00010000 11111111 11111111
*
- * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
+ * which results in these conclusions of maximum allocations;
*
- * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
*/
APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,