Improve explanations, reference appropriate RFCs and add some

exploratory math for the limits. git-svn-id: https://svn.apache.org/repos/asf/apr/apr/trunk@683665 13f79535-47bb-0310-9956-ffa450edef68
author: William A. Rowe Jr <wrowe@apache.org> 2008-08-07 17:53:39 +0000
committer: William A. Rowe Jr <wrowe@apache.org> 2008-08-07 17:53:39 +0000
commit: 86f8ed8e194d641bc712764eacff97a551d3d5dd (patch)
tree: f8e100f5b7d2dec39b8e338370aa2fd71472e4ad /misc
parent: a933a827982e603e9c2c56b471073ac425ba4af1 (diff)
download: apr-86f8ed8e194d641bc712764eacff97a551d3d5dd.tar.gz
1 files changed, 32 insertions, 27 deletions
diff --git a/misc/win32/utf8.c b/misc/win32/utf8.c
index b37dba44d..280f40647 100644
--- a/misc/win32/utf8.c
+++ b/misc/win32/utf8.c
@@ -19,31 +19,32 @@
 #include "apr_errno.h"
 #include "apr_arch_utf8.h"
 
-/* Implement the design principal specified by RFC 2718 2.2.5 
- * Guidelines for new URL Schemes - within the APR.
+/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
+ * with particular attention to canonical translation forms (see section 10
+ * "Security Considerations" of the RFC for more info).
  *
- * Since many architectures support unicode, and UCS2 is the most
- * efficient storage used by those archictures, these functions
- * exist to validate a UCS string.  It is up to the operating system
- * to determine the validitity of the string in the context of it's
- * native language support.  File systems that support filename 
- * characters of 0x80-0xff but have no support of Unicode will find 
- * this function useful only for validating the character sequences 
- * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
- * desired.
+ * Since several architectures including Windows support unicode, with UCS2
+ * used as the actual storage convention by that architecture, these functions
+ * exist to transform or validate UCS2 strings into APR's 'char' type
+ * convention.  It is left up to the operating system to determine the
+ * validity of the string, e.g. normative forms, in the context of
+ * its native language support.  Other file systems which support filename 
+ * characters of 0x80-0xff but have no explicit requirement for Unicode
+ * will find this function useful only for validating the character sequences 
+ * and rejecting poorly encoded UTF8 sequences.
  *
- * from RFC 2279 UTF-8, a transformation format of ISO 10646
+ * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F 0xxxxxxx
+ * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
+ * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
+ *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  *
- *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
- * 1:2 0000 0000-0000 007F   0xxxxxxx
- * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
- * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
- * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
- * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * One of the X bits must be 1 to avoid overlong representation of ucs2 values. 
  *
- * One of the X values must be one for the encoding length to be legit.
- * Neither the z bit, nor the final two forms, are used for ucs-2
+ * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
+ * and the final two forms are used only by full ucs4, per RFC 3629;
  *
  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in 
  *   Unicode parlance), being actually UCS-4 characters transformed 
@@ -51,16 +52,20 @@
  *   must be undone, yielding a UCS-4 character that is then transformed 
  *   as above."
  *
- * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  *
  *  U' = U - 0x10000
- *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
- *                  W1 = 110110yyyyyyyyyy
- *                  W2 = 110111xxxxxxxxxx
+ *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
+ *                    W1 = 110110yy yyyyyyyy
+ *                    W2 = 110111xx xxxxxxxx
+ *  Max U' = 0000 00001111 11111111 11111111
+ *  Max U  = 0000 00010000 11111111 11111111
  *
- * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
+ * which results in these conclusions of maximum allocations;
  *
- * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ *  apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ *  apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  */
 
 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
author	William A. Rowe Jr <wrowe@apache.org>	2008-08-07 17:53:39 +0000
committer	William A. Rowe Jr <wrowe@apache.org>	2008-08-07 17:53:39 +0000
commit	86f8ed8e194d641bc712764eacff97a551d3d5dd (patch)
tree	f8e100f5b7d2dec39b8e338370aa2fd71472e4ad /misc
parent	a933a827982e603e9c2c56b471073ac425ba4af1 (diff)
download	apr-86f8ed8e194d641bc712764eacff97a551d3d5dd.tar.gz