handle perl extended utf8 start bytes

Perl uses UTF8_IS_START() to test whether a byte is a valid start byte, but the macro capped start bytes at 0xFD and so did not take Perl's extended UTF-8 range (start bytes up to 0xFF) into account.
author: Tony Cook <tony@develop-help.com> 2010-03-16 23:46:48 +1100
committer: Tony Cook <tony@develop-help.com> 2010-05-31 20:52:24 +1000
commit: 65ab9279784aa811d78b2903b57bc0e7947dec78 (patch)
tree: c8c63cd44705797a091def10c9b83a520d0a6cde /utf8.h
parent: e57ed4ecd4d7de38a79a316da8d657dad656f93f (diff)
download: perl-65ab9279784aa811d78b2903b57bc0e7947dec78.tar.gz
1 files changed, 3 insertions, 1 deletions
diff --git a/utf8.h b/utf8.h
index e58ddede6e..b0cfedf9e1 100644
--- a/utf8.h
+++ b/utf8.h
@@ -104,13 +104,15 @@ As you can see, the continuation bytes all begin with C<10>, and the
 leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
+Perl's extended UTF-8 means we can have start bytes up to FF.
+
 */
 
 
 #define UNI_IS_INVARIANT(c)		(((UV)c) <  0x80)
 /* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
  * below might ought to be C2 */
-#define UTF8_IS_START(c)		(((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+#define UTF8_IS_START(c)		(((U8)c) >= 0xc0)
 #define UTF8_IS_CONTINUATION(c)		(((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c) 		(((U8)c) &  0x80)
 #define UTF8_IS_DOWNGRADEABLE_START(c)	(((U8)c & 0xfc) == 0xc0)
author	Tony Cook <tony@develop-help.com>	2010-03-16 23:46:48 +1100
committer	Tony Cook <tony@develop-help.com>	2010-05-31 20:52:24 +1000
commit	65ab9279784aa811d78b2903b57bc0e7947dec78 (patch)
tree	c8c63cd44705797a091def10c9b83a520d0a6cde /utf8.h
parent	e57ed4ecd4d7de38a79a316da8d657dad656f93f (diff)
download	perl-65ab9279784aa811d78b2903b57bc0e7947dec78.tar.gz