Add macro OFFUNISKIP

OFFUNISKIP takes its argument in official Unicode code point numbering, not native numbering. Introducing it converts the existing UNISKIP calls in the code to refer to native code points, which is what they meant anyway. The UNISKIP terminology is somewhat ambiguous, but I don't think it will cause real confusion. NATIVE_SKIP is also introduced for situations where it is important to be precise.
author: Karl Williamson <public@khwilliamson.com> 2013-02-26 13:35:12 -0700
committer: Karl Williamson <public@khwilliamson.com> 2013-08-29 09:55:58 -0600
commit: 5aaebcb3428d61bb90e5f0cfcdee0166b5bcb64e (patch)
tree: 7dcca006e44aab2c3754075f7bc8b2a3bef2aa81 /utf8.h
parent: 233ca360345239b7e19039f64bb29a2d310015c6 (diff)
download: perl-5aaebcb3428d61bb90e5f0cfcdee0166b5bcb64e.tar.gz
1 files changed, 12 insertions, 2 deletions
diff --git a/utf8.h b/utf8.h
index b3bf997efb..1ecb3b82d5 100644
--- a/utf8.h
+++ b/utf8.h
@@ -231,7 +231,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
            - UTF_ACCUMULATION_SHIFT))
 
 #ifdef HAS_QUAD
-#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) ( (uv) < 0x80        ? 1 : \
 		      (uv) < 0x800          ? 2 : \
 		      (uv) < 0x10000        ? 3 : \
 		      (uv) < 0x200000       ? 4 : \
@@ -240,7 +241,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                       (uv) < UTF8_QUAD_MAX ? 7 : 13 )
 #else
 /* No, I'm not even going to *TRY* putting #ifdef inside a #define */
-#define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
+#define OFFUNISKIP(uv) ( (uv) < 0x80        ? 1 : \
 		      (uv) < 0x800          ? 2 : \
 		      (uv) < 0x10000        ? 3 : \
 		      (uv) < 0x200000       ? 4 : \
@@ -297,6 +298,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                                                && ( (e) - (s) > 1)             \
                                                && UTF8_IS_CONTINUATION(*((s)+1)))
 
+/* Number of bytes the native code point 'uv' occupies when encoded in UTF-8. */
+#define NATIVE_SKIP(uv) OFFUNISKIP(NATIVE_TO_UNI(uv))
+
+/* Most code that says UNISKIP is really thinking in terms of native code
+ * points (0-255, plus all those beyond).  The name is imprecise, but keeping
+ * it means existing code continues to work.  Where precision matters, use
+ * NATIVE_SKIP (native input) or OFFUNISKIP (true Unicode input). */
+#define UNISKIP(uv)   NATIVE_SKIP(uv)
+
 /* Convert a two (not one) byte utf8 character to a native code point value.
  * Needs just one iteration of accumulate.  Should not be used unless it is
  * known that the two bytes are legal: 1) two-byte start, and 2) continuation.
author	Karl Williamson <public@khwilliamson.com>	2013-02-26 13:35:12 -0700
committer	Karl Williamson <public@khwilliamson.com>	2013-08-29 09:55:58 -0600
commit	5aaebcb3428d61bb90e5f0cfcdee0166b5bcb64e (patch)
tree	7dcca006e44aab2c3754075f7bc8b2a3bef2aa81 /utf8.h
parent	233ca360345239b7e19039f64bb29a2d310015c6 (diff)
download	perl-5aaebcb3428d61bb90e5f0cfcdee0166b5bcb64e.tar.gz