summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
commitba210ebec161cde003bc967e8e460c72f71fb70c (patch)
tree7eefd78e8e365cbf64ddf49314681d17b83c3025 /utf8.h
parent177b92d2814bfc842f28f277e0a2f353c652a5e3 (diff)
downloadperl-ba210ebec161cde003bc967e8e460c72f71fb70c.tar.gz
Make the UTF-8 decoding stricter and more verbose when
malformation happens. This involved adding an argument to utf8_to_uv_chk(), which involved changing its prototype, and prefer STRLEN over I32 for the UTF-8 length, which as a domino effect necessitated changing the prototypes of scan_bin(), scan_oct(), scan_hex(), and reg_uni(). The stricter UTF-8 decoding checking uses Markus Kuhn's UTF-8 Decode Stress Tester from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt p4raw-id: //depot/perl@7416
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h4
1 files changed, 3 insertions, 1 deletions
diff --git a/utf8.h b/utf8.h
index 7407335806..bb494ab1ef 100644
--- a/utf8.h
+++ b/utf8.h
@@ -29,7 +29,7 @@ END_EXTERN_C
#define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
-/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/
+#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
@@ -53,6 +53,8 @@ END_EXTERN_C
(uv) < 0x80000000 ? 6 : 7 )
#endif
+#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
+
/*
* Note: we try to be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-8 character