summary refs log tree commit diff
path: root/utf8.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2016-12-15 16:30:27 -0700
committer Karl Williamson <khw@cpan.org> 2016-12-23 16:48:34 -0700
commit da8c1a98236a9f56df850c47705cb3046d6636aa (patch)
tree 678fc6d876d96346a0beb5b19b9b9249c17df9ea /utf8.h
parent 9dfb44ee59033dc1f1f858d46a05a3f3c8ce85d9 (diff)
download perl-da8c1a98236a9f56df850c47705cb3046d6636aa.tar.gz
Add isFOO_utf8_safe() macros
The original API does not check that we aren't reading beyond the end of a buffer, apparently assuming that we could keep malformed UTF-8 out by use of gatekeepers, but that is currently impossible. This commit adds "safe" macros for determining if a UTF-8 sequence represents an alphabetic, a digit, etc. Each new macro has an extra parameter pointing to the end of the sequence, so that looking beyond the input string can be avoided. The macros aren't currently completely safe, as they don't test that there is at least a single valid byte in the input, except by an assertion in DEBUGGING builds. This is because typically they are called in code that makes that assumption, and frequently tests the current byte for one thing or another.
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 11
1 files changed, 11 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index f6d9d541db..556803958f 100644
--- a/utf8.h
+++ b/utf8.h
@@ -680,6 +680,17 @@ with a ptr argument.
: isWORDCHAR_utf8((const U8*)p))
#define isALNUM_lazy_if(p,UTF) isWORDCHAR_lazy_if(p,UTF) /* alternate name: expands to isWORDCHAR_lazy_if with the same arguments */
+#define isIDFIRST_lazy_if_safe(p, e, UTF) /* 'e' points to the end of the input that starts at 'p' */ \
+ ((IN_BYTES || !(UTF)) /* UTF is parenthesized so a compound argument is negated as a whole */ \
+ ? isIDFIRST(*(p)) /* IN_BYTES set or UTF false: classify the single byte *p; 'e' is not consulted here */ \
+ : isIDFIRST_utf8_safe((p), (e))) /* otherwise: UTF-8 classification that is given the end pointer */
+
+#define isWORDCHAR_lazy_if_safe(p, e, UTF) /* 'e' points to the end of the input that starts at 'p' */ \
+ ((IN_BYTES || !(UTF)) /* UTF is parenthesized so a compound argument is negated as a whole */ \
+ ? isWORDCHAR(*(p)) /* IN_BYTES set or UTF false: classify the single byte *p; 'e' is not consulted here */ \
+ : isWORDCHAR_utf8_safe((U8 *) (p), (U8 *) (e))) /* casts now apply to the whole of each argument expression */
+
+
#define UTF8_MAXLEN UTF8_MAXBYTES /* alternate name for UTF8_MAXBYTES */
/* A Unicode character can fold to up to 3 characters */