summary | refs | log | tree | commit | diff
path: root/regexec.c
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2016-12-20 13:41:58 -0700
committer: Karl Williamson <khw@cpan.org> 2016-12-23 22:52:44 -0700
commit: 94749a5ed2171bb6de72e384a78f5df552d812bb (patch)
tree: cecdd172789047a484c5623cd15f1bce424fc1f1 /regexec.c
parent: 9e7ded3f8151b7f66398bfd77fca0565ee90166a (diff)
download: perl-94749a5ed2171bb6de72e384a78f5df552d812bb.tar.gz
Deprecate non-grapheme string delimiter
In order for Perl to eventually allow string delimiters to be Unicode grapheme clusters (which look like a single character, but may be multiple ones), we have to stop allowing a single char delimiter that isn't a grapheme by itself. These are unlikely to exist in actual code, as they would typically display as attached to the character in front of them, but we should be sure.
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 58
1 files changed, 58 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index 5c5241c57a..23b2d3f248 100644
--- a/regexec.c
+++ b/regexec.c
@@ -9704,6 +9704,64 @@ S_to_byte_substr(pTHX_ regexp *prog)
return TRUE;
}
+bool
+Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
+{
+    /* Temporary helper function for toke.c.  Returns TRUE if the code point
+     * 'cp', whose UTF-8 representation starts at 's' within the larger string
+     * bounded by 'strbeg' and 'strend', is a grapheme all by itself; FALSE
+     * otherwise.
+     *
+     * Two conditions must both hold:
+     *  1) 'cp' is assigned.  (If it is not, a future version of the Unicode
+     *     Standard could make it something that combines with adjacent
+     *     characters, so code using it would then break.)
+     *  2) There is a GCB break both immediately before and immediately after
+     *     the character. */
+
+    GCB_enum this_gcb, before_gcb, after_gcb;
+    const U8 * before;  /* Start of the character preceding 's' */
+    const U8 * after;   /* Start of the character following 's' */
+
+    PERL_ARGS_ASSERT__IS_GRAPHEME;
+
+    /* Condition 1: reject any code point that is not in the Assigned list */
+    if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
+                                _invlist_search(PL_Assigned_invlist, cp))))
+    {
+        return FALSE;
+    }
+
+    this_gcb = getGCB_VAL_CP(cp);
+
+    /* Condition 2, leading side.  Step back one character, never going
+     * earlier than 'strbeg'.  Getting 's' itself back is treated as there
+     * being no previous character, so the neighbor is the string edge. */
+    before = utf8_hop_back(s, -1, strbeg);
+    before_gcb = (UNLIKELY(before == s))
+                 ? GCB_EDGE
+                 : getGCB_VAL_UTF8(before, strend);
+
+    if (! isGCB(before_gcb, this_gcb, strbeg, s,
+                TRUE /* is UTF-8 encoded */ ))
+    {
+        return FALSE;
+    }
+
+    /* Condition 2, trailing side.  Skip over the current character; landing
+     * at or past 'strend' means the neighbor is again the string edge. */
+    after = s + UTF8SKIP(s);
+    after_gcb = (after >= strend)
+                ? GCB_EDGE
+                : getGCB_VAL_UTF8(after, strend);
+
+    return isGCB(this_gcb, after_gcb, strbeg, after, TRUE);
+}
+
+
+
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/