summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2016-01-05 16:12:55 -0700
committer Karl Williamson <khw@cpan.org> 2016-01-08 14:17:11 -0700
commit f1f6961f5a6fd77a3e3c36f242f1b72ce5dfe205 (patch)
tree 52365bdb2759341217eb979be04a61f5b351eb2f /regexec.c
parent cbdbe9d466e0d26852ca1ace0825220c8ca7d215 (diff)
download perl-f1f6961f5a6fd77a3e3c36f242f1b72ce5dfe205.tar.gz
Tailor \b{wb} for Perl
The Unicode \b{wb} matches the boundary between space characters in a span of them. This is the opposite of what \b does, and is counterintuitive to Perl expectations. This commit tailors \b{wb} to not split up spans of white space. I have submitted a request to Unicode to re-examine their algorithm, and this has been assigned to a subcommittee to look at, but the result won't be available until after 5.24 is done. In any event, Unicode encourages tailoring for local conditions.
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 48
1 files changed, 40 insertions, 8 deletions
diff --git a/regexec.c b/regexec.c
index e6f07ca489..c057efe241 100644
--- a/regexec.c
+++ b/regexec.c
@@ -4534,7 +4534,8 @@ S_isWB(pTHX_ WB_enum previous,
const bool utf8_target)
{
/* Return a boolean as to if the boundary between 'before' and 'after' is
- * a Unicode word break, using their published algorithm. Context may be
+ * a Unicode word break, using their published algorithm, but tailored for
+ * Perl by treating spans of white space as one unit. Context may be
* needed to make this determination. If the value for the character
* before 'before' is known, it is passed as 'previous'; otherwise that
* should be set to WB_UNKNOWN. The other input parameters give the
@@ -4552,17 +4553,48 @@ S_isWB(pTHX_ WB_enum previous,
return TRUE;
}
- /* WB 3: Do not break within CRLF. */
- if (before == WB_CR && after == WB_LF) {
- return FALSE;
+ /* WB 3 is: "Do not break within CRLF." Perl extends this so that all
+ * white space sequences ending in a vertical space are treated as one
+ * unit. */
+
+ if (after == WB_CR || after == WB_LF || after == WB_Newline) {
+ if (before == WB_CR || before == WB_LF || before == WB_Newline
+ || before == WB_Perl_Tailored_HSpace)
+ {
+ return FALSE;
+ }
+
+ /* WB 3a: Otherwise break before Newlines (including CR and LF) */
+ return TRUE;
}
- /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
- * and LF) */
+ /* Here, we know that 'after' is not a vertical space character, but
+ * 'before' could be. WB 3b is: "Otherwise break after Newlines (including
+ * CR and LF)." Perl changes that to not break up spans of white space,
+ * except when horizontal space is followed by an Extend or Format
+ * character. These apply just to the final white space character in the
+ * span, so it is broken away from the rest. (If the Extend or Format
+ * character follows a vertical space character, it is treated as beginning
+ * a line, and doesn't modify the preceding character.) */
if ( before == WB_CR || before == WB_LF || before == WB_Newline
- || after == WB_CR || after == WB_LF || after == WB_Newline)
+ || before == WB_Perl_Tailored_HSpace)
{
- return TRUE;
+ if (after == WB_Perl_Tailored_HSpace) {
+ U8 * temp_pos = (U8 *) curpos;
+ const WB_enum next
+ = advance_one_WB(&temp_pos, strend, utf8_target,
+ FALSE /* Don't skip Extend nor Format */ );
+ return next == WB_Extend || next == WB_Format;
+ }
+ else if (before != WB_Perl_Tailored_HSpace) {
+
+ /* Here, 'before' must be one of the vertical space characters, and
+ * after is not any type of white-space. Follow WB 3b. */
+ return TRUE;
+ }
+
+ /* Here, 'before' is horizontal space, and 'after' is not any kind of
+ * space. Normal rules apply */
}
/* Ignore Format and Extend characters, except when they appear at the