summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-01-20 16:15:38 -0700
committerKarl Williamson <khw@cpan.org>2016-02-03 10:34:23 -0700
commit7e54b87f95738233dec6d22ca103db816747c518 (patch)
treef456d3661b44ca87b463ed023d1ba0383ff78198 /regexec.c
parentf39ff1f37d442c1701859527bbfca27427551179 (diff)
downloadperl-7e54b87f95738233dec6d22ca103db816747c518.tar.gz
Use table lookup for qr/\b{wb}/
This follows the recent commits for lb and gcb, and generates a table at regen time for Word Breaking. The result may run faster, depending on the compiler optimization capabilities, than before, and is easier to maintain, as it's easier to smack a new rule into the regen perl script than it is to change the C code.
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c254
1 files changed, 103 insertions, 151 deletions
diff --git a/regexec.c b/regexec.c
index 057e9936eb..f032096137 100644
--- a/regexec.c
+++ b/regexec.c
@@ -4788,8 +4788,6 @@ S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_tar
return sb;
}
-#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
-
STATIC bool
S_isWB(pTHX_ WB_enum previous,
WB_enum before,
@@ -4811,190 +4809,144 @@ S_isWB(pTHX_ WB_enum previous,
U8 * before_pos = (U8 *) curpos;
U8 * after_pos = (U8 *) curpos;
+ WB_enum prev = before;
+ WB_enum next;
PERL_ARGS_ASSERT_ISWB;
- /* WB1 and WB2: Break at the start and end of text. */
- if (before == WB_EDGE || after == WB_EDGE) {
- return TRUE;
- }
+ /* Rule numbers in the comments below are as of Unicode 8.0 */
- /* WB 3 is: "Do not break within CRLF." Perl extends this so that all
- * white space sequences ending in a vertical space are treated as one
- * unit. */
+ redo:
+ before = prev;
+ switch (WB_table[before][after]) {
+ case WB_BREAKABLE:
+ return TRUE;
- if (after == WB_CR || after == WB_LF || after == WB_Newline) {
- if (before == WB_CR || before == WB_LF || before == WB_Newline
- || before == WB_Perl_Tailored_HSpace)
- {
+ case WB_NOBREAK:
return FALSE;
- }
-
- /* WB 3a: Otherwise break before Newlines (including CR and LF) */
- return TRUE;
- }
- /* Here, we know that 'after' is not a vertical space character, but
- * 'before' could be. WB 3b is: "Otherwise break after Newlines (including
- * CR and LF)." Perl changes that to not break-up spans of white space,
- * except when horizontal space is followed by an Extend or Format
- * character. These apply just to the final white space character in the
- * span, so it is broken away from the rest. (If the Extend or Format
- * character follows a vertical space character, it is treated as beginning
- * a line, and doesn't modify the preceeding character.) */
- if ( before == WB_CR || before == WB_LF || before == WB_Newline
- || before == WB_Perl_Tailored_HSpace)
- {
- if (after == WB_Perl_Tailored_HSpace) {
- U8 * temp_pos = (U8 *) curpos;
- const WB_enum next
- = advance_one_WB(&temp_pos, strend, utf8_target,
+ case WB_hs_then_hs: /* 2 horizontal spaces in a row */
+ next = advance_one_WB(&after_pos, strend, utf8_target,
FALSE /* Don't skip Extend nor Format */ );
+ /* A space immediately preceeding an Extend or Format is attached
+ * to by them, and hence gets separated from previous spaces.
+ * Otherwise don't break between horizontal white space */
return next == WB_Extend || next == WB_Format;
- }
- else if (before != WB_Perl_Tailored_HSpace) {
- /* Here, 'before' must be one of the vertical space characters, and
- * after is not any type of white-space. Follow WB 3b. */
- return TRUE;
- }
+ /* WB4 Ignore Format and Extend characters, except when they appear at
+ * the beginning of a region of text. This code currently isn't
+ * general purpose, but it works as the rules are currently and likely
+ * to be laid out. The reason it works is that when 'they appear at
+ * the beginning of a region of text', the rule is to break before
+ * them, just like any other character. Therefore, the default rule
+ * applies and we don't have to look in more depth. Should this ever
+ * change, we would have to have 2 'case' statements, like in the
+ * rules below, and backup a single character (not spacing over the
+ * extend ones) and then see if that is one of the region-end
+ * characters and go from there */
+ case WB_Ex_or_FO_then_foo:
+ prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ goto redo;
- /* Here, 'before' is horizontal space, and 'after' is not any kind of
- * space. Normal rules apply */
- }
+ case WB_DQ_then_HL + WB_BREAKABLE:
+ case WB_DQ_then_HL + WB_NOBREAK:
- /* Ignore Format and Extend characters, except when they appear at the
- * beginning of a region of text.
- * WB4. X (Extend | Format)* → X. */
+ /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
- if (after == WB_Extend || after == WB_Format) {
- return FALSE;
- }
+ if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ == WB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
- if (before == WB_Extend || before == WB_Format) {
- before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
- }
+ return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
- switch (WBcase(before, after)) {
- /* Otherwise, break everywhere (including around ideographs).
- WB14. Any ÷ Any */
- default:
- return TRUE;
+ case WB_HL_then_DQ + WB_BREAKABLE:
+ case WB_HL_then_DQ + WB_NOBREAK:
- /* Do not break between most letters.
- WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
- case WBcase(WB_ALetter, WB_ALetter):
- case WBcase(WB_ALetter, WB_Hebrew_Letter):
- case WBcase(WB_Hebrew_Letter, WB_ALetter):
- case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
+ /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
+
+ if (advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ == WB_Hebrew_Letter)
+ {
return FALSE;
+ }
+
+ return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
- /* Do not break letters across certain punctuation.
- WB6. (ALetter | Hebrew_Letter)
- × (MidLetter | MidNumLet | Single_Quote) (ALetter
- | Hebrew_Letter) */
- case WBcase(WB_ALetter, WB_MidLetter):
- case WBcase(WB_ALetter, WB_MidNumLet):
- case WBcase(WB_ALetter, WB_Single_Quote):
- case WBcase(WB_Hebrew_Letter, WB_MidLetter):
- case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
- /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
- after = advance_one_WB(&after_pos, strend, utf8_target,
+ case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
+ case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
+
+ /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
+ * | Single_Quote) (ALetter | Hebrew_Letter) */
+
+ next = advance_one_WB(&after_pos, strend, utf8_target,
TRUE /* Do skip Extend and Format */ );
- return after != WB_ALetter && after != WB_Hebrew_Letter;
-
- /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
- * Single_Quote) × (ALetter | Hebrew_Letter) */
- case WBcase(WB_MidLetter, WB_ALetter):
- case WBcase(WB_MidLetter, WB_Hebrew_Letter):
- case WBcase(WB_MidNumLet, WB_ALetter):
- case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
- case WBcase(WB_Single_Quote, WB_ALetter):
- case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
- before
- = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
- return before != WB_ALetter && before != WB_Hebrew_Letter;
-
- /* WB7a. Hebrew_Letter × Single_Quote */
- case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
+
+ if (next == WB_ALetter || next == WB_Hebrew_Letter)
+ {
return FALSE;
+ }
- /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
- case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
- return advance_one_WB(&after_pos, strend, utf8_target,
- TRUE /* Do skip Extend and Format */ )
- != WB_Hebrew_Letter;
+ return WB_table[before][after]
+ - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
- /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
- case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
- return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != WB_Hebrew_Letter;
+ case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
+ case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
- /* Do not break within sequences of digits, or digits adjacent to
- * letters (“3a”, or “A3”).
- WB8. Numeric × Numeric */
- case WBcase(WB_Numeric, WB_Numeric):
- return FALSE;
+ /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
+ * | Single_Quote) × (ALetter | Hebrew_Letter) */
- /* WB9. (ALetter | Hebrew_Letter) × Numeric */
- case WBcase(WB_ALetter, WB_Numeric):
- case WBcase(WB_Hebrew_Letter, WB_Numeric):
+ prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
+ {
return FALSE;
+ }
- /* WB10. Numeric × (ALetter | Hebrew_Letter) */
- case WBcase(WB_Numeric, WB_ALetter):
- case WBcase(WB_Numeric, WB_Hebrew_Letter):
- return FALSE;
+ return WB_table[before][after]
+ - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
- /* Do not break within sequences, such as “3.2” or “3,456.789”.
- WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
- */
- case WBcase(WB_MidNum, WB_Numeric):
- case WBcase(WB_MidNumLet, WB_Numeric):
- case WBcase(WB_Single_Quote, WB_Numeric):
- return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != WB_Numeric;
-
- /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
- * */
- case WBcase(WB_Numeric, WB_MidNum):
- case WBcase(WB_Numeric, WB_MidNumLet):
- case WBcase(WB_Numeric, WB_Single_Quote):
- return advance_one_WB(&after_pos, strend, utf8_target,
- TRUE /* Do skip Extend and Format */ )
- != WB_Numeric;
-
- /* Do not break between Katakana.
- WB13. Katakana × Katakana */
- case WBcase(WB_Katakana, WB_Katakana):
- return FALSE;
+ case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
+ case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
- /* Do not break from extenders.
- WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
- ExtendNumLet) × ExtendNumLet */
- case WBcase(WB_ALetter, WB_ExtendNumLet):
- case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
- case WBcase(WB_Numeric, WB_ExtendNumLet):
- case WBcase(WB_Katakana, WB_ExtendNumLet):
- case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
- return FALSE;
+ /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
+ * */
- /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
- * | Katakana) */
- case WBcase(WB_ExtendNumLet, WB_ALetter):
- case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
- case WBcase(WB_ExtendNumLet, WB_Numeric):
- case WBcase(WB_ExtendNumLet, WB_Katakana):
+ if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ == WB_Numeric)
+ {
return FALSE;
+ }
- /* Do not break between regional indicator symbols.
- WB13c. Regional_Indicator × Regional_Indicator */
- case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
+ return WB_table[before][after]
+ - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
+
+ case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
+ case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
+
+ /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
+
+ if (advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ == WB_Numeric)
+ {
return FALSE;
+ }
+
+ return WB_table[before][after]
+ - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
+ default:
+ break;
}
- NOT_REACHED; /* NOTREACHED */
+#ifdef DEBUGGING
+ PerlIO_printf(Perl_error_log, "Unhandled WB pair: WB_table[%d, %d] = %d\n",
+ before, after, WB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
}
STATIC WB_enum