diff options
author | Karl Williamson <khw@cpan.org> | 2016-01-20 16:15:38 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-02-03 10:34:23 -0700 |
commit | 7e54b87f95738233dec6d22ca103db816747c518 (patch) | |
tree | f456d3661b44ca87b463ed023d1ba0383ff78198 /regexec.c | |
parent | f39ff1f37d442c1701859527bbfca27427551179 (diff) | |
download | perl-7e54b87f95738233dec6d22ca103db816747c518.tar.gz |
Use table lookup for qr/\b{wb}/
This follows the recent commits for lb and gcb, and generates a table at
regen time for Word Breaking. The result may run faster, depending on
the compiler optimization capabilities, than before, and is easier to
maintain, as it's easier to smack a new rule into the regen perl script
than it is to change the C code.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 254 |
1 files changed, 103 insertions, 151 deletions
@@ -4788,8 +4788,6 @@ S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_tar return sb; } -#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after) - STATIC bool S_isWB(pTHX_ WB_enum previous, WB_enum before, @@ -4811,190 +4809,144 @@ S_isWB(pTHX_ WB_enum previous, U8 * before_pos = (U8 *) curpos; U8 * after_pos = (U8 *) curpos; + WB_enum prev = before; + WB_enum next; PERL_ARGS_ASSERT_ISWB; - /* WB1 and WB2: Break at the start and end of text. */ - if (before == WB_EDGE || after == WB_EDGE) { - return TRUE; - } + /* Rule numbers in the comments below are as of Unicode 8.0 */ - /* WB 3 is: "Do not break within CRLF." Perl extends this so that all - * white space sequences ending in a vertical space are treated as one - * unit. */ + redo: + before = prev; + switch (WB_table[before][after]) { + case WB_BREAKABLE: + return TRUE; - if (after == WB_CR || after == WB_LF || after == WB_Newline) { - if (before == WB_CR || before == WB_LF || before == WB_Newline - || before == WB_Perl_Tailored_HSpace) - { + case WB_NOBREAK: return FALSE; - } - - /* WB 3a: Otherwise break before Newlines (including CR and LF) */ - return TRUE; - } - /* Here, we know that 'after' is not a vertical space character, but - * 'before' could be. WB 3b is: "Otherwise break after Newlines (including - * CR and LF)." Perl changes that to not break-up spans of white space, - * except when horizontal space is followed by an Extend or Format - * character. These apply just to the final white space character in the - * span, so it is broken away from the rest. (If the Extend or Format - * character follows a vertical space character, it is treated as beginning - * a line, and doesn't modify the preceeding character.) */ - if ( before == WB_CR || before == WB_LF || before == WB_Newline - || before == WB_Perl_Tailored_HSpace) - { - if (after == WB_Perl_Tailored_HSpace) { - U8 * temp_pos = (U8 *) curpos; - const WB_enum next - = advance_one_WB(&temp_pos, strend, utf8_target, + case WB_hs_then_hs: /* 2 horizontal spaces in a row */ + next = advance_one_WB(&after_pos, strend, utf8_target, FALSE /* Don't skip Extend nor Format */ ); + /* A space immediately preceeding an Extend or Format is attached + * to by them, and hence gets separated from previous spaces. + * Otherwise don't break between horizontal white space */ return next == WB_Extend || next == WB_Format; - } - else if (before != WB_Perl_Tailored_HSpace) { - /* Here, 'before' must be one of the vertical space characters, and - * after is not any type of white-space. Follow WB 3b. */ - return TRUE; - } + /* WB4 Ignore Format and Extend characters, except when they appear at + * the beginning of a region of text. This code currently isn't + * general purpose, but it works as the rules are currently and likely + * to be laid out. The reason it works is that when 'they appear at + * the beginning of a region of text', the rule is to break before + * them, just like any other character. Therefore, the default rule + * applies and we don't have to look in more depth. Should this ever + * change, we would have to have 2 'case' statements, like in the + * rules below, and backup a single character (not spacing over the + * extend ones) and then see if that is one of the region-end + * characters and go from there */ + case WB_Ex_or_FO_then_foo: + prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); + goto redo; - /* Here, 'before' is horizontal space, and 'after' is not any kind of - * space. Normal rules apply */ - } + case WB_DQ_then_HL + WB_BREAKABLE: + case WB_DQ_then_HL + WB_NOBREAK: - /* Ignore Format and Extend characters, except when they appear at the - * beginning of a region of text. - * WB4. X (Extend | Format)* → X. */ + /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */ - if (after == WB_Extend || after == WB_Format) { - return FALSE; - } + if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) + == WB_Hebrew_Letter) + { + return FALSE; + } - if (before == WB_Extend || before == WB_Format) { - before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); - } + return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; - switch (WBcase(before, after)) { - /* Otherwise, break everywhere (including around ideographs). - WB14. Any ÷ Any */ - default: - return TRUE; + case WB_HL_then_DQ + WB_BREAKABLE: + case WB_HL_then_DQ + WB_NOBREAK: - /* Do not break between most letters. - WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */ - case WBcase(WB_ALetter, WB_ALetter): - case WBcase(WB_ALetter, WB_Hebrew_Letter): - case WBcase(WB_Hebrew_Letter, WB_ALetter): - case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter): + /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */ + + if (advance_one_WB(&after_pos, strend, utf8_target, + TRUE /* Do skip Extend and Format */ ) + == WB_Hebrew_Letter) + { return FALSE; + } + + return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; - /* Do not break letters across certain punctuation. - WB6. (ALetter | Hebrew_Letter) - × (MidLetter | MidNumLet | Single_Quote) (ALetter - | Hebrew_Letter) */ - case WBcase(WB_ALetter, WB_MidLetter): - case WBcase(WB_ALetter, WB_MidNumLet): - case WBcase(WB_ALetter, WB_Single_Quote): - case WBcase(WB_Hebrew_Letter, WB_MidLetter): - case WBcase(WB_Hebrew_Letter, WB_MidNumLet): - /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/ - after = advance_one_WB(&after_pos, strend, utf8_target, + case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: + case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: + + /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet + * | Single_Quote) (ALetter | Hebrew_Letter) */ + + next = advance_one_WB(&after_pos, strend, utf8_target, TRUE /* Do skip Extend and Format */ ); - return after != WB_ALetter && after != WB_Hebrew_Letter; - - /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | - * Single_Quote) × (ALetter | Hebrew_Letter) */ - case WBcase(WB_MidLetter, WB_ALetter): - case WBcase(WB_MidLetter, WB_Hebrew_Letter): - case WBcase(WB_MidNumLet, WB_ALetter): - case WBcase(WB_MidNumLet, WB_Hebrew_Letter): - case WBcase(WB_Single_Quote, WB_ALetter): - case WBcase(WB_Single_Quote, WB_Hebrew_Letter): - before - = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); - return before != WB_ALetter && before != WB_Hebrew_Letter; - - /* WB7a. Hebrew_Letter × Single_Quote */ - case WBcase(WB_Hebrew_Letter, WB_Single_Quote): + + if (next == WB_ALetter || next == WB_Hebrew_Letter) + { return FALSE; + } - /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */ - case WBcase(WB_Hebrew_Letter, WB_Double_Quote): - return advance_one_WB(&after_pos, strend, utf8_target, - TRUE /* Do skip Extend and Format */ ) - != WB_Hebrew_Letter; + return WB_table[before][after] + - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; - /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */ - case WBcase(WB_Double_Quote, WB_Hebrew_Letter): - return backup_one_WB(&previous, strbeg, &before_pos, utf8_target) - != WB_Hebrew_Letter; + case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: + case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: - /* Do not break within sequences of digits, or digits adjacent to - * letters (“3a”, or “A3”). - WB8. Numeric × Numeric */ - case WBcase(WB_Numeric, WB_Numeric): - return FALSE; + /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet + * | Single_Quote) × (ALetter | Hebrew_Letter) */ - /* WB9. (ALetter | Hebrew_Letter) × Numeric */ - case WBcase(WB_ALetter, WB_Numeric): - case WBcase(WB_Hebrew_Letter, WB_Numeric): + prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); + if (prev == WB_ALetter || prev == WB_Hebrew_Letter) + { return FALSE; + } - /* WB10. Numeric × (ALetter | Hebrew_Letter) */ - case WBcase(WB_Numeric, WB_ALetter): - case WBcase(WB_Numeric, WB_Hebrew_Letter): - return FALSE; + return WB_table[before][after] + - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; - /* Do not break within sequences, such as “3.2” or “3,456.789”. - WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric - */ - case WBcase(WB_MidNum, WB_Numeric): - case WBcase(WB_MidNumLet, WB_Numeric): - case WBcase(WB_Single_Quote, WB_Numeric): - return backup_one_WB(&previous, strbeg, &before_pos, utf8_target) - != WB_Numeric; - - /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric - * */ - case WBcase(WB_Numeric, WB_MidNum): - case WBcase(WB_Numeric, WB_MidNumLet): - case WBcase(WB_Numeric, WB_Single_Quote): - return advance_one_WB(&after_pos, strend, utf8_target, - TRUE /* Do skip Extend and Format */ ) - != WB_Numeric; - - /* Do not break between Katakana. - WB13. Katakana × Katakana */ - case WBcase(WB_Katakana, WB_Katakana): - return FALSE; + case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: + case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: - /* Do not break from extenders. - WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | - ExtendNumLet) × ExtendNumLet */ - case WBcase(WB_ALetter, WB_ExtendNumLet): - case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet): - case WBcase(WB_Numeric, WB_ExtendNumLet): - case WBcase(WB_Katakana, WB_ExtendNumLet): - case WBcase(WB_ExtendNumLet, WB_ExtendNumLet): - return FALSE; + /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric + * */ - /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric - * | Katakana) */ - case WBcase(WB_ExtendNumLet, WB_ALetter): - case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter): - case WBcase(WB_ExtendNumLet, WB_Numeric): - case WBcase(WB_ExtendNumLet, WB_Katakana): + if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) + == WB_Numeric) + { return FALSE; + } - /* Do not break between regional indicator symbols. - WB13c. Regional_Indicator × Regional_Indicator */ - case WBcase(WB_Regional_Indicator, WB_Regional_Indicator): + return WB_table[before][after] + - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; + + case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: + case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: + + /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */ + + if (advance_one_WB(&after_pos, strend, utf8_target, + TRUE /* Do skip Extend and Format */ ) + == WB_Numeric) + { return FALSE; + } + + return WB_table[before][after] + - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; + default: + break; } - NOT_REACHED; /* NOTREACHED */ +#ifdef DEBUGGING + PerlIO_printf(Perl_error_log, "Unhandled WB pair: WB_table[%d, %d] = %d\n", + before, after, WB_table[before][after]); + assert(0); +#endif + return TRUE; } STATIC WB_enum |