summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-02-19 15:19:08 -0700
committerKarl Williamson <khw@cpan.org>2015-02-19 23:05:44 -0700
commitae3bb8eaaa75dd43e8b8c8e32e106da36f49dee7 (patch)
treed603c2149e568651c11035abc100775d6099eb68 /regexec.c
parent53255578b3ff804b2a8449c49c2ee29cccd56fb0 (diff)
downloadperl-ae3bb8eaaa75dd43e8b8c8e32e106da36f49dee7.tar.gz
Add qr/\b{wb}/
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c389
1 files changed, 389 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index 4819b89e9a..f0bb1b80bc 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1737,6 +1737,18 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
#define getGCB_VAL_UTF8(pos, strend) \
_generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+/* Word_Break property value of code point 'cp'.  Hands PL_WB_invlist and
+ * Word_Break_invmap, together with the code point, to the generic
+ * break-value macro */
+#define getWB_VAL_CP(cp)                                                       \
+    _generic_GET_BREAK_VAL_CP(PL_WB_invlist, Word_Break_invmap, (cp))
+
+/* Word_Break property value of the first code point of the UTF-8 encoded
+ * text that starts at 'pos' and is bounded by 'strend' */
+#define getWB_VAL_UTF8(pos, strend)                                            \
+    _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
+
/* We know what class REx starts with. Try to find this position... */
/* if reginfo->intuit, its a dryrun */
/* annoyingly all the vars in this routine have different names from their counterparts
@@ -2056,6 +2068,79 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
goto got_it;
}
break;
+
+ case WB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ }
+
+ if (utf8_target) {
+ /* We are at a boundary between char_sub_0 and char_sub_1.
+ * We also keep track of the value for char_sub_-1 as we
+ * loop through the line. Context may be needed to make a
+ * determination, and if so, this can save having to
+ * recalculate it */
+ PL_WB_enum previous = PL_WB_UNKNOWN;
+ PL_WB_enum before = getWB_VAL_UTF8(
+ reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ PL_WB_enum after = getWB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ previous = before;
+ before = after;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ PL_WB_enum previous = PL_WB_UNKNOWN;
+ PL_WB_enum before = getWB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ PL_WB_enum after = getWB_VAL_CP((U8) *s);
+ if (to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ previous = before;
+ before = after;
+ }
+ s++;
+ }
+ }
+
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+
+ break;
}
break;
@@ -4089,6 +4174,277 @@ S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
NOT_REACHED;
}
+/* Folds an ordered (before, after) pair of Word_Break values into a single
+ * integer, before * PL_WB_ENUM_COUNT + after, so that a pair can be used as
+ * a 'case' label.  The arguments are parenthesized in the expansion so that
+ * an argument that is itself an expression cannot change the grouping */
+#define WBcase(before, after) ((PL_WB_ENUM_COUNT * (before)) + (after))
+
+STATIC bool
+S_isWB(pTHX_ PL_WB_enum previous,
+ PL_WB_enum before,
+ PL_WB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ /* Return a boolean as to if the boundary between 'before' and 'after' is
+ * a Unicode word break, using their published algorithm. Context may be
+ * needed to make this determination. If the value for the character
+ * before 'before' is known, it is passed as 'previous'; otherwise that
+ * should be set to PL_WB_UNKNOWN. The other input parameters give the
+ * boundaries and current position in the matching of the string. That
+ * is, 'curpos' marks the position where the character whose wb value is
+ * 'after' begins. See http://www.unicode.org/reports/tr29/
+ *
+ * Returns TRUE if there is a break between 'before' and 'after', FALSE
+ * if there is not. 'strbeg', 'strend' and 'utf8_target' are only handed
+ * on to backup_one_WB() and advance_one_WB(), which fetch the values of
+ * neighboring characters when a rule needs more context. */
+
+ /* Private cursors, both starting at 'curpos', for those two helpers to
+ * move; 'curpos' itself is never modified */
+ U8 * before_pos = (U8 *) curpos;
+ U8 * after_pos = (U8 *) curpos;
+
+ PERL_ARGS_ASSERT_ISWB;
+
+ /* WB1 and WB2: Break at the start and end of text. */
+ if (before == PL_WB_EDGE || after == PL_WB_EDGE) {
+ return TRUE;
+ }
+
+ /* WB 3: Do not break within CRLF. */
+ if (before == PL_WB_CR && after == PL_WB_LF) {
+ return FALSE;
+ }
+
+ /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
+ * and LF) */
+ if ( before == PL_WB_CR || before == PL_WB_LF || before == PL_WB_Newline
+ || after == PL_WB_CR || after == PL_WB_LF || after == PL_WB_Newline)
+ {
+ return TRUE;
+ }
+
+ /* Ignore Format and Extend characters, except when they appear at the
+ * beginning of a region of text.
+ * WB4. X (Extend | Format)* → X. */
+
+ if (after == PL_WB_Extend || after == PL_WB_Format) {
+ return FALSE;
+ }
+
+ /* When the left-hand character is itself ignorable, judge the boundary by
+ * the value backup_one_WB() returns for what precedes it instead */
+ if (before == PL_WB_Extend || before == PL_WB_Format) {
+ before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ }
+
+ /* Every arm of this switch returns */
+ switch (WBcase(before, after)) {
+ /* Otherwise, break everywhere (including around ideographs).
+ WB14. Any ÷ Any */
+ default:
+ return TRUE;
+
+ /* Do not break between most letters.
+ WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
+ case WBcase(PL_WB_ALetter, PL_WB_ALetter):
+ case WBcase(PL_WB_ALetter, PL_WB_Hebrew_Letter):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_ALetter):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_Hebrew_Letter):
+ return FALSE;
+
+ /* Do not break letters across certain punctuation.
+ WB6. (ALetter | Hebrew_Letter)
+ × (MidLetter | MidNumLet | Single_Quote) (ALetter
+ | Hebrew_Letter) */
+ case WBcase(PL_WB_ALetter, PL_WB_MidLetter):
+ case WBcase(PL_WB_ALetter, PL_WB_MidNumLet):
+ case WBcase(PL_WB_ALetter, PL_WB_Single_Quote):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidLetter):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidNumLet):
+ /* The (Hebrew_Letter, Single_Quote) pair is deliberately absent from
+ * this group: it has its own case under WB7a below, which never
+ * breaks */
+ /* No break only if the character following 'after' is a letter */
+ after = advance_one_WB(&after_pos, strend, utf8_target);
+ return after != PL_WB_ALetter && after != PL_WB_Hebrew_Letter;
+
+ /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
+ * Single_Quote) × (ALetter | Hebrew_Letter) */
+ case WBcase(PL_WB_MidLetter, PL_WB_ALetter):
+ case WBcase(PL_WB_MidLetter, PL_WB_Hebrew_Letter):
+ case WBcase(PL_WB_MidNumLet, PL_WB_ALetter):
+ case WBcase(PL_WB_MidNumLet, PL_WB_Hebrew_Letter):
+ case WBcase(PL_WB_Single_Quote, PL_WB_ALetter):
+ case WBcase(PL_WB_Single_Quote, PL_WB_Hebrew_Letter):
+ /* No break only if the character preceding 'before' is a letter */
+ before
+ = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ return before != PL_WB_ALetter && before != PL_WB_Hebrew_Letter;
+
+ /* WB7a. Hebrew_Letter × Single_Quote */
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):
+ return FALSE;
+
+ /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_Double_Quote):
+ return advance_one_WB(&after_pos, strend, utf8_target)
+ != PL_WB_Hebrew_Letter;
+
+ /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
+ case WBcase(PL_WB_Double_Quote, PL_WB_Hebrew_Letter):
+ return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ != PL_WB_Hebrew_Letter;
+
+ /* Do not break within sequences of digits, or digits adjacent to
+ * letters (“3a”, or “A3”).
+ WB8. Numeric × Numeric */
+ case WBcase(PL_WB_Numeric, PL_WB_Numeric):
+ return FALSE;
+
+ /* WB9. (ALetter | Hebrew_Letter) × Numeric */
+ case WBcase(PL_WB_ALetter, PL_WB_Numeric):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_Numeric):
+ return FALSE;
+
+ /* WB10. Numeric × (ALetter | Hebrew_Letter) */
+ case WBcase(PL_WB_Numeric, PL_WB_ALetter):
+ case WBcase(PL_WB_Numeric, PL_WB_Hebrew_Letter):
+ return FALSE;
+
+ /* Do not break within sequences, such as “3.2” or “3,456.789”.
+ WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+ */
+ case WBcase(PL_WB_MidNum, PL_WB_Numeric):
+ case WBcase(PL_WB_MidNumLet, PL_WB_Numeric):
+ case WBcase(PL_WB_Single_Quote, PL_WB_Numeric):
+ return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ != PL_WB_Numeric;
+
+ /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+ * */
+ case WBcase(PL_WB_Numeric, PL_WB_MidNum):
+ case WBcase(PL_WB_Numeric, PL_WB_MidNumLet):
+ case WBcase(PL_WB_Numeric, PL_WB_Single_Quote):
+ return advance_one_WB(&after_pos, strend, utf8_target)
+ != PL_WB_Numeric;
+
+ /* Do not break between Katakana.
+ WB13. Katakana × Katakana */
+ case WBcase(PL_WB_Katakana, PL_WB_Katakana):
+ return FALSE;
+
+ /* Do not break from extenders.
+ WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
+ ExtendNumLet) × ExtendNumLet */
+ case WBcase(PL_WB_ALetter, PL_WB_ExtendNumLet):
+ case WBcase(PL_WB_Hebrew_Letter, PL_WB_ExtendNumLet):
+ case WBcase(PL_WB_Numeric, PL_WB_ExtendNumLet):
+ case WBcase(PL_WB_Katakana, PL_WB_ExtendNumLet):
+ case WBcase(PL_WB_ExtendNumLet, PL_WB_ExtendNumLet):
+ return FALSE;
+
+ /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
+ * | Katakana) */
+ case WBcase(PL_WB_ExtendNumLet, PL_WB_ALetter):
+ case WBcase(PL_WB_ExtendNumLet, PL_WB_Hebrew_Letter):
+ case WBcase(PL_WB_ExtendNumLet, PL_WB_Numeric):
+ case WBcase(PL_WB_ExtendNumLet, PL_WB_Katakana):
+ return FALSE;
+
+ /* Do not break between regional indicator symbols.
+ WB13c. Regional_Indicator × Regional_Indicator */
+ case WBcase(PL_WB_Regional_Indicator, PL_WB_Regional_Indicator):
+ return FALSE;
+
+ }
+
+ NOT_REACHED;
+}
+
+STATIC PL_WB_enum
+S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+    /* Moves *curpos forward, one character at a time, until it rests on a
+     * character whose Word_Break value is neither Extend nor Format, and
+     * returns that value.  Returns PL_WB_EDGE instead if *curpos is already
+     * at or past 'strend' on entry, or gets there while stepping. */
+
+    PL_WB_enum wb;
+
+    PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
+
+    if (*curpos >= strend) {
+        return PL_WB_EDGE;
+    }
+
+    for (;;) {
+
+        /* A character is a single byte unless the target is UTF-8 */
+        *curpos += (utf8_target) ? UTF8SKIP(*curpos) : 1;
+        if (*curpos >= strend) {
+            return PL_WB_EDGE;
+        }
+
+        if (utf8_target) {
+            wb = getWB_VAL_UTF8(*curpos, strend);
+        }
+        else {
+            wb = getWB_VAL_CP(**curpos);
+        }
+
+        /* Extend and Format are skipped over; anything else is the answer */
+        if (wb != PL_WB_Extend && wb != PL_WB_Format) {
+            return wb;
+        }
+    }
+}
+
+STATIC PL_WB_enum
+S_backup_one_WB(pTHX_ PL_WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+    /* On entry, *curpos is just to the right of some character; call that
+     * character C.  This returns the Word_Break value of the nearest
+     * character to the left of C that is neither Extend nor Format, or
+     * PL_WB_EDGE if 'strbeg' is reached first.
+     *
+     * If *previous is not PL_WB_UNKNOWN, it is taken to be the already known
+     * value of the character immediately to the left of C, and is used in
+     * place of a lookup.  It is reset to PL_WB_UNKNOWN once consumed, as it
+     * says nothing about characters further left.
+     *
+     * Whenever a step to the left is taken, *curpos is moved so that it is
+     * again just to the right of the character examined (or to 'strbeg' when
+     * that edge is hit), so that a subsequent call continues leftwards from
+     * there. */
+
+    PL_WB_enum wb;
+
+    PERL_ARGS_ASSERT_BACKUP_ONE_WB;
+
+    /* If we know what the previous character's break value is, don't have
+     * to look it up */
+    if (*previous != PL_WB_UNKNOWN) {
+        wb = *previous;
+        *previous = PL_WB_UNKNOWN;
+
+        /* Using the cached value is still a step of one character to the
+         * left, so the cursor has to move with it.  Otherwise a later call
+         * would start from the same place and return this same character
+         * again rather than the one before it */
+        if (utf8_target) {
+            U8 * const stepped = reghopmaybe3(*curpos, -1, strbeg);
+            *curpos = (stepped) ? stepped : (U8 *) strbeg;
+        }
+        else if (*curpos > strbeg) {
+            (*curpos)--;
+        }
+
+        /* But we always back up over these two types */
+        if (wb != PL_WB_Extend && wb != PL_WB_Format) {
+            return wb;
+        }
+    }
+
+    if (*curpos < strbeg) {
+        return PL_WB_EDGE;
+    }
+
+    if (utf8_target) {
+        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+        if (! prev_char_pos) {
+            return PL_WB_EDGE;
+        }
+
+        /* Back up over Extend and Format.  curpos is always just to the right
+         * of the character whose value we are getting */
+        do {
+            U8 * prev_prev_char_pos;
+            if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
+                                                   -1,
+                                                   strbeg)))
+            {
+                wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+                *curpos = prev_char_pos;
+                prev_char_pos = prev_prev_char_pos;
+            }
+            else {
+                *curpos = (U8 *) strbeg;
+                return PL_WB_EDGE;
+            }
+        } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+    }
+    else {
+        do {
+
+            /* Need two bytes to the left: the one being stepped over and the
+             * one whose value is wanted.  Comparing the distance avoids
+             * forming a pointer that lies before 'strbeg' */
+            if (*curpos - strbeg < 2) {
+                *curpos = (U8 *) strbeg;
+                return PL_WB_EDGE;
+            }
+            (*curpos)--;
+            wb = getWB_VAL_CP(*(*curpos - 1));
+        } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+    }
+
+    return wb;
+}
+
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
@@ -4936,6 +5292,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
(U8*) reginfo->strend));
}
break;
+ case WB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isWB(PL_WB_UNKNOWN,
+ getWB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getWB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
}
}
else { /* Not utf8 target */
@@ -4961,6 +5336,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
|| UCHARAT(locinput) != '\n';
}
break;
+ case WB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isWB(PL_WB_UNKNOWN,
+ getWB_VAL_CP(UCHARAT(locinput -1)),
+ getWB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
}
}