summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-05-27 01:08:46 -0600
committerKarl Williamson <public@khwilliamson.com>2012-08-02 09:24:51 -0600
commit4924e08c35622f31867cf9e248a2d7aa96e3eace (patch)
tree39a5cfa58ab45d2721945d335e364b4aaef9d212 /regcomp.c
parent3d26503b21699b90df97d95c59c0078178969d7d (diff)
downloadperl-4924e08c35622f31867cf9e248a2d7aa96e3eace.tar.gz
regcomp.c: Trade stack space for time
Pass 1 of regular expression compilation merely calculates the size it will need. (Note that Yves and I both think this is very suboptimal behavior.) Nothing is written out during this pass, but sizes are just incremented. The code in regcomp.c all knows this, and skips writing things in pass 1. However, when folding, code in other files is called which doesn't have this size-only mode, and always writes its results out. Currently, regcomp handles this by passing to that code a temporary buffer allocated for the purpose. In pass1, the result is simply ignored; in pass2, the results are copied to the correct final destination. We can avoid that copy by making the temporary buffer large enough to hold the whole node, and in pass1, use it instead of the node. The non-regcomp code writes to the same relative spot in the buffer that it will use for the real node. In pass2 the real destination is used, and the fold gets written directly to the correct spot. Note that this increases the size pushed onto the stack, but code is ripped out as well. However, the main reason I'm doing this is not this speed-up; it is because it is needed by future commits to fix a bug.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c50
1 files changed, 27 insertions, 23 deletions
diff --git a/regcomp.c b/regcomp.c
index 9c70872663..a03c7d29e7 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10397,8 +10397,8 @@ tryagain:
register char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
+ char foldbuf[MAX_NODE_STRING_SIZE];
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
U8 node_type;
bool next_is_quantifier;
@@ -10409,7 +10409,10 @@ tryagain:
ender = 0;
node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
- s = STRING(ret);
+
+ /* In pass1, folded, we use a temporary buffer instead of the
+ * actual node, as the node doesn't exist yet */
+ s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
@@ -10650,7 +10653,9 @@ tryagain:
goto loopdone;
}
- if ((UTF && FOLD) || is_exactfu_sharp_s) {
+ if (FOLD) {
+ if (UTF || is_exactfu_sharp_s) {
+
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
@@ -10658,16 +10663,16 @@ tryagain:
* update join_exact() */
if (LOC && ender < 256) {
if (UNI_IS_INVARIANT(ender)) {
- *tmpbuf = (U8) ender;
+ *s = (U8) ender;
foldlen = 1;
} else {
- *tmpbuf = UTF8_TWO_BYTE_HI(ender);
- *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
foldlen = 2;
}
}
else {
- ender = _to_uni_fold_flags(ender, tmpbuf, &foldlen,
+ ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
FOLD_FLAGS_FULL
| ((LOC) ? FOLD_FLAGS_LOCALE
: (ASCII_FOLD_RESTRICTED)
@@ -10675,29 +10680,28 @@ tryagain:
: 0)
);
}
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and the one just below for UTF) through it add
+ * a single byte to the EXACTish node. But this one
+ * has changed len to be the correct final value, so
+ * subtract one to cancel out the increment that
+ * follows */
+ len += foldlen - 1;
}
- if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- if (! SIZE_ONLY) {
- /* Emit all the Unicode characters. */
- Copy(tmpbuf, s, foldlen, char);
- }
- len += foldlen;
- s += foldlen;
- }
- else {
+ else {
+ REGC((char)ender, s++);
+ }
+ }
+ else if (UTF) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
if (unilen > 0) {
s += unilen;
len += unilen;
}
- }
- /* The loop increments <len> each time, as all but this
- * path through it add a single byte to the EXACTish node.
- * But this one has changed len to be the correct final
- * value, so subtract one to cancel out the increment that
- * follows */
+ /* See comment just above for - 1 */
len--;
}
else {