summaryrefslogtreecommitdiff
path: root/src/src/regex_cache.c
blob: 63cddce1db91449c32cfe449286527dc4434d2ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/*************************************************
*     Exim - an Internet mail transport agent    *
*************************************************/

/*
 * Copyright (c) The Exim Maintainers 2022
 * License: GPL
 */

/* Caching layers for compiled REs.  There is a local layer in the process,
implemented as a tree for inserts and lookup.  This cache is inherited from
the daemon, for the process tree deriving from there - but not by re-exec'd
processes or command-line submission processes.

If the process has to compile, and is not the daemon or a re-exec'd exim,
it notifies the use of the RE to the daemon via a unix-domain socket.
This is a fire-and-forget send with no response, hence cheap from the point of
view of the sender.  I have not measured the overall comms costs.  The
daemon also compiles the RE, and caches the result.

A second layer would be possible by asking the daemon via the notifier socket
(for a result from its cache, or a compile if it must).  The comms overhead
is significant, not only for the channel but also for de/serialisation of
the compiled object.  This makes it untenable for the primary use-case, the
transport process which has been re-exec'd to gain privs - and therefore does not
have the daemon-maintained cache.  Using shared-memory might reduce that cost
(the attach time for the memory segment will matter); the implementation
would require suitable R/W locks.
*/

#include "exim.h"

/* Notification message describing one RE.  It is built by regex_to_daemon(),
sent as a single datagram, and interpreted by regex_at_daemon().  The
structure is over-allocated so that the whole NUL-terminated RE text follows
the fixed fields. */

typedef struct re_req {
  uschar	notifier_reqtype;	/* set to NOTIFY_REGEX by the sender */
  BOOL		caseless;		/* TRUE if the RE is to be compiled caseless */
  uschar	re[1];		/* RE source text, NUL-terminated; extends past the
				declared size into the extra space allocated */
} re_req;

/* Roots of the process-local caches of compiled REs.  Nodes are keyed by the
RE source text and carry the compiled object in data.ptr.  There is one tree
per case-sensitivity variant, selected by the caseless flag. */

static tree_node * regex_cache = NULL;
static tree_node * regex_caseless_cache = NULL;

/* Once regex_cachesize reaches this value, regex_at_daemon() stops compiling
and caching REs that are notified to it. */

#define REGEX_CACHESIZE_LIMIT 1000

/******************************************************************************/

/* Tell the daemon about an RE this process had to compile, so that the daemon
can add it to its own cache.  The message is a single datagram on a unix-domain
socket; no reply is read, and failures are only reported in debug output.

Arguments:
  key		the RE source text
  caseless	TRUE if the RE was compiled caseless

Returns:	nothing
*/

static void
regex_to_daemon(const uschar * key, BOOL caseless)
{
const int keysize = Ustrlen(key) + 1;		/* text plus terminating NUL */
const int reqsize = sizeof(re_req) + keysize;
const int saved_pool = store_pool;
re_req * msg;
int sock;

DEBUG(D_expand|D_lists)
  debug_printf_indent("sending RE '%s' to daemon\n", key);

/* The message buffer is taken from the main pool, whatever pool the caller
had selected. */

store_pool = POOL_MAIN;
msg = store_get(reqsize, key);	/* maybe need a size limit */
store_pool = saved_pool;

msg->notifier_reqtype = NOTIFY_REGEX;
msg->caseless = caseless;
memcpy(msg->re, key, keysize);

sock = socket(AF_UNIX, SOCK_DGRAM, 0);
if (sock < 0)
  { DEBUG(D_queue_run) debug_printf(" socket: %s\n", strerror(errno)); }
else
  {
  struct sockaddr_un sa_un = {.sun_family = AF_UNIX};
  ssize_t salen = daemon_notifier_sockname(&sa_un);
  ssize_t rc = sendto(sock, msg, reqsize, 0,
		      (struct sockaddr *)&sa_un, (socklen_t)salen);

  if (rc < 0)
    DEBUG(D_queue_run)
      debug_printf("%s: sendto %s\n", __FUNCTION__, strerror(errno));
  close(sock);
  }
}


/* Look up a compiled RE in the process-local cache.

Arguments:
  key		the RE source text, used as the tree key
  caseless	TRUE to search the caseless tree, FALSE for the caseful one

Returns:	the cached compiled RE, or NULL if there is no entry
*/

static const pcre2_code *
regex_from_cache(const uschar * key, BOOL caseless)
{
tree_node * root = caseless ? regex_caseless_cache : regex_cache;
tree_node * hit = tree_search(root, key);

DEBUG(D_expand|D_lists)
  debug_printf_indent("compiled %sRE '%s' %sfound in local cache\n",
		      caseless ? "caseless " : "", key, hit ? "" : "not ");

if (!hit)
  return NULL;
return hit->data.ptr;
}


/* Record a freshly compiled RE in the process-local cache and, when this
process is a daemon descendant that is not itself the listening daemon, pass
the RE on to the daemon.

The tree node comes from store_get() in the currently selected pool; the
callers in this file select POOL_PERM before calling.

Arguments:
  key		the RE source text; copied into the node as its name
  caseless	TRUE to insert into the caseless tree, FALSE for the caseful one
  cre		the compiled RE to remember

Returns:	nothing
*/

static void
regex_to_cache(const uschar * key, BOOL caseless, const pcre2_code * cre)
{
tree_node * node;

/* We are called with POOL_PERM selected.  The key is also handed to
store_get() as its second argument - presumably so the node inherits the
key's taint status; confirm against the store_get() definition. */

node = store_get(sizeof(tree_node) + Ustrlen(key) + 1, key);
Ustrcpy(node->name, key);
node->data.ptr = (void *)cre;	/* the tree stores a plain void pointer */

/* The braces are required: DEBUG() expands to an if, so an unbraced use here
would capture the else. */

if (!tree_insertnode(caseless ? &regex_caseless_cache : &regex_cache, node))
  { DEBUG(D_expand|D_lists) debug_printf_indent("duplicate key!\n"); }
else DEBUG(D_expand|D_lists)
  debug_printf_indent("compiled RE '%s' saved in local cache\n", key);

/* Additionally, if not re-execed and not the daemon, tell the daemon of the RE
so it can add to the cache */

if (f.daemon_scion && !f.daemon_listen)
  regex_to_daemon(key, caseless);
}

/******************************************************************************/

/*************************************************
*  Compile regular expression and panic on fail  *
*************************************************/

/* Compile an RE for callers that cannot proceed without it: if pcre2_compile()
fails, the error is logged with LOG_PANIC_DIE.  Callers that want an error
string instead can use regex_compile().

Compilation is done with POOL_PERM selected, and the caller's pool is restored
before returning.  The use_malloc flag chooses pcre_mlc_cmp_ctx rather than
pcre_gen_cmp_ctx as the pcre2 compile context.  When the cacheable flag is set
and the pattern is already in the local cache, the cached object is returned
without compiling, regardless of use_malloc.

Arguments:
  pattern     the pattern to compile
  flags
   caseless    caseless matching is required
   cacheable   use (writeback) cache
  use_malloc  TRUE if compile into malloc store

Returns:      pointer to the compiled pattern
*/

const pcre2_code *
regex_must_compile(const uschar * pattern, mcs_flags flags, BOOL use_malloc)
{
const BOOL caseless = (flags & MCS_CASELESS) != 0;
const BOOL cacheable = (flags & MCS_CACHEABLE) != 0;
const int saved_pool = store_pool;
const pcre2_code * cre;
size_t erroffset;
int errcode;

/* Optionally, check the cache and return if found */

if (cacheable)
  {
  cre = regex_from_cache(pattern, caseless);
  if (cre)
    return cre;
  }

store_pool = POOL_PERM;

cre = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
  caseless ? PCRE_COPT|PCRE2_CASELESS : PCRE_COPT,
  &errcode, &erroffset, use_malloc ? pcre_mlc_cmp_ctx : pcre_gen_cmp_ctx);

if (!cre)
  {
  uschar errbuf[128];
  pcre2_get_error_message(errcode, errbuf, sizeof(errbuf));
  log_write(0, LOG_MAIN|LOG_PANIC_DIE, "regular expression error: "
    "%s at offset %ld while compiling %s", errbuf, (long)erroffset, pattern);
  }

if (cacheable)
  regex_to_cache(pattern, caseless, cre);

store_pool = saved_pool;
return cre;
}




/* Wrapper for pcre2_compile() with cache lookup and error-message handling.
The compile runs with POOL_PERM selected; the caller's pool is put back before
any error string is built and before returning.

Arguments:	pattern		regex to compile
		flags
		 caseless	flag for match variant
		 cacheable	use (writeback) cache
		errstr		on error, filled in with error message
		cctx		compile-context for pcre2

Return:		NULL on error, with errstr set. Otherwise, the compiled RE object
*/

const pcre2_code *
regex_compile(const uschar * pattern, mcs_flags flags, uschar ** errstr,
  pcre2_compile_context * cctx)
{
const BOOL caseless = (flags & MCS_CASELESS) != 0;
const BOOL cacheable = (flags & MCS_CACHEABLE) != 0;
const int saved_pool = store_pool;
const pcre2_code * cre;
PCRE2_SIZE erroffset;
int errcode;

/* Optionally, check the cache and return if found */

if (cacheable)
  {
  cre = regex_from_cache(pattern, caseless);
  if (cre)
    return cre;
  }

DEBUG(D_expand|D_lists) debug_printf_indent("compiling %sRE '%s'\n",
				caseless ? "caseless " : "", pattern);

store_pool = POOL_PERM;
cre = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED,
		caseless ? PCRE_COPT|PCRE2_CASELESS : PCRE_COPT,
		&errcode, &erroffset, cctx);

if (cre)
  {
  if (cacheable)
    regex_to_cache(pattern, caseless, cre);
  store_pool = saved_pool;
  }
else
  {
  uschar errbuf[128];
  pcre2_get_error_message(errcode, errbuf, sizeof(errbuf));

  /* Go back to the caller's pool first, so the message is built there */
  store_pool = saved_pool;
  *errstr = string_sprintf("regular expression error in "
	    "\"%s\": %s at offset %ld", pattern, errbuf, (long)erroffset);
  }

return cre;
}



/* Handle a regex notify arriving at the daemon.  We get sent the original RE;
compile it (again) and write to the cache.  Later forked procs will be able to
read from the cache, unless they re-execed.  Therefore, those latter never bother
sending us a notification.

Once regex_cachesize has reached REGEX_CACHESIZE_LIMIT no further REs are
compiled.  A refusal or a compile failure is reported in debug output only.

Argument:	reqbuf		the received message, laid out as a re_req

Returns:	nothing
*/

void
regex_at_daemon(const uschar * reqbuf)
{
const re_req * req = (const re_req *)reqbuf;
uschar * errstr = NULL;

/* This must start out NULL.  When the size limit is hit no compile is done, so
nothing else assigns it before the test in the debug output below. */

const pcre2_code * cre = NULL;

/* NOTE(review): regex_compile() also returns non-NULL for an RE that was
already in the cache, so the counter is bumped for repeat notifications too;
it counts accepted notifications rather than distinct cache entries. */

if (regex_cachesize >= REGEX_CACHESIZE_LIMIT)
  errstr = US"regex cache size limit reached";
else if ((cre = regex_compile(req->re,
	    req->caseless ? MCS_CASELESS | MCS_CACHEABLE : MCS_CACHEABLE,
	    &errstr, pcre_gen_cmp_ctx)))
  regex_cachesize++;

DEBUG(D_any) if (!cre) debug_printf("%s\n", errstr);
}