From 9e740a9abbe72f49bef026c85464f68e0e46757c Mon Sep 17 00:00:00 2001 From: dormando Date: Sun, 5 Mar 2023 15:47:45 -0800 Subject: proxy: reduce noise for dead backends Only log backend errors until the backend is marked bad. Previously the "marked bad" counter was also being ticked on every retry attempt. The fix turned out to be trivial. --- proxy_network.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/proxy_network.c b/proxy_network.c index e18516f..45ae98b 100644 --- a/proxy_network.c +++ b/proxy_network.c @@ -1245,26 +1245,20 @@ static void proxy_backend_retry_handler(const int fd, const short which, void *a _set_main_event(be, be->event_thread->base, EV_WRITE, &tmp_time, proxy_beconn_handler); } -// currently just for timeouts, but certain errors should consider a backend -// to be "bad" as well. // must be called after _reset_bad_backend(), so the backend is currently // clear. -// TODO (v2): currently only notes for "bad backends" in cases of timeouts or -// connect failures. We need a specific connect() handler that executes a -// "version" call to at least check that the backend isn't speaking garbage. -// In theory backends can fail such that responses are constantly garbage, -// but it's more likely an app is doing something bad and culling the backend -// may prevent any other clients from talking to that backend. In -// that case we need to track if clients are causing errors consistently and -// block them instead. 
That's more challenging so leaving a note instead -// of doing this now :) +// TODO (v2): extra counter for "backend connect tries" so it's still possible +// to see dead backends exist static void _backend_failed(mcp_backend_t *be) { struct timeval tmp_time = be->tunables.retry; if (++be->failed_count > be->tunables.backend_failure_limit) { - P_DEBUG("%s: marking backend as bad\n", __func__); + if (!be->bad) { + P_DEBUG("%s: marking backend as bad\n", __func__); + STAT_INCR(be->event_thread->ctx, backend_marked_bad, 1); + LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, "markedbad", be->name, be->port, 0, NULL, 0); + } be->bad = true; _set_main_event(be, be->event_thread->base, EV_TIMEOUT, &tmp_time, proxy_backend_retry_handler); - STAT_INCR(be->event_thread->ctx, backend_marked_bad, 1); } else { STAT_INCR(be->event_thread->ctx, backend_failed, 1); _backend_reconnect(be); @@ -1298,7 +1292,10 @@ static int _reset_bad_backend(mcp_backend_t *be, enum proxy_be_failures err) { STAILQ_INIT(&be->io_head); be->io_next = NULL; // also reset the write offset. - LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, proxy_be_failure_text[err], be->name, be->port, depth, be->rbuf, be->rbufused); + // Only log if we don't already know it's messed up. + if (!be->bad) { + LOGGER_LOG(NULL, LOG_PROXYEVENTS, LOGGER_PROXY_BE_ERROR, NULL, proxy_be_failure_text[err], be->name, be->port, depth, be->rbuf, be->rbufused); + } // reset buffer to blank state. be->rbufused = 0; -- cgit v1.2.1