author     Marko Mäkelä <marko.makela@mariadb.com>  2020-11-25 09:40:12 +0200
committer  Marko Mäkelä <marko.makela@mariadb.com>  2020-11-25 09:40:12 +0200
commit     7a9405e3dc8741d50658e976b7e16c1807c2b9a7 (patch)
tree       c6ad0dc6cd195ffe0e0ae36de5ca6533acb6e8c0 /tpool
parent     1b12e251cdc692e4ce2dd130ea45e7b17a9ea5e1 (diff)
download   mariadb-git-7a9405e3dc8741d50658e976b7e16c1807c2b9a7.tar.gz
MDEV-24270 Misuse of io_getevents() causes wake-ups at least twice per second
In the asynchronous I/O interface, InnoDB is invoking io_getevents()
with a timeout of half a second, requesting exactly 1 event at a time.
The reason for such a short timeout is to facilitate shutdown.
We can do better: use an infinite timeout and wait for a larger maximum
number of events. On shutdown, we will invoke io_destroy(), which
should lead to the io_getevents system call reporting EINVAL.
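For illustration only (this sketch is not part of the patch; ctx stands
for the io_context_t), the old and new wait strategies differ as follows:

    io_event event;
    /* Old: wake up at least twice per second, even when no I/O is pending. */
    timespec half_second{0, 500000000};
    io_getevents(ctx, 1, 1, &event, &half_second);
    /* New: block indefinitely; io_destroy() from the shutdown path makes
       the blocked call fail with EINVAL, which ends the event loop. */
    my_getevents(ctx, 1, 1, &event);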
my_getevents(): Reimplement the libaio io_getevents() by only invoking
the system call. The library implementation would try to elide the
system call and return 0 immediately if aio_ring_is_empty() holds.
Here, we do want a blocking system call, not 100% CPU usage. Nor do we
want aio_ring_is_empty() to trigger SIGSEGV by dereferencing memory
that was freed by io_destroy().
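In isolation, and with the headers spelled out for self-containment (the
patch itself relies on the includes already present in aio_linux.cc), the
wrapper described above amounts to:

    #include <libaio.h>      /* io_context_t, io_event */
    #include <sys/syscall.h> /* __NR_io_getevents */
    #include <unistd.h>      /* syscall() */
    #include <cerrno>

    /* Call the kernel directly; a null (0) timeout blocks until min_nr
       events complete or the context is destroyed by io_destroy(). */
    static int my_getevents(io_context_t ctx, long min_nr, long nr,
                            io_event *ev)
    {
      int saved_errno= errno;
      int ret= syscall(__NR_io_getevents, reinterpret_cast<long>(ctx),
                       min_nr, nr, ev, 0);
      if (ret < 0)
      {
        ret= -errno;        /* report failures as -errno, like libaio */
        errno= saved_errno; /* leave errno unchanged for the caller */
      }
      return ret;
    }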
Diffstat (limited to 'tpool')
-rw-r--r--  tpool/aio_linux.cc | 161
1 file changed, 79 insertions(+), 82 deletions(-)
diff --git a/tpool/aio_linux.cc b/tpool/aio_linux.cc
index 24bc04c75ba..51b656a604b 100644
--- a/tpool/aio_linux.cc
+++ b/tpool/aio_linux.cc
@@ -1,4 +1,4 @@
-/* Copyright(C) 2019 MariaDB Corporation.
+/* Copyright (C) 2019, 2020, MariaDB Corporation.
 
 This program is free software; you can redistribute itand /or modify
 it under the terms of the GNU General Public License as published by
@@ -14,133 +14,133 @@ along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA*/
 
 #include "tpool_structs.h"
-
-#include <stdlib.h>
-#include <signal.h>
-#include <assert.h>
 #include "tpool.h"
-#include <thread>
+
 #ifdef LINUX_NATIVE_AIO
-#include <libaio.h>
+# include <thread>
+# include <atomic>
+# include <libaio.h>
+# include <sys/syscall.h>
+
+/** A simpler alternative to io_getevents(), without
+aio_ring_is_empty() that may trigger SIGSEGV */
+static int my_getevents(io_context_t ctx, long min_nr, long nr, io_event *ev)
+{
+  int saved_errno= errno;
+  int ret= syscall(__NR_io_getevents, reinterpret_cast<long>(ctx),
+                   min_nr, nr, ev, 0);
+  if (ret < 0)
+  {
+    ret= -errno;
+    errno= saved_errno;
+  }
+  return ret;
+}
 #endif
 
 /*
   Linux AIO implementation, based on native AIO.
   Needs libaio.h and -laio at the compile time.
 
-  submit_io() is used to submit async IO.
+  io_submit() is used to submit async IO.
 
-  There is a single thread, that collects the completion notification
-  with io_getevent(), and forwards io completion callback
+  A single thread will collect the completion notification
+  with io_getevents() and forward io completion callback
   to the worker threadpool.
 */
 namespace tpool
 {
 #ifdef LINUX_NATIVE_AIO
 
-class aio_linux : public aio
+class aio_linux final : public aio
 {
-  thread_pool* m_pool;
+  thread_pool *m_pool;
   io_context_t m_io_ctx;
-  bool m_in_shutdown;
   std::thread m_getevent_thread;
+  static std::atomic<bool> shutdown_in_progress;
 
-  static void getevent_thread_routine(aio_linux* aio)
+  static void getevent_thread_routine(aio_linux *aio)
   {
+    io_event events[1];
     for (;;)
     {
-      io_event event;
-      struct timespec ts{0, 500000000};
-      int ret = io_getevents(aio->m_io_ctx, 1, 1, &event, &ts);
-
-      if (aio->m_in_shutdown)
-        break;
-
-      if (ret > 0)
-      {
-        aiocb* iocb = (aiocb*)event.obj;
-        long long res = event.res;
-        if (res < 0)
+      switch (int ret= my_getevents(aio->m_io_ctx, 1, 1, events))
       {
+      case -EINTR:
+      case 0:
+        continue;
+      case -EINVAL:
+        if (shutdown_in_progress)
+          return;
+        /* fall through */
+      default:
+        if (ret != 1)
         {
-          iocb->m_err = static_cast<int>(-res);
-          iocb->m_ret_len = 0;
+          fprintf(stderr, "io_getevents returned %d\n", ret);
+          abort();
+          return;
         }
         else
         {
-          iocb->m_ret_len = ret;
-          iocb->m_err = 0;
+          const io_event &event= events[0];
+          aiocb *iocb= static_cast<aiocb*>(event.obj);
+          if (static_cast<int>(event.res) < 0)
+          {
+            iocb->m_err= -event.res;
+            iocb->m_ret_len= 0;
+          }
+          else
+          {
+            iocb->m_ret_len= event.res;
+            iocb->m_err= 0;
+          }
+          iocb->m_internal_task.m_func= iocb->m_callback;
+          iocb->m_internal_task.m_arg= iocb;
+          iocb->m_internal_task.m_group= iocb->m_group;
+          aio->m_pool->submit_task(&iocb->m_internal_task);
        }
-
-        iocb->m_internal_task.m_func = iocb->m_callback;
-        iocb->m_internal_task.m_arg = iocb;
-        iocb->m_internal_task.m_group = iocb->m_group;
-        aio->m_pool->submit_task(&iocb->m_internal_task);
-        continue;
-      }
-      switch (ret)
-      {
-      case -EAGAIN:
-        usleep(1000);
-        continue;
-      case -EINTR:
-      case 0:
-        continue;
-      default:
-        fprintf(stderr, "io_getevents returned %d\n", ret);
-        abort();
       }
     }
   }
 
 public:
-  aio_linux(io_context_t ctx, thread_pool* pool)
+  aio_linux(io_context_t ctx, thread_pool *pool)
     : m_pool(pool), m_io_ctx(ctx),
-      m_in_shutdown(), m_getevent_thread(getevent_thread_routine, this)
+      m_getevent_thread(getevent_thread_routine, this)
   {
   }
 
   ~aio_linux()
   {
-    m_in_shutdown = true;
-    m_getevent_thread.join();
+    shutdown_in_progress= true;
     io_destroy(m_io_ctx);
+    m_getevent_thread.join();
+    shutdown_in_progress= false;
   }
 
-  // Inherited via aio
-  virtual int submit_io(aiocb* cb) override
+  int submit_io(aiocb *cb) override
   {
-
-    if (cb->m_opcode == aio_opcode::AIO_PREAD)
-      io_prep_pread((iocb *)cb, cb->m_fh, cb->m_buffer, cb->m_len,
-                    cb->m_offset);
-    else
-      io_prep_pwrite((iocb *)cb, cb->m_fh, cb->m_buffer, cb->m_len,
-                     cb->m_offset);
-
-    int ret;
-    ret = io_submit(m_io_ctx, 1, (iocb * *)& cb);
+    io_prep_pread(static_cast<iocb*>(cb), cb->m_fh, cb->m_buffer, cb->m_len,
+                  cb->m_offset);
+    if (cb->m_opcode != aio_opcode::AIO_PREAD)
+      cb->aio_lio_opcode= IO_CMD_PWRITE;
+    iocb *icb= static_cast<iocb*>(cb);
+    int ret= io_submit(m_io_ctx, 1, &icb);
     if (ret == 1)
       return 0;
-    errno = -ret;
+    errno= -ret;
     return -1;
   }
 
-  // Inherited via aio
-  virtual int bind(native_file_handle& fd) override
-  {
-    return 0;
-  }
-  virtual int unbind(const native_file_handle& fd) override
-  {
-    return 0;
-  }
+  int bind(native_file_handle&) override { return 0; }
+  int unbind(const native_file_handle&) override { return 0; }
 };
 
-aio* create_linux_aio(thread_pool* pool, int max_io)
+std::atomic<bool> aio_linux::shutdown_in_progress;
+
+aio *create_linux_aio(thread_pool *pool, int max_io)
 {
   io_context_t ctx;
-  memset(&ctx, 0, sizeof(ctx));
-  int ret = io_setup(max_io, &ctx);
-  if (ret)
+  memset(&ctx, 0, sizeof ctx);
+  if (int ret= io_setup(max_io, &ctx))
   {
     fprintf(stderr, "io_setup(%d) returned %d\n", max_io, ret);
     return nullptr;
@@ -148,9 +148,6 @@ aio* create_linux_aio(thread_pool* pool, int max_io)
   return new aio_linux(ctx, pool);
 }
 #else
-aio* create_linux_aio(thread_pool* pool, int max_aio)
-{
-  return nullptr;
-}
+aio *create_linux_aio(thread_pool*, int) { return nullptr; }
 #endif
 }