summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Eugene Syromyatnikov <evgsyr@gmail.com> 2018-08-08 21:41:39 +0200
committer Dmitry V. Levin <ldv@altlinux.org> 2018-08-19 10:26:18 +0000
commit 82f7fea5220992e1538200564cf7021bb3f39b5b (patch)
tree 4b21af9f65f88357550a4ff00690d0b71a86be53
parent 49dbd3792d19fb0450150412849e8447942db269 (diff)
download strace-82f7fea5220992e1538200564cf7021bb3f39b5b.tar.gz
Implement queueing of threads before dispatching them
It is possible that some tracees call a lot of cheap syscalls too fast, and that can lead to starvation, to the point that some tracees are not served for an indefinite amount of time. In order to solve that unfairness, try to collect all the pending tracees first (along with the relevant information) and only then dispatch the events. * defs.h: Include "list.h". (struct tcb): Add wait_data_idx, wait_extra_data_idx, wait_list fields. * strace.c (tcb_wait_tab): New variable. (expand_tcbtab): Resize tcb_wait_tab along with tcbtab, provide an additional slot for extra event. (droptcb): Remove tcp from wait_list. (maybe_switch_tcbs): Get old pid from tcb_wait_tab[tcp->wait_data_idx].msg. (next_event): Add pending_tcps, extra_tcp, wait_nohang, elem, and wait_tab_pos variables; check for elements in pending_tcps and skip waiting if the list is not empty; check for extra_tcp and skip waiting along with swapping wait_data_idx with wait_extra_data_idx; after the initial wait, call wait4() in loop with WNOHANG flag set; fetch siginfo on signal and eventmsg on PTRACE_EVENT_EXEC; return the first tcp in pending_tcps list. * tests/Makefile.am (XFAIL_TESTS): Remove looping_threads.test. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=478419 Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=526740 Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=851457 Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1609318 Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1610774 Co-Authored-by: Dmitry Vlasenko <dvlasenk@redhat.com> Co-Authored-by: Andreas Schwab <aschwab@redhat.com> Co-Authored-by: Jeff Law <law@redhat.com> Co-Authored-by: DJ Delorie <dj@redhat.com>
-rw-r--r--defs.h11
-rw-r--r--strace.c309
-rw-r--r--tests/Makefile.am3
3 files changed, 207 insertions, 116 deletions
diff --git a/defs.h b/defs.h
index 2c19dd3a4..2eef00fc6 100644
--- a/defs.h
+++ b/defs.h
@@ -57,6 +57,7 @@
#include "error_prints.h"
#include "gcc_compat.h"
#include "kernel_types.h"
+#include "list.h"
#include "macros.h"
#include "mpers_type.h"
#include "string_to_uint.h"
@@ -236,6 +237,16 @@ struct tcb {
struct mmap_cache_t *mmap_cache;
+ /*
+ * Data that is stored during process wait traversal.
+ * We use indices as the actual data is stored in an array
+ * that is realloc'ed at runtime.
+ */
+ size_t wait_data_idx;
+ size_t wait_extra_data_idx;
+ struct list_item wait_list;
+
+
#ifdef HAVE_LINUX_KVM_H
struct vcpu_info *vcpu_info_list;
#endif
diff --git a/strace.c b/strace.c
index 6d70d20c1..2823eb412 100644
--- a/strace.c
+++ b/strace.c
@@ -161,10 +161,17 @@ static struct tcb *current_tcp;
struct tcb_wait_data {
enum trace_event te; /**< Event passed to dispatch_event() */
int status; /**< status, returned by wait4() */
+ unsigned long msg; /**< Value returned by PTRACE_GETEVENTMSG */
siginfo_t si; /**< siginfo, returned by PTRACE_GETSIGINFO */
};
static struct tcb **tcbtab;
+/*
+ * Since the queueing of tracees stops as soon as wait4(WNOHANG) reports no
+ * more events, or a second event arrives for an already queued tracee,
+ * tcb_wait_tab size shouldn't exceed tcbtabsize + 1.
+ */
+static struct tcb_wait_data *tcb_wait_tab;
static unsigned int nprocs;
static size_t tcbtabsize;
@@ -750,6 +757,9 @@ expand_tcbtab(void)
for (tcb_ptr = tcbtab + old_tcbtabsize;
tcb_ptr < tcbtab + tcbtabsize; tcb_ptr++, newtcbs++)
*tcb_ptr = newtcbs;
+
+ tcb_wait_tab = xreallocarray(tcb_wait_tab, sizeof(*tcb_wait_tab),
+ tcbtabsize + 1);
}
static struct tcb *
@@ -769,6 +779,7 @@ alloctcb(int pid)
#if SUPPORTED_PERSONALITIES > 1
tcp->currpers = current_personality;
#endif
+ tcp->wait_extra_data_idx = (size_t) -1LLU;
nprocs++;
debug_msg("new tcb for pid %d, active tcbs:%d",
tcp->pid, nprocs);
@@ -853,6 +864,8 @@ droptcb(struct tcb *tcp)
if (printing_tcp == tcp)
printing_tcp = NULL;
+ list_remove(&tcp->wait_list);
+
memset(tcp, 0, sizeof(*tcp));
}
@@ -2071,10 +2084,8 @@ maybe_switch_tcbs(struct tcb *tcp, const int pid)
{
FILE *fp;
struct tcb *execve_thread;
- long old_pid = 0;
+ long old_pid = tcb_wait_tab[tcp->wait_data_idx].msg;
- if (ptrace(PTRACE_GETEVENTMSG, pid, NULL, &old_pid) < 0)
- return tcp;
/* Avoid truncation in pid2tcb() param passing */
if (old_pid <= 0 || old_pid == pid)
return tcp;
@@ -2235,17 +2246,38 @@ print_event_exit(struct tcb *tcp)
static const struct tcb_wait_data *
next_event(void)
{
- static struct tcb_wait_data wait_data;
+ static EMPTY_LIST(pending_tcps);
+ static struct tcb *extra_tcp;
int pid;
int status;
- struct tcb *tcp;
- struct tcb_wait_data *wd = &wait_data;
+ int wait_errno;
+ bool wait_nohang = false;
+ struct list_item *elem;
+ struct tcb *tcp = NULL;
+ size_t wait_tab_pos = 0;
struct rusage ru;
if (interrupted)
return NULL;
+ if (!list_is_empty(&pending_tcps))
+ goto next_event_get_tcp;
+
+ if (extra_tcp) {
+ tcp = extra_tcp;
+ extra_tcp = NULL;
+
+ if (tcp->wait_extra_data_idx != (size_t) -1LLU) {
+ tcp->wait_data_idx = tcp->wait_extra_data_idx;
+ tcp->wait_extra_data_idx = (size_t) -1LLU;
+
+ debug_msg("dequeued extra event from pid %u", tcp->pid);
+
+ goto next_event_exit;
+ }
+ }
+
/*
* Used to exit simply when nprocs hits zero, but in this testcase:
* int main(void) { _exit(!!fork()); }
@@ -2288,7 +2320,7 @@ next_event(void)
* the expiration will be handled by the signal handler.
*/
pid = wait4(-1, &status, __WALL, (cflag ? &ru : NULL));
- const int wait_errno = errno;
+ wait_errno = errno;
/*
* The window of opportunity to handle expirations
@@ -2304,135 +2336,184 @@ next_event(void)
return NULL;
}
- if (pid < 0) {
- if (wait_errno == EINTR) {
- wd->te = TE_NEXT;
- return wd;
+ while (true) {
+ struct tcb_wait_data *wd;
+
+ if (pid < 0) {
+ if (wait_errno == EINTR)
+ break;
+ if (wait_nohang)
+ break;
+ if (nprocs == 0 && wait_errno == ECHILD)
+ return NULL;
+ /*
+ * If nprocs > 0, ECHILD is not expected,
+ * treat it as any other error here:
+ */
+ errno = wait_errno;
+ perror_msg_and_die("wait4(__WALL)");
}
- if (nprocs == 0 && wait_errno == ECHILD)
- return NULL;
- /*
- * If nprocs > 0, ECHILD is not expected,
- * treat it as any other error here:
- */
- errno = wait_errno;
- perror_msg_and_die("wait4(__WALL)");
- }
- wd->status = status;
+ if (!pid)
+ break;
- if (pid == popen_pid) {
- if (!WIFSTOPPED(status))
- popen_pid = 0;
- wd->te = TE_NEXT;
- return wd;
- }
+ if (pid == popen_pid) {
+ if (!WIFSTOPPED(status))
+ popen_pid = 0;
+ break;
+ }
- if (debug_flag)
- print_debug_info(pid, status);
+ if (debug_flag)
+ print_debug_info(pid, status);
- /* Look up 'pid' in our table. */
- tcp = pid2tcb(pid);
+ /* Look up 'pid' in our table. */
+ tcp = pid2tcb(pid);
- if (!tcp) {
- tcp = maybe_allocate_tcb(pid, status);
if (!tcp) {
- wd->te = TE_NEXT;
- return wd;
+ tcp = maybe_allocate_tcb(pid, status);
+ if (!tcp)
+ break;
}
- }
- clear_regs(tcp);
+ if (cflag) {
+ struct timespec stime = {
+ .tv_sec = ru.ru_stime.tv_sec,
+ .tv_nsec = ru.ru_stime.tv_usec * 1000
+ };
+ ts_sub(&tcp->dtime, &stime, &tcp->stime);
+ tcp->stime = stime;
+ }
- /* Set current output file */
- set_current_tcp(tcp);
+ if (wait_tab_pos > tcbtabsize)
+ error_func_msg_and_die("Wait data storage overflow "
+ "(wait_tab_pos %zu, nprocs %u, "
+ "tcbtabsize %zu)", wait_tab_pos,
+ nprocs, tcbtabsize);
- if (cflag) {
- struct timespec stime = {
- .tv_sec = ru.ru_stime.tv_sec,
- .tv_nsec = ru.ru_stime.tv_usec * 1000
- };
- ts_sub(&tcp->dtime, &stime, &tcp->stime);
- tcp->stime = stime;
- }
+ wd = tcb_wait_tab + wait_tab_pos;
+ memset(wd, 0, sizeof(*wd));
- if (WIFSIGNALED(status)) {
- wd->te = TE_SIGNALLED;
- return wd;
- }
+ if (tcp->wait_list.next) {
+ tcp->wait_extra_data_idx = wait_tab_pos;
+ extra_tcp = tcp;
+ } else {
+ tcp->wait_data_idx = wait_tab_pos;
+ list_append(&pending_tcps, &tcp->wait_list);
+ }
+
+ wait_tab_pos++;
+ wd->status = status;
+
+ if (WIFSIGNALED(status)) {
+ wd->te = TE_SIGNALLED;
+ } else if (WIFEXITED(status)) {
+ wd->te = TE_EXITED;
+ } else {
+ /*
+ * As WCONTINUED flag has not been specified to wait4,
+ * it cannot be WIFCONTINUED(status), so the only case
+ * that remains is WIFSTOPPED(status).
+ */
+
+ const unsigned int sig = WSTOPSIG(status);
+ const unsigned int event = (unsigned int) status >> 16;
- if (WIFEXITED(status)) {
- wd->te = TE_EXITED;
- return wd;
+ switch (event) {
+ case 0:
+ /*
+ * Is this post-attach SIGSTOP?
+ * Interestingly, the process may stop
+ * with STOPSIG equal to some other signal
+ * than SIGSTOP if we happened to attach
+ * just before the process takes a signal.
+ */
+ if (sig == SIGSTOP &&
+ (tcp->flags & TCB_IGNORE_ONE_SIGSTOP)) {
+ debug_func_msg("ignored SIGSTOP on "
+ "pid %d", tcp->pid);
+ tcp->flags &= ~TCB_IGNORE_ONE_SIGSTOP;
+ wd->te = TE_RESTART;
+ } else if (sig == syscall_trap_sig) {
+ wd->te = TE_SYSCALL_STOP;
+ } else {
+ /*
+ * True if tracee is stopped by signal
+ * (as opposed to "tracee received
+ * signal").
+ * TODO: shouldn't we check for
+ * errno == EINVAL too?
+ * We can get ESRCH instead, you know...
+ */
+ bool stopped = ptrace(PTRACE_GETSIGINFO,
+ pid, 0, &wd->si) < 0;
+
+ wd->te = stopped ? TE_GROUP_STOP
+ : TE_SIGNAL_DELIVERY_STOP;
+ }
+ break;
+ case PTRACE_EVENT_STOP:
+ /*
+ * PTRACE_INTERRUPT-stop or group-stop.
+ * PTRACE_INTERRUPT-stop has sig == SIGTRAP here.
+ */
+ switch (sig) {
+ case SIGSTOP:
+ case SIGTSTP:
+ case SIGTTIN:
+ case SIGTTOU:
+ wd->te = TE_GROUP_STOP;
+ break;
+ default:
+ wd->te = TE_RESTART;
+ }
+ break;
+ case PTRACE_EVENT_EXEC:
+ if (ptrace(PTRACE_GETEVENTMSG, pid, NULL,
+ &wd->msg) < 0)
+ wd->msg = 0;
+
+ wd->te = TE_STOP_BEFORE_EXECVE;
+ break;
+ case PTRACE_EVENT_EXIT:
+ wd->te = TE_STOP_BEFORE_EXIT;
+ break;
+ default:
+ wd->te = TE_RESTART;
+ }
+ }
+
+ if (extra_tcp)
+ break;
+
+ pid = wait4(-1, &status, __WALL | WNOHANG, (cflag ? &ru : NULL));
+ wait_errno = errno;
+ wait_nohang = true;
}
- /*
- * As WCONTINUED flag has not been specified to wait4,
- * it cannot be WIFCONTINUED(status), so the only case
- * that remains is WIFSTOPPED(status).
- */
+next_event_get_tcp:
+ elem = list_remove_head(&pending_tcps);
+
+ if (!elem) {
+ memset(tcb_wait_tab, 0, sizeof(*tcb_wait_tab));
+ tcb_wait_tab->te = TE_NEXT;
+
+ return tcb_wait_tab;
+ } else {
+ tcp = list_elem(elem, struct tcb, wait_list);
+ debug_func_msg("dequeued pid %d", tcp->pid);
+ }
+next_event_exit:
/* Is this the very first time we see this tracee stopped? */
if (tcp->flags & TCB_STARTUP)
startup_tcb(tcp);
- const unsigned int sig = WSTOPSIG(status);
- const unsigned int event = (unsigned int) status >> 16;
+ clear_regs(tcp);
- switch (event) {
- case 0:
- /*
- * Is this post-attach SIGSTOP?
- * Interestingly, the process may stop
- * with STOPSIG equal to some other signal
- * than SIGSTOP if we happened to attach
- * just before the process takes a signal.
- */
- if (sig == SIGSTOP && (tcp->flags & TCB_IGNORE_ONE_SIGSTOP)) {
- debug_func_msg("ignored SIGSTOP on pid %d", tcp->pid);
- tcp->flags &= ~TCB_IGNORE_ONE_SIGSTOP;
- wd->te = TE_RESTART;
- } else if (sig == syscall_trap_sig) {
- wd->te = TE_SYSCALL_STOP;
- } else {
- memset(&wd->si, 0, sizeof(wd->si));
- /*
- * True if tracee is stopped by signal
- * (as opposed to "tracee received signal").
- * TODO: shouldn't we check for errno == EINVAL too?
- * We can get ESRCH instead, you know...
- */
- bool stopped = ptrace(PTRACE_GETSIGINFO, pid, 0, &wd->si) < 0;
- wd->te = stopped ? TE_GROUP_STOP : TE_SIGNAL_DELIVERY_STOP;
- }
- break;
- case PTRACE_EVENT_STOP:
- /*
- * PTRACE_INTERRUPT-stop or group-stop.
- * PTRACE_INTERRUPT-stop has sig == SIGTRAP here.
- */
- switch (sig) {
- case SIGSTOP:
- case SIGTSTP:
- case SIGTTIN:
- case SIGTTOU:
- wd->te = TE_GROUP_STOP;
- break;
- default:
- wd->te = TE_RESTART;
- }
- break;
- case PTRACE_EVENT_EXEC:
- wd->te = TE_STOP_BEFORE_EXECVE;
- break;
- case PTRACE_EVENT_EXIT:
- wd->te = TE_STOP_BEFORE_EXIT;
- break;
- default:
- wd->te = TE_RESTART;
- }
+ /* Set current output file */
+ set_current_tcp(tcp);
- return wd;
+ return tcb_wait_tab + tcp->wait_data_idx;
}
static int
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0de9b4431..c5a50ae04 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -371,8 +371,7 @@ XFAIL_TESTS_m32 = $(STACKTRACE_TESTS)
XFAIL_TESTS_mx32 = $(STACKTRACE_TESTS)
XFAIL_TESTS_x86_64 = int_0x80.gen.test
XFAIL_TESTS_x32 = int_0x80.gen.test
-XFAIL_TESTS = $(XFAIL_TESTS_$(MPERS_NAME)) $(XFAIL_TESTS_$(ARCH)) \
- looping_threads.test
+XFAIL_TESTS = $(XFAIL_TESTS_$(MPERS_NAME)) $(XFAIL_TESTS_$(ARCH))
TEST_LOG_COMPILER = env
AM_TEST_LOG_FLAGS = STRACE_ARCH=$(ARCH) STRACE_NATIVE_ARCH=$(NATIVE_ARCH) \