summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGreg Depoire--Ferrer <greg.depoire@gmail.com>2020-10-29 00:51:30 +0100
committerLennart Poettering <lennart@poettering.net>2020-12-10 16:13:02 +0100
commit6597686865ffcba7450b44814618b94321cfa3cf (patch)
treefacc7785d20b1f8ce5fb8ddc0d9ef479d042f1e0
parent104fc4be11e4429edcb81d39c2299433b71c54f6 (diff)
downloadsystemd-6597686865ffcba7450b44814618b94321cfa3cf.tar.gz
seccomp: don't install filters for archs that can't use syscalls
When seccomp_restrict_archs is called, architectures that are blocked are replaced by the SECCOMP_LOCAL_ARCH_BLOCKED marker so that they are not disabled again and filters are not installed for them. This can make some service that use SystemCallArchitecture= and SystemCallFilter= start faster.
-rw-r--r--TODO4
-rw-r--r--src/shared/seccomp-util.c47
-rw-r--r--src/shared/seccomp-util.h14
3 files changed, 41 insertions, 24 deletions
diff --git a/TODO b/TODO
index 4affa324dc..c4f000e20f 100644
--- a/TODO
+++ b/TODO
@@ -135,10 +135,6 @@ Features:
o move into separate libsystemd-shared-iptables.so .so
- iptables-libs (only used by nspawn + networkd)
-* seccomp: when SystemCallArchitectures=native is set then don't install any
- other seccomp filters for any of the other archs, in order to reduce the
- number of seccomp filters we install needlessly.
-
* seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can.
Apparently kernel performance is much better with fewer larger seccomp
filters than with more smaller seccomp filters.
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
index ccae9d4f14..b5f9ad9adc 100644
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@@ -23,7 +23,8 @@
#include "string-util.h"
#include "strv.h"
-const uint32_t seccomp_local_archs[] = {
+/* This array will be modified at runtime as seccomp_restrict_archs is called. */
+uint32_t seccomp_local_archs[] = {
/* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
@@ -94,7 +95,7 @@ const uint32_t seccomp_local_archs[] = {
#elif defined(__s390__)
SCMP_ARCH_S390,
#endif
- (uint32_t) -1
+ SECCOMP_LOCAL_ARCH_END
};
const char* seccomp_arch_to_string(uint32_t c) {
@@ -1758,8 +1759,8 @@ int seccomp_memory_deny_write_execute(void) {
int seccomp_restrict_archs(Set *archs) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
- void *id;
int r;
+ bool blocked_new = false;
/* This installs a filter with no rules, but that restricts the system call architectures to the specified
* list.
@@ -1775,25 +1776,37 @@ int seccomp_restrict_archs(Set *archs) {
if (!seccomp)
return -ENOMEM;
- SET_FOREACH(id, archs) {
- r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
- if (r < 0 && r != -EEXIST)
- return r;
- }
+ for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
+ uint32_t arch = seccomp_local_archs[i];
- /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
- * x32 syscalls should basically match x86-64 for everything except the pointer type.
- * The important thing is that you can block the old 32-bit x86 syscalls.
- * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
+ /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
+ if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
+ continue;
- if (seccomp_arch_native() == SCMP_ARCH_X32 ||
- set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
+ bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
- r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
- if (r < 0 && r != -EEXIST)
- return r;
+ /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
+ * x32 syscalls should basically match x86-64 for everything except the pointer type.
+ * The important thing is that you can block the old 32-bit x86 syscalls.
+ * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
+ if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
+ block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
+
+ if (block) {
+ seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
+ blocked_new = true;
+ } else {
+ r = seccomp_arch_add(seccomp, arch);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
}
+ /* All architectures that will be blocked by the seccomp program were
+ * already blocked. */
+ if (!blocked_new)
+ return 0;
+
r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
if (r < 0)
return r;
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
index 610597127e..53b74bdc34 100644
--- a/src/shared/seccomp-util.h
+++ b/src/shared/seccomp-util.h
@@ -100,12 +100,20 @@ int seccomp_lock_personality(unsigned long personality);
int seccomp_protect_hostname(void);
int seccomp_restrict_suid_sgid(void);
-extern const uint32_t seccomp_local_archs[];
+extern uint32_t seccomp_local_archs[];
+
+#define SECCOMP_LOCAL_ARCH_END UINT32_MAX
+
+/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would
+ * never be in the seccomp_local_archs array anyway so we can use it as a
+ * marker. */
+#define SECCOMP_LOCAL_ARCH_BLOCKED 0
#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
- seccomp_local_archs[_i] != (uint32_t) -1; \
- (arch) = seccomp_local_archs[++_i])
+ (arch) != SECCOMP_LOCAL_ARCH_END; \
+ (arch) = seccomp_local_archs[++_i]) \
+ if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED)
/* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1
* ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant