/** * Seccomp System Interfaces * * Copyright (c) 2014 Red Hat * Author: Paul Moore */ /* * This library is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License as * published by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, see . */ #include #include #include #define _GNU_SOURCE #include #include "system.h" #include #include "arch.h" #include "db.h" #include "gen_bpf.h" #include "helper.h" /* NOTE: the seccomp syscall allowlist is currently disabled for testing * purposes, but unless we can verify all of the supported ABIs before * our next release we may have to enable the allowlist */ #define SYSCALL_ALLOWLIST_ENABLE 0 /* task global state */ struct task_state { /* seccomp(2) syscall */ int nr_seccomp; /* userspace notification fd */ int notify_fd; /* runtime support flags */ int sup_syscall; int sup_flag_tsync; int sup_flag_log; int sup_action_log; int sup_kill_process; int sup_flag_spec_allow; int sup_flag_new_listener; int sup_user_notif; int sup_flag_tsync_esrch; int sup_flag_wait_kill; }; static struct task_state state = { .nr_seccomp = -1, .notify_fd = -1, .sup_syscall = -1, .sup_flag_tsync = -1, .sup_flag_log = -1, .sup_action_log = -1, .sup_kill_process = -1, .sup_flag_spec_allow = -1, .sup_flag_new_listener = -1, .sup_user_notif = -1, .sup_flag_tsync_esrch = -1, .sup_flag_wait_kill = -1, }; /** * Reset the task state * * This function fully resets the library's global "system task state". * */ void sys_reset_state(void) { state.nr_seccomp = -1; if (state.notify_fd > 0) close(state.notify_fd); state.notify_fd = -1; state.sup_syscall = -1; state.sup_flag_tsync = -1; state.sup_flag_log = -1; state.sup_action_log = -1; state.sup_kill_process = -1; state.sup_flag_spec_allow = -1; state.sup_flag_new_listener = -1; state.sup_user_notif = -1; state.sup_flag_tsync_esrch = -1; } /** * Check to see if the seccomp() syscall is supported * * This function attempts to see if the system supports the seccomp() syscall. * Unfortunately, there are a few reasons why this check may fail, including * a previously loaded seccomp filter, so it is hard to say for certain. * Return one if the syscall is supported, zero otherwise. * */ int sys_chk_seccomp_syscall(void) { int rc; int nr_seccomp; /* NOTE: it is reasonably safe to assume that we should be able to call * seccomp() when the caller first starts, but we can't rely on * it later so we need to cache our findings for use later */ if (state.sup_syscall >= 0) return state.sup_syscall; #if SYSCALL_ALLOWLIST_ENABLE /* architecture allowlist */ switch (arch_def_native->token) { case SCMP_ARCH_X86_64: case SCMP_ARCH_ARM: case SCMP_ARCH_AARCH64: case SCMP_ARCH_LOONGARCH64: case SCMP_ARCH_PPC64: case SCMP_ARCH_PPC64LE: case SCMP_ARCH_S390: case SCMP_ARCH_S390X: case SCMP_ARCH_RISCV64: break; default: goto unsupported; } #endif nr_seccomp = arch_syscall_resolve_name(arch_def_native, "seccomp"); if (nr_seccomp < 0) goto unsupported; /* this is an invalid call because the second argument is non-zero, but * depending on the errno value of ENOSYS or EINVAL we can guess if the * seccomp() syscall is supported or not */ rc = syscall(nr_seccomp, SECCOMP_SET_MODE_STRICT, 1, NULL); if (rc < 0 && errno == EINVAL) goto supported; unsupported: state.sup_syscall = 0; return 0; supported: state.nr_seccomp = nr_seccomp; state.sup_syscall = 1; return 1; } /** * Force the seccomp() syscall support setting * @param enable the intended support state * * This function overrides the current seccomp() syscall support setting; this * is very much a "use at your own risk" function. * */ void sys_set_seccomp_syscall(bool enable) { state.sup_syscall = (enable ? 1 : 0); } /** * Check to see if a seccomp action is supported * @param action the seccomp action * * This function checks to see if a seccomp action is supported by the system. * Return one if the action is supported, zero otherwise. * */ int sys_chk_seccomp_action(uint32_t action) { if (action == SCMP_ACT_KILL_PROCESS) { if (state.sup_kill_process < 0) { if (sys_chk_seccomp_syscall() == 1 && syscall(state.nr_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0) state.sup_kill_process = 1; else state.sup_kill_process = 0; } return state.sup_kill_process; } else if (action == SCMP_ACT_KILL_THREAD) { return 1; } else if (action == SCMP_ACT_TRAP) { return 1; } else if ((action == SCMP_ACT_ERRNO(action & 0x0000ffff)) && ((action & 0x0000ffff) < MAX_ERRNO)) { return 1; } else if (action == SCMP_ACT_TRACE(action & 0x0000ffff)) { return 1; } else if (action == SCMP_ACT_LOG) { if (state.sup_action_log < 0) { if (sys_chk_seccomp_syscall() == 1 && syscall(state.nr_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0) state.sup_action_log = 1; else state.sup_action_log = 0; } return state.sup_action_log; } else if (action == SCMP_ACT_ALLOW) { return 1; } else if (action == SCMP_ACT_NOTIFY) { if (state.sup_user_notif < 0) { struct seccomp_notif_sizes sizes; if (sys_chk_seccomp_syscall() == 1 && syscall(state.nr_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == 0) state.sup_user_notif = 1; else state.sup_user_notif = 0; } return state.sup_user_notif; } return 0; } /** * Force a seccomp action support setting * @param action the seccomp action * @param enable the intended support state * * This function overrides the current seccomp action support setting; this * is very much a "use at your own risk" function. */ void sys_set_seccomp_action(uint32_t action, bool enable) { switch (action) { case SCMP_ACT_LOG: state.sup_action_log = (enable ? 1 : 0); break; case SCMP_ACT_KILL_PROCESS: state.sup_kill_process = (enable ? 1 : 0); break; case SCMP_ACT_NOTIFY: state.sup_user_notif = (enable ? 1 : 0); break; } } /** * Check to see if a seccomp() flag is supported by the kernel * @param flag the seccomp() flag * * This function checks to see if a seccomp() flag is supported by the kernel. * Return one if the flag is supported, zero otherwise. * */ static int _sys_chk_flag_kernel(int flag) { /* this is an invalid seccomp(2) call because the last argument * is NULL, but depending on the errno value of EFAULT we can * guess if the filter flag is supported or not */ if (sys_chk_seccomp_syscall() == 1 && syscall(state.nr_seccomp, SECCOMP_SET_MODE_FILTER, flag, NULL) == -1 && errno == EFAULT) return 1; return 0; } /** * Check to see if a seccomp() flag is supported * @param flag the seccomp() flag * * This function checks to see if a seccomp() flag is supported by the system. * Return one if the syscall is supported, zero if unsupported, negative values * on error. * */ int sys_chk_seccomp_flag(int flag) { switch (flag) { case SECCOMP_FILTER_FLAG_TSYNC: if (state.sup_flag_tsync < 0) state.sup_flag_tsync = _sys_chk_flag_kernel(flag); return state.sup_flag_tsync; case SECCOMP_FILTER_FLAG_LOG: if (state.sup_flag_log < 0) state.sup_flag_log = _sys_chk_flag_kernel(flag); return state.sup_flag_log; case SECCOMP_FILTER_FLAG_SPEC_ALLOW: if (state.sup_flag_spec_allow < 0) state.sup_flag_spec_allow = _sys_chk_flag_kernel(flag); return state.sup_flag_spec_allow; case SECCOMP_FILTER_FLAG_NEW_LISTENER: if (state.sup_flag_new_listener < 0) state.sup_flag_new_listener = _sys_chk_flag_kernel(flag); return state.sup_flag_new_listener; case SECCOMP_FILTER_FLAG_TSYNC_ESRCH: if (state.sup_flag_tsync_esrch < 0) state.sup_flag_tsync_esrch = _sys_chk_flag_kernel(flag); return state.sup_flag_tsync_esrch; case SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV: if (state.sup_flag_wait_kill < 0) state.sup_flag_wait_kill = _sys_chk_flag_kernel(flag); return state.sup_flag_wait_kill; } return -EOPNOTSUPP; } /** * Force a seccomp() syscall flag support setting * @param flag the seccomp() flag * @param enable the intended support state * * This function overrides the current seccomp() syscall support setting for a * given flag; this is very much a "use at your own risk" function. * */ void sys_set_seccomp_flag(int flag, bool enable) { switch (flag) { case SECCOMP_FILTER_FLAG_TSYNC: state.sup_flag_tsync = (enable ? 1 : 0); break; case SECCOMP_FILTER_FLAG_LOG: state.sup_flag_log = (enable ? 1 : 0); break; case SECCOMP_FILTER_FLAG_SPEC_ALLOW: state.sup_flag_spec_allow = (enable ? 1 : 0); break; case SECCOMP_FILTER_FLAG_NEW_LISTENER: state.sup_flag_new_listener = (enable ? 1 : 0); break; case SECCOMP_FILTER_FLAG_TSYNC_ESRCH: state.sup_flag_tsync_esrch = (enable ? 1 : 0); break; case SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV: state.sup_flag_wait_kill = (enable ? 1 : 0); break; } } /** * Loads the filter into the kernel * @param col the filter collection * @param rawrc pass the raw return code if true * * This function loads the given seccomp filter context into the kernel. If * the filter was loaded correctly, the kernel will be enforcing the filter * when this function returns. Returns zero on success, negative values on * error. * */ int sys_filter_load(struct db_filter_col *col, bool rawrc) { int rc; bool tsync_notify; bool listener_req; struct bpf_program *prgm = NULL; rc = db_col_precompute(col); if (rc < 0) return rc; prgm = col->prgm_bpf; /* attempt to set NO_NEW_PRIVS */ if (col->attr.nnp_enable) { rc = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); if (rc < 0) goto filter_load_out; } tsync_notify = state.sup_flag_tsync_esrch > 0 && state.notify_fd == -1; listener_req = state.sup_user_notif > 0 && \ col->notify_used && state.notify_fd == -1; /* load the filter into the kernel */ if (sys_chk_seccomp_syscall() == 1) { int flgs = 0; if (tsync_notify) { if (col->attr.tsync_enable) flgs |= SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH; if (listener_req) flgs |= SECCOMP_FILTER_FLAG_NEW_LISTENER; } else if (col->attr.tsync_enable) { if (listener_req) { /* NOTE: we _should_ catch this in db.c */ rc = -EFAULT; goto filter_load_out; } flgs |= SECCOMP_FILTER_FLAG_TSYNC; } else if (listener_req) flgs |= SECCOMP_FILTER_FLAG_NEW_LISTENER; if ((flgs & SECCOMP_FILTER_FLAG_NEW_LISTENER) && col->attr.wait_killable_recv) flgs |= SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV; if (col->attr.log_enable) flgs |= SECCOMP_FILTER_FLAG_LOG; if (col->attr.spec_allow) flgs |= SECCOMP_FILTER_FLAG_SPEC_ALLOW; rc = syscall(state.nr_seccomp, SECCOMP_SET_MODE_FILTER, flgs, prgm); if (tsync_notify && rc > 0) { /* return 0 on NEW_LISTENER success, but save the fd */ state.notify_fd = rc; rc = 0; } else if (rc > 0 && col->attr.tsync_enable) { /* always return -ESRCH if we fail to sync threads */ errno = ESRCH; rc = -errno; } else if (rc > 0 && state.sup_user_notif > 0) { /* return 0 on NEW_LISTENER success, but save the fd */ state.notify_fd = rc; rc = 0; } } else rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prgm); filter_load_out: /* cleanup and return */ if (rc == -ESRCH) return -ESRCH; if (rc < 0) return (rawrc ? -errno : -ECANCELED); return rc; } /** * Return the userspace notification fd * * This function returns the userspace notification fd from * SECCOMP_FILTER_FLAG_NEW_LISTENER. If the notification fd has not yet been * set, or an error has occurred, -1 is returned. * */ int sys_notify_fd(void) { return state.notify_fd; } /** * Allocate a pair of notification request/response structures * @param req the request location * @param resp the response location * * This function allocates a pair of request/response structure by computing * the correct sized based on the currently running kernel. It returns zero on * success, and negative values on failure. * */ int sys_notify_alloc(struct seccomp_notif **req, struct seccomp_notif_resp **resp) { int rc; static struct seccomp_notif_sizes sizes = { 0, 0, 0 }; if (state.sup_syscall <= 0) return -EOPNOTSUPP; if (sizes.seccomp_notif == 0 && sizes.seccomp_notif_resp == 0) { rc = syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes); if (rc < 0) return -ECANCELED; } if (sizes.seccomp_notif == 0 || sizes.seccomp_notif_resp == 0) return -EFAULT; if (req) { *req = zmalloc(sizes.seccomp_notif); if (!*req) return -ENOMEM; } if (resp) { *resp = zmalloc(sizes.seccomp_notif_resp); if (!*resp) { if (req) free(*req); return -ENOMEM; } } return 0; } /** * Receive a notification from a seccomp notification fd * @param fd the notification fd * @param req the request buffer to save into * * Blocks waiting for a notification on this fd. This function is thread safe * (synchronization is performed in the kernel). Returns zero on success, * negative values on error. * */ int sys_notify_receive(int fd, struct seccomp_notif *req) { if (state.sup_user_notif <= 0) return -EOPNOTSUPP; if (ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, req) < 0) return -ECANCELED; return 0; } /** * Send a notification response to a seccomp notification fd * @param fd the notification fd * @param resp the response buffer to use * * Sends a notification response on this fd. This function is thread safe * (synchronization is performed in the kernel). Returns zero on success, * negative values on error. * */ int sys_notify_respond(int fd, struct seccomp_notif_resp *resp) { if (state.sup_user_notif <= 0) return -EOPNOTSUPP; if (ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0) return -ECANCELED; return 0; } /** * Check if a notification id is still valid * @param fd the notification fd * @param id the id to test * * Checks to see if a notification id is still valid. Returns 0 on success, and * negative values on failure. * */ int sys_notify_id_valid(int fd, uint64_t id) { int rc; if (state.sup_user_notif <= 0) return -EOPNOTSUPP; rc = ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id); if (rc < 0 && errno == EINVAL) /* It is possible that libseccomp was built against newer kernel * headers than the kernel it is running on. If so, the older * runtime kernel may not support the "fixed" * SECCOMP_IOCTL_NOTIF_ID_VALID ioctl number which was introduced in * kernel commit 47e33c05f9f0 ("seccomp: Fix ioctl number for * SECCOMP_IOCTL_NOTIF_ID_VALID"). Try the old value. */ rc = ioctl(fd, SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR, &id); if (rc < 0) return -ENOENT; return 0; }