From 2dcefbd6094681dcbd2d7ff608590e74425a9220 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Fri, 28 May 2021 18:17:35 +0200 Subject: jail: add support for cgroup devices as in OCI run-time spec Implement eBPF generator to emulate cgroup-v1 devices.{allow,deny} as we got only cgroup-v2 available while the spec was written having cgroups-v1 in mind. Instead of literally emulating the legacy behavior, do like other runtimes do as well when running on cgroup-v2: simply translate each device rule into a bunch of eBPF instructions and then execute them in reverse order, prepended by some default rules covering /dev/null, /dev/random, /dev/tty, ... Signed-off-by: Daniel Golle --- jail/cgroups-bpf.c | 451 +++++++++++++++++++++++++++++++++++++++++++++++++++++ jail/cgroups-bpf.h | 20 +++ jail/cgroups.c | 34 +++- jail/jail.c | 11 +- 4 files changed, 503 insertions(+), 13 deletions(-) create mode 100644 jail/cgroups-bpf.c create mode 100644 jail/cgroups-bpf.h (limited to 'jail') diff --git a/jail/cgroups-bpf.c b/jail/cgroups-bpf.c new file mode 100644 index 0000000..b0079d9 --- /dev/null +++ b/jail/cgroups-bpf.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2021 Daniel Golle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * somehow emulate devices.allow/devices.deny using eBPF + * + * OCI run-time spec defines the syntax for allowing/denying access + * to devices according to the definition of cgroup-v1 in the Kernel + * as described in Documentation/admin-guide/cgroup-v1. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "cgroups.h" +#include "cgroups-bpf.h" +#include "log.h" + +static struct bpf_insn *program = NULL; +static int bpf_total_insn = 0; +static const char *license = "GPL"; + +static int +syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size) +{ + return (int) syscall (__NR_bpf, cmd, attr, size); +} + +/* from crun/src/libcrun/ebpf.c */ +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn){ .code = BPF_ALU | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn){ \ + .code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 }) + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 }) + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn){ .code = BPF_JMP | BPF_JA, .dst_reg = 0, .src_reg = 0, .off = OFF, .imm = 0 }) + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = OFF, .imm = IMM }) + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 }) + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM }) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn){ .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn){ .code = BPF_JMP | BPF_EXIT, .dst_reg = 0, .src_reg = 0, .off = 0, .imm = 0 }) + +/* taken from systemd. */ +static const struct bpf_insn pre_insn[] = { + /* type -> R2. */ + BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0), + BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF), + /* access -> R3. 
*/ + BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0), + BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16), + /* major -> R4. */ + BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4), + /* minor -> R5. */ + BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8), +}; + +enum { + OCI_LINUX_CGROUPS_DEVICES_ALLOW, + OCI_LINUX_CGROUPS_DEVICES_TYPE, + OCI_LINUX_CGROUPS_DEVICES_MAJOR, + OCI_LINUX_CGROUPS_DEVICES_MINOR, + OCI_LINUX_CGROUPS_DEVICES_ACCESS, + __OCI_LINUX_CGROUPS_DEVICES_MAX, +}; + +static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = { + [OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL }, + [OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING }, + [OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 }, + [OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 }, + [OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING }, +}; + +/* + * cgroup-v1 devices got a (default) behaviour and a list of exceptions. + * define datatypes similar to the legacy kernel code. 
+ */ +#define DEVCG_DEV_ALL (BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR) +#define DEVCG_ACC_ALL (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD) + +enum devcg_behavior { + DEVCG_DEFAULT_NONE, + DEVCG_DEFAULT_ALLOW, + DEVCG_DEFAULT_DENY, +}; + +struct dev_exception_item { + uint32_t major, minor; + short type; + short access; + struct list_head list; + bool allow; +}; + +/* + * add a bunch of default rules + */ +static int add_default_exceptions(struct list_head *exceptions) +{ + int i, ret = 0; + struct dev_exception_item *cur; + /* from crun/src/libcrun/cgroup.c */ + const struct dev_exception_item defrules[] = { + /* always allow mknod */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD }, + { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD }, + /* /dev/null */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 3, .access = DEVCG_ACC_ALL }, + /* /dev/random */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 8, .access = DEVCG_ACC_ALL }, + /* /dev/full */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 7, .access = DEVCG_ACC_ALL }, + /* /dev/tty */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 0, .access = DEVCG_ACC_ALL }, + /* /dev/zero */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 5, .access = DEVCG_ACC_ALL }, + /* /dev/urandom */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 9, .access = DEVCG_ACC_ALL }, + /* /dev/console */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 1, .access = DEVCG_ACC_ALL }, + /* /dev/pts/[0-255] */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 136, .minor = ~0, .access = DEVCG_ACC_ALL }, + /* /dev/ptmx */ + { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 2, .access = DEVCG_ACC_ALL }, + /* /dev/net/tun */ + { .allow = true, .type = 
BPF_DEVCG_DEV_CHAR, .major = 10, .minor = 200, .access = DEVCG_ACC_ALL }, + }; + + for (i = 0; i < (sizeof(defrules) / sizeof(struct dev_exception_item)); ++i) { + cur = malloc(sizeof(struct dev_exception_item)); + if (!cur) { + ret = ENOMEM; + break; + } + /* add defaults to list in reverse order (last item will be first in list) */ + memcpy(cur, &defrules[i], sizeof(struct dev_exception_item)); + list_add(&cur->list, exceptions); + } + + return ret; +} + +/* + * free all exceptions in the list + */ +static void flush_exceptions(struct list_head *freelist) +{ + struct dev_exception_item *dl, *dln; + + if (!list_empty(freelist)) + list_for_each_entry_safe(dl, dln, freelist, list) { + list_del(&dl->list); + free(dl); + } +} + +/* + * parse OCI cgroups devices and translate into cgroups-v2 eBPF program + */ +int parseOCIlinuxcgroups_devices(struct blob_attr *msg) +{ + struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX]; + struct blob_attr *cur; + int rem, ret = 0; + int bpf_type, bpf_access; + unsigned char acidx; + bool allow = false, + has_access = false, + has_type = false, + has_major = false, + has_minor = false; + int total_ins = 0, + cur_ins = 0, + pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn), + next_ins; + char *access, *devtype; + uint32_t devmajor, devminor; + struct dev_exception_item *dl; + struct list_head exceptions; + enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW; + INIT_LIST_HEAD(&exceptions); + + /* parse according to OCI spec */ + blobmsg_for_each_attr(cur, msg, rem) { + blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX, + tb, blobmsg_data(cur), blobmsg_len(cur)); + + if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) { + ret = EINVAL; + goto out; + } + + allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]); + + bpf_access = 0; + if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) { + access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]); + if ((strlen(access) > 3) || (strlen(access) == 0)) { + ret 
= EINVAL; + goto out; + } + + for (acidx = 0; acidx < strlen(access); ++acidx) { + switch (access[acidx]) { + case 'r': + bpf_access |= BPF_DEVCG_ACC_READ; + break; + case 'w': + bpf_access |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + bpf_access |= BPF_DEVCG_ACC_MKNOD; + break; + default: + ret = EINVAL; + goto out; + } + } + } + + if (!bpf_access) + bpf_access = DEVCG_ACC_ALL; + + bpf_type = 0; + if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) { + devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]); + + switch (devtype[0]) { + case 'c': + bpf_type = BPF_DEVCG_DEV_CHAR; + break; + case 'b': + bpf_type = BPF_DEVCG_DEV_BLOCK; + break; + case 'a': + bpf_type = DEVCG_DEV_ALL; + break; + default: + ret = EINVAL; + goto out; + } + } + + if (!bpf_type) + bpf_type = DEVCG_DEV_ALL; + + if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]) + devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]); + else + devmajor = ~0; + + if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]) + devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]); + else + devminor = ~0; + + if (bpf_type == DEVCG_DEV_ALL) { + /* wildcard => change default policy and flush all existing rules */ + flush_exceptions(&exceptions); + behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY; + } else { + /* allocate and populate record for exception */ + dl = malloc(sizeof(struct dev_exception_item)); + if (!dl) { + ret = ENOSPC; + break; + } + dl->allow = allow; + dl->type = bpf_type; + dl->access = bpf_access; + dl->major = devmajor; + dl->minor = devminor; + + /* push to exceptions list, last goes first */ + list_add(&dl->list, &exceptions); + } + } + if (ret) + goto out; + + /* add default rules */ + ret = add_default_exceptions(&exceptions); + if (ret) + goto out; + + /* calculate number of instructions to allocate */ + list_for_each_entry(dl, &exceptions, list) { + has_access = dl->access != DEVCG_ACC_ALL; + has_type = dl->type != DEVCG_DEV_ALL; + has_major = dl->major != ~0; + has_minor = dl->minor != ~0; 
+ + total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2; + } + + /* account for loader instructions */ + total_ins += pre_insn_len; + + /* final accept/deny block */ + total_ins += 2; + + /* allocate memory for eBPF program */ + program = calloc(total_ins, sizeof(struct bpf_insn)); + if (!program) { + ret = ENOMEM; + goto out; + } + + /* copy program loader instructions */ + memcpy(program, &pre_insn, sizeof(pre_insn)); + cur_ins = pre_insn_len; + + /* generate eBPF program */ + list_for_each_entry(dl, &exceptions, list) { + has_access = dl->access != DEVCG_ACC_ALL; + has_type = dl->type != DEVCG_DEV_ALL; + has_major = dl->major != ~0; + has_minor = dl->minor != ~0; + + next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1; + + if (has_type) { + program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins); + --next_ins; + } + + if (has_access) { + program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3); + program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access); + program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2); + next_ins -= 3; + } + + if (has_major) { + program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins); + --next_ins; + } + + if (has_minor) { + program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins); + --next_ins; + } + + program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 
1 : 0); + program[cur_ins++] = BPF_EXIT_INSN(); + } + + /* default behavior */ + program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0); + program[cur_ins++] = BPF_EXIT_INSN(); + + if (debug) { + fprintf(stderr, "cgroup devices:\na > devices.%s\n", + (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny"); + + list_for_each_entry(dl, &exceptions, list) + fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n", + (dl->type == DEVCG_DEV_ALL)?'a': + (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b', + (dl->major == ~0)?-1:dl->major, + (dl->minor == ~0)?-1:dl->minor, + (dl->access & BPF_DEVCG_ACC_READ)?"r":"", + (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"", + (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"", + (dl->allow)?"allow":"deny"); + + fprintf(stderr, "generated cgroup-devices eBPF program:\n"); + fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n"); + for (cur_ins=0; cur_ins + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 2.1 + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _JAIL_CGROUPS_BPF_H +#define _JAIL_CGROUPS_BPF_H + +int parseOCIlinuxcgroups_devices(struct blob_attr *msg); +int attach_cgroups_ebpf(int cgroup_dirfd); + +#endif diff --git a/jail/cgroups.c b/jail/cgroups.c index 68bd189..185d18f 100644 --- a/jail/cgroups.c +++ b/jail/cgroups.c @@ -16,7 +16,6 @@ * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2 * * ToDo: - * - convert cgroup1 devices to eBPF program * - convert cgroup1 net_prio and net_cls to eBPF program * - rdma (anyone?) intelrdt (anyone?) 
*/ @@ -43,6 +42,7 @@ #include "log.h" #include "cgroups.h" +#include "cgroups-bpf.h" #define CGROUP_ROOT "/sys/fs/cgroup/" #define CGROUP_IO_WEIGHT_MAX 10000 @@ -197,11 +197,22 @@ void cgroups_apply(pid_t pid) close(fd); } + int dirfd = open(cgroup_path, O_DIRECTORY); + if (dirfd < 0) { + ERROR("can't open %s: %m\n", cgroup_path); + } else { + attach_cgroups_ebpf(dirfd); + close(dirfd); + } + snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs"); fd = open(ent, O_WRONLY); - assert(fd != -1); - dprintf(fd, "%d", pid); - close(fd); + if (fd < 0) { + ERROR("can't open %s: %m\n", ent); + } else { + dprintf(fd, "%d", pid); + close(fd); + } free(ent); } @@ -349,7 +360,8 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg) numweightstrs = 0; if (weight > -1) - asprintf(&weightstrs[numweightstrs++], "default %d", weight); + if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0) + return ENOMEM; blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) { uint64_t major, minor; @@ -382,7 +394,8 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg) major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]); minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]); - asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight); + if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0) + return ENOMEM; } if (numweightstrs) { @@ -785,8 +798,7 @@ int parseOCIlinuxcgroups(struct blob_attr *msg) blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); - if (tb[OCI_LINUX_CGROUPS_DEVICES] || - tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] || + if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] || tb[OCI_LINUX_CGROUPS_INTELRDT] || tb[OCI_LINUX_CGROUPS_NETWORK] || tb[OCI_LINUX_CGROUPS_RDMA]) @@ -804,6 +816,12 @@ int parseOCIlinuxcgroups(struct blob_attr 
*msg) return ret; } + if (tb[OCI_LINUX_CGROUPS_DEVICES]) { + ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]); + if (ret) + return ret; + } + if (tb[OCI_LINUX_CGROUPS_MEMORY]) { ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]); if (ret) diff --git a/jail/jail.c b/jail/jail.c index ce4f50c..c350be2 100644 --- a/jail/jail.c +++ b/jail/jail.c @@ -514,8 +514,7 @@ static int apply_sysctl(const char *jail_root) if (!opts.sysctl) return 0; - asprintf(&procdir, "%s/proc", jail_root); - if (!procdir) + if (asprintf(&procdir, "%s/proc", jail_root) < 0) return ENOMEM; mkdir(procdir, 0700); @@ -525,8 +524,7 @@ static int apply_sysctl(const char *jail_root) cur = opts.sysctl; while (*cur) { - asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry); - if (!fname) + if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0) return ENOMEM; DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname); @@ -2581,7 +2579,10 @@ int main(int argc, char **argv) ret=-1; goto errout; } - asprintf(&jsonfile, "%s/config.json", opts.ocibundle); + if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) { + ret=-ENOMEM; + goto errout; + } ocires = parseOCI(jsonfile); free(jsonfile); if (ocires) { -- cgit v1.2.1