+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include "m7a.h"
+
+#include <lopsub.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+#include <pty.h>
+#include <utmp.h>
+#include <sys/socket.h>
+#include <sys/capability.h>
+#include <sys/syscall.h>
+
+#include "micoforia.lsg.h"
+
+/* Parse results; lpr is the one consulted by handle_version_and_help(). */
+static struct lls_parse_result *lpr;
+static struct lls_parse_result *sublpr;
+/* m7a_log() suppresses all messages whose level is below this value. */
+unsigned loglevel_arg_val = 4;
+
+/* Describes one network interface of a container. */
+struct ifspec {
+	/* name of the bridge the host side interface gets attached to */
+	char *bridge;
+	/* hardware address which is set on the peer interface */
+	uint8_t hwaddr[6];
+};
+
+/* Per-container configuration, assembled by check_options(). */
+struct container {
+	/* unique key, see get_container() */
+	char *name;
+	/* NULL means: fall back to the corresponding default option */
+	char *pre_start_hook;
+	char *pre_exec_hook;
+	/* NULL means: <default root prefix>/<name> */
+	char *root_dir;
+	char *init;
+	struct ifspec *ifspec;
+	/* this is never zero, even if no ifspec was given */
+	unsigned num_ifspecs;
+	/* NULL-terminated list of device access control entries */
+	char **dacl;
+	unsigned num_dac_entries;
+	/* NULL-terminated list of io.max entries */
+	char **io_max;
+	unsigned num_io_max_entries;
+	/* ~0U: not given, 0: unlimited */
+	unsigned cpu_cores;
+	unsigned memory_limit;
+	/* ~0U: not given */
+	unsigned init_type;
+	/* capabilities to remove from the bounding set */
+	cap_value_t *capdrop;
+	unsigned num_capdrops;
+	/* minor numbers of the ttys to forward, never contains zero */
+	uint32_t *tty;
+	unsigned num_ttys;
+};
+
+/* NULL-terminated array of all known containers, see FOR_EACH_CONTAINER(). */
+static struct container **container;
+/* Number of non-NULL entries of the above array. */
+static unsigned num_containers;
+
+/* State of a running container. */
+struct container_runtime {
+	/* for startup communication */
+	int pipe1[2];
+	int pipe2[2];
+	/* minor numbers of the forwarded ttys */
+	uint32_t *tty;
+	unsigned num_ttys;
+	/* One entry per forwarded tty. A negative client fd means: no client. */
+	int *master;
+	int *slave;
+	int *client;
+
+	/* in the parent namespace */
+	int init_pid;
+	char *pts;
+	char *root;
+	/* directory where the device nodes of the container are created */
+	char *dev;
+	/* requests are received on this socket */
+	int socket_fd;
+};
+
+/* Global defaults, used for containers which do not specify their own list. */
+static char **default_dacl;
+static char **default_io_max;
+unsigned num_default_dac_entries;
+unsigned num_default_io_max_entries;
+static cap_value_t *default_capdrop;
+unsigned num_default_capdrops;
+uint32_t *default_tty;
+unsigned num_default_ttys;
+/* m7a_log() compares this against the start subcommand. */
+static const struct lls_command *subcmd;
+/*
+ * Print a log message to stderr unless its level is below loglevel_arg_val.
+ *
+ * If the current subcommand is "start", each message is prefixed with a time
+ * stamp (including milliseconds) and the pid of the calling process.
+ *
+ * This function does not allocate memory.
+ */
+void m7a_log(int ll, const char* fmt,...)
+{
+	va_list argp;
+
+	if ((unsigned)ll < loglevel_arg_val)
+		return;
+	if (subcmd == lls_cmd(LSG_MICOFORIA_CMD_START, micoforia_suite)) {
+		char str[100] = "";
+		struct timespec t = {0};
+		struct tm *tm;
+		/*
+		 * Call clock_gettime() outside of assert() so that the call
+		 * is not compiled out when NDEBUG is defined.
+		 */
+		int ret = clock_gettime(CLOCK_REALTIME, &t);
+
+		assert(ret == 0);
+		(void)ret;
+		tm = localtime(&t.tv_sec);
+		if (tm) /* localtime() may fail and return NULL */
+			strftime(str, sizeof(str), "%b %d %H:%M:%S", tm);
+		fprintf(stderr, "%s:%04lu ", str,
+			(long unsigned)t.tv_nsec / 1000 / 1000);
+		fprintf(stderr, "(%u) ", (unsigned)getpid());
+	}
+	va_start(argp, fmt);
+	vfprintf(stderr, fmt, argp);
+	va_end(argp);
+}
+
+/*
+ * Log the lopsub error which corresponds to the (negative) return value,
+ * prefixed by the error context if there is one, then free the context and
+ * terminate via die().
+ */
+static void die_lopsub(int lopsub_ret, char **errctx)
+{
+	const char *reason = lls_strerror(-lopsub_ret);
+	char *ctx = *errctx;
+
+	if (ctx != NULL)
+		ERROR_LOG("%s: %s\n", ctx, reason);
+	else
+		ERROR_LOG("%s\n", reason);
+	free(ctx);
+	*errctx = NULL;
+	die("lopsub error");
+}
+
+/* Iterate over the NULL-terminated container array. */
+#define FOR_EACH_CONTAINER(_c) \
+	for (struct container **_cp = container; \
+		((_c) = *_cp) != NULL; \
+		_cp++)
+
+/* Look up a container by name. Returns NULL if there is no such container. */
+static struct container *get_container(const char *name)
+{
+	struct container *cur;
+
+	FOR_EACH_CONTAINER(cur) {
+		if (strcmp(cur->name, name) == 0)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Return the container with the given name. If it does not exist yet, a new
+ * container with all numeric settings marked as "not given" is appended to
+ * the NULL-terminated container array.
+ */
+static struct container *get_or_append_container(const char *name)
+{
+	struct container *found = get_container(name);
+	struct container *fresh;
+	unsigned idx;
+
+	if (found != NULL)
+		return found;
+	idx = num_containers++;
+	/* one additional slot for the terminating NULL pointer */
+	container = xrealloc(container,
+		(num_containers + 1) * sizeof(struct container *));
+	fresh = xzmalloc(sizeof(struct container));
+	fresh->name = xstrdup(name);
+	/* ~0U means: not given */
+	fresh->cpu_cores = ~0U;
+	fresh->memory_limit = ~0U;
+	fresh->init_type = ~0U;
+	container[idx] = fresh;
+	container[idx + 1] = NULL;
+	return fresh;
+}
+
+/*
+ * Get the tty minor numbers of a container. The container specific list
+ * takes precedence over the default list. If neither is given, tty1 is
+ * returned. The result must not be freed. Returns the number of ttys.
+ */
+static unsigned get_container_ttys(const struct container *c, uint32_t **result)
+{
+	static uint32_t builtin_tty = 1;
+	uint32_t *list = &builtin_tty;
+	unsigned count = 1;
+
+	if (c->num_ttys > 0) {
+		list = c->tty;
+		count = c->num_ttys;
+	} else if (num_default_ttys > 0) {
+		list = default_tty;
+		count = num_default_ttys;
+	}
+	*result = list;
+	return count;
+}
+
+/* Indices into the clo_given_counter[] array below. */
+enum clo_given_counter {
+	CLOGC_DEFAULT_CGROUP_DAC,
+	CLOGC_CGROUP_DAC,
+	CLOGC_DEFAULT_IO_MAX,
+	CLOGC_IO_MAX,
+	NUM_CLOGCS
+};
+
+/*
+ * How many times each of the above options was given, as recorded by
+ * parse_options() right after the command line has been parsed.
+ */
+static unsigned clo_given_counter[NUM_CLOGCS];
+
+/*
+ * Convert arg with parse_cgroup_acl() and append the result to the
+ * NULL-terminated list, incrementing the entry count.
+ */
+static void append_dac_entry(const char *arg, char ***listp, unsigned *count)
+{
+	char *entry = parse_cgroup_acl(arg);
+	unsigned pos = *count;
+	char **list;
+
+	*count = pos + 1;
+	/* new entry plus terminating NULL pointer */
+	list = xrealloc(*listp, (pos + 2) * sizeof(char *));
+	list[pos] = entry;
+	list[pos + 1] = NULL;
+	*listp = list;
+}
+
+/*
+ * Append a copy of arg to the NULL-terminated list and increment the entry
+ * count.
+ */
+static void append_io_max_entry(const char *arg, char ***listp, unsigned *count)
+{
+	unsigned pos = *count;
+	char **list;
+
+	*count = pos + 1;
+	/* new entry plus terminating NULL pointer */
+	list = xrealloc(*listp, (pos + 2) * sizeof(char *));
+	list[pos] = xstrdup(arg);
+	list[pos + 1] = NULL;
+	*listp = list;
+}
+
+/* Append the n-th cgroup-dac option to the dacl of the container it names. */
+static void append_container_dac_entry(unsigned n)
+{
+	char *name, *val;
+	struct container *c;
+	const char *arg = OPT_STRING_VAL_N(n, MICOFORIA, CGROUP_DAC);
+
+	parse_compound_arg(arg, "cgroup-dac", &name, &val);
+	c = get_or_append_container(name);
+	free(name);
+	append_dac_entry(val, &c->dacl, &c->num_dac_entries);
+	free(val);
+}
+
+/* Append the n-th io-max option to the list of the container it names. */
+static void append_container_io_max_entry(unsigned n)
+{
+	char *name, *val;
+	struct container *c;
+	const char *arg = OPT_STRING_VAL_N(n, MICOFORIA, IO_MAX);
+
+	parse_compound_arg(arg, "io-max", &name, &val);
+	c = get_or_append_container(name);
+	free(name);
+	append_io_max_entry(val, &c->io_max, &c->num_io_max_entries);
+	free(val);
+}
+
+/*
+ * Validate all options and build the container array from them.
+ *
+ * Containers are created on demand as soon as an option refers to them. The
+ * function terminates the process via die() or die_errno() on invalid
+ * option values. On return every container has at least one ifspec.
+ */
+static void check_options(void)
+{
+	unsigned n, m;
+	const char *arg;
+	char *name, *val;
+	struct container *c;
+	uint32_t u32;
+
+	/* start out with an empty, NULL-terminated array */
+	container = xzmalloc(sizeof(struct container *));
+	/* loop backwards to let command line opts override config file opts */
+	for (n = OPT_GIVEN(MICOFORIA, CONTAINER) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CONTAINER);
+		check_name(arg);
+		get_or_append_container(arg);
+	}
+	for (n = OPT_GIVEN(MICOFORIA, PRE_START_HOOK) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_START_HOOK);
+		parse_compound_arg(arg, "pre-start-hook", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->pre_start_hook);
+		c->pre_start_hook = val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, PRE_EXEC_HOOK) - 1; n != ~0U; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, PRE_EXEC_HOOK);
+		parse_compound_arg(arg, "pre-exec-hook", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->pre_exec_hook);
+		c->pre_exec_hook = val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, CAPDROP) - 1; n != ~0U; n--) {
+		cap_value_t cap_val;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CAPDROP);
+		parse_compound_arg(arg, "capabilities", &name, &val);
+		c = get_or_append_container(name);
+		if (cap_from_name(val, &cap_val) < 0)
+			die_errno("%s: invalid capability: %s", name, val);
+		c->capdrop = xrealloc(c->capdrop,
+			++c->num_capdrops * sizeof(cap_value_t));
+		c->capdrop[c->num_capdrops - 1] = cap_val;
+		free(name);
+		free(val);
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP); n++) {
+		cap_value_t cap_val;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CAPDROP);
+		/* report the offending argument, val is not set here */
+		if (cap_from_name(arg, &cap_val) < 0)
+			die_errno("invalid default capability: %s", arg);
+		default_capdrop = xrealloc(default_capdrop,
+			++num_default_capdrops * sizeof(cap_value_t));
+		default_capdrop[num_default_capdrops - 1] = cap_val;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, TTY) - 1; n != ~0U; n--) {
+		uint32_t minor;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, TTY);
+		parse_compound_arg(arg, "tty", &name, &val);
+		c = get_or_append_container(name);
+		minor = atou32(val, "tty");
+		if (minor == 0)
+			die("can not capture tty0");
+		c->tty = xrealloc(c->tty, ++c->num_ttys * sizeof(uint32_t));
+		c->tty[c->num_ttys - 1] = minor;
+		free(name);
+		free(val);
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, DEFAULT_TTY); n++) {
+		uint32_t minor = OPT_UINT32_VAL_N(n, MICOFORIA, DEFAULT_TTY);
+		if (minor == 0)
+			die("can not capture tty0");
+		default_tty = xrealloc(default_tty,
+			++num_default_ttys * sizeof(uint32_t));
+		default_tty[num_default_ttys - 1] = minor;
+	}
+
+	for (n = OPT_GIVEN(MICOFORIA, ROOT_DIRECTORY) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, ROOT_DIRECTORY);
+		parse_compound_arg(arg, "root-directory", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->root_dir);
+		c->root_dir = val;
+	}
+	u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+	check_range(u32, 0, 65536, "default-cpu-cores");
+	for (n = OPT_GIVEN(MICOFORIA, CPU_CORES) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, CPU_CORES);
+		parse_compound_arg(arg, "cpu-cores", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		u32 = atou32(val, "cpu-cores");
+		free(val);
+		check_range(u32, 0, 65536, "cpu-cores");
+		c->cpu_cores = u32;
+	}
+	u32 = OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+	check_range(u32, 0, 1024 * 1024, "default-memory-limit");
+	for (n = OPT_GIVEN(MICOFORIA, MEMORY_LIMIT) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, MEMORY_LIMIT);
+		parse_compound_arg(arg, "memory-limit", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		u32 = atou32(val, "memory-limit");
+		free(val);
+		check_range(u32, 0, 1024 * 1024, "memory-limit");
+		c->memory_limit = u32;
+	}
+	for (n = OPT_GIVEN(MICOFORIA, INIT) - 1; n != ~0U ; n--) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, INIT);
+		parse_compound_arg(arg, "init", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		free(c->init);
+		c->init = val;
+	}
+	for (n = 0; n < OPT_GIVEN(MICOFORIA, NET); n++) {
+		struct ifspec *ifspec;
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, NET);
+		parse_compound_arg(arg, "net", &name, &val);
+		c = get_or_append_container(name);
+		free(name);
+		c->ifspec = xrealloc(c->ifspec,
+			++c->num_ifspecs * sizeof(struct ifspec));
+		ifspec = c->ifspec + c->num_ifspecs - 1;
+		parse_ifspec(val, &ifspec->bridge, ifspec->hwaddr);
+		free(val);
+	}
+
+	/*
+	 * For the list-valued options below, m is the number of times the
+	 * option was given when parse_options() recorded the counters. The
+	 * values at index m and above are appended first, followed by the
+	 * first m values (presumably: config file entries first, then the
+	 * command line entries).
+	 */
+	m = clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+		append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_CGROUP_DAC);
+		append_dac_entry(arg, &default_dacl, &num_default_dac_entries);
+	}
+	m = clo_given_counter[CLOGC_CGROUP_DAC];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, CGROUP_DAC); n++)
+		append_container_dac_entry(n);
+	for (n = 0; n < m; n++)
+		append_container_dac_entry(n);
+
+	m = clo_given_counter[CLOGC_DEFAULT_IO_MAX];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX); n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+		append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+	}
+	for (n = 0; n < m; n++) {
+		arg = OPT_STRING_VAL_N(n, MICOFORIA, DEFAULT_IO_MAX);
+		append_io_max_entry(arg, &default_io_max, &num_default_io_max_entries);
+	}
+	m = clo_given_counter[CLOGC_IO_MAX];
+	for (n = m; n < OPT_GIVEN(MICOFORIA, IO_MAX); n++)
+		append_container_io_max_entry(n);
+	for (n = 0; n < m; n++)
+		append_container_io_max_entry(n);
+
+	/* init default c->ifspec[] */
+	FOR_EACH_CONTAINER(c) {
+		if (c->num_ifspecs == 0) {
+			const char *br = OPT_STRING_VAL(MICOFORIA, DEFAULT_BRIDGE);
+			c->num_ifspecs = 1;
+			c->ifspec = xmalloc(sizeof(struct ifspec));
+			c->ifspec[0].bridge = xstrdup(br);
+			/* all-zero hardware address */
+			memset(c->ifspec[0].hwaddr, 0, 6);
+		}
+	}
+}
+
+/*
+ * Print the available subcommands to stdout. In verbose mode, one line per
+ * subcommand is printed which contains the name and the purpose. Otherwise
+ * only the names are printed as a comma separated list, with a line break
+ * once the line exceeds 70 characters.
+ */
+static void show_subcommand_summary(bool verbose)
+{
+#define LSG_MICOFORIA_CMD(_name) #_name
+	static const char * const subcommand_names[] = {LSG_MICOFORIA_SUBCOMMANDS NULL};
+#undef LSG_MICOFORIA_CMD
+	const struct lls_command *cmd;
+	unsigned column;
+	int i;
+
+	printf("Available subcommands:\n");
+	if (verbose) {
+		for (i = 1; (cmd = lls_cmd(i, micoforia_suite)); i++) {
+			const char *purpose = lls_purpose(cmd);
+			const char *name = lls_command_name(cmd);
+			printf("%-12s%s\n", name, purpose);
+		}
+		return;
+	}
+	/* the leading tab counts as eight characters */
+	column = 8;
+	printf("\t");
+	for (i = 0; i < LSG_NUM_MICOFORIA_SUBCOMMANDS; i++) {
+		if (i != 0)
+			column += printf(", ");
+		if (column > 70) {
+			printf("\n\t");
+			column = 8;
+		}
+		column += printf("%s", subcommand_names[i]);
+	}
+	printf("\n");
+}
+
+const char *GET_VERSION(void);
+/*
+ * Print the version text, the long or the short help, or, if no inputs were
+ * given, the verbose subcommand summary. In all of these cases the process
+ * exits successfully. Otherwise the function returns without doing anything.
+ */
+static void handle_version_and_help(void)
+{
+	char *help;
+
+	if (OPT_GIVEN(MICOFORIA, VERSION)) {
+		printf(PACKAGE " %s\n"
+			"Copyright (C) " COPYRIGHT_YEAR " " AUTHOR ".\n"
+			"License: " LICENSE " <" LICENSE_URL ">.\n"
+			"This is free software: you are free to change and redistribute it.\n"
+			"There is NO WARRANTY, to the extent permitted by law.\n"
+			"\n"
+			"Web page: " URL "\n"
+			"Clone URL: " CLONE_URL "\n"
+			"Gitweb: " GITWEB_URL "\n"
+			"Author's Home Page: " HOME_URL "\n"
+			"Send feedback to: " AUTHOR " <" EMAIL ">\n"
+			,
+			GET_VERSION()
+		);
+		exit(EXIT_SUCCESS);
+	}
+	if (OPT_GIVEN(MICOFORIA, DETAILED_HELP)) {
+		help = lls_long_help(CMD_PTR(MICOFORIA));
+	} else if (OPT_GIVEN(MICOFORIA, HELP)) {
+		help = lls_short_help(CMD_PTR(MICOFORIA));
+	} else {
+		if (lls_num_inputs(lpr) != 0)
+			return; /* a subcommand was given */
+		show_subcommand_summary(true /* verbose */);
+		exit(EXIT_SUCCESS);
+	}
+	printf("%s\n", help);
+	free(help);
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Return the path of the config file in a newly allocated string. This is
+ * either the argument of the config-file option or .micoforiarc in the home
+ * directory of the calling user, with /root as the fallback if the user has
+ * no passwd entry.
+ */
+static char *get_config_file_path(void)
+{
+	const char *home = "/root";
+	struct passwd *entry;
+
+	if (OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+		return xstrdup(OPT_STRING_VAL(MICOFORIA, CONFIG_FILE));
+	entry = getpwuid(getuid());
+	if (entry != NULL)
+		home = entry->pw_dir;
+	return msg("%s/.micoforiarc", home);
+}
+
+/*
+ * Parse the command line, then the config file, and merge the two parse
+ * results so that command line options take precedence.
+ *
+ * A missing config file is an error only if its path was given explicitly.
+ * An empty config file is ignored. All errors are fatal. On return, *lprp
+ * points to the final parse result.
+ */
+static void parse_options(int argc, char **argv, const struct lls_command *cmd,
+		struct lls_parse_result **lprp)
+{
+	struct lls_parse_result *cf_lpr, *merged_lpr;
+	struct stat st;
+	char *path, *errctx = NULL;
+	char **cf_argv;
+	const char *subcmd_name = NULL;
+	void *map;
+	size_t len;
+	int ret, cf_argc, fd;
+
+	ret = lls_parse(argc, argv, cmd, lprp, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	handle_version_and_help();
+	/* remember how often the list-valued options were given so far */
+	clo_given_counter[CLOGC_DEFAULT_CGROUP_DAC] =
+		OPT_GIVEN(MICOFORIA, DEFAULT_CGROUP_DAC);
+	clo_given_counter[CLOGC_CGROUP_DAC] = OPT_GIVEN(MICOFORIA, CGROUP_DAC);
+	clo_given_counter[CLOGC_DEFAULT_IO_MAX] =
+		OPT_GIVEN(MICOFORIA, DEFAULT_IO_MAX);
+	clo_given_counter[CLOGC_IO_MAX] = OPT_GIVEN(MICOFORIA, IO_MAX);
+
+	path = get_config_file_path();
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		if (errno != ENOENT || OPT_GIVEN(MICOFORIA, CONFIG_FILE))
+			die_errno("can not open config file %s", path);
+		/* no config file -- nothing to do */
+		goto free_path;
+	}
+	if (fstat(fd, &st) < 0)
+		die_errno("failed to stat config file %s", path);
+	len = st.st_size;
+	if (len == 0) /* config file is empty -- nothing to do */
+		goto close_fd;
+	map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (map == MAP_FAILED)
+		die_errno("failed to mmap config file %s", path);
+	if (cmd != CMD_PTR(MICOFORIA))
+		subcmd_name = lls_command_name(cmd);
+	ret = lls_convert_config(map, len, subcmd_name, &cf_argv, &errctx);
+	munmap(map, len);
+	if (ret < 0) {
+		ERROR_LOG("failed to convert config file %s\n", path);
+		die_lopsub(ret, &errctx);
+	}
+	cf_argc = ret;
+	ret = lls_parse(cf_argc, cf_argv, cmd, &cf_lpr, &errctx);
+	lls_free_argv(cf_argv);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	/* command line options override config file options */
+	ret = lls_merge(*lprp, cf_lpr, cmd, &merged_lpr, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	lls_free_parse_result(cf_lpr, cmd);
+	lls_free_parse_result(*lprp, cmd);
+	*lprp = merged_lpr;
+close_fd:
+	close(fd);
+free_path:
+	free(path);
+}
+
+/* The container specific pre-start hook, or the default if none was given. */
+static const char *get_pre_start_hook(const struct container *c)
+{
+	return c->pre_start_hook? c->pre_start_hook :
+		OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_START_HOOK);
+}
+
+/* The container specific pre-exec hook, or the default if none was given. */
+static const char *get_pre_exec_hook(const struct container *c)
+{
+	return c->pre_exec_hook? c->pre_exec_hook :
+		OPT_STRING_VAL(MICOFORIA, DEFAULT_PRE_EXEC_HOOK);
+}
+
+/*
+ * Return the root directory of the container in a newly allocated string.
+ * If no root directory was configured for the container, the result is the
+ * default root prefix, followed by a slash and the container name.
+ */
+static char *get_root_dir(const struct container *c)
+{
+	const char *prefix;
+
+	if (c->root_dir != NULL)
+		return xstrdup(c->root_dir);
+	prefix = OPT_STRING_VAL(MICOFORIA, DEFAULT_ROOT_PREFIX);
+	return msg("%s/%s", prefix, c->name);
+}
+
+/*
+ * Serialize all interface specifications of the container into a newly
+ * allocated string of space separated "bridge:hwaddr" items, where the
+ * hardware address is written as six colon separated hex bytes.
+ */
+static char *get_ifspec_string(const struct container *c)
+{
+	char *result = NULL;
+	unsigned i;
+
+	assert(c->num_ifspecs > 0);
+	for (i = 0; i < c->num_ifspecs; i++) {
+		const struct ifspec *spec = &c->ifspec[i];
+		const uint8_t *mac = spec->hwaddr;
+		const char *head = result != NULL? result : "";
+		const char *sep = result != NULL? " " : "";
+		char *next = msg("%s%s%s:%02x:%02x:%02x:%02x:%02x:%02x",
+			head, sep, spec->bridge,
+			mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+
+		free(result);
+		result = next;
+	}
+	return result;
+}
+
+/*
+ * Return the name of the interface with the given index in a newly allocated
+ * string. With a single interface the name is the container name, otherwise
+ * the bridge name is appended. Peer interface names have a "-g" suffix.
+ */
+static char *interface_name(const struct container *c, unsigned idx, bool peer)
+{
+	const char *bridge;
+
+	assert(idx < c->num_ifspecs);
+	if (c->num_ifspecs == 1) {
+		if (peer)
+			return msg("%s-g", c->name);
+		return xstrdup(c->name);
+	}
+	bridge = c->ifspec[idx].bridge;
+	return peer? msg("%s-%s-g", c->name, bridge) :
+		msg("%s-%s", c->name, bridge);
+}
+
+/* Export the root directory of the container as MICOFORIA_ROOT_DIR. */
+static void set_m7a_root_dir_env(const struct container *c)
+{
+	char *dir = get_root_dir(c);
+
+	DEBUG_LOG("root dir: %s\n", dir);
+	setenv("MICOFORIA_ROOT_DIR", dir, 1);
+	free(dir);
+}
+
+/*
+ * Run the pre-start hook of the container through /bin/sh -c.
+ *
+ * The container name, the root directory and the interface specifications
+ * are exported to the hook via MICOFORIA_* environment variables, which are
+ * removed again afterwards. Returns false if xexec() reports failure.
+ */
+static bool run_pre_start_hook(const struct container *c)
+{
+	char *hook = xstrdup(get_pre_start_hook(c));
+	char *argv[] = {"/bin/sh", "-c", hook, NULL};
+	char *ifspecs;
+	bool ok;
+
+	setenv("MICOFORIA_CONTAINER_NAME", c->name, 1);
+	set_m7a_root_dir_env(c);
+
+	ifspecs = get_ifspec_string(c);
+	DEBUG_LOG("ifspecs: %s\n", ifspecs);
+	setenv("MICOFORIA_IFSPECS", ifspecs, 1);
+	free(ifspecs);
+
+	INFO_LOG("running pre-start hook %s\n", hook);
+	ok = xexec(argv, NULL);
+	free(hook);
+	if (!ok)
+		ERROR_LOG("pre-start hook failed\n");
+	unsetenv("MICOFORIA_CONTAINER_NAME");
+	unsetenv("MICOFORIA_IFSPECS");
+	unsetenv("MICOFORIA_ROOT_DIR");
+	return ok;
+}
+
+/*
+ * Run the pre-exec hook of the container through /bin/sh -c, with
+ * MICOFORIA_ROOT_DIR set in the environment. A failing hook is fatal.
+ */
+static void run_pre_exec_hook(const struct container *c)
+{
+	char *hook = xstrdup(get_pre_exec_hook(c));
+	char *argv[] = {"/bin/sh", "-c", hook, NULL};
+	bool ok;
+
+	INFO_LOG("/bin/sh -c '%s'\n", hook);
+	set_m7a_root_dir_env(c);
+	ok = xexec(argv, NULL);
+	if (!ok)
+		die("%s: pre-exec hook failed", c->name);
+	free(hook);
+	unsetenv("MICOFORIA_ROOT_DIR");
+}
+
+/*
+ * Write the given text to an existing cgroup control file. Failure to open
+ * the file and short or failed writes are fatal.
+ */
+static void write_cgroup(const char *path, const char *txt)
+{
+	size_t len;
+	int fd = open(path, O_WRONLY);
+
+	if (fd < 0)
+		die_errno("open %s", path);
+	len = strlen(txt);
+	if (write(fd, txt, len) != len)
+		die_errno("could not write to %s", path);
+	close(fd);
+}
+
+/*
+ * Get the device access control list which applies to the container: the
+ * container specific list if given, else the default list if given, else a
+ * builtin list. The first character of each entry tells whether the entry
+ * denies ('d') or allows ('a') access. Returns the number of entries.
+ */
+static unsigned get_dacl(const struct container *c, char ***result)
+{
+	static char *builtin[] = {
+		"da", /* deny access to all devices except the ones below */
+		"ac 1:3 rwm", /* null */
+		"ac 1:5 rwm", /* zero */
+		"ac 1:7 rwm", /* full */
+		"ac 1:8 rwm", /* random */
+		"ac 1:9 rwm", /* urandom */
+		"ac 4:* rwm", /* tty?* */
+		"ac 5:0 rwm", /* tty */
+		"ac 5:2 rwm", /* ptmx */
+		"ac 136:* rwm", /* pts */
+	};
+	char **list = builtin;
+	unsigned count = ARRAY_SIZE(builtin);
+
+	if (c->num_dac_entries > 0) {
+		list = c->dacl;
+		count = c->num_dac_entries;
+	} else if (num_default_dac_entries > 0) {
+		list = default_dacl;
+		count = num_default_dac_entries;
+	}
+	*result = list;
+	return count;
+}
+
+/*
+ * Create the devices cgroup of the container below /var/cgroup/micoforia,
+ * write each entry of the device access control list to either the
+ * devices.allow or the devices.deny file, and finally move the calling
+ * process into the cgroup. All errors are fatal.
+ */
+static void apply_dacl(const struct container *c)
+{
+	char **dacl;
+	unsigned n, num_entries;
+	char *m7a_dir, *container_dir, *allow, *deny, *procs, *txt;
+	int fd, allow_fd, deny_fd;
+	size_t sz;
+
+	m7a_dir = msg("/var/cgroup/micoforia");
+	container_dir = msg("%s/%s", m7a_dir, c->name);
+	allow = msg("%s/devices.allow", container_dir);
+	deny = msg("%s/devices.deny", container_dir);
+	procs = msg("%s/cgroup.procs", container_dir);
+
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	free(m7a_dir);
+	if (mkdir(container_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", container_dir);
+	free(container_dir);
+	if ((allow_fd = open(allow, O_WRONLY)) < 0)
+		die_errno("open %s", allow);
+	free(allow);
+	if ((deny_fd = open(deny, O_WRONLY)) < 0)
+		die_errno("open %s", deny);
+	free(deny);
+
+	num_entries = get_dacl(c, &dacl);
+	INFO_LOG("applying %u entr%s\n", num_entries, num_entries == 1?
+		"y" : "ies");
+	for (n = 0; n < num_entries; n++) {
+		char *entry = dacl[n];
+		/* the first character selects the file, the rest is the rule */
+		bool is_allow = entry[0] == 'a';
+
+		DEBUG_LOG("dac entry #%u: %s %s\n", n, is_allow?
+			"allow" : "deny", entry + 1);
+		txt = msg("%s\n", entry + 1);
+		sz = strlen(txt);
+		fd = is_allow? allow_fd : deny_fd;
+		if (write(fd, txt, sz) != sz)
+			die_errno("could not write to cgroup devices.%s file",
+				is_allow? "allow" : "deny");
+		free(txt);
+	}
+	close(allow_fd);
+	close(deny_fd);
+	txt = msg("%u\n", (unsigned)getpid());
+	write_cgroup(procs, txt);
+	free(txt);
+	free(procs); /* was leaked before */
+}
+
+/*
+ * Check that the cgroup hierarchies are accessible at /var/cgroup (v1) and
+ * /var/cgroup2 (v2), create the micoforia cgroup in the v2 hierarchy and
+ * enable the cpu, memory and io controllers for it. All errors are fatal.
+ */
+static void cgroup_init(void)
+{
+	const char controllers[] = "+cpu +memory +io\n";
+	char *m7a_dir, *ctl;
+
+	if (access("/var/cgroup/cgroup.clone_children", F_OK) < 0)
+		die("cgroup v1 not mounted at /var/cgroup/");
+	/* This checks the v2 hierarchy, so say so in the error message. */
+	if (access("/var/cgroup2/cgroup.subtree_control", F_OK) < 0)
+		die("cgroup v2 not mounted at /var/cgroup2/");
+	write_cgroup("/var/cgroup2/cgroup.subtree_control", controllers);
+	m7a_dir = msg("/var/cgroup2/micoforia");
+	if (mkdir(m7a_dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", m7a_dir);
+	ctl = msg("%s/cgroup.subtree_control", m7a_dir);
+	free(m7a_dir);
+	write_cgroup(ctl, controllers);
+	free(ctl);
+}
+
+/*
+ * Create the v2 cgroup of the container below /var/cgroup2/micoforia and
+ * move the calling process into it. All errors are fatal.
+ */
+static void create_cgroup_v2(const struct container *c)
+{
+	/* large enough for any unsigned value, the newline and the NUL byte */
+	char buf[32];
+	char *ctl, *dir = msg("/var/cgroup2/micoforia/%s", c->name);
+
+	if (mkdir(dir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", dir);
+	ctl = msg("%s/cgroup.procs", dir);
+	free(dir);
+	snprintf(buf, sizeof(buf), "%u\n", (unsigned)getpid());
+	write_cgroup(ctl, buf);
+	free(ctl);
+}
+
+/* The container specific number of cores, or the default if not given. */
+static unsigned get_cpu_cores(const struct container *c)
+{
+	if (c->cpu_cores == ~0U) /* not given */
+		return OPT_UINT32_VAL(MICOFORIA, DEFAULT_CPU_CORES);
+	return c->cpu_cores;
+}
+
+/*
+ * Limit the CPU time of the container by writing quota and period to the
+ * cpu.max file of its v2 cgroup. The period is 1000000 and the quota is the
+ * period times the number of cores. Zero cores means unlimited, in which
+ * case nothing is written.
+ */
+static void apply_cpu_limit(const struct container *c)
+{
+	char *str, *ctl;
+	unsigned cores = get_cpu_cores(c);
+
+	if (cores == 0) /* unlimited */
+		return;
+	assert(cores != ~0U);
+	INFO_LOG("%u core%s\n", cores, cores == 1? "" : "s");
+	ctl = msg("/var/cgroup2/micoforia/%s/cpu.max", c->name);
+	/*
+	 * Compute the quota in 64 bit arithmetic. Up to 65536 cores are
+	 * accepted, and the product no longer fits into 32 bits as soon as
+	 * more than 4294 cores are requested.
+	 */
+	str = msg("%llu 1000000\n", 1000000ULL * cores);
+	write_cgroup(ctl, str);
+	free(ctl);
+	free(str);
+}
+
+/* The container specific memory limit, or the default if not given. */
+static unsigned get_memory_limit(const struct container *c)
+{
+	if (c->memory_limit == ~0U) /* not given */
+		return OPT_UINT32_VAL(MICOFORIA, DEFAULT_MEMORY_LIMIT);
+	return c->memory_limit;
+}
+
+/*
+ * Write the memory limit of the container, converted from gigabytes to
+ * bytes, to the memory.high file of its v2 cgroup. A limit of zero means
+ * unlimited, in which case nothing is written.
+ */
+static void apply_memory_limit(const struct container *c)
+{
+	const unsigned long long bytes_per_gig = 1024LLU * 1024LLU * 1024LLU;
+	unsigned gigs = get_memory_limit(c);
+	char *path, *value;
+
+	if (gigs == 0) /* unlimited */
+		return;
+	assert(gigs != ~0U);
+	INFO_LOG("%uG\n", gigs);
+	path = msg("/var/cgroup2/micoforia/%s/memory.high", c->name);
+	value = msg("%llu\n", bytes_per_gig * gigs);
+	write_cgroup(path, value);
+	free(path);
+	free(value);
+}
+
+/*
+ * Get the io.max entries which apply to the container: the container
+ * specific list if given, else the default list. If neither list is given,
+ * *result is set to NULL and zero is returned. Otherwise the return value
+ * is the number of entries. The result must not be freed.
+ */
+static unsigned get_iospecs(const struct container *c, char ***result)
+{
+	if (c->num_io_max_entries > 0) {
+		/* the io.max list, not the device access control list */
+		*result = c->io_max;
+		return c->num_io_max_entries;
+	}
+	if (num_default_io_max_entries > 0) {
+		*result = default_io_max;
+		return num_default_io_max_entries;
+	}
+	*result = NULL;
+	return 0;
+}
+
+/*
+ * Write all io.max entries which apply to the container to the io.max file
+ * of its v2 cgroup, one write per entry. Does nothing if there are no
+ * entries.
+ */
+static void apply_io_limit(const struct container *c)
+{
+	char **specs;
+	char *path;
+	unsigned i;
+	unsigned count = get_iospecs(c, &specs);
+
+	if (count == 0)
+		return;
+	INFO_LOG("%u entries\n", count);
+	path = msg("/var/cgroup2/micoforia/%s/io.max", c->name);
+	for (i = 0; i < count; i++)
+		write_cgroup(path, specs[i]);
+	free(path);
+}
+
+/* Remove the cgroup directories of the container in both hierarchies. */
+static void cgroup_cleanup(const struct container *c)
+{
+	char *v1_dir, *v2_dir;
+
+	v1_dir = msg("/var/cgroup/micoforia/%s", c->name);
+	remove_subdirs_recursively(v1_dir);
+	free(v1_dir);
+
+	v2_dir = msg("/var/cgroup2/micoforia/%s", c->name);
+	remove_subdirs_recursively(v2_dir);
+	free(v2_dir);
+}
+
+/*
+ * Bring up the loopback device and, for each interface specification of the
+ * container, create a veth device pair, set the hardware address of the
+ * peer, attach the other end to the bridge and bring it up. Returns false
+ * as soon as one of these steps fails. Failure to bring up the loopback
+ * device only results in a warning.
+ */
+static bool setup_network(const struct container *c)
+{
+	unsigned i;
+
+	if (!link_up("lo"))
+		WARNING_LOG("could not set establish loopback link\n");
+	for (i = 0; i < c->num_ifspecs; i++) {
+		struct ifspec *spec = c->ifspec + i;
+		char *host_if = interface_name(c, i, false);
+		char *guest_if = interface_name(c, i, true);
+		bool ok;
+
+		link_del(host_if); /* ignore errors */
+		ok = create_veth_device_pair(host_if, guest_if)
+			&& set_hwaddr(guest_if, spec->hwaddr)
+			&& attach_to_bridge(host_if, spec->bridge)
+			&& link_up(host_if);
+		free(host_if);
+		free(guest_if);
+		if (!ok)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * If fd refers to a terminal, turn off echo, signal generation and canonical
+ * mode, make reads return after a single byte, and copy the window size of
+ * stdin to the terminal. Errors are logged but otherwise ignored.
+ */
+static void setup_termios(int fd)
+{
+	struct termios attrs;
+	struct winsize size; /* see ioctl_tty(2) */
+
+	if (!isatty(fd))
+		return;
+	if (tcgetattr(fd, &attrs) != 0) {
+		ERROR_LOG("tcgetattr: %m\n");
+		return;
+	}
+	attrs.c_lflag &= ~(ECHO | ISIG | ICANON);
+	attrs.c_cc[VMIN] = 1;
+	attrs.c_cc[VTIME] = 0;
+	if (tcsetattr(fd, TCSAFLUSH, &attrs) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	if (ioctl(STDIN_FILENO, TIOCGWINSZ, &size) < 0)
+		return;
+	ioctl(fd, TIOCSWINSZ, &size);
+}
+
+/* Everything needed to create one character device node. */
+struct device_node_info {
+	/* device number */
+	unsigned major;
+	unsigned minor;
+	/* permission bits, applied with chmod(2) */
+	mode_t mode;
+	/* file name relative to the device directory */
+	const char *name;
+};
+
+/*
+ * Create a fixed set of character device nodes in the device directory of
+ * the container and set their permissions. Failure to create a node is
+ * fatal.
+ */
+static void create_standard_device_nodes(struct container_runtime *cr)
+{
+	const struct device_node_info devices[] = {
+		{.major = 1, .minor = 3, .mode = 0666, .name = "null"},
+		{.major = 1, .minor = 5, .mode = 0666, .name = "zero"},
+		{.major = 1, .minor = 7, .mode = 0666, .name = "full"},
+		{.major = 1, .minor = 8, .mode = 0666, .name = "random"},
+		{.major = 1, .minor = 9, .mode = 0666, .name = "urandom"},
+		{.major = 4, .minor = 0, .mode = 0620, .name = "tty0"},
+		{.major = 5, .minor = 1, .mode = 0600, .name = "console"},
+		{.major = 5, .minor = 2, .mode = 0666, .name = "ptmx"},
+	};
+	const struct device_node_info *dev;
+	const struct device_node_info *end = devices + ARRAY_SIZE(devices);
+
+	for (dev = devices; dev < end; dev++) {
+		char *node = msg("%s/%s", cr->dev, dev->name);
+
+		if (mknod(node, S_IFCHR, makedev(dev->major, dev->minor)) < 0)
+			die_errno("mknod %s", dev->name);
+		chmod(node, dev->mode);
+		free(node);
+	}
+}
+
+/*
+ * Mount a tmpfs at the device directory of the container, populate it with
+ * the standard device nodes, and bind mount the slave terminal of each
+ * forwarded tty over a freshly created tty device node. The first slave is
+ * also bind mounted over the console device. All errors are fatal.
+ */
+static void init_console(struct container_runtime *cr)
+{
+	char *console_path;
+	unsigned i;
+
+	if (mount(NULL, cr->dev, "tmpfs", 0, "size=500000,mode=755") < 0)
+		die("mount tmpfs at %s: %m", cr->dev);
+	create_standard_device_nodes(cr);
+	for (i = 0; i < cr->num_ttys; i++) {
+		uint32_t minor = cr->tty[i];
+		int slave_fd = cr->slave[i];
+		char *tty_path = msg("%s/tty%u", cr->dev, minor);
+
+		unlink(tty_path);
+		if (mknod(tty_path, S_IFCHR, makedev(4, minor)) < 0)
+			die("mknod %s: %m", tty_path);
+		chmod(tty_path, 0660);
+		setup_termios(slave_fd);
+		INFO_LOG("bind mounting %s -> %s\n", ttyname(slave_fd), tty_path);
+		if (mount(ttyname(slave_fd), tty_path, "none",
+				MS_BIND | MS_PRIVATE, NULL) < 0)
+			die("failed to bind mount %s: %m\n", tty_path);
+		free(tty_path);
+	}
+	console_path = msg("%s/console", cr->dev);
+	if (mount(ttyname(cr->slave[0]), console_path, "none",
+			MS_BIND | MS_PRIVATE, NULL) < 0)
+		die("failed to bind mount %s: %m\n", console_path);
+	free(console_path);
+}
+
+/*
+ * Lazily unmount the tty devices and the console device which were bind
+ * mounted by init_console().
+ *
+ * These umounts fail if the container shutdown already umounted the bind
+ * mounted devices. This is not fatal, so log only with low severity.
+ */
+static void shutdown_console(struct container_runtime *cr)
+{
+	unsigned n;
+	char *console;
+
+	for (n = 0; n < cr->num_ttys; n++) {
+		/* same path as the mount point used by init_console() */
+		char *tty = msg("%s/tty%u", cr->dev, cr->tty[n]);
+		if (umount2(tty, MNT_DETACH) < 0)
+			DEBUG_LOG("umount %s: %m\n", tty);
+		free(tty);
+	}
+	console = msg("%s/console", cr->dev);
+	if (umount2(console, MNT_DETACH) < 0)
+		DEBUG_LOG("umount %s: %m\n", console);
+	free(console);
+}
+
+/* Return "micoforia/<container name>" in a newly allocated string. */
+static char *get_socket_path(const char *container_name)
+{
+	char *path = msg("micoforia/%s", container_name);
+
+	return path;
+}
+
+/*
+ * Read and discard whatever the client has sent. On EOF or on read errors
+ * the fd is closed and invalidated by setting it to -1.
+ */
+static void dispatch_client(int *client)
+{
+	char scratch[1024];
+	ssize_t got = read(*client, scratch, sizeof(scratch));
+
+	if (got > 0)
+		return;
+	NOTICE_LOG("detaching client on fd %d\n", *client);
+	close(*client);
+	*client = -1;
+}
+
+/*
+ * Receive and handle one request on the container socket.
+ *
+ * Requests from other users are rejected. The "init_pid" request is answered
+ * with a zero byte followed by the pid of the init process. The "attach" and
+ * "force-attach" requests take the minor number of a forwarded tty; the
+ * master fd of this tty is passed to the client, which then stays attached.
+ * If another client is already attached, "attach" fails while "force-attach"
+ * detaches the old client. Error replies start with a \1 byte.
+ */
+static void dispatch_socket_request(struct container_runtime *cr)
+{
+	char buf[32];
+	uid_t uid;
+	int cfd;
+	uint32_t minor;
+	unsigned idx;
+	bool force = false;
+
+	memset(buf, 0, sizeof(buf));
+	if (!recv_cred_buffer(cr->socket_fd, buf, sizeof(buf) - 1, &cfd, &uid))
+		return;
+	if (uid != getuid()) {
+		const char msg[] = "\1EACCES";
+		send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+		NOTICE_LOG("access denied for uid %d\n", (int)uid);
+		goto close_cfd;
+	}
+	if (!strcmp(buf, "init_pid")) {
+		buf[0] = '\0';
+		memcpy(buf + 1, &cr->init_pid, sizeof(int));
+		send(cfd, buf, 1 + sizeof(int), MSG_DONTWAIT);
+		goto close_cfd;
+	}
+	if (sscanf(buf, "attach %u", &minor) != 1) {
+		if (sscanf(buf, "force-attach %u", &minor) != 1) {
+			const char msg[] = "\1EINVAL";
+			send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+			NOTICE_LOG("invalid request: %s\n", buf);
+			goto close_cfd;
+		}
+		force = true;
+	}
+	/* find the index of the requested tty */
+	for (idx = 0; idx < cr->num_ttys; idx++) {
+		INFO_LOG("n: %u, tty[n]: %u\n", idx, cr->tty[idx]);
+		if (cr->tty[idx] == minor)
+			break;
+	}
+	if (idx == cr->num_ttys) {
+		const char msg[] = "\1ENOTTY";
+		send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+		NOTICE_LOG("tty%u is not being forwarded\n", minor);
+		goto close_cfd;
+	}
+	if (cr->client[idx] >= 0) { /* another client is attached */
+		if (!force) {
+			const char msg[] = "\1EBUSY";
+			send(cfd, msg, sizeof(msg), MSG_DONTWAIT);
+			ERROR_LOG("tty%u is already in use\n", minor);
+			goto close_cfd;
+		}
+		close(cr->client[idx]);
+		cr->client[idx] = -1;
+	}
+	if (!pass_fd(cr->master[idx], cfd)) {
+		ERROR_LOG("could not pass master fd\n");
+		goto close_cfd;
+	}
+	NOTICE_LOG("attached client on fd %d to tty%u\n", cfd, minor);
+	/* keep the connection open while the client is attached */
+	cr->client[idx] = cfd;
+	return;
+close_cfd:
+	close(cfd);
+}
+
+/*
+ * Read up to 1024 bytes from src and write them to dst. The read is
+ * restarted if it gets interrupted by a signal. If dst is negative, the
+ * data is discarded. Returns false on EOF, on read or write errors and on
+ * short writes.
+ */
+static bool copy(int src, int dst)
+{
+	char buf[1024];
+	ssize_t nread, nwritten;
+
+	do
+		nread = read(src, buf, sizeof(buf));
+	while (nread < 0 && errno == EINTR);
+	if (nread < 0)
+		DEBUG_LOG("read from fd %d: %m\n", src);
+	if (nread <= 0)
+		return false;
+	if (dst < 0)
+		return true;
+	nwritten = write(dst, buf, nread);
+	if (nwritten < 0) {
+		DEBUG_LOG("write to fd %d: %m\n", dst);
+		return false;
+	}
+	if (nwritten != nread) {
+		DEBUG_LOG("short write to fd %d\n", dst);
+		return false;
+	}
+	return true;
+}
+
+/* Add fd to the read set and keep track of the highest fd seen so far. */
+static void watch_fd(int fd, fd_set *fds, int *max_fileno)
+{
+	FD_SET(fd, fds);
+	if (fd > *max_fileno)
+		*max_fileno = fd;
+}
+
+/*
+ * The function returns only when the process receives SIGCHLD and the child
+ * has terminated. In this case the return value is 0 for success, 1 for
+ * failure, and 2 if the child's exit code indicates a reboot request. Other
+ * signals are pushed down to the child process.
+ *
+ * While waiting, the loop serves requests on the container socket, reads
+ * and discards the input of attached clients, and copies the output of
+ * ttys without attached client to stdout (first tty) or discards it (all
+ * other ttys). In foreground mode, stdin is forwarded to the first tty.
+ */
+static int parent_loop(pid_t pid, const struct container *c,
+		struct container_runtime *cr)
+{
+	unsigned n;
+
+	init_signal_handling();
+	for (;;) {
+		int max_fileno = 0;
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		if (OPT_GIVEN(START, FOREGROUND))
+			watch_fd(STDIN_FILENO, &fds, &max_fileno);
+		watch_fd(signal_pipe[0], &fds, &max_fileno);
+		watch_fd(cr->socket_fd, &fds, &max_fileno);
+		for (n = 0; n < cr->num_ttys; n++) {
+			if (cr->client[n] >= 0) /* a client is attached */
+				watch_fd(cr->client[n], &fds, &max_fileno);
+			else /* detached */
+				watch_fd(cr->master[n], &fds, &max_fileno);
+		}
+		if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		if (FD_ISSET(signal_pipe[0], &fds)) {
+			int sig = next_signal();
+			if (sig != SIGCHLD) {
+				kill(pid, sig);
+			} else {
+				int wstatus;
+				pid_t wpid = waitpid(pid, &wstatus, WNOHANG);
+				if (wpid < 0) {
+					WARNING_LOG("wait: %m\n");
+				} else if (wpid > 0) {
+					cgroup_cleanup(c);
+					if (!WIFEXITED(wstatus))
+						return 1;
+					if (WEXITSTATUS(wstatus) == 2)
+						return 2;
+					return WEXITSTATUS(wstatus) != EXIT_SUCCESS;
+				}
+				/*
+				 * wpid == 0: the child has not changed state,
+				 * so wstatus is undefined. Keep going.
+				 */
+			}
+		}
+		if (FD_ISSET(cr->socket_fd, &fds))
+			dispatch_socket_request(cr);
+		for (n = 0; n < cr->num_ttys; n++) {
+			if (cr->client[n] >= 0) {
+				if (FD_ISSET(cr->client[n], &fds))
+					dispatch_client(cr->client + n);
+			} else { /* stdout is /dev/null in background mode */
+				if (FD_ISSET(cr->master[n], &fds))
+					copy(cr->master[n], n == 0?
+						STDOUT_FILENO : -1);
+			}
+		}
+		if (OPT_GIVEN(START, FOREGROUND)) {
+			if (FD_ISSET(STDIN_FILENO, &fds))
+				copy(STDIN_FILENO, cr->master[0]);
+		}
+	}
+}
+
+/*
+ * Set net namespace of child and call parent_loop().
+ *
+ * The pid of the init process is read from the first pipe. After all peer
+ * interfaces have been moved to the namespace of the child, a single byte
+ * is written to the second pipe.
+ *
+ * NOTE(review): the error paths return false, i.e. zero, which is the value
+ * parent_loop() returns on success. Confirm against the caller whether this
+ * is intended.
+ */
+static int run_parent(pid_t child_pid, const struct container *c,
+		struct container_runtime *cr)
+{
+	unsigned i;
+	bool ok;
+
+	/* close the pipe ends which are only used by the child */
+	close(cr->pipe1[1]);
+	close(cr->pipe2[0]);
+	ok = read(cr->pipe1[0], &cr->init_pid, 4) == 4;
+	if (!ok) {
+		ERROR_LOG("pipe1 read error\n");
+		close(cr->pipe1[0]);
+		close(cr->pipe2[1]);
+		return false;
+	}
+	INFO_LOG("received grand child pid: %u\n", (unsigned)cr->init_pid);
+	close(cr->pipe1[0]);
+	for (i = 0; i < c->num_ifspecs; i++) {
+		char *peer = interface_name(c, i, true);
+
+		ok = set_netns(peer, child_pid);
+		free(peer);
+		if (ok)
+			continue;
+		ERROR_LOG("set_netns error\n");
+		close(cr->pipe2[1]);
+		return false;
+	}
+	ok = write(cr->pipe2[1], "\0", 1) == 1;
+	close(cr->pipe2[1]);
+	if (ok)
+		return parent_loop(child_pid, c, cr);
+	ERROR_LOG("pipe2 write error\n");
+	return false;
+}
+
+/*
+ * Determine the capabilities to drop for the given container.
+ *
+ * Precedence: the per-container list, then the list given by the
+ * default-capdrop option, then a built-in list. The array stored in
+ * *result is not a copy and must not be freed by the caller. Returns the
+ * number of elements of the array.
+ */
+static unsigned get_capdrops(const struct container *c, cap_value_t **result)
+{
+	static cap_value_t fallback[] = {
+		CAP_SYS_MODULE,
+		CAP_SYS_TIME,
+		CAP_SYS_RESOURCE
+	};
+	unsigned count;
+
+	if (c->capdrop) {
+		*result = c->capdrop;
+		count = c->num_capdrops;
+	} else if (OPT_GIVEN(MICOFORIA, DEFAULT_CAPDROP)) {
+		*result = default_capdrop;
+		count = num_default_capdrops;
+	} else {
+		*result = fallback;
+		count = ARRAY_SIZE(fallback);
+	}
+	return count;
+}
+
+/*
+ * Remove the capabilities returned by get_capdrops() from the bounding set
+ * of the calling process. Terminates the process via die_errno() if a
+ * capability can not be dropped.
+ */
+static void drop_caps(const struct container *c)
+{
+	cap_value_t *capdrop;
+	unsigned n, num_capdrops;
+
+	INFO_LOG("lowering bounding set capabilities\n");
+	num_capdrops = get_capdrops(c, &capdrop);
+	for (n = 0; n < num_capdrops; n++) {
+		/* cap_to_name() allocates and returns NULL on failure. */
+		char *name = cap_to_name(capdrop[n]);
+		DEBUG_LOG("dropping %s\n", name? name : "(unknown)");
+		if (name)
+			cap_free(name);
+		if (cap_drop_bound(capdrop[n]) < 0)
+			die_errno("cap_drop_bound");
+	}
+}
+
+/*
+ * Main loop of the child process, the parent of the container init process.
+ *
+ * Waits for signals to arrive on the signal pipe. On SIGCHLD the init
+ * process is reaped and the child exits: with status 2 if init was
+ * terminated by SIGHUP (which is how a reboot request of a pid namespace
+ * init is reported to its parent), with EXIT_SUCCESS otherwise. Any other
+ * signal is relayed to init: SIGINT is passed on unmodified, everything
+ * else results in SIGKILL.
+ */
+__attribute ((noreturn))
+static void child_loop(pid_t pid, struct container_runtime *cr)
+{
+	INFO_LOG("parent: %u, child: %u, init: %u\n", (unsigned) getppid(),
+		(unsigned)getpid(), (unsigned)pid);
+	init_signal_handling();
+	setsid();
+
+	for (;;) {
+		int sig, wstatus;
+		pid_t ret;
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		FD_SET(signal_pipe[0], &fds);
+		if (select(signal_pipe[0] + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		if (!FD_ISSET(signal_pipe[0], &fds))
+			continue;
+		sig = next_signal();
+		if (sig != SIGCHLD) {
+			NOTICE_LOG("sending signal %d to container init\n",
+				sig);
+			kill(pid, sig == SIGINT? SIGINT : SIGKILL);
+			continue;
+		}
+		ret = waitpid(pid, &wstatus, WNOHANG);
+		if (ret < 0) {
+			WARNING_LOG("wait: %m\n");
+			continue;
+		}
+		/*
+		 * With WNOHANG, zero means that init has not changed state.
+		 * wstatus is undefined in this case, so keep waiting rather
+		 * than treating this as the termination of the container.
+		 */
+		if (ret == 0)
+			continue;
+		shutdown_console(cr);
+		if (WIFSIGNALED(wstatus) && WTERMSIG(wstatus) == SIGHUP) {
+			NOTICE_LOG("reboot requested\n");
+			exit(2);
+		}
+		NOTICE_LOG("container terminated\n");
+		exit(EXIT_SUCCESS);
+	}
+}
+
+/*
+ * Path of the init program for this container: the per-container setting
+ * if there is one, the value of the default-init option otherwise. The
+ * returned string must not be freed.
+ */
+static const char *get_init_path(const struct container *c)
+{
+	if (c->init)
+		return c->init;
+	return OPT_STRING_VAL(MICOFORIA, DEFAULT_INIT);
+}
+
+/*
+ * The child process unshares the network and pid namespaces and forks the
+ * init process. The child then reports the pid of init to the parent through
+ * pipe1 and enters child_loop(). The init process waits for the go-ahead on
+ * pipe2, unshares the remaining namespaces, applies the configured
+ * restrictions, runs the pre-exec hook, pivots into the container root and
+ * executes the container init program.
+ *
+ * This function never returns, but both the child and the init process exit
+ * when the container terminates. The exit code of the child tells the parent
+ * whether it should restart the container.
+ */
+__attribute ((noreturn))
+static void run_child(const struct container *c, struct container_runtime *cr)
+{
+	unsigned n;
+	char *init, *put_old;
+	char ch;
+	pid_t pid;
+
+	/* These descriptors are only needed by the parent. */
+	close(cr->socket_fd);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->master[n]);
+	close(cr->pipe1[0]);
+	close(cr->pipe2[1]);
+	if (unshare(CLONE_NEWNET) < 0)
+		die_errno("unshare net ns\n");
+	if (unshare(CLONE_NEWPID) < 0)
+		die_errno("unshare pid ns\n");
+	/* fork again to become pid 1 in the new pid namespace */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	/*
+	 * By writing to pipe1 we tell the parent (a) we've unshared the net
+	 * namespace, and (b) the pid of the init process in the parent
+	 * namespace.
+	 */
+	if (pid > 0) {
+		close(cr->pipe2[0]);
+		if (write(cr->pipe1[1], &pid, sizeof(pid))
+				!= (ssize_t)sizeof(pid))
+			die_errno("pipe write error");
+		close(cr->pipe1[1]);
+		child_loop(pid, cr); /* never returns */
+	}
+	/* Only the init process gets here. */
+	pid = getpid();
+	DEBUG_LOG("now running as pid %d\n", pid);
+	/* Block until the parent has set up our network namespace. */
+	if (read(cr->pipe2[0], &ch, 1) != 1)
+		die_errno("pipe read error");
+	close(cr->pipe1[1]);
+	close(cr->pipe2[0]);
+	if (unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS) < 0)
+		die_errno("unshare");
+	mkdir(cr->dev, 0777); /* best effort, errors are ignored */
+	init_console(cr);
+	for (n = 0; n < cr->num_ttys; n++)
+		close(cr->slave[n]);
+	INFO_LOG("setting hostname to %s\n", c->name);
+	if (sethostname(c->name, strlen(c->name)) < 0)
+		die_errno("sethostname error");
+	if (chdir(cr->root) < 0)
+		die_errno("chdir %s", cr->root);
+	drop_caps(c);
+	apply_dacl(c);
+	apply_cpu_limit(c);
+	apply_memory_limit(c);
+	apply_io_limit(c);
+	/* Inside the container the interfaces are called eth0, eth1, ... */
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char *peer = interface_name(c, n, true);
+		char *renamed = msg("eth%u", n);
+		if (!rename_interface(peer, renamed))
+			die("can not rename %s to %s\n", peer, renamed);
+		free(peer);
+		free(renamed);
+	}
+	run_pre_exec_hook(c);
+	setup_termios(STDIN_FILENO);
+	put_old = msg("%s/mnt", cr->root);
+	/* glibc does not provide a wrapper for pivot_root */
+	if (syscall(SYS_pivot_root, ".", put_old) < 0)
+		die_errno("pivot_root (put_old: %s)", put_old);
+	/* After the pivot the old root is visible at /mnt. */
+	if (umount2("/mnt", MNT_DETACH) < 0)
+		die_errno("umount %s", put_old);
+	free(put_old);
+	close(STDIN_FILENO);
+	init = xstrdup(get_init_path(c));
+	INFO_LOG("handing over control to container init: %s\n", init);
+	execve(init, (char *[]){init, NULL}, NULL);
+	/*
+	 * Report the path we actually tried to execute. c->init is NULL if
+	 * the default init path is in use.
+	 */
+	die_errno("failed to exec init process %s", init);
+}
+
+/*
+ * We need three processes, called parent, child, init, because we want one
+ * process run with namespaces unmodified, requiring one fork. After the child
+ * has unshared its PID namespace, it keeps its old PID, so we need to fork
+ * again to get pid 1. The child can not terminate because the parent can not
+ * wait(2) on its grandchild.
+ *
+ * This function runs in the parent. It creates the control socket and the
+ * pseudo terminals, sets up the network, runs the pre-start hook and forks
+ * the child. As long as run_parent() returns 2, the container is started
+ * again. Returns true if run_parent() returned zero, false on errors.
+ */
+static bool exec_container(const struct container *c)
+{
+	bool success;
+	pid_t pid;
+	unsigned n;
+	struct container_runtime cr = {0};
+	char *socket_path;
+	int ret;
+
+	create_cgroup_v2(c);
+	socket_path = get_socket_path(c->name);
+	success = listen_on_unix_socket(socket_path, &cr.socket_fd);
+	if (!success)
+		ERROR_LOG("can not listen on unix socket %s\n", socket_path);
+	free(socket_path);
+	if (!success)
+		return false; /* not 1, which would signal success */
+	cr.root = get_root_dir(c);
+	cr.dev = msg("%s/dev", cr.root);
+	cr.pts = realpath("/proc/self/fd/0", NULL);
+	DEBUG_LOG("pts: %s\n", cr.pts);
+	cr.num_ttys = get_container_ttys(c, &cr.tty);
+	cr.master = xmalloc(cr.num_ttys * sizeof(int));
+	cr.slave = xmalloc(cr.num_ttys * sizeof(int));
+	cr.client = xmalloc(cr.num_ttys * sizeof(int));
+	/* A negative client fd means that nobody is attached to the tty. */
+	for (n = 0; n < cr.num_ttys; n++)
+		cr.client[n] = -1;
+reboot:
+	NOTICE_LOG("starting %s\n", c->name);
+	for (n = 0; n < cr.num_ttys; n++) {
+		if (openpty(cr.master + n, cr.slave + n, NULL, NULL, NULL) < 0)
+			die("openpty: %m");
+		DEBUG_LOG("pty (tty%u <-> %s)\n", n, ttyname(cr.slave[n]));
+	}
+	/* mount rw, ignore errors */
+	mount(NULL, cr.root, NULL, MS_REMOUNT, NULL);
+	if (!setup_network(c))
+		return false;
+	if (!run_pre_start_hook(c))
+		return false;
+	if (pipe(cr.pipe1) < 0) /* child -> parent */
+		die_errno("pipe1");
+	if (pipe(cr.pipe2) < 0)
+		die_errno("pipe2"); /* parent -> child */
+	if ((pid = fork()) < 0)
+		die_errno("fork");
+	if (pid == 0)
+		run_child(c, &cr); /* never returns */
+	ret = run_parent(pid, c, &cr);
+	if (ret != 2)
+		return ret == 0;
+	NOTICE_LOG("rebooting\n");
+	/* The next incarnation of the container gets fresh ptys. */
+	for (n = 0; n < cr.num_ttys; n++) {
+		close(cr.master[n]);
+		close(cr.slave[n]);
+	}
+	goto reboot;
+}
+
+/*
+ * Build the path of the log file of the named container, which lives in the
+ * directory given by the logdir option. The result is produced by msg(),
+ * and callers in this file release it with free().
+ */
+static char *get_container_logfile(const char *name)
+{
+	const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+	return msg("%s/%s", logdir, name);
+}
+
+/*
+ * Start a single container.
+ *
+ * In foreground mode both stdin and stdout must be terminals. The terminal
+ * settings of stdin are saved and restored after the container has
+ * terminated. In background mode a process is forked off which daemonizes
+ * with its log going to the container log file, while the caller gets true
+ * right after the fork.
+ *
+ * The process which runs the container never returns from this function
+ * but exits with a status that reflects the result of exec_container().
+ */
+static bool start_container(const struct container *c)
+{
+	struct termios saved_tios;
+	pid_t pid;
+	bool ok;
+
+	if (is_locked(c->name, &pid)) {
+		ERROR_LOG("%s is locked by pid %u\n", c->name, (unsigned)pid);
+		return false;
+	}
+	if (!OPT_GIVEN(START, FOREGROUND)) {
+		char *logfile;
+
+		pid = fork();
+		if (pid < 0)
+			die_errno("fork");
+		if (pid > 0) /* the caller goes on with the next container */
+			return true;
+		logfile = get_container_logfile(c->name);
+		daemonize(logfile);
+		free(logfile);
+	} else if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+		ERROR_LOG("both stdin and stdout must be terminals\n");
+		return false;
+	} else if (tcgetattr(STDIN_FILENO, &saved_tios) < 0) {
+		ERROR_LOG("tcgetattr: %m\n");
+		return false;
+	}
+	if (!try_lock(c->name, &pid))
+		die("%s is locked by pid %u", c->name, (unsigned)pid);
+	ok = exec_container(c);
+	if (OPT_GIVEN(START, FOREGROUND)
+			&& tcsetattr(STDIN_FILENO, TCSAFLUSH, &saved_tios) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	exit(ok? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * Sanity-check the non-option arguments of the start subcommand.
+ *
+ * Terminates the process if a named container is not configured, if no
+ * container is configured at all, or if foreground mode would have to deal
+ * with more than one container.
+ */
+static void check_container_args(void)
+{
+	const unsigned num_names = lls_num_inputs(sublpr);
+	unsigned k;
+
+	if (num_names > 0) {
+		if (num_names > 1 && OPT_GIVEN(START, FOREGROUND))
+			die("can start only one container in foreground mode");
+		for (k = 0; k < num_names; k++) {
+			const char *name = lls_input(k, sublpr);
+			if (!get_container(name))
+				die("container not configured: %s", name);
+		}
+		return;
+	}
+	/* No names given, so all configured containers are meant. */
+	if (num_containers == 0)
+		die("no container configured\n");
+	if (num_containers > 1 && OPT_GIVEN(START, FOREGROUND))
+		die("must specify container for foreground mode");
+}
+
+struct container_arg_iter { /* iteration state for cai_next() */
+ unsigned idx; /* index of the element cai_next() looks at next */
+};
+
+#define INITIALIZED_CAI(_cai) {.idx = 0} /* initializer that starts at the first element; the argument is not used */
+
+/*
+ * Return the next container to operate on, or NULL if there is none left.
+ *
+ * If the subcommand was invoked without non-option arguments, all configured
+ * containers are returned one by one. Otherwise the arguments are taken as
+ * container names. Names which do not match a configured container are
+ * reported and skipped.
+ *
+ * If skipped is not NULL, *skipped tells whether at least one name was
+ * skipped during this call. Note that this may be the case even if NULL
+ * is returned.
+ */
+static struct container *cai_next(struct container_arg_iter *cai, bool *skipped)
+{
+	const unsigned num_names = lls_num_inputs(sublpr);
+
+	if (skipped)
+		*skipped = false;
+	if (num_names == 0)
+		return cai->idx < num_containers? container[cai->idx++] : NULL;
+	while (cai->idx < num_names) {
+		const char *name = lls_input(cai->idx++, sublpr);
+		struct container *found = get_container(name);
+
+		if (found)
+			return found;
+		ERROR_LOG("%s: not configured\n", name);
+		if (skipped)
+			*skipped = true;
+	}
+	return NULL;
+}
+
+/*
+ * Call f for each container selected on the command line (or for each
+ * configured container if no names were given).
+ *
+ * All containers are visited even if f fails for some of them. Returns true
+ * only if every call of f succeeded and no container name had to be skipped
+ * because it is not configured.
+ */
+static bool for_each_container_arg(bool (*f)(const struct container *c))
+{
+	struct container *c;
+	bool success = true;
+	bool skipped;
+	struct container_arg_iter cai = INITIALIZED_CAI(cai);
+
+	while ((c = cai_next(&cai, &skipped)))
+		if (!f(c) || skipped)
+			success = false;
+	/*
+	 * If the last name(s) on the command line are invalid, cai_next()
+	 * returns NULL with skipped set, which the loop above does not see.
+	 */
+	if (skipped)
+		success = false;
+	return success;
+}
+
+/*
+ * Handler of the start subcommand.
+ *
+ * Validates the container arguments and the logdir option, initializes the
+ * cgroup machinery, creates the log directory if it does not exist yet and
+ * starts each selected container. Returns false if at least one container
+ * could not be started.
+ */
+static bool com_start(void)
+{
+	const char *logdir = OPT_STRING_VAL(MICOFORIA, LOGDIR);
+
+	check_container_args();
+	if (logdir[0] == '\0')
+		die_empty_arg("logdir"); /* used to be misspelled as "loggir" */
+	cgroup_init();
+	if (mkdir(logdir, 0777) < 0 && errno != EEXIST)
+		die_errno("mkdir %s", logdir);
+	return for_each_container_arg(start_container);
+}
+EXPORT_CMD_HANDLER(start);
+
+/*
+ * Send a signal to the process which holds the lock of the given container.
+ *
+ * Returns false if the container is not locked (i.e., not running) or if
+ * the signal could not be delivered.
+ */
+static bool send_signal_to_container(int signum, const struct container *c)
+{
+	pid_t owner;
+
+	if (!is_locked(c->name, &owner)) {
+		INFO_LOG("%s is not running\n", c->name);
+		return false;
+	}
+	DEBUG_LOG("sending signal %d to pid %u\n", signum, (unsigned)owner);
+	if (kill(owner, signum) < 0) {
+		ERROR_LOG("kill %s: %m\n", c->name);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Replace the environment by a minimal one suitable for a root session.
+ *
+ * Only TERM survives, provided it was set. PATH, USER, LOGNAME and HOME
+ * receive fixed values.
+ */
+static void clean_env(void)
+{
+	static const char * const fixed[][2] = {
+		{"PATH", "/root/bin:/usr/local/sbin:/usr/local/bin"
+			":/sbin:/usr/sbin:/bin:/usr/bin"},
+		{"USER", "root"},
+		{"LOGNAME", "root"},
+		{"HOME", "/root"},
+	};
+	char *saved_term = getenv("TERM");
+	unsigned k;
+
+	clearenv();
+	if (saved_term)
+		setenv("TERM", saved_term, 0);
+	for (k = 0; k < ARRAY_SIZE(fixed); k++)
+		setenv(fixed[k][0], fixed[k][1], 0);
+}
+
+/*
+ * Query the control socket of the named container for the pid of its init
+ * process.
+ *
+ * *result is set to -1 before the request is sent. Returns whether
+ * request_int() succeeded; an error is logged if it did not.
+ */
+static bool request_init_pid(const char *name, int *result)
+{
+	char *path;
+	bool ok;
+
+	*result = -1;
+	path = get_socket_path(name);
+	ok = request_int(path, "init_pid", result);
+	free(path);
+	if (ok)
+		return true;
+	ERROR_LOG("could not determine init pid of %s\n", name);
+	return false;
+}
+
+/*
+ * Ask a container to shut down by running "halt" in its namespaces.
+ *
+ * A process is forked which determines the pid of the container init
+ * process and executes nsenter(1) with a sanitized environment. The caller
+ * does not wait for that process; it returns true as soon as the fork
+ * succeeded.
+ *
+ * A container which is not running is only considered an error if
+ * container names were given on the command line.
+ */
+static bool shutdown_container(const struct container *c)
+{
+	char pidstr[20];
+	char *argv[] = {"nsenter", "-w", "-a", "-r", "-t", pidstr, "halt", NULL};
+	pid_t pid;
+
+	if (!is_locked(c->name, NULL)) {
+		if (lls_num_inputs(sublpr) > 0) {
+			ERROR_LOG("container not running: %s\n", c->name);
+			return false;
+		}
+		return true;
+	}
+	pid = fork();
+	if (pid != 0) /* fork failure or the calling process */
+		return pid > 0;
+	/* forked process: from here on we either exec or exit */
+	if (!request_init_pid(c->name, &pid))
+		_exit(EXIT_FAILURE);
+	sprintf(pidstr, "%d", pid);
+	clean_env();
+	execvp(argv[0], argv);
+	_exit(EXIT_FAILURE);
+}
+
+/* A container counts as dead if nobody holds its lock. */
+static bool container_is_dead(const struct container *c)
+{
+	bool running = is_locked(c->name, NULL);
+
+	return !running;
+}
+
+/*
+ * Poll until all selected containers have terminated.
+ *
+ * The sleep interval starts at 32 milliseconds and doubles after each
+ * unsuccessful check. Gives up and returns false once the interval has
+ * reached 20 seconds, or if nanosleep() fails.
+ */
+static bool wait_for_containers_to_die(void)
+{
+	unsigned ms;
+
+	for (ms = 32; ms < 20000; ms *= 2) {
+		struct timespec ts = {
+			.tv_sec = ms / 1000,
+			.tv_nsec = (ms % 1000) * 1000 * 1000
+		};
+
+		if (nanosleep(&ts, NULL) < 0)
+			return false;
+		if (for_each_container_arg(container_is_dead))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Handler of the stop subcommand: request the shutdown of each selected
+ * container and, if the wait option was given, wait for them to terminate.
+ */
+static bool com_stop(void)
+{
+	if (!for_each_container_arg(shutdown_container))
+		return false;
+	if (OPT_GIVEN(STOP, WAIT))
+		return wait_for_containers_to_die();
+	return true;
+}
+EXPORT_CMD_HANDLER(stop);
+
+/* A reboot is requested by sending SIGINT to the lock holder. */
+static bool reboot_container(const struct container *c)
+{
+	const int reboot_signal = SIGINT;
+
+	return send_signal_to_container(reboot_signal, c);
+}
+
+/* Handler of the reboot subcommand: reboot each selected container. */
+static bool com_reboot(void)
+{
+	bool success = for_each_container_arg(reboot_container);
+
+	return success;
+}
+EXPORT_CMD_HANDLER(reboot);
+
+/* Killing a container is requested by sending SIGUSR1 to the lock holder. */
+static bool kill_container(const struct container *c)
+{
+	const int kill_signal = SIGUSR1;
+
+	return send_signal_to_container(kill_signal, c);
+}
+
+/*
+ * Handler of the kill subcommand: signal each selected container and, if
+ * the wait option was given, wait until all of them have terminated.
+ */
+static bool com_kill(void)
+{
+	if (!for_each_container_arg(kill_container))
+		return false;
+	return OPT_GIVEN(KILL, WAIT)? wait_for_containers_to_die() : true;
+}
+EXPORT_CMD_HANDLER(kill);
+
+/*
+ * Print the effective configuration of a container to stdout, one setting
+ * per line: hooks, root directory, init path, network interfaces, device
+ * access control entries, I/O limits, CPU and memory limits, the
+ * capabilities to drop and the ttys.
+ */
+static void list_container_verbose(const struct container *c)
+{
+	char *root;
+	unsigned n, N;
+	char **word_list;
+	cap_value_t *capdrop;
+	uint32_t *tty;
+	char cores_str[25] = "unlimited";
+	unsigned cores = get_cpu_cores(c);
+
+	printf("%s:\n", c->name);
+	printf("\tpre-start hook: %s\n", get_pre_start_hook(c));
+	printf("\tpre-exec hook: %s\n", get_pre_exec_hook(c));
+	root = get_root_dir(c);
+	printf("\troot dir: %s\n", root);
+	free(root);
+	printf("\tinit path: %s\n", get_init_path(c));
+	for (n = 0; n < c->num_ifspecs; n++) {
+		char pretty_hwaddr[18];
+		char *iface = interface_name(c, n, false);
+		pretty_print_hwaddr(c->ifspec[n].hwaddr, pretty_hwaddr);
+		printf("\tinterface #%u: %s (%s)\n", n, iface, pretty_hwaddr);
+		free(iface);
+	}
+	N = get_dacl(c, &word_list);
+	/* The first character of an entry tells allow from deny. */
+	for (n = 0; n < N; n++)
+		printf("\tdac entry #%u: %s %s\n", n, word_list[n][0] == 'a'?
+			"allow" : "deny", word_list[n] + 1);
+	N = get_iospecs(c, &word_list);
+	for (n = 0; n < N; n++)
+		printf("\tiospec #%u: %s\n", n, word_list[n]);
+	if (cores > 0) /* zero means unlimited */
+		sprintf(cores_str, "%u", cores);
+	printf("\tCPU core limit: %s\n", cores_str);
+	printf("\tmemory limit: %uG\n", get_memory_limit(c));
+	N = get_capdrops(c, &capdrop);
+	for (n = 0; n < N; n++) {
+		/* The name is allocated and must be released by cap_free(). */
+		char *name = cap_to_name(capdrop[n]);
+		printf("\tcapdrop #%u: %s\n", n, name? name : "(unknown)");
+		if (name)
+			cap_free(name);
+	}
+	N = get_container_ttys(c, &tty);
+	for (n = 0; n < N; n++)
+		printf("\ttty #%u: %u\n", n, tty[n]);
+}
+
+/*
+ * Handler of the ls subcommand.
+ *
+ * Containers which are not running are only listed if the all option was
+ * given; otherwise they are omitted and make the command fail. The output
+ * format is selected by the verbose, long and quiet options, in this order
+ * of precedence. Container names which are not configured also make the
+ * command fail.
+ */
+static bool com_ls(void)
+{
+	struct container_arg_iter cai = INITIALIZED_CAI(cai);
+	bool success = true;
+
+	for (;;) {
+		bool skipped;
+		pid_t pid;
+		struct container *c = cai_next(&cai, &skipped);
+
+		/* checked before c, so trailing invalid names count too */
+		if (skipped)
+			success = false;
+		if (!c)
+			break;
+		if (!is_locked(c->name, &pid)) {
+			if (!OPT_GIVEN(LS, ALL)) {
+				success =false;
+				continue;
+			}
+			pid = 0;
+		}
+		if (OPT_GIVEN(LS, VERBOSE)) {
+			list_container_verbose(c);
+		} else if (OPT_GIVEN(LS, LONG)) {
+			unsigned cores, mem;
+
+			if (pid > 0)
+				printf("%u\t", (unsigned)pid);
+			else
+				printf("-\t");
+			cores = get_cpu_cores(c);
+			mem = get_memory_limit(c);
+			printf("%u\t%uG\t%s\n", cores, mem, c->name);
+		} else if (!OPT_GIVEN(LS, QUIET)) {
+			printf("%s\n", c->name);
+		}
+	}
+	return success;
+}
+EXPORT_CMD_HANDLER(ls);
+
+/*
+ * Show the process tree of a running container by means of pstree(1).
+ *
+ * The tree is rooted at the container init process, or, if the all option
+ * was given, at the process which holds the container lock. A container
+ * which is not running is only an error if names were given on the
+ * command line.
+ */
+static bool list_container_processes(const struct container *c)
+{
+	char pidstr[20];
+	char *argv[] = {"pstree", "-anp", pidstr, NULL};
+	int pid;
+
+	if (!is_locked(c->name, &pid)) {
+		if (lls_num_inputs(sublpr) == 0)
+			return true;
+		ERROR_LOG("container \"%s\" is not running\n", c->name);
+		return false;
+	}
+	if (!OPT_GIVEN(PS, ALL)) {
+		if (!request_init_pid(c->name, &pid))
+			return false;
+	}
+	sprintf(pidstr, "%d", pid);
+	return xexec(argv, NULL);
+}
+
+/* Handler of the ps subcommand: list the processes of each container. */
+static bool com_ps(void)
+{
+	bool success = for_each_container_arg(list_container_processes);
+
+	return success;
+}
+EXPORT_CMD_HANDLER(ps);
+
+/*
+ * Handler of the attach subcommand.
+ *
+ * Requests the master side of a container tty through the control socket
+ * of the container and shuffles data between that tty and the terminal of
+ * the caller. CTRL+a followed by q detaches. The session also ends when the
+ * control socket becomes readable or an I/O error occurs. The terminal
+ * settings of stdin are restored before the function returns.
+ *
+ * NOTE(review): false is returned on all paths, including a regular detach.
+ * This is kept as is because callers may depend on it -- confirm whether a
+ * clean detach should rather count as success.
+ */
+static bool com_attach(void)
+{
+	char *errctx;
+	const char *arg;
+	char *socket_path;
+	int master = -1, ret, socket_fd;
+	bool have_escape = false;
+	struct termios tios;
+	uint32_t minor = OPT_UINT32_VAL(ATTACH, TTY);
+	char *rq;
+
+	if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO)) {
+		ERROR_LOG("both stdin and stdout must be terminals\n");
+		return false;
+	}
+	if (tcgetattr(STDIN_FILENO, &tios) < 0)
+		die_errno("tcgetattr");
+	ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	arg = lls_input(0, sublpr);
+	if (!is_locked(arg, NULL)) {
+		ERROR_LOG("container not running: %s\n", arg);
+		return false;
+	}
+	socket_path = get_socket_path(arg);
+	if (OPT_GIVEN(ATTACH, FORCE))
+		rq = msg("force-attach %u", minor);
+	else
+		rq = msg("attach %u", minor);
+	socket_fd = request_fd(socket_path, rq, &master);
+	free(rq);
+	free(socket_path);
+	/*
+	 * Negative descriptors must never be passed to FD_SET(). Bail out
+	 * before the terminal is switched to raw mode.
+	 */
+	if (socket_fd < 0 || master < 0) {
+		ERROR_LOG("could not attach to /dev/tty%u of container %s\n",
+			minor, arg);
+		return false;
+	}
+	INFO_LOG("Attached to /dev/tty%u of container %s\n", minor, arg);
+	NOTICE_LOG("Type CTRL+a q to quit\n");
+	setup_termios(STDIN_FILENO);
+	setup_termios(master);
+	for (;;) {
+		int max_fileno = 0;
+		fd_set fds;
+		FD_ZERO(&fds);
+		FD_SET(STDIN_FILENO, &fds);
+		if (STDIN_FILENO > max_fileno)
+			max_fileno = STDIN_FILENO;
+		FD_SET(master, &fds);
+		if (master > max_fileno)
+			max_fileno = master;
+		FD_SET(socket_fd, &fds);
+		if (socket_fd > max_fileno)
+			max_fileno = socket_fd;
+		if (select(max_fileno + 1, &fds, NULL, NULL, NULL) < 0) {
+			if (errno != EINTR)
+				ERROR_LOG("select: %m\n");
+			continue;
+		}
+		/* Activity on the control socket ends the session. */
+		if (FD_ISSET(socket_fd, &fds))
+			break;
+		if (FD_ISSET(STDIN_FILENO, &fds)) {
+			char c;
+			if (read(STDIN_FILENO, &c, 1) <= 0)
+				break;
+			/* 1 is CTRL+a, the escape character */
+			if (c == 1 && !have_escape)
+				have_escape = true;
+			else if (c == 'q' && have_escape)
+				break;
+			else if (write(master, &c, 1) != 1)
+				break;
+		}
+		if (FD_ISSET(master, &fds)) {
+			if (!copy(master, STDOUT_FILENO))
+				break;
+		}
+	}
+	if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &tios) < 0)
+		ERROR_LOG("tcsetattr: %m\n");
+	printf("\n");
+	return false;
+}
+EXPORT_CMD_HANDLER(attach);
+
+/*
+ * Handler of the help subcommand.
+ *
+ * Without argument, a summary of all subcommands is shown. Otherwise the
+ * argument must be the name of a subcommand, whose short or long help
+ * text (depending on the long option) is printed.
+ */
+static bool com_help(void)
+{
+	char *errctx, *text;
+	const struct lls_command *cmd;
+	int ret = lls_check_arg_count(sublpr, 0, 1, &errctx);
+
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	if (lls_num_inputs(sublpr) > 0) {
+		const char *name = lls_input(0, sublpr);
+
+		ret = lls_lookup_subcmd(name, micoforia_suite, &errctx);
+		if (ret < 0)
+			die_lopsub(ret, &errctx);
+		cmd = lls_cmd(ret, micoforia_suite);
+		text = OPT_GIVEN(HELP, LONG)?
+			lls_long_help(cmd) : lls_short_help(cmd);
+		printf("%s\n", text);
+		free(text);
+		return true;
+	}
+	show_subcommand_summary(OPT_GIVEN(HELP, LONG));
+	return true;
+}
+EXPORT_CMD_HANDLER(help);
+
+/*
+ * Handler of the configtest subcommand. All this handler does is to print
+ * a confirmation and report success.
+ */
+static bool com_configtest(void)
+{
+	puts("Syntax Ok");
+	return true;
+}
+EXPORT_CMD_HANDLER(configtest);
+
+/*
+ * Handler of the edit subcommand: open the configuration file in the
+ * editor named by the EDITOR environment variable, or in vi if the
+ * variable is unset.
+ */
+static bool com_edit(void)
+{
+	char *editor = getenv("EDITOR"); /* owned by the environment */
+	char *conf = get_config_file_path();
+	char *argv[3];
+	bool success;
+
+	argv[0] = editor? editor : "vi";
+	argv[1] = conf;
+	argv[2] = NULL;
+	success = xexec(argv, NULL);
+	free(conf);
+	return success;
+}
+EXPORT_CMD_HANDLER(edit);
+
+/*
+ * Handler of the enter subcommand.
+ *
+ * The first argument names a running container. The remaining arguments,
+ * if any, form the command to execute in the namespaces of the container
+ * by means of nsenter(1). Without a command, "login -f root" is executed.
+ * The command runs with a sanitized environment.
+ */
+static bool com_enter(void)
+{
+	char pidstr[20];
+	char *nsenter_args[] = {"nsenter", "-w", "-a", "-r", "-t"};
+	char *dflt_cmd[] = {"login", "-f", "root"};
+	const unsigned num_fixed = ARRAY_SIZE(nsenter_args);
+	const unsigned num_inputs = lls_num_inputs(sublpr);
+	const unsigned num_extra = num_inputs > 1?
+		num_inputs - 1 : ARRAY_SIZE(dflt_cmd);
+	char **argv, **p;
+	char *errctx;
+	const char *name;
+	unsigned k, total;
+	bool success;
+	int ret, pid;
+
+	ret = lls_check_arg_count(sublpr, 1, INT_MAX, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	name = lls_input(0, sublpr);
+	if (!is_locked(name, &pid)) {
+		ERROR_LOG("container not running: %s\n", name);
+		return false;
+	}
+	if (!request_init_pid(name, &pid))
+		return false;
+	sprintf(pidstr, "%d", pid);
+	/* one slot for the argument to -t, one for the terminating NULL */
+	total = num_fixed + num_extra + 2;
+	argv = xmalloc(total * sizeof(char *));
+	p = argv;
+	for (k = 0; k < num_fixed; k++)
+		*p++ = nsenter_args[k];
+	*p++ = pidstr;
+	if (num_inputs > 1)
+		for (k = 1; k < num_inputs; k++)
+			*p++ = (char *)lls_input(k, sublpr);
+	else
+		for (k = 0; k < ARRAY_SIZE(dflt_cmd); k++)
+			*p++ = dflt_cmd[k];
+	*p = NULL;
+	clean_env();
+	success = xexec(argv, NULL);
+	free(argv);
+	return success;
+}
+EXPORT_CMD_HANDLER(enter);
+
+/*
+ * Handler of the log subcommand: show the log file of the container named
+ * by the single argument. The file is viewed with less(1) if both stdin
+ * and stdout are terminals, and dumped with cat(1) otherwise.
+ */
+static bool com_log(void)
+{
+	const bool interactive = isatty(STDIN_FILENO) && isatty(STDOUT_FILENO);
+	char *argv[3] = {NULL, NULL, NULL};
+	char *errctx, *path;
+	bool success;
+	int ret;
+
+	argv[0] = interactive? "less" : "cat";
+	ret = lls_check_arg_count(sublpr, 1, 1, &errctx);
+	if (ret < 0)
+		die_lopsub(ret, &errctx);
+	path = get_container_logfile(lls_input(0, sublpr));
+	argv[1] = path;
+	success = xexec(argv, NULL);
+	free(path);
+	return success;
+}
+EXPORT_CMD_HANDLER(log);
+
+/*
+ * Parse the global options, look up the subcommand, parse the options of
+ * the subcommand and run its handler. The exit status of the program
+ * reflects the return value of the handler.
+ */
+int main(int argc, char *argv[])
+{
+	const struct micoforia_user_data *ud;
+	char *errctx;
+	char **sub_argv;
+	unsigned sub_argc;
+	int cmd_num;
+
+	valid_fd012();
+	parse_options(argc, argv, CMD_PTR(MICOFORIA), &lpr);
+	loglevel_arg_val = OPT_UINT32_VAL(MICOFORIA, LOGLEVEL);
+	check_options();
+	/* The non-option arguments are the subcommand and its arguments. */
+	sub_argc = lls_num_inputs(lpr);
+	sub_argv = argv + argc - sub_argc;
+	cmd_num = lls_lookup_subcmd(sub_argv[0], micoforia_suite, &errctx);
+	if (cmd_num < 0)
+		die_lopsub(cmd_num, &errctx);
+	subcmd = lls_cmd(cmd_num, micoforia_suite);
+	parse_options(sub_argc, sub_argv, subcmd, &sublpr);
+	ud = lls_user_data(subcmd);
+	exit(ud->handler()? EXIT_SUCCESS : EXIT_FAILURE);
+}