diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6022fc3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,12 @@ +Isolate is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +If you have less than 10 copies of the GPL on your system :-), +you can find it at http://www.gnu.org/licenses/. diff --git a/Makefile b/Makefile index cd19997..6508ea7 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,38 @@ # Makefile for Isolate -# (c) 2015 Martin Mares +# (c) 2015--2016 Martin Mares all: isolate isolate.1 isolate.1.html CC=gcc -CFLAGS=-std=gnu99 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers +CFLAGS=-std=gnu99 -Wall -Wextra -Wno-parentheses -Wno-unused-result -Wno-missing-field-initializers -Wstrict-prototypes -Wmissing-prototypes -D_GNU_SOURCE -VERSION=1.1 -YEAR=2015 +VERSION=1.3 +YEAR=2016 BUILD_DATE:=$(shell date '+%Y-%m-%d') BUILD_COMMIT:=$(shell if git rev-parse >/dev/null 2>/dev/null ; then git describe --always ; else echo '' ; fi) -CFLAGS += -DVERSION='"$(VERSION)"' -DYEAR='"$(YEAR)"' -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"' PREFIX = $(DESTDIR)/usr/local +VARPREFIX = $(DESTDIR)/var/local +CONFIGDIR = $(PREFIX)/etc +CONFIG = $(CONFIGDIR)/isolate BINDIR = $(PREFIX)/bin DATAROOTDIR = $(PREFIX)/share DATADIR = $(DATAROOTDIR) MANDIR = $(DATADIR)/man MAN1DIR = $(MANDIR)/man1 +BOXDIR = $(VARPREFIX)/lib/isolate -isolate: isolate.c config.h - $(CC) $(CFLAGS) -o $@ $^ +isolate: isolate.o util.o rules.o cg.o config.o + $(CC) $(LDFLAGS) -o $@ $^ + +%.o: %.c isolate.h config.h + $(CC) $(CFLAGS) -c -o $@ 
$< + +isolate.o: CFLAGS += -DVERSION='"$(VERSION)"' -DYEAR='"$(YEAR)"' -DBUILD_DATE='"$(BUILD_DATE)"' -DBUILD_COMMIT='"$(BUILD_COMMIT)"' +config.o: CFLAGS += -DCONFIG_FILE='"$(CONFIG)"' isolate.1: isolate.1.txt - a2x -f manpage -D . $< + a2x -f manpage $< # The dependency on isolate.1 is there to serialize both calls of asciidoc, # which does not name temporary files safely. @@ -31,14 +40,17 @@ isolate.1.html: isolate.1.txt isolate.1 a2x -f xhtml -D . $< clean: + rm -f *.o rm -f isolate isolate.1 isolate.1.html rm -f docbook-xsl.css install: isolate install -D $< $(BINDIR)/$< chmod u+s $(BINDIR)/$< + install -d $(BOXDIR) + install -m 644 -D default.cf $(CONFIG) install-doc: isolate.1 - install -D $< $(MAN1DIR)/$< + install -m 644 -D $< $(MAN1DIR)/$< .PHONY: all clean install install-doc diff --git a/TODO b/TODO index 685afab..86a6de8 100644 --- a/TODO +++ b/TODO @@ -1,8 +1,6 @@ -Installation Test: ptrace self Test: SIGSTOP Test: ping-pong timing attacks Test: big static memory Examine the use of taskstats for measuring memory Doc: mount -t cgroup none -o cpuset,cpuacct,memory /sys/fs/cgroup -Switch license to GPL2/GPL3 diff --git a/cg.c b/cg.c new file mode 100644 index 0000000..db16b6d --- /dev/null +++ b/cg.c @@ -0,0 +1,309 @@ +/* + * Process Isolator -- Control Groups + * + * (c) 2012-2016 Martin Mares + * (c) 2012-2014 Bernard Blackham + */ + +#include "isolate.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +struct cg_controller_desc { + const char *name; + int optional; +}; + +typedef enum { + CG_MEMORY = 0, + CG_CPUACCT, + CG_CPUSET, + CG_NUM_CONTROLLERS, + CG_PARENT = 256, +} cg_controller; + +static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { + [CG_MEMORY] = { "memory", 0 }, + [CG_CPUACCT] = { "cpuacct", 0 }, + [CG_CPUSET] = { "cpuset", 1 }, + [CG_NUM_CONTROLLERS] = { NULL, 0 }, +}; + +#define FOREACH_CG_CONTROLLER(_controller) \ + for (cg_controller (_controller) = 0; \ + (_controller) < 
CG_NUM_CONTROLLERS; (_controller)++) + +static const char *cg_controller_name(cg_controller c) +{ + assert(c < CG_NUM_CONTROLLERS); + return cg_controllers[c].name; +} + +static int cg_controller_optional(cg_controller c) +{ + assert(c < CG_NUM_CONTROLLERS); + return cg_controllers[c].optional; +} + +static char cg_name[256]; +static char cg_parent_name[256]; + +#define CG_BUFSIZE 1024 + +static void +cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) +{ + snprintf(buf, len, "%s/%s/%s/%s", + cf_cg_root, + cg_controller_name(c & ~CG_PARENT), + (c & CG_PARENT) ? cg_parent_name : cg_name, + attr); +} + +static int +cg_read(cg_controller controller, const char *attr, char *buf) +{ + int result = 0; + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_RDONLY); + if (fd < 0) + { + if (maybe) + goto fail; + die("Cannot read %s: %m", path); + } + + int n = read(fd, buf, CG_BUFSIZE); + if (n < 0) + { + if (maybe) + goto fail_close; + die("Cannot read %s: %m", path); + } + if (n >= CG_BUFSIZE - 1) + die("Attribute %s too long", path); + if (n > 0 && buf[n-1] == '\n') + n--; + buf[n] = 0; + + if (verbose > 1) + msg("CG: Read %s = %s\n", attr, buf); + + result = 1; +fail_close: + close(fd); +fail: + return result; +} + +static void __attribute__((format(printf,3,4))) +cg_write(cg_controller controller, const char *attr, const char *fmt, ...) 
+{ + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + va_list args; + va_start(args, fmt); + + char buf[CG_BUFSIZE]; + int n = vsnprintf(buf, sizeof(buf), fmt, args); + if (n >= CG_BUFSIZE) + die("cg_write: Value for attribute %s is too long", attr); + + if (verbose > 1) + msg("CG: Write %s = %s", attr, buf); + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + { + if (maybe) + goto fail; + else + die("Cannot write %s: %m", path); + } + + int written = write(fd, buf, n); + if (written < 0) + { + if (maybe) + goto fail_close; + else + die("Cannot set %s to %s: %m", path, buf); + } + if (written != n) + die("Short write to %s (%d out of %d bytes)", path, written, n); + +fail_close: + close(fd); +fail: + va_end(args); +} + +void +cg_init(void) +{ + if (!cg_enable) + return; + + if (!dir_exists(cf_cg_root)) + die("Control group filesystem at %s not mounted", cf_cg_root); + + if (cf_cg_parent) + { + snprintf(cg_name, sizeof(cg_name), "%s/box-%d", cf_cg_parent, box_id); + snprintf(cg_parent_name, sizeof(cg_parent_name), "%s", cf_cg_parent); + } + else + { + snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); + strcpy(cg_parent_name, "."); + } + msg("Using control group %s under parent %s\n", cg_name, cg_parent_name); +} + +void +cg_prepare(void) +{ + if (!cg_enable) + return; + + struct stat st; + char buf[CG_BUFSIZE]; + char path[256]; + + FOREACH_CG_CONTROLLER(controller) + { + cg_makepath(path, sizeof(path), controller, ""); + if (stat(path, &st) >= 0 || errno != ENOENT) + { + msg("Control group %s already exists, trying to empty it.\n", path); + if (rmdir(path) < 0) + die("Failed to reset control group %s: %m", path); + } + + if (mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) + die("Failed to create control group %s: %m", path); + } + + // If the cpuset module is enabled, set up allowed cpus and memory nodes. 
+ // If per-box configuration exists, use it; otherwise, inherit the settings + // from the parent cgroup. + struct cf_per_box *cf = cf_current_box(); + if (cg_read(CG_PARENT | CG_CPUSET, "?cpuset.cpus", buf)) + cg_write(CG_CPUSET, "cpuset.cpus", "%s", cf->cpus ? cf->cpus : buf); + if (cg_read(CG_PARENT | CG_CPUSET, "?cpuset.mems", buf)) + cg_write(CG_CPUSET, "cpuset.mems", "%s", cf->mems ? cf->mems : buf); +} + +void +cg_enter(void) +{ + if (!cg_enable) + return; + + msg("Entering control group %s\n", cg_name); + + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) + cg_write(controller, "?tasks", "%d\n", (int) getpid()); + else + cg_write(controller, "tasks", "%d\n", (int) getpid()); + } + + if (cg_memory_limit) + { + cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + } + + if (cg_timing) + cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); +} + +int +cg_get_run_time_ms(void) +{ + if (!cg_enable) + return 0; + + char buf[CG_BUFSIZE]; + cg_read(CG_CPUACCT, "cpuacct.usage", buf); + unsigned long long ns = atoll(buf); + return ns / 1000000; +} + +void +cg_stats(void) +{ + if (!cg_enable) + return; + + char buf[CG_BUFSIZE]; + + // Memory usage statistics + unsigned long long mem=0, memsw=0; + if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) + mem = atoll(buf); + if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) + { + memsw = atoll(buf); + if (memsw > mem) + mem = memsw; + } + if (mem) + meta_printf("cg-mem:%lld\n", mem >> 10); +} + +void +cg_remove(void) +{ + char buf[CG_BUFSIZE]; + + if (!cg_enable) + return; + + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) + { + if (!cg_read(controller, "?tasks", buf)) + continue; + } + else + cg_read(controller, "tasks", buf); + + if (buf[0]) + die("Some tasks left in controller %s of cgroup %s, failed to 
/*
 * Parse a configuration value as a decimal integer.
 *
 * Reports a config-file error (which does not return) when the value is
 * empty, is followed by anything that is not part of the number, makes
 * strtol() set errno, or does not fit in an int.
 */
static int
cf_int(char *val)
{
  char *rest = NULL;

  errno = 0;
  long int parsed = strtol(val, &rest, 10);

  /* The whole string must be consumed and at least one character converted. */
  int consumed_all = (rest != val) && !(rest && *rest);
  if (errno || !consumed_all)
    cf_err("Invalid number");

  /* Narrow to int and verify that no information was lost on the way. */
  int narrowed = (int) parsed;
  if ((long int) narrowed != parsed)
    cf_err("Number out of range");

  return narrowed;
}
cf_err("Unknown per-box configuration item"); +} + +static void +cf_entry(char *key, char *val) +{ + char *dot = strchr(key, '.'); + if (!dot) + cf_entry_toplevel(key, val); + else + { + *dot++ = 0; + cf_entry_compound(key, dot, val); + } +} + +static void +cf_check(void) +{ + if (!cf_box_root || + !cf_cg_root || + !cf_first_uid || + !cf_first_gid || + !cf_num_boxes) + cf_err("Configuration is not complete"); +} + +void +cf_parse(void) +{ + FILE *f = fopen(CONFIG_FILE, "r"); + if (!f) + die("Cannot open %s: %m", CONFIG_FILE); + + char line[MAX_LINE_LEN]; + while (fgets(line, sizeof(line), f)) + { + line_number++; + char *nl = strchr(line, '\n'); + if (!nl) + cf_err("Line not terminated or too long"); + *nl = 0; + + if (!line[0] || line[0] == '#') + continue; + + char *s = line; + while (*s && *s != ' ' && *s != '\t' && *s != '=') + s++; + while (*s == ' ' || *s == '\t') + *s++ = 0; + if (*s != '=') + cf_err("Syntax error, expecting key=value"); + *s++ = 0; + while (*s == ' ' || *s == '\t') + *s++ = 0; + + cf_entry(line, s); + } + + fclose(f); + cf_check(); +} + +struct cf_per_box * +cf_per_box(int box_id) +{ + struct cf_per_box *c; + + for (c = per_box_configs; c; c = c->next) + if (c->box_id == box_id) + return c; + + c = xmalloc(sizeof(*c)); + c->next = per_box_configs; + per_box_configs = c; + c->box_id = box_id; + return c; +} + +struct cf_per_box * +cf_current_box(void) +{ + return cf_per_box(box_id); +} diff --git a/config.h b/config.h deleted file mode 100644 index 8a5f6d9..0000000 --- a/config.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __ISOLATE_CONFIG_H__ -#define __ISOLATE_CONFIG_H__ - -/* A directory under which all sandboxes are created. */ -#define CONFIG_ISOLATE_BOX_DIR "/tmp/box" - -/* Range of UIDs and GIDs reserved for use by the sandboxes. */ -#define CONFIG_ISOLATE_FIRST_UID 60000 -#define CONFIG_ISOLATE_FIRST_GID 60000 -#define CONFIG_ISOLATE_NUM_BOXES 100 - -/* Root of the cgroup hierarchy. 
*/ -#define CONFIG_ISOLATE_CGROUP_ROOT "/sys/fs/cgroup" - -#endif /* __ISOLATE_CONFIG_H__ */ diff --git a/default.cf b/default.cf new file mode 100644 index 0000000..eebd421 --- /dev/null +++ b/default.cf @@ -0,0 +1,24 @@ +# This is a configuration file for Isolate + +# All sandboxes are created under this directory. +# To avoid symlink attacks, this directory and all its ancestors +# must be writeable only to root. +box_root = /var/local/lib/isolate + +# Root of the control group hierarchy +cg_root = /sys/fs/cgroup + +# If the following variable is defined, the per-box cgroups +# are created as sub-groups of the named cgroup +#cg_parent = boxes + +# Block of UIDs and GIDs reserved for sandboxes +first_uid = 60000 +first_gid = 60000 +num_boxes = 1000 + +# Per-box settings of the set of allowed CPUs and NUMA nodes +# (see linux/Documentation/cgroups/cpusets.txt for precise syntax) + +#box0.cpus = 4-7 +#box0.mems = 1 diff --git a/isolate.1.txt b/isolate.1.txt index 6ed7b42..042322e 100644 --- a/isolate.1.txt +++ b/isolate.1.txt @@ -88,15 +88,19 @@ OPTIONS *-i, --stdin=*'file':: Redirect standard input from 'file'. The 'file' has to be accessible - inside the sandbox. + inside the sandbox. Otherwise, standard input is inherited from the + parent process. *-o, --stdout=*'file':: Redirect standard output to 'file'. The 'file' has to be accessible - inside the sandbox. + inside the sandbox. Otherwise, standard output is inherited from the + parent process and the sandbox manager does not write anything to it. *-r, --stderr=*'file':: Redirect standard error output to 'file'. The 'file' has to be accessible - inside the sandbox. + inside the sandbox. Otherwise, standard error output is inherited from + the parent process and both the sandboxed process and the sandbox manager + can write their status messages to it. *-c, --chdir=*'dir':: Change directory to 'dir' before executing the program. 
This path must be @@ -119,6 +123,11 @@ OPTIONS Tell the sandbox manager to be verbose and report on what is going on. Using *-v* multiple times produces even more jabber. +*-s, --silent*:: + Tell the sandbox manager to keep silence. No status messages are printed + to stderr except for fatal errors of the sandbox itself. The combination of + *--verbose* and *--silent* has an undefined effect. + ENVIRONMENT RULES ----------------- UNIX processes normally inherit all environment variables from their parent. The @@ -183,7 +192,7 @@ CONTROL GROUPS -------------- Isolate can make use of system control groups provided by the kernel to constrain programs consisting of multiple processes. Please note -that this feature needs special system setup described in the REQUIREMENTS +that this feature needs special system setup described in the INSTALLATION section. *--cg*:: @@ -257,14 +266,13 @@ you should also enable the swap controller (+CONFIG_MEMCG_SWAP+). Debian 7.x and newer require enabling the memory and swap cgroup controllers by adding the parameters "cgroup_enable=memory swapaccount=1" to the kernel -command-line, which can be set using GRUB_CMDLINE_LINUX_DEFAULT in +command-line, which can be set using +GRUB_CMDLINE_LINUX_DEFAULT+ in /etc/default/grub. Isolate is designed to run setuid to root. The sub-process inside the sandbox then switches to a non-privileged user ID (different for each *--box-id*). -The range of UIDs available and several filesystem paths are embedded in the -isolate's binary during compilation; please see +config.h+ in the source -tree for description. +The range of UIDs available and several filesystem paths are set in a configuration +file, by default located in /usr/local/etc/isolate. Before you run isolate with control groups, you need to ensure that the cgroup filesystem is enabled and mounted. Most modern Linux distributions already @@ -289,4 +297,4 @@ LICENSE ------- Isolate was written by Martin Mares and Bernard Blackham. 
It can be distributed and used under the terms of the GNU -General Public License version 2. +General Public License version 2 or any later version. diff --git a/isolate.c b/isolate.c index d49fe3f..a75b129 100644 --- a/isolate.c +++ b/isolate.c @@ -1,38 +1,29 @@ /* * A Process Isolator based on Linux Containers * - * (c) 2012-2015 Martin Mares + * (c) 2012-2016 Martin Mares * (c) 2012-2014 Bernard Blackham */ -#define _GNU_SOURCE - -#include "config.h" +#include "isolate.h" #include -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include #include +#include +#include #include -#include +#include #include -#include +#include +#include +#include /* May not be defined in older glibc headers */ #ifndef MS_PRIVATE @@ -44,35 +35,34 @@ #define MS_REC (1 << 14) #endif -#define NONRET __attribute__((noreturn)) -#define UNUSED __attribute__((unused)) -#define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) +#define TIMER_INTERVAL_US 100000 static int timeout; /* milliseconds */ static int wall_timeout; static int extra_timeout; -static int pass_environ; -static int verbose; +int pass_environ; +int verbose; +static int silent; static int fsize_limit; static int memory_limit; static int stack_limit; -static int block_quota; -static int inode_quota; +int block_quota; +int inode_quota; static int max_processes = 1; static char *redir_stdin, *redir_stdout, *redir_stderr; static char *set_cwd; static int share_net; -static int cg_enable; -static int cg_memory_limit; -static int cg_timing; +int cg_enable; +int cg_memory_limit; +int cg_timing; -static int box_id; +int box_id; static char box_dir[1024]; static pid_t box_pid; -static uid_t box_uid; -static gid_t box_gid; +uid_t box_uid; +gid_t box_gid; static uid_t orig_uid; static gid_t orig_gid; @@ -82,58 +72,16 @@ static int cleanup_ownership; static struct timeval start_time; 
static int ticks_per_sec; static int total_ms, wall_ms; -static volatile sig_atomic_t timer_tick; +static volatile sig_atomic_t timer_tick, interrupt; static int error_pipes[2]; static int write_errors_to_fd; static int read_errors_from_fd; -static void die(char *msg, ...) NONRET; -static void cg_stats(void); static int get_wall_time_ms(void); static int get_run_time_ms(struct rusage *rus); -static void chowntree(char *path, uid_t uid, gid_t gid); - -/*** Meta-files ***/ - -static FILE *metafile; - -static void -meta_open(const char *name) -{ - if (!strcmp(name, "-")) - { - metafile = stdout; - return; - } - if (setfsuid(getuid()) < 0) - die("Failed to switch FS UID: %m"); - metafile = fopen(name, "w"); - if (setfsuid(geteuid()) < 0) - die("Failed to switch FS UID back: %m"); - if (!metafile) - die("Failed to open metafile '%s'",name); -} - -static void -meta_close(void) -{ - if (metafile && metafile != stdout) - fclose(metafile); -} - -static void __attribute__((format(printf,1,2))) -meta_printf(const char *fmt, ...) -{ - if (!metafile) - return; - - va_list args; - va_start(args, fmt); - vfprintf(metafile, fmt, args); - va_end(args); -} +/*** Messages and exits ***/ static void final_stats(struct rusage *rus) @@ -150,8 +98,6 @@ final_stats(struct rusage *rus) cg_stats(); } -/*** Messages and exits ***/ - static void NONRET box_exit(int rc) { @@ -188,7 +134,7 @@ flush_line(void) } /* Report an error of the sandbox itself */ -static void NONRET __attribute__((format(printf,1,2))) +void NONRET __attribute__((format(printf,1,2))) die(char *msg, ...) { va_list args; @@ -196,6 +142,13 @@ die(char *msg, ...) char buf[1024]; int n = vsnprintf(buf, sizeof(buf), msg, args); + // If the child process is still running, show no mercy. + if (box_pid > 0) + { + kill(-box_pid, SIGKILL); + kill(box_pid, SIGKILL); + } + if (write_errors_to_fd) { // We are inside the box, have to use error pipe for error reporting. @@ -213,7 +166,7 @@ die(char *msg, ...) 
} /* Report an error of the program inside the sandbox */ -static void NONRET __attribute__((format(printf,1,2))) +void NONRET __attribute__((format(printf,1,2))) err(char *msg, ...) { va_list args; @@ -227,13 +180,16 @@ err(char *msg, ...) char buf[1024]; vsnprintf(buf, sizeof(buf), msg, args); meta_printf("message:%s\n", buf); - fputs(buf, stderr); - fputc('\n', stderr); + if (!silent) + { + fputs(buf, stderr); + fputc('\n', stderr); + } box_exit(1); } /* Write a message, but only if in verbose mode */ -static void __attribute__((format(printf,1,2))) +void __attribute__((format(printf,1,2))) msg(char *msg, ...) { va_list args; @@ -249,761 +205,95 @@ msg(char *msg, ...) va_end(args); } -/*** Utility functions ***/ - -static void * -xmalloc(size_t size) -{ - void *p = malloc(size); - if (!p) - die("Out of memory"); - return p; -} - -static char * -xstrdup(char *str) -{ - char *p = strdup(str); - if (!p) - die("Out of memory"); - return p; -} - -static int dir_exists(char *path) -{ - struct stat st; - return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode)); -} - -static int rmtree_helper(const char *fpath, const struct stat *sb, - int typeflag UNUSED, struct FTW *ftwbuf UNUSED) -{ - if (S_ISDIR(sb->st_mode)) - { - if (rmdir(fpath) < 0) - die("Cannot rmdir %s: %m", fpath); - } - else - { - if (unlink(fpath) < 0) - die("Cannot unlink %s: %m", fpath); - } - return FTW_CONTINUE; -} - -static void -rmtree(char *path) -{ - nftw(path, rmtree_helper, 32, FTW_MOUNT | FTW_PHYS | FTW_DEPTH); -} - -static uid_t chown_uid; -static gid_t chown_gid; - -static int chowntree_helper(const char *fpath, const struct stat *sb UNUSED, - int typeflag UNUSED, struct FTW *ftwbuf UNUSED) -{ - if (lchown(fpath, chown_uid, chown_gid) < 0) - die("Cannot chown %s: %m", fpath); - else - return FTW_CONTINUE; -} - -static void -chowntree(char *path, uid_t uid, gid_t gid) -{ - chown_uid = uid; - chown_gid = gid; - nftw(path, chowntree_helper, 32, FTW_MOUNT | FTW_PHYS); -} - -/*** Environment rules ***/ 
- -struct env_rule { - char *var; // Variable to match - char *val; // ""=clear, NULL=inherit - int var_len; - struct env_rule *next; -}; - -static struct env_rule *first_env_rule; -static struct env_rule **last_env_rule = &first_env_rule; - -static struct env_rule default_env_rules[] = { - { "LIBC_FATAL_STDERR_", "1" } -}; - -static int -set_env_action(char *a0) -{ - struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1); - char *a = (char *)(r+1); - strcpy(a, a0); - - char *sep = strchr(a, '='); - if (sep == a) - return 0; - r->var = a; - if (sep) - { - *sep++ = 0; - r->val = sep; - } - else - r->val = NULL; - *last_env_rule = r; - last_env_rule = &r->next; - r->next = NULL; - return 1; -} - -static int -match_env_var(char *env_entry, struct env_rule *r) -{ - if (strncmp(env_entry, r->var, r->var_len)) - return 0; - return (env_entry[r->var_len] == '='); -} - -static void -apply_env_rule(char **env, int *env_sizep, struct env_rule *r) -{ - // First remove the variable if already set - int pos = 0; - while (pos < *env_sizep && !match_env_var(env[pos], r)) - pos++; - if (pos < *env_sizep) - { - (*env_sizep)--; - env[pos] = env[*env_sizep]; - env[*env_sizep] = NULL; - } - - // What is the new value? 
- char *new; - if (r->val) - { - if (!r->val[0]) - return; - new = xmalloc(r->var_len + 1 + strlen(r->val) + 1); - sprintf(new, "%s=%s", r->var, r->val); - } - else - { - pos = 0; - while (environ[pos] && !match_env_var(environ[pos], r)) - pos++; - if (!(new = environ[pos])) - return; - } - - // Add it at the end of the array - env[(*env_sizep)++] = new; - env[*env_sizep] = NULL; -} - -static char ** -setup_environment(void) -{ - // Link built-in rules with user rules - for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--) - { - default_env_rules[i].next = first_env_rule; - first_env_rule = &default_env_rules[i]; - } - - // Scan the original environment - char **orig_env = environ; - int orig_size = 0; - while (orig_env[orig_size]) - orig_size++; - - // For each rule, reserve one more slot and calculate length - int num_rules = 0; - for (struct env_rule *r = first_env_rule; r; r=r->next) - { - num_rules++; - r->var_len = strlen(r->var); - } - - // Create a new environment - char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *)); - int size; - if (pass_environ) - { - memcpy(env, environ, orig_size * sizeof(char *)); - size = orig_size; - } - else - size = 0; - env[size] = NULL; - - // Apply the rules one by one - for (struct env_rule *r = first_env_rule; r; r=r->next) - apply_env_rule(env, &size, r); - - // Return the new env and pass some gossip - if (verbose > 1) - { - fprintf(stderr, "Passing environment:\n"); - for (int i=0; env[i]; i++) - fprintf(stderr, "\t%s\n", env[i]); - } - return env; -} - -/*** Directory rules ***/ - -struct dir_rule { - char *inside; // A relative path - char *outside; // This can be an absolute path or a relative path starting with "./" - unsigned int flags; // DIR_FLAG_xxx - struct dir_rule *next; -}; - -enum dir_rule_flags { - DIR_FLAG_RW = 1, - DIR_FLAG_NOEXEC = 2, - DIR_FLAG_FS = 4, - DIR_FLAG_MAYBE = 8, - DIR_FLAG_DEV = 16, -}; - -static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev" }; 
- -static struct dir_rule *first_dir_rule; -static struct dir_rule **last_dir_rule = &first_dir_rule; - -static int add_dir_rule(char *in, char *out, unsigned int flags) -{ - // Make sure that "in" is relative - while (in[0] == '/') - in++; - if (!*in) - return 0; - - // Check "out" - if (flags & DIR_FLAG_FS) - { - if (!out || out[0] == '/') - return 0; - } - else - { - if (out && out[0] != '/' && strncmp(out, "./", 2)) - return 0; - } - - // Override an existing rule - struct dir_rule *r; - for (r = first_dir_rule; r; r = r->next) - if (!strcmp(r->inside, in)) - break; - - // Add a new rule - if (!r) - { - r = xmalloc(sizeof(*r)); - r->inside = in; - *last_dir_rule = r; - last_dir_rule = &r->next; - r->next = NULL; - } - r->outside = out; - r->flags = flags; - return 1; -} - -static unsigned int parse_dir_option(char *opt) -{ - for (unsigned int i = 0; i < ARRAY_SIZE(dir_flag_names); i++) - if (!strcmp(opt, dir_flag_names[i])) - return 1U << i; - die("Unknown directory option %s", opt); -} - -static int set_dir_action(char *arg) -{ - arg = xstrdup(arg); - - char *colon = strchr(arg, ':'); - unsigned int flags = 0; - while (colon) - { - *colon++ = 0; - char *next = strchr(colon, ':'); - if (next) - *next = 0; - flags |= parse_dir_option(colon); - colon = next; - } - - char *eq = strchr(arg, '='); - if (eq) - { - *eq++ = 0; - return add_dir_rule(arg, (*eq ? eq : NULL), flags); - } - else - { - char *out = xmalloc(1 + strlen(arg) + 1); - sprintf(out, "/%s", arg); - return add_dir_rule(arg, out, flags); - } -} - -static void init_dir_rules(void) -{ - set_dir_action("box=./box:rw"); - set_dir_action("bin"); - set_dir_action("dev:dev"); - set_dir_action("lib"); - set_dir_action("lib64:maybe"); - set_dir_action("proc=proc:fs"); - set_dir_action("usr"); -} - -static void make_dir(char *path) -{ - char *sep = (path[0] == '/' ? 
path+1 : path); - - for (;;) - { - sep = strchr(sep, '/'); - if (sep) - *sep = 0; - - if (!dir_exists(path) && mkdir(path, 0777) < 0) - die("Cannot create directory %s: %m\n", path); +/*** Signal handling in keeper process ***/ - if (!sep) - return; - *sep++ = '/'; - } -} - -static void apply_dir_rules(void) -{ - for (struct dir_rule *r = first_dir_rule; r; r=r->next) - { - char *in = r->inside; - char *out = r->outside; - if (!out) - { - msg("Not binding anything on %s\n", r->inside); - continue; - } - - if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out)) - { - msg("Not binding %s on %s (does not exist)\n", out, r->inside); - continue; - } - - char root_in[1024]; - snprintf(root_in, sizeof(root_in), "root/%s", in); - make_dir(root_in); - - unsigned long mount_flags = 0; - if (!(r->flags & DIR_FLAG_RW)) - mount_flags |= MS_RDONLY; - if (r->flags & DIR_FLAG_NOEXEC) - mount_flags |= MS_NOEXEC; - if (!(r->flags & DIR_FLAG_DEV)) - mount_flags |= MS_NODEV; - - if (r->flags & DIR_FLAG_FS) - { - msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags); - if (mount("none", root_in, out, mount_flags, "") < 0) - die("Cannot mount %s on %s: %m", out, in); - } - else - { - mount_flags |= MS_BIND | MS_NOSUID; - msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags); - // Most mount flags need remount to work - if (mount(out, root_in, "none", mount_flags, "") < 0 || - mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0) - die("Cannot mount %s on %s: %m", out, in); - } - } -} - -/*** Control groups ***/ +/* + * Signal handling is tricky. We must set up signal handlers before + * we start the child process (and reset them in the child process). + * Otherwise, there is a short time window where a SIGINT can kill + * us and leave the child process running. 
+ */ -struct cg_controller_desc { - const char *name; - int optional; +struct signal_rule { + int signum; + enum { SIGNAL_IGNORE, SIGNAL_INTERRUPT, SIGNAL_FATAL } action; }; -typedef enum { - CG_MEMORY = 0, - CG_CPUACCT, - CG_CPUSET, - CG_NUM_CONTROLLERS, -} cg_controller; - -static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { - [CG_MEMORY] = { "memory", 0 }, - [CG_CPUACCT] = { "cpuacct", 0 }, - [CG_CPUSET] = { "cpuset", 1 }, - [CG_NUM_CONTROLLERS] = { NULL, 0 }, +static const struct signal_rule signal_rules[] = { + { SIGHUP, SIGNAL_INTERRUPT }, + { SIGINT, SIGNAL_INTERRUPT }, + { SIGQUIT, SIGNAL_INTERRUPT }, + { SIGILL, SIGNAL_FATAL }, + { SIGABRT, SIGNAL_FATAL }, + { SIGFPE, SIGNAL_FATAL }, + { SIGSEGV, SIGNAL_FATAL }, + { SIGPIPE, SIGNAL_IGNORE }, + { SIGTERM, SIGNAL_INTERRUPT }, + { SIGUSR1, SIGNAL_IGNORE }, + { SIGUSR2, SIGNAL_IGNORE }, + { SIGBUS, SIGNAL_FATAL }, }; -#define FOREACH_CG_CONTROLLER(_controller) \ - for (cg_controller (_controller) = 0; \ - (_controller) < CG_NUM_CONTROLLERS; (_controller)++) - -static const char *cg_controller_name(cg_controller c) -{ - return cg_controllers[c].name; -} - -static int cg_controller_optional(cg_controller c) -{ - return cg_controllers[c].optional; -} - -static char cg_name[256]; - -#define CG_BUFSIZE 1024 - static void -cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) -{ - const char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; - snprintf(buf, len, "%s/%s/%s/%s", cg_root, cg_controller_name(c), cg_name, attr); -} - -static int -cg_read(cg_controller controller, const char *attr, char *buf) -{ - int result = 0; - int maybe = 0; - if (attr[0] == '?') - { - attr++; - maybe = 1; - } - - char path[256]; - cg_makepath(path, sizeof(path), controller, attr); - - int fd = open(path, O_RDONLY); - if (fd < 0) - { - if (maybe) - goto fail; - die("Cannot read %s: %m", path); - } - - int n = read(fd, buf, CG_BUFSIZE); - if (n < 0) - { - if (maybe) - goto fail_close; - die("Cannot read 
%s: %m", path); - } - if (n >= CG_BUFSIZE - 1) - die("Attribute %s too long", path); - if (n > 0 && buf[n-1] == '\n') - n--; - buf[n] = 0; - - if (verbose > 1) - msg("CG: Read %s = %s\n", attr, buf); - - result = 1; -fail_close: - close(fd); -fail: - return result; -} - -static void __attribute__((format(printf,3,4))) -cg_write(cg_controller controller, const char *attr, const char *fmt, ...) -{ - int maybe = 0; - if (attr[0] == '?') - { - attr++; - maybe = 1; - } - - va_list args; - va_start(args, fmt); - - char buf[CG_BUFSIZE]; - int n = vsnprintf(buf, sizeof(buf), fmt, args); - if (n >= CG_BUFSIZE) - die("cg_write: Value for attribute %s is too long", attr); - - if (verbose > 1) - msg("CG: Write %s = %s", attr, buf); - - char path[256]; - cg_makepath(path, sizeof(path), controller, attr); - - int fd = open(path, O_WRONLY | O_TRUNC); - if (fd < 0) - { - if (maybe) - goto fail; - else - die("Cannot write %s: %m", path); - } - - int written = write(fd, buf, n); - if (written < 0) - { - if (maybe) - goto fail_close; - else - die("Cannot set %s to %s: %m", path, buf); - } - if (written != n) - die("Short write to %s (%d out of %d bytes)", path, written, n); - -fail_close: - close(fd); -fail: - va_end(args); -} - -static void -cg_init(void) -{ - if (!cg_enable) - return; - - char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; - if (!dir_exists(cg_root)) - die("Control group filesystem at %s not mounted", cg_root); - - snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); - msg("Using control group %s\n", cg_name); -} - -static void -cg_prepare(void) +signal_alarm(int unused UNUSED) { - if (!cg_enable) - return; - - struct stat st; - char buf[CG_BUFSIZE]; - char path[256]; - - FOREACH_CG_CONTROLLER(controller) - { - cg_makepath(path, sizeof(path), controller, ""); - if (stat(path, &st) >= 0 || errno != ENOENT) - { - msg("Control group %s already exists, trying to empty it.\n", path); - if (rmdir(path) < 0) - die("Failed to reset control group %s: %m", path); - } - - if 
(mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) - die("Failed to create control group %s: %m", path); - } - - // If cpuset module is enabled, copy allowed cpus and memory nodes from parent group - if (cg_read(CG_CPUSET, "?cpuset.cpus", buf)) - cg_write(CG_CPUSET, "cpuset.cpus", "%s", buf); - if (cg_read(CG_CPUSET, "?cpuset.mems", buf)) - cg_write(CG_CPUSET, "cpuset.mems", "%s", buf); + /* Time limit checks are synchronous, so we only schedule them there. */ + timer_tick = 1; + msg("[timer]"); } static void -cg_enter(void) -{ - if (!cg_enable) - return; - - msg("Entering control group %s\n", cg_name); - - FOREACH_CG_CONTROLLER(controller) - { - if (cg_controller_optional(controller)) - cg_write(controller, "?tasks", "%d\n", (int) getpid()); - else - cg_write(controller, "tasks", "%d\n", (int) getpid()); - } - - if (cg_memory_limit) - { - cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); - cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); - } - - if (cg_timing) - cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); -} - -static int -cg_get_run_time_ms(void) +signal_int(int signum) { - if (!cg_enable) - return 0; - - char buf[CG_BUFSIZE]; - cg_read(CG_CPUACCT, "cpuacct.usage", buf); - unsigned long long ns = atoll(buf); - return ns / 1000000; + /* Interrupts (e.g., SIGINT) are synchronous, too. */ + interrupt = signum; } static void -cg_stats(void) +signal_fatal(int signum) { - if (!cg_enable) - return; - - char buf[CG_BUFSIZE]; - - // Memory usage statistics - unsigned long long mem=0, memsw=0; - if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) - mem = atoll(buf); - if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) - { - memsw = atoll(buf); - if (memsw > mem) - mem = memsw; - } - if (mem) - meta_printf("cg-mem:%lld\n", mem >> 10); + /* If we receive SIGSEGV or a similar signal, we try to die gracefully. 
*/ + die("Sandbox keeper received fatal signal %d", signum); } static void -cg_remove(void) +setup_signals(void) { - char buf[CG_BUFSIZE]; - - if (!cg_enable) - return; + struct sigaction sa_int, sa_fatal; + bzero(&sa_int, sizeof(sa_int)); + sa_int.sa_handler = signal_int; + bzero(&sa_fatal, sizeof(sa_fatal)); + sa_fatal.sa_handler = signal_fatal; - FOREACH_CG_CONTROLLER(controller) + for (int i=0; i < ARRAY_SIZE(signal_rules); i++) { - if (cg_controller_optional(controller)) + const struct signal_rule *sr = &signal_rules[i]; + switch (sr->action) { - if (!cg_read(controller, "?tasks", buf)) - continue; + case SIGNAL_IGNORE: + signal(sr->signum, SIG_IGN); + break; + case SIGNAL_INTERRUPT: + sigaction(sr->signum, &sa_int, NULL); + break; + case SIGNAL_FATAL: + sigaction(sr->signum, &sa_fatal, NULL); + break; + default: + die("Invalid signal rule"); } - else - cg_read(controller, "tasks", buf); - - if (buf[0]) - die("Some tasks left in controller %s of cgroup %s, failed to remove it", - cg_controller_name(controller), cg_name); - - char path[256]; - cg_makepath(path, sizeof(path), controller, ""); - - if (rmdir(path) < 0) - die("Cannot remove control group %s: %m", path); } } -/*** Disk quotas ***/ - -static int -path_begins_with(char *path, char *with) -{ - while (*with) - if (*path++ != *with++) - return 0; - return (!*with || *with == '/'); -} - -static char * -find_device(char *path) -{ - FILE *f = setmntent("/proc/mounts", "r"); - if (!f) - die("Cannot open /proc/mounts: %m"); - - struct mntent *me; - int best_len = 0; - char *best_dev = NULL; - while (me = getmntent(f)) - { - if (!path_begins_with(me->mnt_fsname, "/dev")) - continue; - if (path_begins_with(path, me->mnt_dir)) - { - int len = strlen(me->mnt_dir); - if (len > best_len) - { - best_len = len; - free(best_dev); - best_dev = xstrdup(me->mnt_fsname); - } - } - } - endmntent(f); - return best_dev; -} - static void -set_quota(void) +reset_signals(void) { - if (!block_quota) - return; - - char 
cwd[PATH_MAX]; - if (!getcwd(cwd, sizeof(cwd))) - die("getcwd: %m"); - - char *dev = find_device(cwd); - if (!dev) - die("Cannot identify filesystem which contains %s", cwd); - msg("Quota: Mapped path %s to a filesystem on %s\n", cwd, dev); - - // Sanity check - struct stat dev_st, cwd_st; - if (stat(dev, &dev_st) < 0) - die("Cannot identify block device %s: %m", dev); - if (!S_ISBLK(dev_st.st_mode)) - die("Expected that %s is a block device", dev); - if (stat(".", &cwd_st) < 0) - die("Cannot stat cwd: %m"); - if (cwd_st.st_dev != dev_st.st_rdev) - die("Identified %s as a filesystem on %s, but it is obviously false", cwd, dev); - - struct dqblk dq = { - .dqb_bhardlimit = block_quota, - .dqb_bsoftlimit = block_quota, - .dqb_ihardlimit = inode_quota, - .dqb_isoftlimit = inode_quota, - .dqb_valid = QIF_LIMITS, - }; - if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), dev, box_uid, (caddr_t) &dq) < 0) - die("Cannot set disk quota: %m"); - msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota); - - free(dev); + for (int i=0; i < ARRAY_SIZE(signal_rules); i++) + signal(signal_rules[i].signum, SIG_DFL); } /*** The keeper process ***/ -static void -signal_alarm(int unused UNUSED) -{ - /* Time limit checks are synchronous, so we only schedule them there. */ - timer_tick = 1; - alarm(1); -} - -static void -signal_int(int signum) -{ - /* Interrupts are fatal, so no synchronization requirements. 
*/ - meta_printf("exitsig:%d\n", signum); - err("SG: Interrupted"); -} - #define PROC_BUF_SIZE 4096 static void read_proc_file(char *buf, char *name, int *fdp) @@ -1096,21 +386,6 @@ box_keeper(void) read_errors_from_fd = error_pipes[0]; close(error_pipes[1]); - struct sigaction sa; - bzero(&sa, sizeof(sa)); - sa.sa_handler = signal_int; - sigaction(SIGHUP, &sa, NULL); - sigaction(SIGINT, &sa, NULL); - sigaction(SIGQUIT, &sa, NULL); - sigaction(SIGILL, &sa, NULL); - sigaction(SIGABRT, &sa, NULL); - sigaction(SIGFPE, &sa, NULL); - sigaction(SIGSEGV, &sa, NULL); - sigaction(SIGPIPE, &sa, NULL); - sigaction(SIGTERM, &sa, NULL); - sigaction(SIGUSR1, &sa, NULL); - sigaction(SIGUSR2, &sa, NULL); - gettimeofday(&start_time, NULL); ticks_per_sec = sysconf(_SC_CLK_TCK); if (ticks_per_sec <= 0) @@ -1118,9 +393,15 @@ box_keeper(void) if (timeout || wall_timeout) { + struct sigaction sa; + bzero(&sa, sizeof(sa)); sa.sa_handler = signal_alarm; sigaction(SIGALRM, &sa, NULL); - alarm(1); + struct itimerval timer = { + .it_interval = { .tv_usec = TIMER_INTERVAL_US }, + .it_value = { .tv_usec = TIMER_INTERVAL_US }, + }; + setitimer(ITIMER_REAL, &timer, NULL); } for(;;) @@ -1128,6 +409,11 @@ box_keeper(void) struct rusage rus; int stat; pid_t p; + if (interrupt) + { + meta_printf("exitsig:%d\n", interrupt); + err("SG: Interrupted"); + } if (timer_tick) { check_timeout(); @@ -1166,9 +452,12 @@ box_keeper(void) if (wall_timeout && wall_ms > wall_timeout) err("TO: Time limit exceeded (wall clock)"); flush_line(); - fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n", - total_ms/1000, total_ms%1000, - wall_ms/1000, wall_ms%1000); + if (!silent) + { + fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n", + total_ms/1000, total_ms%1000, + wall_ms/1000, wall_ms%1000); + } box_exit(0); } else if (WIFSIGNALED(stat)) @@ -1290,6 +579,7 @@ box_inside(void *arg) close(error_pipes[0]); meta_close(); + reset_signals(); cg_enter(); setup_root(); setup_credentials(); @@ -1307,12 
+597,12 @@ box_inside(void *arg) static void box_init(void) { - if (box_id < 0 || box_id >= CONFIG_ISOLATE_NUM_BOXES) - die("Sandbox ID out of range (allowed: 0-%d)", CONFIG_ISOLATE_NUM_BOXES-1); - box_uid = CONFIG_ISOLATE_FIRST_UID + box_id; - box_gid = CONFIG_ISOLATE_FIRST_GID + box_id; + if (box_id < 0 || box_id >= cf_num_boxes) + die("Sandbox ID out of range (allowed: 0-%d)", cf_num_boxes-1); + box_uid = cf_first_uid + box_id; + box_gid = cf_first_gid + box_id; - snprintf(box_dir, sizeof(box_dir), "%s/%d", CONFIG_ISOLATE_BOX_DIR, box_id); + snprintf(box_dir, sizeof(box_dir), "%s/%d", cf_box_root, box_id); make_dir(box_dir); if (chdir(box_dir) < 0) die("chdir(%s): %m", box_dir); @@ -1363,6 +653,8 @@ run(char **argv) fcntl(error_pipes[i], F_SETFL, fcntl(error_pipes[i], F_GETFL) | O_NONBLOCK) < 0) die("fcntl on pipe: %m"); + setup_signals(); + box_pid = clone( box_inside, // Function to execute as the body of the new process argv, // Pass our stack @@ -1381,13 +673,6 @@ show_version(void) printf("The process isolator " VERSION "\n"); printf("(c) 2012--" YEAR " Martin Mares and Bernard Blackham\n"); printf("Built on " BUILD_DATE " from Git commit " BUILD_COMMIT "\n"); - printf("\nCompile-time configuration:\n"); - printf("Sandbox directory: %s\n", CONFIG_ISOLATE_BOX_DIR); - printf("Sandbox credentials: uid=%u-%u gid=%u-%u\n", - CONFIG_ISOLATE_FIRST_UID, - CONFIG_ISOLATE_FIRST_UID + CONFIG_ISOLATE_NUM_BOXES - 1, - CONFIG_ISOLATE_FIRST_GID, - CONFIG_ISOLATE_FIRST_GID + CONFIG_ISOLATE_NUM_BOXES - 1); } /*** Options ***/ @@ -1430,6 +715,7 @@ Options:\n\ -M, --meta=\tOutput process information to (name:value)\n\ -q, --quota=,\tSet disk quota to blocks and inodes\n\ --share-net\t\tShare network namespace with the parent process\n\ +-s, --silent\t\tDo not print status messages except for fatal errors\n\ -k, --stack=\tLimit stack size to KB (default: 0=unlimited)\n\ -r, --stderr=\tRedirect stderr to \n\ -i, --stdin=\tRedirect stdin from \n\ @@ -1459,7 +745,7 @@ enum 
opt_code { OPT_SHARE_NET, }; -static const char short_opts[] = "b:c:d:eE:i:k:m:M:o:p::q:r:t:vw:x:"; +static const char short_opts[] = "b:c:d:eE:f:i:k:m:M:o:p::q:r:st:vw:x:"; static const struct option long_opts[] = { { "box-id", 1, NULL, 'b' }, @@ -1480,6 +766,7 @@ static const struct option long_opts[] = { { "quota", 1, NULL, 'q' }, { "run", 0, NULL, OPT_RUN }, { "share-net", 0, NULL, OPT_SHARE_NET }, + { "silent", 0, NULL, 's' }, { "stack", 1, NULL, 'k' }, { "stderr", 1, NULL, 'r' }, { "stdin", 1, NULL, 'i' }, @@ -1516,9 +803,6 @@ main(int argc, char **argv) if (!set_dir_action(optarg)) usage("Invalid directory specified: %s\n", optarg); break; - case 'f': - fsize_limit = atoi(optarg); - break; case 'e': pass_environ = 1; break; @@ -1526,6 +810,9 @@ main(int argc, char **argv) if (!set_env_action(optarg)) usage("Invalid environment specified: %s\n", optarg); break; + case 'f': + fsize_limit = atoi(optarg); + break; case 'k': stack_limit = atoi(optarg); break; @@ -1557,6 +844,9 @@ main(int argc, char **argv) case 'r': redir_stderr = optarg; break; + case 's': + silent++; + break; case 't': timeout = 1000*atof(optarg); break; @@ -1605,6 +895,7 @@ main(int argc, char **argv) orig_gid = getgid(); umask(022); + cf_parse(); box_init(); cg_init(); diff --git a/isolate.h b/isolate.h new file mode 100644 index 0000000..b9fdecb --- /dev/null +++ b/isolate.h @@ -0,0 +1,85 @@ +/* + * Process Isolator + * + * (c) 2012-2016 Martin Mares + * (c) 2012-2014 Bernard Blackham + */ + +#include +#include +#include + +#define NONRET __attribute__((noreturn)) +#define UNUSED __attribute__((unused)) +#define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) + +/* isolate.c */ + +void die(char *msg, ...) 
NONRET; +void NONRET __attribute__((format(printf,1,2))) err(char *msg, ...); +void __attribute__((format(printf,1,2))) msg(char *msg, ...); + +extern int pass_environ; +extern int verbose; +extern int block_quota; +extern int inode_quota; +extern int cg_enable; +extern int cg_memory_limit; +extern int cg_timing; + +extern int box_id; +extern uid_t box_uid; +extern gid_t box_gid; + +/* util.c */ + +void *xmalloc(size_t size); +char *xstrdup(char *str); +int dir_exists(char *path); +void rmtree(char *path); +void make_dir(char *path); +void chowntree(char *path, uid_t uid, gid_t gid); + +void meta_open(const char *name); +void meta_close(void); +void __attribute__((format(printf,1,2))) meta_printf(const char *fmt, ...); + +/* rules.c */ + +int set_env_action(char *a0); +char **setup_environment(void); + +void init_dir_rules(void); +int set_dir_action(char *arg); +void apply_dir_rules(void); + +void set_quota(void); + +/* cg.c */ + +void cg_init(void); +void cg_prepare(void); +void cg_enter(void); +int cg_get_run_time_ms(void); +void cg_stats(void); +void cg_remove(void); + +/* config.c */ + +extern char *cf_box_root; +extern char *cf_cg_root; +extern char *cf_cg_parent; +extern int cf_first_uid; +extern int cf_first_gid; +extern int cf_num_boxes; + +struct cf_per_box { + struct cf_per_box *next; + int box_id; + char *cpus; + char *mems; +}; + +void cf_parse(void); +struct cf_per_box *cf_per_box(int box_id); +struct cf_per_box *cf_current_box(void); diff --git a/rules.c b/rules.c new file mode 100644 index 0000000..e2934d8 --- /dev/null +++ b/rules.c @@ -0,0 +1,393 @@ +/* + * Process Isolator -- Rules + * + * (c) 2012-2016 Martin Mares + * (c) 2012-2014 Bernard Blackham + */ + +#include "isolate.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/*** Environment rules ***/ + +struct env_rule { + char *var; // Variable to match + char *val; // ""=clear, NULL=inherit + int var_len; + struct env_rule *next; +}; + 
+static struct env_rule *first_env_rule;
+static struct env_rule **last_env_rule = &first_env_rule;
+
+// Built-in rules; setup_environment() links them in front of the user rules
+static struct env_rule default_env_rules[] = {
+  { .var = "LIBC_FATAL_STDERR_", .val = "1", .var_len = 18 },
+};
+
+/*
+ *  Parse one environment rule ("VAR", "VAR=" or "VAR=value") and append it
+ *  to the list of user rules. Without '=', the value of the rule is NULL;
+ *  with '=', it is the (possibly empty) text which follows it.
+ *
+ *  Returns 1 on success and 0 if the string starts with '=', that is,
+ *  if the variable name is empty.
+ */
+int
+set_env_action(char *a0)
+{
+  // Reject an empty variable name before allocating anything, so that
+  // the failure path cannot leak the rule.
+  if (a0[0] == '=')
+    return 0;
+
+  // The rule and its private copy of the string share one allocation
+  struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1);
+  char *a = (char *)(r+1);
+  strcpy(a, a0);
+
+  char *sep = strchr(a, '=');
+  r->var = a;
+  if (sep)
+    {
+      *sep++ = 0;
+      r->val = sep;
+    }
+  else
+    r->val = NULL;
+
+  // Fill in the length right away, so that no field of the freshly
+  // allocated rule is left uninitialized.
+  r->var_len = strlen(r->var);
+
+  r->next = NULL;
+  *last_env_rule = r;
+  last_env_rule = &r->next;
+  return 1;
+}
+
+/*
+ *  Check whether the environment entry ("NAME=value") sets exactly the
+ *  variable named by the rule. Relies on r->var_len being filled in.
+ */
+static int
+match_env_var(char *env_entry, struct env_rule *r)
+{
+  if (strncmp(env_entry, r->var, r->var_len))
+    return 0;
+  return (env_entry[r->var_len] == '=');
+}
+
+/*
+ *  Apply a single rule to the environment array env, which holds
+ *  *env_sizep entries followed by a NULL terminator. An existing entry
+ *  for the variable is dropped first. Then a rule with an empty value adds
+ *  nothing, a rule with a non-empty value adds "VAR=value", and a rule
+ *  with a NULL value adds the entry found in our own environ (if any).
+ *  The array must have room for one more entry.
+ */
+static void
+apply_env_rule(char **env, int *env_sizep, struct env_rule *r)
+{
+  // First remove the variable if already set
+  int pos = 0;
+  while (pos < *env_sizep && !match_env_var(env[pos], r))
+    pos++;
+  if (pos < *env_sizep)
+    {
+      // Order does not matter, so move the last entry to the freed slot
+      (*env_sizep)--;
+      env[pos] = env[*env_sizep];
+      env[*env_sizep] = NULL;
+    }
+
+  // What is the new value?
+  char *new;
+  if (r->val)
+    {
+      if (!r->val[0])
+        return;
+      size_t len = r->var_len + 1 + strlen(r->val) + 1;
+      new = xmalloc(len);
+      snprintf(new, len, "%s=%s", r->var, r->val);
+    }
+  else
+    {
+      pos = 0;
+      while (environ[pos] && !match_env_var(environ[pos], r))
+        pos++;
+      if (!(new = environ[pos]))
+        return;
+    }
+
+  // Add it at the end of the array
+  env[(*env_sizep)++] = new;
+  env[*env_sizep] = NULL;
+}
+
+/*
+ *  Build the environment for the sandboxed program: start with a copy of
+ *  our own environment if pass_environ is set (with an empty one otherwise)
+ *  and apply the built-in rules followed by the user rules.
+ *
+ *  Returns a newly allocated NULL-terminated array. With verbose > 1,
+ *  the result is also listed on stderr.
+ */
+char **
+setup_environment(void)
+{
+  // Link built-in rules with user rules
+  for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--)
+    {
+      default_env_rules[i].next = first_env_rule;
+      first_env_rule = &default_env_rules[i];
+    }
+
+  // Scan the original environment
+  char **orig_env = environ;
+  int orig_size = 0;
+  while (orig_env[orig_size])
+    orig_size++;
+
+  // For each rule, reserve one more slot and calculate length
+  int num_rules = 0;
+  for (struct env_rule *r = first_env_rule; r; r=r->next)
+    {
+      num_rules++;
+      r->var_len = strlen(r->var);
+    }
+
+  // Create a new environment
+  char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *));
+  int size;
+  if (pass_environ)
+    {
+      memcpy(env, orig_env, orig_size * sizeof(char *));
+      size = orig_size;
+    }
+  else
+    size = 0;
+  env[size] = NULL;
+
+  // Apply the rules one by one
+  for (struct env_rule *r = first_env_rule; r; r=r->next)
+    apply_env_rule(env, &size, r);
+
+  // Return the new env and pass some gossip
+  if (verbose > 1)
+    {
+      fprintf(stderr, "Passing environment:\n");
+      for (int i=0; env[i]; i++)
+        fprintf(stderr, "\t%s\n", env[i]);
+    }
+  return env;
+}
+
+/*** Directory rules ***/
+
+struct dir_rule {
+  char *inside;                 // A relative path
+  char *outside;                // This can be an absolute path or a relative path starting with "./"
+  unsigned int flags;           // DIR_FLAG_xxx
+  struct dir_rule *next;
+};
+
+enum dir_rule_flags {
+  DIR_FLAG_RW = 1,
+  DIR_FLAG_NOEXEC = 2,
+  DIR_FLAG_FS = 4,
+  DIR_FLAG_MAYBE = 8,
+  DIR_FLAG_DEV = 16,
+};
+
+// Option names; the name at index i stands for the flag 1 << i
+static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev" };
+
+static struct dir_rule *first_dir_rule;
+static struct dir_rule **last_dir_rule = &first_dir_rule;
+
+/*
+ *  Add a directory rule or override the existing rule with the same
+ *  inside path. Leading slashes of "in" are skipped. For DIR_FLAG_FS
+ *  rules, "out" must be given and must not start with a slash; for the
+ *  other rules, "out" may be NULL, otherwise it must start with "/" or
+ *  "./". The strings are not copied, the rule keeps pointing to them.
+ *
+ *  Returns 1 on success, 0 if the rule is malformed.
+ */
+static int add_dir_rule(char *in, char *out, unsigned int flags)
+{
+  // Make sure that "in" is relative
+  while (in[0] == '/')
+    in++;
+  if (!*in)
+    return 0;
+
+  // Check "out"
+  if (flags & DIR_FLAG_FS)
+    {
+      if (!out || out[0] == '/')
+        return 0;
+    }
+  else
+    {
+      if (out && out[0] != '/' && strncmp(out, "./", 2))
+        return 0;
+    }
+
+  // Override an existing rule
+  struct dir_rule *r;
+  for (r = first_dir_rule; r; r = r->next)
+    if (!strcmp(r->inside, in))
+      break;
+
+  // Add a new rule
+  if (!r)
+    {
+      r = xmalloc(sizeof(*r));
+      r->inside = in;
+      *last_dir_rule = r;
+      last_dir_rule = &r->next;
+      r->next = NULL;
+    }
+  r->outside = out;
+  r->flags = flags;
+  return 1;
+}
+
+/*
+ *  Translate an option name to its DIR_FLAG_xxx bit.
+ *  Dies if the name is not listed in dir_flag_names.
+ */
+static unsigned int parse_dir_option(char *opt)
+{
+  // ARRAY_SIZE() yields an int, so keep the index signed as well
+  for (int i = 0; i < ARRAY_SIZE(dir_flag_names); i++)
+    if (!strcmp(opt, dir_flag_names[i]))
+      return 1U << i;
+  die("Unknown directory option %s", opt);
+}
+
+/*
+ *  Parse a directory rule of the form "in[=out][:option]..." and add it.
+ *  With "=", the text after it is the outside path (an empty one becomes
+ *  NULL, which apply_dir_rules() treats as "bind nothing"); without "=",
+ *  the outside path is "/" followed by the inside path.
+ *
+ *  The rule points into a private copy of arg, so the caller's string is
+ *  left untouched. Returns the result of add_dir_rule().
+ */
+int set_dir_action(char *arg)
+{
+  arg = xstrdup(arg);
+
+  // Cut off the options and collect their flags
+  char *colon = strchr(arg, ':');
+  unsigned int flags = 0;
+  while (colon)
+    {
+      *colon++ = 0;
+      char *next = strchr(colon, ':');
+      if (next)
+        *next = 0;
+      flags |= parse_dir_option(colon);
+      colon = next;
+    }
+
+  char *eq = strchr(arg, '=');
+  if (eq)
+    {
+      *eq++ = 0;
+      return add_dir_rule(arg, (*eq ? eq : NULL), flags);
+    }
+  else
+    {
+      size_t len = 1 + strlen(arg) + 1;
+      char *out = xmalloc(len);
+      snprintf(out, len, "/%s", arg);
+      return add_dir_rule(arg, out, flags);
+    }
+}
+
+/*
+ *  Install the default set of directory rules.
+ */
+void init_dir_rules(void)
+{
+  set_dir_action("box=./box:rw");
+  set_dir_action("bin");
+  set_dir_action("dev:dev");
+  set_dir_action("lib");
+  set_dir_action("lib64:maybe");
+  set_dir_action("proc=proc:fs");
+  set_dir_action("usr");
+}
+
+/*
+ *  Carry out all directory rules: for each rule, create "root/<inside>"
+ *  and either mount a filesystem of type <outside> there (DIR_FLAG_FS)
+ *  or bind-mount <outside> on it. Rules without an outside path are
+ *  skipped, and so are DIR_FLAG_MAYBE rules whose outside directory does
+ *  not exist. Dies if anything fails.
+ */
+void apply_dir_rules(void)
+{
+  for (struct dir_rule *r = first_dir_rule; r; r=r->next)
+    {
+      char *in = r->inside;
+      char *out = r->outside;
+      if (!out)
+        {
+          msg("Not binding anything on %s\n", r->inside);
+          continue;
+        }
+
+      if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out))
+        {
+          msg("Not binding %s on %s (does not exist)\n", out, r->inside);
+          continue;
+        }
+
+      // Never create or mount a silently truncated path
+      char root_in[1024];
+      int len = snprintf(root_in, sizeof(root_in), "root/%s", in);
+      if (len < 0 || len >= (int) sizeof(root_in))
+        die("Directory path %s is too long", in);
+      make_dir(root_in);
+
+      // Everything is read-only and without devices unless asked otherwise
+      unsigned long mount_flags = 0;
+      if (!(r->flags & DIR_FLAG_RW))
+        mount_flags |= MS_RDONLY;
+      if (r->flags & DIR_FLAG_NOEXEC)
+        mount_flags |= MS_NOEXEC;
+      if (!(r->flags & DIR_FLAG_DEV))
+        mount_flags |= MS_NODEV;
+
+      if (r->flags & DIR_FLAG_FS)
+        {
+          msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags);
+          if (mount("none", root_in, out, mount_flags, "") < 0)
+            die("Cannot mount %s on %s: %m", out, in);
+        }
+      else
+        {
+          mount_flags |= MS_BIND | MS_NOSUID;
+          msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags);
+          // Most mount flags need remount to work
+          if (mount(out, root_in, "none", mount_flags, "") < 0 ||
+              mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0)
+            die("Cannot mount %s on %s: %m", out, in);
+        }
+    }
+}
+
+/*** Disk quotas ***/
+
+/*
+ *  Check whether "with" is a prefix of "path" which ends at a boundary of
+ *  a path component: either the prefix is empty or ends with a slash, or
+ *  it is followed in "path" by a slash or by the end of the string.
+ *  So "/dev/sda1" begins with both "/dev" and "/", while "/devices" does
+ *  not begin with "/dev".
+ */
+static int
+path_begins_with(char *path, char *with)
+{
+  char last = 0;
+  while (*with)
+    {
+      if (*path != *with)
+        return 0;
+      last = *with;
+      path++;
+      with++;
+    }
+  // *with is always zero here, so the boundary has to be checked
+  // on the end of the prefix and on the rest of the path.
+  return (!last || last == '/' || !*path || *path == '/');
+}
+
+/*
+ *  Find the device backing the filesystem which contains "path": scan
+ *  /proc/mounts for entries whose source begins with "/dev" and pick the
+ *  one with the longest mount point that "path" begins with.
+ *
+ *  Returns a newly allocated string to be freed by the caller,
+ *  or NULL if no entry matches.
+ */
+static char *
+find_device(char *path)
+{
+  FILE *f = setmntent("/proc/mounts", "r");
+  if (!f)
+    die("Cannot open /proc/mounts: %m");
+
+  struct mntent *me;
+  int best_len = 0;
+  char *best_dev = NULL;
+  while ((me = getmntent(f)) != NULL)
+    {
+      if (!path_begins_with(me->mnt_fsname, "/dev"))
+        continue;
+      if (path_begins_with(path, me->mnt_dir))
+        {
+          int len = strlen(me->mnt_dir);
+          if (len > best_len)
+            {
+              best_len = len;
+              free(best_dev);
+              best_dev = xstrdup(me->mnt_fsname);
+            }
+        }
+    }
+  endmntent(f);
+  return best_dev;
+}
+
+/*
+ *  Set the disk quota of the sandbox user (box_uid) on the filesystem
+ *  which contains the current directory to block_quota blocks and
+ *  inode_quota inodes. Does nothing if block_quota is zero; dies if the
+ *  device cannot be identified or the quota cannot be set.
+ */
+void
+set_quota(void)
+{
+  if (!block_quota)
+    return;
+
+  char cwd[PATH_MAX];
+  if (!getcwd(cwd, sizeof(cwd)))
+    die("getcwd: %m");
+
+  char *dev = find_device(cwd);
+  if (!dev)
+    die("Cannot identify filesystem which contains %s", cwd);
+  msg("Quota: Mapped path %s to a filesystem on %s\n", cwd, dev);
+
+  // Sanity check
+  struct stat dev_st, cwd_st;
+  if (stat(dev, &dev_st) < 0)
+    die("Cannot identify block device %s: %m", dev);
+  if (!S_ISBLK(dev_st.st_mode))
+    die("Expected that %s is a block device", dev);
+  if (stat(".", &cwd_st) < 0)
+    die("Cannot stat cwd: %m");
+  if (cwd_st.st_dev != dev_st.st_rdev)
+    die("Identified %s as a filesystem on %s, but it is obviously false", cwd, dev);
+
+  // Hard and soft limits are set to the same values
+  struct dqblk dq = {
+    .dqb_bhardlimit = block_quota,
+    .dqb_bsoftlimit = block_quota,
+    .dqb_ihardlimit = inode_quota,
+    .dqb_isoftlimit = inode_quota,
+    .dqb_valid = QIF_LIMITS,
+  };
+  if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), dev, box_uid, (caddr_t) &dq) < 0)
+    die("Cannot set disk quota: %m");
+  msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota);
+
+  free(dev);
+}
diff --git a/util.c b/util.c
new file mode 100644
index 0000000..111204d
--- /dev/null
+++ b/util.c
@@ -0,0 +1,152 @@
+/*
+ *	Process Isolator -- Utility Functions
+ *
+ *	(c) 2012-2016 Martin Mares
+ *	(c) 2012-2014 Bernard Blackham
+ */
+
+#include "isolate.h"
+
+// NOTE(review): the names of the system headers below were lost when this
+// patch was extracted; they are restored from the functions this file
+// calls. Confirm the list against the original file.
+#include <errno.h>
+#include <ftw.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/fsuid.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/*
+ *  malloc() which dies instead of returning NULL.
+ *  The memory is not cleared.
+ */
+void *
+xmalloc(size_t size)
+{
+  void *p = malloc(size);
+  if (!p)
+    die("Out of memory");
+  return p;
+}
+
+/*
+ *  strdup() which dies instead of returning NULL.
+ */
+char *
+xstrdup(char *str)
+{
+  char *p = strdup(str);
+  if (!p)
+    die("Out of memory");
+  return p;
+}
+
+/*
+ *  Return non-zero if path can be stat()ed and is a directory.
+ */
+int
+dir_exists(char *path)
+{
+  struct stat st;
+  return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode));
+}
+
+/*
+ *  Create the directory "path" together with all its missing parents.
+ *  The string is temporarily cut at each slash, so it must be writable;
+ *  it is restored before returning. Dies if a component cannot be created
+ *  or if the final path exists, but it is not a directory.
+ */
+void make_dir(char *path)
+{
+  // Skip the leading slash of an absolute path, it is not a separator
+  char *sep = (path[0] == '/' ? path+1 : path);
+
+  for (;;)
+    {
+      sep = strchr(sep, '/');
+      if (sep)
+        *sep = 0;
+
+      if (mkdir(path, 0777) < 0 && errno != EEXIST)
+        die("Cannot create directory %s: %m", path);
+
+      if (!sep)
+        break;
+      *sep++ = '/';
+    }
+
+  // mkdir() above may have returned EEXIST even if the path was not
+  // a directory. Ensure that it is.
+  struct stat st;
+  if (stat(path, &st) < 0)
+    die("Cannot stat %s: %m", path);
+  if (!S_ISDIR(st.st_mode))
+    die("Cannot create %s: already exists, but not a directory", path);
+}
+
+
+/*
+ *  nftw() callback of rmtree(): remove a single directory or file.
+ */
+static int rmtree_helper(const char *fpath, const struct stat *sb,
+                         int typeflag UNUSED, struct FTW *ftwbuf UNUSED)
+{
+  if (S_ISDIR(sb->st_mode))
+    {
+      if (rmdir(fpath) < 0)
+        die("Cannot rmdir %s: %m", fpath);
+    }
+  else
+    {
+      if (unlink(fpath) < 0)
+        die("Cannot unlink %s: %m", fpath);
+    }
+  return FTW_CONTINUE;
+}
+
+/*
+ *  Remove the whole tree rooted at "path". The walk is post-order
+ *  (FTW_DEPTH), so that directories are already empty when they are
+ *  removed; it does not follow symlinks (FTW_PHYS) and it stays within
+ *  one filesystem (FTW_MOUNT).
+ *
+ *  NOTE(review): the return value of nftw() is ignored, so a failure of
+ *  the walk itself passes silently -- confirm that this is intended.
+ */
+void
+rmtree(char *path)
+{
+  nftw(path, rmtree_helper, 32, FTW_MOUNT | FTW_PHYS | FTW_DEPTH);
+}
+
+// Arguments of the chowntree() in progress, read by chowntree_helper()
+static uid_t chown_uid;
+static gid_t chown_gid;
+
+/*
+ *  nftw() callback of chowntree(): change the owner of a single object
+ *  without following symlinks.
+ */
+static int chowntree_helper(const char *fpath, const struct stat *sb UNUSED,
+                            int typeflag UNUSED, struct FTW *ftwbuf UNUSED)
+{
+  if (lchown(fpath, chown_uid, chown_gid) < 0)
+    die("Cannot chown %s: %m", fpath);
+  else
+    return FTW_CONTINUE;
+}
+
+/*
+ *  Change the owner and group of everything in the tree rooted at "path".
+ *  The walk does not follow symlinks (FTW_PHYS) and it stays within one
+ *  filesystem (FTW_MOUNT). The IDs are handed to the callback in static
+ *  variables.
+ */
+void
+chowntree(char *path, uid_t uid, gid_t gid)
+{
+  chown_uid = uid;
+  chown_gid = gid;
+  nftw(path, chowntree_helper, 32, FTW_MOUNT | FTW_PHYS);
+}
+
+/*** Meta-files ***/
+
+static FILE *metafile;
+
+/*
+ *  Open the meta-file for writing; the name "-" selects stdout. The file
+ *  is opened with the filesystem UID switched to our real UID, which is
+ *  then switched back to the effective UID. Dies on failure.
+ *
+ *  NOTE(review): setfsuid(2) is documented to return the previous FS UID
+ *  even when it fails, so the "< 0" checks below probably never trigger
+ *  -- confirm.
+ */
+void
+meta_open(const char *name)
+{
+  if (!strcmp(name, "-"))
+    {
+      metafile = stdout;
+      return;
+    }
+  if (setfsuid(getuid()) < 0)
+    die("Failed to switch FS UID: %m");
+  metafile = fopen(name, "w");
+  // Remember why fopen() failed, the setfsuid() below may overwrite errno
+  int open_errno = errno;
+  if (setfsuid(geteuid()) < 0)
+    die("Failed to switch FS UID back: %m");
+  if (!metafile)
+    {
+      errno = open_errno;
+      die("Failed to open metafile '%s': %m", name);
+    }
+}
+
+/*
+ *  Close the meta-file unless it is stdout. Afterwards, meta_printf()
+ *  does nothing instead of writing to a closed stream.
+ */
+void
+meta_close(void)
+{
+  if (metafile && metafile != stdout)
+    {
+      fclose(metafile);
+      metafile = NULL;
+    }
+}
+
+/*
+ *  printf() to the meta-file. Does nothing if no meta-file is open.
+ */
+void
+meta_printf(const char *fmt, ...)
+{
+  if (!metafile)
+    return;
+
+  va_list args;
+  va_start(args, fmt);
+  vfprintf(metafile, fmt, args);
+  va_end(args);
+}