Documentation: prctl/seccomp_filter

Documents how system call filtering using Berkeley Packet
Filter programs works and how it may be used.
Includes an example for x86 and a semi-generic
example using a macro-based code generator.

Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Will Drewry <wad@chromium.org>
Acked-by: Kees Cook <keescook@chromium.org>

v18: - added acked by
     - update no new privs numbers
v17: - remove @compat note and add Pitfalls section for arch checking
       (keescook@chromium.org)
v16: -
v15: -
v14: - rebase/nochanges
v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
v12: - comment on the ptrace_event use
     - update arch support comment
     - note the behavior of SECCOMP_RET_DATA when there are multiple filters
       (keescook@chromium.org)
     - lots of samples/ clean up incl 64-bit bpf-direct support
       (markus@chromium.org)
     - rebase to linux-next
v11: - overhaul return value language, updates (keescook@chromium.org)
     - comment on do_exit(SIGSYS)
v10: - update for SIGSYS
     - update for new seccomp_data layout
     - update for ptrace option use
v9: - updated bpf-direct.c for SIGILL
v8: - add PR_SET_NO_NEW_PRIVS to the samples.
v7: - updated for all the new stuff in v7: TRAP, TRACE
    - only talk about PR_SET_SECCOMP now
    - fixed bad JLE32 check (coreyb@linux.vnet.ibm.com)
    - adds dropper.c: a simple system call disabler
v6: - tweak the language to note the requirement of
      PR_SET_NO_NEW_PRIVS being called prior to use. (luto@mit.edu)
v5: - update sample to use system call arguments
    - adds a "fancy" example using a macro-based generator
    - cleaned up bpf in the sample
    - update docs to mention arguments
    - fix prctl value (eparis@redhat.com)
    - language cleanup (rdunlap@xenotime.net)
v4: - update for no_new_privs use
    - minor tweaks
v3: - call out BPF <-> Berkeley Packet Filter (rdunlap@xenotime.net)
    - document use of tentative always-unprivileged
    - guard sample compilation for i386 and x86_64
v2: - move code to samples (corbet@lwn.net)
Signed-off-by: James Morris <james.l.morris@oracle.com>
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
new file mode 100644
index 0000000..643279d
--- /dev/null
+++ b/samples/seccomp/bpf-helper.h
@@ -0,0 +1,238 @@
+/*
+ * Example wrapper around BPF macros.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * No guarantees are provided with respect to the correctness
+ * or functionality of this code.
+ */
+#ifndef __BPF_HELPER_H__
+#define __BPF_HELPER_H__
+
+#include <asm/bitsperlong.h>	/* for __BITS_PER_LONG */
+#include <endian.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>	/* for seccomp_data */
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <stddef.h>
+
+#define BPF_LABELS_MAX 256
+struct bpf_labels {
+	int count;
+	struct __bpf_label {
+		const char *label;
+		__u32 location;
+	} labels[BPF_LABELS_MAX];
+};
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+		      struct sock_filter *filter, size_t count);
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
+void seccomp_bpf_print(struct sock_filter *filter, size_t count);
+
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+#define ALLOW \
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+#define DENY \
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+#define JUMP(labels, label) \
+	BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+		 JUMP_JT, JUMP_JF)
+#define LABEL(labels, label) \
+	BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+		 LABEL_JT, LABEL_JF)
+#define SYSCALL(nr, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
+	jt
+
+/* Lame, but just an example */
+#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
+
+#define EXPAND(...) __VA_ARGS__
+/* Map all width-sensitive operations */
+#if __BITS_PER_LONG == 32
+
+#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
+#define JNE(x, jt) JNE32(x, EXPAND(jt))
+#define JGT(x, jt) JGT32(x, EXPAND(jt))
+#define JLT(x, jt) JLT32(x, EXPAND(jt))
+#define JGE(x, jt) JGE32(x, EXPAND(jt))
+#define JLE(x, jt) JLE32(x, EXPAND(jt))
+#define JA(x, jt) JA32(x, EXPAND(jt))
+#define ARG(i) ARG_32(i)
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+
+#elif __BITS_PER_LONG == 64
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ENDIAN(_lo, _hi) _lo, _hi
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN(_lo, _hi) _hi, _lo
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#else
+#error "Unknown endianness"
+#endif
+
+union arg64 {
+	struct {
+		__u32 ENDIAN(lo32, hi32);
+	};
+	__u64 u64;
+};
+
+#define JEQ(x, jt) \
+	JEQ64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JGT(x, jt) \
+	JGT64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JGE(x, jt) \
+	JGE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JNE(x, jt) \
+	JNE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JLT(x, jt) \
+	JLT64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JLE(x, jt) \
+	JLE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+
+#define JA(x, jt) \
+	JA64(((union arg64){.u64 = (x)}).lo32, \
+	       ((union arg64){.u64 = (x)}).hi32, \
+	       EXPAND(jt))
+#define ARG(i) ARG_64(i)
+
+#else
+#error __BITS_PER_LONG value unusable.
+#endif
+
+/* Loads the arg into A */
+#define ARG_32(idx) \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
+
+/* Loads hi into A and lo in X */
+#define ARG_64(idx) \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
+	BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
+	BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
+
+#define JEQ32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
+	jt
+
+#define JNE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
+	jt
+
+/* Checks the lo, then swaps to check the hi. A=lo,X=hi */
+#define JEQ64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JNE64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 5, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JA32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+	jt
+
+#define JA64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+	jt
+
+#define JLT32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+	jt
+
+/* Shortcut checking if hi > arg.hi. */
+#define JGE64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JLT64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGT32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+	jt
+
+#define JLE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+	jt
+
+/* Check hi > args.hi first, then do the GE checking */
+#define JGT64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JLE64(lo, hi, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0), \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define LOAD_SYSCALL_NR \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
+		 offsetof(struct seccomp_data, nr))
+
+#endif  /* __BPF_HELPER_H__ */