mirror of
https://github.com/google/sandboxed-api.git
synced 2024-03-22 13:11:30 +08:00
bb6ae1d4ab
1. In many cases, sandboxes need to allow /proc/stat and /proc/cpuinfo so that get_nprocs(3) will work; otherwise, per-CPU logic can't determine how many CPUs there are. Unfortunately, some of those sandboxes also disable namespaces. The solution is to provide two functions: AllowRestartableSequencesWithProcFiles(), which allows syscalls and files; and AllowRestartableSequences(), which allows syscalls only. Sandboxes should usually call the former; sandboxes that disable namespaces should instead call the latter and are responsible for allowing the files via the deprecated Fs mechanism. 2. Make the mmap(2) policy evaluate prot AND flags, not prot OR flags. 3. Order the code and the comments identically for better readability. PiperOrigin-RevId: 386414028 Change-Id: I016b1854ed1da9c9bcff7b351c5e0041093b8193
1079 lines
32 KiB
C++
1079 lines
32 KiB
C++
// Copyright 2019 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "sandboxed_api/sandbox2/policybuilder.h"
|
|
|
|
#include <asm/ioctls.h> // For TCGETS
|
|
#include <fcntl.h> // For the fcntl flags
|
|
#include <linux/futex.h>
|
|
#include <linux/net.h> // For SYS_CONNECT
|
|
#include <linux/random.h> // For GRND_NONBLOCK
|
|
#include <sys/mman.h> // For mmap arguments
|
|
#include <sys/socket.h>
|
|
#include <syscall.h>
|
|
|
|
#include <csignal>
|
|
#include <cstdint>
|
|
#include <utility>
|
|
|
|
#include "absl/memory/memory.h"
|
|
#include "absl/status/statusor.h"
|
|
#include "absl/strings/escaping.h"
|
|
#include "absl/strings/match.h"
|
|
#include "sandboxed_api/config.h"
|
|
#include "sandboxed_api/sandbox2/namespace.h"
|
|
#include "sandboxed_api/sandbox2/util/bpf_helper.h"
|
|
#include "sandboxed_api/util/path.h"
|
|
#include "sandboxed_api/util/status_macros.h"
|
|
|
|
#if defined(SAPI_X86_64)
|
|
#include <asm/prctl.h>
|
|
#elif defined(SAPI_PPC64_LE)
|
|
#include <asm/termbits.h> // On PPC, TCGETS macro needs termios
|
|
#endif
|
|
|
|
namespace sandbox2 {
|
|
namespace {
|
|
|
|
namespace file = ::sapi::file;
|
|
|
|
constexpr PolicyBuilder::SyscallInitializer kMmapSyscalls = {
|
|
#ifdef __NR_mmap2
|
|
__NR_mmap2,
|
|
#endif
|
|
#ifdef __NR_mmap
|
|
__NR_mmap,
|
|
#endif
|
|
};
|
|
|
|
} // namespace
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSyscall(unsigned int num) {
|
|
if (handled_syscalls_.insert(num).second) {
|
|
user_policy_.insert(user_policy_.end(), {SYSCALL(num, ALLOW)});
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSyscalls(const std::vector<uint32_t>& nums) {
|
|
for (auto num : nums) {
|
|
AllowSyscall(num);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSyscalls(SyscallInitializer nums) {
|
|
for (auto num : nums) {
|
|
AllowSyscall(num);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::BlockSyscallWithErrno(unsigned int num,
|
|
int error) {
|
|
if (handled_syscalls_.insert(num).second) {
|
|
user_policy_.insert(user_policy_.end(), {SYSCALL(num, ERRNO(error))});
|
|
if (num == __NR_bpf) {
|
|
user_policy_handles_bpf_ = true;
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowExit() {
|
|
return AllowSyscalls({__NR_exit, __NR_exit_group});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowScudoMalloc() {
|
|
AllowTime();
|
|
AllowSyscalls({__NR_munmap, __NR_nanosleep});
|
|
AllowFutexOp(FUTEX_WAKE);
|
|
AllowLimitedMadvise();
|
|
AllowGetRandom();
|
|
AllowWipeOnFork();
|
|
|
|
return AddPolicyOnMmap([](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(2), // prot
|
|
JEQ32(PROT_NONE, JUMP(&labels, prot_none)),
|
|
JNE32(PROT_READ | PROT_WRITE, JUMP(&labels, mmap_end)),
|
|
|
|
// PROT_READ | PROT_WRITE
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, ALLOW),
|
|
JEQ32(MAP_PRIVATE | MAP_ANONYMOUS, ALLOW),
|
|
JUMP(&labels, mmap_end),
|
|
|
|
// PROT_NONE
|
|
LABEL(&labels, prot_none),
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, ALLOW),
|
|
|
|
LABEL(&labels, mmap_end),
|
|
};
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowTcMalloc() {
|
|
AllowTime();
|
|
AllowRestartableSequences(kRequireFastFences);
|
|
AllowSyscalls(
|
|
{__NR_munmap, __NR_nanosleep, __NR_brk, __NR_mincore, __NR_membarrier});
|
|
AllowLimitedMadvise();
|
|
|
|
AddPolicyOnSyscall(__NR_mprotect, {
|
|
ARG_32(2),
|
|
JEQ32(PROT_READ | PROT_WRITE, ALLOW),
|
|
JEQ32(PROT_NONE, ALLOW),
|
|
});
|
|
|
|
return AddPolicyOnMmap([](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(2), // prot
|
|
JEQ32(PROT_NONE, JUMP(&labels, prot_none)),
|
|
JNE32(PROT_READ | PROT_WRITE, JUMP(&labels, mmap_end)),
|
|
|
|
// PROT_READ | PROT_WRITE
|
|
ARG_32(3), // flags
|
|
JNE32(MAP_ANONYMOUS | MAP_PRIVATE, JUMP(&labels, mmap_end)),
|
|
ALLOW,
|
|
|
|
// PROT_NONE
|
|
LABEL(&labels, prot_none),
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, ALLOW),
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE, ALLOW),
|
|
|
|
LABEL(&labels, mmap_end),
|
|
};
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSystemMalloc() {
|
|
AllowSyscalls({__NR_munmap, __NR_brk});
|
|
AllowFutexOp(FUTEX_WAKE);
|
|
AddPolicyOnSyscall(__NR_mremap, {
|
|
ARG_32(3),
|
|
JEQ32(MREMAP_MAYMOVE, ALLOW),
|
|
});
|
|
return AddPolicyOnMmap([](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(2), // prot
|
|
JEQ32(PROT_NONE, JUMP(&labels, prot_none)),
|
|
JNE32(PROT_READ | PROT_WRITE, JUMP(&labels, mmap_end)),
|
|
|
|
// PROT_READ | PROT_WRITE
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE, ALLOW),
|
|
|
|
// PROT_NONE
|
|
LABEL(&labels, prot_none),
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, ALLOW),
|
|
|
|
LABEL(&labels, mmap_end),
|
|
};
|
|
});
|
|
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowLlvmSanitizers() {
|
|
if constexpr (sapi::sanitizers::IsAny()) {
|
|
AddPolicyOnSyscall(__NR_madvise, {
|
|
ARG_32(2),
|
|
JEQ32(MADV_DONTDUMP, ALLOW),
|
|
JEQ32(MADV_NOHUGEPAGE, ALLOW),
|
|
});
|
|
// Sanitizers read from /proc. For example:
|
|
// https://github.com/llvm-mirror/compiler-rt/blob/69445f095c22aac2388f939bedebf224a6efcdaf/lib/sanitizer_common/sanitizer_linux.cpp#L1101
|
|
AddDirectory("/proc");
|
|
}
|
|
if constexpr (sapi::sanitizers::IsASan()) {
|
|
AllowSyscall(__NR_sigaltstack);
|
|
// asan uses a custom allocator that runs mmap under the hood. For example:
|
|
// https://github.com/llvm/llvm-project/blob/596d534ac3524052df210be8d3c01a33b2260a42/compiler-rt/lib/asan/asan_allocator.cpp#L980
|
|
// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/compiler-rt/lib/sanitizer_common/sanitizer_allocator_secondary.h#L98
|
|
AllowMmap();
|
|
}
|
|
if constexpr (sapi::sanitizers::IsTSan()) {
|
|
AllowMmap();
|
|
AllowSyscall(__NR_munmap);
|
|
AddPolicyOnSyscall(__NR_mprotect,
|
|
{
|
|
ARG_32(2),
|
|
BPF_STMT(BPF_AND | BPF_ALU | BPF_K,
|
|
~uint32_t{PROT_READ | PROT_WRITE}),
|
|
JEQ32(0, ALLOW),
|
|
});
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowLimitedMadvise() {
|
|
return AddPolicyOnSyscall(__NR_madvise, {
|
|
ARG_32(2),
|
|
JEQ32(MADV_DONTNEED, ALLOW),
|
|
JEQ32(MADV_REMOVE, ALLOW),
|
|
JEQ32(MADV_NOHUGEPAGE, ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowMmap() {
|
|
return AllowSyscalls(kMmapSyscalls);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowOpen() {
|
|
#ifdef __NR_open
|
|
AllowSyscall(__NR_open);
|
|
#endif
|
|
#ifdef __NR_openat
|
|
AllowSyscall(__NR_openat);
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowStat() {
|
|
#ifdef __NR_fstat
|
|
AllowSyscall(__NR_fstat);
|
|
#endif
|
|
#ifdef __NR_fstat64
|
|
AllowSyscall(__NR_fstat64);
|
|
#endif
|
|
#ifdef __NR_fstatat
|
|
AllowSyscall(__NR_fstatat);
|
|
#endif
|
|
#ifdef __NR_fstatat64
|
|
AllowSyscall(__NR_fstatat64);
|
|
#endif
|
|
#ifdef __NR_lstat
|
|
AllowSyscall(__NR_lstat);
|
|
#endif
|
|
#ifdef __NR_lstat64
|
|
AllowSyscall(__NR_lstat64);
|
|
#endif
|
|
#ifdef __NR_newfstatat
|
|
AllowSyscall(__NR_newfstatat);
|
|
#endif
|
|
#ifdef __NR_oldfstat
|
|
AllowSyscall(__NR_oldfstat);
|
|
#endif
|
|
#ifdef __NR_oldlstat
|
|
AllowSyscall(__NR_oldlstat);
|
|
#endif
|
|
#ifdef __NR_oldstat
|
|
AllowSyscall(__NR_oldstat);
|
|
#endif
|
|
#ifdef __NR_stat
|
|
AllowSyscall(__NR_stat);
|
|
#endif
|
|
#ifdef __NR_stat64
|
|
AllowSyscall(__NR_stat64);
|
|
#endif
|
|
#ifdef __NR_statfs
|
|
AllowSyscall(__NR_statfs);
|
|
#endif
|
|
#ifdef __NR_statfs64
|
|
AllowSyscall(__NR_statfs64);
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowAccess() {
|
|
#ifdef __NR_access
|
|
AllowSyscall(__NR_access);
|
|
#endif
|
|
#ifdef __NR_faccessat
|
|
AllowSyscall(__NR_faccessat);
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowRead() {
|
|
return AllowSyscalls({
|
|
__NR_read,
|
|
__NR_readv,
|
|
__NR_preadv,
|
|
__NR_pread64,
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowWrite() {
|
|
return AllowSyscalls({
|
|
__NR_write,
|
|
__NR_writev,
|
|
__NR_pwritev,
|
|
__NR_pwrite64,
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowReaddir() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_getdents
|
|
__NR_getdents,
|
|
#endif
|
|
#ifdef __NR_getdents64
|
|
__NR_getdents64,
|
|
#endif
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSafeFcntl() {
|
|
return AddPolicyOnSyscalls({__NR_fcntl,
|
|
#ifdef __NR_fcntl64
|
|
__NR_fcntl64
|
|
#endif
|
|
},
|
|
{
|
|
ARG_32(1),
|
|
JEQ32(F_GETFD, ALLOW),
|
|
JEQ32(F_SETFD, ALLOW),
|
|
JEQ32(F_GETFL, ALLOW),
|
|
JEQ32(F_SETFL, ALLOW),
|
|
JEQ32(F_GETLK, ALLOW),
|
|
JEQ32(F_SETLK, ALLOW),
|
|
JEQ32(F_SETLKW, ALLOW),
|
|
JEQ32(F_DUPFD, ALLOW),
|
|
JEQ32(F_DUPFD_CLOEXEC, ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowFork() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_fork
|
|
__NR_fork,
|
|
#endif
|
|
#ifdef __NR_vfork
|
|
__NR_vfork,
|
|
#endif
|
|
__NR_clone});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowWait() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_waitpid
|
|
__NR_waitpid,
|
|
#endif
|
|
__NR_wait4});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowHandleSignals() {
|
|
return AllowSyscalls({
|
|
__NR_rt_sigaction,
|
|
__NR_rt_sigreturn,
|
|
__NR_rt_sigprocmask,
|
|
#ifdef __NR_signal
|
|
__NR_signal,
|
|
#endif
|
|
#ifdef __NR_sigaction
|
|
__NR_sigaction,
|
|
#endif
|
|
#ifdef __NR_sigreturn
|
|
__NR_sigreturn,
|
|
#endif
|
|
#ifdef __NR_sigprocmask
|
|
__NR_sigprocmask,
|
|
#endif
|
|
#ifdef __NR_sigaltstack
|
|
__NR_sigaltstack,
|
|
#endif
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowTCGETS() {
|
|
return AddPolicyOnSyscall(__NR_ioctl, {
|
|
ARG_32(1),
|
|
JEQ32(TCGETS, ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowTime() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_time
|
|
__NR_time,
|
|
#endif
|
|
__NR_gettimeofday, __NR_clock_gettime});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSleep() {
|
|
return AllowSyscalls({
|
|
__NR_clock_nanosleep,
|
|
__NR_nanosleep,
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowGetIDs() {
|
|
return AllowSyscalls({
|
|
__NR_getuid,
|
|
__NR_geteuid,
|
|
__NR_getresuid,
|
|
__NR_getgid,
|
|
__NR_getegid,
|
|
__NR_getresgid,
|
|
#ifdef __NR_getuid32
|
|
__NR_getuid32,
|
|
__NR_geteuid32,
|
|
__NR_getresuid32,
|
|
__NR_getgid32,
|
|
__NR_getegid32,
|
|
__NR_getresgid32,
|
|
#endif
|
|
__NR_getgroups,
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowRestartableSequencesWithProcFiles(
|
|
CpuFenceMode cpu_fence_mode) {
|
|
AllowRestartableSequences(cpu_fence_mode);
|
|
AddFile("/proc/cpuinfo");
|
|
AddFile("/proc/stat");
|
|
if (cpu_fence_mode == kAllowSlowFences) {
|
|
AddFile("/proc/self/cpuset");
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowRestartableSequences(
|
|
CpuFenceMode cpu_fence_mode) {
|
|
#ifdef __NR_rseq
|
|
AllowSyscall(__NR_rseq);
|
|
#endif
|
|
AddPolicyOnMmap([](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(2), // prot
|
|
JNE32(PROT_READ | PROT_WRITE, JUMP(&labels, mmap_end)),
|
|
|
|
ARG_32(3), // flags
|
|
JNE32(MAP_PRIVATE | MAP_ANONYMOUS, JUMP(&labels, mmap_end)),
|
|
|
|
ALLOW,
|
|
LABEL(&labels, mmap_end),
|
|
};
|
|
});
|
|
AllowSyscall(__NR_getcpu);
|
|
AllowSyscall(__NR_membarrier);
|
|
AllowFutexOp(FUTEX_WAIT);
|
|
AllowFutexOp(FUTEX_WAKE);
|
|
AddPolicyOnSyscall(__NR_rt_sigprocmask, {
|
|
ARG_32(0),
|
|
JEQ32(SIG_SETMASK, ALLOW),
|
|
});
|
|
if (cpu_fence_mode == kAllowSlowFences) {
|
|
AllowSyscall(__NR_sched_getaffinity);
|
|
AllowSyscall(__NR_sched_setaffinity);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowGetPIDs() {
|
|
return AllowSyscalls({
|
|
__NR_getpid,
|
|
__NR_getppid,
|
|
__NR_gettid,
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowGetRlimit() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_getrlimit
|
|
__NR_getrlimit,
|
|
#endif
|
|
#ifdef __NR_ugetrlimit
|
|
__NR_ugetrlimit,
|
|
#endif
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowSetRlimit() {
|
|
return AllowSyscalls({
|
|
#ifdef __NR_setrlimit
|
|
__NR_setrlimit,
|
|
#endif
|
|
#ifdef __NR_usetrlimit
|
|
__NR_usetrlimit,
|
|
#endif
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowGetRandom() {
|
|
return AddPolicyOnSyscall(__NR_getrandom, {
|
|
ARG_32(2),
|
|
JEQ32(0, ALLOW),
|
|
JEQ32(GRND_NONBLOCK, ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowWipeOnFork() {
|
|
// System headers may not be recent enough to include MADV_WIPEONFORK.
|
|
static constexpr uint32_t kMadv_WipeOnFork = 18;
|
|
// The -1 value is used by code to probe that the kernel returns -EINVAL for
|
|
// unknown values because some environments, like qemu, ignore madvise
|
|
// completely, but code needs to know whether WIPEONFORK took effect.
|
|
return AddPolicyOnSyscall(__NR_madvise,
|
|
{
|
|
ARG_32(2),
|
|
JEQ32(kMadv_WipeOnFork, ALLOW),
|
|
JEQ32(static_cast<uint32_t>(-1), ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowLogForwarding() {
|
|
AllowWrite();
|
|
AllowSystemMalloc();
|
|
AllowTcMalloc();
|
|
|
|
AllowSyscalls({// from logging code
|
|
__NR_clock_gettime,
|
|
// From comms
|
|
__NR_gettid, __NR_close});
|
|
|
|
// For generating stacktraces in logging (e.g. `LOG(FATAL)`)
|
|
AddPolicyOnSyscall(__NR_rt_sigprocmask, {
|
|
ARG_32(0),
|
|
JEQ32(SIG_BLOCK, ALLOW),
|
|
});
|
|
// For LOG(FATAL)
|
|
return AddPolicyOnSyscall(__NR_kill,
|
|
[](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(0),
|
|
JNE32(0, JUMP(&labels, pid_not_null)),
|
|
ARG_32(1),
|
|
JEQ32(SIGABRT, ALLOW),
|
|
LABEL(&labels, pid_not_null),
|
|
};
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowFutexOp(int op) {
|
|
return AddPolicyOnSyscall(
|
|
__NR_futex, {
|
|
ARG_32(1),
|
|
// a <- a & FUTEX_CMD_MASK
|
|
BPF_STMT(BPF_ALU + BPF_AND + BPF_K,
|
|
static_cast<uint32_t>(FUTEX_CMD_MASK)),
|
|
JEQ32(static_cast<uint32_t>(op) & FUTEX_CMD_MASK, ALLOW),
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowStaticStartup() {
|
|
AllowGetRlimit();
|
|
AllowSyscalls({
|
|
// These syscalls take a pointer, so no restriction.
|
|
__NR_uname, __NR_brk, __NR_set_tid_address,
|
|
|
|
#if defined(__ARM_NR_set_tls)
|
|
// libc sets the TLS during startup
|
|
__ARM_NR_set_tls,
|
|
#endif
|
|
|
|
// This syscall takes a pointer and a length.
|
|
// We could restrict length, but it might change, so not worth it.
|
|
__NR_set_robust_list,
|
|
});
|
|
|
|
AllowFutexOp(FUTEX_WAIT_BITSET);
|
|
|
|
AddPolicyOnSyscall(__NR_rt_sigaction,
|
|
{
|
|
ARG_32(0),
|
|
// This is real-time signals used internally by libc.
|
|
JEQ32(__SIGRTMIN + 0, ALLOW),
|
|
JEQ32(__SIGRTMIN + 1, ALLOW),
|
|
});
|
|
|
|
AddPolicyOnSyscall(__NR_rt_sigprocmask, {
|
|
ARG_32(0),
|
|
JEQ32(SIG_UNBLOCK, ALLOW),
|
|
});
|
|
|
|
#ifdef SAPI_X86_64
|
|
// The second argument is a pointer.
|
|
AddPolicyOnSyscall(__NR_arch_prctl, {
|
|
ARG_32(0),
|
|
JEQ32(ARCH_SET_FS, ALLOW),
|
|
});
|
|
#endif
|
|
|
|
if constexpr (sapi::host_cpu::IsArm64()) {
|
|
BlockSyscallWithErrno(__NR_readlinkat, ENOENT);
|
|
}
|
|
#ifdef __NR_readlink
|
|
BlockSyscallWithErrno(__NR_readlink, ENOENT);
|
|
#endif
|
|
|
|
if constexpr (sapi::host_cpu::IsArm()) {
|
|
AddPolicyOnSyscall(__NR_mprotect, {
|
|
ARG_32(2),
|
|
JEQ32(PROT_READ, ALLOW),
|
|
});
|
|
}
|
|
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowDynamicStartup() {
|
|
AllowRead();
|
|
AllowStat();
|
|
AllowSyscalls({__NR_lseek,
|
|
#ifdef __NR__llseek
|
|
__NR__llseek, // Newer glibc on PPC
|
|
#endif
|
|
__NR_close, __NR_munmap});
|
|
AddPolicyOnSyscall(__NR_mprotect, {
|
|
ARG_32(2),
|
|
JEQ32(PROT_READ, ALLOW),
|
|
JEQ32(PROT_NONE, ALLOW),
|
|
JEQ32(PROT_READ | PROT_WRITE, ALLOW),
|
|
JEQ32(PROT_READ | PROT_EXEC, ALLOW),
|
|
});
|
|
AllowStaticStartup();
|
|
|
|
return AddPolicyOnMmap([](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(2), // prot
|
|
JEQ32(PROT_READ | PROT_EXEC, JUMP(&labels, prot_exec)),
|
|
JEQ32(PROT_READ | PROT_WRITE, JUMP(&labels, prot_read_write)),
|
|
JNE32(PROT_READ, JUMP(&labels, mmap_end)),
|
|
|
|
// PROT_READ
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_PRIVATE, ALLOW),
|
|
JUMP(&labels, mmap_end),
|
|
|
|
// PROT_READ | PROT_WRITE
|
|
LABEL(&labels, prot_read_write),
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_FILE | MAP_PRIVATE | MAP_FIXED | MAP_DENYWRITE, ALLOW),
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, ALLOW),
|
|
JEQ32(MAP_ANONYMOUS | MAP_PRIVATE, ALLOW),
|
|
JUMP(&labels, mmap_end),
|
|
|
|
// PROT_READ | PROT_EXEC
|
|
LABEL(&labels, prot_exec),
|
|
ARG_32(3), // flags
|
|
JEQ32(MAP_FILE | MAP_PRIVATE | MAP_DENYWRITE, ALLOW),
|
|
|
|
LABEL(&labels, mmap_end),
|
|
};
|
|
});
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscall(unsigned int num,
|
|
BpfInitializer policy) {
|
|
return AddPolicyOnSyscalls({num}, policy);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscall(
|
|
unsigned int num, const std::vector<sock_filter>& policy) {
|
|
return AddPolicyOnSyscalls({num}, policy);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscall(unsigned int num, BpfFunc f) {
|
|
return AddPolicyOnSyscalls({num}, f);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscalls(
|
|
SyscallInitializer nums, const std::vector<sock_filter>& policy) {
|
|
auto resolved_policy =
|
|
ResolveBpfFunc(
|
|
[nums, policy](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
std::vector<sock_filter> out;
|
|
out.reserve(nums.size() + policy.size());
|
|
for (auto num : nums) {
|
|
out.insert(out.end(), {SYSCALL(num, JUMP(&labels, do_policy_l))});
|
|
}
|
|
out.insert(out.end(), {JUMP(&labels, dont_do_policy_l),
|
|
LABEL(&labels, do_policy_l)});
|
|
for (const auto& filter : policy) {
|
|
// Syscall arch is expected as TRACE value
|
|
if (filter.code == (BPF_RET | BPF_K) &&
|
|
(filter.k & SECCOMP_RET_ACTION) == SECCOMP_RET_TRACE &&
|
|
(filter.k & SECCOMP_RET_DATA) != Syscall::GetHostArch()) {
|
|
LOG(WARNING)
|
|
<< "SANDBOX2_TRACE should be used in policy instead of "
|
|
"TRACE(value)";
|
|
out.push_back(SANDBOX2_TRACE);
|
|
} else {
|
|
out.push_back(filter);
|
|
}
|
|
}
|
|
out.push_back(LOAD_SYSCALL_NR);
|
|
out.insert(out.end(), {LABEL(&labels, dont_do_policy_l)});
|
|
return out;
|
|
});
|
|
// Pre-/Postcondition: Syscall number loaded into A register
|
|
user_policy_.insert(user_policy_.end(), resolved_policy.begin(),
|
|
resolved_policy.end());
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscalls(SyscallInitializer nums,
|
|
BpfInitializer policy) {
|
|
std::vector<sock_filter> policy_vector(policy);
|
|
return AddPolicyOnSyscalls(nums, policy_vector);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnSyscalls(SyscallInitializer nums,
|
|
BpfFunc f) {
|
|
return AddPolicyOnSyscalls(nums, ResolveBpfFunc(f));
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnMmap(BpfInitializer policy) {
|
|
return AddPolicyOnSyscalls(kMmapSyscalls, policy);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnMmap(
|
|
const std::vector<sock_filter>& policy) {
|
|
return AddPolicyOnSyscalls(kMmapSyscalls, policy);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddPolicyOnMmap(BpfFunc f) {
|
|
return AddPolicyOnSyscalls(kMmapSyscalls, f);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::DangerDefaultAllowAll() {
|
|
user_policy_.push_back(ALLOW);
|
|
return *this;
|
|
}
|
|
|
|
absl::StatusOr<std::string> PolicyBuilder::ValidateAbsolutePath(
|
|
absl::string_view path) {
|
|
if (!file::IsAbsolutePath(path)) {
|
|
return absl::InvalidArgumentError(
|
|
absl::StrCat("Path is not absolute: '", path, "'"));
|
|
}
|
|
return ValidatePath(path);
|
|
}
|
|
|
|
absl::StatusOr<std::string> PolicyBuilder::ValidatePath(
|
|
absl::string_view path) {
|
|
std::string fixed_path = file::CleanPath(path);
|
|
if (fixed_path != path) {
|
|
return absl::InvalidArgumentError(absl::StrCat(
|
|
"Path was not normalized. '", path, "' != '", fixed_path, "'"));
|
|
}
|
|
return fixed_path;
|
|
}
|
|
|
|
std::vector<sock_filter> PolicyBuilder::ResolveBpfFunc(BpfFunc f) {
|
|
bpf_labels l = {0};
|
|
|
|
std::vector<sock_filter> policy = f(l);
|
|
if (bpf_resolve_jumps(&l, policy.data(), policy.size()) != 0) {
|
|
SetError(absl::InternalError("Cannot resolve bpf jumps"));
|
|
}
|
|
|
|
return policy;
|
|
}
|
|
|
|
absl::StatusOr<std::unique_ptr<Policy>> PolicyBuilder::TryBuild() {
|
|
auto output = absl::WrapUnique(new Policy());
|
|
|
|
if (!last_status_.ok()) {
|
|
return last_status_;
|
|
}
|
|
|
|
if (already_built_) {
|
|
return absl::FailedPreconditionError("Can only build policy once.");
|
|
}
|
|
|
|
if (use_namespaces_) {
|
|
if (allow_unrestricted_networking_ && hostname_ != kDefaultHostname) {
|
|
return absl::FailedPreconditionError(
|
|
"Cannot set hostname without network namespaces.");
|
|
}
|
|
output->SetNamespace(absl::make_unique<Namespace>(
|
|
allow_unrestricted_networking_, std::move(mounts_), hostname_));
|
|
} else {
|
|
// Not explicitly disabling them here as this is a technical limitation in
|
|
// our stack trace collection functionality.
|
|
LOG(WARNING) << "Using policy without namespaces, disabling stack traces on"
|
|
<< " crash";
|
|
}
|
|
|
|
output->collect_stacktrace_on_signal_ = collect_stacktrace_on_signal_;
|
|
output->collect_stacktrace_on_violation_ = collect_stacktrace_on_violation_;
|
|
output->collect_stacktrace_on_timeout_ = collect_stacktrace_on_timeout_;
|
|
output->collect_stacktrace_on_kill_ = collect_stacktrace_on_kill_;
|
|
output->user_policy_ = std::move(user_policy_);
|
|
output->user_policy_handles_bpf_ = user_policy_handles_bpf_;
|
|
|
|
auto pb_description = absl::make_unique<PolicyBuilderDescription>();
|
|
|
|
StoreDescription(pb_description.get());
|
|
output->policy_builder_description_ = std::move(pb_description);
|
|
output->allowed_hosts_ = std::move(allowed_hosts_);
|
|
already_built_ = true;
|
|
return std::move(output);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddFile(absl::string_view path, bool is_ro) {
|
|
return AddFileAt(path, path, is_ro);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::SetError(const absl::Status& status) {
|
|
LOG(ERROR) << status;
|
|
last_status_ = status;
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddFileAt(absl::string_view outside,
|
|
absl::string_view inside, bool is_ro) {
|
|
EnableNamespaces();
|
|
|
|
auto fixed_outside_or = ValidateAbsolutePath(outside);
|
|
if (!fixed_outside_or.ok()) {
|
|
SetError(fixed_outside_or.status());
|
|
return *this;
|
|
}
|
|
auto fixed_outside = std::move(fixed_outside_or).value();
|
|
|
|
if (absl::StartsWith(fixed_outside, "/proc/self") &&
|
|
fixed_outside != "/proc/self/cpuset") {
|
|
SetError(absl::InvalidArgumentError(
|
|
absl::StrCat("Cannot add /proc/self mounts, you need to mount the "
|
|
"whole /proc instead. You tried to mount ",
|
|
outside)));
|
|
return *this;
|
|
}
|
|
|
|
if (auto status = mounts_.AddFileAt(fixed_outside, inside, is_ro);
|
|
!status.ok()) {
|
|
SetError(
|
|
absl::InternalError(absl::StrCat("Could not add file ", outside, " => ",
|
|
inside, ": ", status.message())));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddLibrariesForBinary(
|
|
absl::string_view path, absl::string_view ld_library_path) {
|
|
EnableNamespaces();
|
|
|
|
auto fixed_path_or = ValidatePath(path);
|
|
if (!fixed_path_or.ok()) {
|
|
SetError(fixed_path_or.status());
|
|
return *this;
|
|
}
|
|
auto fixed_path = std::move(fixed_path_or).value();
|
|
|
|
if (auto status = mounts_.AddMappingsForBinary(fixed_path, ld_library_path);
|
|
!status.ok()) {
|
|
SetError(absl::InternalError(absl::StrCat(
|
|
"Could not add libraries for ", fixed_path, ": ", status.message())));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddLibrariesForBinary(
|
|
int fd, absl::string_view ld_library_path) {
|
|
return AddLibrariesForBinary(absl::StrCat("/proc/self/fd/", fd),
|
|
ld_library_path);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddDirectory(absl::string_view path, bool is_ro) {
|
|
return AddDirectoryAt(path, path, is_ro);
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddDirectoryAt(absl::string_view outside,
|
|
absl::string_view inside,
|
|
bool is_ro) {
|
|
EnableNamespaces();
|
|
|
|
auto fixed_outside_or = ValidateAbsolutePath(outside);
|
|
if (!fixed_outside_or.ok()) {
|
|
SetError(fixed_outside_or.status());
|
|
return *this;
|
|
}
|
|
auto fixed_outside = std::move(fixed_outside_or).value();
|
|
if (absl::StartsWith(fixed_outside, "/proc/self")) {
|
|
SetError(absl::InvalidArgumentError(
|
|
absl::StrCat("Cannot add /proc/self mounts, you need to mount the "
|
|
"whole /proc instead. You tried to mount ",
|
|
outside)));
|
|
return *this;
|
|
}
|
|
|
|
if (auto status = mounts_.AddDirectoryAt(fixed_outside, inside, is_ro);
|
|
!status.ok()) {
|
|
SetError(absl::InternalError(absl::StrCat("Could not add directory ",
|
|
outside, " => ", inside, ": ",
|
|
status.message())));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddTmpfs(absl::string_view inside, size_t sz) {
|
|
EnableNamespaces();
|
|
|
|
if (auto status = mounts_.AddTmpfs(inside, sz); !status.ok()) {
|
|
SetError(absl::InternalError(absl::StrCat("Could not mount tmpfs ", inside,
|
|
": ", status.message())));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowUnrestrictedNetworking() {
|
|
EnableNamespaces();
|
|
allow_unrestricted_networking_ = true;
|
|
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::SetHostname(absl::string_view hostname) {
|
|
EnableNamespaces();
|
|
hostname_ = std::string(hostname);
|
|
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::CollectStacktracesOnViolation(bool enable) {
|
|
collect_stacktrace_on_violation_ = enable;
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::CollectStacktracesOnSignal(bool enable) {
|
|
collect_stacktrace_on_signal_ = enable;
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::CollectStacktracesOnTimeout(bool enable) {
|
|
collect_stacktrace_on_timeout_ = enable;
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::CollectStacktracesOnKill(bool enable) {
|
|
collect_stacktrace_on_kill_ = enable;
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddNetworkProxyPolicy() {
|
|
if (allowed_hosts_) {
|
|
SetError(absl::FailedPreconditionError(
|
|
"AddNetworkProxyPolicy or AddNetworkProxyHandlerPolicy can be called "
|
|
"at most once"));
|
|
return *this;
|
|
}
|
|
|
|
allowed_hosts_ = AllowedHosts();
|
|
|
|
AllowFutexOp(FUTEX_WAKE);
|
|
AllowFutexOp(FUTEX_WAIT);
|
|
AllowFutexOp(FUTEX_WAIT_BITSET);
|
|
AllowSyscalls({
|
|
#ifdef __NR_dup2
|
|
__NR_dup2,
|
|
#endif
|
|
__NR_recvmsg,
|
|
__NR_close,
|
|
__NR_gettid,
|
|
});
|
|
AddPolicyOnSyscall(__NR_socket, {
|
|
ARG_32(0),
|
|
JEQ32(AF_INET, ALLOW),
|
|
JEQ32(AF_INET6, ALLOW),
|
|
});
|
|
AddPolicyOnSyscall(__NR_getsockopt,
|
|
[](bpf_labels& labels) -> std::vector<sock_filter> {
|
|
return {
|
|
ARG_32(1),
|
|
JNE32(SOL_SOCKET, JUMP(&labels, getsockopt_end)),
|
|
ARG_32(2),
|
|
JEQ32(SO_TYPE, ALLOW),
|
|
LABEL(&labels, getsockopt_end),
|
|
};
|
|
});
|
|
#ifdef SAPI_PPC64_LE
|
|
AddPolicyOnSyscall(__NR_socketcall, {
|
|
ARG_32(0),
|
|
JEQ32(SYS_SOCKET, ALLOW),
|
|
JEQ32(SYS_GETSOCKOPT, ALLOW),
|
|
JEQ32(SYS_RECVMSG, ALLOW),
|
|
});
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AddNetworkProxyHandlerPolicy() {
|
|
AddNetworkProxyPolicy();
|
|
AllowSyscall(__NR_rt_sigreturn);
|
|
|
|
AddPolicyOnSyscall(__NR_rt_sigaction, {
|
|
ARG_32(0),
|
|
JEQ32(SIGSYS, ALLOW),
|
|
});
|
|
|
|
AddPolicyOnSyscall(__NR_rt_sigprocmask, {
|
|
ARG_32(0),
|
|
JEQ32(SIG_UNBLOCK, ALLOW),
|
|
});
|
|
|
|
AddPolicyOnSyscall(__NR_connect, {TRAP(0)});
|
|
#ifdef SAPI_PPC64_LE
|
|
AddPolicyOnSyscall(__NR_socketcall, {
|
|
ARG_32(0),
|
|
JEQ32(SYS_CONNECT, TRAP(0)),
|
|
});
|
|
#endif
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::SetRootWritable() {
|
|
EnableNamespaces();
|
|
mounts_.SetRootWritable();
|
|
|
|
return *this;
|
|
}
|
|
|
|
void PolicyBuilder::StoreDescription(PolicyBuilderDescription* pb_description) {
|
|
for (const auto& handled_syscall : handled_syscalls_) {
|
|
pb_description->add_handled_syscalls(handled_syscall);
|
|
}
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowIPv4(const std::string& ip_and_mask,
|
|
uint32_t port) {
|
|
if (!allowed_hosts_) {
|
|
SetError(absl::FailedPreconditionError(
|
|
"AddNetworkProxyPolicy or AddNetworkProxyHandlerPolicy must be called "
|
|
"before adding IP rules"));
|
|
return *this;
|
|
}
|
|
|
|
absl::Status status = allowed_hosts_->AllowIPv4(ip_and_mask, port);
|
|
if (!status.ok()) {
|
|
SetError(status);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
PolicyBuilder& PolicyBuilder::AllowIPv6(const std::string& ip_and_mask,
|
|
uint32_t port) {
|
|
if (!allowed_hosts_) {
|
|
SetError(absl::FailedPreconditionError(
|
|
"AddNetworkProxyPolicy or AddNetworkProxyHandlerPolicy must be called "
|
|
"before adding IP rules"));
|
|
return *this;
|
|
}
|
|
|
|
absl::Status status = allowed_hosts_->AllowIPv6(ip_and_mask, port);
|
|
if (!status.ok()) {
|
|
SetError(status);
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
} // namespace sandbox2
|