Seccomp_unotify based monitor

The unotify-based monitor should bring big performance wins
when the sandboxee makes heavy use of threads or signals.
Some features are not supported in this mode:
- execveat is always allowed, rather than only the initial call
- stack traces are not collected on normal exit or when the process is terminated by a signal

PiperOrigin-RevId: 515040101
Change-Id: Ia5574d34b4ff7e91e3601edb8c9cb913e011fbf6
This commit is contained in:
Wiktor Garbacz 2023-03-08 08:08:35 -08:00 committed by Copybara-Service
parent 80cc894c39
commit 0d3d5d4bcb
23 changed files with 867 additions and 110 deletions

View File

@ -347,11 +347,13 @@ cc_library(
":comms", ":comms",
":executor", ":executor",
":fork_client", ":fork_client",
":forkserver_cc_proto",
":ipc", ":ipc",
":limits", ":limits",
":logsink", ":logsink",
":monitor_base", ":monitor_base",
":monitor_ptrace", ":monitor_ptrace",
":monitor_unotify",
":mounts", ":mounts",
":namespace", ":namespace",
":notify", ":notify",
@ -444,6 +446,29 @@ cc_library(
], ],
) )
cc_library(
name = "monitor_unotify",
srcs = ["monitor_unotify.cc"],
hdrs = ["monitor_unotify.h"],
copts = sapi_platform_copts(),
deps = [
":client",
":executor",
":forkserver_cc_proto",
":monitor_base",
":notify",
":policy",
"//sandboxed_api/util:fileops",
"//sandboxed_api/util:raw_logging",
"@com_google_absl//absl/cleanup",
"@com_google_absl//absl/log",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/synchronization",
"@com_google_absl//absl/time",
],
)
cc_library( cc_library(
name = "monitor_base", name = "monitor_base",
srcs = ["monitor_base.cc"], srcs = ["monitor_base.cc"],
@ -454,6 +479,7 @@ cc_library(
":comms", ":comms",
":executor", ":executor",
":fork_client", ":fork_client",
":forkserver_cc_proto",
":ipc", ":ipc",
":limits", ":limits",
":mounts", ":mounts",
@ -522,8 +548,11 @@ cc_library(
deps = [ deps = [
":comms", ":comms",
":logsink", ":logsink",
":policy",
":sanitizer", ":sanitizer",
":syscall",
"//sandboxed_api/sandbox2/network_proxy:client", "//sandboxed_api/sandbox2/network_proxy:client",
"//sandboxed_api/sandbox2/util:bpf_helper",
"//sandboxed_api/util:raw_logging", "//sandboxed_api/util:raw_logging",
"//sandboxed_api/util:strerror", "//sandboxed_api/util:strerror",
"@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:core_headers",
@ -593,6 +622,7 @@ cc_library(
deps = [ deps = [
":comms", ":comms",
":forkserver_cc_proto", ":forkserver_cc_proto",
"//sandboxed_api/util:fileops",
"@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/log", "@com_google_absl//absl/log",
"@com_google_absl//absl/log:check", "@com_google_absl//absl/log:check",

View File

@ -294,8 +294,6 @@ target_link_libraries(sandbox2_executor
# sandboxed_api/sandbox2:sandbox2 # sandboxed_api/sandbox2:sandbox2
add_library(sandbox2_sandbox2 ${SAPI_LIB_TYPE} add_library(sandbox2_sandbox2 ${SAPI_LIB_TYPE}
monitor_ptrace.cc
monitor_ptrace.h
sandbox2.cc sandbox2.cc
sandbox2.h sandbox2.h
stack_trace.cc stack_trace.cc
@ -309,6 +307,9 @@ target_link_libraries(sandbox2_sandbox2
absl::optional absl::optional
absl::str_format absl::str_format
absl::strings absl::strings
sandbox2::forkserver_proto
sandbox2::monitor_ptrace
sandbox2::monitor_unotify
sapi::base sapi::base
PUBLIC absl::flat_hash_map PUBLIC absl::flat_hash_map
absl::status absl::status
@ -326,7 +327,6 @@ target_link_libraries(sandbox2_sandbox2
sandbox2::limits sandbox2::limits
sandbox2::logsink sandbox2::logsink
sandbox2::monitor_base sandbox2::monitor_base
sandbox2::monitor_ptrace
sandbox2::mounts sandbox2::mounts
sandbox2::mount_tree_proto sandbox2::mount_tree_proto
sandbox2::namespace sandbox2::namespace
@ -443,6 +443,31 @@ target_link_libraries(sandbox2_monitor_ptrace
sapi::raw_logging sapi::raw_logging
) )
# sandboxed_api/sandbox2:monitor_unotify
add_library(sandbox2_monitor_unotify ${SAPI_LIB_TYPE}
monitor_unotify.cc
monitor_unotify.h
)
add_library(sandbox2::monitor_unotify ALIAS sandbox2_monitor_unotify)
target_link_libraries(sandbox2_monitor_unotify
PRIVATE absl::cleanup
absl::log
absl::status
absl::time
sapi::base
sandbox2::client
sandbox2::forkserver_proto
sapi::fileops
sapi::raw_logging
PUBLIC sandbox2::executor
sandbox2::monitor_base
sandbox2::notify
sandbox2::policy
absl::statusor
absl::synchronization
sapi::raw_logging
)
# sandboxed_api/sandbox2:policybuilder # sandboxed_api/sandbox2:policybuilder
add_library(sandbox2_policybuilder ${SAPI_LIB_TYPE} add_library(sandbox2_policybuilder ${SAPI_LIB_TYPE}
policybuilder.cc policybuilder.cc
@ -479,7 +504,10 @@ add_library(sandbox2::client ALIAS sandbox2_client)
target_link_libraries(sandbox2_client target_link_libraries(sandbox2_client
PRIVATE absl::core_headers PRIVATE absl::core_headers
absl::strings absl::strings
sandbox2::bpf_helper
sandbox2::policy
sandbox2::sanitizer sandbox2::sanitizer
sandbox2::syscall
sapi::strerror sapi::strerror
sapi::base sapi::base
sapi::raw_logging sapi::raw_logging
@ -546,12 +574,13 @@ add_library(sandbox2_fork_client ${SAPI_LIB_TYPE}
fork_client.h fork_client.h
) )
add_library(sandbox2::fork_client ALIAS sandbox2_fork_client) add_library(sandbox2::fork_client ALIAS sandbox2_fork_client)
target_link_libraries(sandbox2_fork_client PRIVATE target_link_libraries(sandbox2_fork_client
absl::core_headers PRIVATE sandbox2::comms
absl::synchronization sandbox2::forkserver_proto
sandbox2::comms PUBLIC absl::core_headers
sandbox2::forkserver_proto absl::synchronization
sapi::base sapi::base
sapi::fileops
) )
# sandboxed_api/sandbox2:mounts # sandboxed_api/sandbox2:mounts

View File

@ -23,6 +23,7 @@
#include <syscall.h> #include <syscall.h>
#include <unistd.h> #include <unistd.h>
#include <atomic>
#include <cinttypes> #include <cinttypes>
#include <climits> #include <climits>
#include <cstddef> #include <cstddef>
@ -30,6 +31,7 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <memory> #include <memory>
#include <thread> // NOLINT(build/c++11)
#include <utility> #include <utility>
#include "absl/base/attributes.h" #include "absl/base/attributes.h"
@ -40,14 +42,91 @@
#include "absl/strings/str_join.h" #include "absl/strings/str_join.h"
#include "absl/strings/str_split.h" #include "absl/strings/str_split.h"
#include "sandboxed_api/sandbox2/comms.h" #include "sandboxed_api/sandbox2/comms.h"
#include "sandboxed_api/sandbox2/policy.h"
#include "sandboxed_api/sandbox2/sanitizer.h" #include "sandboxed_api/sandbox2/sanitizer.h"
#include "sandboxed_api/sandbox2/syscall.h"
#include "sandboxed_api/sandbox2/util/bpf_helper.h"
#include "sandboxed_api/util/raw_logging.h" #include "sandboxed_api/util/raw_logging.h"
#include "sandboxed_api/util/strerror.h" #include "sandboxed_api/util/strerror.h"
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
namespace sandbox2 { namespace sandbox2 {
namespace {
using ::sapi::StrError; using ::sapi::StrError;
// Installs the policy `prog` with SECCOMP_FILTER_FLAG_NEW_LISTENER, passes the
// value returned by that seccomp() call over `comms` as a file descriptor, and
// finally installs a second, small filter with SECCOMP_FILTER_FLAG_TSYNC.
//
// Any failure along the way aborts through the SAPI_RAW_*CHECK macros; the
// function only returns once the final seccomp() call has returned 0.
void InitSeccompUnotify(sock_fprog prog, Comms* comms) {
  // The policy might not allow sending the notify FD.
  // Create a separate thread that won't get the seccomp policy to send the FD.
  // Synchronize with it using plain atomics + seccomp TSYNC, so we don't need
  // any additional syscalls.
  // Both atomics start at -1, which the spin loops below treat as "not yet
  // published".
  std::atomic<int> fd(-1);
  std::atomic<int> tid(-1);
  std::thread th([comms, &fd, &tid]() {
    // Busy-wait until the main thread publishes the listener FD.
    int notify_fd = -1;
    while (notify_fd == -1) {
      notify_fd = fd.load(std::memory_order_seq_cst);
    }
    SAPI_RAW_CHECK(comms->SendFD(notify_fd), "sending unotify fd");
    SAPI_RAW_CHECK(close(notify_fd) == 0, "closing unotify fd");
    // Install a one-instruction filter on this helper thread only (flags == 0).
    // NOTE(review): presumably this makes the helper's filter state differ
    // from the main thread's so that the TSYNC call below cannot succeed while
    // the helper is still alive -- confirm against seccomp(2).
    sock_filter filter = ALLOW;
    struct sock_fprog allow_prog = {
        .len = 1,
        .filter = &filter,
    };
    int result = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0,
                         reinterpret_cast<uintptr_t>(&allow_prog));
    SAPI_RAW_PCHECK(result != -1, "setting seccomp filter");
    // Publish this thread's id as the last action; the thread function returns
    // right after this store and does not touch `fd`/`tid` again.
    tid.store(syscall(__NR_gettid), std::memory_order_seq_cst);
  });
  th.detach();
  // Install the real policy on the calling thread. The non-negative return
  // value is what the helper thread sends as the notify FD.
  int result = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                       SECCOMP_FILTER_FLAG_NEW_LISTENER,
                       reinterpret_cast<uintptr_t>(&prog));
  SAPI_RAW_PCHECK(result != -1, "setting seccomp filter");
  fd.store(result, std::memory_order_seq_cst);
  // Busy-wait until the helper thread has sent the FD and published its id.
  pid_t child = -1;
  while (child == -1) {
    child = tid.load(std::memory_order_seq_cst);
  }
  // Apply seccomp.
  // NOTE(review): LOAD_ARCH, JNE32, LOAD_SYSCALL_NR, ARG_32, ALLOW and DENY are
  // defined outside this view (bpf_helper.h). Reading the jump offsets as
  // written: a non-host architecture is ALLOWed, any syscall other than
  // seccomp is ALLOWed, and a seccomp call whose argument at index 3 equals
  // internal::kExecveMagic reaches DENY -- confirm the intended targets.
  struct sock_filter code[] = {
      LOAD_ARCH,
      JNE32(sandbox2::Syscall::GetHostAuditArch(), ALLOW),
      LOAD_SYSCALL_NR,
      BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_seccomp, 0, 3),
      ARG_32(3),
      BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, internal::kExecveMagic, 0, 1),
      DENY,
      ALLOW,
  };
  // `prog` is a by-value copy, so reusing it for the second filter does not
  // affect the caller's sock_fprog.
  prog.len = ABSL_ARRAYSIZE(code);
  prog.filter = code;
  // Retry for as long as the call returns the helper thread's id.
  // NOTE(review): with SECCOMP_FILTER_FLAG_TSYNC a positive return value is
  // presumably the id of a thread that could not be synchronized, so this loop
  // spins until the helper thread has exited -- confirm against seccomp(2).
  do {
    result = syscall(
        __NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
        reinterpret_cast<uintptr_t>(&prog), internal::kExecveMagic);
  } while (result == child);
  SAPI_RAW_CHECK(result == 0, "Enabling seccomp filter");
}
// Installs `prog` as the seccomp filter using SECCOMP_FILTER_FLAG_TSYNC.
// Aborts if the syscall fails (-1) or returns any other non-zero value, which
// is reported as the id of the thread that could not be synchronized.
void InitSeccompRegular(sock_fprog prog) {
  const uintptr_t prog_addr = reinterpret_cast<uintptr_t>(&prog);
  const int rc = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                         SECCOMP_FILTER_FLAG_TSYNC, prog_addr);
  // A hard failure of the syscall itself.
  SAPI_RAW_PCHECK(rc != -1, "setting seccomp filter");
  // Anything other than 0 at this point is logged as a thread id.
  SAPI_RAW_PCHECK(rc == 0,
                  "synchronizing threads using SECCOMP_FILTER_FLAG_TSYNC flag "
                  "for thread=%d",
                  rc);
}
} // namespace
Client::Client(Comms* comms) : comms_(comms) { Client::Client(Comms* comms) : comms_(comms) {
char* fdmap_envvar = getenv(kFDMapEnvVar); char* fdmap_envvar = getenv(kFDMapEnvVar);
if (!fdmap_envvar) { if (!fdmap_envvar) {
@ -247,17 +326,13 @@ void Client::ApplyPolicyAndBecomeTracee() {
uint32_t ret; // wait for confirmation uint32_t ret; // wait for confirmation
SAPI_RAW_CHECK(comms_->RecvUint32(&ret), SAPI_RAW_CHECK(comms_->RecvUint32(&ret),
"receving confirmation from executor"); "receving confirmation from executor");
SAPI_RAW_CHECK(ret == kSandbox2ClientDone, if (ret == kSandbox2ClientUnotify) {
"invalid confirmation from executor"); InitSeccompUnotify(prog, comms_);
} else {
int result = SAPI_RAW_CHECK(ret == kSandbox2ClientDone,
syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, "invalid confirmation from executor");
reinterpret_cast<uintptr_t>(&prog)); InitSeccompRegular(prog);
SAPI_RAW_PCHECK(result != -1, "setting seccomp filter"); }
SAPI_RAW_PCHECK(result == 0,
"synchronizing threads using SECCOMP_FILTER_FLAG_TSYNC flag "
"for thread=%d",
result);
} }
int Client::GetMappedFD(const std::string& name) { int Client::GetMappedFD(const std::string& name) {

View File

@ -35,6 +35,8 @@ class Client {
static constexpr uint32_t kClient2SandboxReady = 0x0A0B0C01; static constexpr uint32_t kClient2SandboxReady = 0x0A0B0C01;
// Sandbox is ready to monitor the sandboxee. // Sandbox is ready to monitor the sandboxee.
static constexpr uint32_t kSandbox2ClientDone = 0x0A0B0C02; static constexpr uint32_t kSandbox2ClientDone = 0x0A0B0C02;
// Sandboxee should set up seccomp_unotify and send back the FD.
static constexpr uint32_t kSandbox2ClientUnotify = 0x0A0B0C03;
explicit Client(Comms* comms); explicit Client(Comms* comms);

View File

@ -82,8 +82,9 @@ std::vector<std::string> Executor::CopyEnviron() {
return util::CharPtrArray(environ).ToStringVector(); return util::CharPtrArray(environ).ToStringVector();
} }
absl::StatusOr<SandboxeeProcess> Executor::StartSubProcess( absl::StatusOr<SandboxeeProcess> Executor::StartSubProcess(int32_t clone_flags,
int32_t clone_flags, const Namespace* ns) { const Namespace* ns,
MonitorType type) {
if (started_) { if (started_) {
return absl::FailedPreconditionError( return absl::FailedPreconditionError(
"This executor has already been started"); "This executor has already been started");
@ -149,6 +150,7 @@ absl::StatusOr<SandboxeeProcess> Executor::StartSubProcess(
} }
request.set_clone_flags(clone_flags); request.set_clone_flags(clone_flags);
request.set_monitor_type(type);
SandboxeeProcess process; SandboxeeProcess process;

View File

@ -29,6 +29,7 @@
#include "absl/strings/string_view.h" #include "absl/strings/string_view.h"
#include "absl/types/span.h" #include "absl/types/span.h"
#include "sandboxed_api/sandbox2/fork_client.h" #include "sandboxed_api/sandbox2/fork_client.h"
#include "sandboxed_api/sandbox2/forkserver.pb.h"
#include "sandboxed_api/sandbox2/ipc.h" #include "sandboxed_api/sandbox2/ipc.h"
#include "sandboxed_api/sandbox2/limits.h" #include "sandboxed_api/sandbox2/limits.h"
#include "sandboxed_api/sandbox2/namespace.h" #include "sandboxed_api/sandbox2/namespace.h"
@ -120,7 +121,8 @@ class Executor final {
// Comms channel. // Comms channel.
// For clone_flags refer to Linux' 'man 2 clone'. // For clone_flags refer to Linux' 'man 2 clone'.
absl::StatusOr<SandboxeeProcess> StartSubProcess( absl::StatusOr<SandboxeeProcess> StartSubProcess(
int clone_flags, const Namespace* ns = nullptr); int clone_flags, const Namespace* ns = nullptr,
MonitorType type = FORKSERVER_MONITOR_PTRACE);
// Whether the Executor has been started yet // Whether the Executor has been started yet
bool started_ = false; bool started_ = false;

View File

@ -21,12 +21,11 @@
namespace sandbox2 { namespace sandbox2 {
using ::sapi::file_util::fileops::FDCloser;
SandboxeeProcess ForkClient::SendRequest(const ForkRequest& request, SandboxeeProcess ForkClient::SendRequest(const ForkRequest& request,
int exec_fd, int comms_fd) { int exec_fd, int comms_fd) {
SandboxeeProcess process = { SandboxeeProcess process;
.init_pid = -1,
.main_pid = -1,
};
// Acquire the channel ownership for this request (transaction). // Acquire the channel ownership for this request (transaction).
absl::MutexLock l(&comms_mutex_); absl::MutexLock l(&comms_mutex_);
@ -64,6 +63,14 @@ SandboxeeProcess ForkClient::SendRequest(const ForkRequest& request,
return process; return process;
} }
process.main_pid = static_cast<pid_t>(pid); process.main_pid = static_cast<pid_t>(pid);
if (request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
int fd = -1;
if (!comms_->RecvFD(&fd)) {
LOG(ERROR) << "Receiving status fd from the ForkServer failed";
return process;
}
process.status_fd = FDCloser(fd);
}
return process; return process;
} }

View File

@ -19,6 +19,7 @@
#include "absl/base/thread_annotations.h" #include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h" #include "absl/synchronization/mutex.h"
#include "sandboxed_api/util/fileops.h"
namespace sandbox2 { namespace sandbox2 {
@ -31,6 +32,7 @@ class ForkRequest;
struct SandboxeeProcess { struct SandboxeeProcess {
pid_t init_pid = -1; pid_t init_pid = -1;
pid_t main_pid = -1; pid_t main_pid = -1;
sapi::file_util::fileops::FDCloser status_fd;
}; };
class ForkClient { class ForkClient {

View File

@ -110,34 +110,43 @@ void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
} }
} }
void RunInitProcess(const absl::flat_hash_set<int>& open_fds) { void RunInitProcess(pid_t main_pid, int pipe_fd,
const absl::flat_hash_set<int>& open_fds) {
if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) { if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')"); SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
} }
// Close all open fds (equals to CloseAllFDsExcept but does not require /proc // Close all open fds (equals to CloseAllFDsExcept but does not require /proc
// to be available). // to be available).
for (const auto& fd : open_fds) { for (const auto& fd : open_fds) {
close(fd); close(fd);
} }
// Clear SA_NOCLDWAIT.
struct sigaction sa;
sa.sa_handler = SIG_DFL;
sa.sa_flags = 0;
sigemptyset(&sa.sa_mask);
SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
"clearing SA_NOCLDWAIT");
// Apply seccomp. // Apply seccomp.
struct sock_filter code[] = { std::vector<sock_filter> code = {
LOAD_ARCH, LOAD_ARCH,
JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY), JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),
LOAD_SYSCALL_NR, LOAD_SYSCALL_NR,
#ifdef __NR_waitpid SYSCALL(__NR_waitid, ALLOW),
SYSCALL(__NR_waitpid, ALLOW),
#endif
SYSCALL(__NR_wait4, ALLOW),
SYSCALL(__NR_exit, ALLOW), SYSCALL(__NR_exit, ALLOW),
SYSCALL(__NR_exit_group, ALLOW),
DENY,
}; };
if (pipe_fd >= 0) {
code.insert(code.end(), {SYSCALL(__NR_write, ALLOW)});
}
code.push_back(DENY);
struct sock_fprog prog {}; struct sock_fprog prog {
prog.len = ABSL_ARRAYSIZE(code); .len = code.size(), .filter = code.data(),
prog.filter = code; };
SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0, SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
"Denying new privs"); "Denying new privs");
@ -147,21 +156,21 @@ void RunInitProcess(const absl::flat_hash_set<int>& open_fds) {
reinterpret_cast<uintptr_t>(&prog)) == 0, reinterpret_cast<uintptr_t>(&prog)) == 0,
"Enabling seccomp filter"); "Enabling seccomp filter");
pid_t pid; siginfo_t info;
int status = 0;
// Reap children. // Reap children.
while (true) { for (;;) {
// Wait until we don't have any children anymore. int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
// We cannot watch for the child pid as ptrace steals our waitpid if (rv != 0) {
// notifications. (See man ptrace / man waitpid).
pid = TEMP_FAILURE_RETRY(waitpid(-1, &status, __WALL));
if (pid < 0) {
if (errno == ECHILD) {
_exit(0);
}
_exit(1); _exit(1);
} }
if (info.si_pid == main_pid) {
if (pipe_fd >= 0) {
write(pipe_fd, &info.si_code, sizeof(info.si_code));
write(pipe_fd, &info.si_status, sizeof(info.si_status));
}
_exit(0);
}
} }
} }
@ -269,7 +278,8 @@ void ForkServer::PrepareExecveArgs(const ForkRequest& request,
void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd, void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
int client_fd, uid_t uid, gid_t gid, int client_fd, uid_t uid, gid_t gid,
int signaling_fd, bool avoid_pivot_root) const { int signaling_fd, int status_fd,
bool avoid_pivot_root) const {
SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED, SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
"Forkserver mode is unspecified"); "Forkserver mode is unspecified");
@ -311,7 +321,13 @@ void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
SAPI_RAW_PLOG(FATAL, "Could not spawn init process"); SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
} }
if (child != 0) { if (child != 0) {
RunInitProcess(*open_fds); if (status_fd >= 0) {
open_fds->erase(status_fd);
}
RunInitProcess(child, status_fd, *open_fds);
}
if (status_fd >= 0) {
close(status_fd);
} }
// Send sandboxee pid // Send sandboxee pid
auto status = SendPid(signaling_fd); auto status = SendPid(signaling_fd);
@ -402,6 +418,11 @@ pid_t ForkServer::ServeRequest() {
uid_t uid = getuid(); uid_t uid = getuid();
uid_t gid = getgid(); uid_t gid = getgid();
int pfds[2] = {-1, -1};
if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating status pipe");
}
int socketpair_fds[2]; int socketpair_fds[2];
SAPI_RAW_PCHECK( SAPI_RAW_PCHECK(
socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0, socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
@ -472,7 +493,7 @@ pid_t ForkServer::ServeRequest() {
// Child. // Child.
if (sandboxee_pid == 0) { if (sandboxee_pid == 0) {
LaunchChild(fork_request, exec_fd, comms_fd, uid, gid, fd_closer1.get(), LaunchChild(fork_request, exec_fd, comms_fd, uid, gid, fd_closer1.get(),
avoid_pivot_root); pfds[1], avoid_pivot_root);
return sandboxee_pid; return sandboxee_pid;
} }
@ -495,6 +516,9 @@ pid_t ForkServer::ServeRequest() {
} }
// Parent. // Parent.
if (pfds[1] >= 0) {
close(pfds[1]);
}
close(comms_fd); close(comms_fd);
if (exec_fd >= 0) { if (exec_fd >= 0) {
close(exec_fd); close(exec_fd);
@ -504,6 +528,11 @@ pid_t ForkServer::ServeRequest() {
SAPI_RAW_CHECK( SAPI_RAW_CHECK(
comms_->SendInt32(sandboxee_pid), comms_->SendInt32(sandboxee_pid),
absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str()); absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());
if (pfds[0] >= 0) {
SAPI_RAW_CHECK(comms_->SendFD(pfds[0]), "Failed to send status pipe");
close(pfds[0]);
}
return sandboxee_pid; return sandboxee_pid;
} }

View File

@ -50,7 +50,7 @@ class ForkServer {
private: private:
// Creates and launched the child process. // Creates and launched the child process.
void LaunchChild(const ForkRequest& request, int execve_fd, int client_fd, void LaunchChild(const ForkRequest& request, int execve_fd, int client_fd,
uid_t uid, gid_t gid, int signaling_fd, uid_t uid, gid_t gid, int signaling_fd, int status_fd,
bool avoid_pivot_root) const; bool avoid_pivot_root) const;
// Prepares the Fork-Server (worker side, not the requester side) for work by // Prepares the Fork-Server (worker side, not the requester side) for work by

View File

@ -33,6 +33,15 @@ enum Mode {
FORKSERVER_FORK_JOIN_SANDBOX_UNWIND = 4; FORKSERVER_FORK_JOIN_SANDBOX_UNWIND = 4;
} }
enum MonitorType {
// Default value
FORKSERVER_MONITOR_UNSPECIFIED = 0;
// Ptrace based monitor
FORKSERVER_MONITOR_PTRACE = 1;
// Seccomp_unotify based monitor
FORKSERVER_MONITOR_UNOTIFY = 2;
}
message ForkRequest { message ForkRequest {
// List of arguments, starting with argv[0] // List of arguments, starting with argv[0]
repeated bytes args = 1; repeated bytes args = 1;
@ -56,4 +65,7 @@ message ForkRequest {
// Changes mount propagation from MS_PRIVATE to MS_SLAVE if set // Changes mount propagation from MS_PRIVATE to MS_SLAVE if set
optional bool allow_mount_propagation = 8; optional bool allow_mount_propagation = 8;
// Monitor type used by the sandbox
optional MonitorType monitor_type = 9;
} }

View File

@ -293,10 +293,7 @@ SandboxeeProcess GlobalForkClient::SendRequest(const ForkRequest& request,
absl::ReleasableMutexLock lock(&GlobalForkClient::instance_mutex_); absl::ReleasableMutexLock lock(&GlobalForkClient::instance_mutex_);
EnsureStartedLocked(GlobalForkserverStartMode::kOnDemand); EnsureStartedLocked(GlobalForkserverStartMode::kOnDemand);
if (!instance_) { if (!instance_) {
return { return SandboxeeProcess();
.init_pid = -1,
.main_pid = -1,
};
} }
SandboxeeProcess process = SandboxeeProcess process =
instance_->fork_client_.SendRequest(request, exec_fd, comms_fd); instance_->fork_client_.SendRequest(request, exec_fd, comms_fd);

View File

@ -196,7 +196,7 @@ void MonitorBase::Launch() {
// Get PID of the sandboxee. // Get PID of the sandboxee.
bool should_have_init = ns && (ns->GetCloneFlags() & CLONE_NEWPID); bool should_have_init = ns && (ns->GetCloneFlags() & CLONE_NEWPID);
absl::StatusOr<SandboxeeProcess> process = absl::StatusOr<SandboxeeProcess> process =
executor_->StartSubProcess(clone_flags, ns); executor_->StartSubProcess(clone_flags, ns, type_);
if (!process.ok()) { if (!process.ok()) {
LOG(ERROR) << "Starting sandboxed subprocess failed: " << process.status(); LOG(ERROR) << "Starting sandboxed subprocess failed: " << process.status();
@ -259,7 +259,7 @@ void MonitorBase::SetExitStatusCode(Result::StatusEnum final_status,
} }
bool MonitorBase::InitSendPolicy() { bool MonitorBase::InitSendPolicy() {
if (!policy_->SendPolicy(comms_)) { if (!policy_->SendPolicy(comms_, type_ == FORKSERVER_MONITOR_UNOTIFY)) {
LOG(ERROR) << "Couldn't send policy"; LOG(ERROR) << "Couldn't send policy";
return false; return false;
} }

View File

@ -32,6 +32,7 @@
#include "sandboxed_api/sandbox2/comms.h" #include "sandboxed_api/sandbox2/comms.h"
#include "sandboxed_api/sandbox2/executor.h" #include "sandboxed_api/sandbox2/executor.h"
#include "sandboxed_api/sandbox2/fork_client.h" #include "sandboxed_api/sandbox2/fork_client.h"
#include "sandboxed_api/sandbox2/forkserver.pb.h"
#include "sandboxed_api/sandbox2/ipc.h" #include "sandboxed_api/sandbox2/ipc.h"
#include "sandboxed_api/sandbox2/network_proxy/server.h" #include "sandboxed_api/sandbox2/network_proxy/server.h"
#include "sandboxed_api/sandbox2/notify.h" #include "sandboxed_api/sandbox2/notify.h"
@ -109,6 +110,8 @@ class MonitorBase {
// Handle to the class responsible for proxying and validating connect() // Handle to the class responsible for proxying and validating connect()
// requests. // requests.
std::unique_ptr<NetworkProxyServer> network_proxy_server_; std::unique_ptr<NetworkProxyServer> network_proxy_server_;
// Monitor type
MonitorType type_ = FORKSERVER_MONITOR_PTRACE;
private: private:
// Sends Policy to the Client. // Sends Policy to the Client.

View File

@ -0,0 +1,351 @@
#include "sandboxed_api/sandbox2/monitor_unotify.h"
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/ioctl.h>
#include <linux/seccomp.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>
#include "absl/cleanup/cleanup.h"
#include "absl/log/log.h"
#include "absl/status/status.h"
#include "absl/time/time.h"
#include "sandboxed_api/sandbox2/client.h"
#include "sandboxed_api/sandbox2/forkserver.pb.h"
#include "sandboxed_api/sandbox2/monitor_base.h"
#include "sandboxed_api/util/fileops.h"
#include "sandboxed_api/util/raw_logging.h"
#ifndef SECCOMP_GET_NOTIF_SIZES
#define SECCOMP_GET_NOTIF_SIZES 3
struct seccomp_notif_sizes {
__u16 seccomp_notif;
__u16 seccomp_notif_resp;
__u16 seccomp_data;
};
#endif
#ifndef SECCOMP_IOCTL_NOTIF_RECV
#ifndef SECCOMP_IOWR
#define SECCOMP_IOC_MAGIC '!'
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
#endif
/* Flags for seccomp notification fd ioctl. */
#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
#endif
namespace sandbox2 {
namespace {
// Thin wrapper that issues the raw seccomp system call (SYS_seccomp) with the
// given operation, flags and argument pointer, and returns the syscall's
// result narrowed to int.
int seccomp(unsigned int operation, unsigned int flags, void* args) {
  const long rc = syscall(SYS_seccomp, operation, flags, args);
  return static_cast<int>(rc);
}
// Translates an AUDIT_ARCH_* value (as found in seccomp_data.arch) into the
// matching sapi::cpu::Architecture enumerator. Values that are not one of the
// five architectures listed below map to Architecture::kUnknown.
sapi::cpu::Architecture AuditArchToCPUArch(uint32_t arch) {
  using sapi::cpu::Architecture;
  if (arch == AUDIT_ARCH_AARCH64) {
    return Architecture::kArm64;
  }
  if (arch == AUDIT_ARCH_ARM) {
    return Architecture::kArm;
  }
  if (arch == AUDIT_ARCH_X86_64) {
    return Architecture::kX8664;
  }
  if (arch == AUDIT_ARCH_I386) {
    return Architecture::kX86;
  }
  if (arch == AUDIT_ARCH_PPC64LE) {
    return Architecture::kPPC64LE;
  }
  return Architecture::kUnknown;
}
using ::sapi::file_util::fileops::FDCloser;
} // namespace
// Constructs a monitor of type FORKSERVER_MONITOR_UNOTIFY.
//
// If the executor's limits carry a non-zero wall-time limit, the absolute
// deadline (now + limit) is stored in `deadline_millis_` as Unix milliseconds.
UnotifyMonitor::UnotifyMonitor(Executor* executor, Policy* policy,
                               Notify* notify)
    : MonitorBase(executor, policy, notify) {
  type_ = FORKSERVER_MONITOR_UNOTIFY;

  const auto wall_limit = executor_->limits()->wall_time_limit();
  if (wall_limit != absl::ZeroDuration()) {
    const int64_t deadline_ms = absl::ToUnixMillis(absl::Now() + wall_limit);
    deadline_millis_.store(deadline_ms, std::memory_order_relaxed);
  }

  // Both request flags start out "set"; Run() treats a cleared
  // external_kill_request_flag_ as a pending kill request.
  dump_stack_request_flag_.test_and_set(std::memory_order_relaxed);
  external_kill_request_flag_.test_and_set(std::memory_order_relaxed);
}
void UnotifyMonitor::RunInternal() {
thread_ = std::make_unique<std::thread>(&UnotifyMonitor::Run, this);
// Wait for the Monitor to set-up the sandboxee correctly (or fail while
// doing that). From here on, it is safe to use the IPC object for
// non-sandbox-related data exchange.
setup_notification_.WaitForNotification();
}
void UnotifyMonitor::HandleUnotify() {
memset(req_.get(), 0, req_size_);
if (ioctl(seccomp_notify_fd_.get(), SECCOMP_IOCTL_NOTIF_RECV, req_.get()) !=
0) {
if (errno == ENOENT) {
VLOG(1) << "Unotify recv failed with ENOENT";
} else {
SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_NOTIFY);
}
return;
}
Syscall syscall(AuditArchToCPUArch(req_->data.arch), req_->data.nr,
{req_->data.args[0], req_->data.args[1], req_->data.args[2],
req_->data.args[3], req_->data.args[4], req_->data.args[5]},
req_->pid, 0, req_->data.instruction_pointer);
LogSyscallViolation(syscall);
MaybeGetStackTrace(req_->pid, Result::VIOLATION);
ViolationType violation_type = syscall.arch() == Syscall::GetHostArch()
? kSyscallViolation
: kArchitectureSwitchViolation;
SetExitStatusCode(Result::VIOLATION, syscall.nr());
notify_->EventSyscallViolation(syscall, violation_type);
result_.SetSyscall(std::make_unique<Syscall>(syscall));
KillSandboxee();
}
void UnotifyMonitor::Run() {
absl::Cleanup monitor_done = [this] {
getrusage(RUSAGE_THREAD, result_.GetRUsageMonitor());
OnDone();
};
absl::Cleanup setup_notify = [this] { setup_notification_.Notify(); };
if (!InitSetupUnotify()) {
SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
return;
}
if (!InitSetupNotifyPipe()) {
SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
return;
}
std::move(setup_notify).Invoke();
pollfd pfds[] = {
{.fd = process_.status_fd.get(), .events = POLLIN},
{.fd = seccomp_notify_fd_.get(), .events = POLLIN},
{.fd = monitor_notify_pipe_[0].get(), .events = POLLIN},
};
bool wait_for_sandboxee = true;
while (result_.final_status() == Result::UNSET) {
int64_t deadline = deadline_millis_.load(std::memory_order_relaxed);
absl::Duration remaining = absl::FromUnixMillis(deadline) - absl::Now();
if (deadline != 0 && remaining < absl::ZeroDuration()) {
VLOG(1) << "Sandbox process hit timeout due to the walltime timer";
timed_out_ = true;
MaybeGetStackTrace(process_.main_pid, Result::TIMEOUT);
KillSandboxee();
break;
}
if (!external_kill_request_flag_.test_and_set(std::memory_order_relaxed)) {
external_kill_ = true;
MaybeGetStackTrace(process_.main_pid, Result::EXTERNAL_KILL);
KillSandboxee();
break;
}
if (network_proxy_server_ &&
network_proxy_server_->violation_occurred_.load(
std::memory_order_acquire) &&
!network_violation_) {
network_violation_ = true;
MaybeGetStackTrace(process_.main_pid, Result::VIOLATION);
KillSandboxee();
break;
}
constexpr int64_t kMinWakeupMsec = 10000;
int timeout_msec = static_cast<int>(
std::min(kMinWakeupMsec,
std::max(int64_t{0}, absl::ToInt64Milliseconds(remaining))));
PCHECK(poll(pfds, ABSL_ARRAYSIZE(pfds), timeout_msec) != -1);
if (pfds[2].revents & POLLIN) {
char c = ' ';
read(monitor_notify_pipe_[0].get(), &c, 1);
continue;
}
if (pfds[0].revents & POLLIN) {
SetExitStatusFromStatusPipe();
wait_for_sandboxee = false;
break;
}
if (pfds[0].revents & POLLHUP) {
SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
wait_for_sandboxee = false;
break;
}
if (pfds[1].revents & POLLIN) {
HandleUnotify();
wait_for_sandboxee = false;
}
}
if (wait_for_sandboxee) {
int timeout_ms = 1000; // 1 sec
PCHECK(poll(pfds, 1, timeout_ms) != -1);
if (pfds[0].revents & POLLIN) {
SetExitStatusFromStatusPipe();
} else if (pfds[0].revents & POLLHUP) {
SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_MONITOR);
}
}
KillInit();
}
// Reads two ints (code, then status) from the status pipe and converts them
// into the final result:
//   CLD_EXITED              -> OK with `status` as reason code
//   CLD_KILLED / CLD_DUMPED -> network VIOLATION, EXTERNAL_KILL, TIMEOUT or
//                              SIGNALED, checked in that order
//   anything else           -> SETUP_ERROR / FAILED_MONITOR
// Aborts if either read does not return exactly sizeof(int) bytes.
void UnotifyMonitor::SetExitStatusFromStatusPipe() {
  int code;
  int status;
  const int status_fd = process_.status_fd.get();
  SAPI_RAW_PCHECK(read(status_fd, &code, sizeof(code)) == sizeof(int), "read");
  SAPI_RAW_PCHECK(read(status_fd, &status, sizeof(status)) == sizeof(int),
                  "read");

  switch (code) {
    case CLD_EXITED:
      SetExitStatusCode(Result::OK, status);
      return;
    case CLD_KILLED:
    case CLD_DUMPED:
      break;
    default:
      SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_MONITOR);
      return;
  }

  // The sandboxee died from a signal: attribute it to whichever monitor
  // action was recorded, falling back to a plain SIGNALED result.
  if (network_violation_) {
    SetExitStatusCode(Result::VIOLATION, Result::VIOLATION_NETWORK);
    result_.SetNetworkViolation(network_proxy_server_->violation_msg_);
    return;
  }
  if (external_kill_) {
    SetExitStatusCode(Result::EXTERNAL_KILL, 0);
    return;
  }
  if (timed_out_) {
    SetExitStatusCode(Result::TIMEOUT, 0);
    return;
  }
  SetExitStatusCode(Result::SIGNALED, status);
}
bool UnotifyMonitor::InitSetupUnotify() {
if (!comms_->SendUint32(Client::kSandbox2ClientUnotify)) {
LOG(ERROR) << "Couldn't send Client::kSandbox2ClientUnotify message";
return false;
}
int fd;
if (!comms_->RecvFD(&fd)) {
LOG(ERROR) << "Couldn't recv unotify fd";
return false;
}
seccomp_notify_fd_ = FDCloser(fd);
struct seccomp_notif_sizes sizes = {};
if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) {
LOG(ERROR) << "Couldn't get seccomp_notif_sizes";
return false;
}
req_size_ = sizes.seccomp_notif;
req_.reset(static_cast<seccomp_notif*>(malloc(req_size_)));
return true;
}
// Creates the pipe used to wake up the monitor loop and stores its read end in
// monitor_notify_pipe_[0] and its write end in monitor_notify_pipe_[1].
// Returns false (after logging errno) if the pipe cannot be created.
bool UnotifyMonitor::InitSetupNotifyPipe() {
  int ends[2];
  if (pipe(ends) == 0) {
    monitor_notify_pipe_[0] = FDCloser(ends[0]);
    monitor_notify_pipe_[1] = FDCloser(ends[1]);
    return true;
  }
  PLOG(ERROR) << "failed creating monitor pipe";
  return false;
}
void UnotifyMonitor::NotifyMonitor() {
absl::ReaderMutexLock lock(&notify_mutex_);
if (!monitor_notify_pipe_[1].get()) {
return;
}
char c = ' ';
write(monitor_notify_pipe_[1].get(), &c, 1);
}
// Sends SIGKILL to the main sandboxee PID.
// Returns true if the signal was sent; otherwise logs the errno description
// and returns false.
bool UnotifyMonitor::KillSandboxee() {
  VLOG(1) << "Sending SIGKILL to the PID: " << process_.main_pid;
  const bool delivered = kill(process_.main_pid, SIGKILL) == 0;
  if (!delivered) {
    PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.main_pid;
  }
  return delivered;
}
// Sends SIGKILL to the init PID of the sandbox. A failure is only logged
// (with the errno description); nothing is reported to the caller.
void UnotifyMonitor::KillInit() {
  VLOG(1) << "Sending SIGKILL to the PID: " << process_.init_pid;
  if (kill(process_.init_pid, SIGKILL) == 0) {
    return;
  }
  PLOG(ERROR) << "Could not send SIGKILL to PID " << process_.init_pid;
}
// Waits for the monitor thread to finish and releases its resources.
//
// Takes notify_mutex_ exclusively, so no NotifyMonitor() call can run while
// the notification pipe is being closed. Calling Join() again after the
// thread has been joined is a no-op.
void UnotifyMonitor::Join() {
  absl::MutexLock lock(&notify_mutex_);
  if (!thread_) {
    // Never started or already joined.
    return;
  }
  thread_->join();
  CHECK(IsDone()) << "Monitor did not terminate";
  VLOG(1) << "Final execution status: " << result_.ToString();
  CHECK(result_.final_status() != Result::UNSET);
  thread_.reset();
  // Close both pipe ends now that nothing can be woken up any more.
  monitor_notify_pipe_[0].Close();
  monitor_notify_pipe_[1].Close();
}
void UnotifyMonitor::MaybeGetStackTrace(pid_t pid, Result::StatusEnum status) {
if (ShouldCollectStackTrace(status)) {
auto stack = GetStackTrace(pid);
if (stack.ok()) {
result_.set_stack_trace(*stack);
} else {
LOG(ERROR) << "Getting stack trace: " << stack.status();
}
}
}
// Attaches to `pid` with ptrace, waits for it to stop, fetches its registers
// and returns the stack trace produced by GetAndLogStackTrace().
//
// Returns an error status if attaching, waiting for the stop, or fetching the
// registers fails. A register fetch failure other than NotFound additionally
// sets the exit status to INTERNAL_ERROR / FAILED_FETCH.
absl::StatusOr<std::vector<std::string>> UnotifyMonitor::GetStackTrace(
pid_t pid) {
if (ptrace(PTRACE_ATTACH, pid, 0, 0) != 0) {
return absl::ErrnoToStatus(errno,
absl::StrCat("could not attach to pid = ", pid));
}
// Poll until waitpid reports the tracee as stopped.
// NOTE(review): WNOHANG makes waitpid return immediately, so this loop spins
// without sleeping until the stop is observed or waitpid fails.
int wstatus = 0;
while (!WIFSTOPPED(wstatus)) {
pid_t ret =
waitpid(pid, &wstatus, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
if (ret == -1) {
// NOTE(review): this return happens before the detach cleanup below is
// registered, so no explicit PTRACE_DETACH is issued on this path.
return absl::ErrnoToStatus(errno,
absl::StrCat("waiting for stop, pid = ", pid));
}
}
// From here on, detach from the tracee on every exit path.
absl::Cleanup cleanup = [pid] {
if (ptrace(PTRACE_DETACH, pid, 0, 0) != 0) {
LOG(ERROR) << "Could not detach after obtaining stack trace from pid = "
<< pid;
}
};
Regs regs(pid);
absl::Status status = regs.Fetch();
if (!status.ok()) {
// NotFound is only logged as a warning and does not touch the exit status.
if (absl::IsNotFound(status)) {
LOG(WARNING) << "failed to fetch regs: " << status;
return status;
}
SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_FETCH);
return status;
}
return GetAndLogStackTrace(&regs);
}
} // namespace sandbox2

View File

@ -0,0 +1,110 @@
#ifndef SANDBOXED_API_SANDBOX2_MONITOR_UNOTIFY_H_
#define SANDBOXED_API_SANDBOX2_MONITOR_UNOTIFY_H_
#include <linux/seccomp.h>
#include <atomic>
#include <memory>
#include <thread>
#include <string>
#include <vector>
#include "absl/status/statusor.h"
#include "absl/synchronization/mutex.h"
#include "sandboxed_api/sandbox2/executor.h"
#include "sandboxed_api/sandbox2/monitor_base.h"
#include "sandboxed_api/sandbox2/notify.h"
#include "sandboxed_api/sandbox2/policy.h"
#include "sandboxed_api/util/raw_logging.h"
namespace sandbox2 {
// Fallback definition of struct seccomp_notif, compiled only when the
// included <linux/seccomp.h> does not define SECCOMP_IOCTL_NOTIF_RECV.
// NOTE(review): presumably mirrors the kernel's uapi layout of the same
// struct; confirm against the kernel headers before changing any field.
#ifndef SECCOMP_IOCTL_NOTIF_RECV
struct seccomp_notif {
// Field meanings are not shown here; see seccomp_unotify(2) for the
// kernel's description of id, pid, flags and data.
__u64 id;
__u32 pid;
__u32 flags;
struct seccomp_data data;
};
#endif
// Monitor implementation declared on top of MonitorBase that uses a seccomp
// notification fd (seccomp_notify_fd_) together with a wake-up pipe
// (monitor_notify_pipe_) to learn about sandboxee events and about requests
// coming from the controlling process.
//
// Kill(), DumpStackTrace() and SetWallTimeLimit() only record the request in
// an atomic member and then call NotifyMonitor().
class UnotifyMonitor : public MonitorBase {
 public:
  UnotifyMonitor(Executor* executor, Policy* policy, Notify* notify);
  // Joins the monitor thread (if any) before the object goes away.
  ~UnotifyMonitor() { Join(); }

  // Requests that the sandboxee be killed: clears the kill-request flag
  // ("false" means requested) and notifies the monitor.
  void Kill() override {
    external_kill_request_flag_.clear(std::memory_order_relaxed);
    NotifyMonitor();
  }

  // Requests a stack trace dump: clears the dump-request flag ("false" means
  // requested) and notifies the monitor.
  void DumpStackTrace() override {
    dump_stack_request_flag_.clear(std::memory_order_relaxed);
    NotifyMonitor();
  }

  // Sets the wall-time limit relative to now. A zero duration disarms the
  // timer (the stored deadline becomes 0) without notifying the monitor;
  // any other value stores the new absolute deadline in Unix milliseconds
  // and notifies the monitor.
  void SetWallTimeLimit(absl::Duration limit) override {
    if (limit == absl::ZeroDuration()) {
      VLOG(1) << "Disarming walltime timer";
      deadline_millis_.store(0, std::memory_order_relaxed);
    } else {
      VLOG(1) << "Will set the walltime timer to " << limit;
      absl::Time deadline = absl::Now() + limit;
      deadline_millis_.store(absl::ToUnixMillis(deadline),
                             std::memory_order_relaxed);
      NotifyMonitor();
    }
  }

 private:
  // Waits for events from monitored clients and signals from the main process.
  void RunInternal() override;
  // Joins the monitor thread and closes the notification pipe.
  void Join() override;
  void Run();

  // Sets up the seccomp notification fd and the request buffer.
  // Returns false on failure.
  bool InitSetupUnotify();
  // Creates the pipe used by NotifyMonitor(). Returns false on failure.
  bool InitSetupNotifyPipe();
  // Kills the main traced PID with SIGKILL.
  // Returns false if an error occurred and process could not be killed.
  bool KillSandboxee();
  // Kills the init PID with SIGKILL; failures are only logged.
  void KillInit();
  void HandleUnotify();
  // Derives the final result from the data read from the status pipe.
  void SetExitStatusFromStatusPipe();
  // Collects a stack trace of `pid` if the policy asks for one for `status`.
  void MaybeGetStackTrace(pid_t pid, Result::StatusEnum status);
  absl::StatusOr<std::vector<std::string>> GetStackTrace(pid_t pid);

  // Notifies monitor about a state change
  void NotifyMonitor();

  absl::Notification setup_notification_;
  sapi::file_util::fileops::FDCloser seccomp_notify_fd_;
  // Index 0 is the read end, index 1 the write end.
  sapi::file_util::fileops::FDCloser monitor_notify_pipe_[2];
  // Deadline in Unix millis
  std::atomic<int64_t> deadline_millis_{0};
  // False iff external kill is requested
  // NOTE(review): ATOMIC_FLAG_INIT leaves the flag clear, i.e. "requested"
  // under this convention; presumably the constructor sets it - confirm.
  std::atomic_flag external_kill_request_flag_ = ATOMIC_FLAG_INIT;
  // False iff dump stack is requested
  // NOTE(review): same initial-state caveat as for the kill-request flag.
  std::atomic_flag dump_stack_request_flag_ = ATOMIC_FLAG_INIT;

  // Was external kill sent to the sandboxee
  bool external_kill_ = false;
  // Network violation occurred and process of killing sandboxee started
  bool network_violation_ = false;
  // Is the sandboxee timed out
  bool timed_out_ = false;

  // Monitor thread object.
  std::unique_ptr<std::thread> thread_;

  // Synchronizes monitor thread deletion and notifying the monitor.
  absl::Mutex notify_mutex_;

  // Size in bytes of the buffer held by req_; zero until InitSetupUnotify()
  // has queried the size.
  size_t req_size_ = 0;
  // Buffer for a seccomp notification, allocated with malloc and released
  // with std::free.
  std::unique_ptr<seccomp_notif, decltype(std::free)*> req_{nullptr, std::free};
};
} // namespace sandbox2
#endif // SANDBOXED_API_SANDBOX2_MONITOR_UNOTIFY_H_

View File

@ -38,6 +38,12 @@
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif #endif
#ifndef SECCOMP_RET_USER_NOTIF
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
#endif
#define DO_USER_NOTIF BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)
ABSL_FLAG(bool, sandbox2_danger_danger_permit_all, false, ABSL_FLAG(bool, sandbox2_danger_danger_permit_all, false,
"Allow all syscalls, useful for testing"); "Allow all syscalls, useful for testing");
ABSL_FLAG(std::string, sandbox2_danger_danger_permit_all_and_log, "", ABSL_FLAG(std::string, sandbox2_danger_danger_permit_all_and_log, "",
@ -49,7 +55,7 @@ namespace sandbox2 {
// 1. default policy (GetDefaultPolicy, private), // 1. default policy (GetDefaultPolicy, private),
// 2. user policy (user_policy_, public), // 2. user policy (user_policy_, public),
// 3. default KILL action (avoid failing open if user policy did not do it). // 3. default KILL action (avoid failing open if user policy did not do it).
std::vector<sock_filter> Policy::GetPolicy() const { std::vector<sock_filter> Policy::GetPolicy(bool user_notif) const {
if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all) || if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all) ||
!absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log).empty()) { !absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log).empty()) {
return GetTrackingPolicy(); return GetTrackingPolicy();
@ -57,7 +63,7 @@ std::vector<sock_filter> Policy::GetPolicy() const {
// Now we can start building the policy. // Now we can start building the policy.
// 1. Start with the default policy (e.g. syscall architecture checks). // 1. Start with the default policy (e.g. syscall architecture checks).
auto policy = GetDefaultPolicy(); auto policy = GetDefaultPolicy(user_notif);
VLOG(3) << "Default policy:\n" << bpf::Disasm(policy); VLOG(3) << "Default policy:\n" << bpf::Disasm(policy);
// 2. Append user policy. // 2. Append user policy.
@ -69,6 +75,15 @@ std::vector<sock_filter> Policy::GetPolicy() const {
// 3. Finish with default KILL action. // 3. Finish with default KILL action.
policy.push_back(KILL); policy.push_back(KILL);
// In seccomp_unotify mode replace all KILLS with unotify
if (user_notif) {
for (sock_filter& filter : policy) {
if (filter.code == BPF_RET + BPF_K && filter.k == SECCOMP_RET_KILL) {
filter = DO_USER_NOTIF;
}
}
}
VLOG(2) << "Final policy:\n" << bpf::Disasm(policy); VLOG(2) << "Final policy:\n" << bpf::Disasm(policy);
return policy; return policy;
} }
@ -80,36 +95,61 @@ std::vector<sock_filter> Policy::GetPolicy() const {
// for the __NR_execve syscall, so the tracer can make a decision to allow or // for the __NR_execve syscall, so the tracer can make a decision to allow or
// disallow it depending on which occurrence of __NR_execve it was. // disallow it depending on which occurrence of __NR_execve it was.
// LINT.IfChange // LINT.IfChange
std::vector<sock_filter> Policy::GetDefaultPolicy() const { std::vector<sock_filter> Policy::GetDefaultPolicy(bool user_notif) const {
bpf_labels l = {0}; bpf_labels l = {0};
std::vector<sock_filter> policy = { std::vector<sock_filter> policy;
// If compiled arch is different from the runtime one, inform the Monitor. if (user_notif) {
LOAD_ARCH, policy = {
JEQ32(Syscall::GetHostAuditArch(), JUMP(&l, past_arch_check_l)), // If compiled arch is different from the runtime one, inform the
// Monitor.
LOAD_ARCH,
JNE32(Syscall::GetHostAuditArch(), DENY),
LOAD_SYSCALL_NR,
// TODO(b/271400371) Use NOTIF_FLAG_CONTINUE once generally available
JNE32(__NR_seccomp, JUMP(&l, past_seccomp_l)),
ARG_32(3),
JNE32(internal::kExecveMagic, JUMP(&l, past_seccomp_l)),
ALLOW,
LABEL(&l, past_seccomp_l),
LOAD_SYSCALL_NR,
JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
ARG_32(4),
JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
ARG_32(5),
JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
ALLOW,
LABEL(&l, past_execveat_l),
};
} else {
policy = {
// If compiled arch is different from the runtime one, inform the Monitor.
LOAD_ARCH,
JEQ32(Syscall::GetHostAuditArch(), JUMP(&l, past_arch_check_l)),
#if defined(SAPI_X86_64) #if defined(SAPI_X86_64)
JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)), // 32-bit sandboxee JEQ32(AUDIT_ARCH_I386, TRACE(sapi::cpu::kX86)), // 32-bit sandboxee
#endif #endif
TRACE(sapi::cpu::kUnknown), TRACE(sapi::cpu::kUnknown),
LABEL(&l, past_arch_check_l), LABEL(&l, past_arch_check_l),
// After the policy is uploaded, forkserver will execve the sandboxee. We // After the policy is uploaded, forkserver will execve the sandboxee. We
// need to allow this execve but not others. Since BPF does not have // need to allow this execve but not others. Since BPF does not have
// state, we need to inform the Monitor to decide, and for that we use a // state, we need to inform the Monitor to decide, and for that we use a
// magic value in syscall args 5. Note that this value is not supposed to // magic value in syscall args 5. Note that this value is not supposed to
// be secret, but just an optimization so that the monitor is not // be secret, but just an optimization so that the monitor is not
// triggered on every call to execveat. // triggered on every call to execveat.
LOAD_SYSCALL_NR, LOAD_SYSCALL_NR,
JNE32(__NR_execveat, JUMP(&l, past_execveat_l)), JNE32(__NR_execveat, JUMP(&l, past_execveat_l)),
ARG_32(4), ARG_32(4),
JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)), JNE32(AT_EMPTY_PATH, JUMP(&l, past_execveat_l)),
ARG_32(5), ARG_32(5),
JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)), JNE32(internal::kExecveMagic, JUMP(&l, past_execveat_l)),
SANDBOX2_TRACE, SANDBOX2_TRACE,
LABEL(&l, past_execveat_l), LABEL(&l, past_execveat_l),
LOAD_SYSCALL_NR, LOAD_SYSCALL_NR,
}; };
}
// Forbid ptrace because it's unsafe or too risky. The user policy can only // Forbid ptrace because it's unsafe or too risky. The user policy can only
// block (i.e. return an error instead of killing the process) but not allow // block (i.e. return an error instead of killing the process) but not allow
@ -169,8 +209,8 @@ std::vector<sock_filter> Policy::GetTrackingPolicy() const {
}; };
} }
bool Policy::SendPolicy(Comms* comms) const { bool Policy::SendPolicy(Comms* comms, bool user_notif) const {
auto policy = GetPolicy(); auto policy = GetPolicy(user_notif);
if (!comms->SendBytes( if (!comms->SendBytes(
reinterpret_cast<uint8_t*>(policy.data()), reinterpret_cast<uint8_t*>(policy.data()),
static_cast<uint64_t>(policy.size()) * sizeof(sock_filter))) { static_cast<uint64_t>(policy.size()) * sizeof(sock_filter))) {

View File

@ -54,8 +54,10 @@ class Policy final {
void GetPolicyDescription(PolicyDescription* policy) const; void GetPolicyDescription(PolicyDescription* policy) const;
private: private:
friend class Sandbox2;
friend class MonitorBase; friend class MonitorBase;
friend class PtraceMonitor; friend class PtraceMonitor;
friend class UnotifyMonitor;
friend class PolicyBuilder; friend class PolicyBuilder;
friend class StackTracePeer; friend class StackTracePeer;
@ -63,11 +65,11 @@ class Policy final {
Policy() = default; Policy() = default;
// Sends the policy over the IPC channel. // Sends the policy over the IPC channel.
bool SendPolicy(Comms* comms) const; bool SendPolicy(Comms* comms, bool user_notif) const;
// Returns the policy, but modifies it according to FLAGS and internal // Returns the policy, but modifies it according to FLAGS and internal
// requirements (message passing via Comms, Executor::WaitForExecve etc.). // requirements (message passing via Comms, Executor::WaitForExecve etc.).
std::vector<sock_filter> GetPolicy() const; std::vector<sock_filter> GetPolicy(bool user_notif) const;
Namespace* GetNamespace() { return namespace_.get(); } Namespace* GetNamespace() { return namespace_.get(); }
void SetNamespace(std::unique_ptr<Namespace> ns) { void SetNamespace(std::unique_ptr<Namespace> ns) {
@ -76,7 +78,7 @@ class Policy final {
// Returns the default policy, which blocks certain dangerous syscalls and // Returns the default policy, which blocks certain dangerous syscalls and
// mismatched syscall tables. // mismatched syscall tables.
std::vector<sock_filter> GetDefaultPolicy() const; std::vector<sock_filter> GetDefaultPolicy(bool user_notif) const;
// Returns a policy allowing the Monitor module to track all syscalls. // Returns a policy allowing the Monitor module to track all syscalls.
std::vector<sock_filter> GetTrackingPolicy() const; std::vector<sock_filter> GetTrackingPolicy() const;

View File

@ -23,10 +23,12 @@
#include "absl/base/call_once.h" #include "absl/base/call_once.h"
#include "absl/log/check.h" #include "absl/log/check.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h" #include "absl/status/statusor.h"
#include "absl/time/time.h" #include "absl/time/time.h"
#include "sandboxed_api/sandbox2/monitor_base.h" #include "sandboxed_api/sandbox2/monitor_base.h"
#include "sandboxed_api/sandbox2/monitor_ptrace.h" #include "sandboxed_api/sandbox2/monitor_ptrace.h"
#include "sandboxed_api/sandbox2/monitor_unotify.h"
#include "sandboxed_api/sandbox2/result.h" #include "sandboxed_api/sandbox2/result.h"
#include "sandboxed_api/sandbox2/stack_trace.h" #include "sandboxed_api/sandbox2/stack_trace.h"
@ -105,9 +107,42 @@ void Sandbox2::Launch() {
internal::SandboxPeer::spawn_fn_ = Sandbox2Peer::Spawn; internal::SandboxPeer::spawn_fn_ = Sandbox2Peer::Spawn;
}); });
monitor_ = std::make_unique<PtraceMonitor>(executor_.get(), policy_.get(), monitor_ = CreateMonitor();
notify_.get());
monitor_->Launch(); monitor_->Launch();
} }
// Switches this sandbox to the unotify-based monitor.
//
// Returns FailedPrecondition (and leaves the monitor choice untouched) when
// the current configuration is incompatible: a Notify object is set, the
// policy has no namespace, or the policy collects stack traces on signal or
// on normal exit. The checks are evaluated in that order and the first
// failing one determines the message.
absl::Status Sandbox2::EnableUnotifyMonitor() {
  const char* conflict = nullptr;
  if (notify_) {
    conflict = "sandbox2::Notify is not compatible with unotify monitor";
  } else if (policy_->GetNamespace() == nullptr) {
    conflict = "Unotify monitor can only be used together with namespaces";
  } else if (policy_->collect_stacktrace_on_signal_) {
    conflict = "Unotify monitor cannot collect stack traces on signal";
  } else if (policy_->collect_stacktrace_on_exit_) {
    conflict = "Unotify monitor cannot collect stack traces on normal exit";
  }
  if (conflict != nullptr) {
    return absl::FailedPreconditionError(conflict);
  }
  use_unotify_monitor_ = true;
  return absl::OkStatus();
}
std::unique_ptr<MonitorBase> Sandbox2::CreateMonitor() {
if (!notify_) {
notify_ = std::make_unique<Notify>();
}
if (use_unotify_monitor_) {
return std::make_unique<UnotifyMonitor>(executor_.get(), policy_.get(),
notify_.get());
}
return std::make_unique<PtraceMonitor>(executor_.get(), policy_.get(),
notify_.get());
}
} // namespace sandbox2 } // namespace sandbox2

View File

@ -46,9 +46,6 @@ class Sandbox2 final {
notify_(std::move(notify)) { notify_(std::move(notify)) {
CHECK(executor_ != nullptr); CHECK(executor_ != nullptr);
CHECK(policy_ != nullptr); CHECK(policy_ != nullptr);
if (notify_ == nullptr) {
notify_ = std::make_unique<Notify>();
}
} }
Sandbox2(const Sandbox2&) = delete; Sandbox2(const Sandbox2&) = delete;
@ -108,10 +105,14 @@ class Sandbox2 final {
return executor_ != nullptr ? executor_->ipc()->comms() : nullptr; return executor_ != nullptr ? executor_->ipc()->comms() : nullptr;
} }
absl::Status EnableUnotifyMonitor();
private: private:
// Launches the Monitor. // Launches the Monitor.
void Launch(); void Launch();
std::unique_ptr<MonitorBase> CreateMonitor();
// Executor set by user - owned by Sandbox2. // Executor set by user - owned by Sandbox2.
std::unique_ptr<Executor> executor_; std::unique_ptr<Executor> executor_;
@ -123,6 +124,8 @@ class Sandbox2 final {
// Monitor object - owned by Sandbox2. // Monitor object - owned by Sandbox2.
std::unique_ptr<MonitorBase> monitor_; std::unique_ptr<MonitorBase> monitor_;
bool use_unotify_monitor_ = false;
}; };
} // namespace sandbox2 } // namespace sandbox2

View File

@ -39,32 +39,47 @@ namespace {
using ::sapi::CreateDefaultPermissiveTestPolicy; using ::sapi::CreateDefaultPermissiveTestPolicy;
using ::sapi::GetTestSourcePath; using ::sapi::GetTestSourcePath;
using ::sapi::IsOk;
using ::testing::Eq; using ::testing::Eq;
using ::testing::IsEmpty; using ::testing::IsEmpty;
using ::testing::IsTrue; using ::testing::IsTrue;
using ::testing::Lt; using ::testing::Lt;
// Fixture parameterized on a bool; when the parameter is true the helpers
// configure the test for the unotify monitor.
class Sandbox2Test : public ::testing::TestWithParam<bool> {
 public:
  // Returns the default permissive test policy for `path`. When the test
  // parameter is true, stack trace collection on signal is turned off.
  PolicyBuilder CreateDefaultTestPolicy(absl::string_view path) {
    PolicyBuilder policy_builder = CreateDefaultPermissiveTestPolicy(path);
    const bool use_unotify = GetParam();
    if (use_unotify) {
      policy_builder.CollectStacktracesOnSignal(false);
    }
    return policy_builder;
  }

  // Enables the unotify monitor on `sandbox` when the test parameter is true;
  // otherwise leaves it untouched and returns OK.
  absl::Status SetUpSandbox(Sandbox2* sandbox) {
    if (!GetParam()) {
      return absl::OkStatus();
    }
    return sandbox->EnableUnotifyMonitor();
  }
};
// Test that aborting inside a sandbox with all userspace core dumping // Test that aborting inside a sandbox with all userspace core dumping
// disabled reports the signal. // disabled reports the signal.
TEST(SandboxCoreDumpTest, AbortWithoutCoreDumpReturnsSignaled) { TEST_P(Sandbox2Test, AbortWithoutCoreDumpReturnsSignaled) {
const std::string path = GetTestSourcePath("sandbox2/testcases/abort"); const std::string path = GetTestSourcePath("sandbox2/testcases/abort");
std::vector<std::string> args = { std::vector<std::string> args = {
path, path,
}; };
auto executor = std::make_unique<Executor>(path, args); auto executor = std::make_unique<Executor>(path, args);
SAPI_ASSERT_OK_AND_ASSIGN(auto policy, CreateDefaultPermissiveTestPolicy(path) SAPI_ASSERT_OK_AND_ASSIGN(auto policy, CreateDefaultTestPolicy(path)
.TryBuild()); .TryBuild());
Sandbox2 sandbox(std::move(executor), std::move(policy)); Sandbox2 sandbox(std::move(executor), std::move(policy));
ASSERT_THAT(SetUpSandbox(&sandbox), IsOk());
auto result = sandbox.Run(); auto result = sandbox.Run();
ASSERT_THAT(result.final_status(), Eq(Result::SIGNALED)); ASSERT_THAT(result.final_status(), Eq(Result::SIGNALED));
EXPECT_THAT(result.reason_code(), Eq(SIGABRT)); EXPECT_THAT(result.reason_code(), Eq(SIGABRT));
} }
// Test that with TSYNC we are able to sandbox when multithreaded and with no // Test that with TSYNC we are able to sandbox when multithreaded.
// memory checks. If TSYNC is not supported, then no. TEST_P(Sandbox2Test, TsyncNoMemoryChecks) {
TEST(TsyncTest, TsyncNoMemoryChecks) {
const std::string path = GetTestSourcePath("sandbox2/testcases/tsync"); const std::string path = GetTestSourcePath("sandbox2/testcases/tsync");
auto executor = auto executor =
@ -72,8 +87,9 @@ TEST(TsyncTest, TsyncNoMemoryChecks) {
executor->set_enable_sandbox_before_exec(false); executor->set_enable_sandbox_before_exec(false);
SAPI_ASSERT_OK_AND_ASSIGN(auto policy, SAPI_ASSERT_OK_AND_ASSIGN(auto policy,
CreateDefaultPermissiveTestPolicy(path).TryBuild()); CreateDefaultTestPolicy(path).TryBuild());
Sandbox2 sandbox(std::move(executor), std::move(policy)); Sandbox2 sandbox(std::move(executor), std::move(policy));
ASSERT_THAT(SetUpSandbox(&sandbox), IsOk());
auto result = sandbox.Run(); auto result = sandbox.Run();
// With TSYNC, SandboxMeHere should be able to sandbox when multithreaded. // With TSYNC, SandboxMeHere should be able to sandbox when multithreaded.
@ -102,7 +118,7 @@ TEST(ExecutorTest, ExecutorFdConstructor) {
// Tests that we return the correct state when the sandboxee was killed by an // Tests that we return the correct state when the sandboxee was killed by an
// external signal. Also make sure that we do not have the stack trace. // external signal. Also make sure that we do not have the stack trace.
TEST(RunAsyncTest, SandboxeeExternalKill) { TEST_P(Sandbox2Test, SandboxeeExternalKill) {
const std::string path = GetTestSourcePath("sandbox2/testcases/sleep"); const std::string path = GetTestSourcePath("sandbox2/testcases/sleep");
std::vector<std::string> args = {path}; std::vector<std::string> args = {path};
@ -110,8 +126,9 @@ TEST(RunAsyncTest, SandboxeeExternalKill) {
auto executor = std::make_unique<Executor>(path, args, envs); auto executor = std::make_unique<Executor>(path, args, envs);
SAPI_ASSERT_OK_AND_ASSIGN(auto policy, SAPI_ASSERT_OK_AND_ASSIGN(auto policy,
CreateDefaultPermissiveTestPolicy(path).TryBuild()); CreateDefaultTestPolicy(path).TryBuild());
Sandbox2 sandbox(std::move(executor), std::move(policy)); Sandbox2 sandbox(std::move(executor), std::move(policy));
ASSERT_THAT(SetUpSandbox(&sandbox), IsOk());
ASSERT_TRUE(sandbox.RunAsync()); ASSERT_TRUE(sandbox.RunAsync());
sleep(1); sleep(1);
sandbox.Kill(); sandbox.Kill();
@ -121,17 +138,18 @@ TEST(RunAsyncTest, SandboxeeExternalKill) {
} }
// Tests that we do not collect stack traces if it was disabled (signaled). // Tests that we do not collect stack traces if it was disabled (signaled).
TEST(RunAsyncTest, SandboxeeTimeoutDisabledStacktraces) { TEST_P(Sandbox2Test, SandboxeeTimeoutDisabledStacktraces) {
const std::string path = GetTestSourcePath("sandbox2/testcases/sleep"); const std::string path = GetTestSourcePath("sandbox2/testcases/sleep");
std::vector<std::string> args = {path}; std::vector<std::string> args = {path};
std::vector<std::string> envs; std::vector<std::string> envs;
auto executor = std::make_unique<Executor>(path, args, envs); auto executor = std::make_unique<Executor>(path, args, envs);
SAPI_ASSERT_OK_AND_ASSIGN(auto policy, CreateDefaultPermissiveTestPolicy(path) SAPI_ASSERT_OK_AND_ASSIGN(auto policy, CreateDefaultTestPolicy(path)
.CollectStacktracesOnTimeout(false) .CollectStacktracesOnTimeout(false)
.TryBuild()); .TryBuild());
Sandbox2 sandbox(std::move(executor), std::move(policy)); Sandbox2 sandbox(std::move(executor), std::move(policy));
ASSERT_THAT(SetUpSandbox(&sandbox), IsOk());
ASSERT_TRUE(sandbox.RunAsync()); ASSERT_TRUE(sandbox.RunAsync());
sandbox.set_walltime_limit(absl::Seconds(1)); sandbox.set_walltime_limit(absl::Seconds(1));
auto result = sandbox.AwaitResult(); auto result = sandbox.AwaitResult();
@ -140,7 +158,7 @@ TEST(RunAsyncTest, SandboxeeTimeoutDisabledStacktraces) {
} }
// Tests that we do not collect stack traces if it was disabled (violation). // Tests that we do not collect stack traces if it was disabled (violation).
TEST(RunAsyncTest, SandboxeeViolationDisabledStacktraces) { TEST(Sandbox2Test, SandboxeeViolationDisabledStacktraces) {
const std::string path = GetTestSourcePath("sandbox2/testcases/sleep"); const std::string path = GetTestSourcePath("sandbox2/testcases/sleep");
std::vector<std::string> args = {path}; std::vector<std::string> args = {path};
@ -159,14 +177,15 @@ TEST(RunAsyncTest, SandboxeeViolationDisabledStacktraces) {
EXPECT_THAT(result.stack_trace(), IsEmpty()); EXPECT_THAT(result.stack_trace(), IsEmpty());
} }
TEST(RunAsyncTest, SandboxeeNotKilledWhenStartingThreadFinishes) { TEST_P(Sandbox2Test, SandboxeeNotKilledWhenStartingThreadFinishes) {
const std::string path = GetTestSourcePath("sandbox2/testcases/minimal"); const std::string path = GetTestSourcePath("sandbox2/testcases/minimal");
std::vector<std::string> args = {path}; std::vector<std::string> args = {path};
auto executor = std::make_unique<Executor>(path, args); auto executor = std::make_unique<Executor>(path, args);
SAPI_ASSERT_OK_AND_ASSIGN(auto policy, SAPI_ASSERT_OK_AND_ASSIGN(auto policy,
CreateDefaultPermissiveTestPolicy(path).TryBuild()); CreateDefaultTestPolicy(path).TryBuild());
Sandbox2 sandbox(std::move(executor), std::move(policy)); Sandbox2 sandbox(std::move(executor), std::move(policy));
ASSERT_THAT(SetUpSandbox(&sandbox), IsOk());
std::thread sandbox_start_thread([&sandbox]() { sandbox.RunAsync(); }); std::thread sandbox_start_thread([&sandbox]() { sandbox.RunAsync(); });
sandbox_start_thread.join(); sandbox_start_thread.join();
Result result = sandbox.AwaitResult(); Result result = sandbox.AwaitResult();
@ -194,5 +213,11 @@ TEST(StarvationTest, MonitorIsNotStarvedByTheSandboxee) {
EXPECT_THAT(elapsed, Lt(absl::Seconds(10))); EXPECT_THAT(elapsed, Lt(absl::Seconds(10)));
} }
INSTANTIATE_TEST_SUITE_P(Sandbox2, Sandbox2Test, ::testing::Values(false, true),
[](const ::testing::TestParamInfo<bool>& info) {
return info.param ? "UnotifyMonitor"
: "PtraceMonitor";
});
} // namespace } // namespace
} // namespace sandbox2 } // namespace sandbox2

View File

@ -65,6 +65,7 @@ class Syscall {
private: private:
friend class Regs; friend class Regs;
friend class UnotifyMonitor;
explicit Syscall(pid_t pid) : pid_(pid) {} explicit Syscall(pid_t pid) : pid_(pid) {}
Syscall(sapi::cpu::Architecture arch, uint64_t nr, Args args, pid_t pid, Syscall(sapi::cpu::Architecture arch, uint64_t nr, Args args, pid_t pid,

View File

@ -41,7 +41,7 @@ int ChildFunc(void*) {
int main() { int main() {
for (int i = 0; i < kProcesses; ++i) { for (int i = 0; i < kProcesses; ++i) {
int p[2]; int p[2];
char c; char c = ' ';
pipe(p); pipe(p);
g_pids[i] = fork(); g_pids[i] = fork();
if (g_pids[i] == 0) { if (g_pids[i] == 0) {