Use a nested userns&mntns to pre-pivot_root

This addresses a latency issue: chroot_fs_refs, called inside pivot_root
in the kernel, can take several milliseconds on machines with many threads
running.
This might not always reduce latency for custom forkservers, as the
additional fork can be more costly than pivot_root.

PiperOrigin-RevId: 281306284
Change-Id: If503ac76a70e5438e94caf708d79cb0219c66def
This commit is contained in:
Wiktor Garbacz 2019-11-19 09:01:59 -08:00 committed by Copybara-Service
parent 1673ade4e4
commit 8a7d0d1cb3
6 changed files with 228 additions and 53 deletions

View File

@ -14,16 +14,16 @@
# Description: sandbox2 is a C++ sandbox technology for Linux.
load("//sandboxed_api/bazel:build_defs.bzl", "sapi_platform_copts")
load("//sandboxed_api/bazel:embed_data.bzl", "sapi_cc_embed_data")
load("//sandboxed_api/bazel:proto.bzl", "sapi_proto_library")
package(default_visibility = [
"//sandboxed_api:__subpackages__",
])
licenses(["notice"]) # Apache 2.0
load("//sandboxed_api/bazel:build_defs.bzl", "sapi_platform_copts")
load("//sandboxed_api/bazel:embed_data.bzl", "sapi_cc_embed_data")
load("//sandboxed_api/bazel:proto.bzl", "sapi_proto_library")
cc_library(
name = "bpfdisassembler",
srcs = ["bpfdisassembler.cc"],
@ -397,6 +397,7 @@ cc_library(
"//sandboxed_api/util:raw_logging",
"//sandboxed_api/util:status",
"//sandboxed_api/util:statusor",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/synchronization",
@ -457,6 +458,7 @@ cc_library(
"//sandboxed_api/sandbox2/util:strerror",
"//sandboxed_api/util:raw_logging",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
],

View File

@ -365,6 +365,7 @@ add_library(sandbox2_forkserver STATIC
)
add_library(sandbox2::forkserver ALIAS sandbox2_forkserver)
target_link_libraries(sandbox2_forkserver PRIVATE
absl::memory
absl::str_format
absl::strings
absl::synchronization
@ -417,6 +418,7 @@ add_library(sandbox2_namespace STATIC
add_library(sandbox2::namespace ALIAS sandbox2_namespace)
target_link_libraries(sandbox2_namespace PRIVATE
absl::core_headers
absl::memory
absl::str_format
absl::strings
sandbox2::file_base

View File

@ -36,6 +36,7 @@
#include <cstring>
#include <glog/logging.h>
#include "absl/memory/memory.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
@ -256,7 +257,8 @@ void ForkServer::PrepareExecveArgs(const ForkRequest& request,
void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
int client_fd, uid_t uid, gid_t gid,
int user_ns_fd, int signaling_fd) {
int user_ns_fd, int signaling_fd,
bool avoid_pivot_root) const {
bool will_execve = (request.mode() == FORKSERVER_FORK_EXECVE ||
request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX);
@ -282,7 +284,7 @@ void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
if (!sanitizer::GetListOfFDs(&open_fds)) {
SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs");
}
InitializeNamespaces(request, uid, gid);
InitializeNamespaces(request, uid, gid, avoid_pivot_root);
auto caps = cap_init();
for (auto cap : request.capabilities()) {
@ -429,19 +431,61 @@ pid_t ForkServer::ServeRequest() const {
file_util::fileops::FDCloser fd_closer0{socketpair_fds[0]};
file_util::fileops::FDCloser fd_closer1{socketpair_fds[1]};
pid_t sandboxee_pid = util::ForkWithFlags(clone_flags);
// Note: init_pid will be overwritten with the actual init pid if the init
// process was started or stays at 0 if that is not needed (custom
// forkserver).
// process was started or stays at 0 if that is not needed - no pidns.
pid_t init_pid = 0;
if (sandboxee_pid == -1) {
SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
pid_t sandboxee_pid = -1;
bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
if (avoid_pivot_root) {
// We first just fork a child, which will join the initial namespaces
// Note: Not a regular fork() as one really needs to be single-threaded to
// setns and this is not the case with TSAN.
pid_t pid = util::ForkWithFlags(0);
SAPI_RAW_PCHECK(pid != -1, "fork failed");
if (pid == 0) {
SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
"joining initial user namespace");
SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
"joining initial mnt namespace");
close(initial_userns_fd_);
close(initial_mntns_fd_);
// Do not create a new userns here; it will be unshared later.
sandboxee_pid =
util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
if (sandboxee_pid == -1) {
SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
}
if (sandboxee_pid != 0) {
_exit(0);
}
// Send sandboxee pid
sapi::Status status = SendPid(fd_closer1.get());
if (!status.ok()) {
SAPI_RAW_LOG(FATAL, "%s", status.message());
}
} else {
auto pid_or = ReceivePid(fd_closer0.get());
if (!pid_or.ok()) {
SAPI_RAW_LOG(ERROR, "%s", pid_or.status().message());
} else {
sandboxee_pid = pid_or.ValueOrDie();
}
}
} else {
sandboxee_pid = util::ForkWithFlags(clone_flags);
if (sandboxee_pid == -1) {
SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
}
if (sandboxee_pid == 0) {
close(initial_userns_fd_);
close(initial_mntns_fd_);
}
}
// Child.
if (sandboxee_pid == 0) {
LaunchChild(fork_request, exec_fd, comms_fd, uid, gid, user_ns_fd,
fd_closer1.get());
fd_closer1.get(), avoid_pivot_root);
return sandboxee_pid;
}
@ -489,6 +533,44 @@ bool ForkServer::Initialize() {
return false;
}
// Spawn a new process to create initial user and mount namespaces to be used
// as a base for each namespaced sandboxee.
// Store uid and gid to create mappings after CLONE_NEWUSER
uid_t uid = getuid();
gid_t gid = getgid();
// Pipe to synchronize so that we open ns fds before process dies
int fds[2];
SAPI_RAW_PCHECK(pipe2(fds, O_CLOEXEC) != -1, "creating pipe");
pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS);
SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
char unused = '\0';
if (pid == 0) {
close(fds[1]);
Namespace::InitializeInitialNamespaces(uid, gid);
SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(fds[0], &unused, 1)) == 1,
"synchronizing initial namespaces creation");
_exit(0);
}
close(fds[0]);
initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
O_RDONLY | O_CLOEXEC);
SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
O_RDONLY | O_CLOEXEC);
SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(fds[1], &unused, 1)) == 1,
"synchronizing initial namespaces creation");
close(fds[1]);
int status;
SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(waitpid(pid, &status, __WALL)) == pid,
"synchronizing initial namespaces creation");
SAPI_RAW_PCHECK(WIFEXITED(status),
"initial namespace did not terminate normally");
SAPI_RAW_PCHECK(WEXITSTATUS(status) == 0,
"initial namespace exited with non-zero code %d", status);
// All processes spawned by the fork'd/execute'd process will see this process
// as /sbin/init. Therefore it will receive (and ignore) their final status
// (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
@ -552,7 +634,7 @@ void ForkServer::ExecuteProcess(int execve_fd, const char** argv,
}
void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
gid_t gid) {
gid_t gid, bool avoid_pivot_root) {
if (!request.has_mount_tree()) {
return;
}
@ -564,8 +646,8 @@ void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
}
Namespace::InitializeNamespaces(
uid, gid, clone_flags, Mounts(request.mount_tree()),
request.mode() != FORKSERVER_FORK_JOIN_SANDBOX_UNWIND,
request.hostname());
request.mode() != FORKSERVER_FORK_JOIN_SANDBOX_UNWIND, request.hostname(),
avoid_pivot_root);
}
} // namespace sandbox2

View File

@ -70,19 +70,18 @@ class ForkServer {
pid_t ServeRequest() const;
private:
// Analyzes the PB received, and execute the process. If kept_fds is
// non-nullptr, it specifies a list of file descriptors to be kept open after
// sanitization call is done, the remaining file descriptors will be closed.
static void LaunchChild(const ForkRequest& request, int execve_fd,
int client_fd, uid_t uid, gid_t gid, int user_ns_fd,
int signaling_fd);
// Creates and launches the child process.
void LaunchChild(const ForkRequest& request, int execve_fd, int client_fd,
uid_t uid, gid_t gid, int user_ns_fd, int signaling_fd,
bool avoid_pivot_root) const;
// Prepares the Fork-Server (worker side, not the requester side) for work by
// sanitizing the environment:
// - go down if the parent goes down,
// - become subreaper - PR_SET_CHILD_SUBREAPER (man prctl),
// - don't convert children processes into zombies if they terminate.
static bool Initialize();
// - create initial namespaces
bool Initialize();
// Prepares arguments for the upcoming execve (if execve was requested).
static void PrepareExecveArgs(const ForkRequest& request,
@ -98,11 +97,13 @@ class ForkServer {
// Runs namespace initializers for a sandboxee.
static void InitializeNamespaces(const ForkRequest& request, uid_t uid,
gid_t gid);
gid_t gid, bool avoid_pivot_root);
// Comms channel which is used to send requests to this class. Not owned by
// the object.
Comms* comms_;
int initial_mntns_fd_ = -1;
int initial_userns_fd_ = -1;
};
} // namespace sandbox2

View File

@ -31,9 +31,9 @@
#include <cstring>
#include <utility>
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "sandboxed_api/sandbox2/util.h"
#include "sandboxed_api/sandbox2/util/fileops.h"
@ -62,7 +62,6 @@ int MountFallbackToReadOnly(const char* source, const char* target,
}
return rv;
}
} // namespace
void PrepareChroot(const Mounts& mounts) {
// Create a tmpfs mount for the new rootfs.
@ -83,27 +82,31 @@ void PrepareChroot(const Mounts& mounts) {
}
void TryDenySetgroups() {
int fd = open("/proc/self/setgroups", O_WRONLY);
file_util::fileops::FDCloser fd(
TEMP_FAILURE_RETRY(open("/proc/self/setgroups", O_WRONLY | O_CLOEXEC)));
// We ignore errors since they are most likely due to an old kernel.
if (fd == -1) {
if (fd.get() == -1) {
return;
}
file_util::fileops::FDCloser fd_closer{fd};
dprintf(fd, "deny");
dprintf(fd.get(), "deny");
}
void WriteIDMap(const char* map_path, int32_t uid) {
int fd = open(map_path, O_WRONLY);
SAPI_RAW_PCHECK(fd != -1, "Couldn't open %s", map_path);
file_util::fileops::FDCloser fd(
TEMP_FAILURE_RETRY(open(map_path, O_WRONLY | O_CLOEXEC)));
SAPI_RAW_PCHECK(fd.get() != -1, "Couldn't open %s", map_path);
file_util::fileops::FDCloser fd_closer{fd};
SAPI_RAW_PCHECK(dprintf(fd, "1000 %d 1", uid) >= 0,
SAPI_RAW_PCHECK(dprintf(fd.get(), "1000 %d 1", uid) >= 0,
"Could not write %d to %s", uid, map_path);
}
// Writes the user and group ID mappings for the calling process.
// TryDenySetgroups() runs before the gid_map write: per user_namespaces(7),
// setgroups has to be denied before an unprivileged process may write
// gid_map. Each WriteIDMap() call then writes a single "1000 <id> 1" entry
// to the given /proc/self map file.
void SetupIDMaps(uid_t uid, gid_t gid) {
TryDenySetgroups();
WriteIDMap("/proc/self/uid_map", uid);
WriteIDMap("/proc/self/gid_map", gid);
}
void ActivateLoopbackInterface() {
ifreq ifreq;
@ -188,6 +191,8 @@ void LogFilesystem(const std::string& dir) {
}
}
} // namespace
Namespace::Namespace(bool allow_unrestricted_networking, Mounts mounts,
std::string hostname)
: clone_flags_(CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWPID |
@ -205,12 +210,10 @@ int32_t Namespace::GetCloneFlags() const { return clone_flags_; }
void Namespace::InitializeNamespaces(uid_t uid, gid_t gid, int32_t clone_flags,
const Mounts& mounts, bool mount_proc,
const std::string& hostname) {
if (clone_flags & CLONE_NEWUSER) {
// Set up the uid and gid map.
TryDenySetgroups();
WriteIDMap("/proc/self/uid_map", uid);
WriteIDMap("/proc/self/gid_map", gid);
const std::string& hostname,
bool avoid_pivot_root) {
if (clone_flags & CLONE_NEWUSER && !avoid_pivot_root) {
SetupIDMaps(uid, gid);
}
if (!(clone_flags & CLONE_NEWNS)) {
@ -218,6 +221,18 @@ void Namespace::InitializeNamespaces(uid_t uid, gid_t gid, int32_t clone_flags,
return;
}
std::unique_ptr<file_util::fileops::FDCloser> root_fd;
if (avoid_pivot_root) {
// We want to bind-mount chrooted to real root, so that symlinks work.
// Reference to main root is kept to escape later from the chroot
root_fd = absl::make_unique<file_util::fileops::FDCloser>(
TEMP_FAILURE_RETRY(open("/", O_PATH)));
SAPI_RAW_CHECK(root_fd->get() != -1, "creating fd for main root");
SAPI_RAW_PCHECK(chroot("/realroot") != -1, "chrooting to real root");
SAPI_RAW_PCHECK(chdir("/") != -1, "chdir / after chrooting real root");
}
SAPI_RAW_PCHECK(
!mount_proc || mount("", "/proc", "proc",
MS_NODEV | MS_NOEXEC | MS_NOSUID, nullptr) != -1,
@ -242,17 +257,70 @@ void Namespace::InitializeNamespaces(uid_t uid, gid_t gid, int32_t clone_flags,
PrepareChroot(mounts);
// This requires some explanation: It's actually possible to pivot_root('/',
// '/'). After this operation has been completed, the old root is mounted over
// the new root, and it's OK to simply umount('/') now, and to have new_root
// as '/'. This allows us not care about providing any special directory for
// old_root, which is sometimes not easy, given that e.g. /tmp might not
// always be present inside new_root.
SAPI_RAW_PCHECK(
syscall(__NR_pivot_root, kSandbox2ChrootPath, kSandbox2ChrootPath) != -1,
"pivot root");
SAPI_RAW_PCHECK(umount2("/", MNT_DETACH) != -1, "detaching old root");
SAPI_RAW_PCHECK(chdir("/") == 0, "changing cwd after pivot_root failed");
if (avoid_pivot_root) {
// Keep a reference to /proc/self as it might not be mounted later
file_util::fileops::FDCloser proc_self_fd(
TEMP_FAILURE_RETRY(open("/proc/self/", O_PATH)));
SAPI_RAW_PCHECK(proc_self_fd.get() != -1, "opening /proc/self");
// Return to the main root
SAPI_RAW_PCHECK(fchdir(root_fd->get()) != -1, "chdir to main root");
SAPI_RAW_PCHECK(chroot(".") != -1, "chrooting to main root");
SAPI_RAW_PCHECK(chdir("/") != -1, "chdir / after chrooting main root");
// Get a reference to /realroot to umount it later
file_util::fileops::FDCloser realroot_fd(
TEMP_FAILURE_RETRY(open("/realroot", O_PATH)));
// Move the chroot out of realroot to /
std::string chroot_path = file::JoinPath("/realroot", kSandbox2ChrootPath);
SAPI_RAW_PCHECK(chdir(chroot_path.c_str()) != -1, "chdir to chroot");
SAPI_RAW_PCHECK(mount(".", "/", "", MS_MOVE, nullptr) == 0,
"moving rootfs failed");
SAPI_RAW_PCHECK(chroot(".") != -1, "chrooting moved chroot");
SAPI_RAW_PCHECK(chdir("/") != -1, "chdir / after chroot");
// Umount the realroot so that no reference is left
SAPI_RAW_PCHECK(fchdir(realroot_fd.get()) != -1, "fchdir to /realroot");
SAPI_RAW_PCHECK(umount2(".", MNT_DETACH) != -1, "detaching old root");
if (clone_flags & CLONE_NEWUSER) {
// Also CLONE_NEWNS so that / mount becomes locked
SAPI_RAW_PCHECK(unshare(CLONE_NEWUSER | CLONE_NEWNS) != -1,
"unshare(CLONE_NEWUSER | CLONE_NEWNS)");
// Setup ID maps using reference to /proc/self obtained earlier
file_util::fileops::FDCloser setgroups_fd(TEMP_FAILURE_RETRY(
openat(proc_self_fd.get(), "setgroups", O_WRONLY | O_CLOEXEC)));
// We ignore errors since they are most likely due to an old kernel.
if (setgroups_fd.get() != -1) {
dprintf(setgroups_fd.get(), "deny");
}
file_util::fileops::FDCloser uid_map_fd(
TEMP_FAILURE_RETRY(openat(proc_self_fd.get(), "uid_map", O_WRONLY)));
SAPI_RAW_PCHECK(uid_map_fd.get() != -1, "Couldn't open uid_map");
SAPI_RAW_PCHECK(dprintf(uid_map_fd.get(), "1000 1000 1") >= 0,
"Could not write uid_map");
file_util::fileops::FDCloser gid_map_fd(
TEMP_FAILURE_RETRY(openat(proc_self_fd.get(), "gid_map", O_WRONLY)));
SAPI_RAW_PCHECK(gid_map_fd.get() != -1, "Couldn't open gid_map");
SAPI_RAW_PCHECK(dprintf(gid_map_fd.get(), "1000 1000 1") >= 0,
"Could not write gid_map");
}
} else {
// This requires some explanation: It's actually possible to pivot_root('/',
// '/'). After this operation has been completed, the old root is mounted
// over the new root, and it's OK to simply umount('/') now, and to have
// new_root as '/'. This allows us not to care about providing any
// special directory for old_root, which is sometimes not easy, given that
// e.g. /tmp might not always be present inside new_root.
SAPI_RAW_PCHECK(syscall(__NR_pivot_root, kSandbox2ChrootPath,
kSandbox2ChrootPath) != -1,
"pivot root");
SAPI_RAW_PCHECK(umount2("/", MNT_DETACH) != -1, "detaching old root");
}
SAPI_RAW_PCHECK(chdir("/") == 0,
"changing cwd after mntns initialization failed");
if (SAPI_VLOG_IS_ON(2)) {
SAPI_RAW_VLOG(2, "Dumping the sandboxee's filesystem:");
@ -260,6 +328,24 @@ void Namespace::InitializeNamespaces(uid_t uid, gid_t gid, int32_t clone_flags,
}
}
// Sets up the root filesystem layout of the initial user and mount namespaces
// for the calling process:
//  - writes the uid/gid maps via SetupIDMaps(),
//  - mounts a tmpfs on kSandbox2ChrootPath and pivots into it, with the
//    previous root ending up under /realroot,
//  - symlinks /proc to /realroot/proc,
//  - remounts the new root read-only.
// The mkdir, mount, pivot_root and symlink steps are each checked with
// SAPI_RAW_CHECK / SAPI_RAW_PCHECK.
void Namespace::InitializeInitialNamespaces(uid_t uid, gid_t gid) {
SetupIDMaps(uid, gid);
// Create the mount point and back it with a tmpfs that becomes the new root.
SAPI_RAW_CHECK(util::CreateDirRecursive(kSandbox2ChrootPath, 0700),
"could not create directory for rootfs");
SAPI_RAW_PCHECK(mount("none", kSandbox2ChrootPath, "tmpfs", 0, nullptr) == 0,
"mounting rootfs failed");
// Directory inside the new root that receives the old root on pivot_root
// (pivot_root(2) requires put_old to be at or underneath new_root).
auto realroot_path = file::JoinPath(kSandbox2ChrootPath, "/realroot");
SAPI_RAW_CHECK(util::CreateDirRecursive(realroot_path, 0700),
"could not create directory for real root");
// Swap roots: the tmpfs becomes "/", the previous root moves to /realroot.
SAPI_RAW_PCHECK(syscall(__NR_pivot_root, kSandbox2ChrootPath,
realroot_path.c_str()) != -1,
"pivot root");
// This function mounts no procfs in the new root; /proc is made a symlink to
// the old root's /proc instead.
SAPI_RAW_PCHECK(symlink("/realroot/proc", "/proc") != -1, "symlinking /proc");
// Bind-remount "/" with MS_RDONLY so the new root is read-only from here on.
SAPI_RAW_PCHECK(
mount("/", "/", "", MS_BIND | MS_REMOUNT | MS_RDONLY, nullptr) == 0,
"remounting rootfs read-only failed");
}
void Namespace::GetNamespaceDescription(NamespaceDescription* pb_description) {
pb_description->set_clone_flags(clone_flags_);
*pb_description->mutable_mount_tree_mounts() = mounts_.GetMountTree();

View File

@ -35,7 +35,9 @@ class Namespace final {
// Performs the namespace setup (mounts, write the uid_map, etc.).
static void InitializeNamespaces(uid_t uid, gid_t gid, int32_t clone_flags,
const Mounts& mounts, bool mount_proc,
const std::string& hostname);
const std::string& hostname,
bool avoid_pivot_root);
static void InitializeInitialNamespaces(uid_t uid, gid_t gid);
Namespace() = delete;
Namespace(const Namespace&) = delete;