Use clone3 / CLONE_INTO_CGROUP on Linux

Use clone3 / CLONE_INTO_CGROUP to have the Linux kernel create the process in the
correct cgroup rather than move the process into the cgroup after it is created.
This commit is contained in:
Jeffrey C. Ollie
2024-06-07 23:48:03 -06:00
parent 0d94fb61c9
commit e6f97c28f8
3 changed files with 74 additions and 9 deletions

View File

@ -19,6 +19,7 @@ const Command = @This();
const std = @import("std");
const builtin = @import("builtin");
const internal_os = @import("os/main.zig");
const termio = @import("termio.zig");
const windows = internal_os.windows;
const TempDir = internal_os.TempDir;
const mem = std.mem;
@ -32,6 +33,8 @@ const EnvMap = std.process.EnvMap;
const PreExecFn = fn (*Command) void;
const log = std.log.scoped(.command);
/// Path to the command to run. This must be an absolute path. This
/// library does not do PATH lookup.
path: []const u8,
@ -61,6 +64,8 @@ stderr: ?File = null,
/// exec process takes over, such as signal handlers, setsid, setuid, etc.
pre_exec: ?*const PreExecFn = null,
linux_cgroup: termio.Options.LinuxCgroup = termio.Options.linux_cgroup_default,
/// If set, then the process will be created attached to this pseudo console.
/// `stdin`, `stdout`, and `stderr` will be ignored if set.
pseudo_console: if (builtin.os.tag == .windows) ?windows.exp.HPCON else void =
@ -133,8 +138,11 @@ fn startPosix(self: *Command, arena: Allocator) !void {
else
@compileError("missing env vars");
// Fork
const pid = try posix.fork();
const pid: linux.pid_t = switch (builtin.os.tag) {
.linux => if (self.linux_cgroup) |cgroup| try internal_os.cgroup.cloneInto(cgroup) else try posix.fork(),
else => try posix.fork(),
};
if (pid != 0) {
// Parent, return immediately.
self.pid = @intCast(pid);

View File

@ -1,7 +1,11 @@
const std = @import("std");
const assert = std.debug.assert;
const linux = std.os.linux;
const posix = std.posix;
const Allocator = std.mem.Allocator;
const log = std.log.scoped(.@"linux-cgroup");
/// Returns the path to the cgroup for the given pid.
pub fn current(alloc: Allocator, pid: std.os.linux.pid_t) !?[]const u8 {
var buf: [std.fs.MAX_PATH_BYTES]u8 = undefined;
@ -64,6 +68,65 @@ pub fn moveInto(
try file.writer().print("{}", .{pid});
}
/// Use clone3 with CLONE_INTO_CGROUP to have the kernel create a new process
/// directly in the given cgroup rather than moving the process into the
/// cgroup after it has been created.
///
/// `cgroup` is appended verbatim to "/sys/fs/cgroup" to build the directory
/// path, so it is expected to start with a '/'.
///
/// Returns the raw clone3 result on success: the caller treats a non-zero
/// value as "we are the parent, this is the child's pid" and zero as "we are
/// the child", exactly like fork.
///
/// If the cgroup directory cannot be opened the failure is logged and the
/// process is still cloned, just without CLONE_INTO_CGROUP (best effort).
///
/// Errors:
///   - error.NoSpaceLeft if the cgroup path does not fit in the path buffer.
///   - error.CloneError if the clone3 syscall itself fails.
pub fn cloneInto(cgroup: []const u8) !posix.pid_t {
    var buf: [std.fs.MAX_PATH_BYTES]u8 = undefined;
    const path = try std.fmt.bufPrintZ(&buf, "/sys/fs/cgroup{s}", .{cgroup});

    // Get a file descriptor that refers to the cgroup directory in the cgroup
    // sysfs to pass to the kernel in clone3. `null` means the directory could
    // not be opened and we fall back to a plain clone.
    //
    // The return value of the raw syscall wrapper is decoded with
    // `linux.E.init` rather than `posix.errno`: when libc is linked,
    // `posix.errno` consults libc's errno, which a direct syscall never sets.
    const cgroup_fd: ?linux.fd_t = open: {
        const rc = linux.open(path, linux.O{ .PATH = true, .DIRECTORY = true }, 0);
        switch (linux.E.init(rc)) {
            .SUCCESS => break :open @as(linux.fd_t, @intCast(rc)),
            else => |errno| {
                log.err("unable to open cgroup dir {s}: {}", .{ path, errno });
                break :open null;
            },
        }
    };

    // The descriptor is only needed for the duration of the clone3 call.
    // Both the parent and the child return through this function, so this
    // closes the descriptor in both processes and on the error path.
    defer if (cgroup_fd) |fd| posix.close(fd);

    // Mirrors the kernel's `struct clone_args`; every field is a 64-bit value.
    const args: extern struct {
        flags: u64,
        pidfd: u64,
        child_tid: u64,
        parent_tid: u64,
        exit_signal: u64,
        stack: u64,
        stack_size: u64,
        tls: u64,
        set_tid: u64,
        set_tid_size: u64,
        cgroup: u64,
    } = .{
        .flags = if (cgroup_fd != null) linux.CLONE.INTO_CGROUP else 0,
        .pidfd = 0,
        .child_tid = 0,
        .parent_tid = 0,
        .exit_signal = linux.SIG.CHLD,
        .stack = 0,
        .stack_size = 0,
        .tls = 0,
        .set_tid = 0,
        .set_tid_size = 0,
        .cgroup = if (cgroup_fd) |fd| @intCast(fd) else 0,
    };

    const rc = linux.syscall2(linux.SYS.clone3, @intFromPtr(&args), @sizeOf(@TypeOf(args)));
    switch (linux.E.init(rc)) {
        .SUCCESS => return @as(posix.pid_t, @intCast(rc)),
        else => |errno| {
            log.err("unable to clone: {}", .{errno});
            return error.CloneError;
        },
    }
}
/// Returns all available cgroup controllers for the given cgroup.
/// The cgroup should have a '/'-prefix.
///

View File

@ -1315,6 +1315,7 @@ const Subprocess = struct {
}
}).callback,
.data = self,
.linux_cgroup = self.linux_cgroup,
};
try cmd.start(alloc);
errdefer killCommand(&cmd) catch |err| {
@ -1345,13 +1346,6 @@ const Subprocess = struct {
/// Performs the subprocess setup steps shown below: pty setup first, then
/// (on Linux only) joining the configured cgroup. Any failure is propagated
/// to the caller via `try`.
/// NOTE(review): the name suggests this runs in the child before exec;
/// confirm against the call site.
fn childPreExec(self: *Subprocess) !void {
// Setup our pty. `.?` asserts that `self.pty` is non-null at this point.
try self.pty.?.childPreExec();
// If we have a cgroup set, then we want to move into that cgroup.
// The OS check is comptime, so this branch only exists in Linux builds.
if (comptime builtin.os.tag == .linux) {
if (self.linux_cgroup) |cgroup| {
// A pid of 0 is passed here; presumably that denotes the calling
// process. TODO confirm against moveInto.
try internal_os.cgroup.moveInto(cgroup, 0);
}
}
}
/// Called to notify that we exited externally so we can unset our