From e6f97c28f8b3ff46b3685108539c47353c328d88 Mon Sep 17 00:00:00 2001
From: "Jeffrey C. Ollie"
Date: Fri, 7 Jun 2024 23:48:03 -0600
Subject: [PATCH] Use clone3 / CLONE_INTO_CGROUP on Linux

Use clone3 / CLONE_INTO_CGROUP to have the Linux kernel create the process in the
correct cgroup rather than move the process into the cgroup after it is created.
---
 src/Command.zig     | 14 ++++++++--
 src/os/cgroup.zig   | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/termio/Exec.zig |  8 +-----
 3 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/src/Command.zig b/src/Command.zig
index 505fe3f85..7c7eb85f1 100644
--- a/src/Command.zig
+++ b/src/Command.zig
@@ -19,6 +19,7 @@ const Command = @This();
 const std = @import("std");
 const builtin = @import("builtin");
 const internal_os = @import("os/main.zig");
+const termio = @import("termio.zig");
 const windows = internal_os.windows;
 const TempDir = internal_os.TempDir;
 const mem = std.mem;
@@ -32,6 +33,8 @@ const EnvMap = std.process.EnvMap;
 
 const PreExecFn = fn (*Command) void;
 
+const log = std.log.scoped(.command);
+
 /// Path to the command to run. This must be an absolute path. This
 /// library does not do PATH lookup.
 path: []const u8,
@@ -61,6 +64,10 @@ stderr: ?File = null,
 /// exec process takes over, such as signal handlers, setsid, setuid, etc.
 pre_exec: ?*const PreExecFn = null,
 
+/// On Linux, the cgroup to create the child process in. When non-null the
+/// child is created with clone3 / CLONE_INTO_CGROUP instead of fork.
+linux_cgroup: termio.Options.LinuxCgroup = termio.Options.linux_cgroup_default,
+
 /// If set, then the process will be created attached to this pseudo console.
 /// `stdin`, `stdout`, and `stderr` will be ignored if set.
 pseudo_console: if (builtin.os.tag == .windows) ?windows.exp.HPCON else void =
@@ -133,8 +140,11 @@ fn startPosix(self: *Command, arena: Allocator) !void {
     else
         @compileError("missing env vars");
 
-    // Fork
-    const pid = try posix.fork();
+    const pid: linux.pid_t = switch (builtin.os.tag) {
+        .linux => if (self.linux_cgroup) |cgroup| try internal_os.cgroup.cloneInto(cgroup) else try posix.fork(),
+        else => try posix.fork(),
+    };
+
     if (pid != 0) {
         // Parent, return immediately.
         self.pid = @intCast(pid);
diff --git a/src/os/cgroup.zig b/src/os/cgroup.zig
index aee772954..5afc8cdca 100644
--- a/src/os/cgroup.zig
+++ b/src/os/cgroup.zig
@@ -1,7 +1,11 @@
 const std = @import("std");
 const assert = std.debug.assert;
+const linux = std.os.linux;
+const posix = std.posix;
 const Allocator = std.mem.Allocator;
 
+const log = std.log.scoped(.@"linux-cgroup");
+
 /// Returns the path to the cgroup for the given pid.
 pub fn current(alloc: Allocator, pid: std.os.linux.pid_t) !?[]const u8 {
     var buf: [std.fs.MAX_PATH_BYTES]u8 = undefined;
@@ -64,6 +68,77 @@ pub fn moveInto(
     try file.writer().print("{}", .{pid});
 }
 
+/// Use clone3 to have the kernel create a new process with the correct cgroup
+/// rather than moving the process to the correct cgroup later.
+///
+/// `cgroup` is the cgroup path relative to the cgroup sysfs root and should
+/// have a '/'-prefix. Returns the child pid in the parent and 0 in the child,
+/// like fork. If the cgroup directory cannot be opened the failure is logged
+/// and the process is cloned without CLONE_INTO_CGROUP. Returns
+/// error.CloneError if clone3 itself fails.
+pub fn cloneInto(cgroup: []const u8) !posix.pid_t {
+    var buf: [std.fs.MAX_PATH_BYTES]u8 = undefined;
+    const path = try std.fmt.bufPrintZ(&buf, "/sys/fs/cgroup{s}", .{cgroup});
+
+    // Get a file descriptor that refers to the cgroup directory in the cgroup
+    // sysfs to pass to the kernel in clone3.
+    //
+    // linux.open and linux.syscall2 are raw syscalls that report failure as
+    // -errno in the return value, so decode them with linux.E.init.
+    // posix.errno consults the libc errno when libc is linked, and raw
+    // syscalls never set it.
+    const fd: linux.fd_t = fd: {
+        const rc = linux.open(path, linux.O{ .PATH = true, .DIRECTORY = true }, 0);
+        switch (linux.E.init(rc)) {
+            .SUCCESS => break :fd @as(linux.fd_t, @intCast(rc)),
+            else => |errno| {
+                log.err("unable to open cgroup dir {s}: {}", .{ path, errno });
+                break :fd -1;
+            },
+        }
+    };
+
+    // The descriptor is only needed for the duration of the clone3 call.
+    // Close it on every return path; this runs in both the parent and the
+    // child so the program that is exec'd later does not inherit it.
+    defer if (fd >= 0) posix.close(fd);
+
+    const args: extern struct {
+        flags: u64,
+        pidfd: u64,
+        child_tid: u64,
+        parent_tid: u64,
+        exit_signal: u64,
+        stack: u64,
+        stack_size: u64,
+        tls: u64,
+        set_tid: u64,
+        set_tid_size: u64,
+        cgroup: u64,
+    } = .{
+        .flags = if (fd >= 0) linux.CLONE.INTO_CGROUP else 0,
+        .pidfd = 0,
+        .child_tid = 0,
+        .parent_tid = 0,
+        .exit_signal = linux.SIG.CHLD,
+        .stack = 0,
+        .stack_size = 0,
+        .tls = 0,
+        .set_tid = 0,
+        .set_tid_size = 0,
+        .cgroup = if (fd >= 0) @intCast(fd) else 0,
+    };
+
+    const rc = linux.syscall2(linux.SYS.clone3, @intFromPtr(&args), @sizeOf(@TypeOf(args)));
+    switch (linux.E.init(rc)) {
+        .SUCCESS => return @as(posix.pid_t, @intCast(rc)),
+        else => |errno| {
+            log.err("unable to clone: {}", .{errno});
+            return error.CloneError;
+        },
+    }
+}
+
 /// Returns all available cgroup controllers for the given cgroup.
 /// The cgroup should have a '/'-prefix.
 ///
diff --git a/src/termio/Exec.zig b/src/termio/Exec.zig
index d6342deb3..619babb47 100644
--- a/src/termio/Exec.zig
+++ b/src/termio/Exec.zig
@@ -1315,6 +1315,7 @@ const Subprocess = struct {
                 }
             }).callback,
             .data = self,
+            .linux_cgroup = self.linux_cgroup,
         };
         try cmd.start(alloc);
         errdefer killCommand(&cmd) catch |err| {
@@ -1345,13 +1346,6 @@ const Subprocess = struct {
     fn childPreExec(self: *Subprocess) !void {
         // Setup our pty
         try self.pty.?.childPreExec();
-
-        // If we have a cgroup set, then we want to move into that cgroup.
-        if (comptime builtin.os.tag == .linux) {
-            if (self.linux_cgroup) |cgroup| {
-                try internal_os.cgroup.moveInto(cgroup, 0);
-            }
-        }
     }
 
     /// Called to notify that we exited externally so we can unset our