termio: ensure we kill all the children without killing ourselves

Fixes #497

This commit resolves two bugs:

First, if a surface is created and destroyed too quickly, the child
process may not have called `setsid` yet (to set the process group). In
this case, `getpgid` returns Ghostty's process group and we were killing
ourselves. We now detect we're about to kill ourselves and wait for our
child to be ready to be killed.

Second, if the child calls setsid but is in the process of execve when
we send a killpg, then the child will be killed but any newly spawned
grandchildren will remain alive. To fix this, we moved the waitpid call
into a kill loop so we can repeatedly kill our child if we detect any
grandchildren are still alive.
This commit is contained in:
Mitchell Hashimoto
2023-12-27 21:23:24 -08:00
parent 4f31af13ad
commit e32c031588

View File

@ -1102,10 +1102,10 @@ const Subprocess = struct {
pub fn stop(self: *Subprocess) void { pub fn stop(self: *Subprocess) void {
// Kill our command // Kill our command
if (self.command) |*cmd| { if (self.command) |*cmd| {
// Note: this will also wait for the command to exit, so
// DO NOT call cmd.wait
killCommand(cmd) catch |err| killCommand(cmd) catch |err|
log.err("error sending SIGHUP to command, may hang: {}", .{err}); log.err("error sending SIGHUP to command, may hang: {}", .{err});
_ = cmd.wait(false) catch |err|
log.err("error waiting for command to exit: {}", .{err});
self.command = null; self.command = null;
} }
@ -1141,7 +1141,8 @@ const Subprocess = struct {
} }
/// Kill the underlying subprocess. This sends a SIGHUP to the child /// Kill the underlying subprocess. This sends a SIGHUP to the child
/// process. This doesn't wait for the child process to be exited. /// process. This also waits for the command to exit and will return the
/// exit code.
fn killCommand(command: *Command) !void { fn killCommand(command: *Command) !void {
if (command.pid) |pid| { if (command.pid) |pid| {
switch (builtin.os.tag) { switch (builtin.os.tag) {
@ -1149,38 +1150,77 @@ const Subprocess = struct {
if (windows.kernel32.TerminateProcess(pid, 0) == 0) { if (windows.kernel32.TerminateProcess(pid, 0) == 0) {
return windows.unexpectedError(windows.kernel32.GetLastError()); return windows.unexpectedError(windows.kernel32.GetLastError());
} }
_ = try command.wait(false);
}, },
else => { else => if (getpgid(pid)) |pgid| {
const pgid_: ?c.pid_t = pgid: { log.warn(
const pgid = c.getpgid(pid); "state killpg pid={} pgid={} mypid={} mypgid={}",
.{ pid, pgid, c.getpid(), c.getpgid(0) },
);
// Don't know why it would be zero but its not a valid pid // It is possible to send a killpg between the time that
if (pgid == 0) break :pgid null; // our child process calls setsid but before or simultaneous
// to calling execve. In this case, the direct child dies
// If the pid doesn't exist then... okay. // but grandchildren survive. To work around this, we loop
if (pgid == c.ESRCH) break :pgid null; // and repeatedly kill the process group until all
// descendents are well and truly dead. We will not rest
// If we have an error... // until the entire family tree is obliterated.
if (pgid < 0) { while (true) {
log.warn("error getting pgid for kill", .{});
break :pgid null;
}
break :pgid pgid;
};
if (pgid_) |pgid| {
if (c.killpg(pgid, c.SIGHUP) < 0) { if (c.killpg(pgid, c.SIGHUP) < 0) {
log.warn("error killing process group pgid={}", .{pgid}); log.warn("error killing process group pgid={}", .{pgid});
return error.KillFailed; return error.KillFailed;
} }
// See Command.zig wait for why we specify WNOHANG.
// The gist is that it lets us detect when children
// are still alive without blocking so that we can
// kill them again.
const res = std.os.waitpid(pid, std.c.W.NOHANG);
if (res.pid != 0) break;
} }
}, },
} }
} }
} }
fn getpgid(pid: c.pid_t) ?c.pid_t {
// Get our process group ID. Before the child pid calls setsid
// the pgid will be ours because we forked it. Its possible that
// we may be calling this before setsid if we are killing a surface
// VERY quickly after starting it.
const my_pgid = c.getpgid(0);
// We loop while pgid == my_pgid. The expectation if we have a valid
// pid is that setsid will eventually be called because it is the
// FIRST thing the child process does and as far as I can tell,
// setsid cannot fail. I'm sure that's not true, but I'd rather
// have a bug reported than defensively program against it now.
while (true) {
const pgid = c.getpgid(pid);
if (pgid == my_pgid) {
log.warn("pgid is our own, retrying", .{});
std.time.sleep(10 * std.time.ns_per_ms);
continue;
}
// Don't know why it would be zero but its not a valid pid
if (pgid == 0) return null;
// If the pid doesn't exist then... we're done!
if (pgid == c.ESRCH) return null;
// If we have an error we're done.
if (pgid < 0) {
log.warn("error getting pgid for kill", .{});
return null;
}
return pgid;
}
}
/// Kill the underlying process started via Flatpak host command. /// Kill the underlying process started via Flatpak host command.
/// This sends a signal via the Flatpak API. /// This sends a signal via the Flatpak API.
fn killCommandFlatpak(command: *FlatpakHostCommand) !void { fn killCommandFlatpak(command: *FlatpakHostCommand) !void {