termio: ensure we kill all the children without killing ourselves

Fixes #497 This commit resolves two bugs: First, if a surface is created and destroyed too quickly, the child process may not have called `setsid` yet (to set the process group). In this case, `getpgid` returns Ghostty's process group and we were killing ourselves. We now detect we're about to kill ourselves and wait for our child to be ready to be killed. Second, if the child calls setsid but is in the process of execve when we send a killpg, then the child will be killed but any newly spawned grandchildren will remain alive. To fix this, we moved the waitpid call into a kill loop so we can repeatedly kill our child if we detect any grandchildren are still alive.
2025-07-14 07:46:12 +03:00 · 2023-12-27 21:23:24 -08:00
parent 4f31af13ad
commit e32c031588
1 changed files with 62 additions and 22 deletions
--- a/src/termio/Exec.zig
+++ b/src/termio/Exec.zig
@ -1102,10 +1102,10 @@ const Subprocess = struct {
    pub fn stop(self: *Subprocess) void {
        // Kill our command
        if (self.command) |*cmd| {
+            // Note: this will also wait for the command to exit, so
+            // DO NOT call cmd.wait
            killCommand(cmd) catch |err|
                log.err("error sending SIGHUP to command, may hang: {}", .{err});
-            _ = cmd.wait(false) catch |err|
-                log.err("error waiting for command to exit: {}", .{err});
            self.command = null;
        }

@ -1141,7 +1141,8 @@ const Subprocess = struct {
    }

    /// Kill the underlying subprocess. This sends a SIGHUP to the child
-    /// process. This doesn't wait for the child process to be exited.
+    /// process. This also waits for the command to exit and will return the
+    /// exit code.
    fn killCommand(command: *Command) !void {
        if (command.pid) |pid| {
            switch (builtin.os.tag) {
@ -1149,38 +1150,77 @@ const Subprocess = struct {
                    if (windows.kernel32.TerminateProcess(pid, 0) == 0) {
                        return windows.unexpectedError(windows.kernel32.GetLastError());
                    }
+
+                    _ = try command.wait(false);
                },

-                else => {
-                    const pgid_: ?c.pid_t = pgid: {
-                        const pgid = c.getpgid(pid);
+                else => if (getpgid(pid)) |pgid| {
+                    log.warn(
+                        "state killpg pid={} pgid={} mypid={} mypgid={}",
+                        .{ pid, pgid, c.getpid(), c.getpgid(0) },
+                    );

-                        // Don't know why it would be zero but its not a valid pid
-                        if (pgid == 0) break :pgid null;
-
-                        // If the pid doesn't exist then... okay.
-                        if (pgid == c.ESRCH) break :pgid null;
-
-                        // If we have an error...
-                        if (pgid < 0) {
-                            log.warn("error getting pgid for kill", .{});
-                            break :pgid null;
-                        }
-
-                        break :pgid pgid;
-                    };
-
-                    if (pgid_) |pgid| {
+                    // It is possible to send a killpg between the time that
+                    // our child process calls setsid but before or simultaneous
+                    // to calling execve. In this case, the direct child dies
+                    // but grandchildren survive. To work around this, we loop
+                    // and repeatedly kill the process group until all
+                    // descendents are well and truly dead. We will not rest
+                    // until the entire family tree is obliterated.
+                    while (true) {
                        if (c.killpg(pgid, c.SIGHUP) < 0) {
                            log.warn("error killing process group pgid={}", .{pgid});
                            return error.KillFailed;
                        }
+
+                        // See Command.zig wait for why we specify WNOHANG.
+                        // The gist is that it lets us detect when children
+                        // are still alive without blocking so that we can
+                        // kill them again.
+                        const res = std.os.waitpid(pid, std.c.W.NOHANG);
+                        if (res.pid != 0) break;
                    }
                },
            }
        }
    }

+    fn getpgid(pid: c.pid_t) ?c.pid_t {
+        // Get our process group ID. Before the child pid calls setsid
+        // the pgid will be ours because we forked it. Its possible that
+        // we may be calling this before setsid if we are killing a surface
+        // VERY quickly after starting it.
+        const my_pgid = c.getpgid(0);
+
+        // We loop while pgid == my_pgid. The expectation if we have a valid
+        // pid is that setsid will eventually be called because it is the
+        // FIRST thing the child process does and as far as I can tell,
+        // setsid cannot fail. I'm sure that's not true, but I'd rather
+        // have a bug reported than defensively program against it now.
+        while (true) {
+            const pgid = c.getpgid(pid);
+            if (pgid == my_pgid) {
+                log.warn("pgid is our own, retrying", .{});
+                std.time.sleep(10 * std.time.ns_per_ms);
+                continue;
+            }
+
+            // Don't know why it would be zero but its not a valid pid
+            if (pgid == 0) return null;
+
+            // If the pid doesn't exist then... we're done!
+            if (pgid == c.ESRCH) return null;
+
+            // If we have an error we're done.
+            if (pgid < 0) {
+                log.warn("error getting pgid for kill", .{});
+                return null;
+            }
+
+            return pgid;
+        }
+    }
+
    /// Kill the underlying process started via Flatpak host command.
    /// This sends a signal via the Flatpak API.
    fn killCommandFlatpak(command: *FlatpakHostCommand) !void {