From 99ed984af2a12340a7b5b17326bc037044524ba4 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Wed, 9 Jul 2025 14:31:47 -0700
Subject: [PATCH] benchmark: add GraphemeBreak and TerminalParser benchmarks

---
 src/benchmark/GraphemeBreak.zig  | 146 +++++++++++++++++++++++++++++++
 src/benchmark/TerminalParser.zig | 106 ++++++++++++++++++++++
 src/benchmark/cli.zig            |   4 +
 src/benchmark/main.zig           |   2 +
 4 files changed, 258 insertions(+)
 create mode 100644 src/benchmark/GraphemeBreak.zig
 create mode 100644 src/benchmark/TerminalParser.zig

diff --git a/src/benchmark/GraphemeBreak.zig b/src/benchmark/GraphemeBreak.zig
new file mode 100644
index 000000000..57effebe4
--- /dev/null
+++ b/src/benchmark/GraphemeBreak.zig
@@ -0,0 +1,146 @@
+//! This benchmark tests the throughput of grapheme break calculation.
+//! This is a common operation in terminal character printing for terminals
+//! that support grapheme clustering.
+const GraphemeBreak = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const Benchmark = @import("Benchmark.zig");
+const options = @import("options.zig");
+const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
+const unicode = @import("../unicode/main.zig");
+
+const log = std.log.scoped(.@"grapheme-break-bench"); // scope named after this benchmark
+
+opts: Options, // Configuration copied in by create().
+
+/// The data file. Assigned by setup; closed and reset to null by teardown.
+data_f: ?std.fs.File = null,
+
+pub const Options = struct {
+    /// Which step implementation the benchmark runs; see `Mode`.
+    mode: Mode = .table,
+
+    /// The data to read as a filepath. If this is "-" then
+    /// we will read stdin. If this is unset, then we will
+    /// do nothing (benchmark is a noop). It'd be more unixy to
+    /// use stdin by default but I find that a hanging CLI command
+    /// with no interaction is a bit annoying.
+    data: ?[]const u8 = null,
+};
+
+pub const Mode = enum {
+    /// The baseline mode reads the data and feeds every byte through the
+    /// UTF-8 decoder, ignoring the output. No grapheme break calculation
+    /// is done, which establishes a baseline for the other modes.
+    noop,
+
+    /// Ghostty's table-based approach (calls `unicode.graphemeBreak`).
+    table,
+};
+
+/// Allocate a grapheme break benchmark using `alloc`; free it with `destroy`.
+pub fn create(
+    alloc: Allocator,
+    opts: Options,
+) !*GraphemeBreak {
+    // Nothing after the allocation can fail, so no cleanup path is needed.
+    const self = try alloc.create(GraphemeBreak);
+    self.* = .{ .opts = opts };
+    return self;
+}
+
+pub fn destroy(self: *GraphemeBreak, alloc: Allocator) void {
+    alloc.destroy(self); // Frees only the struct; data_f is closed by teardown.
+}
+
+pub fn benchmark(self: *GraphemeBreak) Benchmark {
+    return Benchmark.init(self, .{
+        .setupFn = setup,
+        .teardownFn = teardown,
+        .stepFn = switch (self.opts.mode) { // picked here, from the mode
+            .table => stepTable,
+            .noop => stepNoop,
+        },
+    });
+}
+
+fn setup(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
+    assert(self.data_f == null); // must not already hold an open file
+    // Any failure to open the data is logged and reported as BenchmarkFailed.
+    if (options.dataFile(self.opts.data)) |file| {
+        self.data_f = file;
+    } else |err| {
+        log.warn("error opening data file err={}", .{err});
+        return error.BenchmarkFailed;
+    }
+}
+
+fn teardown(ptr: *anyopaque) void {
+    const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
+    // Nothing to close when no file is currently held.
+    const file = self.data_f orelse return;
+    file.close();
+    self.data_f = null;
+}
+
+fn stepNoop(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
+    const file = self.data_f orelse return;
+
+    // Baseline: decode the bytes but never look at the decoded result,
+    // so this step performs only the read and the UTF-8 decode.
+    var decoder: UTF8Decoder = .{};
+    var reader = std.io.bufferedReader(file.reader());
+    var chunk: [4096]u8 = undefined;
+    while (true) {
+        const len = reader.read(&chunk) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (len == 0) return; // EOF reached
+
+        for (chunk[0..len]) |byte| _ = decoder.next(byte);
+    }
+}
+
+/// Step for `.table`: decode UTF-8 and run `unicode.graphemeBreak` on each codepoint pair.
+fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *GraphemeBreak = @ptrCast(@alignCast(ptr));
+    const f = self.data_f orelse return; // no file held: nothing to do
+
+    var r = std.io.bufferedReader(f.reader());
+    var d: UTF8Decoder = .{};
+    var state: unicode.GraphemeBreakState = .{};
+    var cp1: u21 = 0; // previous codepoint; 0 before any input is decoded
+    var buf: [4096]u8 = undefined;
+    while (true) {
+        const n = r.read(&buf) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (n == 0) break; // EOF reached
+        for (buf[0..n]) |c| {
+            // NOTE(review): assumes no byte ever needs a retry; confirm for malformed UTF-8.
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            const cp2 = cp_ orelse continue; // sequence not complete yet
+            const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state);
+            std.mem.doNotOptimizeAway(v); // keep the result live without touching buf
+            cp1 = cp2;
+        }
+    }
+}
+
+test GraphemeBreak {
+    // Smoke test with default options: no data is configured, which the
+    // Options docs describe as a noop. Runs the benchmark with `.once`.
+    const gpa = std.testing.allocator;
+
+    const subject = try GraphemeBreak.create(gpa, .{});
+    defer subject.destroy(gpa);
+    const bench = subject.benchmark();
+    _ = try bench.run(.once);
+}
diff --git a/src/benchmark/TerminalParser.zig b/src/benchmark/TerminalParser.zig
new file mode 100644
index 000000000..9107d4555
--- /dev/null
+++ b/src/benchmark/TerminalParser.zig
@@ -0,0 +1,106 @@
+//! This benchmark tests the throughput of the terminal escape code parser.
+const TerminalParser = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const terminalpkg = @import("../terminal/main.zig");
+const Benchmark = @import("Benchmark.zig");
+const options = @import("options.zig");
+
+const log = std.log.scoped(.@"terminal-parser-bench"); // scope named after this benchmark
+
+opts: Options, // Configuration copied in by create().
+
+/// The data file. Assigned by setup; closed and reset to null by teardown.
+data_f: ?std.fs.File = null,
+
+pub const Options = struct {
+    /// The data to feed to the parser, as a filepath. If this is "-"
+    /// then we will read stdin. If this is unset, then we will
+    /// do nothing (benchmark is a noop). It'd be more unixy to
+    /// use stdin by default but I find that a hanging CLI command
+    /// with no interaction is a bit annoying.
+    data: ?[]const u8 = null,
+};
+
+/// Allocate a parser benchmark using `alloc`; free it with `destroy`.
+pub fn create(
+    alloc: Allocator,
+    opts: Options,
+) !*TerminalParser {
+    const self = try alloc.create(TerminalParser);
+    self.* = .{ .opts = opts };
+    return self;
+}
+
+pub fn destroy(self: *TerminalParser, alloc: Allocator) void {
+    alloc.destroy(self); // Frees only the struct; data_f is closed by teardown.
+}
+
+pub fn benchmark(self: *TerminalParser) Benchmark {
+    return Benchmark.init(self, .{
+        .setupFn = setup, // opens the data file
+        .stepFn = step, // feeds the data through the parser
+        .teardownFn = teardown, // closes the data file
+    });
+}
+
+fn setup(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *TerminalParser = @ptrCast(@alignCast(ptr));
+    assert(self.data_f == null); // must not already hold an open file
+    // Any failure to open the data is logged and reported as BenchmarkFailed.
+    if (options.dataFile(self.opts.data)) |file| {
+        self.data_f = file;
+    } else |err| {
+        log.warn("error opening data file err={}", .{err});
+        return error.BenchmarkFailed;
+    }
+}
+
+fn teardown(ptr: *anyopaque) void {
+    const self: *TerminalParser = @ptrCast(@alignCast(ptr));
+    // Nothing to close when no file is currently held.
+    const file = self.data_f orelse return;
+    file.close();
+    self.data_f = null;
+}
+
+/// Run the whole data file through a fresh parser, one byte at a time.
+fn step(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *TerminalParser = @ptrCast(@alignCast(ptr));
+    const file = self.data_f orelse return;
+
+    // Read through a buffered reader so that file IO is not what we
+    // predominantly wait on. Loading the data fully into memory up
+    // front would be better still; if we were IO bound it should be
+    // visible in the benchmark results (we believe we currently aren't).
+    var reader = std.io.bufferedReader(file.reader());
+    var parser: terminalpkg.Parser = .{};
+    var chunk: [4096]u8 = undefined;
+
+    while (true) {
+        const len = reader.read(&chunk) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (len == 0) return; // EOF reached
+
+        // Feed the parser byte by byte. The actions it returns are
+        // discarded; nothing downstream consumes them here.
+        for (chunk[0..len]) |byte| {
+            _ = parser.next(byte);
+        }
+    }
+}
+
+test TerminalParser {
+    // Smoke test with default options: no data is configured, which the
+    // Options docs describe as a noop. Runs the benchmark with `.once`.
+    const gpa = std.testing.allocator;
+
+    const subject = try TerminalParser.create(gpa, .{});
+    defer subject.destroy(gpa);
+    const bench = subject.benchmark();
+    _ = try bench.run(.once);
+}
diff --git a/src/benchmark/cli.zig b/src/benchmark/cli.zig
index b35159c6b..3f59b4a72 100644
--- a/src/benchmark/cli.zig
+++ b/src/benchmark/cli.zig
@@ -7,6 +7,8 @@ const cli = @import("../cli.zig");
 pub const Action = enum {
     @"terminal-stream",
     @"codepoint-width",
+    @"grapheme-break",
+    @"terminal-parser",
 
     /// Returns the struct associated with the action. The struct
     /// should have a few decls:
@@ -20,6 +22,8 @@ pub const Action = enum {
         return switch (action) {
             .@"terminal-stream" => @import("TerminalStream.zig"),
             .@"codepoint-width" => @import("CodepointWidth.zig"),
+            .@"grapheme-break" => @import("GraphemeBreak.zig"),
+            .@"terminal-parser" => @import("TerminalParser.zig"),
         };
     }
 };
diff --git a/src/benchmark/main.zig b/src/benchmark/main.zig
index dd00f72b5..56c515c9d 100644
--- a/src/benchmark/main.zig
+++ b/src/benchmark/main.zig
@@ -3,6 +3,8 @@ pub const Benchmark = @import("Benchmark.zig");
 pub const CApi = @import("CApi.zig");
 pub const TerminalStream = @import("TerminalStream.zig");
 pub const CodepointWidth = @import("CodepointWidth.zig");
+pub const GraphemeBreak = @import("GraphemeBreak.zig");
+pub const TerminalParser = @import("TerminalParser.zig");
 
 test {
     _ = @import("std").testing.refAllDecls(@This());