From b030663e0384ed53dea6bff7cacd53a614dc18bd Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Mon, 5 Feb 2024 11:58:10 -0800
Subject: [PATCH] bench/stream: benchmark for stream processing

---
 bench.sh             |  15 +++++
 build.zig            |   8 ++-
 src/bench/stream.zig | 152 +++++++++++++++++++++++++++++++++++++++++++
 src/build_config.zig |   1 +
 src/main.zig         |   1 +
 5 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100755 bench.sh
 create mode 100644 src/bench/stream.zig

diff --git a/bench.sh b/bench.sh
new file mode 100755
index 000000000..5cd693a13
--- /dev/null
+++ b/bench.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# TODO: This script is temporary, remove it from the repo
+
+BIN="./zig-out/bin/bench-stream" # benchmark binary under test
+SIZE="25M"                       # bytes of generated input fed to each mode
+
+hyperfine \
+  --warmup 10 \
+  -n memcpy \
+  "${BIN} --mode=gen-ascii | head -c ${SIZE} | ${BIN} --mode=noop" \
+  -n scalar \
+  "${BIN} --mode=gen-ascii | head -c ${SIZE} | ${BIN} --mode=scalar" \
+  -n simd \
+  "${BIN} --mode=gen-ascii | head -c ${SIZE} | ${BIN} --mode=simd"
diff --git a/build.zig b/build.zig
index c3cbc9223..ddac51035 100644
--- a/build.zig
+++ b/build.zig
@@ -202,7 +202,7 @@ pub fn build(b: *std.Build) !void {
     if (emit_helpgen) try addHelp(b, null, config);
 
     // Add our benchmarks
-    try benchSteps(b, target, optimize, config, emit_bench);
+    try benchSteps(b, target, config, emit_bench);
 
     // TODO: temporary simd tester binary
     {
@@ -1285,7 +1285,6 @@ fn buildDocumentation(
 fn benchSteps(
     b: *std.Build,
     target: std.Build.ResolvedTarget,
-    optimize: std.builtin.OptimizeMode,
     config: BuildConfig,
     install: bool,
 ) !void {
@@ -1313,8 +1312,11 @@ fn benchSteps(
             .name = bin_name,
             .root_source_file = .{ .path = "src/main.zig" },
             .target = target,
-            .optimize = optimize,
+
+            // We always want our benchmarks to be in release mode.
+            .optimize = .ReleaseFast,
         });
+        c_exe.linkLibC();
         if (install) b.installArtifact(c_exe);
         _ = try addDeps(b, c_exe, config: {
             var copy = config;
diff --git a/src/bench/stream.zig b/src/bench/stream.zig
new file mode 100644
index 000000000..d0f53e504
--- /dev/null
+++ b/src/bench/stream.zig
@@ -0,0 +1,152 @@
+//! This benchmark tests the throughput of the VT stream. It has a few
+//! modes in order to test different methods of stream processing. It
+//! provides a "noop" mode to give us the `memcpy` speed.
+//!
+//! This will consume all of the available stdin, so you should run it
+//! with `head` in a pipe to limit the input. For example, to test ASCII input:
+//!
+//!   bench-stream --mode=gen-ascii | head -c 50M | bench-stream --mode=simd
+//!
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const ArenaAllocator = std.heap.ArenaAllocator;
+const cli = @import("../cli.zig");
+const terminal = @import("../terminal/main.zig");
+
+const Args = struct {
+    mode: Mode = .noop, // what to run; see `Mode` for the choices
+
+    /// This is set by the CLI parser for deinit.
+    _arena: ?ArenaAllocator = null,
+
+    pub fn deinit(self: *Args) void {
+        if (self._arena) |arena| arena.deinit(); // skipped when no arena was set
+        self.* = undefined; // `self` must not be used after deinit
+    }
+};
+
+/// Selects what this binary does; chosen on the command line with `--mode=`.
+const Mode = enum {
+    // Do nothing, just read from stdin into a preallocated heap buffer.
+    // This is used to benchmark our base-case: it gives us our maximum
+    // throughput on a basic read.
+    noop,
+
+    // These benchmark the throughput of the terminal stream parsing
+    // with and without SIMD. The "simd" option will use whatever is best
+    // for the running platform.
+    //
+    // Note that these run through the full VT parser but do not apply the
+    // operations to terminal state, so there is no terminal state overhead.
+    scalar,
+    simd,
+
+    // Generate an infinite stream of random printable ASCII characters.
+    @"gen-ascii",
+};
+
+pub const std_options = struct { // overrides for std defaults
+    pub const log_level: std.log.Level = .debug; // enable every log level
+};
+
+/// Entry point: parses the CLI flags and runs whichever mode was selected.
+pub fn main() !void {
+    // The C allocator is much faster than the GPA, which suits a benchmark.
+    const allocator = std.heap.c_allocator;
+
+    var args: Args = .{};
+    defer args.deinit();
+    {
+        var argv = try std.process.argsWithAllocator(allocator);
+        defer argv.deinit();
+        try cli.args.parse(Args, allocator, &args, &argv);
+    }
+
+    const stdin = std.io.getStdIn().reader();
+    const stdout = std.io.getStdOut().writer();
+    switch (args.mode) {
+        .noop => try benchNoop(allocator, stdin),
+        .scalar => try benchScalar(allocator, stdin),
+        .simd => try benchSimd(allocator, stdin),
+        .@"gen-ascii" => try genAscii(stdout),
+    }
+}
+
+/// Streams random printable ASCII characters to `writer` without end.
+/// The alphabet used here contains no control characters at all.
+fn genAscii(writer: anytype) !void {
+    const printable = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+-=[]{}|;':\\\",./<>?`~";
+    return genData(writer, printable);
+}
+
+/// Writes random bytes drawn from `alphabet` to `writer` forever, in 1 KiB
+/// chunks. Returns cleanly on `error.BrokenPipe`; other errors propagate.
+/// The PRNG seed is fixed, so every run emits the same byte sequence.
+fn genData(writer: anytype, alphabet: []const u8) !void {
+    var prng = std.rand.DefaultPrng.init(0x12345678);
+    const random = prng.random();
+    var chunk: [1024]u8 = undefined;
+    while (true) {
+        for (&chunk) |*byte| {
+            byte.* = alphabet[random.uintLessThanBiased(usize, alphabet.len)];
+        }
+        writer.writeAll(&chunk) catch |err| switch (err) {
+            error.BrokenPipe => return, // stdout closed
+            else => |other| return other,
+        };
+    }
+}
+
+/// Baseline: drains `reader` into one reusable buffer and logs the byte count.
+fn benchNoop(alloc: Allocator, reader: anytype) !void {
+    // One large (16 MiB) allocation up front so the read loop itself never
+    // allocates. It is leaked on purpose: the cost of freeing it should
+    // not be part of the measurement either.
+    const scratch = try alloc.alloc(u8, 16 * 1024 * 1024);
+
+    var total: usize = 0;
+    var n = try reader.readAll(scratch);
+    while (n != 0) : (n = try reader.readAll(scratch)) {
+        total += n;
+    }
+
+    std.log.info("total bytes len={}", .{total});
+}
+
+/// Pushes everything from `reader` through the VT stream one byte at a time.
+fn benchScalar(alloc: Allocator, reader: anytype) !void {
+    _ = alloc;
+
+    // The no-op handler keeps terminal state out of the measurement.
+    var stream: terminal.Stream(NoopHandler) = .{ .handler = .{} };
+    var chunk: [4096]u8 = undefined;
+    while (true) {
+        const len = try reader.read(&chunk);
+        if (len == 0) break;
+
+        // Calling `stream.next` once per byte in a plain for loop is
+        // the naive scalar approach.
+        for (chunk[0..len]) |byte| try stream.next(byte);
+    }
+}
+
+/// Pushes everything from `reader` through the VT stream in whole slices
+/// via `stream.nextSlice`, with a no-op handler attached to the stream.
+fn benchSimd(alloc: Allocator, reader: anytype) !void {
+    _ = alloc;
+    var stream: terminal.Stream(NoopHandler) = .{ .handler = .{} };
+    var chunk: [4096]u8 = undefined;
+    var len = try reader.read(&chunk);
+    while (len != 0) : (len = try reader.read(&chunk)) {
+        try stream.nextSlice(chunk[0..len]);
+    }
+}
+
+/// Stream handler whose `print` discards its input, so the benchmarks
+/// above carry no terminal state overhead.
+const NoopHandler = struct {
+    /// Accepts a codepoint and does nothing with it.
+    fn print(_: NoopHandler, _: u21) !void {}
+};
diff --git a/src/build_config.zig b/src/build_config.zig
index 52e975717..bfb4699d3 100644
--- a/src/build_config.zig
+++ b/src/build_config.zig
@@ -139,4 +139,5 @@ pub const ExeEntrypoint = enum {
     mdgen_ghostty_1,
     mdgen_ghostty_5,
     bench_parser,
+    bench_stream,
 };
diff --git a/src/main.zig b/src/main.zig
index b5307340d..393ddd541 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -7,4 +7,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
     .mdgen_ghostty_1 => @import("build/mdgen/main_ghostty_1.zig"),
     .mdgen_ghostty_5 => @import("build/mdgen/main_ghostty_5.zig"),
     .bench_parser => @import("bench/parser.zig"),
+    .bench_stream => @import("bench/stream.zig"),
 };