From 589082635608727c639685c76de81ed173f6eac3 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto
Date: Wed, 9 Jul 2025 14:23:59 -0700
Subject: [PATCH] benchmark: add codepoint width benchmark

---
 src/benchmark/CodepointWidth.zig | 218 +++++++++++++++++++++++++++++++
 src/benchmark/TerminalStream.zig |  11 +-
 src/benchmark/cli.zig            |   2 +
 src/benchmark/main.zig           |   1 +
 src/benchmark/options.zig        |  20 +++
 5 files changed, 246 insertions(+), 6 deletions(-)
 create mode 100644 src/benchmark/CodepointWidth.zig
 create mode 100644 src/benchmark/options.zig

diff --git a/src/benchmark/CodepointWidth.zig b/src/benchmark/CodepointWidth.zig
new file mode 100644
index 000000000..e9207aed5
--- /dev/null
+++ b/src/benchmark/CodepointWidth.zig
@@ -0,0 +1,218 @@
+//! This benchmark tests the throughput of codepoint width calculation.
+//! This is a common operation in terminal character printing and the
+//! motivating factor to write this benchmark was discovering that our
+//! codepoint width function was 30% of the runtime of every character
+//! print.
+const CodepointWidth = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const Benchmark = @import("Benchmark.zig");
+const options = @import("options.zig");
+const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
+const simd = @import("../simd/main.zig");
+const table = @import("../unicode/main.zig").table;
+
+const log = std.log.scoped(.@"codepoint-width-bench");
+
+opts: Options,
+
+/// The file, opened in the setup function.
+data_f: ?std.fs.File = null,
+
+pub const Options = struct {
+    /// The type of codepoint width calculation to use.
+    mode: Mode = .noop,
+
+    /// The data to read as a filepath. If this is "-" then
+    /// we will read stdin. If this is unset, then we will
+    /// do nothing (benchmark is a noop). It'd be more unixy to
+    /// use stdin by default but I find that a hanging CLI command
+    /// with no interaction is a bit annoying.
+    data: ?[]const u8 = null,
+};
+
+pub const Mode = enum {
+    /// The baseline mode does no work at all: its step function returns
+    /// without touching the data file. It establishes the fixed overhead
+    /// of the benchmark harness as a baseline for the other modes.
+    noop,
+
+    /// libc wcwidth
+    wcwidth,
+
+    /// Our SIMD implementation.
+    simd,
+
+    /// Test our lookup table implementation.
+    table,
+};
+
+/// Create a new codepoint width benchmark for the given arguments.
+/// The returned pointer is allocated with `alloc`; free it with `destroy`.
+pub fn create(
+    alloc: Allocator,
+    opts: Options,
+) !*CodepointWidth {
+    const ptr = try alloc.create(CodepointWidth);
+    errdefer alloc.destroy(ptr);
+    ptr.* = .{ .opts = opts };
+    return ptr;
+}
+
+/// Free an instance previously returned by `create` using the same `alloc`.
+pub fn destroy(self: *CodepointWidth, alloc: Allocator) void {
+    alloc.destroy(self);
+}
+
+/// Build the benchmark descriptor for this instance. The step function is
+/// chosen once, here, from `opts.mode`.
+pub fn benchmark(self: *CodepointWidth) Benchmark {
+    return .init(self, .{
+        .stepFn = switch (self.opts.mode) {
+            .noop => stepNoop,
+            .wcwidth => stepWcwidth,
+            .table => stepTable,
+            .simd => stepSimd,
+        },
+        .setupFn = setup,
+        .teardownFn = teardown,
+    });
+}
+
+/// Setup hook: opens the data file described by `opts.data`, if any.
+fn setup(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
+
+    // Open our data file to prepare for reading. We can do more
+    // validation here eventually.
+    assert(self.data_f == null);
+    self.data_f = options.dataFile(self.opts.data) catch |err| {
+        log.warn("error opening data file err={}", .{err});
+        return error.BenchmarkFailed;
+    };
+}
+
+/// Teardown hook: closes the data file if one was opened and clears it.
+fn teardown(ptr: *anyopaque) void {
+    const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
+    if (self.data_f) |f| {
+        f.close();
+        self.data_f = null;
+    }
+}
+
+/// Step for `Mode.noop`: intentionally does nothing.
+fn stepNoop(ptr: *anyopaque) Benchmark.Error!void {
+    _ = ptr;
+}
+
+/// libc `wcwidth`. POSIX allows a -1 result for non-printable codepoints.
+extern "c" fn wcwidth(c: u32) c_int;
+
+/// Step for `Mode.wcwidth`: feeds the data file through `UTF8Decoder` and
+/// calls libc `wcwidth` on each codepoint. Does nothing if no file is open.
+fn stepWcwidth(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
+
+    const f = self.data_f orelse return;
+    var r = std.io.bufferedReader(f.reader());
+    var d: UTF8Decoder = .{};
+    var buf: [4096]u8 = undefined;
+    while (true) {
+        const n = r.read(&buf) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (n == 0) break; // EOF reached
+
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp| {
+                const width = wcwidth(cp);
+
+                // Keep the result observable so the call is not compiled
+                // away. wcwidth may return -1, which a u8 cast cannot hold.
+                std.mem.doNotOptimizeAway(width);
+            }
+        }
+    }
+}
+
+/// Step for `Mode.table`: feeds the data file through `UTF8Decoder` and
+/// resolves each codepoint's width. Does nothing if no file is open.
+fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
+
+    const f = self.data_f orelse return;
+    var r = std.io.bufferedReader(f.reader());
+    var d: UTF8Decoder = .{};
+    var buf: [4096]u8 = undefined;
+    while (true) {
+        const n = r.read(&buf) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (n == 0) break; // EOF reached
+
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp| {
+                // This is the same trick we do in terminal.zig so we
+                // keep it here.
+                const width = if (cp <= 0xFF)
+                    1
+                else
+                    table.get(@intCast(cp)).width;
+
+                // Keep the result observable so the lookup is not
+                // compiled away.
+                std.mem.doNotOptimizeAway(width);
+            }
+        }
+    }
+}
+
+/// Step for `Mode.simd`: feeds the data file through `UTF8Decoder` and calls
+/// `simd.codepointWidth` on each codepoint. Does nothing if no file is open.
+fn stepSimd(ptr: *anyopaque) Benchmark.Error!void {
+    const self: *CodepointWidth = @ptrCast(@alignCast(ptr));
+
+    const f = self.data_f orelse return;
+    var r = std.io.bufferedReader(f.reader());
+    var d: UTF8Decoder = .{};
+    var buf: [4096]u8 = undefined;
+    while (true) {
+        const n = r.read(&buf) catch |err| {
+            log.warn("error reading data file err={}", .{err});
+            return error.BenchmarkFailed;
+        };
+        if (n == 0) break; // EOF reached
+
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp| {
+                const width = simd.codepointWidth(cp);
+
+                // Keep the result observable so the call is not
+                // compiled away.
+                std.mem.doNotOptimizeAway(width);
+            }
+        }
+    }
+}
+
+test CodepointWidth {
+    const testing = std.testing;
+    const alloc = testing.allocator;
+
+    const impl: *CodepointWidth = try .create(alloc, .{});
+    defer impl.destroy(alloc);
+
+    const bench = impl.benchmark();
+    _ = try bench.run(.once);
+}
diff --git a/src/benchmark/TerminalStream.zig b/src/benchmark/TerminalStream.zig
index 3b47fe879..5d235c4ee 100644
--- a/src/benchmark/TerminalStream.zig
+++ b/src/benchmark/TerminalStream.zig
@@ -18,6 +18,7 @@ const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const terminalpkg = @import("../terminal/main.zig");
 const Benchmark = @import("Benchmark.zig");
+const options = @import("options.zig");
 const Terminal = terminalpkg.Terminal;
 const Stream = terminalpkg.Stream(*Handler);
 
@@ -89,12 +90,10 @@ fn setup(ptr: *anyopaque) Benchmark.Error!void {
     // Open our data file to prepare for reading. We can do more
     // validation here eventually.
     assert(self.data_f == null);
-    if (self.opts.data) |path| {
-        self.data_f = std.fs.cwd().openFile(path, .{}) catch |err| {
-            log.warn("error opening data file err={}", .{err});
-            return error.BenchmarkFailed;
-        };
-    }
+    self.data_f = options.dataFile(self.opts.data) catch |err| {
+        log.warn("error opening data file err={}", .{err});
+        return error.BenchmarkFailed;
+    };
 }
 
 fn teardown(ptr: *anyopaque) void {
diff --git a/src/benchmark/cli.zig b/src/benchmark/cli.zig
index c0b8dcea6..b35159c6b 100644
--- a/src/benchmark/cli.zig
+++ b/src/benchmark/cli.zig
@@ -6,6 +6,7 @@ const cli = @import("../cli.zig");
 /// benchmarks.
 pub const Action = enum {
     @"terminal-stream",
+    @"codepoint-width",
 
     /// Returns the struct associated with the action. The struct
     /// should have a few decls:
@@ -18,6 +19,7 @@ pub const Action = enum {
     pub fn Struct(comptime action: Action) type {
         return switch (action) {
             .@"terminal-stream" => @import("TerminalStream.zig"),
+            .@"codepoint-width" => @import("CodepointWidth.zig"),
         };
     }
 };
diff --git a/src/benchmark/main.zig b/src/benchmark/main.zig
index 010f11805..dd00f72b5 100644
--- a/src/benchmark/main.zig
+++ b/src/benchmark/main.zig
@@ -2,6 +2,7 @@ pub const cli = @import("cli.zig");
 pub const Benchmark = @import("Benchmark.zig");
 pub const CApi = @import("CApi.zig");
 pub const TerminalStream = @import("TerminalStream.zig");
+pub const CodepointWidth = @import("CodepointWidth.zig");
 
 test {
     _ = @import("std").testing.refAllDecls(@This());
diff --git a/src/benchmark/options.zig b/src/benchmark/options.zig
new file mode 100644
index 000000000..867be6afc
--- /dev/null
+++ b/src/benchmark/options.zig
@@ -0,0 +1,20 @@
+//! This file contains helpers for CLI options.
+
+const std = @import("std");
+
+/// Returns the data file for the given path in a way that is consistent
+/// across our CLI. If the path is not set then no file is returned.
+/// If the path is "-", then we will return stdin. If the path is
+/// a file then we will open and return the handle.
+pub fn dataFile(path_: ?[]const u8) !?std.fs.File {
+    const path = path_ orelse return null;
+
+    // Stdin
+    if (std.mem.eql(u8, path, "-")) return std.io.getStdIn();
+
+    // Normal file
+    const file = try std.fs.cwd().openFile(path, .{});
+    errdefer file.close();
+
+    return file;
+}