Add unicode-test and fix benchmarks

2025-07-25 13:16:11 +03:00 · 2025-07-04 12:23:38 -04:00
parent 60812d418f
commit 6f9b332e1a
12 changed files with 191 additions and 110 deletions
--- a/build.zig
+++ b/build.zig
@ -32,6 +32,10 @@ pub fn build(b: *std.Build) !void {
    const bench = try buildpkg.GhosttyBench.init(b, &deps);
    if (config.emit_bench) bench.install();

+    // Ghostty unicode test exe
+    const unicode_test = try buildpkg.GhosttyUnicodeTest.init(b, &config, &deps);
+    if (config.emit_unicode_test) unicode_test.install();
+
    // Ghostty dist tarball
    const dist = try buildpkg.GhosttyDist.init(b, &config);
    {
--- a/src/bench/codepoint-width.sh
+++ b/src/bench/codepoint-width.sh
@ -27,6 +27,8 @@ hyperfine \
  "./zig-out/bin/bench-codepoint-width --mode=noop${ARGS} </tmp/ghostty_bench_data" \
  -n wcwidth \
  "./zig-out/bin/bench-codepoint-width --mode=wcwidth${ARGS} </tmp/ghostty_bench_data" \
+  -n zg \
+  "./zig-out/bin/bench-codepoint-width --mode=zg${ARGS} </tmp/ghostty_bench_data" \
  -n table \
  "./zig-out/bin/bench-codepoint-width --mode=table${ARGS} </tmp/ghostty_bench_data" \
  -n simd \
--- a/src/bench/codepoint-width.zig
+++ b/src/bench/codepoint-width.zig
@ -7,14 +7,14 @@
 //! This will consume all of the available stdin, so you should run it
 //! with `head` in a pipe to restrict. For example, to test ASCII input:
 //!
-//!   bench-stream --mode=gen-ascii | head -c 50M | bench-codepoint-width --mode=ziglyph
+//!   bench-stream --mode=gen-ascii | head -c 50M | bench-codepoint-width --mode=zg
 //!

 const std = @import("std");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const ArenaAllocator = std.heap.ArenaAllocator;
-const ziglyph = @import("ziglyph");
+const DisplayWidth = @import("DisplayWidth");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
 const table = @import("../unicode/main.zig").table;
@ -46,8 +46,8 @@ const Mode = enum {
    /// libc wcwidth
    wcwidth,

-    /// Use ziglyph library to calculate the display width of each codepoint.
-    ziglyph,
+    /// Use zg library to calculate the display width of each codepoint.
+    zg,

    /// Our SIMD implementation.
    simd,
@ -64,6 +64,10 @@ pub fn main() !void {
    // We want to use the c allocator because it is much faster than GPA.
    const alloc = std.heap.c_allocator;

+    // Initialize DisplayWidth for zg; it must stay alive until the benchmark finishes.
+    const display_width = try DisplayWidth.init(alloc);
+    defer display_width.deinit(alloc);
+
    // Parse our args
    var args: Args = .{};
    defer args.deinit();
@ -80,7 +84,7 @@ pub fn main() !void {
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
        .wcwidth => try benchWcwidth(reader, buf),
-        .ziglyph => try benchZiglyph(reader, buf),
+        .zg => try benchZg(display_width, reader, buf),
        .simd => try benchSimd(reader, buf),
        .table => try benchTable(reader, buf),
    }
@ -155,7 +159,8 @@ noinline fn benchTable(
    }
 }

-noinline fn benchZiglyph(
+noinline fn benchZg(
+    display_width: DisplayWidth,
    reader: anytype,
    buf: []u8,
 ) !void {
@ -170,7 +175,7 @@ noinline fn benchZiglyph(
            const cp_, const consumed = d.next(c);
            assert(consumed);
            if (cp_) |cp| {
-                const width = ziglyph.display_width.codePointWidth(cp, .half);
+                const width = DisplayWidth.codePointWidth(display_width, cp);

                // Write the width to the buffer to avoid it being compiled away
                buf[0] = @intCast(width);
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@ -25,8 +25,6 @@ hyperfine \
  --warmup 10 \
  -n noop \
  "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \
-  -n ziglyph \
-  "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
  -n zg \
  "./zig-out/bin/bench-grapheme-break --mode=zg${ARGS} </tmp/ghostty_bench_data" \
  -n table \
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@ -5,14 +5,13 @@
 //! This will consume all of the available stdin, so you should run it
 //! with `head` in a pipe to restrict. For example, to test ASCII input:
 //!
-//!   bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=ziglyph
+//!   bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=zg
 //!

 const std = @import("std");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const ArenaAllocator = std.heap.ArenaAllocator;
-const ziglyph = @import("ziglyph");
 const Graphemes = @import("Graphemes");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
@ -42,9 +41,6 @@ const Mode = enum {
    /// and establishes a baseline for the other modes.
    noop,

-    /// Use ziglyph library to calculate the display width of each codepoint.
-    ziglyph,
-
    /// Use zg library to calculate the grapheme break between codepoints.
    zg,

@ -79,7 +75,6 @@ pub fn main() !void {
    // Handle the modes that do not depend on terminal state first.
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
-        .ziglyph => try benchZiglyph(reader, buf),
        .zg => try benchZg(&graphemes, reader, buf),
        .table => try benchTable(reader, buf),
    }
@ -152,28 +147,3 @@ noinline fn benchZg(
        }
    }
 }
-
-noinline fn benchZiglyph(
-    reader: anytype,
-    buf: []u8,
-) !void {
-    var d: UTF8Decoder = .{};
-    var state: u3 = 0;
-    var cp1: u21 = 0;
-    while (true) {
-        const n = try reader.read(buf);
-        if (n == 0) break;
-
-        // Using stream.next directly with a for loop applies a naive
-        // scalar approach.
-        for (buf[0..n]) |c| {
-            const cp_, const consumed = d.next(c);
-            assert(consumed);
-            if (cp_) |cp2| {
-                const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state);
-                buf[0] = @intCast(@intFromBool(v));
-                cp1 = cp2;
-            }
-        }
-    }
-}
--- a/src/build/Config.zig
+++ b/src/build/Config.zig
@ -50,6 +50,7 @@ patch_rpath: ?[]const u8 = null,
 flatpak: bool = false,
 emit_test_exe: bool = false,
 emit_bench: bool = false,
+emit_unicode_test: bool = false,
 emit_helpgen: bool = false,
 emit_docs: bool = false,
 emit_webdata: bool = false,
@ -276,6 +277,12 @@ pub fn init(b: *std.Build) !Config {
        "Build and install the benchmark executables.",
    ) orelse false;

+    config.emit_unicode_test = b.option(
+        bool,
+        "emit-unicode-test",
+        "Build and install the unicode test executable.",
+    ) orelse false;
+
    config.emit_helpgen = b.option(
        bool,
        "emit-helpgen",
@ -289,6 +296,7 @@ pub fn init(b: *std.Build) !Config {
    ) orelse emit_docs: {
        // If we are emitting any other artifacts then we default to false.
        if (config.emit_bench or
+            config.emit_unicode_test or
            config.emit_test_exe or
            config.emit_helpgen) break :emit_docs false;

@ -337,6 +345,7 @@ pub fn init(b: *std.Build) !Config {
        target.result.os.tag == .macos and
        config.app_runtime == .none and
        (!config.emit_bench and
+            !config.emit_unicode_test and
            !config.emit_test_exe and
            !config.emit_helpgen);

--- a/src/build/GhosttyUnicodeTest.zig
+++ b/src/build/GhosttyUnicodeTest.zig
@ -0,0 +1,47 @@
+const UnicodeTest = @This();
+
+const std = @import("std");
+const Config = @import("Config.zig");
+const SharedDeps = @import("SharedDeps.zig");
+
+/// The unicode test executable.
+exe: *std.Build.Step.Compile,
+
+/// The install step for the executable.
+install_step: *std.Build.Step.InstallArtifact,
+
+/// Create the `unicode-test` executable rooted at `src/unicode/main.zig`,
+/// wire in the shared dependencies plus the lazy `ziglyph` dependency, and
+/// prepare (but do not register) its install step.
+pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !UnicodeTest {
+    // Stripped builds also drop frame pointers and unwind tables.
+    const root_module = b.createModule(.{
+        .root_source_file = b.path("src/unicode/main.zig"),
+        .target = cfg.target,
+        .optimize = cfg.optimize,
+        .strip = cfg.strip,
+        .omit_frame_pointer = cfg.strip,
+        .unwind_tables = if (cfg.strip) .none else .sync,
+    });
+    const exe: *std.Build.Step.Compile = b.addExecutable(.{
+        .name = "unicode-test",
+        .root_module = root_module,
+    });
+    const install_step = b.addInstallArtifact(exe, .{});
+
+    // Everything the main application links against.
+    _ = try deps.add(exe);
+
+    // ziglyph is only needed by this executable, so it is imported here
+    // rather than through the shared dependencies.
+    const ziglyph_dep = b.lazyDependency("ziglyph", .{
+        .target = cfg.target,
+        .optimize = cfg.optimize,
+    });
+    if (ziglyph_dep) |dep| {
+        exe.root_module.addImport("ziglyph", dep.module("ziglyph"));
+    }
+
+    return .{ .exe = exe, .install_step = install_step };
+}
+
+/// Make the build's top-level install step depend on installing the
+/// unicode test executable.
+pub fn install(self: *const UnicodeTest) void {
+    const artifact_step = &self.install_step.step;
+    artifact_step.owner.getInstallStep().dependOn(artifact_step);
+}
--- a/src/build/SharedDeps.zig
+++ b/src/build/SharedDeps.zig
@ -411,12 +411,6 @@ pub fn add(
    })) |dep| {
        step.root_module.addImport("z2d", dep.module("z2d"));
    }
-    if (b.lazyDependency("ziglyph", .{
-        .target = target,
-        .optimize = optimize,
-    })) |dep| {
-        step.root_module.addImport("ziglyph", dep.module("ziglyph"));
-    }
    if (b.lazyDependency("zg", .{
        .target = target,
        .optimize = optimize,
--- a/src/build/main.zig
+++ b/src/build/main.zig
@ -15,6 +15,7 @@ pub const GhosttyFrameData = @import("GhosttyFrameData.zig");
 pub const GhosttyLib = @import("GhosttyLib.zig");
 pub const GhosttyResources = @import("GhosttyResources.zig");
 pub const GhosttyI18n = @import("GhosttyI18n.zig");
+pub const GhosttyUnicodeTest = @import("GhosttyUnicodeTest.zig");
 pub const GhosttyXCFramework = @import("GhosttyXCFramework.zig");
 pub const GhosttyWebdata = @import("GhosttyWebdata.zig");
 pub const HelpStrings = @import("HelpStrings.zig");
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@ -149,49 +149,6 @@ fn graphemeBreakClass(
    return true;
 }

-// This test will verify the grapheme break implementation. This iterates over billions of codepoints so it is SLOW.
-// It's not meant to be run in CI, but it's useful for debugging.
-test "grapheme break check against ziglyph" {
-    const ziglyph = @import("ziglyph");
-
-    // Set the min and max to control the test range.
-    const min = 0;
-    const max = std.math.maxInt(u21) + 1;
-    var success: bool = true;
-
-    var state: BreakState = .{};
-    var zg_state: u3 = 0;
-    for (min..max) |cp1| {
-        if (cp1 == '\r' or cp1 == '\n' or
-            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
-
-        for (min..max) |cp2| {
-            if (cp2 == '\r' or cp2 == '\n' or
-                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
-
-            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
-            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
-            if (gb != zg_gb) {
-                success = false;
-                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
-                    cp1,
-                    cp2,
-                    gb,
-                    state,
-                    zg_gb,
-                    zg_state,
-                });
-            }
-        }
-    }
-
-    try std.testing.expect(success);
-}
-
-pub const std_options = struct {
-    pub const log_level: std.log.Level = .info;
-};
-
 test "grapheme break: emoji modifier" {
    const testing = std.testing;

--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -1,3 +1,4 @@
+const std = @import("std");
 pub const lut = @import("lut.zig");

 const grapheme = @import("grapheme.zig");
@ -10,3 +11,113 @@ pub const GraphemeBreakState = grapheme.BreakState;
 test {
    @import("std").testing.refAllDecls(@This());
 }
+
+/// Build Ghostty with `zig build -Doptimize=ReleaseFast -Demit-unicode-test`.
+///
+/// Usage: ./zig-out/bin/unicode-test [grapheme|width|all] [zg|ziglyph|all]
+///
+///     grapheme: this will verify the grapheme break implementation. This
+///               iterates over billions of codepoints so it is SLOW.
+///
+///     width:    this verifies the table codepoint widths match
+///     zg:       compare grapheme/width against zg
+///     ziglyph:  compare grapheme/width against ziglyph
+///
+/// Both arguments default to `all` when omitted. Mismatches are reported
+/// with `std.log.warn`; they do not change the exit status.
+pub fn main() !void {
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer _ = gpa.deinit();
+
+    const alloc = gpa.allocator();
+
+    const args = try std.process.argsAlloc(alloc);
+    defer std.process.argsFree(alloc, args);
+
+    var zg = try props.init(alloc);
+    defer zg.deinit(alloc);
+
+    const ziglyph = @import("ziglyph");
+    const Graphemes = @import("Graphemes");
+    const DisplayWidth = @import("DisplayWidth");
+
+    // The length checks come first so that `args[1]` / `args[2]` are only
+    // indexed (via short-circuit `or`) when they actually exist.
+    const testAll = args.len < 2 or std.mem.eql(u8, args[1], "all");
+    const compareAll = args.len < 3 or std.mem.eql(u8, args[2], "all");
+    const compareZg = compareAll or std.mem.eql(u8, args[2], "zg");
+    const compareZiglyph = compareAll or std.mem.eql(u8, args[2], "ziglyph");
+
+    // Set the min and max to control the test range.
+    const min = 0;
+    const max = 0x110000;
+
+    var state: GraphemeBreakState = .{};
+    var zg_state: Graphemes.State = .{};
+    var ziglyph_state: u3 = 0;
+
+    if (testAll or std.mem.eql(u8, args[1], "grapheme")) {
+        std.log.info("============== testing grapheme break ===============", .{});
+
+        for (min..max) |cp1| {
+            if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1});
+
+            // CR, LF and control codepoints are excluded from the comparison.
+            if (cp1 == '\r' or cp1 == '\n' or
+                Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue;
+
+            for (min..max) |cp2| {
+                // Same exclusion for the second codepoint of the pair.
+                if (cp2 == '\r' or cp2 == '\n' or
+                    Graphemes.gbp(zg.graphemes, @intCast(cp2)) == .Control) continue;
+
+                const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+                if (compareZg) {
+                    const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg.graphemes, &zg_state);
+                    if (gb != zg_gb) {
+                        std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{
+                            cp1,
+                            cp2,
+                            gb,
+                            zg_gb,
+                            state,
+                            zg_state,
+                        });
+                    }
+                }
+                if (compareZiglyph) {
+                    const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state);
+                    if (gb != ziglyph_gb) {
+                        std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{
+                            cp1,
+                            cp2,
+                            gb,
+                            ziglyph_gb,
+                            state,
+                            ziglyph_state,
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    if (testAll or std.mem.eql(u8, args[1], "width")) {
+        std.log.info("============== testing codepoint width ==============", .{});
+
+        for (min..max) |cp| {
+            if (cp % 0x10000 == 0) std.log.info("progress: cp={x}", .{cp});
+
+            const t = table.get(@intCast(cp));
+            if (compareZg) {
+                // Clamp the reference width to 0..2 before comparing with the table.
+                const zg_width = @min(2, @max(0, DisplayWidth.codePointWidth(zg.display_width, @intCast(cp))));
+                if (t.width != zg_width) {
+                    std.log.warn("[zg mismatch] cp={x} t={} zg={}", .{ cp, t.width, zg_width });
+                }
+            }
+            if (compareZiglyph) {
+                // Query ziglyph itself here (not zg) so this branch is an independent check.
+                const ziglyph_width = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+                if (t.width != ziglyph_width) {
+                    std.log.warn("[ziglyph mismatch] cp={x} t={} ziglyph={}", .{ cp, t.width, ziglyph_width });
+                }
+            }
+        }
+    }
+}
+
+// Root-level std options: set the log level to debug so the `std.log.info`
+// progress lines and `std.log.warn` mismatch reports above are not filtered
+// out (presumably needed because the tool is built with ReleaseFast — confirm).
+pub const std_options: std.Options = .{
+    .log_level = .debug,
+};
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@ -8,7 +8,8 @@ const lut = @import("lut.zig");
 graphemes: Graphemes,
 display_width: DisplayWidth,

-fn init(alloc: std.mem.Allocator) !props {
+// Public only for unicode-test
+pub fn init(alloc: std.mem.Allocator) !props {
    const graphemes = try Graphemes.init(alloc);
    return .{
        .graphemes = graphemes,
@ -16,7 +17,8 @@ fn init(alloc: std.mem.Allocator) !props {
    };
 }

-fn deinit(self: *props, alloc: std.mem.Allocator) void {
+/// Calls `deinit(alloc)` on the `graphemes` and `display_width` fields.
+/// Public only for unicode-test.
+pub fn deinit(self: *props, alloc: std.mem.Allocator) void {
    self.graphemes.deinit(alloc);
    self.display_width.deinit(alloc);
 }
@ -180,22 +182,3 @@ pub fn main() !void {
    //     t.stage3.len,
    // });
 }
-
-// This is not very fast in debug modes, so its commented by default.
-// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
-//test "tables match zg" {
-//    const testing = std.testing;
-//
-//    const display_width = try DisplayWidth.init(std.testing.allocator);
-//    defer display_width.deinit(std.testing.allocator);
-//
-//    const min = 0xFF + 1; // start outside ascii
-//    for (min..0x110000) |cp| {
-//        const t = table.get(@intCast(cp));
-//        const zg = @min(2, @max(0, DisplayWidth.codePointWidth(display_width, @intCast(cp))));
-//        if (t.width != zg) {
-//            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
-//            try testing.expect(false);
-//        }
-//    }
-//}