Merge pull request #1494 from mitchellh/grapheme-break

Optimized grapheme break detection (6x speedup)
2025-07-17 01:06:08 +03:00 · 2024-02-10 07:54:06 -08:00
parent 49cc5e2f47 5275d44e7d
commit f7c945c4a7
8 changed files with 421 additions and 11 deletions
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+#
+# This is a trivial helper script to help run the grapheme-break benchmark.
+# You probably want to tweak this script depending on what you're
+# trying to measure.
+
+# Options:
+# - "ascii", uniform random ASCII bytes
+# - "utf8", uniform random unicode characters, encoded as utf8
+# - "rand", pure random data, will contain many invalid code sequences.
+DATA="utf8"
+SIZE="25000000"
+
+# Additional arguments appended (space-separated) to every benchmark command.
+ARGS=""
+
+# Generate the benchmark input ahead of time so it's not included in the time.
+./zig-out/bin/bench-stream "--mode=gen-${DATA}" | head -c "${SIZE}" > /tmp/ghostty_bench_data
+#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data
+
+# Uncomment to instead use the contents of `stream.txt` as input.
+# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data
+
+hyperfine \
+  --warmup 10 \
+  -n noop \
+  "./zig-out/bin/bench-grapheme-break --mode=noop ${ARGS} </tmp/ghostty_bench_data" \
+  -n ziglyph \
+  "./zig-out/bin/bench-grapheme-break --mode=ziglyph ${ARGS} </tmp/ghostty_bench_data" \
+  -n table \
+  "./zig-out/bin/bench-grapheme-break --mode=table ${ARGS} </tmp/ghostty_bench_data"
+
+
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@ -0,0 +1,144 @@
+//! This benchmark tests the throughput of grapheme break calculation.
+//! This is a common operation in terminal character printing for terminals
+//! that support grapheme clustering.
+//!
+//! This will consume all of the available stdin, so you should run it
+//! with `head` in a pipe to restrict. For example, to test ASCII input:
+//!
+//!   bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=ziglyph
+//!
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const ArenaAllocator = std.heap.ArenaAllocator;
+const ziglyph = @import("ziglyph");
+const cli = @import("../cli.zig");
+const simd = @import("../simd/main.zig");
+const unicode = @import("../unicode/main.zig");
+const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
+
+const Args = struct {
+    mode: Mode = .noop,
+
+    /// Size in bytes of the stdin read buffer. Doesn't usually need to be
+    /// changed. The main point is to make this runtime known so we can avoid
+    /// compiler optimizations.
+    @"buffer-size": usize = 4096,
+
+    /// Set by the CLI parser; released by `deinit` when non-null.
+    _arena: ?ArenaAllocator = null,
+
+    pub fn deinit(self: *Args) void {
+        if (self._arena) |arena| arena.deinit();
+        self.* = undefined;
+    }
+};
+
+const Mode = enum {
+    /// The baseline mode reads the data from the fd into a buffer and only
+    /// runs the UTF-8 decoder over it, with no grapheme break check. This
+    /// establishes a baseline for the other modes.
+    noop,
+
+    /// Use the ziglyph library to detect grapheme breaks between codepoints.
+    ziglyph,
+
+    /// Ghostty's table-based approach.
+    table,
+};
+
+pub const std_options = struct {
+    pub const log_level: std.log.Level = .debug;
+};
+
+pub fn main() !void {
+    // We want to use the c allocator because it is much faster than GPA.
+    const alloc = std.heap.c_allocator;
+
+    // Parse our args
+    var args: Args = .{};
+    defer args.deinit();
+    {
+        var iter = try std.process.argsWithAllocator(alloc);
+        defer iter.deinit();
+        try cli.args.parse(Args, alloc, &args, &iter);
+    }
+
+    const reader = std.io.getStdIn().reader();
+    const buf = try alloc.alloc(u8, args.@"buffer-size");
+
+    // Run the benchmark loop for the selected mode over all of stdin.
+    switch (args.mode) {
+        .noop => try benchNoop(reader, buf),
+        .ziglyph => try benchZiglyph(reader, buf),
+        .table => try benchTable(reader, buf),
+    }
+}
+
+noinline fn benchNoop(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Feed the decoder one byte at a time (a naive scalar approach);
+        // the decoder's result is discarded.
+        for (buf[0..n]) |c| {
+            _ = d.next(c);
+        }
+    }
+}
+
+noinline fn benchTable(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: unicode.GraphemeBreakState = .{};
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Decode byte-by-byte (a naive scalar approach) and check each
+        // codepoint the decoder yields against the previous one.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v)); // NOTE(review): presumably keeps `v` live so the call isn't optimized out
+                cp1 = cp2;
+            }
+        }
+    }
+}
+
+noinline fn benchZiglyph(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: u3 = 0;
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Decode byte-by-byte (a naive scalar approach) and check each
+        // codepoint the decoder yields against the previous one.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v)); // NOTE(review): presumably keeps `v` live so the call isn't optimized out
+                cp1 = cp2;
+            }
+        }
+    }
+}
--- a/src/build_config.zig
+++ b/src/build_config.zig
@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum {
    bench_parser,
    bench_stream,
    bench_codepoint_width,
+    bench_grapheme_break,
 };
--- a/src/main.zig
+++ b/src/main.zig
@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
    .bench_parser => @import("bench/parser.zig"),
    .bench_stream => @import("bench/stream.zig"),
    .bench_codepoint_width => @import("bench/codepoint-width.zig"),
+    .bench_grapheme_break => @import("bench/grapheme-break.zig"),
 };
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@ -6,7 +6,6 @@ const Terminal = @This();

 const std = @import("std");
 const builtin = @import("builtin");
-const ziglyph = @import("ziglyph");
 const testing = std.testing;
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void {
        if (prev.cell.char == 0) break :grapheme;

        const grapheme_break = brk: {
-            var state: u3 = 0;
+            var state: unicode.GraphemeBreakState = .{};
            var cp1: u21 = @intCast(prev.cell.char);
            if (prev.cell.attrs.grapheme) {
                var it = row.codepointIterator(prev.x);
                while (it.next()) |cp2| {
                    // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 });
-                    assert(!ziglyph.graphemeBreak(
-                        cp1,
-                        cp2,
-                        &state,
-                    ));
-
+                    assert(!unicode.graphemeBreak(cp1, cp2, &state));
                    cp1 = cp2;
                }
            }

            // log.debug("cp1={x} cp2={x} end", .{ cp1, c });
-            break :brk ziglyph.graphemeBreak(cp1, c, &state);
+            break :brk unicode.graphemeBreak(cp1, c, &state);
        };

        // If we can NOT break, this means that "c" is part of a grapheme
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@ -0,0 +1,183 @@
+const std = @import("std");
+const props = @import("props.zig");
+const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
+const table = props.table;
+
+/// Returns true if there is a grapheme break between `cp1` and `cp2`.
+/// Calls must be made sequentially, passing the same `state` each time;
+/// `state` is updated in place from the precomputed lookup table.
+///
+/// This function does NOT work with control characters. Control characters,
+/// line feeds, and carriage returns are expected to be filtered out before
+/// calling this function, because this function is tuned for Ghostty.
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
+    const value = Precompute.data[
+        (Precompute.Key{
+            .gbc1 = table.get(cp1).grapheme_boundary_class,
+            .gbc2 = table.get(cp2).grapheme_boundary_class,
+            .state = state.*,
+        }).index()
+    ];
+    state.* = value.state;
+    return value.result;
+}
+
+/// The state carried between sequential calls to `graphemeBreak`.
+pub const BreakState = packed struct(u2) {
+    extended_pictographic: bool = false, // set once an extended pictographic is seen (GB11)
+    regional_indicator: bool = false, // toggled on regional indicator pairs (GB12, GB13)
+};
+
+/// This is all the structures and data for the precomputed lookup table
+/// for all possible permutations of state and grapheme boundary classes.
+/// The key is 10 bits wide, so the table has 2^10 slots each holding a
+/// 3 bit value; the whole table is about 1KB.
+const Precompute = struct {
+    const Key = packed struct(u10) {
+        state: BreakState,
+        gbc1: GraphemeBoundaryClass,
+        gbc2: GraphemeBoundaryClass,
+
+        fn index(self: Key) usize {
+            return @intCast(@as(u10, @bitCast(self)));
+        }
+    };
+
+    const Value = packed struct(u3) {
+        result: bool,
+        state: BreakState,
+    };
+
+    const data = precompute: {
+        var result: [std.math.maxInt(u10) + 1]Value = undefined; // one slot per possible u10 key
+
+        @setEvalBranchQuota(2_000);
+        const info = @typeInfo(GraphemeBoundaryClass).Enum;
+        for (0..std.math.maxInt(u2) + 1) |state_init| {
+            for (info.fields) |field1| {
+                for (info.fields) |field2| {
+                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    const key: Key = .{
+                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
+                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .state = state,
+                    };
+                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    result[key.index()] = .{ .result = v, .state = state };
+                }
+            }
+        }
+
+        break :precompute result;
+    };
+};
+
+/// This is the algorithm from utf8proc. We only use this at comptime for
+/// precomputing the lookup table.
+fn graphemeBreakClass(
+    gbc1: GraphemeBoundaryClass,
+    gbc2: GraphemeBoundaryClass,
+    state: *BreakState,
+) bool {
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
+        state.extended_pictographic = true;
+    }
+
+    // These two properties are ignored because they're not relevant to
+    // Ghostty -- they're filtered out before checking grapheme boundaries.
+    // GB3: CR x LF
+    // GB4: Control
+
+    // GB6: Hangul L x (L|V|LV|LVT)
+    if (gbc1 == .L) {
+        if (gbc2 == .L or
+            gbc2 == .V or
+            gbc2 == .LV or
+            gbc2 == .LVT) return false;
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbc1 == .LV or gbc1 == .V) {
+        if (gbc2 == .V or
+            gbc2 == .T) return false;
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbc1 == .LVT or gbc1 == .T) {
+        if (gbc2 == .T) return false;
+    }
+
+    // GB9: x (Extend | ZWJ)
+    if (gbc2 == .extend or gbc2 == .zwj) return false;
+
+    // GB9a: x Spacing
+    if (gbc2 == .spacing_mark) return false;
+
+    // GB9b: Prepend x
+    if (gbc1 == .prepend) return false;
+
+    // GB12, GB13: RI x RI
+    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
+        if (state.regional_indicator) {
+            state.regional_indicator = false;
+            return true;
+        } else {
+            state.regional_indicator = true;
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (state.extended_pictographic and
+        gbc1 == .zwj and
+        gbc2 == .extended_pictographic)
+    {
+        state.extended_pictographic = false;
+        return false;
+    }
+
+    return true;
+}
+
+/// If you build this file as a binary, we will verify the grapheme break
+/// implementation against ziglyph. This iterates over every pair of
+/// codepoints so it is SLOW. Not meant for CI, but useful for debugging.
+pub fn main() !void {
+    const ziglyph = @import("ziglyph");
+
+    // Set the min and max to control the test range.
+    const min = 0;
+    const max = std.math.maxInt(u21) + 1;
+
+    var state: BreakState = .{};
+    var zg_state: u3 = 0;
+    for (min..max) |cp1| {
+        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
+
+        if (cp1 == '\r' or cp1 == '\n' or
+            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
+
+        for (min..max) |cp2| {
+            if (cp2 == '\r' or cp2 == '\n' or
+                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
+
+            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+            if (gb != zg_gb) {
+                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+                    cp1,
+                    cp2,
+                    gb,
+                    state,
+                    zg_gb,
+                    zg_state,
+                });
+            }
+        }
+    }
+}
+
+pub const std_options = struct {
+    pub const log_level: std.log.Level = .info;
+};
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -1,8 +1,11 @@
 pub const lut = @import("lut.zig");

+const grapheme = @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
+pub const graphemeBreak = grapheme.graphemeBreak;
+pub const GraphemeBreakState = grapheme.BreakState;

 test {
    @import("std").testing.refAllDecls(@This());
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@ -27,9 +27,13 @@ pub const Properties = struct {
    /// becomes a 2-em dash).
    width: u2 = 0,

+    /// Grapheme boundary class.
+    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
    // Needed for lut.Generator
    pub fn eql(a: Properties, b: Properties) bool {
-        return a.width == b.width;
+        return a.width == b.width and
+            a.grapheme_boundary_class == b.grapheme_boundary_class;
    }

    // Needed for lut.Generator
@ -41,17 +45,64 @@ pub const Properties = struct {
    ) !void {
        _ = layout;
        _ = opts;
-        try std.fmt.format(writer, ".{{ .width= {}, }}", .{
+        try std.fmt.format(writer,
+            \\.{{
+            \\    .width= {},
+            \\    .grapheme_boundary_class= .{s},
+            \\}}
+        , .{
            self.width,
+            @tagName(self.grapheme_boundary_class),
        });
    }
 };

+/// Possible grapheme boundary classes. This isn't an exhaustive list:
+/// we omit control, CR, LF, etc. because in Ghostty's usage they are
+/// impossible: they're handled by the terminal.
+pub const GraphemeBoundaryClass = enum(u4) {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+
+    /// Gets the grapheme boundary class for a codepoint. This is VERY
+    /// SLOW. The use case for this is only in generating lookup tables.
+    pub fn init(cp: u21) GraphemeBoundaryClass {
+        if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
+        if (ziglyph.grapheme_break.isL(cp)) return .L;
+        if (ziglyph.grapheme_break.isV(cp)) return .V;
+        if (ziglyph.grapheme_break.isT(cp)) return .T;
+        if (ziglyph.grapheme_break.isLv(cp)) return .LV;
+        if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
+        if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
+        if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
+        if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
+        if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
+        if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
+
+        // This is obviously not INVALID invalid, there is SOME grapheme
+        // boundary class for every codepoint. But we don't care about
+        // anything that doesn't fit into the above categories.
+        return .invalid;
+    }
+};
+
 pub fn get(cp: u21) Properties {
    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);

    return .{
        .width = @intCast(@min(2, @max(0, zg_width))),
+        .grapheme_boundary_class = GraphemeBoundaryClass.init(cp),
    };
 }