Merge pull request #1494 from mitchellh/grapheme-break

Optimized grapheme break detection (6x speedup)
2025-07-17 09:16:11 +03:00 · 2024-02-10 07:54:06 -08:00
parent 49cc5e2f47 5275d44e7d
commit f7c945c4a7
8 changed files with 421 additions and 11 deletions
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@ -0,0 +1,33 @@
 #!/usr/bin/env bash
 #
 # This is a trivial helper script to help run the grapheme-break benchmark.
 # You probably want to tweak this script depending on what you're
 # trying to measure.
 # Options:
 # - "ascii", uniform random ASCII bytes
 # - "utf8", uniform random unicode characters, encoded as utf8
 # - "rand", pure random data, will contain many invalid code sequences.
 DATA="utf8"
 SIZE="25000000"
 # Additional arguments appended to every benchmark invocation, for example
 # ARGS="--buffer-size=8192". Leave empty to use the defaults.
 ARGS=""
 # Generate the benchmark input ahead of time so it's not included in the time.
 ./zig-out/bin/bench-stream --mode="gen-${DATA}" | head -c "${SIZE}" > /tmp/ghostty_bench_data
 #cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data
 # Uncomment to instead use the contents of `stream.txt` as input.
 # yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data
 # Note the space before ${ARGS}: without it the extra arguments would be
 # glued onto the --mode value and the mode would fail to parse.
 hyperfine \
  --warmup 10 \
  -n noop \
  "./zig-out/bin/bench-grapheme-break --mode=noop ${ARGS} </tmp/ghostty_bench_data" \
  -n ziglyph \
  "./zig-out/bin/bench-grapheme-break --mode=ziglyph ${ARGS} </tmp/ghostty_bench_data" \
  -n table \
  "./zig-out/bin/bench-grapheme-break --mode=table ${ARGS} </tmp/ghostty_bench_data"
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@ -0,0 +1,144 @@
 //! This benchmark tests the throughput of grapheme break calculation.
 //! This is a common operation in terminal character printing for terminals
 //! that support grapheme clustering.
 //!
 //! This will consume all of the available stdin, so you should run it
 //! with `head` in a pipe to restrict. For example, to test ASCII input:
 //!
 //!   bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=ziglyph
 //!
 const std = @import("std");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const ArenaAllocator = std.heap.ArenaAllocator;
 const ziglyph = @import("ziglyph");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
 const unicode = @import("../unicode/main.zig");
 const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
 const Args = struct {
    /// Which benchmark implementation to run. Defaults to the baseline.
    mode: Mode = .noop,
    /// Length in bytes of the buffer stdin is read into. Rarely worth
    /// changing; it is a runtime value mainly so the compiler cannot
    /// specialize the benchmark loops on a known size.
    @"buffer-size": usize = 4096,
    /// Filled in by the CLI parser; released by `deinit`.
    _arena: ?ArenaAllocator = null,
    /// Frees the parser's arena (if one was created) and invalidates the
    /// struct so it cannot be reused afterwards.
    pub fn deinit(self: *Args) void {
        if (self._arena) |arena| {
            arena.deinit();
        }
        self.* = undefined;
    }
 };
 /// The benchmark implementation selected with `--mode`.
 const Mode = enum {
    /// The baseline mode copies the data from the fd into a buffer and
    /// feeds it through the UTF-8 decoder without any grapheme break
    /// calculation. This is used to show the minimal overhead of reading
    /// the fd into memory and establishes a baseline for the other modes.
    noop,
    /// Use the ziglyph library to calculate the grapheme break between
    /// each pair of consecutive codepoints.
    ziglyph,
    /// Ghostty's table-based approach.
    table,
 };
 /// Standard library option overrides for this executable: log messages
 /// at `.debug` level and above.
 pub const std_options = struct {
    pub const log_level: std.log.Level = .debug;
 };
 /// Entry point: parses the CLI arguments, allocates the read buffer and
 /// runs the benchmark selected by `--mode` over all of stdin.
 pub fn main() !void {
    // We want to use the c allocator because it is much faster than GPA.
    const alloc = std.heap.c_allocator;
    // Parse our args
    var args: Args = .{};
    defer args.deinit();
    {
        var iter = try std.process.argsWithAllocator(alloc);
        defer iter.deinit();
        try cli.args.parse(Args, alloc, &args, &iter);
    }
    const reader = std.io.getStdIn().reader();
    // The buffer size is a runtime value (see `Args`), so it has to be heap
    // allocated. Pair the allocation with its release right away.
    const buf = try alloc.alloc(u8, args.@"buffer-size");
    defer alloc.free(buf);
    // Dispatch to the selected benchmark implementation.
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
        .ziglyph => try benchZiglyph(reader, buf),
        .table => try benchTable(reader, buf),
    }
 }
 /// Baseline benchmark: drains `reader` through `buf` and pushes every byte
 /// through the UTF-8 decoder, discarding the decoded result.
 noinline fn benchNoop(
    reader: anytype,
    buf: []u8,
 ) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;
        // Decode one byte at a time: a deliberately naive scalar approach.
        for (buf[0..len]) |byte| _ = decoder.next(byte);
    }
 }
 /// Benchmarks Ghostty's table-based `unicode.graphemeBreak`: drains
 /// `reader` through `buf`, decodes UTF-8 one byte at a time and runs the
 /// break check between each decoded codepoint and the previous one.
 noinline fn benchTable(
    reader: anytype,
    buf: []u8,
 ) !void {
    var decoder: UTF8Decoder = .{};
    var break_state: unicode.GraphemeBreakState = .{};
    var prev: u21 = 0;
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;
        // Decode one byte at a time: a deliberately naive scalar approach.
        for (buf[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            // NOTE(review): presumably the decoder reports `consumed == false`
            // for bytes it wants re-fed (e.g. invalid sequences), which would
            // trip this assert on non-UTF-8 input -- confirm against
            // UTF8Decoder before benchmarking random data.
            assert(consumed);
            const cp = maybe_cp orelse continue;
            const is_break = unicode.graphemeBreak(prev, @intCast(cp), &break_state);
            // Store the result so the computation has an observable effect.
            buf[0] = @intCast(@intFromBool(is_break));
            prev = cp;
        }
    }
 }
 /// Benchmarks `ziglyph.graphemeBreak`: drains `reader` through `buf`,
 /// decodes UTF-8 one byte at a time and runs the break check between each
 /// decoded codepoint and the previous one.
 noinline fn benchZiglyph(
    reader: anytype,
    buf: []u8,
 ) !void {
    var decoder: UTF8Decoder = .{};
    var break_state: u3 = 0;
    var prev: u21 = 0;
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;
        // Decode one byte at a time: a deliberately naive scalar approach.
        for (buf[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            // NOTE(review): presumably the decoder reports `consumed == false`
            // for bytes it wants re-fed (e.g. invalid sequences), which would
            // trip this assert on non-UTF-8 input -- confirm against
            // UTF8Decoder before benchmarking random data.
            assert(consumed);
            const cp = maybe_cp orelse continue;
            const is_break = ziglyph.graphemeBreak(prev, @intCast(cp), &break_state);
            // Store the result so the computation has an observable effect.
            buf[0] = @intCast(@intFromBool(is_break));
            prev = cp;
        }
    }
 }
--- a/src/build_config.zig
+++ b/src/build_config.zig
@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum {
    bench_parser,
    bench_stream,
    bench_codepoint_width,
    bench_grapheme_break,
 };
--- a/src/main.zig
+++ b/src/main.zig
@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
    .bench_parser => @import("bench/parser.zig"),
    .bench_stream => @import("bench/stream.zig"),
    .bench_codepoint_width => @import("bench/codepoint-width.zig"),
    .bench_grapheme_break => @import("bench/grapheme-break.zig"),
 };
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@ -6,7 +6,6 @@ const Terminal = @This();
 const std = @import("std");
 const builtin = @import("builtin");
 const ziglyph = @import("ziglyph");
 const testing = std.testing;
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void {
        if (prev.cell.char == 0) break :grapheme;
        const grapheme_break = brk: {
-            var state: u3 = 0;
+            var state: unicode.GraphemeBreakState = .{};
            var cp1: u21 = @intCast(prev.cell.char);
            if (prev.cell.attrs.grapheme) {
                var it = row.codepointIterator(prev.x);
                while (it.next()) |cp2| {
                    // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 });
-                    assert(!ziglyph.graphemeBreak(
+                    assert(!unicode.graphemeBreak(cp1, cp2, &state));
                        cp1,
                        cp2,
                        &state,
                    ));
                    cp1 = cp2;
                }
            }
            // log.debug("cp1={x} cp2={x} end", .{ cp1, c });
-            break :brk ziglyph.graphemeBreak(cp1, c, &state);
+            break :brk unicode.graphemeBreak(cp1, c, &state);
        };
        // If we can NOT break, this means that "c" is part of a grapheme
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@ -0,0 +1,183 @@
 const std = @import("std");
 const props = @import("props.zig");
 const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
 const table = props.table;
 /// Reports whether a grapheme break exists between `cp1` and `cp2`.
 /// Calls must be made in sequence over the text, threading the same
 /// `state` through every call; `state` is updated in place.
 ///
 /// Control characters, line feeds, and carriage returns are NOT handled:
 /// callers are expected to filter them out beforehand. This function is
 /// tuned specifically for Ghostty's usage.
 pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    // Build the lookup key from both boundary classes plus the current state.
    const key: Precompute.Key = .{
        .gbc1 = table.get(cp1).grapheme_boundary_class,
        .gbc2 = table.get(cp2).grapheme_boundary_class,
        .state = state.*,
    };
    const entry = Precompute.data[key.index()];
    state.* = entry.state;
    return entry.result;
 }
 /// The state that must be maintained between calls to `graphemeBreak`.
 /// Packed into two bits so it can form part of the lookup table key.
 pub const BreakState = packed struct(u2) {
    /// Set once the left-hand codepoint of a pair is extended pictographic;
    /// cleared again when a ZWJ joins onto a following extended pictographic
    /// codepoint (rule GB11).
    extended_pictographic: bool = false,
    /// Toggled for each regional indicator pair so that only every second
    /// RI x RI boundary is a break (rules GB12/GB13).
    regional_indicator: bool = false,
 };
 /// This is all the structures and data for the precomputed lookup table
 /// for all possible permutations of state and grapheme boundary classes.
 /// The key is 10 bits wide, so the table holds 2^10 entries of 3 bit
 /// values; the whole table is roughly 1KB.
 const Precompute = struct {
    /// Lookup key: the current break state plus the boundary classes of
    /// the two codepoints, bit-packed so it can be used as an array index.
    const Key = packed struct(u10) {
        state: BreakState,
        gbc1: GraphemeBoundaryClass,
        gbc2: GraphemeBoundaryClass,
        /// Reinterprets the packed bits as an index into `data`.
        fn index(self: Key) usize {
            return @intCast(@as(u10, @bitCast(self)));
        }
    };
    /// Lookup result: whether there is a break, and the state to carry
    /// into the next call.
    const Value = packed struct(u3) {
        result: bool,
        state: BreakState,
    };
    /// The table itself, computed at compile time by running
    /// `graphemeBreakClass` for every state and every pair of declared
    /// boundary classes.
    const data = precompute: {
        // One slot for every possible u10 key, i.e. indices 0 through
        // maxInt(u10) inclusive. Slots whose class bits do not name a
        // declared GraphemeBoundaryClass field are never written and stay
        // undefined; they are not reachable through a valid `Key`.
        var result: [std.math.maxInt(u10) + 1]Value = undefined;
        @setEvalBranchQuota(2_000);
        const info = @typeInfo(GraphemeBoundaryClass).Enum;
        for (0..std.math.maxInt(u2) + 1) |state_init| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
                    // Build the key BEFORE running the algorithm: the call
                    // below mutates `state` into the output state.
                    const key: Key = .{
                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
                        .state = state,
                    };
                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }
        break :precompute result;
    };
 };
 /// This is the algorithm from utf8proc. We only use this offline for
 /// precomputing the lookup table.
 ///
 /// Returns true if there is a grapheme break between a codepoint of class
 /// `gbc1` and a following codepoint of class `gbc2`. `state` is read and
 /// updated in place. The rules are checked in order and the first one
 /// that matches decides the result, so the order of the checks matters.
 fn graphemeBreakClass(
    gbc1: GraphemeBoundaryClass,
    gbc2: GraphemeBoundaryClass,
    state: *BreakState,
 ) bool {
    // GB11: Emoji Extend* ZWJ x Emoji
    // Remember that an extended pictographic codepoint has been seen on the
    // left-hand side so a later ZWJ can join onto the next one.
    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
        state.extended_pictographic = true;
    }
    // These two rules are ignored because they're not relevant to
    // Ghostty -- they're filtered out before checking grapheme boundaries.
    // GB3: CR x LF
    // GB4: Control
    // GB6: Hangul L x (L|V|LV|LVT)
    if (gbc1 == .L) {
        if (gbc2 == .L or
            gbc2 == .V or
            gbc2 == .LV or
            gbc2 == .LVT) return false;
    }
    // GB7: Hangul (LV | V) x (V | T)
    if (gbc1 == .LV or gbc1 == .V) {
        if (gbc2 == .V or
            gbc2 == .T) return false;
    }
    // GB8: Hangul (LVT | T) x T
    if (gbc1 == .LVT or gbc1 == .T) {
        if (gbc2 == .T) return false;
    }
    // GB9: x (Extend | ZWJ)
    if (gbc2 == .extend or gbc2 == .zwj) return false;
    // GB9a: x Spacing
    if (gbc2 == .spacing_mark) return false;
    // GB9b: Prepend x
    if (gbc1 == .prepend) return false;
    // GB12, GB13: RI x RI
    // Regional indicators pair up: the first RI x RI boundary does not
    // break and sets the flag, the next one breaks and clears it.
    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
        if (state.regional_indicator) {
            state.regional_indicator = false;
            return true;
        } else {
            state.regional_indicator = true;
            return false;
        }
    }
    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.extended_pictographic and
        gbc1 == .zwj and
        gbc2 == .extended_pictographic)
    {
        state.extended_pictographic = false;
        return false;
    }
    // No rule matched: break.
    return true;
 }
 /// Building this file as a binary produces a verifier for the grapheme
 /// break implementation: it compares `graphemeBreak` against ziglyph for
 /// every ordered pair of codepoints in the configured range and logs each
 /// disagreement. That is an enormous number of pairs, so it is SLOW. It's
 /// not meant to be run in CI, but it's useful for debugging.
 pub fn main() !void {
    const ziglyph = @import("ziglyph");
    // Half-open codepoint range [lo, hi) to verify; narrow it to test less.
    const lo = 0;
    const hi = std.math.maxInt(u21) + 1;
    var ours: BreakState = .{};
    var theirs: u3 = 0;
    for (lo..hi) |a| {
        if (a % 1000 == 0) std.log.warn("progress cp1={}", .{a});
        const cp1: u21 = @intCast(a);
        // CR, LF and control characters are outside graphemeBreak's contract.
        if (cp1 == '\r' or cp1 == '\n') continue;
        if (ziglyph.grapheme_break.isControl(cp1)) continue;
        for (lo..hi) |b| {
            const cp2: u21 = @intCast(b);
            if (cp2 == '\r' or cp2 == '\n') continue;
            if (ziglyph.grapheme_break.isControl(cp2)) continue;
            const gb = graphemeBreak(cp1, cp2, &ours);
            const zg_gb = ziglyph.graphemeBreak(cp1, cp2, &theirs);
            if (gb == zg_gb) continue;
            std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
                a,
                b,
                gb,
                ours,
                zg_gb,
                theirs,
            });
        }
    }
 }
 /// Standard library option overrides used when this file is built as a
 /// binary: log messages at `.info` level and above.
 pub const std_options = struct {
    pub const log_level: std.log.Level = .info;
 };
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -1,8 +1,11 @@
 pub const lut = @import("lut.zig");
 const grapheme = @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
 pub const GraphemeBreakState = grapheme.BreakState;
 test {
    @import("std").testing.refAllDecls(@This());
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@ -27,9 +27,13 @@ pub const Properties = struct {
    /// becomes a 2-em dash).
    width: u2 = 0,
    /// Grapheme boundary class.
    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
    // Needed for lut.Generator
    pub fn eql(a: Properties, b: Properties) bool {
-        return a.width == b.width;
+        return a.width == b.width and
            a.grapheme_boundary_class == b.grapheme_boundary_class;
    }
    // Needed for lut.Generator
@ -41,17 +45,64 @@ pub const Properties = struct {
    ) !void {
        _ = layout;
        _ = opts;
-        try std.fmt.format(writer, ".{{ .width= {}, }}", .{
+        try std.fmt.format(writer,
            \\.{{
            \\    .width= {},
            \\    .grapheme_boundary_class= .{s},
            \\}}
        , .{
            self.width,
            @tagName(self.grapheme_boundary_class),
        });
    }
 };
 /// Possible grapheme boundary classes. This isn't an exhaustive list:
 /// we omit control, CR, LF, etc. because in Ghostty's usage those are
 /// impossible: they're handled by the terminal.
 pub const GraphemeBoundaryClass = enum(u4) {
    /// Catch-all for codepoints that match none of the classes below.
    invalid,
    L,
    V,
    T,
    LV,
    LVT,
    prepend,
    extend,
    zwj,
    spacing_mark,
    regional_indicator,
    extended_pictographic,
    /// Gets the grapheme boundary class for a codepoint. This is VERY
    /// SLOW. The use case for this is only in generating lookup tables.
    ///
    /// The checks run in order and the first match wins, so the emoji
    /// checks take precedence over the ziglyph grapheme break properties.
    pub fn init(cp: u21) GraphemeBoundaryClass {
        if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
        // Emoji modifiers are classified as extend.
        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
        if (ziglyph.grapheme_break.isL(cp)) return .L;
        if (ziglyph.grapheme_break.isV(cp)) return .V;
        if (ziglyph.grapheme_break.isT(cp)) return .T;
        if (ziglyph.grapheme_break.isLv(cp)) return .LV;
        if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
        if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
        if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
        if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
        if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
        if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
        // This is obviously not INVALID invalid, there is SOME grapheme
        // boundary class for every codepoint. But we don't care about
        // anything that doesn't fit into the above categories.
        return .invalid;
    }
 };
 /// Computes the properties for `cp` from ziglyph: the display width
 /// clamped into the range 0 to 2, and the grapheme boundary class.
 pub fn get(cp: u21) Properties {
    const raw_width = ziglyph.display_width.codePointWidth(cp, .half);
    // Clamp to what the width field is built from: at least 0, at most 2.
    const clamped_width = @min(2, @max(0, raw_width));
    return .{
        .width = @intCast(clamped_width),
        .grapheme_boundary_class = GraphemeBoundaryClass.init(cp),
    };
 }