From 64376235002e3b85efdae29700b162d572329f96 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 09:12:05 -0800
Subject: [PATCH 1/7] bench/grapheme-break

---
 build.zig                    |   9 +++
 build.zig.zon                |   1 +
 src/bench/grapheme-break.sh  |  32 ++++++++
 src/bench/grapheme-break.zig | 144 +++++++++++++++++++++++++++++++++++
 src/build_config.zig         |   1 +
 src/main.zig                 |   1 +
 6 files changed, 188 insertions(+)
 create mode 100755 src/bench/grapheme-break.sh
 create mode 100644 src/bench/grapheme-break.zig

diff --git a/build.zig b/build.zig
index 0669f27cc..445cf4a98 100644
--- a/build.zig
+++ b/build.zig
@@ -1082,6 +1082,15 @@ fn addDeps(
     step.linkLibrary(utfcpp_dep.artifact("utfcpp"));
     try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin());
 
+    // utf8proc
+    const utf8proc_dep = b.dependency("utf8proc", .{
+        .target = target,
+        .optimize = optimize,
+    });
+    step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc"));
+    step.linkLibrary(utf8proc_dep.artifact("utf8proc"));
+    try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin());
+
     // Spirv-Cross
     step.linkLibrary(spirv_cross_dep.artifact("spirv_cross"));
     try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin());
diff --git a/build.zig.zon b/build.zig.zon
index a694562ea..535d51c24 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -39,6 +39,7 @@
         .pixman = .{ .path = "./pkg/pixman" },
         .simdutf = .{ .path = "./pkg/simdutf" },
         .utfcpp = .{ .path = "./pkg/utfcpp" },
+        .utf8proc = .{ .path = "./pkg/utf8proc" },
         .zlib = .{ .path = "./pkg/zlib" },
 
         // Shader translation
diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
new file mode 100755
index 000000000..56bd28dd1
--- /dev/null
+++ b/src/bench/grapheme-break.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# This is a trivial helper script to help run the grapheme-break benchmark.
+# You probably want to tweak this script depending on what you're
+# trying to measure.
+
+# Options:
+# - "ascii", uniform random ASCII bytes
+# - "utf8", uniform random unicode characters, encoded as utf8
+# - "rand", pure random data, will contain many invalid code sequences.
+DATA="utf8"
+SIZE="25000000"
+
+# Add additional arguments
+ARGS=""
+
+# Generate the benchmark input ahead of time so it's not included in the time.
+./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data
+#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data
+
+# Uncomment to instead use the contents of `stream.txt` as input.
+# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data
+
+hyperfine \
+  --warmup 10 \
+  -n noop \
+  "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \
+  -n ziglyph \
+  "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
+  -n utf8proc \
+  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data"
+
diff --git a/src/bench/grapheme-break.zig b/src/bench/grapheme-break.zig
new file mode 100644
index 000000000..108c3e29d
--- /dev/null
+++ b/src/bench/grapheme-break.zig
@@ -0,0 +1,144 @@
+//! This benchmark tests the throughput of grapheme break calculation.
+//! This is a common operation in terminal character printing for terminals
+//! that support grapheme clustering.
+//!
+//! This will consume all of the available stdin, so you should run it
+//! with `head` in a pipe to restrict. For example, to test ASCII input:
+//!
+//!   bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=ziglyph
+//!
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const ArenaAllocator = std.heap.ArenaAllocator;
+const ziglyph = @import("ziglyph");
+const cli = @import("../cli.zig");
+const simd = @import("../simd/main.zig");
+const table = @import("../unicode/main.zig").table;
+const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
+
+const Args = struct {
+    mode: Mode = .noop,
+
+    /// The size for read buffers. Doesn't usually need to be changed. The
+    /// main point is to make this runtime known so we can avoid compiler
+    /// optimizations.
+    @"buffer-size": usize = 4096,
+
+    /// This is set by the CLI parser for deinit.
+    _arena: ?ArenaAllocator = null,
+
+    pub fn deinit(self: *Args) void {
+        if (self._arena) |arena| arena.deinit();
+        self.* = undefined;
+    }
+};
+
+const Mode = enum {
+    /// The baseline mode copies the data from the fd into a buffer. This
+    /// is used to show the minimal overhead of reading the fd into memory
+    /// and establishes a baseline for the other modes.
+    noop,
+
+    /// Use ziglyph library to calculate the display width of each codepoint.
+    ziglyph,
+
+    utf8proc,
+};
+
+pub const std_options = struct {
+    pub const log_level: std.log.Level = .debug;
+};
+
+pub fn main() !void {
+    // We want to use the c allocator because it is much faster than GPA.
+    const alloc = std.heap.c_allocator;
+
+    // Parse our args
+    var args: Args = .{};
+    defer args.deinit();
+    {
+        var iter = try std.process.argsWithAllocator(alloc);
+        defer iter.deinit();
+        try cli.args.parse(Args, alloc, &args, &iter);
+    }
+
+    const reader = std.io.getStdIn().reader();
+    const buf = try alloc.alloc(u8, args.@"buffer-size");
+
+    // Dispatch to the benchmark implementation for the selected mode.
+    switch (args.mode) {
+        .noop => try benchNoop(reader, buf),
+        .ziglyph => try benchZiglyph(reader, buf),
+        .utf8proc => try benchUtf8proc(reader, buf),
+    }
+}
+
+noinline fn benchNoop(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Using stream.next directly with a for loop applies a naive
+        // scalar approach.
+        for (buf[0..n]) |c| {
+            _ = d.next(c);
+        }
+    }
+}
+
+noinline fn benchZiglyph(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: u3 = 0;
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Using stream.next directly with a for loop applies a naive
+        // scalar approach.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v));
+                cp1 = cp2;
+            }
+        }
+    }
+}
+
+noinline fn benchUtf8proc(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    const utf8proc = @import("utf8proc");
+    var d: UTF8Decoder = .{};
+    var state: i32 = 0;
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Using stream.next directly with a for loop applies a naive
+        // scalar approach.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v));
+                cp1 = cp2;
+            }
+        }
+    }
+}
diff --git a/src/build_config.zig b/src/build_config.zig
index 32dee925a..33b76d252 100644
--- a/src/build_config.zig
+++ b/src/build_config.zig
@@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum {
     bench_parser,
     bench_stream,
     bench_codepoint_width,
+    bench_grapheme_break,
 };
diff --git a/src/main.zig b/src/main.zig
index 46a6d7d3d..8cad7ec9f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
     .bench_parser => @import("bench/parser.zig"),
     .bench_stream => @import("bench/stream.zig"),
     .bench_codepoint_width => @import("bench/codepoint-width.zig"),
+    .bench_grapheme_break => @import("bench/grapheme-break.zig"),
 };

From 0632410857f00eb0cf8aa0b1acf78b778486127c Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 12:22:23 -0800
Subject: [PATCH 2/7] unicode: get grapheme boundary class

---
 src/unicode/props.zig | 44 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index fe85844a5..d46acbf49 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -27,6 +27,9 @@ pub const Properties = struct {
     /// becomes a 2-em dash).
     width: u2 = 0,
 
+    /// Grapheme boundary class.
+    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
     // Needed for lut.Generator
     pub fn eql(a: Properties, b: Properties) bool {
         return a.width == b.width;
@@ -47,11 +50,52 @@ pub const Properties = struct {
     }
 };
 
+/// Possible grapheme boundary classes. This isn't an exhaustive list:
+/// we omit control, CR, LF, etc. because in Ghostty's usage they are
+/// impossible because they're handled by the terminal.
+pub const GraphemeBoundaryClass = enum {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+
+    /// Gets the grapheme boundary class for a codepoint. This is VERY
+    /// SLOW. The use case for this is only in generating lookup tables.
+    pub fn init(cp: u21) GraphemeBoundaryClass {
+        if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
+        if (ziglyph.grapheme_break.isL(cp)) return .L;
+        if (ziglyph.grapheme_break.isV(cp)) return .V;
+        if (ziglyph.grapheme_break.isT(cp)) return .T;
+        if (ziglyph.grapheme_break.isLv(cp)) return .LV;
+        if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
+        if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
+        if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
+        if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
+        if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
+        if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
+
+        // This is obviously not INVALID invalid, there is SOME grapheme
+        // boundary class for every codepoint. But we don't care about
+        // anything that doesn't fit into the above categories.
+        return .invalid;
+    }
+};
+
 pub fn get(cp: u21) Properties {
     const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
 
     return .{
         .width = @intCast(@min(2, @max(0, zg_width))),
+        .grapheme_boundary_class = GraphemeBoundaryClass.init(cp),
     };
 }
 

From 5f3574a4bfc33a19e0b6588ff67709afae0622bd Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 19:44:57 -0800
Subject: [PATCH 3/7] unicode: direct port of ziglyph to start

---
 build.zig                    |  11 ++
 src/bench/grapheme-break.sh  |   5 +-
 src/bench/grapheme-break.zig |  31 ++++-
 src/unicode/grapheme.zig     | 261 +++++++++++++++++++++++++++++++++++
 src/unicode/main.zig         |   1 +
 src/unicode/props.zig        |  11 +-
 6 files changed, 316 insertions(+), 4 deletions(-)
 create mode 100644 src/unicode/grapheme.zig

diff --git a/build.zig b/build.zig
index 445cf4a98..fd49b7e62 100644
--- a/build.zig
+++ b/build.zig
@@ -217,6 +217,17 @@ pub fn build(b: *std.Build) !void {
     // Add our benchmarks
     try benchSteps(b, target, config, emit_bench);
 
+    {
+        const exe = b.addExecutable(.{
+            .name = "grapheme-verify",
+            .root_source_file = .{ .path = "src/unicode/grapheme.zig" },
+            .target = target,
+            .optimize = .ReleaseFast,
+        });
+        b.installArtifact(exe);
+        _ = try addDeps(b, exe, config);
+    }
+
     // We only build an exe if we have a runtime set.
     const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{
         .name = "ghostty",
diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
index 56bd28dd1..c395c3799 100755
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@@ -28,5 +28,8 @@ hyperfine \
   -n ziglyph \
   "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
   -n utf8proc \
-  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data"
+  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
+  -n table \
+  "./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data"
+
 
diff --git a/src/bench/grapheme-break.zig b/src/bench/grapheme-break.zig
index 108c3e29d..55caca313 100644
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@@ -15,7 +15,7 @@ const ArenaAllocator = std.heap.ArenaAllocator;
 const ziglyph = @import("ziglyph");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
-const table = @import("../unicode/main.zig").table;
+const unicode = @import("../unicode/main.zig");
 const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
 
 const Args = struct {
@@ -44,6 +44,9 @@ const Mode = enum {
     /// Use ziglyph library to calculate the display width of each codepoint.
     ziglyph,
 
+    /// Ghostty's table-based approach.
+    table,
+
     utf8proc,
 };
 
@@ -71,6 +74,7 @@ pub fn main() !void {
     switch (args.mode) {
         .noop => try benchNoop(reader, buf),
         .ziglyph => try benchZiglyph(reader, buf),
+        .table => try benchTable(reader, buf),
         .utf8proc => try benchUtf8proc(reader, buf),
     }
 }
@@ -92,6 +96,31 @@ noinline fn benchNoop(
     }
 }
 
+noinline fn benchTable(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: u3 = 0;
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Using stream.next directly with a for loop applies a naive
+        // scalar approach.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v));
+                cp1 = cp2;
+            }
+        }
+    }
+}
+
 noinline fn benchZiglyph(
     reader: anytype,
     buf: []u8,
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
new file mode 100644
index 000000000..f5a39f863
--- /dev/null
+++ b/src/unicode/grapheme.zig
@@ -0,0 +1,261 @@
+const std = @import("std");
+const props = @import("props.zig");
+const table = props.table;
+
+/// Grapheme break
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
+    const gbc1 = table.get(cp1).grapheme_boundary_class;
+    const gbc2 = table.get(cp2).grapheme_boundary_class;
+    // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{
+    //     gbc1,
+    //     gbc2,
+    //     props.GraphemeBoundaryClass.init(cp1),
+    //     props.GraphemeBoundaryClass.init(cp2),
+    // });
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state);
+
+    // These two properties are ignored because they're not relevant to
+    // Ghostty -- they're filtered out before checking grapheme boundaries.
+    // GB3: CR x LF
+    // GB4: Control
+
+    // GB6: Hangul L x (L|V|LV|LVT)
+    if (gbc1 == .L) {
+        if (gbc2 == .L or
+            gbc2 == .V or
+            gbc2 == .LV or
+            gbc2 == .LVT) return false;
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbc1 == .LV or gbc1 == .V) {
+        if (gbc2 == .V or
+            gbc2 == .T) return false;
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbc1 == .LVT or gbc1 == .T) {
+        if (gbc2 == .T) return false;
+    }
+
+    // GB9: x (Extend | ZWJ)
+    if (gbc2 == .extend or gbc2 == .zwj) return false;
+
+    // GB9a: x Spacing
+    if (gbc2 == .spacing_mark) return false;
+
+    // GB9b: Prepend x
+    if (gbc1 == .prepend) return false;
+
+    // GB12, GB13: RI x RI
+    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
+        if (hasRegional(state)) {
+            unsetRegional(state);
+            return true;
+        } else {
+            setRegional(state);
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (hasXpic(state) and
+        gbc1 == .zwj and
+        gbc2 == .extended_pictographic)
+    {
+        unsetXpic(state);
+        return false;
+    }
+
+    return true;
+}
+
+const emoji = @import("ziglyph").emoji;
+const gbp = @import("ziglyph").grapheme_break;
+
+fn isBreaker(cp: u21) bool {
+    return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp);
+}
+
+pub fn zg_graphemeBreak(
+    cp1: u21,
+    cp2: u21,
+    state: *u3,
+) bool {
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
+
+    // GB3: CR x LF
+    if (cp1 == '\r' and cp2 == '\n') {
+        std.log.warn("GB3", .{});
+        return false;
+    }
+
+    // GB4: Control
+    if (isBreaker(cp1)) {
+        std.log.warn("GB4", .{});
+        return true;
+    }
+
+    // GB6: Hangul L x (L|V|LV|VT)
+    if (gbp.isL(cp1)) {
+        if (gbp.isL(cp2) or
+            gbp.isV(cp2) or
+            gbp.isLv(cp2) or
+            gbp.isLvt(cp2))
+        {
+            std.log.warn("GB6", .{});
+            return false;
+        }
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbp.isLv(cp1) or gbp.isV(cp1)) {
+        if (gbp.isV(cp2) or
+            gbp.isT(cp2))
+        {
+            std.log.warn("GB7", .{});
+            return false;
+        }
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbp.isLvt(cp1) or gbp.isT(cp1)) {
+        if (gbp.isT(cp2)) {
+            std.log.warn("GB8", .{});
+            return false;
+        }
+    }
+
+    // GB9b: x (Extend | ZWJ)
+    if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) {
+        std.log.warn("GB9b", .{});
+        return false;
+    }
+
+    // GB9a: x Spacing
+    if (gbp.isSpacingmark(cp2)) {
+        std.log.warn("GB9a", .{});
+        return false;
+    }
+
+    // GB9b: Prepend x
+    if (gbp.isPrepend(cp1) and !isBreaker(cp2)) {
+        std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) });
+        return false;
+    }
+
+    // GB12, GB13: RI x RI
+    if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) {
+        if (hasRegional(state)) {
+            unsetRegional(state);
+            std.log.warn("GB12", .{});
+            return true;
+        } else {
+            std.log.warn("GB13", .{});
+            setRegional(state);
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (hasXpic(state) and
+        gbp.isZwj(cp1) and
+        emoji.isExtendedPictographic(cp2))
+    {
+        std.log.warn("GB11", .{});
+        unsetXpic(state);
+        return false;
+    }
+
+    return true;
+}
+
+fn hasXpic(state: *const u3) bool {
+    return state.* & 1 == 1;
+}
+
+fn setXpic(state: *u3) void {
+    state.* |= 1;
+}
+
+fn unsetXpic(state: *u3) void {
+    state.* ^= 1;
+}
+
+fn hasRegional(state: *const u3) bool {
+    return state.* & 2 == 2;
+}
+
+fn setRegional(state: *u3) void {
+    state.* |= 2;
+}
+
+fn unsetRegional(state: *u3) void {
+    state.* ^= 2;
+}
+
+/// If you build this file as a binary, we will verify the grapheme break
+/// implementation. This iterates over trillions of codepoint pairs so it is
+/// SLOW. It's not meant to be run in CI, but it's useful for debugging.
+pub fn main() !void {
+    const ziglyph = @import("ziglyph");
+
+    var state: u3 = 0;
+    var zg_state: u3 = 0;
+    for (0..std.math.maxInt(u21) + 1) |cp1| {
+        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
+
+        if (cp1 == '\r' or cp1 == '\n' or
+            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
+
+        for (0..std.math.maxInt(u21) + 1) |cp2| {
+            if (cp2 == '\r' or cp2 == '\n' or
+                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
+
+            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+            if (gb != zg_gb) {
+                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+                    cp1,
+                    cp2,
+                    gb,
+                    state,
+                    zg_gb,
+                    zg_state,
+                });
+            }
+        }
+    }
+}
+
+pub const std_options = struct {
+    pub const log_level: std.log.Level = .info;
+};
+
+// test "matches ziglyph specific" {
+//     const testing = std.testing;
+//
+//     var state: u3 = 0;
+//     var zg_state: u3 = 0;
+//
+//     const cp1 = 0x20;
+//     const cp2 = 0x300;
+//
+//     const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+//     const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+//     if (gb != zg_gb) {
+//         std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+//             cp1,
+//             cp2,
+//             gb,
+//             state,
+//             zg_gb,
+//             zg_state,
+//         });
+//         try testing.expect(false);
+//     }
+// }
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index fa0cb9fc8..1af26d485 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,5 +1,6 @@
 pub const lut = @import("lut.zig");
 
+pub usingnamespace @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index d46acbf49..d6f282ed9 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -32,7 +32,8 @@ pub const Properties = struct {
 
     // Needed for lut.Generator
     pub fn eql(a: Properties, b: Properties) bool {
-        return a.width == b.width;
+        return a.width == b.width and
+            a.grapheme_boundary_class == b.grapheme_boundary_class;
     }
 
     // Needed for lut.Generator
@@ -44,8 +45,14 @@ pub const Properties = struct {
     ) !void {
         _ = layout;
         _ = opts;
-        try std.fmt.format(writer, ".{{ .width= {}, }}", .{
+        try std.fmt.format(writer,
+            \\.{{
+            \\    .width= {},
+            \\    .grapheme_boundary_class= .{s},
+            \\}}
+        , .{
             self.width,
+            @tagName(self.grapheme_boundary_class),
         });
     }
 };

From c47ad97f62ca1f5e6132d46839b7cda999af461b Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 20:23:29 -0800
Subject: [PATCH 4/7] unicode: remove unused

---
 src/unicode/grapheme.zig | 167 ++++++++-------------------------------
 src/unicode/main.zig     |   3 +-
 2 files changed, 35 insertions(+), 135 deletions(-)

diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
index f5a39f863..19437844c 100644
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -1,18 +1,34 @@
 const std = @import("std");
 const props = @import("props.zig");
+const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
 const table = props.table;
 
-/// Grapheme break
+// The algorithm in this file is based on the Ziglyph and utf8proc algorithm,
+// only modified to use our own lookup tables.
+//
+// I'll note I also tried a fully precomputed table approach where all
+// combinations of state and boundary classes were precomputed. It was
+// marginally faster (about 2%) but the table is a few KB and I'm not
+// sure it's worth it.
+
+/// Determines if there is a grapheme break between two codepoints. This
+/// must be called sequentially maintaining the state between calls.
+///
+/// This function does NOT work with control characters. Control characters,
+/// line feeds, and carriage returns are expected to be filtered out before
+/// calling this function. This is because this function is tuned for
+/// Ghostty.
 pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
     const gbc1 = table.get(cp1).grapheme_boundary_class;
     const gbc2 = table.get(cp2).grapheme_boundary_class;
-    // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{
-    //     gbc1,
-    //     gbc2,
-    //     props.GraphemeBoundaryClass.init(cp1),
-    //     props.GraphemeBoundaryClass.init(cp2),
-    // });
+    return graphemeBreakClass(gbc1, gbc2, state);
+}
 
+fn graphemeBreakClass(
+    gbc1: GraphemeBoundaryClass,
+    gbc2: GraphemeBoundaryClass,
+    state: *u3,
+) bool {
     // GB11: Emoji Extend* ZWJ x Emoji
     if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state);
 
@@ -72,107 +88,10 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
     return true;
 }
 
-const emoji = @import("ziglyph").emoji;
-const gbp = @import("ziglyph").grapheme_break;
-
-fn isBreaker(cp: u21) bool {
-    return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp);
-}
-
-pub fn zg_graphemeBreak(
-    cp1: u21,
-    cp2: u21,
-    state: *u3,
-) bool {
-
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
-
-    // GB3: CR x LF
-    if (cp1 == '\r' and cp2 == '\n') {
-        std.log.warn("GB3", .{});
-        return false;
-    }
-
-    // GB4: Control
-    if (isBreaker(cp1)) {
-        std.log.warn("GB4", .{});
-        return true;
-    }
-
-    // GB6: Hangul L x (L|V|LV|VT)
-    if (gbp.isL(cp1)) {
-        if (gbp.isL(cp2) or
-            gbp.isV(cp2) or
-            gbp.isLv(cp2) or
-            gbp.isLvt(cp2))
-        {
-            std.log.warn("GB6", .{});
-            return false;
-        }
-    }
-
-    // GB7: Hangul (LV | V) x (V | T)
-    if (gbp.isLv(cp1) or gbp.isV(cp1)) {
-        if (gbp.isV(cp2) or
-            gbp.isT(cp2))
-        {
-            std.log.warn("GB7", .{});
-            return false;
-        }
-    }
-
-    // GB8: Hangul (LVT | T) x T
-    if (gbp.isLvt(cp1) or gbp.isT(cp1)) {
-        if (gbp.isT(cp2)) {
-            std.log.warn("GB8", .{});
-            return false;
-        }
-    }
-
-    // GB9b: x (Extend | ZWJ)
-    if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) {
-        std.log.warn("GB9b", .{});
-        return false;
-    }
-
-    // GB9a: x Spacing
-    if (gbp.isSpacingmark(cp2)) {
-        std.log.warn("GB9a", .{});
-        return false;
-    }
-
-    // GB9b: Prepend x
-    if (gbp.isPrepend(cp1) and !isBreaker(cp2)) {
-        std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) });
-        return false;
-    }
-
-    // GB12, GB13: RI x RI
-    if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) {
-        if (hasRegional(state)) {
-            unsetRegional(state);
-            std.log.warn("GB12", .{});
-            return true;
-        } else {
-            std.log.warn("GB13", .{});
-            setRegional(state);
-            return false;
-        }
-    }
-
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (hasXpic(state) and
-        gbp.isZwj(cp1) and
-        emoji.isExtendedPictographic(cp2))
-    {
-        std.log.warn("GB11", .{});
-        unsetXpic(state);
-        return false;
-    }
-
-    return true;
-}
+const State = packed struct(u2) {
+    extended_pictographic: bool = false,
+    regional_indicator: bool = false,
+};
 
 fn hasXpic(state: *const u3) bool {
     return state.* & 1 == 1;
@@ -204,15 +123,19 @@ fn unsetRegional(state: *u3) void {
 pub fn main() !void {
     const ziglyph = @import("ziglyph");
 
+    // Set the min and max to control the test range.
+    const min = 0;
+    const max = std.math.maxInt(u21) + 1;
+
     var state: u3 = 0;
     var zg_state: u3 = 0;
-    for (0..std.math.maxInt(u21) + 1) |cp1| {
+    for (min..max) |cp1| {
         if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
 
         if (cp1 == '\r' or cp1 == '\n' or
             ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
 
-        for (0..std.math.maxInt(u21) + 1) |cp2| {
+        for (min..max) |cp2| {
             if (cp2 == '\r' or cp2 == '\n' or
                 ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
 
@@ -235,27 +158,3 @@ pub fn main() !void {
 pub const std_options = struct {
     pub const log_level: std.log.Level = .info;
 };
-
-// test "matches ziglyph specific" {
-//     const testing = std.testing;
-//
-//     var state: u3 = 0;
-//     var zg_state: u3 = 0;
-//
-//     const cp1 = 0x20;
-//     const cp2 = 0x300;
-//
-//     const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
-//     const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
-//     if (gb != zg_gb) {
-//         std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
-//             cp1,
-//             cp2,
-//             gb,
-//             state,
-//             zg_gb,
-//             zg_state,
-//         });
-//         try testing.expect(false);
-//     }
-// }
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index 1af26d485..3cc4779ed 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,9 +1,10 @@
 pub const lut = @import("lut.zig");
 
-pub usingnamespace @import("grapheme.zig");
+const grapheme = @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
+pub const graphemeBreak = grapheme.graphemeBreak;
 
 test {
     @import("std").testing.refAllDecls(@This());

From 132fbb3a4695b09d8674914e8d68a660fb28df6d Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 20:29:36 -0800
Subject: [PATCH 5/7] unicode: use packed struct for break state

---
 build.zig                    | 20 -------------
 build.zig.zon                |  1 -
 src/bench/grapheme-break.sh  |  2 --
 src/bench/grapheme-break.zig | 31 +-------------------
 src/unicode/grapheme.zig     | 55 +++++++++++-------------------------
 src/unicode/main.zig         |  1 +
 6 files changed, 19 insertions(+), 91 deletions(-)

diff --git a/build.zig b/build.zig
index fd49b7e62..0669f27cc 100644
--- a/build.zig
+++ b/build.zig
@@ -217,17 +217,6 @@ pub fn build(b: *std.Build) !void {
     // Add our benchmarks
     try benchSteps(b, target, config, emit_bench);
 
-    {
-        const exe = b.addExecutable(.{
-            .name = "grapheme-verify",
-            .root_source_file = .{ .path = "src/unicode/grapheme.zig" },
-            .target = target,
-            .optimize = .ReleaseFast,
-        });
-        b.installArtifact(exe);
-        _ = try addDeps(b, exe, config);
-    }
-
     // We only build an exe if we have a runtime set.
     const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{
         .name = "ghostty",
@@ -1093,15 +1082,6 @@ fn addDeps(
     step.linkLibrary(utfcpp_dep.artifact("utfcpp"));
     try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin());
 
-    // utf8proc
-    const utf8proc_dep = b.dependency("utf8proc", .{
-        .target = target,
-        .optimize = optimize,
-    });
-    step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc"));
-    step.linkLibrary(utf8proc_dep.artifact("utf8proc"));
-    try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin());
-
     // Spirv-Cross
     step.linkLibrary(spirv_cross_dep.artifact("spirv_cross"));
     try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin());
diff --git a/build.zig.zon b/build.zig.zon
index 535d51c24..a694562ea 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -39,7 +39,6 @@
         .pixman = .{ .path = "./pkg/pixman" },
         .simdutf = .{ .path = "./pkg/simdutf" },
         .utfcpp = .{ .path = "./pkg/utfcpp" },
-        .utf8proc = .{ .path = "./pkg/utf8proc" },
         .zlib = .{ .path = "./pkg/zlib" },
 
         // Shader translation
diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
index c395c3799..24f475caa 100755
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@@ -27,8 +27,6 @@ hyperfine \
   "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \
   -n ziglyph \
   "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
-  -n utf8proc \
-  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
   -n table \
   "./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data"
 
diff --git a/src/bench/grapheme-break.zig b/src/bench/grapheme-break.zig
index 55caca313..7decd525d 100644
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@@ -46,8 +46,6 @@ const Mode = enum {
 
     /// Ghostty's table-based approach.
     table,
-
-    utf8proc,
 };
 
 pub const std_options = struct {
@@ -75,7 +73,6 @@ pub fn main() !void {
         .noop => try benchNoop(reader, buf),
         .ziglyph => try benchZiglyph(reader, buf),
         .table => try benchTable(reader, buf),
-        .utf8proc => try benchUtf8proc(reader, buf),
     }
 }
 
@@ -101,7 +98,7 @@ noinline fn benchTable(
     buf: []u8,
 ) !void {
     var d: UTF8Decoder = .{};
-    var state: u3 = 0;
+    var state: unicode.GraphemeBreakState = .{};
     var cp1: u21 = 0;
     while (true) {
         const n = try reader.read(buf);
@@ -145,29 +142,3 @@ noinline fn benchZiglyph(
         }
     }
 }
-
-noinline fn benchUtf8proc(
-    reader: anytype,
-    buf: []u8,
-) !void {
-    const utf8proc = @import("utf8proc");
-    var d: UTF8Decoder = .{};
-    var state: i32 = 0;
-    var cp1: u21 = 0;
-    while (true) {
-        const n = try reader.read(buf);
-        if (n == 0) break;
-
-        // Using stream.next directly with a for loop applies a naive
-        // scalar approach.
-        for (buf[0..n]) |c| {
-            const cp_, const consumed = d.next(c);
-            assert(consumed);
-            if (cp_) |cp2| {
-                const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state);
-                buf[0] = @intCast(@intFromBool(v));
-                cp1 = cp2;
-            }
-        }
-    }
-}
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
index 19437844c..d4c146e49 100644
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -18,19 +18,27 @@ const table = props.table;
 /// line feeds, and carriage returns are expected to be filtered out before
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
     const gbc1 = table.get(cp1).grapheme_boundary_class;
     const gbc2 = table.get(cp2).grapheme_boundary_class;
     return graphemeBreakClass(gbc1, gbc2, state);
 }
 
+/// The state that must be maintained between calls to `graphemeBreak`.
+pub const BreakState = packed struct(u2) {
+    extended_pictographic: bool = false,
+    regional_indicator: bool = false,
+};
+
 fn graphemeBreakClass(
     gbc1: GraphemeBoundaryClass,
     gbc2: GraphemeBoundaryClass,
-    state: *u3,
+    state: *BreakState,
 ) bool {
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state);
+    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
+        state.extended_pictographic = true;
+    }
 
     // These two properties are ignored because they're not relevant to
     // Ghostty -- they're filtered out before checking grapheme boundaries.
@@ -67,56 +75,27 @@ fn graphemeBreakClass(
 
     // GB12, GB13: RI x RI
     if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
-        if (hasRegional(state)) {
-            unsetRegional(state);
+        if (state.regional_indicator) {
+            state.regional_indicator = false;
             return true;
         } else {
-            setRegional(state);
+            state.regional_indicator = true;
             return false;
         }
     }
 
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (hasXpic(state) and
+    if (state.extended_pictographic and
         gbc1 == .zwj and
         gbc2 == .extended_pictographic)
     {
-        unsetXpic(state);
+        state.extended_pictographic = false;
         return false;
     }
 
     return true;
 }
 
-const State = packed struct(u2) {
-    extended_pictographic: bool = false,
-    regional_indicator: bool = false,
-};
-
-fn hasXpic(state: *const u3) bool {
-    return state.* & 1 == 1;
-}
-
-fn setXpic(state: *u3) void {
-    state.* |= 1;
-}
-
-fn unsetXpic(state: *u3) void {
-    state.* ^= 1;
-}
-
-fn hasRegional(state: *const u3) bool {
-    return state.* & 2 == 2;
-}
-
-fn setRegional(state: *u3) void {
-    state.* |= 2;
-}
-
-fn unsetRegional(state: *u3) void {
-    state.* ^= 2;
-}
-
 /// If you build this file as a binary, we will verify the grapheme break
 /// implementation. This iterates over billions of codepoints so it is
 /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@@ -127,7 +106,7 @@ pub fn main() !void {
     const min = 0;
     const max = std.math.maxInt(u21) + 1;
 
-    var state: u3 = 0;
+    var state: BreakState = .{};
     var zg_state: u3 = 0;
     for (min..max) |cp1| {
         if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index 3cc4779ed..e8ba05b72 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -5,6 +5,7 @@ const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
+pub const GraphemeBreakState = grapheme.BreakState;
 
 test {
     @import("std").testing.refAllDecls(@This());

From 6f8b4204b99463a264b1d2311bce46db7634023e Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 20:31:20 -0800
Subject: [PATCH 6/7] terminal: use new grapheme break algo

---
 src/terminal/Terminal.zig | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index 4d5616003..8931f9819 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -6,7 +6,6 @@ const Terminal = @This();
 
 const std = @import("std");
 const builtin = @import("builtin");
-const ziglyph = @import("ziglyph");
 const testing = std.testing;
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
@@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void {
         if (prev.cell.char == 0) break :grapheme;
 
         const grapheme_break = brk: {
-            var state: u3 = 0;
+            var state: unicode.GraphemeBreakState = .{};
             var cp1: u21 = @intCast(prev.cell.char);
             if (prev.cell.attrs.grapheme) {
                 var it = row.codepointIterator(prev.x);
                 while (it.next()) |cp2| {
                     // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 });
-                    assert(!ziglyph.graphemeBreak(
-                        cp1,
-                        cp2,
-                        &state,
-                    ));
-
+                    assert(!unicode.graphemeBreak(cp1, cp2, &state));
                     cp1 = cp2;
                 }
             }
 
             // log.debug("cp1={x} cp2={x} end", .{ cp1, c });
-            break :brk ziglyph.graphemeBreak(cp1, c, &state);
+            break :brk unicode.graphemeBreak(cp1, c, &state);
         };
 
         // If we can NOT break, this means that "c" is part of a grapheme

From 5275d44e7dc4c7f86978b5bdf285b5d4f45eb0e9 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 20:50:13 -0800
Subject: [PATCH 7/7] unicode: precompute grapheme break data

---
 src/unicode/grapheme.zig | 66 +++++++++++++++++++++++++++++++++-------
 src/unicode/props.zig    |  2 +-
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
index d4c146e49..09f452114 100644
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -3,14 +3,6 @@ const props = @import("props.zig");
 const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
 const table = props.table;
 
-// The algorithm in this file is based on the Ziglyph and utf8proc algorithm,
-// only modified to use our own lookup tables.
-//
-// I'll note I also tried a fully precomputed table approach where all
-// combinations of state and boundary classes were precomputed. It was
-// marginally faster (about 2%) but the table is a few KB and I'm not
-// sure it's worth it.
-
 /// Determines if there is a grapheme break between two codepoints. This
 /// must be called sequentially maintaining the state between calls.
 ///
@@ -19,9 +11,15 @@ const table = props.table;
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
 pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
-    const gbc1 = table.get(cp1).grapheme_boundary_class;
-    const gbc2 = table.get(cp2).grapheme_boundary_class;
-    return graphemeBreakClass(gbc1, gbc2, state);
+    // Pack both boundary classes and the incoming state into a table key.
+    const key: Precompute.Key = .{
+        .gbc1 = table.get(cp1).grapheme_boundary_class,
+        .gbc2 = table.get(cp2).grapheme_boundary_class,
+        .state = state.*,
+    };
+    const entry = Precompute.data[key.index()];
+    state.* = entry.state;
+    return entry.result;
 }
 
 /// The state that must be maintained between calls to `graphemeBreak`.
@@ -30,6 +28,52 @@ pub const BreakState = packed struct(u2) {
     regional_indicator: bool = false,
 };
 
+/// Precomputed lookup table covering every combination of break state
+/// and pair of grapheme boundary classes. A key is 10 bits wide, so the
+/// table needs 2^10 slots (indices 0 through maxInt(u10)), each holding
+/// a 3-bit value: the break result plus the next state.
+const Precompute = struct {
+    const Key = packed struct(u10) {
+        state: BreakState,
+        gbc1: GraphemeBoundaryClass,
+        gbc2: GraphemeBoundaryClass,
+
+        fn index(self: Key) usize {
+            return @intCast(@as(u10, @bitCast(self)));
+        }
+    };
+
+    const Value = packed struct(u3) {
+        result: bool,
+        state: BreakState,
+    };
+
+    const data = precompute: {
+        var result: [std.math.maxInt(u10) + 1]Value = undefined;
+
+        @setEvalBranchQuota(2_000);
+        const info = @typeInfo(GraphemeBoundaryClass).Enum;
+        for (0..std.math.maxInt(u2) + 1) |state_init| {
+            for (info.fields) |field1| {
+                for (info.fields) |field2| {
+                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    const key: Key = .{
+                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
+                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .state = state,
+                    };
+                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    result[key.index()] = .{ .result = v, .state = state };
+                }
+            }
+        }
+
+        break :precompute result;
+    };
+};
+
+/// This is the algorithm from utf8proc. We only use this at comptime for
+/// precomputing the lookup table.
 fn graphemeBreakClass(
     gbc1: GraphemeBoundaryClass,
     gbc2: GraphemeBoundaryClass,
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index d6f282ed9..d83f0f699 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -60,7 +60,7 @@ pub const Properties = struct {
 /// Possible grapheme boundary classes. This isn't an exhaustive list:
 /// we omit control, CR, LF, etc. because in Ghostty's usage that are
 /// impossible because they're handled by the terminal.
-pub const GraphemeBoundaryClass = enum {
+pub const GraphemeBoundaryClass = enum(u4) {
     invalid,
     L,
     V,