From 5f3574a4bfc33a19e0b6588ff67709afae0622bd Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Fri, 9 Feb 2024 19:44:57 -0800
Subject: [PATCH] unicode: direct port of ziglyph to start

---
 build.zig                    |  11 ++
 src/bench/grapheme-break.sh  |   5 +-
 src/bench/grapheme-break.zig |  31 ++++-
 src/unicode/grapheme.zig     | 261 +++++++++++++++++++++++++++++++++++
 src/unicode/main.zig         |   1 +
 src/unicode/props.zig        |  11 +-
 6 files changed, 316 insertions(+), 4 deletions(-)
 create mode 100644 src/unicode/grapheme.zig

diff --git a/build.zig b/build.zig
index 445cf4a98..fd49b7e62 100644
--- a/build.zig
+++ b/build.zig
@@ -217,6 +217,17 @@ pub fn build(b: *std.Build) !void {
     // Add our benchmarks
     try benchSteps(b, target, config, emit_bench);
 
+    {
+        const exe = b.addExecutable(.{
+            .name = "grapheme-verify",
+            .root_source_file = .{ .path = "src/unicode/grapheme.zig" },
+            .target = target,
+            .optimize = .ReleaseFast,
+        });
+        b.installArtifact(exe);
+        _ = try addDeps(b, exe, config);
+    }
+
     // We only build an exe if we have a runtime set.
     const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{
         .name = "ghostty",
diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
index 56bd28dd1..c395c3799 100755
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@@ -28,5 +28,8 @@ hyperfine \
   -n ziglyph \
   "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
   -n utf8proc \
-  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data"
+  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
+  -n table \
+  "./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data"
+
 
diff --git a/src/bench/grapheme-break.zig b/src/bench/grapheme-break.zig
index 108c3e29d..55caca313 100644
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@@ -15,7 +15,7 @@ const ArenaAllocator = std.heap.ArenaAllocator;
 const ziglyph = @import("ziglyph");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
-const table = @import("../unicode/main.zig").table;
+const unicode = @import("../unicode/main.zig");
 const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
 
 const Args = struct {
@@ -44,6 +44,9 @@ const Mode = enum {
     /// Use ziglyph library to calculate the display width of each codepoint.
     ziglyph,
 
+    /// Ghostty's table-based approach.
+    table,
+
     utf8proc,
 };
 
@@ -71,6 +74,7 @@ pub fn main() !void {
     switch (args.mode) {
         .noop => try benchNoop(reader, buf),
         .ziglyph => try benchZiglyph(reader, buf),
+        .table => try benchTable(reader, buf),
         .utf8proc => try benchUtf8proc(reader, buf),
     }
 }
@@ -92,6 +96,31 @@ noinline fn benchNoop(
     }
 }
 
+noinline fn benchTable(
+    reader: anytype,
+    buf: []u8, // scratch buffer that each read() call fills
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: u3 = 0; // grapheme break state carried across codepoint pairs
+    var cp1: u21 = 0; // previous codepoint; 0 until the first one is decoded
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break; // a zero-length read ends the benchmark
+
+        // Feed the decoder one byte at a time in a plain for loop: a
+        // naive scalar approach.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v)); // presumably keeps the result live so the call is not optimized away
+                cp1 = cp2;
+            }
+        }
+    }
+}
+
 noinline fn benchZiglyph(
     reader: anytype,
     buf: []u8,
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
new file mode 100644
index 000000000..f5a39f863
--- /dev/null
+++ b/src/unicode/grapheme.zig
@@ -0,0 +1,261 @@
+const std = @import("std");
+const props = @import("props.zig");
+const table = props.table;
+
+/// Grapheme break: returns true if a cluster boundary lies between cp1 and cp2; state holds the GB11 and GB12/GB13 flags across calls.
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
+    const gbc1 = table.get(cp1).grapheme_boundary_class;
+    const gbc2 = table.get(cp2).grapheme_boundary_class;
+    // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{
+    //     gbc1,
+    //     gbc2,
+    //     props.GraphemeBoundaryClass.init(cp1),
+    //     props.GraphemeBoundaryClass.init(cp2),
+    // });
+
+    // GB11: Emoji Extend* ZWJ x Emoji (remember that cp1 was Extended_Pictographic)
+    if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state);
+
+    // These two rules are ignored because they're not relevant to
+    // Ghostty -- they're filtered out before checking grapheme boundaries.
+    // GB3: CR x LF
+    // GB4: Control
+
+    // GB6: Hangul L x (L|V|LV|LVT)
+    if (gbc1 == .L) {
+        if (gbc2 == .L or
+            gbc2 == .V or
+            gbc2 == .LV or
+            gbc2 == .LVT) return false;
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbc1 == .LV or gbc1 == .V) {
+        if (gbc2 == .V or
+            gbc2 == .T) return false;
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbc1 == .LVT or gbc1 == .T) {
+        if (gbc2 == .T) return false;
+    }
+
+    // GB9: x (Extend | ZWJ)
+    if (gbc2 == .extend or gbc2 == .zwj) return false;
+
+    // GB9a: x Spacing
+    if (gbc2 == .spacing_mark) return false;
+
+    // GB9b: Prepend x (no control check on cp2: controls are filtered out earlier, see above)
+    if (gbc1 == .prepend) return false;
+
+    // GB12, GB13: RI x RI
+    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
+        if (hasRegional(state)) {
+            unsetRegional(state); // flag left by an earlier RI x RI pair: clear it and break
+            return true;
+        } else {
+            setRegional(state); // first RI x RI pair: remember it and do not break
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (hasXpic(state) and
+        gbc1 == .zwj and
+        gbc2 == .extended_pictographic)
+    {
+        unsetXpic(state); // the emoji sequence has been joined; drop the flag
+        return false;
+    }
+
+    return true; // no rule above matched: break between cp1 and cp2
+}
+
+const emoji = @import("ziglyph").emoji;
+const gbp = @import("ziglyph").grapheme_break;
+
+fn isBreaker(cp: u21) bool {
+    return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); // CR, LF, or a ziglyph Control codepoint
+}
+
+pub fn zg_graphemeBreak(
+    cp1: u21,
+    cp2: u21,
+    state: *u3,
+) bool {
+    // Same rule order as graphemeBreak, using ziglyph lookups; logs the rule that decided.
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state);
+
+    // GB3: CR x LF
+    if (cp1 == '\r' and cp2 == '\n') {
+        std.log.warn("GB3", .{});
+        return false;
+    }
+
+    // GB4: Control
+    if (isBreaker(cp1)) {
+        std.log.warn("GB4", .{});
+        return true;
+    }
+
+    // GB6: Hangul L x (L|V|LV|LVT)
+    if (gbp.isL(cp1)) {
+        if (gbp.isL(cp2) or
+            gbp.isV(cp2) or
+            gbp.isLv(cp2) or
+            gbp.isLvt(cp2))
+        {
+            std.log.warn("GB6", .{});
+            return false;
+        }
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbp.isLv(cp1) or gbp.isV(cp1)) {
+        if (gbp.isV(cp2) or
+            gbp.isT(cp2))
+        {
+            std.log.warn("GB7", .{});
+            return false;
+        }
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbp.isLvt(cp1) or gbp.isT(cp1)) {
+        if (gbp.isT(cp2)) {
+            std.log.warn("GB8", .{});
+            return false;
+        }
+    }
+
+    // GB9: x (Extend | ZWJ) -- note that the log label below reads "GB9b"
+    if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) {
+        std.log.warn("GB9b", .{});
+        return false;
+    }
+
+    // GB9a: x Spacing
+    if (gbp.isSpacingmark(cp2)) {
+        std.log.warn("GB9a", .{});
+        return false;
+    }
+
+    // GB9b: Prepend x (unless cp2 is CR, LF, or Control)
+    if (gbp.isPrepend(cp1) and !isBreaker(cp2)) {
+        std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) });
+        return false;
+    }
+
+    // GB12, GB13: RI x RI
+    if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) {
+        if (hasRegional(state)) {
+            unsetRegional(state); // flag left by an earlier RI x RI pair: clear it and break
+            std.log.warn("GB12", .{});
+            return true;
+        } else {
+            std.log.warn("GB13", .{});
+            setRegional(state); // first RI x RI pair: remember it and do not break
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (hasXpic(state) and
+        gbp.isZwj(cp1) and
+        emoji.isExtendedPictographic(cp2))
+    {
+        std.log.warn("GB11", .{});
+        unsetXpic(state);
+        return false;
+    }
+
+    return true; // no rule above matched: break between cp1 and cp2
+}
+
+fn hasXpic(state: *const u3) bool {
+    return state.* & 1 == 1; // bit 0: an Extended_Pictographic cp1 has been seen (GB11)
+}
+
+fn setXpic(state: *u3) void {
+    state.* |= 1; // set bit 0 (the GB11 flag); other bits are untouched
+}
+
+fn unsetXpic(state: *u3) void {
+    state.* ^= 1; // XOR toggles bit 0, so this only clears it when set; callers check hasXpic first
+}
+
+fn hasRegional(state: *const u3) bool {
+    return state.* & 2 == 2; // bit 1: an RI x RI pair has been seen (GB12/GB13)
+}
+
+fn setRegional(state: *u3) void {
+    state.* |= 2; // set bit 1 (the regional indicator flag); other bits are untouched
+}
+
+fn unsetRegional(state: *u3) void {
+    state.* ^= 2; // XOR toggles bit 1, so this only clears it when set; callers check hasRegional first
+}
+
+/// If you build this file as a binary, we will verify the grapheme break
+/// implementation against ziglyph. This iterates over every pair of u21 values
+/// (trillions of pairs) so it is SLOW. Not meant for CI, but useful for debugging.
+pub fn main() !void {
+    const ziglyph = @import("ziglyph");
+
+    var state: u3 = 0; // state for our implementation; never reset between pairs
+    var zg_state: u3 = 0; // state for ziglyph's implementation; never reset between pairs
+    for (0..std.math.maxInt(u21) + 1) |cp1| {
+        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
+
+        if (cp1 == '\r' or cp1 == '\n' or // skip CR, LF, and Control: graphemeBreak ignores GB3/GB4
+            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
+
+        for (0..std.math.maxInt(u21) + 1) |cp2| {
+            if (cp2 == '\r' or cp2 == '\n' or // same filter for the second codepoint
+                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
+
+            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+            if (gb != zg_gb) { // report every disagreement; the scan keeps going
+                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+                    cp1,
+                    cp2,
+                    gb,
+                    state,
+                    zg_gb,
+                    zg_state,
+                });
+            }
+        }
+    }
+}
+
+pub const std_options = struct { // presumably the root-level override that std.log reads -- confirm for the targeted Zig version
+    pub const log_level: std.log.Level = .info;
+};
+
+// test "matches ziglyph specific" {
+//     const testing = std.testing;
+//
+//     var state: u3 = 0;
+//     var zg_state: u3 = 0;
+//
+//     const cp1 = 0x20;
+//     const cp2 = 0x300;
+//
+//     const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+//     const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+//     if (gb != zg_gb) {
+//         std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+//             cp1,
+//             cp2,
+//             gb,
+//             state,
+//             zg_gb,
+//             zg_state,
+//         });
+//         try testing.expect(false);
+//     }
+// }
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index fa0cb9fc8..1af26d485 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,5 +1,6 @@
 pub const lut = @import("lut.zig");
 
+pub usingnamespace @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index d46acbf49..d6f282ed9 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -32,7 +32,8 @@ pub const Properties = struct {
 
     // Needed for lut.Generator
     pub fn eql(a: Properties, b: Properties) bool {
-        return a.width == b.width;
+        return a.width == b.width and
+            a.grapheme_boundary_class == b.grapheme_boundary_class;
     }
 
     // Needed for lut.Generator
@@ -44,8 +45,14 @@ pub const Properties = struct {
     ) !void {
         _ = layout;
         _ = opts;
-        try std.fmt.format(writer, ".{{ .width= {}, }}", .{
+        try std.fmt.format(writer,
+            \\.{{
+            \\    .width= {},
+            \\    .grapheme_boundary_class= .{s},
+            \\}}
+        , .{
             self.width,
+            @tagName(self.grapheme_boundary_class),
         });
     }
 };