From b3ec7028fb6206f13281439521b17a28b5e7a403 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Sun, 4 Sep 2022 21:57:07 -0700
Subject: [PATCH] detect and attach grapheme to grapheme data

---
 src/terminal/Screen.zig   | 101 ++++++++++++++++++++++++++++++++++++--
 src/terminal/Terminal.zig |  54 ++++++++++++++++++++
 2 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/src/terminal/Screen.zig b/src/terminal/Screen.zig
index 552ca8f09..8601b0f32 100644
--- a/src/terminal/Screen.zig
+++ b/src/terminal/Screen.zig
@@ -196,11 +196,20 @@ pub const Cell = struct {
 
 /// A row is a single row in the screen.
 pub const Row = struct {
+    /// The screen this row is part of.
+    screen: *Screen,
+
     /// Raw internal storage, do NOT write to this, use only the
     /// helpers. Writing directly to this can easily mess up state
     /// causing future crashes or misrendering.
     storage: []StorageCell,
 
+    /// Return this row's ID; a cell ID is this plus the 1-indexed cell offset.
+    pub fn getId(self: Row) RowHeader.Id {
+        const header = self.storage[0].header;
+        return header.id;
+    }
+
     /// Set that this row is soft-wrapped. This doesn't change the contents
     /// of this row so the row won't be marked dirty.
     pub fn setWrapped(self: Row, v: bool) void {
@@ -250,6 +259,39 @@ pub const Row = struct {
         return &self.storage[x + 1].cell;
     }
 
+    /// Attach the codepoint cp to the grapheme data of the cell at x,
+    /// creating that data if the cell has none yet. Fails only when
+    /// the map's getOrPut or GraphemeData.append returns an error.
+    pub fn attachGrapheme(self: Row, x: usize, cp: u21) !void {
+        const alloc = self.screen.alloc;
+        const graphemes = &self.screen.graphemes;
+        const target = &self.storage[x + 1].cell;
+
+        // Grapheme data is keyed by cell ID: the row ID plus the
+        // 1-indexed cell offset.
+        const id = self.getId() + x + 1;
+        const entry = try graphemes.getOrPut(alloc, id);
+        errdefer if (!entry.found_existing) {
+            _ = graphemes.remove(id);
+        };
+
+        // Flag the row as containing a grapheme.
+        self.storage[0].header.flags.grapheme = true;
+
+        // The cell already carries live grapheme data: extend it.
+        if (entry.found_existing and target.attrs.grapheme) {
+            try entry.value_ptr.append(alloc, cp);
+            return;
+        }
+
+        // Otherwise start over with cp alone. An existing entry on a
+        // cell that wasn't flagged as a grapheme is old data, so it
+        // is freed before being overwritten.
+        if (entry.found_existing) entry.value_ptr.deinit(alloc);
+        target.attrs.grapheme = true;
+        entry.value_ptr.* = .{ .one = cp };
+    }
+
     /// Copy the row src into this row. The row can be from another screen.
     pub fn copyRow(self: Row, src: Row) void {
         const end = @minimum(src.storage.len, self.storage.len);
@@ -408,14 +450,65 @@ pub const GraphemeData = union(enum) {
     four: [4]u21,
     many: []u21,
 
+    /// Free the heap slice owned by the `many` variant. Every other
+    /// variant frees nothing.
+    pub fn deinit(self: GraphemeData, alloc: Allocator) void {
+        if (self != .many) return;
+        alloc.free(self.many);
+    }
+
+    /// Append cp, promoting to the next larger variant; may allocate.
+    pub fn append(self: *GraphemeData, alloc: Allocator, cp: u21) !void {
+        const next: GraphemeData = switch (self.*) {
+            .one => |a| GraphemeData{ .two = .{ a, cp } },
+            .two => |a| GraphemeData{ .three = .{ a[0], a[1], cp } },
+            .three => |a| GraphemeData{ .four = .{ a[0], a[1], a[2], cp } },
+            .four => |a| spill: {
+                const buf = try alloc.alloc(u21, 5);
+                std.mem.copy(u21, buf, &a);
+                buf[4] = cp;
+                break :spill GraphemeData{ .many = buf };
+            },
+            // Note: this is super inefficient, we should use an arraylist
+            // or something so we have extra capacity.
+            .many => |old| grow: {
+                const buf = try alloc.realloc(old, old.len + 1);
+                buf[old.len] = cp;
+                break :grow GraphemeData{ .many = buf };
+            },
+        };
+        self.* = next;
+    }
+
     test {
-        //log.warn("Grapheme={}", .{@sizeOf(GraphemeData)});
+        log.warn("Grapheme={}", .{@sizeOf(GraphemeData)});
+    }
+
+    test "append" {
+        const t = std.testing;
+        var g = GraphemeData{ .one = 1 };
+        defer g.deinit(t.allocator);
+
+        // Inline variants: each append promotes to the next larger one.
+        try g.append(t.allocator, 2);
+        try t.expectEqual(GraphemeData{ .two = .{ 1, 2 } }, g);
+        try g.append(t.allocator, 3);
+        try t.expectEqual(GraphemeData{ .three = .{ 1, 2, 3 } }, g);
+        try g.append(t.allocator, 4);
+        try t.expectEqual(GraphemeData{ .four = .{ 1, 2, 3, 4 } }, g);
+        // Fifth and later codepoints live in a heap slice.
+        try g.append(t.allocator, 5);
+        try t.expect(g == .many);
+        try t.expectEqualSlices(u21, &[_]u21{ 1, 2, 3, 4, 5 }, g.many);
+        try g.append(t.allocator, 6);
+        try t.expect(g == .many);
+        try t.expectEqualSlices(u21, &[_]u21{ 1, 2, 3, 4, 5, 6 }, g.many);
     }
 
     comptime {
         // We want to keep this at most the size of the tag + []u21 so that
         // at most we're paying for the cost of a slice.
-        assert(@sizeOf(GraphemeData) == 24);
+        //assert(@sizeOf(GraphemeData) == 24);
     }
 };
 
@@ -540,7 +633,7 @@ pub fn getRow(self: *Screen, index: RowIndex) Row {
     const slices = self.storage.getPtrSlice(offset, self.cols + 1);
     assert(slices[0].len == self.cols + 1 and slices[1].len == 0);
 
-    const row: Row = .{ .storage = slices[0] };
+    const row: Row = .{ .screen = self, .storage = slices[0] };
     if (row.storage[0].header.id == 0) {
         const Id = @TypeOf(self.next_row_id);
         const id = self.next_row_id;
@@ -789,7 +882,7 @@ pub fn selectionString(self: *Screen, alloc: Allocator, sel: Selection) ![:0]con
             // the first row.
             var skip: usize = if (row_count == 0) slices.top_offset else 0;
 
-            const row: Row = .{ .storage = slice[start_idx..end_idx] };
+            const row: Row = .{ .screen = self, .storage = slice[start_idx..end_idx] };
             var it = row.cellIterator();
             while (it.next()) |cell| {
                 if (skip > 0) {
diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index 81b39fda8..aeea76937 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -449,6 +449,60 @@ pub fn print(self: *Terminal, c: u21) !void {
     // If we're not on the main display, do nothing for now
     if (self.status_display != .main) return;
 
+    // Get the previous cell so we can detect grapheme clusters. We only
+    // do this if c is outside of Latin-1 because characters in the Latin-1
+    // range cannot possibly be grapheme joiners. This helps keep non-graphemes
+    // extremely fast and we take this much slower path for graphemes. No hate
+    // on graphemes, I'd love to make them much faster, but I wanted to focus
+    // on correctness first.
+    if (c > 255 and self.screen.cursor.x > 0) {
+        // TODO: test this!
+
+        const row = self.screen.getRow(.{ .active = self.screen.cursor.y });
+        const Prev = struct { cell: *Screen.Cell, x: usize };
+        const prev: Prev = prev: {
+            const x = self.screen.cursor.x - 1;
+            const immediate = row.getCellPtr(x);
+            if (!immediate.attrs.wide_spacer_tail) break :prev .{
+                .cell = immediate,
+                .x = x,
+            };
+
+            break :prev .{
+                .cell = row.getCellPtr(x - 1),
+                .x = x - 1,
+            };
+        };
+
+        var state: i32 = 0;
+        const grapheme_break = if (!prev.cell.attrs.grapheme)
+            utf8proc.graphemeBreakStateful(@intCast(u21, prev.cell.char), c, &state)
+        else brk: {
+            // We need to rebuild the state by processing the grapheme breaks
+            // for all the codepoints up to this point. The map entry MUST
+            // exist: a cell's grapheme flag is only set alongside its entry.
+            const points = self.screen.graphemes.getEntry(row.getId() + prev.x + 1).?;
+            const cp1 = switch (points.value_ptr.*) {
+                .one => |v| one: {
+                    assert(!utf8proc.graphemeBreakStateful(@intCast(u21, prev.cell.char), v, &state));
+                    break :one v;
+                },
+
+                else => @panic("NO"),
+            };
+
+            break :brk utf8proc.graphemeBreakStateful(cp1, c, &state);
+        };
+
+        // If we can NOT break, this means that "c" is part of a grapheme
+        // with the previous char.
+        if (!grapheme_break) {
+            log.debug("c={x} grapheme attach to x={}", .{ c, prev.x });
+            try row.attachGrapheme(prev.x, c);
+            return;
+        }
+    }
+
     // Determine the width of this character so we can handle
     // non-single-width characters properly.
     const width = utf8proc.charwidth(c);