From b3ec7028fb6206f13281439521b17a28b5e7a403 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto
Date: Sun, 4 Sep 2022 21:57:07 -0700
Subject: [PATCH] detect and attach grapheme to grapheme data

---
Reviewer note: the print() hunk in Terminal.zig was amended during review so
that the grapheme break state is rebuilt from every codepoint already attached
to the previous cell, instead of panicking when more than one is attached.
The Terminal.zig post-image blob id below predates that amendment.

 src/terminal/Screen.zig   | 101 ++++++++++++++++++++++++++++++++++++--
 src/terminal/Terminal.zig |  62 +++++++++++++++++++++++
 2 files changed, 159 insertions(+), 4 deletions(-)

diff --git a/src/terminal/Screen.zig b/src/terminal/Screen.zig
index 552ca8f09..8601b0f32 100644
--- a/src/terminal/Screen.zig
+++ b/src/terminal/Screen.zig
@@ -196,11 +196,20 @@ pub const Cell = struct {
 
 /// A row is a single row in the screen.
 pub const Row = struct {
+    /// The screen this row is part of.
+    screen: *Screen,
+
     /// Raw internal storage, do NOT write to this, use only the
     /// helpers. Writing directly to this can easily mess up state
     /// causing future crashes or misrendering.
     storage: []StorageCell,
 
+    /// Returns the ID for this row. You can turn this into a cell ID
+    /// by adding the cell offset plus 1 (so it is 1-indexed).
+    pub fn getId(self: Row) RowHeader.Id {
+        return self.storage[0].header.id;
+    }
+
     /// Set that this row is soft-wrapped. This doesn't change the contents
     /// of this row so the row won't be marked dirty.
     pub fn setWrapped(self: Row, v: bool) void {
@@ -250,6 +259,39 @@ pub const Row = struct {
         return &self.storage[x + 1].cell;
     }
 
+    /// Attach a grapheme codepoint to the given cell.
+    pub fn attachGrapheme(self: Row, x: usize, cp: u21) !void {
+        const cell = &self.storage[x + 1].cell;
+        const key = self.getId() + x + 1;
+        const gop = try self.screen.graphemes.getOrPut(self.screen.alloc, key);
+        errdefer if (!gop.found_existing) {
+            _ = self.screen.graphemes.remove(key);
+        };
+
+        // Our row now has a grapheme
+        self.storage[0].header.flags.grapheme = true;
+
+        // If we weren't previously a grapheme and we found an existing value
+        // it means that it is old grapheme data. Just delete that.
+        if (!cell.attrs.grapheme and gop.found_existing) {
+            cell.attrs.grapheme = true;
+            gop.value_ptr.deinit(self.screen.alloc);
+            gop.value_ptr.* = .{ .one = cp };
+            return;
+        }
+
+        // If we didn't have a previous value, attach the single codepoint.
+        if (!gop.found_existing) {
+            cell.attrs.grapheme = true;
+            gop.value_ptr.* = .{ .one = cp };
+            return;
+        }
+
+        // We have an existing value, promote
+        assert(cell.attrs.grapheme);
+        try gop.value_ptr.append(self.screen.alloc, cp);
+    }
+
     /// Copy the row src into this row. The row can be from another screen.
     pub fn copyRow(self: Row, src: Row) void {
         const end = @minimum(src.storage.len, self.storage.len);
@@ -408,14 +450,65 @@ pub const GraphemeData = union(enum) {
     four: [4]u21,
     many: []u21,
 
+    pub fn deinit(self: GraphemeData, alloc: Allocator) void {
+        switch (self) {
+            .many => |v| alloc.free(v),
+            else => {},
+        }
+    }
+
+    /// Append the codepoint cp to the grapheme data.
+    pub fn append(self: *GraphemeData, alloc: Allocator, cp: u21) !void {
+        switch (self.*) {
+            .one => |v| self.* = .{ .two = .{ v, cp } },
+            .two => |v| self.* = .{ .three = .{ v[0], v[1], cp } },
+            .three => |v| self.* = .{ .four = .{ v[0], v[1], v[2], cp } },
+            .four => |v| {
+                const many = try alloc.alloc(u21, 5);
+                std.mem.copy(u21, many, &v);
+                many[4] = cp;
+                self.* = .{ .many = many };
+            },
+
+            .many => |v| {
+                // Note: this is super inefficient, we should use an arraylist
+                // or something so we have extra capacity.
+                const many = try alloc.realloc(v, v.len + 1);
+                many[v.len] = cp;
+                self.* = .{ .many = many };
+            },
+        }
+    }
+
     test {
-        //log.warn("Grapheme={}", .{@sizeOf(GraphemeData)});
+        log.warn("Grapheme={}", .{@sizeOf(GraphemeData)});
+    }
+
+    test "append" {
+        const testing = std.testing;
+        const alloc = testing.allocator;
+
+        var data: GraphemeData = .{ .one = 1 };
+        defer data.deinit(alloc);
+
+        try data.append(alloc, 2);
+        try testing.expectEqual(GraphemeData{ .two = .{ 1, 2 } }, data);
+        try data.append(alloc, 3);
+        try testing.expectEqual(GraphemeData{ .three = .{ 1, 2, 3 } }, data);
+        try data.append(alloc, 4);
+        try testing.expectEqual(GraphemeData{ .four = .{ 1, 2, 3, 4 } }, data);
+        try data.append(alloc, 5);
+        try testing.expect(data == .many);
+        try testing.expectEqualSlices(u21, &[_]u21{ 1, 2, 3, 4, 5 }, data.many);
+        try data.append(alloc, 6);
+        try testing.expect(data == .many);
+        try testing.expectEqualSlices(u21, &[_]u21{ 1, 2, 3, 4, 5, 6 }, data.many);
     }
 
     comptime {
         // We want to keep this at most the size of the tag + []u21 so that
         // at most we're paying for the cost of a slice.
-        assert(@sizeOf(GraphemeData) == 24);
+        //assert(@sizeOf(GraphemeData) == 24);
     }
 };
 
@@ -540,7 +633,7 @@ pub fn getRow(self: *Screen, index: RowIndex) Row {
     const slices = self.storage.getPtrSlice(offset, self.cols + 1);
     assert(slices[0].len == self.cols + 1 and slices[1].len == 0);
 
-    const row: Row = .{ .storage = slices[0] };
+    const row: Row = .{ .screen = self, .storage = slices[0] };
     if (row.storage[0].header.id == 0) {
         const Id = @TypeOf(self.next_row_id);
         const id = self.next_row_id;
@@ -789,7 +882,7 @@ pub fn selectionString(self: *Screen, alloc: Allocator, sel: Selection) ![:0]con
             // the first row.
             var skip: usize = if (row_count == 0) slices.top_offset else 0;
 
-            const row: Row = .{ .storage = slice[start_idx..end_idx] };
+            const row: Row = .{ .screen = self, .storage = slice[start_idx..end_idx] };
             var it = row.cellIterator();
             while (it.next()) |cell| {
                 if (skip > 0) {
diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index 81b39fda8..aeea76937 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -449,6 +449,68 @@ pub fn print(self: *Terminal, c: u21) !void {
     // If we're not on the main display, do nothing for now
     if (self.status_display != .main) return;
 
+    // Get the previous cell so we can detect grapheme clusters. We only
+    // do this if c is outside of Latin-1 because characters in the Latin-1
+    // range cannot possibly be grapheme joiners. This helps keep non-graphemes
+    // extremely fast and we take this much slower path for graphemes. No hate
+    // on graphemes, I'd love to make them much faster, but I wanted to focus
+    // on correctness first.
+    if (c > 255 and self.screen.cursor.x > 0) {
+        // TODO: test this!
+
+        const row = self.screen.getRow(.{ .active = self.screen.cursor.y });
+        const Prev = struct { cell: *Screen.Cell, x: usize };
+        const prev: Prev = prev: {
+            const x = self.screen.cursor.x - 1;
+            const immediate = row.getCellPtr(x);
+            if (!immediate.attrs.wide_spacer_tail) break :prev .{
+                .cell = immediate,
+                .x = x,
+            };
+
+            break :prev .{
+                .cell = row.getCellPtr(x - 1),
+                .x = x - 1,
+            };
+        };
+
+        var state: i32 = 0;
+        const grapheme_break = if (!prev.cell.attrs.grapheme)
+            utf8proc.graphemeBreakStateful(@intCast(u21, prev.cell.char), c, &state)
+        else brk: {
+            // We need to rebuild the state by processing the grapheme breaks
+            // for all the codepoints up to this point. This MUST exist because
+            // grapheme is true if and only if this entry exists.
+            const points = self.screen.graphemes.getEntry(row.getId() + prev.x + 1).?;
+            const attached: []const u21 = switch (points.value_ptr.*) {
+                .one => |*v| @as([]const u21, @as(*const [1]u21, v)),
+                .two => |*v| @as([]const u21, v),
+                .three => |*v| @as([]const u21, v),
+                .four => |*v| @as([]const u21, v),
+                .many => |v| v,
+            };
+
+            // Replay every attached codepoint, not just the first, so a cell
+            // that already holds two or more extra codepoints no longer panics.
+            var cp1 = @intCast(u21, prev.cell.char);
+            for (attached) |cp2| {
+                const broke = utf8proc.graphemeBreakStateful(cp1, cp2, &state);
+                assert(!broke);
+                cp1 = cp2;
+            }
+
+            break :brk utf8proc.graphemeBreakStateful(cp1, c, &state);
+        };
+
+        // If we can NOT break, this means that "c" is part of a grapheme
+        // with the previous char.
+        if (!grapheme_break) {
+            log.debug("c={x} grapheme attach to x={}", .{ c, prev.x });
+            try row.attachGrapheme(prev.x, c);
+            return;
+        }
+    }
+
     // Determine the width of this character so we can handle
     // non-single-width characters properly.
     const width = utf8proc.charwidth(c);