From fd1201323e7dd887dcc98f7fa160e49994e080d0 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Thu, 12 Dec 2024 12:22:12 -0800
Subject: [PATCH] unicode: emoji modifier requires emoji modifier base
 preceding to not break
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #2941

This fixes the rendering of the text below. For those that can't see it,
it is the following in UTF-32: `0x22 0x1F3FF 0x22`.

```
"🏿"
```

`0x1F3FF` is the Fitzpatrick modifier for dark skin tone. It has the
Unicode property `Emoji_Modifier`. Emoji modifiers are defined in UTS
#51 and are only valid based on ED-13:

```
emoji_modifier_sequence := emoji_modifier_base emoji_modifier
emoji_modifier_base := \p{Emoji_Modifier_Base}
emoji_modifier := \p{Emoji_Modifier}
```

Additional quote from UTS #51:

> To have an effect on an emoji, an emoji modifier must immediately follow
> that base emoji character. Emoji presentation selectors are neither needed
> nor recommended for emoji characters when they are followed by emoji
> modifiers, and should not be used in newly generated emoji modifier
> sequences; the emoji modifier automatically implies the emoji presentation
> style.

Our precomputed grapheme break table was mistakingly not following this
rule. This commit fixes that by adding a check for that every
`Emoji_Modifier` character must be preceded by an `Emoji_Modifier_Base`.
This only has a cost during compilation (table generation). The runtime
cost is identical; the table size didn't increase since we had leftover
bits we could use.
---
 src/terminal/Terminal.zig | 47 ++++++++++++++++++++++++++++++++++++++-
 src/unicode/grapheme.zig  | 31 +++++++++++++++++++++++---
 src/unicode/props.zig     | 25 ++++++++++++++++++++-
 3 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index a11028304..eced9e222 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -342,7 +342,7 @@ pub fn print(self: *Terminal, c: u21) !void {
             if (c == 0xFE0F or c == 0xFE0E) {
                 // This only applies to emoji
                 const prev_props = unicode.getProperties(prev.cell.content.codepoint);
-                const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
+                const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
                 if (!emoji) return;
 
                 switch (c) {
@@ -3193,6 +3193,51 @@ test "Terminal: print multicodepoint grapheme, mode 2027" {
     }
 }
 
+test "Terminal: Fitzpatrick skin tone next to non-base" {
+    var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 });
+    defer t.deinit(testing.allocator);
+
+    // Enable grapheme clustering
+    t.modes.set(.grapheme_cluster, true);
+
+    // This is: "🏿" (which may not render correctly in your editor!)
+    try t.print(0x22); // "
+    try t.print(0x1F3FF); // Dark skin tone
+    try t.print(0x22); // "
+
+    // We should have 4 cells taken up. Importantly, the skin tone
+    // should not join with the quotes.
+    try testing.expectEqual(@as(usize, 0), t.screen.cursor.y);
+    try testing.expectEqual(@as(usize, 4), t.screen.cursor.x);
+
+    // Row should be dirty
+    try testing.expect(t.isDirty(.{ .screen = .{ .x = 0, .y = 0 } }));
+
+    // Assert various properties about our screen to verify
+    // we have all expected cells.
+    {
+        const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 0, .y = 0 } }).?;
+        const cell = list_cell.cell;
+        try testing.expectEqual(@as(u21, 0x22), cell.content.codepoint);
+        try testing.expect(!cell.hasGrapheme());
+        try testing.expectEqual(Cell.Wide.narrow, cell.wide);
+    }
+    {
+        const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?;
+        const cell = list_cell.cell;
+        try testing.expectEqual(@as(u21, 0x1F3FF), cell.content.codepoint);
+        try testing.expect(!cell.hasGrapheme());
+        try testing.expectEqual(Cell.Wide.wide, cell.wide);
+    }
+    {
+        const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 3, .y = 0 } }).?;
+        const cell = list_cell.cell;
+        try testing.expectEqual(@as(u21, 0x22), cell.content.codepoint);
+        try testing.expect(!cell.hasGrapheme());
+        try testing.expectEqual(Cell.Wide.narrow, cell.wide);
+    }
+}
+
 test "Terminal: multicodepoint grapheme marks dirty on every codepoint" {
     var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 });
     defer t.deinit(testing.allocator);
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
index 09f452114..25061b5ef 100644
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -51,7 +51,7 @@ const Precompute = struct {
     const data = precompute: {
         var result: [std.math.maxInt(u10)]Value = undefined;
 
-        @setEvalBranchQuota(2_000);
+        @setEvalBranchQuota(3_000);
         const info = @typeInfo(GraphemeBoundaryClass).Enum;
         for (0..std.math.maxInt(u2) + 1) |state_init| {
             for (info.fields) |field1| {
@@ -80,7 +80,7 @@ fn graphemeBreakClass(
     state: *BreakState,
 ) bool {
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
+    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
         state.extended_pictographic = true;
     }
 
@@ -131,12 +131,21 @@ fn graphemeBreakClass(
     // GB11: Emoji Extend* ZWJ x Emoji
     if (state.extended_pictographic and
         gbc1 == .zwj and
-        gbc2 == .extended_pictographic)
+        gbc2.isExtendedPictographic())
     {
         state.extended_pictographic = false;
         return false;
     }
 
+    // UTS #51. This isn't covered by UAX #29 as far as I can tell (but
+    // I'm probably wrong). This is a special case for emoji modifiers
+    // which only do not break if they're next to a base.
+    //
+    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
+    if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) {
+        return false;
+    }
+
     return true;
 }
 
@@ -181,3 +190,19 @@ pub fn main() !void {
 pub const std_options = struct {
     pub const log_level: std.log.Level = .info;
 };
+
+test "grapheme break: emoji modifier" {
+    const testing = std.testing;
+
+    // Emoji and modifier
+    {
+        var state: BreakState = .{};
+        try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
+    }
+
+    // Non-emoji and emoji modifier
+    {
+        var state: BreakState = .{};
+        try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
+    }
+}
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index d83f0f699..d77bf4c8a 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -1,5 +1,6 @@
 const props = @This();
 const std = @import("std");
+const assert = std.debug.assert;
 const ziglyph = @import("ziglyph");
 const lut = @import("lut.zig");
 
@@ -73,12 +74,21 @@ pub const GraphemeBoundaryClass = enum(u4) {
     spacing_mark,
     regional_indicator,
     extended_pictographic,
+    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
+    emoji_modifier, // \p{Emoji_Modifier}
 
     /// Gets the grapheme boundary class for a codepoint. This is VERY
     /// SLOW. The use case for this is only in generating lookup tables.
     pub fn init(cp: u21) GraphemeBoundaryClass {
+        // We special-case modifier bases because we should not break
+        // if a modifier isn't next to a base.
+        if (ziglyph.emoji.isEmojiModifierBase(cp)) {
+            assert(ziglyph.emoji.isExtendedPictographic(cp));
+            return .extended_pictographic_base;
+        }
+
+        if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier;
         if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
-        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
         if (ziglyph.grapheme_break.isL(cp)) return .L;
         if (ziglyph.grapheme_break.isV(cp)) return .V;
         if (ziglyph.grapheme_break.isT(cp)) return .T;
@@ -95,6 +105,19 @@ pub const GraphemeBoundaryClass = enum(u4) {
         // anything that doesn't fit into the above categories.
         return .invalid;
     }
+
+    /// Returns true if this is an extended pictographic type. This
+    /// should be used instead of comparing the enum value directly
+    /// because we classify multiple.
+    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
+        return switch (self) {
+            .extended_pictographic,
+            .extended_pictographic_base,
+            => true,
+
+            else => false,
+        };
+    }
 };
 
 pub fn get(cp: u21) Properties {