From fd1201323e7dd887dcc98f7fa160e49994e080d0 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Thu, 12 Dec 2024 12:22:12 -0800 Subject: [PATCH] unicode: emoji modifier requires emoji modifier base preceding to not break MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #2941 This fixes the rendering of the text below. For those that can't see it, it is the following in UTF-32: `0x22 0x1F3FF 0x22`. ``` "🏿" ``` `0x1F3FF` is the Fitzpatrick modifier for dark skin tone. It has the Unicode property `Emoji_Modifier`. Emoji modifiers are defined in UTS #51 and are only valid based on ED-13: ``` emoji_modifier_sequence := emoji_modifier_base emoji_modifier emoji_modifier_base := \p{Emoji_Modifier_Base} emoji_modifier := \p{Emoji_Modifier} ``` Additional quote from UTS #51: > To have an effect on an emoji, an emoji modifier must immediately follow > that base emoji character. Emoji presentation selectors are neither needed > nor recommended for emoji characters when they are followed by emoji > modifiers, and should not be used in newly generated emoji modifier > sequences; the emoji modifier automatically implies the emoji presentation > style. Our precomputed grapheme break table was mistakingly not following this rule. This commit fixes that by adding a check for that every `Emoji_Modifier` character must be preceded by an `Emoji_Modifier_Base`. This only has a cost during compilation (table generation). The runtime cost is identical; the table size didn't increase since we had leftover bits we could use. --- src/terminal/Terminal.zig | 47 ++++++++++++++++++++++++++++++++++++++- src/unicode/grapheme.zig | 31 +++++++++++++++++++++++--- src/unicode/props.zig | 25 ++++++++++++++++++++- 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index a11028304..eced9e222 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -342,7 +342,7 @@ pub fn print(self: *Terminal, c: u21) !void { if (c == 0xFE0F or c == 0xFE0E) { // This only applies to emoji const prev_props = unicode.getProperties(prev.cell.content.codepoint); - const emoji = prev_props.grapheme_boundary_class == .extended_pictographic; + const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic(); if (!emoji) return; switch (c) { @@ -3193,6 +3193,51 @@ test "Terminal: print multicodepoint grapheme, mode 2027" { } } +test "Terminal: Fitzpatrick skin tone next to non-base" { + var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 }); + defer t.deinit(testing.allocator); + + // Enable grapheme clustering + t.modes.set(.grapheme_cluster, true); + + // This is: "🏿" (which may not render correctly in your editor!) + try t.print(0x22); // " + try t.print(0x1F3FF); // Dark skin tone + try t.print(0x22); // " + + // We should have 4 cells taken up. Importantly, the skin tone + // should not join with the quotes. + try testing.expectEqual(@as(usize, 0), t.screen.cursor.y); + try testing.expectEqual(@as(usize, 4), t.screen.cursor.x); + + // Row should be dirty + try testing.expect(t.isDirty(.{ .screen = .{ .x = 0, .y = 0 } })); + + // Assert various properties about our screen to verify + // we have all expected cells. + { + const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 0, .y = 0 } }).?; + const cell = list_cell.cell; + try testing.expectEqual(@as(u21, 0x22), cell.content.codepoint); + try testing.expect(!cell.hasGrapheme()); + try testing.expectEqual(Cell.Wide.narrow, cell.wide); + } + { + const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?; + const cell = list_cell.cell; + try testing.expectEqual(@as(u21, 0x1F3FF), cell.content.codepoint); + try testing.expect(!cell.hasGrapheme()); + try testing.expectEqual(Cell.Wide.wide, cell.wide); + } + { + const list_cell = t.screen.pages.getCell(.{ .screen = .{ .x = 3, .y = 0 } }).?; + const cell = list_cell.cell; + try testing.expectEqual(@as(u21, 0x22), cell.content.codepoint); + try testing.expect(!cell.hasGrapheme()); + try testing.expectEqual(Cell.Wide.narrow, cell.wide); + } +} + test "Terminal: multicodepoint grapheme marks dirty on every codepoint" { var t = try init(testing.allocator, .{ .cols = 80, .rows = 80 }); defer t.deinit(testing.allocator); diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 09f452114..25061b5ef 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -51,7 +51,7 @@ const Precompute = struct { const data = precompute: { var result: [std.math.maxInt(u10)]Value = undefined; - @setEvalBranchQuota(2_000); + @setEvalBranchQuota(3_000); const info = @typeInfo(GraphemeBoundaryClass).Enum; for (0..std.math.maxInt(u2) + 1) |state_init| { for (info.fields) |field1| { @@ -80,7 +80,7 @@ fn graphemeBreakClass( state: *BreakState, ) bool { // GB11: Emoji Extend* ZWJ x Emoji - if (!state.extended_pictographic and gbc1 == .extended_pictographic) { + if (!state.extended_pictographic and gbc1.isExtendedPictographic()) { state.extended_pictographic = true; } @@ -131,12 +131,21 @@ fn graphemeBreakClass( // GB11: Emoji Extend* ZWJ x Emoji if (state.extended_pictographic and gbc1 == .zwj and - gbc2 == .extended_pictographic) + gbc2.isExtendedPictographic()) { state.extended_pictographic = false; return false; } + // UTS #51. This isn't covered by UAX #29 as far as I can tell (but + // I'm probably wrong). This is a special case for emoji modifiers + // which only do not break if they're next to a base. + // + // emoji_modifier_sequence := emoji_modifier_base emoji_modifier + if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) { + return false; + } + return true; } @@ -181,3 +190,19 @@ pub fn main() !void { pub const std_options = struct { pub const log_level: std.log.Level = .info; }; + +test "grapheme break: emoji modifier" { + const testing = std.testing; + + // Emoji and modifier + { + var state: BreakState = .{}; + try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state)); + } + + // Non-emoji and emoji modifier + { + var state: BreakState = .{}; + try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state)); + } +} diff --git a/src/unicode/props.zig b/src/unicode/props.zig index d83f0f699..d77bf4c8a 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -1,5 +1,6 @@ const props = @This(); const std = @import("std"); +const assert = std.debug.assert; const ziglyph = @import("ziglyph"); const lut = @import("lut.zig"); @@ -73,12 +74,21 @@ pub const GraphemeBoundaryClass = enum(u4) { spacing_mark, regional_indicator, extended_pictographic, + extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base} + emoji_modifier, // \p{Emoji_Modifier} /// Gets the grapheme boundary class for a codepoint. This is VERY /// SLOW. The use case for this is only in generating lookup tables. pub fn init(cp: u21) GraphemeBoundaryClass { + // We special-case modifier bases because we should not break + // if a modifier isn't next to a base. + if (ziglyph.emoji.isEmojiModifierBase(cp)) { + assert(ziglyph.emoji.isExtendedPictographic(cp)); + return .extended_pictographic_base; + } + + if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier; if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic; - if (ziglyph.emoji.isEmojiModifier(cp)) return .extend; if (ziglyph.grapheme_break.isL(cp)) return .L; if (ziglyph.grapheme_break.isV(cp)) return .V; if (ziglyph.grapheme_break.isT(cp)) return .T; @@ -95,6 +105,19 @@ pub const GraphemeBoundaryClass = enum(u4) { // anything that doesn't fit into the above categories. return .invalid; } + + /// Returns true if this is an extended pictographic type. This + /// should be used instead of comparing the enum value directly + /// because we classify multiple. + pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool { + return switch (self) { + .extended_pictographic, + .extended_pictographic_base, + => true, + + else => false, + }; + } }; pub fn get(cp: u21) Properties {