diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index f5a39f863..19437844c 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -1,18 +1,34 @@ const std = @import("std"); const props = @import("props.zig"); +const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; -/// Grapheme break +// The algorithm in this file is based on the Ziglyph and utf8proc algorithm, +// only modified to use our own lookup tables. +// +// I'll note I also tried a fully precomputed table approach where all +// combinations of state and boundary classes were precomputed. It was +// marginally faster (about 2%) but the table is a few KB and I'm not +// sure it's worth it. + +/// Determines if there is a grapheme break between two codepoints. This +/// must be called sequentially maintaining the state between calls. +/// +/// This function does NOT work with control characters. Control characters, +/// line feeds, and carriage returns are expected to be filtered out before +/// calling this function. This is because this function is tuned for +/// Ghostty. pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { const gbc1 = table.get(cp1).grapheme_boundary_class; const gbc2 = table.get(cp2).grapheme_boundary_class; - // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{ - // gbc1, - // gbc2, - // props.GraphemeBoundaryClass.init(cp1), - // props.GraphemeBoundaryClass.init(cp2), - // }); + return graphemeBreakClass(gbc1, gbc2, state); +} +fn graphemeBreakClass( + gbc1: GraphemeBoundaryClass, + gbc2: GraphemeBoundaryClass, + state: *u3, +) bool { // GB11: Emoji Extend* ZWJ x Emoji if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); @@ -72,107 +88,10 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { return true; } -const emoji = @import("ziglyph").emoji; -const gbp = @import("ziglyph").grapheme_break; - -fn isBreaker(cp: u21) bool { - return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); -} - -pub fn zg_graphemeBreak( - cp1: u21, - cp2: u21, - state: *u3, -) bool { - - // GB11: Emoji Extend* ZWJ x Emoji - if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); - - // GB3: CR x LF - if (cp1 == '\r' and cp2 == '\n') { - std.log.warn("GB3", .{}); - return false; - } - - // GB4: Control - if (isBreaker(cp1)) { - std.log.warn("GB4", .{}); - return true; - } - - // GB6: Hangul L x (L|V|LV|VT) - if (gbp.isL(cp1)) { - if (gbp.isL(cp2) or - gbp.isV(cp2) or - gbp.isLv(cp2) or - gbp.isLvt(cp2)) - { - std.log.warn("GB6", .{}); - return false; - } - } - - // GB7: Hangul (LV | V) x (V | T) - if (gbp.isLv(cp1) or gbp.isV(cp1)) { - if (gbp.isV(cp2) or - gbp.isT(cp2)) - { - std.log.warn("GB7", .{}); - return false; - } - } - - // GB8: Hangul (LVT | T) x T - if (gbp.isLvt(cp1) or gbp.isT(cp1)) { - if (gbp.isT(cp2)) { - std.log.warn("GB8", .{}); - return false; - } - } - - // GB9b: x (Extend | ZWJ) - if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) { - std.log.warn("GB9b", .{}); - return false; - } - - // GB9a: x Spacing - if (gbp.isSpacingmark(cp2)) { - std.log.warn("GB9a", .{}); - return false; - } - - // GB9b: Prepend x - if (gbp.isPrepend(cp1) and !isBreaker(cp2)) { - std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) }); - return false; - } - - // GB12, GB13: RI x RI - if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { - if (hasRegional(state)) { - unsetRegional(state); - std.log.warn("GB12", .{}); - return true; - } else { - std.log.warn("GB13", .{}); - setRegional(state); - return false; - } - } - - // GB11: Emoji Extend* ZWJ x Emoji - if (hasXpic(state) and - gbp.isZwj(cp1) and - emoji.isExtendedPictographic(cp2)) - { - std.log.warn("GB11", .{}); - unsetXpic(state); - return false; - } - - return true; -} +const State = packed struct(u2) { + extended_pictographic: bool = false, + regional_indicator: bool = false, +}; fn hasXpic(state: *const u3) bool { return state.* & 1 == 1; @@ -204,15 +123,19 @@ fn unsetRegional(state: *u3) void { pub fn main() !void { const ziglyph = @import("ziglyph"); + // Set the min and max to control the test range. + const min = 0; + const max = std.math.maxInt(u21) + 1; + var state: u3 = 0; var zg_state: u3 = 0; - for (0..std.math.maxInt(u21) + 1) |cp1| { + for (min..max) |cp1| { if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); if (cp1 == '\r' or cp1 == '\n' or ziglyph.grapheme_break.isControl(@intCast(cp1))) continue; - for (0..std.math.maxInt(u21) + 1) |cp2| { + for (min..max) |cp2| { if (cp2 == '\r' or cp2 == '\n' or ziglyph.grapheme_break.isControl(@intCast(cp2))) continue; @@ -235,27 +158,3 @@ pub fn main() !void { pub const std_options = struct { pub const log_level: std.log.Level = .info; }; - -// test "matches ziglyph specific" { -// const testing = std.testing; -// -// var state: u3 = 0; -// var zg_state: u3 = 0; -// -// const cp1 = 0x20; -// const cp2 = 0x300; -// -// const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); -// const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); -// if (gb != zg_gb) { -// std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ -// cp1, -// cp2, -// gb, -// state, -// zg_gb, -// zg_state, -// }); -// try testing.expect(false); -// } -// } diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 1af26d485..3cc4779ed 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,9 +1,10 @@ pub const lut = @import("lut.zig"); -pub usingnamespace @import("grapheme.zig"); +const grapheme = @import("grapheme.zig"); const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; +pub const graphemeBreak = grapheme.graphemeBreak; test { @import("std").testing.refAllDecls(@This());