diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
new file mode 100755
index 000000000..24f475caa
--- /dev/null
+++ b/src/bench/grapheme-break.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+#
+# This is a trivial helper script to help run the grapheme-break benchmark.
+# You probably want to tweak this script depending on what you're
+# trying to measure.
+
+# Options:
+# - "ascii", uniform random ASCII bytes
+# - "utf8", uniform random unicode characters, encoded as utf8
+# - "rand", pure random data, will contain many invalid code sequences.
+DATA="utf8"
+SIZE="25000000"
+
+# Add additional arguments
+ARGS=""
+
+# Generate the benchmark input ahead of time so it's not included in the time.
+./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data
+#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data
+
+# Uncomment to instead use the contents of `stream.txt` as input.
+# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data
+
+hyperfine \
+  --warmup 10 \
+  -n noop \
+  "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} try benchNoop(reader, buf),
+        .ziglyph => try benchZiglyph(reader, buf),
+        .table => try benchTable(reader, buf),
+    }
+}
+
+noinline fn benchNoop(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Feed the decoder one byte at a time with a plain for loop (the
+        // naive scalar approach); the decoded result is discarded.
+        for (buf[0..n]) |c| {
+            _ = d.next(c);
+        }
+    }
+}
+
+noinline fn benchTable(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: unicode.GraphemeBreakState = .{};
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Naive scalar approach: decode one byte at a time and run the
+        // table-based grapheme break check on each decoded codepoint.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v));
+                cp1 = cp2;
+            }
+        }
+    }
+}
+
+noinline fn benchZiglyph(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    var d: UTF8Decoder = .{};
+    var state: u3 = 0;
+    var cp1: u21 = 0;
+    while (true) {
+        const n = try reader.read(buf);
+        if (n == 0) break;
+
+        // Naive scalar approach: decode one byte at a time and run the
+        // ziglyph grapheme break check on each decoded codepoint.
+        for (buf[0..n]) |c| {
+            const cp_, const consumed = d.next(c);
+            assert(consumed);
+            if (cp_) |cp2| {
+                const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state);
+                buf[0] = @intCast(@intFromBool(v));
+                cp1 = cp2;
+            }
+        }
+    }
+}
diff --git a/src/build_config.zig b/src/build_config.zig
index 32dee925a..33b76d252 100644
--- a/src/build_config.zig
+++ b/src/build_config.zig
@@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum {
     bench_parser,
     bench_stream,
     bench_codepoint_width,
+    bench_grapheme_break,
 };
diff --git a/src/main.zig b/src/main.zig
index 46a6d7d3d..8cad7ec9f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
     .bench_parser => @import("bench/parser.zig"),
     .bench_stream => @import("bench/stream.zig"),
     .bench_codepoint_width => @import("bench/codepoint-width.zig"),
+    .bench_grapheme_break => @import("bench/grapheme-break.zig"),
 };
diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index aed41728d..0b2ab5915 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -6,7 +6,6 @@ const Terminal = @This();
 
 const std = @import("std");
 const builtin = @import("builtin");
-const ziglyph = @import("ziglyph");
 const testing = std.testing;
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
@@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void {
         if (prev.cell.char == 0) break :grapheme;
 
         const grapheme_break = brk: {
-            var state: u3 = 0;
+            var state: unicode.GraphemeBreakState = .{};
             var cp1: u21 = @intCast(prev.cell.char);
             if (prev.cell.attrs.grapheme) {
                 var it = row.codepointIterator(prev.x);
                 while (it.next()) |cp2| {
                     // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 });
-                    assert(!ziglyph.graphemeBreak(
-                        cp1,
-                        cp2,
-                        &state,
-                    ));
-
+                    assert(!unicode.graphemeBreak(cp1, cp2, &state));
                     cp1 = cp2;
                 }
             }
 
             // log.debug("cp1={x} cp2={x} end", .{ cp1, c });
-            break :brk ziglyph.graphemeBreak(cp1, c, &state);
+            break :brk unicode.graphemeBreak(cp1, c, &state);
         };
 
         // If we can NOT break, this means that "c" is part of a grapheme
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
new file mode 100644
index 000000000..09f452114
--- /dev/null
+++ b/src/unicode/grapheme.zig
@@ -0,0 +1,186 @@
+const std = @import("std");
+const props = @import("props.zig");
+const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
+const table = props.table;
+
+/// Determines if there is a grapheme break between two codepoints. This
+/// must be called sequentially maintaining the state between calls.
+///
+/// This function does NOT work with control characters. Control characters,
+/// line feeds, and carriage returns are expected to be filtered out before
+/// calling this function. This is because this function is tuned for
+/// Ghostty.
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
+    const value = Precompute.data[
+        (Precompute.Key{
+            .gbc1 = table.get(cp1).grapheme_boundary_class,
+            .gbc2 = table.get(cp2).grapheme_boundary_class,
+            .state = state.*,
+        }).index()
+    ];
+    state.* = value.state;
+    return value.result;
+}
+
+/// The state that must be maintained between calls to `graphemeBreak`.
+pub const BreakState = packed struct(u2) {
+    extended_pictographic: bool = false,
+    regional_indicator: bool = false,
+};
+
+/// This is all the structures and data for the precomputed lookup table
+/// for all possible permutations of state and grapheme boundary classes.
+/// Precomputation only requires 2^10 keys of 3 bit values so the whole
+/// table is roughly 1KB.
+const Precompute = struct {
+    const Key = packed struct(u10) {
+        state: BreakState,
+        gbc1: GraphemeBoundaryClass,
+        gbc2: GraphemeBoundaryClass,
+
+        fn index(self: Key) usize {
+            return @intCast(@as(u10, @bitCast(self)));
+        }
+    };
+
+    const Value = packed struct(u3) {
+        result: bool,
+        state: BreakState,
+    };
+
+    const data = precompute: {
+        // One slot per possible u10 key (0 through maxInt(u10) inclusive) so
+        // every Key.index() value is in bounds. Slots whose bits don't name
+        // a GraphemeBoundaryClass field are never written and stay undefined.
+        var result: [std.math.maxInt(u10) + 1]Value = undefined;
+
+        @setEvalBranchQuota(2_000);
+        const info = @typeInfo(GraphemeBoundaryClass).Enum;
+        for (0..std.math.maxInt(u2) + 1) |state_init| {
+            for (info.fields) |field1| {
+                for (info.fields) |field2| {
+                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    const key: Key = .{
+                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
+                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .state = state,
+                    };
+                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    result[key.index()] = .{ .result = v, .state = state };
+                }
+            }
+        }
+
+        break :precompute result;
+    };
+};
+
+/// This is the algorithm from utf8proc. We only use this offline for
+/// precomputing the lookup table.
+fn graphemeBreakClass(
+    gbc1: GraphemeBoundaryClass,
+    gbc2: GraphemeBoundaryClass,
+    state: *BreakState,
+) bool {
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
+        state.extended_pictographic = true;
+    }
+
+    // These two properties are ignored because they're not relevant to
+    // Ghostty -- they're filtered out before checking grapheme boundaries.
+    // GB3: CR x LF
+    // GB4: Control
+
+    // GB6: Hangul L x (L|V|LV|LVT)
+    if (gbc1 == .L) {
+        if (gbc2 == .L or
+            gbc2 == .V or
+            gbc2 == .LV or
+            gbc2 == .LVT) return false;
+    }
+
+    // GB7: Hangul (LV | V) x (V | T)
+    if (gbc1 == .LV or gbc1 == .V) {
+        if (gbc2 == .V or
+            gbc2 == .T) return false;
+    }
+
+    // GB8: Hangul (LVT | T) x T
+    if (gbc1 == .LVT or gbc1 == .T) {
+        if (gbc2 == .T) return false;
+    }
+
+    // GB9: x (Extend | ZWJ)
+    if (gbc2 == .extend or gbc2 == .zwj) return false;
+
+    // GB9a: x Spacing
+    if (gbc2 == .spacing_mark) return false;
+
+    // GB9b: Prepend x
+    if (gbc1 == .prepend) return false;
+
+    // GB12, GB13: RI x RI
+    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
+        if (state.regional_indicator) {
+            state.regional_indicator = false;
+            return true;
+        } else {
+            state.regional_indicator = true;
+            return false;
+        }
+    }
+
+    // GB11: Emoji Extend* ZWJ x Emoji
+    if (state.extended_pictographic and
+        gbc1 == .zwj and
+        gbc2 == .extended_pictographic)
+    {
+        state.extended_pictographic = false;
+        return false;
+    }
+
+    return true;
+}
+
+/// If you build this file as a binary, we will verify the grapheme break
+/// implementation. This iterates over every pair of codepoints so it is
+/// SLOW. It's not meant to be run in CI, but it's useful for debugging.
+pub fn main() !void {
+    const ziglyph = @import("ziglyph");
+
+    // Set the min and max to control the test range.
+    const min = 0;
+    const max = std.math.maxInt(u21) + 1;
+
+    var state: BreakState = .{};
+    var zg_state: u3 = 0;
+    for (min..max) |cp1| {
+        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
+
+        if (cp1 == '\r' or cp1 == '\n' or
+            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
+
+        for (min..max) |cp2| {
+            if (cp2 == '\r' or cp2 == '\n' or
+                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
+
+            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
+            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
+            if (gb != zg_gb) {
+                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+                    cp1,
+                    cp2,
+                    gb,
+                    state,
+                    zg_gb,
+                    zg_state,
+                });
+            }
+        }
+    }
+}
+
+pub const std_options = struct {
+    pub const log_level: std.log.Level = .info;
+};
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index fa0cb9fc8..e8ba05b72 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,8 +1,11 @@
 pub const lut = @import("lut.zig");
 
+const grapheme = @import("grapheme.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
+pub const graphemeBreak = grapheme.graphemeBreak;
+pub const GraphemeBreakState = grapheme.BreakState;
 
 test {
     @import("std").testing.refAllDecls(@This());
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index fe85844a5..d83f0f699 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -27,9 +27,13 @@ pub const Properties = struct {
     /// becomes a 2-em dash).
     width: u2 = 0,
 
+    /// Grapheme boundary class.
+    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
     // Needed for lut.Generator
     pub fn eql(a: Properties, b: Properties) bool {
-        return a.width == b.width;
+        return a.width == b.width and
+            a.grapheme_boundary_class == b.grapheme_boundary_class;
     }
 
     // Needed for lut.Generator
@@ -41,17 +45,64 @@ pub const Properties = struct {
     ) !void {
         _ = layout;
         _ = opts;
-        try std.fmt.format(writer, ".{{ .width= {}, }}", .{
+        try std.fmt.format(writer,
+            \\.{{
+            \\    .width= {},
+            \\    .grapheme_boundary_class= .{s},
+            \\}}
+        , .{
             self.width,
+            @tagName(self.grapheme_boundary_class),
         });
     }
 };
 
+/// Possible grapheme boundary classes. This isn't an exhaustive list:
+/// we omit control, CR, LF, etc. because in Ghostty's usage they are
+/// impossible because they're handled by the terminal.
+pub const GraphemeBoundaryClass = enum(u4) {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+
+    /// Gets the grapheme boundary class for a codepoint. This is VERY
+    /// SLOW. The use case for this is only in generating lookup tables.
+    pub fn init(cp: u21) GraphemeBoundaryClass {
+        if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
+        if (ziglyph.grapheme_break.isL(cp)) return .L;
+        if (ziglyph.grapheme_break.isV(cp)) return .V;
+        if (ziglyph.grapheme_break.isT(cp)) return .T;
+        if (ziglyph.grapheme_break.isLv(cp)) return .LV;
+        if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
+        if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
+        if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
+        if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
+        if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
+        if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
+
+        // This is obviously not INVALID invalid, there is SOME grapheme
+        // boundary class for every codepoint. But we don't care about
+        // anything that doesn't fit into the above categories.
+        return .invalid;
+    }
+};
+
 pub fn get(cp: u21) Properties {
     const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
 
     return .{
         .width = @intCast(@min(2, @max(0, zg_width))),
+        .grapheme_boundary_class = GraphemeBoundaryClass.init(cp),
     };
 }
 