From 64376235002e3b85efdae29700b162d572329f96 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 09:12:05 -0800 Subject: [PATCH 1/7] bench/grapheme-break --- build.zig | 9 +++ build.zig.zon | 1 + src/bench/grapheme-break.sh | 32 ++++++++ src/bench/grapheme-break.zig | 144 +++++++++++++++++++++++++++++++++++ src/build_config.zig | 1 + src/main.zig | 1 + 6 files changed, 188 insertions(+) create mode 100755 src/bench/grapheme-break.sh create mode 100644 src/bench/grapheme-break.zig diff --git a/build.zig b/build.zig index 0669f27cc..445cf4a98 100644 --- a/build.zig +++ b/build.zig @@ -1082,6 +1082,15 @@ fn addDeps( step.linkLibrary(utfcpp_dep.artifact("utfcpp")); try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin()); + // utf8proc + const utf8proc_dep = b.dependency("utf8proc", .{ + .target = target, + .optimize = optimize, + }); + step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc")); + step.linkLibrary(utf8proc_dep.artifact("utf8proc")); + try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin()); + // Spirv-Cross step.linkLibrary(spirv_cross_dep.artifact("spirv_cross")); try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin()); diff --git a/build.zig.zon b/build.zig.zon index a694562ea..535d51c24 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -39,6 +39,7 @@ .pixman = .{ .path = "./pkg/pixman" }, .simdutf = .{ .path = "./pkg/simdutf" }, .utfcpp = .{ .path = "./pkg/utfcpp" }, + .utf8proc = .{ .path = "./pkg/utf8proc" }, .zlib = .{ .path = "./pkg/zlib" }, // Shader translation diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh new file mode 100755 index 000000000..56bd28dd1 --- /dev/null +++ b/src/bench/grapheme-break.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# +# This is a trivial helper script to help run the grapheme-break benchmark. +# You probably want to tweak this script depending on what you're +# trying to measure. 
+ +# Options: +# - "ascii", uniform random ASCII bytes +# - "utf8", uniform random unicode characters, encoded as utf8 +# - "rand", pure random data, will contain many invalid code sequences. +DATA="utf8" +SIZE="25000000" + +# Add additional arguments +ARGS="" + +# Generate the benchmark input ahead of time so it's not included in the time. +./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data +#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data + +# Uncomment to instead use the contents of `stream.txt` as input. +# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data + +hyperfine \ + --warmup 10 \ + -n noop \ + "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} try benchNoop(reader, buf), + .ziglyph => try benchZiglyph(reader, buf), + .utf8proc => try benchUtf8proc(reader, buf), + } +} + +noinline fn benchNoop( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + _ = d.next(c); + } + } +} + +noinline fn benchZiglyph( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + var state: u3 = 0; + var cp1: u21 = 0; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. 
+ for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp2| { + const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state); + buf[0] = @intCast(@intFromBool(v)); + cp1 = cp2; + } + } + } +} + +noinline fn benchUtf8proc( + reader: anytype, + buf: []u8, +) !void { + const utf8proc = @import("utf8proc"); + var d: UTF8Decoder = .{}; + var state: i32 = 0; + var cp1: u21 = 0; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp2| { + const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state); + buf[0] = @intCast(@intFromBool(v)); + cp1 = cp2; + } + } + } +} diff --git a/src/build_config.zig b/src/build_config.zig index 32dee925a..33b76d252 100644 --- a/src/build_config.zig +++ b/src/build_config.zig @@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum { bench_parser, bench_stream, bench_codepoint_width, + bench_grapheme_break, }; diff --git a/src/main.zig b/src/main.zig index 46a6d7d3d..8cad7ec9f 100644 --- a/src/main.zig +++ b/src/main.zig @@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) { .bench_parser => @import("bench/parser.zig"), .bench_stream => @import("bench/stream.zig"), .bench_codepoint_width => @import("bench/codepoint-width.zig"), + .bench_grapheme_break => @import("bench/grapheme-break.zig"), }; From 0632410857f00eb0cf8aa0b1acf78b778486127c Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 12:22:23 -0800 Subject: [PATCH 2/7] unicode: get grapheme boundary class --- src/unicode/props.zig | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/unicode/props.zig b/src/unicode/props.zig index fe85844a5..d46acbf49 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -27,6 +27,9 @@ pub const Properties = struct 
{ /// becomes a 2-em dash). width: u2 = 0, + /// Grapheme boundary class. + grapheme_boundary_class: GraphemeBoundaryClass = .invalid, + // Needed for lut.Generator pub fn eql(a: Properties, b: Properties) bool { return a.width == b.width; @@ -47,11 +50,52 @@ pub const Properties = struct { } }; +/// Possible grapheme boundary classes. This isn't an exhaustive list: +/// we omit control, CR, LF, etc. because in Ghostty's usage that are +/// impossible because they're handled by the terminal. +pub const GraphemeBoundaryClass = enum { + invalid, + L, + V, + T, + LV, + LVT, + prepend, + extend, + zwj, + spacing_mark, + regional_indicator, + extended_pictographic, + + /// Gets the grapheme boundary class for a codepoint. This is VERY + /// SLOW. The use case for this is only in generating lookup tables. + pub fn init(cp: u21) GraphemeBoundaryClass { + if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic; + if (ziglyph.emoji.isEmojiModifier(cp)) return .extend; + if (ziglyph.grapheme_break.isL(cp)) return .L; + if (ziglyph.grapheme_break.isV(cp)) return .V; + if (ziglyph.grapheme_break.isT(cp)) return .T; + if (ziglyph.grapheme_break.isLv(cp)) return .LV; + if (ziglyph.grapheme_break.isLvt(cp)) return .LVT; + if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend; + if (ziglyph.grapheme_break.isExtend(cp)) return .extend; + if (ziglyph.grapheme_break.isZwj(cp)) return .zwj; + if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark; + if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator; + + // This is obviously not INVALID invalid, there is SOME grapheme + // boundary class for every codepoint. But we don't care about + // anything that doesn't fit into the above categories. 
+ return .invalid; + } +}; + pub fn get(cp: u21) Properties { const zg_width = ziglyph.display_width.codePointWidth(cp, .half); return .{ .width = @intCast(@min(2, @max(0, zg_width))), + .grapheme_boundary_class = GraphemeBoundaryClass.init(cp), }; } From 5f3574a4bfc33a19e0b6588ff67709afae0622bd Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 19:44:57 -0800 Subject: [PATCH 3/7] unicode: direct port of ziglyph to start --- build.zig | 11 ++ src/bench/grapheme-break.sh | 5 +- src/bench/grapheme-break.zig | 31 ++++- src/unicode/grapheme.zig | 261 +++++++++++++++++++++++++++++++++++ src/unicode/main.zig | 1 + src/unicode/props.zig | 11 +- 6 files changed, 316 insertions(+), 4 deletions(-) create mode 100644 src/unicode/grapheme.zig diff --git a/build.zig b/build.zig index 445cf4a98..fd49b7e62 100644 --- a/build.zig +++ b/build.zig @@ -217,6 +217,17 @@ pub fn build(b: *std.Build) !void { // Add our benchmarks try benchSteps(b, target, config, emit_bench); + { + const exe = b.addExecutable(.{ + .name = "grapheme-verify", + .root_source_file = .{ .path = "src/unicode/grapheme.zig" }, + .target = target, + .optimize = .ReleaseFast, + }); + b.installArtifact(exe); + _ = try addDeps(b, exe, config); + } + // We only build an exe if we have a runtime set. 
const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{ .name = "ghostty", diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh index 56bd28dd1..c395c3799 100755 --- a/src/bench/grapheme-break.sh +++ b/src/bench/grapheme-break.sh @@ -28,5 +28,8 @@ hyperfine \ -n ziglyph \ "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} try benchNoop(reader, buf), .ziglyph => try benchZiglyph(reader, buf), + .table => try benchTable(reader, buf), .utf8proc => try benchUtf8proc(reader, buf), } } @@ -92,6 +96,31 @@ noinline fn benchNoop( } } +noinline fn benchTable( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + var state: u3 = 0; + var cp1: u21 = 0; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp2| { + const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state); + buf[0] = @intCast(@intFromBool(v)); + cp1 = cp2; + } + } + } +} + noinline fn benchZiglyph( reader: anytype, buf: []u8, diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig new file mode 100644 index 000000000..f5a39f863 --- /dev/null +++ b/src/unicode/grapheme.zig @@ -0,0 +1,261 @@ +const std = @import("std"); +const props = @import("props.zig"); +const table = props.table; + +/// Grapheme break +pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { + const gbc1 = table.get(cp1).grapheme_boundary_class; + const gbc2 = table.get(cp2).grapheme_boundary_class; + // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{ + // gbc1, + // gbc2, + // props.GraphemeBoundaryClass.init(cp1), + // props.GraphemeBoundaryClass.init(cp2), + // }); + + // GB11: Emoji Extend* ZWJ x Emoji + if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); + + // These two properties are ignored because they're not relevant 
to + // Ghostty -- they're filtered out before checking grapheme boundaries. + // GB3: CR x LF + // GB4: Control + + // GB6: Hangul L x (L|V|LV|LVT) + if (gbc1 == .L) { + if (gbc2 == .L or + gbc2 == .V or + gbc2 == .LV or + gbc2 == .LVT) return false; + } + + // GB7: Hangul (LV | V) x (V | T) + if (gbc1 == .LV or gbc1 == .V) { + if (gbc2 == .V or + gbc2 == .T) return false; + } + + // GB8: Hangul (LVT | T) x T + if (gbc1 == .LVT or gbc1 == .T) { + if (gbc2 == .T) return false; + } + + // GB9: x (Extend | ZWJ) + if (gbc2 == .extend or gbc2 == .zwj) return false; + + // GB9a: x SpacingMark + if (gbc2 == .spacing_mark) return false; + + // GB9b: Prepend x + if (gbc1 == .prepend) return false; + + // GB12, GB13: RI x RI + if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) { + if (hasRegional(state)) { + unsetRegional(state); + return true; + } else { + setRegional(state); + return false; + } + } + + // GB11: Emoji Extend* ZWJ x Emoji + if (hasXpic(state) and + gbc1 == .zwj and + gbc2 == .extended_pictographic) + { + unsetXpic(state); + return false; + } + + return true; +} + +const emoji = @import("ziglyph").emoji; +const gbp = @import("ziglyph").grapheme_break; + +fn isBreaker(cp: u21) bool { + return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); +} + +pub fn zg_graphemeBreak( + cp1: u21, + cp2: u21, + state: *u3, +) bool { + + // GB11: Emoji Extend* ZWJ x Emoji + if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); + + // GB3: CR x LF + if (cp1 == '\r' and cp2 == '\n') { + std.log.warn("GB3", .{}); + return false; + } + + // GB4: Control + if (isBreaker(cp1)) { + std.log.warn("GB4", .{}); + return true; + } + + // GB6: Hangul L x (L|V|LV|VT) + if (gbp.isL(cp1)) { + if (gbp.isL(cp2) or + gbp.isV(cp2) or + gbp.isLv(cp2) or + gbp.isLvt(cp2)) + { + std.log.warn("GB6", .{}); + return false; + } + } + + // GB7: Hangul (LV | V) x (V | T) + if (gbp.isLv(cp1) or gbp.isV(cp1)) { + if (gbp.isV(cp2) or + gbp.isT(cp2)) + { +
std.log.warn("GB7", .{}); + return false; + } + } + + // GB8: Hangul (LVT | T) x T + if (gbp.isLvt(cp1) or gbp.isT(cp1)) { + if (gbp.isT(cp2)) { + std.log.warn("GB8", .{}); + return false; + } + } + + // GB9b: x (Extend | ZWJ) + if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) { + std.log.warn("GB9b", .{}); + return false; + } + + // GB9a: x Spacing + if (gbp.isSpacingmark(cp2)) { + std.log.warn("GB9a", .{}); + return false; + } + + // GB9b: Prepend x + if (gbp.isPrepend(cp1) and !isBreaker(cp2)) { + std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) }); + return false; + } + + // GB12, GB13: RI x RI + if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { + if (hasRegional(state)) { + unsetRegional(state); + std.log.warn("GB12", .{}); + return true; + } else { + std.log.warn("GB13", .{}); + setRegional(state); + return false; + } + } + + // GB11: Emoji Extend* ZWJ x Emoji + if (hasXpic(state) and + gbp.isZwj(cp1) and + emoji.isExtendedPictographic(cp2)) + { + std.log.warn("GB11", .{}); + unsetXpic(state); + return false; + } + + return true; +} + +fn hasXpic(state: *const u3) bool { + return state.* & 1 == 1; +} + +fn setXpic(state: *u3) void { + state.* |= 1; +} + +fn unsetXpic(state: *u3) void { + state.* ^= 1; +} + +fn hasRegional(state: *const u3) bool { + return state.* & 2 == 2; +} + +fn setRegional(state: *u3) void { + state.* |= 2; +} + +fn unsetRegional(state: *u3) void { + state.* ^= 2; +} + +/// If you build this file as a binary, we will verify the grapheme break +/// implementation. This iterates over billions of codepoints so it is +/// SLOW. It's not meant to be run in CI, but it's useful for debugging. 
+pub fn main() !void { + const ziglyph = @import("ziglyph"); + + var state: u3 = 0; + var zg_state: u3 = 0; + for (0..std.math.maxInt(u21) + 1) |cp1| { + if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); + + if (cp1 == '\r' or cp1 == '\n' or + ziglyph.grapheme_break.isControl(@intCast(cp1))) continue; + + for (0..std.math.maxInt(u21) + 1) |cp2| { + if (cp2 == '\r' or cp2 == '\n' or + ziglyph.grapheme_break.isControl(@intCast(cp2))) continue; + + const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); + const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); + if (gb != zg_gb) { + std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ + cp1, + cp2, + gb, + state, + zg_gb, + zg_state, + }); + } + } + } +} + +pub const std_options = struct { + pub const log_level: std.log.Level = .info; +}; + +// test "matches ziglyph specific" { +// const testing = std.testing; +// +// var state: u3 = 0; +// var zg_state: u3 = 0; +// +// const cp1 = 0x20; +// const cp2 = 0x300; +// +// const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); +// const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); +// if (gb != zg_gb) { +// std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ +// cp1, +// cp2, +// gb, +// state, +// zg_gb, +// zg_state, +// }); +// try testing.expect(false); +// } +// } diff --git a/src/unicode/main.zig b/src/unicode/main.zig index fa0cb9fc8..1af26d485 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,5 +1,6 @@ pub const lut = @import("lut.zig"); +pub usingnamespace @import("grapheme.zig"); const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; diff --git a/src/unicode/props.zig b/src/unicode/props.zig index d46acbf49..d6f282ed9 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -32,7 +32,8 @@ pub const Properties = struct { // Needed for lut.Generator pub fn eql(a: Properties, b: 
Properties) bool { - return a.width == b.width; + return a.width == b.width and + a.grapheme_boundary_class == b.grapheme_boundary_class; } // Needed for lut.Generator @@ -44,8 +45,14 @@ pub const Properties = struct { ) !void { _ = layout; _ = opts; - try std.fmt.format(writer, ".{{ .width= {}, }}", .{ + try std.fmt.format(writer, + \\.{{ + \\ .width= {}, + \\ .grapheme_boundary_class= .{s}, + \\}} + , .{ self.width, + @tagName(self.grapheme_boundary_class), }); } }; From c47ad97f62ca1f5e6132d46839b7cda999af461b Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 20:23:29 -0800 Subject: [PATCH 4/7] unicode: remove unused --- src/unicode/grapheme.zig | 167 ++++++++------------------------------- src/unicode/main.zig | 3 +- 2 files changed, 35 insertions(+), 135 deletions(-) diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index f5a39f863..19437844c 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -1,18 +1,34 @@ const std = @import("std"); const props = @import("props.zig"); +const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; -/// Grapheme break +// The algorithm in this file is based on the Ziglyph and utf8proc algorithm, +// only modified to use our own lookup tables. +// +// I'll note I also tried a fully precomputed table approach where all +// combinations of state and boundary classes were precomputed. It was +// marginally faster (about 2%) but the table is a few KB and I'm not +// sure it's worth it. + +/// Determines if there is a grapheme break between two codepoints. This +/// must be called sequentially maintaining the state between calls. +/// +/// This function does NOT work with control characters. Control characters, +/// line feeds, and carriage returns are expected to be filtered out before +/// calling this function. This is because this function is tuned for +/// Ghostty. 
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { const gbc1 = table.get(cp1).grapheme_boundary_class; const gbc2 = table.get(cp2).grapheme_boundary_class; - // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{ - // gbc1, - // gbc2, - // props.GraphemeBoundaryClass.init(cp1), - // props.GraphemeBoundaryClass.init(cp2), - // }); + return graphemeBreakClass(gbc1, gbc2, state); +} +fn graphemeBreakClass( + gbc1: GraphemeBoundaryClass, + gbc2: GraphemeBoundaryClass, + state: *u3, +) bool { // GB11: Emoji Extend* ZWJ x Emoji if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); @@ -72,107 +88,10 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { return true; } -const emoji = @import("ziglyph").emoji; -const gbp = @import("ziglyph").grapheme_break; - -fn isBreaker(cp: u21) bool { - return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); -} - -pub fn zg_graphemeBreak( - cp1: u21, - cp2: u21, - state: *u3, -) bool { - - // GB11: Emoji Extend* ZWJ x Emoji - if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); - - // GB3: CR x LF - if (cp1 == '\r' and cp2 == '\n') { - std.log.warn("GB3", .{}); - return false; - } - - // GB4: Control - if (isBreaker(cp1)) { - std.log.warn("GB4", .{}); - return true; - } - - // GB6: Hangul L x (L|V|LV|VT) - if (gbp.isL(cp1)) { - if (gbp.isL(cp2) or - gbp.isV(cp2) or - gbp.isLv(cp2) or - gbp.isLvt(cp2)) - { - std.log.warn("GB6", .{}); - return false; - } - } - - // GB7: Hangul (LV | V) x (V | T) - if (gbp.isLv(cp1) or gbp.isV(cp1)) { - if (gbp.isV(cp2) or - gbp.isT(cp2)) - { - std.log.warn("GB7", .{}); - return false; - } - } - - // GB8: Hangul (LVT | T) x T - if (gbp.isLvt(cp1) or gbp.isT(cp1)) { - if (gbp.isT(cp2)) { - std.log.warn("GB8", .{}); - return false; - } - } - - // GB9b: x (Extend | ZWJ) - if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) { - std.log.warn("GB9b", .{}); - return false; - } - - // GB9a: x Spacing - if (gbp.isSpacingmark(cp2)) { - std.log.warn("GB9a", .{}); - 
return false; - } - - // GB9b: Prepend x - if (gbp.isPrepend(cp1) and !isBreaker(cp2)) { - std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) }); - return false; - } - - // GB12, GB13: RI x RI - if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { - if (hasRegional(state)) { - unsetRegional(state); - std.log.warn("GB12", .{}); - return true; - } else { - std.log.warn("GB13", .{}); - setRegional(state); - return false; - } - } - - // GB11: Emoji Extend* ZWJ x Emoji - if (hasXpic(state) and - gbp.isZwj(cp1) and - emoji.isExtendedPictographic(cp2)) - { - std.log.warn("GB11", .{}); - unsetXpic(state); - return false; - } - - return true; -} +const State = packed struct(u2) { + extended_pictographic: bool = false, + regional_indicator: bool = false, +}; fn hasXpic(state: *const u3) bool { return state.* & 1 == 1; @@ -204,15 +123,19 @@ fn unsetRegional(state: *u3) void { pub fn main() !void { const ziglyph = @import("ziglyph"); + // Set the min and max to control the test range. 
+ const min = 0; + const max = std.math.maxInt(u21) + 1; + var state: u3 = 0; var zg_state: u3 = 0; - for (0..std.math.maxInt(u21) + 1) |cp1| { + for (min..max) |cp1| { if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); if (cp1 == '\r' or cp1 == '\n' or ziglyph.grapheme_break.isControl(@intCast(cp1))) continue; - for (0..std.math.maxInt(u21) + 1) |cp2| { + for (min..max) |cp2| { if (cp2 == '\r' or cp2 == '\n' or ziglyph.grapheme_break.isControl(@intCast(cp2))) continue; @@ -235,27 +158,3 @@ pub fn main() !void { pub const std_options = struct { pub const log_level: std.log.Level = .info; }; - -// test "matches ziglyph specific" { -// const testing = std.testing; -// -// var state: u3 = 0; -// var zg_state: u3 = 0; -// -// const cp1 = 0x20; -// const cp2 = 0x300; -// -// const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); -// const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); -// if (gb != zg_gb) { -// std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ -// cp1, -// cp2, -// gb, -// state, -// zg_gb, -// zg_state, -// }); -// try testing.expect(false); -// } -// } diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 1af26d485..3cc4779ed 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,9 +1,10 @@ pub const lut = @import("lut.zig"); -pub usingnamespace @import("grapheme.zig"); +const grapheme = @import("grapheme.zig"); const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; +pub const graphemeBreak = grapheme.graphemeBreak; test { @import("std").testing.refAllDecls(@This()); From 132fbb3a4695b09d8674914e8d68a660fb28df6d Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 20:29:36 -0800 Subject: [PATCH 5/7] unicode: use packed struct for break state --- build.zig | 20 ------------- build.zig.zon | 1 - src/bench/grapheme-break.sh | 2 -- src/bench/grapheme-break.zig | 31 +------------------- 
src/unicode/grapheme.zig | 55 +++++++++++------------------------- src/unicode/main.zig | 1 + 6 files changed, 19 insertions(+), 91 deletions(-) diff --git a/build.zig b/build.zig index fd49b7e62..0669f27cc 100644 --- a/build.zig +++ b/build.zig @@ -217,17 +217,6 @@ pub fn build(b: *std.Build) !void { // Add our benchmarks try benchSteps(b, target, config, emit_bench); - { - const exe = b.addExecutable(.{ - .name = "grapheme-verify", - .root_source_file = .{ .path = "src/unicode/grapheme.zig" }, - .target = target, - .optimize = .ReleaseFast, - }); - b.installArtifact(exe); - _ = try addDeps(b, exe, config); - } - // We only build an exe if we have a runtime set. const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{ .name = "ghostty", @@ -1093,15 +1082,6 @@ fn addDeps( step.linkLibrary(utfcpp_dep.artifact("utfcpp")); try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin()); - // utf8proc - const utf8proc_dep = b.dependency("utf8proc", .{ - .target = target, - .optimize = optimize, - }); - step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc")); - step.linkLibrary(utf8proc_dep.artifact("utf8proc")); - try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin()); - // Spirv-Cross step.linkLibrary(spirv_cross_dep.artifact("spirv_cross")); try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin()); diff --git a/build.zig.zon b/build.zig.zon index 535d51c24..a694562ea 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -39,7 +39,6 @@ .pixman = .{ .path = "./pkg/pixman" }, .simdutf = .{ .path = "./pkg/simdutf" }, .utfcpp = .{ .path = "./pkg/utfcpp" }, - .utf8proc = .{ .path = "./pkg/utf8proc" }, .zlib = .{ .path = "./pkg/zlib" }, // Shader translation diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh index c395c3799..24f475caa 100755 --- a/src/bench/grapheme-break.sh +++ b/src/bench/grapheme-break.sh @@ -27,8 +27,6 @@ hyperfine \ 
"./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} try benchNoop(reader, buf), .ziglyph => try benchZiglyph(reader, buf), .table => try benchTable(reader, buf), - .utf8proc => try benchUtf8proc(reader, buf), } } @@ -101,7 +98,7 @@ noinline fn benchTable( buf: []u8, ) !void { var d: UTF8Decoder = .{}; - var state: u3 = 0; + var state: unicode.GraphemeBreakState = .{}; var cp1: u21 = 0; while (true) { const n = try reader.read(buf); @@ -145,29 +142,3 @@ noinline fn benchZiglyph( } } } - -noinline fn benchUtf8proc( - reader: anytype, - buf: []u8, -) !void { - const utf8proc = @import("utf8proc"); - var d: UTF8Decoder = .{}; - var state: i32 = 0; - var cp1: u21 = 0; - while (true) { - const n = try reader.read(buf); - if (n == 0) break; - - // Using stream.next directly with a for loop applies a naive - // scalar approach. - for (buf[0..n]) |c| { - const cp_, const consumed = d.next(c); - assert(consumed); - if (cp_) |cp2| { - const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state); - buf[0] = @intCast(@intFromBool(v)); - cp1 = cp2; - } - } - } -} diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 19437844c..d4c146e49 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -18,19 +18,27 @@ const table = props.table; /// line feeds, and carriage returns are expected to be filtered out before /// calling this function. This is because this function is tuned for /// Ghostty. -pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { +pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { const gbc1 = table.get(cp1).grapheme_boundary_class; const gbc2 = table.get(cp2).grapheme_boundary_class; return graphemeBreakClass(gbc1, gbc2, state); } +/// The state that must be maintained between calls to `graphemeBreak`. 
+pub const BreakState = packed struct(u2) { + extended_pictographic: bool = false, + regional_indicator: bool = false, +}; + fn graphemeBreakClass( gbc1: GraphemeBoundaryClass, gbc2: GraphemeBoundaryClass, - state: *u3, + state: *BreakState, ) bool { // GB11: Emoji Extend* ZWJ x Emoji - if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); + if (!state.extended_pictographic and gbc1 == .extended_pictographic) { + state.extended_pictographic = true; + } // These two properties are ignored because they're not relevant to // Ghostty -- they're filtered out before checking grapheme boundaries. @@ -67,56 +75,27 @@ fn graphemeBreakClass( // GB12, GB13: RI x RI if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) { - if (hasRegional(state)) { - unsetRegional(state); + if (state.regional_indicator) { + state.regional_indicator = false; return true; } else { - setRegional(state); + state.regional_indicator = true; return false; } } // GB11: Emoji Extend* ZWJ x Emoji - if (hasXpic(state) and + if (state.extended_pictographic and gbc1 == .zwj and gbc2 == .extended_pictographic) { - unsetXpic(state); + state.extended_pictographic = false; return false; } return true; } -const State = packed struct(u2) { - extended_pictographic: bool = false, - regional_indicator: bool = false, -}; - -fn hasXpic(state: *const u3) bool { - return state.* & 1 == 1; -} - -fn setXpic(state: *u3) void { - state.* |= 1; -} - -fn unsetXpic(state: *u3) void { - state.* ^= 1; -} - -fn hasRegional(state: *const u3) bool { - return state.* & 2 == 2; -} - -fn setRegional(state: *u3) void { - state.* |= 2; -} - -fn unsetRegional(state: *u3) void { - state.* ^= 2; -} - /// If you build this file as a binary, we will verify the grapheme break /// implementation. This iterates over billions of codepoints so it is /// SLOW. It's not meant to be run in CI, but it's useful for debugging. 
@@ -127,7 +106,7 @@ pub fn main() !void { const min = 0; const max = std.math.maxInt(u21) + 1; - var state: u3 = 0; + var state: BreakState = .{}; var zg_state: u3 = 0; for (min..max) |cp1| { if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 3cc4779ed..e8ba05b72 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -5,6 +5,7 @@ const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; pub const graphemeBreak = grapheme.graphemeBreak; +pub const GraphemeBreakState = grapheme.BreakState; test { @import("std").testing.refAllDecls(@This()); From 6f8b4204b99463a264b1d2311bce46db7634023e Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 20:31:20 -0800 Subject: [PATCH 6/7] terminal: use new grapheme break algo --- src/terminal/Terminal.zig | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index 4d5616003..8931f9819 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -6,7 +6,6 @@ const Terminal = @This(); const std = @import("std"); const builtin = @import("builtin"); -const ziglyph = @import("ziglyph"); const testing = std.testing; const assert = std.debug.assert; const Allocator = std.mem.Allocator; @@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void { if (prev.cell.char == 0) break :grapheme; const grapheme_break = brk: { - var state: u3 = 0; + var state: unicode.GraphemeBreakState = .{}; var cp1: u21 = @intCast(prev.cell.char); if (prev.cell.attrs.grapheme) { var it = row.codepointIterator(prev.x); while (it.next()) |cp2| { // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 }); - assert(!ziglyph.graphemeBreak( - cp1, - cp2, - &state, - )); - + assert(!unicode.graphemeBreak(cp1, cp2, &state)); cp1 = cp2; } } // log.debug("cp1={x} cp2={x} end", .{ cp1, c }); - break :brk ziglyph.graphemeBreak(cp1, c, 
&state); + break :brk unicode.graphemeBreak(cp1, c, &state); }; // If we can NOT break, this means that "c" is part of a grapheme From 5275d44e7dc4c7f86978b5bdf285b5d4f45eb0e9 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 20:50:13 -0800 Subject: [PATCH 7/7] unicode: precompute grapheme break data --- src/unicode/grapheme.zig | 66 +++++++++++++++++++++++++++++++++------- src/unicode/props.zig | 2 +- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index d4c146e49..09f452114 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -3,14 +3,6 @@ const props = @import("props.zig"); const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; -// The algorithm in this file is based on the Ziglyph and utf8proc algorithm, -// only modified to use our own lookup tables. -// -// I'll note I also tried a fully precomputed table approach where all -// combinations of state and boundary classes were precomputed. It was -// marginally faster (about 2%) but the table is a few KB and I'm not -// sure it's worth it. - /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. /// @@ -19,9 +11,15 @@ const table = props.table; /// calling this function. This is because this function is tuned for /// Ghostty. pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { - const gbc1 = table.get(cp1).grapheme_boundary_class; - const gbc2 = table.get(cp2).grapheme_boundary_class; - return graphemeBreakClass(gbc1, gbc2, state); + const value = Precompute.data[ + (Precompute.Key{ + .gbc1 = table.get(cp1).grapheme_boundary_class, + .gbc2 = table.get(cp2).grapheme_boundary_class, + .state = state.*, + }).index() + ]; + state.* = value.state; + return value.result; } /// The state that must be maintained between calls to `graphemeBreak`. 
@@ -30,6 +28,52 @@ pub const BreakState = packed struct(u2) { regional_indicator: bool = false, }; +/// This is all the structures and data for the precomputed lookup table +/// for all possible permutations of state and grapheme boundary classes. +/// Precomputation only requires 2^10 keys of 3 bit values so the whole +/// table is about 1KB (one slot for every possible u10 key). +const Precompute = struct { + const Key = packed struct(u10) { + state: BreakState, + gbc1: GraphemeBoundaryClass, + gbc2: GraphemeBoundaryClass, + + fn index(self: Key) usize { + return @intCast(@as(u10, @bitCast(self))); + } + }; + + const Value = packed struct(u3) { + result: bool, + state: BreakState, + }; + + const data = precompute: { + var result: [std.math.maxInt(u10) + 1]Value = undefined; + + @setEvalBranchQuota(2_000); + const info = @typeInfo(GraphemeBoundaryClass).Enum; + for (0..std.math.maxInt(u2) + 1) |state_init| { + for (info.fields) |field1| { + for (info.fields) |field2| { + var state: BreakState = @bitCast(@as(u2, @intCast(state_init))); + const key: Key = .{ + .gbc1 = @field(GraphemeBoundaryClass, field1.name), + .gbc2 = @field(GraphemeBoundaryClass, field2.name), + .state = state, + }; + const v = graphemeBreakClass(key.gbc1, key.gbc2, &state); + result[key.index()] = .{ .result = v, .state = state }; + } + } + } + + break :precompute result; + }; +}; + +/// This is the algorithm from utf8proc. We only use this offline for +/// precomputing the lookup table. fn graphemeBreakClass( gbc1: GraphemeBoundaryClass, gbc2: GraphemeBoundaryClass, diff --git a/src/unicode/props.zig b/src/unicode/props.zig index d6f282ed9..d83f0f699 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -60,7 +60,7 @@ pub const Properties = struct { /// Possible grapheme boundary classes. This isn't an exhaustive list: /// we omit control, CR, LF, etc. because in Ghostty's usage that are /// impossible because they're handled by the terminal.
-pub const GraphemeBoundaryClass = enum { +pub const GraphemeBoundaryClass = enum(u4) { invalid, L, V,