From 5f3574a4bfc33a19e0b6588ff67709afae0622bd Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 9 Feb 2024 19:44:57 -0800 Subject: [PATCH] unicode: direct port of ziglyph to start --- build.zig | 11 ++ src/bench/grapheme-break.sh | 5 +- src/bench/grapheme-break.zig | 31 ++++- src/unicode/grapheme.zig | 261 +++++++++++++++++++++++++++++++++++ src/unicode/main.zig | 1 + src/unicode/props.zig | 11 +- 6 files changed, 316 insertions(+), 4 deletions(-) create mode 100644 src/unicode/grapheme.zig diff --git a/build.zig b/build.zig index 445cf4a98..fd49b7e62 100644 --- a/build.zig +++ b/build.zig @@ -217,6 +217,17 @@ pub fn build(b: *std.Build) !void { // Add our benchmarks try benchSteps(b, target, config, emit_bench); + { + const exe = b.addExecutable(.{ + .name = "grapheme-verify", + .root_source_file = .{ .path = "src/unicode/grapheme.zig" }, + .target = target, + .optimize = .ReleaseFast, + }); + b.installArtifact(exe); + _ = try addDeps(b, exe, config); + } + // We only build an exe if we have a runtime set. const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{ .name = "ghostty", diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh index 56bd28dd1..c395c3799 100755 --- a/src/bench/grapheme-break.sh +++ b/src/bench/grapheme-break.sh @@ -28,5 +28,8 @@ hyperfine \ -n ziglyph \ "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} try benchNoop(reader, buf), .ziglyph => try benchZiglyph(reader, buf), + .table => try benchTable(reader, buf), .utf8proc => try benchUtf8proc(reader, buf), } } @@ -92,6 +96,31 @@ noinline fn benchNoop( } } +noinline fn benchTable( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + var state: u3 = 0; + var cp1: u21 = 0; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Feeding the decoder one byte at a time in a for loop is a naive + // scalar approach. 
+ for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp2| { + const v = unicode.graphemeBreak(cp1, @intCast(cp2), &state); + buf[0] = @intCast(@intFromBool(v)); + cp1 = cp2; + } + } + } +} + noinline fn benchZiglyph( reader: anytype, buf: []u8, diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig new file mode 100644 index 000000000..f5a39f863 --- /dev/null +++ b/src/unicode/grapheme.zig @@ -0,0 +1,261 @@ +const std = @import("std"); +const props = @import("props.zig"); +const table = props.table; + +/// Returns true if there is a grapheme cluster break between cp1 and cp2; state carries emoji and regional-indicator context across successive calls. +pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { + const gbc1 = table.get(cp1).grapheme_boundary_class; + const gbc2 = table.get(cp2).grapheme_boundary_class; + // std.log.warn("gbc1={} gbc2={}, new1={} new2={}", .{ + // gbc1, + // gbc2, + // props.GraphemeBoundaryClass.init(cp1), + // props.GraphemeBoundaryClass.init(cp2), + // }); + + // GB11: Emoji Extend* ZWJ x Emoji + if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); + + // These two rules are ignored because they're not relevant to + // Ghostty -- they're filtered out before checking grapheme boundaries. 
+ // GB3: CR x LF + // GB4: Control + + // GB6: Hangul L x (L|V|LV|LVT) + if (gbc1 == .L) { + if (gbc2 == .L or + gbc2 == .V or + gbc2 == .LV or + gbc2 == .LVT) return false; + } + + // GB7: Hangul (LV | V) x (V | T) + if (gbc1 == .LV or gbc1 == .V) { + if (gbc2 == .V or + gbc2 == .T) return false; + } + + // GB8: Hangul (LVT | T) x T + if (gbc1 == .LVT or gbc1 == .T) { + if (gbc2 == .T) return false; + } + + // GB9: x (Extend | ZWJ) + if (gbc2 == .extend or gbc2 == .zwj) return false; + + // GB9a: x SpacingMark + if (gbc2 == .spacing_mark) return false; + + // GB9b: Prepend x + if (gbc1 == .prepend) return false; + + // GB12, GB13: RI x RI + if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) { + if (hasRegional(state)) { + unsetRegional(state); + return true; + } else { + setRegional(state); + return false; + } + } + + // GB11: Emoji Extend* ZWJ x Emoji + if (hasXpic(state) and + gbc1 == .zwj and + gbc2 == .extended_pictographic) + { + unsetXpic(state); + return false; + } + + return true; +} + +const emoji = @import("ziglyph").emoji; +const gbp = @import("ziglyph").grapheme_break; + +fn isBreaker(cp: u21) bool { + return cp == '\x0d' or cp == '\x0a' or gbp.isControl(cp); +} + +pub fn zg_graphemeBreak( + cp1: u21, + cp2: u21, + state: *u3, +) bool { + + // GB11: Emoji Extend* ZWJ x Emoji + if (!hasXpic(state) and emoji.isExtendedPictographic(cp1)) setXpic(state); + + // GB3: CR x LF + if (cp1 == '\r' and cp2 == '\n') { + std.log.warn("GB3", .{}); + return false; + } + + // GB4: Control + if (isBreaker(cp1)) { + std.log.warn("GB4", .{}); + return true; + } + + // GB6: Hangul L x (L|V|LV|LVT) + if (gbp.isL(cp1)) { + if (gbp.isL(cp2) or + gbp.isV(cp2) or + gbp.isLv(cp2) or + gbp.isLvt(cp2)) + { + std.log.warn("GB6", .{}); + return false; + } + } + + // GB7: Hangul (LV | V) x (V | T) + if (gbp.isLv(cp1) or gbp.isV(cp1)) { + if (gbp.isV(cp2) or + gbp.isT(cp2)) + { + std.log.warn("GB7", .{}); + return false; + } + } + + // GB8: Hangul (LVT | T) x T + if 
(gbp.isLvt(cp1) or gbp.isT(cp1)) { + if (gbp.isT(cp2)) { + std.log.warn("GB8", .{}); + return false; + } + } + + // GB9: x (Extend | ZWJ) + if (gbp.isExtend(cp2) or gbp.isZwj(cp2)) { + std.log.warn("GB9b", .{}); + return false; + } + + // GB9a: x SpacingMark + if (gbp.isSpacingmark(cp2)) { + std.log.warn("GB9a", .{}); + return false; + } + + // GB9b: Prepend x + if (gbp.isPrepend(cp1) and !isBreaker(cp2)) { + std.log.warn("GB9b cp1={x} prepend={}", .{ cp1, gbp.isPrepend(cp1) }); + return false; + } + + // GB12, GB13: RI x RI + if (gbp.isRegionalIndicator(cp1) and gbp.isRegionalIndicator(cp2)) { + if (hasRegional(state)) { + unsetRegional(state); + std.log.warn("GB12", .{}); + return true; + } else { + std.log.warn("GB13", .{}); + setRegional(state); + return false; + } + } + + // GB11: Emoji Extend* ZWJ x Emoji + if (hasXpic(state) and + gbp.isZwj(cp1) and + emoji.isExtendedPictographic(cp2)) + { + std.log.warn("GB11", .{}); + unsetXpic(state); + return false; + } + + return true; +} + +fn hasXpic(state: *const u3) bool { + return state.* & 1 == 1; +} + +fn setXpic(state: *u3) void { + state.* |= 1; +} + +fn unsetXpic(state: *u3) void { + state.* ^= 1; // XOR toggles bit 0; the callers here check hasXpic first +} + +fn hasRegional(state: *const u3) bool { + return state.* & 2 == 2; +} + +fn setRegional(state: *u3) void { + state.* |= 2; +} + +fn unsetRegional(state: *u3) void { + state.* ^= 2; // XOR toggles bit 1; the callers here check hasRegional first +} + +/// If you build this file as a binary, we will verify the grapheme break +/// implementation against ziglyph. This iterates over every pair of codepoints (up to 2^42 pairs), so it is +/// SLOW. It's not meant to be run in CI, but it's useful for debugging. 
+pub fn main() !void { + const ziglyph = @import("ziglyph"); + + var state: u3 = 0; + var zg_state: u3 = 0; // NOTE(review): neither state is reset between pairs, so each result depends on earlier pairs + for (0..std.math.maxInt(u21) + 1) |cp1| { + if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); + + if (cp1 == '\r' or cp1 == '\n' or + ziglyph.grapheme_break.isControl(@intCast(cp1))) continue; // skip CR, LF and control codepoints + + for (0..std.math.maxInt(u21) + 1) |cp2| { + if (cp2 == '\r' or cp2 == '\n' or + ziglyph.grapheme_break.isControl(@intCast(cp2))) continue; // skip CR, LF and control codepoints + + const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); + const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); + if (gb != zg_gb) { + std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ + cp1, + cp2, + gb, + state, + zg_gb, + zg_state, + }); + } + } + } +} + +pub const std_options = struct { + pub const log_level: std.log.Level = .info; +}; + +// test "matches ziglyph specific" { +// const testing = std.testing; +// +// var state: u3 = 0; +// var zg_state: u3 = 0; +// +// const cp1 = 0x20; +// const cp2 = 0x300; +// +// const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); +// const zg_gb = zg_graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); +// if (gb != zg_gb) { +// std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ +// cp1, +// cp2, +// gb, +// state, +// zg_gb, +// zg_state, +// }); +// try testing.expect(false); +// } +// } diff --git a/src/unicode/main.zig b/src/unicode/main.zig index fa0cb9fc8..1af26d485 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,5 +1,6 @@ pub const lut = @import("lut.zig"); +pub usingnamespace @import("grapheme.zig"); const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; diff --git a/src/unicode/props.zig b/src/unicode/props.zig index d46acbf49..d6f282ed9 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -32,7 +32,8 @@ pub const Properties = struct { // Needed for lut.Generator pub fn eql(a: Properties, b: 
Properties) bool { - return a.width == b.width; + return a.width == b.width and + a.grapheme_boundary_class == b.grapheme_boundary_class; } // Needed for lut.Generator @@ -44,8 +45,14 @@ pub const Properties = struct { ) !void { _ = layout; _ = opts; - try std.fmt.format(writer, ".{{ .width= {}, }}", .{ + try std.fmt.format(writer, + \\.{{ + \\ .width= {}, + \\ .grapheme_boundary_class= .{s}, + \\}} + , .{ self.width, + @tagName(self.grapheme_boundary_class), }); } };