From 025895dc905505517bcde56bf0e58a44d54c0428 Mon Sep 17 00:00:00 2001 From: Jacob Sandlund Date: Fri, 4 Jul 2025 14:41:50 -0400 Subject: [PATCH] comparing to old version of table (ziglyph based) --- src/build/GhosttyUnicodeTest.zig | 11 ++ src/build/UnicodeTables.zig | 12 ++ src/unicode/grapheme.zig | 14 +++ src/unicode/main.zig | 210 +++++++++++++++++++++++-------- src/unicode/props.zig | 56 ++++++++- 5 files changed, 250 insertions(+), 53 deletions(-) diff --git a/src/build/GhosttyUnicodeTest.zig b/src/build/GhosttyUnicodeTest.zig index db3575c79..8421e9fae 100644 --- a/src/build/GhosttyUnicodeTest.zig +++ b/src/build/GhosttyUnicodeTest.zig @@ -3,6 +3,7 @@ const UnicodeTest = @This(); const std = @import("std"); const Config = @import("Config.zig"); const SharedDeps = @import("SharedDeps.zig"); +const UnicodeTables = @import("UnicodeTables.zig"); /// The unicode test executable. exe: *std.Build.Step.Compile, @@ -27,6 +28,7 @@ pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !Unicode // Add the shared dependencies _ = try deps.add(exe); + // Add ziglyph just for unicode-test if (b.lazyDependency("ziglyph", .{ .target = cfg.target, .optimize = cfg.optimize, @@ -34,6 +36,15 @@ pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !Unicode exe.root_module.addImport("ziglyph", dep.module("ziglyph")); } + // Add the old version of the unicode tables + const old_unicode_tables = try UnicodeTables.init(b); + old_unicode_tables.run.addArg("old"); + + old_unicode_tables.output.addStepDependencies(&exe.step); + exe.root_module.addAnonymousImport("old_unicode_tables", .{ + .root_source_file = old_unicode_tables.output, + }); + return .{ .exe = exe, .install_step = install_step, diff --git a/src/build/UnicodeTables.zig b/src/build/UnicodeTables.zig index 10f2064f7..ef2fc8cd9 100644 --- a/src/build/UnicodeTables.zig +++ b/src/build/UnicodeTables.zig @@ -6,6 +6,9 @@ const Config = @import("Config.zig"); /// The exe. exe: *std.Build.Step.Compile, +/// The run artifact for the exe. +run: *std.Build.Step.Run, + /// The output path for the unicode tables output: std.Build.LazyPath, @@ -28,9 +31,18 @@ pub fn init(b: *std.Build) !UnicodeTables { exe.root_module.addImport("DisplayWidth", dep.module("DisplayWidth")); } + // Only used if we're building the old unicode tables + if (b.lazyDependency("ziglyph", .{ + .target = b.graph.host, + })) |dep| { + exe.root_module.addImport("ziglyph", dep.module("ziglyph")); + } + const run = b.addRunArtifact(exe); + return .{ .exe = exe, + .run = run, .output = run.captureStdOut(), }; } diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 66ba27b12..e88c9820e 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -2,6 +2,7 @@ const std = @import("std"); const props = @import("props.zig"); const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; +const oldTable = props.oldTable; /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. @@ -22,6 +23,19 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { return value.result; } +/// Only used for unicode-test. +pub fn oldGraphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { + const value = Precompute.data[ + (Precompute.Key{ + .gbc1 = oldTable.get(cp1).grapheme_boundary_class, + .gbc2 = oldTable.get(cp2).grapheme_boundary_class, + .state = state.*, + }).index() + ]; + state.* = value.state; + return value.result; +} + /// The state that must be maintained between calls to `graphemeBreak`. pub const BreakState = packed struct(u2) { extended_pictographic: bool = false, diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 8d297569a..e31f3f623 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -14,14 +14,24 @@ test { /// Build Ghostty with `zig build -Doptimize=ReleaseFast -Demit-unicode-test`. /// -/// Usage: ./zig-out/bin/unicode-test [grapheme|width|all] [zg|ziglyph|all] -/// -/// grapheme: this will verify the grapheme break implementation. This -/// iterates over billions of codepoints so it is SLOW. +/// Usage: ./zig-out/bin/unicode-test [width|class|break|all] [zg|ziglyph|old|all] /// /// width: this verifies the table codepoint widths match +/// class: this verifies the table grapheme boundary classes match +/// break: this will verify the grapheme break implementation. This +/// iterates over billions of codepoints so it is SLOW. +/// /// zg: compare grapheme/width against zg /// ziglyph: compare grapheme/width against ziglyph +/// old: compare grapheme against old implementation +/// +/// Note: To enable `old` comparisons, uncomment sections of these files +/// (search for "old"): +/// * ./main.zig (this file) +/// * ./props.zig +/// * ./grapheme.zig +/// * src/build/GhosttyUnicodeTest.zig +/// * src/build/UnicodeTables.zig pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); @@ -42,59 +52,12 @@ pub fn main() !void { const compareAll = args.len < 3 or std.mem.eql(u8, args[2], "all"); const compareZg = compareAll or std.mem.eql(u8, args[2], "zg"); const compareZiglyph = compareAll or std.mem.eql(u8, args[2], "ziglyph"); + const compareOld = compareAll or std.mem.eql(u8, args[2], "old"); // Set the min and max to control the test range. const min = 0; const max = 0x110000; - var state: GraphemeBreakState = .{}; - var zg_state: Graphemes.State = .{}; - var ziglyph_state: u3 = 0; - - if (testAll or std.mem.eql(u8, args[1], "grapheme")) { - std.log.info("============== testing grapheme break ===============", .{}); - - for (min..max) |cp1| { - if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1}); - - if (cp1 == '\r' or cp1 == '\n' or - Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue; - - for (min..max) |cp2| { - if (cp2 == '\r' or cp2 == '\n' or - Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue; - - const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); - if (compareZg) { - const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg.graphemes, &zg_state); - if (gb != zg_gb) { - std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{ - cp1, - cp2, - gb, - zg_gb, - state, - zg_state, - }); - } - } - if (compareZiglyph) { - const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state); - if (gb != ziglyph_gb) { - std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{ - cp1, - cp2, - gb, - ziglyph_gb, - state, - ziglyph_state, - }); - } - } - } - } - } - if (testAll or std.mem.eql(u8, args[1], "width")) { std.log.info("============== testing codepoint width ==============", .{}); @@ -116,6 +79,149 @@ pub fn main() !void { } } } + + if (testAll or std.mem.eql(u8, args[1], "class")) { + std.log.info("============== testing grapheme boundary class ======", .{}); + + for (min..max) |cp| { + if (cp % 0x10000 == 0) std.log.info("progress: cp={x}", .{cp}); + + const t = table.get(@intCast(cp)); + + if (compareZg) { + const gbp = Graphemes.gbp(zg.graphemes, @intCast(cp)); + const matches = switch (t.grapheme_boundary_class) { + .extended_pictographic_base => gbp == .Emoji_Modifier_Base, + .emoji_modifier => gbp == .Emoji_Modifier, + .extended_pictographic => gbp == .Extended_Pictographic, + .L => gbp == .L, + .V => gbp == .V, + .T => gbp == .T, + .LV => gbp == .LV, + .LVT => gbp == .LVT, + .prepend => gbp == .Prepend, + .extend => gbp == .Extend, + .zwj => gbp == .ZWJ, + .spacing_mark => gbp == .SpacingMark, + .regional_indicator => gbp == .Regional_Indicator, + .invalid => gbp == .none or gbp == .Control or gbp == .CR or gbp == .LF, + }; + + if (!matches) { + std.log.warn("[zg mismatch] cp={x} t={} zg={}", .{ cp, t.grapheme_boundary_class, gbp }); + } + } + + if (compareZiglyph) { + const ziglyph_valid = (ziglyph.emoji.isEmojiModifierBase(@intCast(cp)) or + ziglyph.emoji.isEmojiModifier(@intCast(cp)) or + ziglyph.emoji.isExtendedPictographic(@intCast(cp)) or + ziglyph.grapheme_break.isL(@intCast(cp)) or + ziglyph.grapheme_break.isV(@intCast(cp)) or + ziglyph.grapheme_break.isT(@intCast(cp)) or + ziglyph.grapheme_break.isLv(@intCast(cp)) or + ziglyph.grapheme_break.isLvt(@intCast(cp)) or + ziglyph.grapheme_break.isPrepend(@intCast(cp)) or + ziglyph.grapheme_break.isExtend(@intCast(cp)) or + ziglyph.grapheme_break.isZwj(@intCast(cp)) or + ziglyph.grapheme_break.isSpacingmark(@intCast(cp)) or + ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp))); + + const matches = switch (t.grapheme_boundary_class) { + .extended_pictographic_base => ziglyph.emoji.isEmojiModifierBase(@intCast(cp)), + .emoji_modifier => ziglyph.emoji.isEmojiModifier(@intCast(cp)), + .extended_pictographic => ziglyph.emoji.isExtendedPictographic(@intCast(cp)), + .L => ziglyph.grapheme_break.isL(@intCast(cp)), + .V => ziglyph.grapheme_break.isV(@intCast(cp)), + .T => ziglyph.grapheme_break.isT(@intCast(cp)), + .LV => ziglyph.grapheme_break.isLv(@intCast(cp)), + .LVT => ziglyph.grapheme_break.isLvt(@intCast(cp)), + .prepend => ziglyph.grapheme_break.isPrepend(@intCast(cp)), + .extend => ziglyph.grapheme_break.isExtend(@intCast(cp)), + .zwj => ziglyph.grapheme_break.isZwj(@intCast(cp)), + .spacing_mark => ziglyph.grapheme_break.isSpacingmark(@intCast(cp)), + .regional_indicator => ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp)), + .invalid => !ziglyph_valid, + }; + + if (!matches) { + std.log.warn("[ziglyph mismatch] cp={x} t={} ziglyph_valid={}", .{ cp, t.grapheme_boundary_class, ziglyph_valid }); + } + } + + if (compareOld) { + const oldT = props.oldTable.get(@intCast(cp)); + if (oldT.grapheme_boundary_class != t.grapheme_boundary_class) { + std.log.warn("[old mismatch] cp={x} t={} old={}", .{ cp, t.grapheme_boundary_class, oldT.grapheme_boundary_class }); + } + } + } + } + + var state: GraphemeBreakState = .{}; + var old_state: GraphemeBreakState = .{}; + var zg_state: Graphemes.State = .{}; + var ziglyph_state: u3 = 0; + + if (testAll or std.mem.eql(u8, args[1], "break")) { + std.log.info("============== testing grapheme break ===============", .{}); + + for (min..max) |cp1| { + if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1}); + + if (cp1 == '\r' or cp1 == '\n' or + Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue; + + for (min..max) |cp2| { + if (cp2 == '\r' or cp2 == '\n' or + Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue; + + const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); + + if (compareOld) { + const old_gb = grapheme.oldGraphemeBreak(@intCast(cp1), @intCast(cp2), &old_state); + if (gb != old_gb) { + std.log.warn("[old mismatch] cp1={x} cp2={x} gb={} old_gb={} state={} old_state={}", .{ + cp1, + cp2, + gb, + old_gb, + state, + old_state, + }); + } + } + + if (compareZg) { + const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg.graphemes, &zg_state); + if (gb != zg_gb) { + std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{ + cp1, + cp2, + gb, + zg_gb, + state, + zg_state, + }); + } + } + + if (compareZiglyph) { + const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state); + if (gb != ziglyph_gb) { + std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{ + cp1, + cp2, + gb, + ziglyph_gb, + state, + ziglyph_state, + }); + } + } + } + } + } } pub const std_options: std.Options = .{ diff --git a/src/unicode/props.zig b/src/unicode/props.zig index 86553a8bf..3b8364235 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -8,6 +8,9 @@ const lut = @import("lut.zig"); graphemes: Graphemes, display_width: DisplayWidth, +// Whether to use the old implementation based on ziglyph. +old: bool = false, + // Public only for unicode-test pub fn init(alloc: std.mem.Allocator) !props { const graphemes = try Graphemes.init(alloc); @@ -36,6 +39,19 @@ pub const table = table: { }; }; +/// The old lookup tables for Ghostty. Only used for unicode-test. +pub const oldTable = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. + const generated = @import("old_unicode_tables").Tables(Properties); + const Tables = lut.Tables(Properties); + break :table Tables{ + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + .stage3 = &generated.stage3, + }; +}; + /// Property set per codepoint that Ghostty cares about. /// /// Adding to this lets you find new properties but also potentially makes @@ -120,6 +136,36 @@ pub const GraphemeBoundaryClass = enum(u4) { }; } + pub fn initOld(cp: u21) GraphemeBoundaryClass { + const ziglyph = @import("ziglyph"); + + // We special-case modifier bases because we should not break + // if a modifier isn't next to a base. + if (ziglyph.emoji.isEmojiModifierBase(cp)) { + assert(ziglyph.emoji.isExtendedPictographic(cp)); + return .extended_pictographic_base; + } + + if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier; + if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic; + if (ziglyph.grapheme_break.isL(cp)) return .L; + if (ziglyph.grapheme_break.isV(cp)) return .V; + if (ziglyph.grapheme_break.isT(cp)) return .T; + if (ziglyph.grapheme_break.isLv(cp)) return .LV; + if (ziglyph.grapheme_break.isLvt(cp)) return .LVT; + if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend; + if (ziglyph.grapheme_break.isExtend(cp)) return .extend; + if (ziglyph.grapheme_break.isZwj(cp)) return .zwj; + if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark; + if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator; + + // This is obviously not INVALID invalid, there is SOME grapheme + // boundary class for every codepoint. But we don't care about + // anything that doesn't fit into the above categories. + + return .invalid; + } + /// Returns true if this is an extended pictographic type. This /// should be used instead of comparing the enum value directly /// because we classify multiple. @@ -145,7 +191,8 @@ pub fn get(ctx: props, cp: u21) !Properties { return .{ .width = @intCast(@min(2, @max(0, zg_width))), - .grapheme_boundary_class = .init(ctx, cp), + //.grapheme_boundary_class = .init(ctx, cp), + .grapheme_boundary_class = if (ctx.old) .initOld(cp) else .init(ctx, cp), }; } } @@ -161,9 +208,16 @@ pub fn main() !void { defer arena_state.deinit(); const alloc = arena_state.allocator(); + const args = try std.process.argsAlloc(alloc); + defer std.process.argsFree(alloc, args); + var self = try init(alloc); defer self.deinit(alloc); + if (args.len > 1 and std.mem.eql(u8, args[1], "old")) { + self.old = true; + } + const gen: lut.Generator( Properties, props,