comparing to old version of table (ziglyph based)

This commit is contained in:
Jacob Sandlund
2025-07-04 14:41:50 -04:00
parent c0d818aac5
commit 025895dc90
5 changed files with 250 additions and 53 deletions

View File

@ -3,6 +3,7 @@ const UnicodeTest = @This();
const std = @import("std");
const Config = @import("Config.zig");
const SharedDeps = @import("SharedDeps.zig");
const UnicodeTables = @import("UnicodeTables.zig");
/// The unicode test executable.
exe: *std.Build.Step.Compile,
@ -27,6 +28,7 @@ pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !Unicode
// Add the shared dependencies
_ = try deps.add(exe);
// Add ziglyph just for unicode-test
if (b.lazyDependency("ziglyph", .{
.target = cfg.target,
.optimize = cfg.optimize,
@ -34,6 +36,15 @@ pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !Unicode
exe.root_module.addImport("ziglyph", dep.module("ziglyph"));
}
// Add the old version of the unicode tables
const old_unicode_tables = try UnicodeTables.init(b);
old_unicode_tables.run.addArg("old");
old_unicode_tables.output.addStepDependencies(&exe.step);
exe.root_module.addAnonymousImport("old_unicode_tables", .{
.root_source_file = old_unicode_tables.output,
});
return .{
.exe = exe,
.install_step = install_step,

View File

@ -6,6 +6,9 @@ const Config = @import("Config.zig");
/// The exe.
exe: *std.Build.Step.Compile,
/// The run artifact for the exe.
run: *std.Build.Step.Run,
/// The output path for the unicode tables
output: std.Build.LazyPath,
@ -28,9 +31,18 @@ pub fn init(b: *std.Build) !UnicodeTables {
exe.root_module.addImport("DisplayWidth", dep.module("DisplayWidth"));
}
// Only used if we're building the old unicode tables
if (b.lazyDependency("ziglyph", .{
.target = b.graph.host,
})) |dep| {
exe.root_module.addImport("ziglyph", dep.module("ziglyph"));
}
const run = b.addRunArtifact(exe);
return .{
.exe = exe,
.run = run,
.output = run.captureStdOut(),
};
}

View File

@ -2,6 +2,7 @@ const std = @import("std");
const props = @import("props.zig");
const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
const table = props.table;
const oldTable = props.oldTable;
/// Determines if there is a grapheme break between two codepoints. This
/// must be called sequentially maintaining the state between calls.
@ -22,6 +23,19 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
return value.result;
}
/// Only used for unicode-test.
pub fn oldGraphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
const value = Precompute.data[
(Precompute.Key{
.gbc1 = oldTable.get(cp1).grapheme_boundary_class,
.gbc2 = oldTable.get(cp2).grapheme_boundary_class,
.state = state.*,
}).index()
];
state.* = value.state;
return value.result;
}
/// The state that must be maintained between calls to `graphemeBreak`.
pub const BreakState = packed struct(u2) {
extended_pictographic: bool = false,

View File

@ -14,14 +14,24 @@ test {
/// Build Ghostty with `zig build -Doptimize=ReleaseFast -Demit-unicode-test`.
///
/// Usage: ./zig-out/bin/unicode-test [grapheme|width|all] [zg|ziglyph|all]
///
/// grapheme: this will verify the grapheme break implementation. This
/// iterates over billions of codepoints so it is SLOW.
/// Usage: ./zig-out/bin/unicode-test [width|class|break|all] [zg|ziglyph|old|all]
///
/// width: this verifies the table codepoint widths match
/// class: this verifies the table grapheme boundary classes match
/// break: this will verify the grapheme break implementation. This
/// iterates over billions of codepoints so it is SLOW.
///
/// zg: compare grapheme/width against zg
/// ziglyph: compare grapheme/width against ziglyph
/// old: compare grapheme against old implementation
///
/// Note: To enable `old` comparisons, uncomment sections of these files
/// (search for "old"):
/// * ./main.zig (this file)
/// * ./props.zig
/// * ./grapheme.zig
/// * src/build/GhosttyUnicodeTest.zig
/// * src/build/UnicodeTables.zig
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
@ -42,59 +52,12 @@ pub fn main() !void {
const compareAll = args.len < 3 or std.mem.eql(u8, args[2], "all");
const compareZg = compareAll or std.mem.eql(u8, args[2], "zg");
const compareZiglyph = compareAll or std.mem.eql(u8, args[2], "ziglyph");
const compareOld = compareAll or std.mem.eql(u8, args[2], "old");
// Set the min and max to control the test range.
const min = 0;
const max = 0x110000;
var state: GraphemeBreakState = .{};
var zg_state: Graphemes.State = .{};
var ziglyph_state: u3 = 0;
if (testAll or std.mem.eql(u8, args[1], "grapheme")) {
std.log.info("============== testing grapheme break ===============", .{});
for (min..max) |cp1| {
if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1});
if (cp1 == '\r' or cp1 == '\n' or
Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue;
for (min..max) |cp2| {
if (cp2 == '\r' or cp2 == '\n' or
Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue;
const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
if (compareZg) {
const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg.graphemes, &zg_state);
if (gb != zg_gb) {
std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{
cp1,
cp2,
gb,
zg_gb,
state,
zg_state,
});
}
}
if (compareZiglyph) {
const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state);
if (gb != ziglyph_gb) {
std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{
cp1,
cp2,
gb,
ziglyph_gb,
state,
ziglyph_state,
});
}
}
}
}
}
if (testAll or std.mem.eql(u8, args[1], "width")) {
std.log.info("============== testing codepoint width ==============", .{});
@ -116,6 +79,149 @@ pub fn main() !void {
}
}
}
if (testAll or std.mem.eql(u8, args[1], "class")) {
std.log.info("============== testing grapheme boundary class ======", .{});
for (min..max) |cp| {
if (cp % 0x10000 == 0) std.log.info("progress: cp={x}", .{cp});
const t = table.get(@intCast(cp));
if (compareZg) {
const gbp = Graphemes.gbp(zg.graphemes, @intCast(cp));
const matches = switch (t.grapheme_boundary_class) {
.extended_pictographic_base => gbp == .Emoji_Modifier_Base,
.emoji_modifier => gbp == .Emoji_Modifier,
.extended_pictographic => gbp == .Extended_Pictographic,
.L => gbp == .L,
.V => gbp == .V,
.T => gbp == .T,
.LV => gbp == .LV,
.LVT => gbp == .LVT,
.prepend => gbp == .Prepend,
.extend => gbp == .Extend,
.zwj => gbp == .ZWJ,
.spacing_mark => gbp == .SpacingMark,
.regional_indicator => gbp == .Regional_Indicator,
.invalid => gbp == .none or gbp == .Control or gbp == .CR or gbp == .LF,
};
if (!matches) {
std.log.warn("[zg mismatch] cp={x} t={} zg={}", .{ cp, t.grapheme_boundary_class, gbp });
}
}
if (compareZiglyph) {
const ziglyph_valid = (ziglyph.emoji.isEmojiModifierBase(@intCast(cp)) or
ziglyph.emoji.isEmojiModifier(@intCast(cp)) or
ziglyph.emoji.isExtendedPictographic(@intCast(cp)) or
ziglyph.grapheme_break.isL(@intCast(cp)) or
ziglyph.grapheme_break.isV(@intCast(cp)) or
ziglyph.grapheme_break.isT(@intCast(cp)) or
ziglyph.grapheme_break.isLv(@intCast(cp)) or
ziglyph.grapheme_break.isLvt(@intCast(cp)) or
ziglyph.grapheme_break.isPrepend(@intCast(cp)) or
ziglyph.grapheme_break.isExtend(@intCast(cp)) or
ziglyph.grapheme_break.isZwj(@intCast(cp)) or
ziglyph.grapheme_break.isSpacingmark(@intCast(cp)) or
ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp)));
const matches = switch (t.grapheme_boundary_class) {
.extended_pictographic_base => ziglyph.emoji.isEmojiModifierBase(@intCast(cp)),
.emoji_modifier => ziglyph.emoji.isEmojiModifier(@intCast(cp)),
.extended_pictographic => ziglyph.emoji.isExtendedPictographic(@intCast(cp)),
.L => ziglyph.grapheme_break.isL(@intCast(cp)),
.V => ziglyph.grapheme_break.isV(@intCast(cp)),
.T => ziglyph.grapheme_break.isT(@intCast(cp)),
.LV => ziglyph.grapheme_break.isLv(@intCast(cp)),
.LVT => ziglyph.grapheme_break.isLvt(@intCast(cp)),
.prepend => ziglyph.grapheme_break.isPrepend(@intCast(cp)),
.extend => ziglyph.grapheme_break.isExtend(@intCast(cp)),
.zwj => ziglyph.grapheme_break.isZwj(@intCast(cp)),
.spacing_mark => ziglyph.grapheme_break.isSpacingmark(@intCast(cp)),
.regional_indicator => ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp)),
.invalid => !ziglyph_valid,
};
if (!matches) {
std.log.warn("[ziglyph mismatch] cp={x} t={} ziglyph_valid={}", .{ cp, t.grapheme_boundary_class, ziglyph_valid });
}
}
if (compareOld) {
const oldT = props.oldTable.get(@intCast(cp));
if (oldT.grapheme_boundary_class != t.grapheme_boundary_class) {
std.log.warn("[old mismatch] cp={x} t={} old={}", .{ cp, t.grapheme_boundary_class, oldT.grapheme_boundary_class });
}
}
}
}
var state: GraphemeBreakState = .{};
var old_state: GraphemeBreakState = .{};
var zg_state: Graphemes.State = .{};
var ziglyph_state: u3 = 0;
if (testAll or std.mem.eql(u8, args[1], "break")) {
std.log.info("============== testing grapheme break ===============", .{});
for (min..max) |cp1| {
if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1});
if (cp1 == '\r' or cp1 == '\n' or
Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue;
for (min..max) |cp2| {
if (cp2 == '\r' or cp2 == '\n' or
Graphemes.gbp(zg.graphemes, @intCast(cp1)) == .Control) continue;
const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
if (compareOld) {
const old_gb = grapheme.oldGraphemeBreak(@intCast(cp1), @intCast(cp2), &old_state);
if (gb != old_gb) {
std.log.warn("[old mismatch] cp1={x} cp2={x} gb={} old_gb={} state={} old_state={}", .{
cp1,
cp2,
gb,
old_gb,
state,
old_state,
});
}
}
if (compareZg) {
const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg.graphemes, &zg_state);
if (gb != zg_gb) {
std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{
cp1,
cp2,
gb,
zg_gb,
state,
zg_state,
});
}
}
if (compareZiglyph) {
const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state);
if (gb != ziglyph_gb) {
std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{
cp1,
cp2,
gb,
ziglyph_gb,
state,
ziglyph_state,
});
}
}
}
}
}
}
pub const std_options: std.Options = .{

View File

@ -8,6 +8,9 @@ const lut = @import("lut.zig");
graphemes: Graphemes,
display_width: DisplayWidth,
// Whether to use the old implementation based on ziglyph.
old: bool = false,
// Public only for unicode-test
pub fn init(alloc: std.mem.Allocator) !props {
const graphemes = try Graphemes.init(alloc);
@ -36,6 +39,19 @@ pub const table = table: {
};
};
/// The old lookup tables for Ghostty. Only used for unicode-test.
pub const oldTable = table: {
// This is only available after running main() below as part of the Ghostty
// build.zig, but due to Zig's lazy analysis we can still reference it here.
const generated = @import("old_unicode_tables").Tables(Properties);
const Tables = lut.Tables(Properties);
break :table Tables{
.stage1 = &generated.stage1,
.stage2 = &generated.stage2,
.stage3 = &generated.stage3,
};
};
/// Property set per codepoint that Ghostty cares about.
///
/// Adding to this lets you find new properties but also potentially makes
@ -120,6 +136,36 @@ pub const GraphemeBoundaryClass = enum(u4) {
};
}
pub fn initOld(cp: u21) GraphemeBoundaryClass {
const ziglyph = @import("ziglyph");
// We special-case modifier bases because we should not break
// if a modifier isn't next to a base.
if (ziglyph.emoji.isEmojiModifierBase(cp)) {
assert(ziglyph.emoji.isExtendedPictographic(cp));
return .extended_pictographic_base;
}
if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier;
if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
if (ziglyph.grapheme_break.isL(cp)) return .L;
if (ziglyph.grapheme_break.isV(cp)) return .V;
if (ziglyph.grapheme_break.isT(cp)) return .T;
if (ziglyph.grapheme_break.isLv(cp)) return .LV;
if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
// This is obviously not INVALID invalid, there is SOME grapheme
// boundary class for every codepoint. But we don't care about
// anything that doesn't fit into the above categories.
return .invalid;
}
/// Returns true if this is an extended pictographic type. This
/// should be used instead of comparing the enum value directly
/// because we classify multiple.
@ -145,7 +191,8 @@ pub fn get(ctx: props, cp: u21) !Properties {
return .{
.width = @intCast(@min(2, @max(0, zg_width))),
.grapheme_boundary_class = .init(ctx, cp),
//.grapheme_boundary_class = .init(ctx, cp),
.grapheme_boundary_class = if (ctx.old) .initOld(cp) else .init(ctx, cp),
};
}
}
@ -161,9 +208,16 @@ pub fn main() !void {
defer arena_state.deinit();
const alloc = arena_state.allocator();
const args = try std.process.argsAlloc(alloc);
defer std.process.argsFree(alloc, args);
var self = try init(alloc);
defer self.deinit(alloc);
if (args.len > 1 and std.mem.eql(u8, args[1], "old")) {
self.old = true;
}
const gen: lut.Generator(
Properties,
props,