diff --git a/build.zig b/build.zig index 024e2db61..cda4ff764 100644 --- a/build.zig +++ b/build.zig @@ -40,6 +40,10 @@ pub fn build(b: *std.Build) !void { const bench = try buildpkg.GhosttyBench.init(b, &deps); if (config.emit_bench) bench.install(); + // Ghostty unicode test exe + const unicode_test = try buildpkg.GhosttyUnicodeTest.init(b, &config, &deps); + if (config.emit_unicode_test) unicode_test.install(); + // Ghostty dist tarball const dist = try buildpkg.GhosttyDist.init(b, &config); { diff --git a/build.zig.zon b/build.zig.zon index e98d878f9..7a2965d1b 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -41,6 +41,11 @@ .hash = "ziglyph-0.11.2-AAAAAHPtHwB4Mbzn1KvOV7Wpjo82NYEc_v0WC8oCLrkf", .lazy = true, }, + .zg = .{ + .url = "https://codeberg.org/jacobsandlund/zg/archive/4d09cce1c40e0a704e5c1dfab1f4398f5c87b96b.tar.gz", + .hash = "zg-0.14.0-oGqU3Oi7sgLDn7I8RF43Fqg8hMbfVbCU5tTJvOZvcGV1", + .lazy = true, + }, .zig_wayland = .{ // codeberg ifreund/zig-wayland .url = "https://codeberg.org/ifreund/zig-wayland/archive/f3c5d503e540ada8cbcb056420de240af0c094f7.tar.gz", diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index 43304ec2e..d894f641a 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -27,6 +27,8 @@ hyperfine \ "./zig-out/bin/bench-codepoint-width --mode=noop${ARGS} try benchNoop(reader, buf), .wcwidth => try benchWcwidth(reader, buf), - .ziglyph => try benchZiglyph(reader, buf), + .zg => try benchZg(reader, buf), .simd => try benchSimd(reader, buf), .table => try benchTable(reader, buf), } @@ -155,7 +155,7 @@ noinline fn benchTable( } } -noinline fn benchZiglyph( +noinline fn benchZg( reader: anytype, buf: []u8, ) !void { @@ -170,7 +170,7 @@ noinline fn benchZiglyph( const cp_, const consumed = d.next(c); assert(consumed); if (cp_) |cp| { - const width = ziglyph.display_width.codePointWidth(cp, .half); + const width = DisplayWidth.codePointWidth(cp); // Write the width to the buffer to avoid it being 
compiled away buf[0] = @intCast(width); diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh index 24f475caa..02a787b2e 100755 --- a/src/bench/grapheme-break.sh +++ b/src/bench/grapheme-break.sh @@ -25,8 +25,8 @@ hyperfine \ --warmup 10 \ -n noop \ "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} try benchNoop(reader, buf), - .ziglyph => try benchZiglyph(reader, buf), + .zg => try benchZg(reader, buf), .table => try benchTable(reader, buf), } } @@ -118,12 +118,12 @@ noinline fn benchTable( } } -noinline fn benchZiglyph( +noinline fn benchZg( reader: anytype, buf: []u8, ) !void { var d: UTF8Decoder = .{}; - var state: u3 = 0; + var state: Graphemes.State = .{}; var cp1: u21 = 0; while (true) { const n = try reader.read(buf); @@ -135,7 +135,7 @@ noinline fn benchZiglyph( const cp_, const consumed = d.next(c); assert(consumed); if (cp_) |cp2| { - const v = ziglyph.graphemeBreak(cp1, @intCast(cp2), &state); + const v = Graphemes.graphemeBreak(cp1, @intCast(cp2), &state); buf[0] = @intCast(@intFromBool(v)); cp1 = cp2; } diff --git a/src/build/Config.zig b/src/build/Config.zig index a9a79fb53..6c603d21c 100644 --- a/src/build/Config.zig +++ b/src/build/Config.zig @@ -51,6 +51,7 @@ patch_rpath: ?[]const u8 = null, /// Artifacts flatpak: bool = false, emit_bench: bool = false, +emit_unicode_test: bool = false, emit_docs: bool = false, emit_helpgen: bool = false, emit_macos_app: bool = false, @@ -287,6 +288,12 @@ pub fn init(b: *std.Build) !Config { "Build and install the benchmark executables.", ) orelse false; + config.emit_unicode_test = b.option( + bool, + "emit-unicode-test", + "Build and install the unicode test executable.", + ) orelse false; + config.emit_helpgen = b.option( bool, "emit-helpgen", @@ -300,6 +307,7 @@ pub fn init(b: *std.Build) !Config { ) orelse emit_docs: { // If we are emitting any other artifacts then we default to false. 
if (config.emit_bench or + config.emit_unicode_test or config.emit_test_exe or config.emit_helpgen) break :emit_docs false; @@ -348,6 +356,7 @@ pub fn init(b: *std.Build) !Config { target.result.os.tag == .macos and config.app_runtime == .none and (!config.emit_bench and + !config.emit_unicode_test and !config.emit_test_exe and !config.emit_helpgen); diff --git a/src/build/GhosttyUnicodeTest.zig b/src/build/GhosttyUnicodeTest.zig new file mode 100644 index 000000000..8421e9fae --- /dev/null +++ b/src/build/GhosttyUnicodeTest.zig @@ -0,0 +1,58 @@ +const UnicodeTest = @This(); + +const std = @import("std"); +const Config = @import("Config.zig"); +const SharedDeps = @import("SharedDeps.zig"); +const UnicodeTables = @import("UnicodeTables.zig"); + +/// The unicode test executable. +exe: *std.Build.Step.Compile, + +/// The install step for the executable. +install_step: *std.Build.Step.InstallArtifact, + +pub fn init(b: *std.Build, cfg: *const Config, deps: *const SharedDeps) !UnicodeTest { + const exe: *std.Build.Step.Compile = b.addExecutable(.{ + .name = "unicode-test", + .root_module = b.createModule(.{ + .root_source_file = b.path("src/unicode/main.zig"), + .target = cfg.target, + .optimize = cfg.optimize, + .strip = cfg.strip, + .omit_frame_pointer = cfg.strip, + .unwind_tables = if (cfg.strip) .none else .sync, + }), + }); + const install_step = b.addInstallArtifact(exe, .{}); + + // Add the shared dependencies + _ = try deps.add(exe); + + // Add ziglyph just for unicode-test + if (b.lazyDependency("ziglyph", .{ + .target = cfg.target, + .optimize = cfg.optimize, + })) |dep| { + exe.root_module.addImport("ziglyph", dep.module("ziglyph")); + } + + // Add the old version of the unicode tables + const old_unicode_tables = try UnicodeTables.init(b); + old_unicode_tables.run.addArg("old"); + + old_unicode_tables.output.addStepDependencies(&exe.step); + exe.root_module.addAnonymousImport("old_unicode_tables", .{ + .root_source_file = old_unicode_tables.output, + }); + 
+ return .{ + .exe = exe, + .install_step = install_step, + }; +} + +/// Add the unicode test exe to the install target. +pub fn install(self: *const UnicodeTest) void { + const b = self.install_step.step.owner; + b.getInstallStep().dependOn(&self.install_step.step); +} diff --git a/src/build/SharedDeps.zig b/src/build/SharedDeps.zig index b6e9900e2..f80df6a7d 100644 --- a/src/build/SharedDeps.zig +++ b/src/build/SharedDeps.zig @@ -411,11 +411,18 @@ pub fn add( })) |dep| { step.root_module.addImport("z2d", dep.module("z2d")); } - if (b.lazyDependency("ziglyph", .{ + if (b.lazyDependency("zg", .{ .target = target, .optimize = optimize, })) |dep| { - step.root_module.addImport("ziglyph", dep.module("ziglyph")); + if (self.config.emit_bench or self.config.emit_unicode_test) { + step.root_module.addImport("Graphemes", dep.module("Graphemes")); + step.root_module.addImport("DisplayWidth", dep.module("DisplayWidth")); + } + step.root_module.addImport("CaseFolding", dep.module("CaseFolding")); + step.root_module.addImport("Emoji", dep.module("Emoji")); + step.root_module.addImport("GeneralCategories", dep.module("GeneralCategories")); + step.root_module.addImport("LetterCasing", dep.module("LetterCasing")); } if (b.lazyDependency("zf", .{ .target = target, diff --git a/src/build/UnicodeTables.zig b/src/build/UnicodeTables.zig index 5bba2341b..ef2fc8cd9 100644 --- a/src/build/UnicodeTables.zig +++ b/src/build/UnicodeTables.zig @@ -6,6 +6,9 @@ const Config = @import("Config.zig"); /// The exe. exe: *std.Build.Step.Compile, +/// The run artifact for the exe. 
+run: *std.Build.Step.Run, + /// The output path for the unicode tables output: std.Build.LazyPath, @@ -21,18 +24,25 @@ pub fn init(b: *std.Build) !UnicodeTables { }), }); + if (b.lazyDependency("zg", .{ + .target = b.graph.host, + })) |dep| { + exe.root_module.addImport("Graphemes", dep.module("Graphemes")); + exe.root_module.addImport("DisplayWidth", dep.module("DisplayWidth")); + } + + // Only used if we're building the old unicode tables if (b.lazyDependency("ziglyph", .{ .target = b.graph.host, - })) |ziglyph_dep| { - exe.root_module.addImport( - "ziglyph", - ziglyph_dep.module("ziglyph"), - ); + })) |dep| { + exe.root_module.addImport("ziglyph", dep.module("ziglyph")); } const run = b.addRunArtifact(exe); + return .{ .exe = exe, + .run = run, .output = run.captureStdOut(), }; } diff --git a/src/build/main.zig b/src/build/main.zig index f25ce1c23..db61df8b2 100644 --- a/src/build/main.zig +++ b/src/build/main.zig @@ -15,6 +15,7 @@ pub const GhosttyFrameData = @import("GhosttyFrameData.zig"); pub const GhosttyLib = @import("GhosttyLib.zig"); pub const GhosttyResources = @import("GhosttyResources.zig"); pub const GhosttyI18n = @import("GhosttyI18n.zig"); +pub const GhosttyUnicodeTest = @import("GhosttyUnicodeTest.zig"); pub const GhosttyXcodebuild = @import("GhosttyXcodebuild.zig"); pub const GhosttyXCFramework = @import("GhosttyXCFramework.zig"); pub const GhosttyWebdata = @import("GhosttyWebdata.zig"); diff --git a/src/crash/sentry.zig b/src/crash/sentry.zig index 820c3e9a1..1b3fd68d8 100644 --- a/src/crash/sentry.zig +++ b/src/crash/sentry.zig @@ -58,7 +58,7 @@ pub fn init(gpa: Allocator) !void { // defer { // const end = std.time.Instant.now() catch unreachable; // // "[updateFrame critical time] \t" - // std.log.err("[sentry init time] start={}us duration={}ns", .{ start_micro, end.since(start) / std.time.ns_per_us }); + // std.log.err("[sentry init time] start={}us duration={}us", .{ start_micro, end.since(start) / std.time.ns_per_us }); // } // Must only 
start once diff --git a/src/font/CodepointResolver.zig b/src/font/CodepointResolver.zig index 16536300c..0f3808992 100644 --- a/src/font/CodepointResolver.zig +++ b/src/font/CodepointResolver.zig @@ -13,7 +13,7 @@ const CodepointResolver = @This(); const std = @import("std"); const Allocator = std.mem.Allocator; -const ziglyph = @import("ziglyph"); +const Emoji = @import("Emoji"); const font = @import("main.zig"); const Atlas = font.Atlas; const CodepointMap = font.CodepointMap; @@ -150,7 +150,7 @@ pub fn getIndex( // we'll do this multiple times if we recurse, but this is a cached function // call higher up (GroupCache) so this should be rare. const p_mode: Collection.PresentationMode = if (p) |v| .{ .explicit = v } else .{ - .default = if (ziglyph.emoji.isEmojiPresentation(@intCast(cp))) + .default = if (Emoji.isEmojiPresentation(@intCast(cp))) .emoji else .text, diff --git a/src/font/shaper/web_canvas.zig b/src/font/shaper/web_canvas.zig index 4ed4b7db6..e0f0e1a00 100644 --- a/src/font/shaper/web_canvas.zig +++ b/src/font/shaper/web_canvas.zig @@ -1,9 +1,9 @@ const std = @import("std"); const assert = std.debug.assert; const Allocator = std.mem.Allocator; -const ziglyph = @import("ziglyph"); const font = @import("../main.zig"); const terminal = @import("../../terminal/main.zig"); +const unicode = @import("../../unicode/main.zig"); const log = std.log.scoped(.font_shaper); @@ -111,7 +111,7 @@ pub const Shaper = struct { // font ligatures. However, we do support grapheme clustering. // This means we can render things like skin tone emoji but // we can't render things like single glyph "=>". 
- var break_state: u3 = 0; + var break_state: unicode.GraphemeBreakState = .{}; var cp1: u21 = @intCast(codepoints[0]); var start: usize = 0; @@ -126,7 +126,7 @@ pub const Shaper = struct { const cp2: u21 = @intCast(codepoints[i]); defer cp1 = cp2; - break :blk ziglyph.graphemeBreak( + break :blk unicode.graphemeBreak( cp1, cp2, &break_state, diff --git a/src/global.zig b/src/global.zig index 668d2faec..ff84f3e9e 100644 --- a/src/global.zig +++ b/src/global.zig @@ -51,7 +51,7 @@ pub const GlobalState = struct { // defer { // const end = std.time.Instant.now() catch unreachable; // // "[updateFrame critical time] \t" - // std.log.err("[global init time] start={}us duration={}ns", .{ start_micro, end.since(start) / std.time.ns_per_us }); + // std.log.err("[global init time] start={}us duration={}us", .{ start_micro, end.since(start) / std.time.ns_per_us }); // } // Initialize ourself to nothing so we don't have any extra state. diff --git a/src/input/Binding.zig b/src/input/Binding.zig index f76da360a..cab99d3d8 100644 --- a/src/input/Binding.zig +++ b/src/input/Binding.zig @@ -5,7 +5,8 @@ const Binding = @This(); const std = @import("std"); const Allocator = std.mem.Allocator; const assert = std.debug.assert; -const ziglyph = @import("ziglyph"); +const LetterCasing = @import("LetterCasing"); +const CaseFolding = @import("CaseFolding"); const key = @import("key.zig"); const KeyEvent = key.KeyEvent; @@ -1563,15 +1564,17 @@ pub const Trigger = struct { /// in more codepoints so we need to use a 3 element array. fn foldedCodepoint(cp: u21) [3]u21 { // ASCII fast path - if (ziglyph.letter.isAsciiLetter(cp)) { - return .{ ziglyph.letter.toLower(cp), 0, 0 }; + if (('A' <= cp and cp <= 'Z') or ('a' <= cp and cp <= 'z')) { + return .{ LetterCasing.toLower(cp), 0, 0 }; } - // Unicode slow path. Case folding can resultin more codepoints. + // Unicode slow path. Case folding can result in more codepoints. 
// If more codepoints are produced then we return the codepoint // as-is which isn't correct but until we have a failing test // then I don't want to handle this. - return ziglyph.letter.toCaseFold(cp); + var buf: [3]u21 = .{ 0, 0, 0 }; + _ = CaseFolding.caseFold(cp, &buf); + return buf; } /// Convert the trigger to a C API compatible trigger. diff --git a/src/renderer/cell.zig b/src/renderer/cell.zig index 43d744176..671890fbf 100644 --- a/src/renderer/cell.zig +++ b/src/renderer/cell.zig @@ -1,12 +1,12 @@ const std = @import("std"); const Allocator = std.mem.Allocator; const assert = std.debug.assert; -const ziglyph = @import("ziglyph"); const font = @import("../font/main.zig"); const terminal = @import("../terminal/main.zig"); const renderer = @import("../renderer.zig"); const shaderpkg = renderer.Renderer.API.shaders; const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection; +const GeneralCategories = @import("GeneralCategories"); /// The possible cell content keys that exist. pub const Key = enum { @@ -224,9 +224,9 @@ pub fn constraintWidth(cell_pin: terminal.Pin) u2 { const cell = cell_pin.rowAndCell().cell; const cp = cell.codepoint(); - if (!ziglyph.general_category.isPrivateUse(cp) and - !ziglyph.blocks.isDingbats(cp)) - { + // If it's not Private Use (Co) or Dingbats (0x2700-0x27bf), use grid + // width. + if (GeneralCategories.gc(cp) != .Co and !(cp >= 0x2700 and cp <= 0x27bf)) { return cell.gridWidth(); } @@ -248,7 +248,8 @@ pub fn constraintWidth(cell_pin: terminal.Pin) u2 { // We consider powerline glyphs whitespace. if (isPowerline(prev_cp)) break :prev; - if (ziglyph.general_category.isPrivateUse(prev_cp)) { + // If it's Private Use (Co) use 1 as the width. 
+ if (GeneralCategories.gc(prev_cp) == .Co) { return 1; } } diff --git a/src/simd/codepoint_width.zig b/src/simd/codepoint_width.zig index aab4bdd95..bdb7c295c 100644 --- a/src/simd/codepoint_width.zig +++ b/src/simd/codepoint_width.zig @@ -4,7 +4,7 @@ const std = @import("std"); extern "c" fn ghostty_simd_codepoint_width(u32) i8; pub fn codepointWidth(cp: u32) i8 { - //return @import("ziglyph").display_width.codePointWidth(@intCast(cp), .half); + //return @import("DisplayWidth").codePointWidth(@intCast(cp)); return ghostty_simd_codepoint_width(cp); } @@ -19,27 +19,37 @@ test "codepointWidth basic" { try testing.expectEqual(@as(i8, 2), codepointWidth(0xF900)); // 豈 try testing.expectEqual(@as(i8, 2), codepointWidth(0x20000)); // 𠀀 try testing.expectEqual(@as(i8, 2), codepointWidth(0x30000)); // 𠀀 - // try testing.expectEqual(@as(i8, 1), @import("ziglyph").display_width.codePointWidth(0x100, .half)); + // try testing.expectEqual(@as(i8, 1), @import("DisplayWidth").codePointWidth(0x100)); } // This is not very fast in debug modes, so its commented by default. // IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES. 
-// test "codepointWidth matches ziglyph" { -// const testing = std.testing; -// const ziglyph = @import("ziglyph"); +//test "codepointWidth matches zg" { +// const testing = std.testing; +// const DisplayWidth = @import("DisplayWidth"); +// var success: bool = true; // -// const min = 0xFF + 1; // start outside ascii -// for (min..std.math.maxInt(u21)) |cp| { -// const simd = codepointWidth(@intCast(cp)); -// const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half); -// if (simd != zg) mismatch: { -// if (cp == 0x2E3B) { -// try testing.expectEqual(@as(i8, 2), simd); -// break :mismatch; -// } +// const min = 0xFF + 1; // start outside ascii +// for (min..0x110000) |cp| { +// const simd = codepointWidth(@intCast(cp)); +// const zg_width = DisplayWidth.codePointWidth(@intCast(cp)); +// if (simd != zg_width) mismatch: { +// if (cp == 0x2E3B) { +// try testing.expectEqual(@as(i8, 2), simd); +// std.log.warn("mismatch for 0x2e3b cp=U+{x} simd={} zg={}", .{ cp, simd, zg_width }); +// break :mismatch; +// } // -// std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg }); -// try testing.expect(false); -// } -// } -// } +// if (cp == 0x890) { +// try testing.expectEqual(@as(i8, 0), simd); +// try testing.expectEqual(@as(i8, 1), zg_width); +// break :mismatch; +// } +// +// std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg_width }); +// success = false; +// } +// } +// +// try testing.expect(success); +//} diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index dd7207f6d..11ac1346e 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void { // VS15 makes it narrow. 
if (c == 0xFE0F or c == 0xFE0E) { // This only applies to emoji - const prev_props = unicode.getProperties(prev.cell.content.codepoint); + const prev_props = unicode.table.get(prev.cell.content.codepoint); const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic(); if (!emoji) return; @@ -416,7 +416,7 @@ pub fn print(self: *Terminal, c: u21) !void { const width: usize = if (c <= 0xFF) 1 else @intCast(unicode.table.get(c).width); // Note: it is possible to have a width of "3" and a width of "-1" - // from ziglyph. We should look into those cases and handle them + // from zg. We should look into those cases and handle them // appropriately. assert(width <= 2); // log.debug("c={x} width={}", .{ c, width }); @@ -452,7 +452,7 @@ pub fn print(self: *Terminal, c: u21) !void { // If this is a emoji variation selector, prev must be an emoji if (c == 0xFE0F or c == 0xFE0E) { - const prev_props = unicode.getProperties(prev.content.codepoint); + const prev_props = unicode.table.get(prev.content.codepoint); const emoji = prev_props.grapheme_boundary_class == .extended_pictographic; if (!emoji) return; } diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 7847ef6f5..e88c9820e 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -2,6 +2,7 @@ const std = @import("std"); const props = @import("props.zig"); const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; +const oldTable = props.oldTable; /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. @@ -22,6 +23,19 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { return value.result; } +/// Only used for unicode-test. 
+pub fn oldGraphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { + const value = Precompute.data[ + (Precompute.Key{ + .gbc1 = oldTable.get(cp1).grapheme_boundary_class, + .gbc2 = oldTable.get(cp2).grapheme_boundary_class, + .state = state.*, + }).index() + ]; + state.* = value.state; + return value.result; +} + /// The state that must be maintained between calls to `graphemeBreak`. pub const BreakState = packed struct(u2) { extended_pictographic: bool = false, @@ -149,48 +163,6 @@ fn graphemeBreakClass( return true; } -/// If you build this file as a binary, we will verify the grapheme break -/// implementation. This iterates over billions of codepoints so it is -/// SLOW. It's not meant to be run in CI, but it's useful for debugging. -pub fn main() !void { - const ziglyph = @import("ziglyph"); - - // Set the min and max to control the test range. - const min = 0; - const max = std.math.maxInt(u21) + 1; - - var state: BreakState = .{}; - var zg_state: u3 = 0; - for (min..max) |cp1| { - if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); - - if (cp1 == '\r' or cp1 == '\n' or - ziglyph.grapheme_break.isControl(@intCast(cp1))) continue; - - for (min..max) |cp2| { - if (cp2 == '\r' or cp2 == '\n' or - ziglyph.grapheme_break.isControl(@intCast(cp2))) continue; - - const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); - const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); - if (gb != zg_gb) { - std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{ - cp1, - cp2, - gb, - state, - zg_gb, - zg_state, - }); - } - } - } -} - -pub const std_options = struct { - pub const log_level: std.log.Level = .info; -}; - test "grapheme break: emoji modifier" { const testing = std.testing; diff --git a/src/unicode/lut.zig b/src/unicode/lut.zig index 95c6a3688..9e6c2be16 100644 --- a/src/unicode/lut.zig +++ b/src/unicode/lut.zig @@ -11,7 +11,7 @@ const Allocator = std.mem.Allocator; /// can in theory be generated at 
runtime. /// /// Context must have two functions: -/// - `get(Context, u21) Elem`: returns the mapping for a given codepoint +/// - `get(Context, u21) !Elem`: returns the mapping for a given codepoint /// - `eql(Context, Elem, Elem) bool`: returns true if two mappings are equal /// pub fn Generator( @@ -177,3 +177,7 @@ pub fn Tables(comptime Elem: type) type { } }; } + +test { + std.testing.refAllDecls(@This()); +} diff --git a/src/unicode/main.zig b/src/unicode/main.zig index f5b911948..85c369168 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,13 +1,231 @@ +const std = @import("std"); pub const lut = @import("lut.zig"); const grapheme = @import("grapheme.zig"); const props = @import("props.zig"); pub const table = props.table; pub const Properties = props.Properties; -pub const getProperties = props.get; pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; -test { - @import("std").testing.refAllDecls(@This()); +/// Build Ghostty with `zig build -Doptimize=ReleaseFast -Demit-unicode-test`. +/// +/// Usage: ./zig-out/bin/unicode-test [width|class|break|all] [old|zg|ziglyph|all] +/// +/// width: this verifies the table codepoint widths match +/// class: this verifies the table grapheme boundary classes match +/// break: this will verify the grapheme break implementation. This +/// iterates over billions of codepoints so it is SLOW. 
+/// +/// old: compare against old implementation +/// zg: compare against zg +/// ziglyph: compare against ziglyph +/// +/// Note: To disable/enable `old` comparisons, (un)comment sections of these +/// files (search for "old"): +/// * ./main.zig (this file) +/// * ./props.zig +/// * ./grapheme.zig +/// * src/build/GhosttyUnicodeTest.zig +/// * src/build/UnicodeTables.zig +pub fn main() !void { + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + + const alloc = gpa.allocator(); + + const args = try std.process.argsAlloc(alloc); + defer std.process.argsFree(alloc, args); + + const ziglyph = @import("ziglyph"); + const Graphemes = @import("Graphemes"); + const DisplayWidth = @import("DisplayWidth"); + + const testAll = args.len < 2 or std.mem.eql(u8, args[1], "all"); + const compareAll = args.len < 3 or std.mem.eql(u8, args[2], "all"); + const compareOld = compareAll or std.mem.eql(u8, args[2], "old"); + const compareZg = compareAll or std.mem.eql(u8, args[2], "zg"); + const compareZiglyph = compareAll or std.mem.eql(u8, args[2], "ziglyph"); + + // Set the min and max to control the test range. 
+ const min = 0; + const max = 0x110000; + + if (testAll or std.mem.eql(u8, args[1], "width")) { + std.log.info("============== testing codepoint width ==============", .{}); + + for (min..max) |cp| { + if (cp % 0x10000 == 0) std.log.info("progress: cp={x}", .{cp}); + + const t = table.get(@intCast(cp)); + + if (compareOld) { + const oldT = props.oldTable.get(@intCast(cp)); + if (oldT.width != t.width) { + std.log.warn("[old mismatch] cp={x} t={} old={}", .{ cp, t.width, oldT.width }); + } + } + + if (compareZg) { + const zg_width = @min(2, @max(0, DisplayWidth.codePointWidth(@intCast(cp)))); + if (t.width != zg_width) { + std.log.warn("[zg mismatch] cp={x} t={} zg={}", .{ cp, t.width, zg_width }); + } + } + + if (compareZiglyph) { + const ziglyph_width = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half))); + if (t.width != ziglyph_width) { + std.log.warn("[ziglyph mismatch] cp={x} t={} ziglyph={}", .{ cp, t.width, ziglyph_width }); + } + } + } + } + + if (testAll or std.mem.eql(u8, args[1], "class")) { + std.log.info("============== testing grapheme boundary class ======", .{}); + + for (min..max) |cp| { + if (cp % 0x10000 == 0) std.log.info("progress: cp={x}", .{cp}); + + const t = table.get(@intCast(cp)); + + if (compareOld) { + const oldT = props.oldTable.get(@intCast(cp)); + if (oldT.grapheme_boundary_class != t.grapheme_boundary_class) { + std.log.warn("[old mismatch] cp={x} t={} old={}", .{ cp, t.grapheme_boundary_class, oldT.grapheme_boundary_class }); + } + } + + if (compareZg) { + const gbp = Graphemes.gbp(@intCast(cp)); + const matches = switch (t.grapheme_boundary_class) { + .extended_pictographic_base => gbp == .Emoji_Modifier_Base, + .emoji_modifier => gbp == .Emoji_Modifier, + .extended_pictographic => gbp == .Extended_Pictographic, + .L => gbp == .L, + .V => gbp == .V, + .T => gbp == .T, + .LV => gbp == .LV, + .LVT => gbp == .LVT, + .prepend => gbp == .Prepend, + .extend => gbp == .Extend, + .zwj => gbp == .ZWJ, + .spacing_mark => gbp == 
.SpacingMark, + .regional_indicator => gbp == .Regional_Indicator, + .invalid => gbp == .none or gbp == .Control or gbp == .CR or gbp == .LF, + }; + + if (!matches) { + std.log.warn("[zg mismatch] cp={x} t={} zg={}", .{ cp, t.grapheme_boundary_class, gbp }); + } + } + + if (compareZiglyph) { + const ziglyph_valid = (ziglyph.emoji.isEmojiModifierBase(@intCast(cp)) or + ziglyph.emoji.isEmojiModifier(@intCast(cp)) or + ziglyph.emoji.isExtendedPictographic(@intCast(cp)) or + ziglyph.grapheme_break.isL(@intCast(cp)) or + ziglyph.grapheme_break.isV(@intCast(cp)) or + ziglyph.grapheme_break.isT(@intCast(cp)) or + ziglyph.grapheme_break.isLv(@intCast(cp)) or + ziglyph.grapheme_break.isLvt(@intCast(cp)) or + ziglyph.grapheme_break.isPrepend(@intCast(cp)) or + ziglyph.grapheme_break.isExtend(@intCast(cp)) or + ziglyph.grapheme_break.isZwj(@intCast(cp)) or + ziglyph.grapheme_break.isSpacingmark(@intCast(cp)) or + ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp))); + + const matches = switch (t.grapheme_boundary_class) { + .extended_pictographic_base => ziglyph.emoji.isEmojiModifierBase(@intCast(cp)), + .emoji_modifier => ziglyph.emoji.isEmojiModifier(@intCast(cp)), + .extended_pictographic => ziglyph.emoji.isExtendedPictographic(@intCast(cp)), + .L => ziglyph.grapheme_break.isL(@intCast(cp)), + .V => ziglyph.grapheme_break.isV(@intCast(cp)), + .T => ziglyph.grapheme_break.isT(@intCast(cp)), + .LV => ziglyph.grapheme_break.isLv(@intCast(cp)), + .LVT => ziglyph.grapheme_break.isLvt(@intCast(cp)), + .prepend => ziglyph.grapheme_break.isPrepend(@intCast(cp)), + .extend => ziglyph.grapheme_break.isExtend(@intCast(cp)), + .zwj => ziglyph.grapheme_break.isZwj(@intCast(cp)), + .spacing_mark => ziglyph.grapheme_break.isSpacingmark(@intCast(cp)), + .regional_indicator => ziglyph.grapheme_break.isRegionalIndicator(@intCast(cp)), + .invalid => !ziglyph_valid, + }; + + if (!matches) { + std.log.warn("[ziglyph mismatch] cp={x} t={} ziglyph_valid={}", .{ cp, 
t.grapheme_boundary_class, ziglyph_valid }); + } + } + } + } + + var state: GraphemeBreakState = .{}; + var old_state: GraphemeBreakState = .{}; + var zg_state: Graphemes.State = .{}; + var ziglyph_state: u3 = 0; + + if (testAll or std.mem.eql(u8, args[1], "break")) { + std.log.info("============== testing grapheme break ===============", .{}); + + for (min..max) |cp1| { + if (cp1 % 0x100 == 0) std.log.info("progress: cp1={x}", .{cp1}); + + if (cp1 == '\r' or cp1 == '\n' or + Graphemes.gbp(@intCast(cp1)) == .Control) continue; + + for (min..max) |cp2| { + if (cp2 == '\r' or cp2 == '\n' or + Graphemes.gbp(@intCast(cp2)) == .Control) continue; + + const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state); + + if (compareOld) { + const old_gb = grapheme.oldGraphemeBreak(@intCast(cp1), @intCast(cp2), &old_state); + if (gb != old_gb) { + std.log.warn("[old mismatch] cp1={x} cp2={x} gb={} old_gb={} state={} old_state={}", .{ + cp1, + cp2, + gb, + old_gb, + state, + old_state, + }); + } + } + + if (compareZg) { + const zg_gb = Graphemes.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state); + if (gb != zg_gb) { + std.log.warn("[zg mismatch] cp1={x} cp2={x} gb={} zg_gb={} state={} zg_state={}", .{ + cp1, + cp2, + gb, + zg_gb, + state, + zg_state, + }); + } + } + + if (compareZiglyph) { + const ziglyph_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &ziglyph_state); + if (gb != ziglyph_gb) { + std.log.warn("[ziglyph mismatch] cp1={x} cp2={x} gb={} ziglyph_gb={} state={} ziglyph_state={}", .{ + cp1, + cp2, + gb, + ziglyph_gb, + state, + ziglyph_state, + }); + } + } + } + } + } } + +pub const std_options: std.Options = .{ + .log_level = .debug, +}; diff --git a/src/unicode/props.zig b/src/unicode/props.zig index 99c57aa0a..a14b0c448 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -1,9 +1,38 @@ -const props = @This(); const std = @import("std"); const assert = std.debug.assert; -const ziglyph = @import("ziglyph"); const lut = 
@import("lut.zig"); +/// The context needed for lut generation. +pub const Context = struct { + // Whether to use the old implementation based on ziglyph. + old: bool = false, + + const Graphemes = @import("Graphemes"); + const DisplayWidth = @import("DisplayWidth"); + + pub fn get(self: Context, cp: u21) !Properties { + if (cp > 0x10FFFF) { + return .{ + .width = 0, + .grapheme_boundary_class = .invalid, + }; + } else { + const raw_width = if (self.old) @import("ziglyph").display_width.codePointWidth(cp, .half) else DisplayWidth.codePointWidth(cp); + + return .{ + .width = @intCast(@min(2, @max(0, raw_width))), + //.grapheme_boundary_class = .init(self, cp), + .grapheme_boundary_class = if (self.old) .initOld(cp) else .init(cp), + }; + } + } + + pub fn eql(self: Context, a: Properties, b: Properties) bool { + _ = self; + return a.eql(b); + } +}; + /// The lookup tables for Ghostty. pub const table = table: { // This is only available after running main() below as part of the Ghostty @@ -17,6 +46,19 @@ pub const table = table: { }; }; +/// The old lookup tables for Ghostty. Only used for unicode-test. +pub const oldTable = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. + const generated = @import("old_unicode_tables").Tables(Properties); + const Tables = lut.Tables(Properties); + break :table Tables{ + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + .stage3 = &generated.stage3, + }; +}; + /// Property set per codepoint that Ghostty cares about. /// /// Adding to this lets you find new properties but also potentially makes @@ -77,9 +119,35 @@ pub const GraphemeBoundaryClass = enum(u4) { extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base} emoji_modifier, // \p{Emoji_Modifier} + const Graphemes = @import("Graphemes"); + /// Gets the grapheme boundary class for a codepoint. This is VERY /// SLOW. The use case for this is only in generating lookup tables. 
pub fn init(cp: u21) GraphemeBoundaryClass { + return switch (Graphemes.gbp(cp)) { + .Emoji_Modifier_Base => .extended_pictographic_base, + .Emoji_Modifier => .emoji_modifier, + .Extended_Pictographic => .extended_pictographic, + .L => .L, + .V => .V, + .T => .T, + .LV => .LV, + .LVT => .LVT, + .Prepend => .prepend, + .Extend => .extend, + .ZWJ => .zwj, + .SpacingMark => .spacing_mark, + .Regional_Indicator => .regional_indicator, + // This is obviously not INVALID invalid, there is SOME grapheme + // boundary class for every codepoint. But we don't care about + // anything that doesn't fit into the above categories. + .none, .Control, .CR, .LF => .invalid, + }; + } + + pub fn initOld(cp: u21) GraphemeBoundaryClass { + const ziglyph = @import("ziglyph"); + // We special-case modifier bases because we should not break // if a modifier isn't next to a base. if (ziglyph.emoji.isEmojiModifierBase(cp)) { @@ -103,6 +171,7 @@ pub const GraphemeBoundaryClass = enum(u4) { // This is obviously not INVALID invalid, there is SOME grapheme // boundary class for every codepoint. But we don't care about // anything that doesn't fit into the above categories. + return .invalid; } @@ -120,35 +189,25 @@ pub const GraphemeBoundaryClass = enum(u4) { } }; -pub fn get(cp: u21) Properties { - const zg_width = ziglyph.display_width.codePointWidth(cp, .half); - - return .{ - .width = @intCast(@min(2, @max(0, zg_width))), - .grapheme_boundary_class = .init(cp), - }; -} - /// Runnable binary to generate the lookup tables and output to stdout. 
pub fn main() !void { var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); defer arena_state.deinit(); const alloc = arena_state.allocator(); + const args = try std.process.argsAlloc(alloc); + defer std.process.argsFree(alloc, args); + + var ctx = Context{}; + + if (args.len > 1 and std.mem.eql(u8, args[1], "old")) { + ctx.old = true; + } + const gen: lut.Generator( Properties, - struct { - pub fn get(ctx: @This(), cp: u21) !Properties { - _ = ctx; - return props.get(cp); - } - - pub fn eql(ctx: @This(), a: Properties, b: Properties) bool { - _ = ctx; - return a.eql(b); - } - }, - ) = .{}; + Context, + ) = .{ .ctx = ctx }; const t = try gen.generate(alloc); defer alloc.free(t.stage1); @@ -164,18 +223,7 @@ pub fn main() !void { // }); } -// This is not very fast in debug modes, so its commented by default. -// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES. -// test "tables match ziglyph" { -// const testing = std.testing; -// -// const min = 0xFF + 1; // start outside ascii -// for (min..std.math.maxInt(u21)) |cp| { -// const t = table.get(@intCast(cp)); -// const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half))); -// if (t.width != zg) { -// std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg }); -// try testing.expect(false); -// } -// } -// } +test { + _ = table; + _ = Properties; +}