Merge pull request #1486 from mitchellh/unilut

Use precomputed lookup tables for even faster codepoint width computations
2025-07-15 00:06:09 +03:00 · 2024-02-08 21:51:33 -08:00
parent 60a553cdeb a471756ee0
commit fc459ad827
12 changed files with 441 additions and 10 deletions
--- a/build.zig
+++ b/build.zig
@ -1163,6 +1163,7 @@ fn addDeps(
    }
    try addHelp(b, step, config);
    try addUnicodeTables(b, step);
    return static_libs;
 }
@ -1209,6 +1210,43 @@ fn addHelp(
    }
 }
 /// Wire the generated unicode lookup tables into the build.
 ///
 /// The generator executable (built from src/unicode/props.zig) is set up
 /// at most once; its captured stdout is then exposed to `step_`, when one
 /// is given, as the anonymous import "unicode_tables". When `step_` is
 /// null the generator executable itself is installed instead.
 fn addUnicodeTables(
    b: *std.Build,
    step_: ?*std.Build.Step.Compile,
 ) !void {
    // Memoized across calls so the generator is only configured once.
    const State = struct {
        var generated: ?std.Build.LazyPath = null;
    };
    const tables = State.generated orelse generate: {
        const unigen = b.addExecutable(.{
            .name = "unigen",
            .root_source_file = .{ .path = "src/unicode/props.zig" },
            .target = b.host,
        });
        unigen.linkLibC();
        if (step_ == null) b.installArtifact(unigen);
        // The generator derives its data from ziglyph, built for the host.
        const ziglyph_dep = b.dependency("ziglyph", .{
            .target = b.host,
        });
        unigen.root_module.addImport("ziglyph", ziglyph_dep.module("ziglyph"));
        // The generator prints the tables to stdout; capture that as a file.
        const run = b.addRunArtifact(unigen);
        const captured = run.captureStdOut();
        State.generated = captured;
        break :generate captured;
    };
    // Nothing to attach the tables to.
    const step = step_ orelse return;
    tables.addStepDependencies(&step.step);
    step.root_module.addAnonymousImport("unicode_tables", .{
        .root_source_file = tables,
    });
 }
 /// Generate documentation (manpages, etc.) from help strings
 fn buildDocumentation(
    b: *std.Build,
--- a/pkg/utf8proc/build.zig
+++ b/pkg/utf8proc/build.zig
@ -0,0 +1,37 @@
 const std = @import("std");
 /// Build script for the utf8proc package.
 ///
 /// Exposes a "utf8proc" Zig module rooted at main.zig and builds the
 /// upstream C sources into a static library, installing the upstream
 /// `.h` headers alongside it.
 pub fn build(b: *std.Build) !void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
    const module = b.addModule("utf8proc", .{ .root_source_file = .{ .path = "main.zig" } });
    const upstream = b.dependency("utf8proc", .{});
    const lib = b.addStaticLibrary(.{
        .name = "utf8proc",
        .target = target,
        .optimize = optimize,
    });
    lib.linkLibC();
    // Both the C library and the Zig module need to find utf8proc.h.
    lib.addIncludePath(upstream.path(""));
    module.addIncludePath(upstream.path(""));
    // Register the cleanup before the first fallible operation so the list
    // is released even if an append fails.
    var flags = std.ArrayList([]const u8).init(b.allocator);
    defer flags.deinit();
    try flags.append("-DUTF8PROC_EXPORTS");
    lib.addCSourceFiles(.{
        .dependency = upstream,
        .files = &.{"utf8proc.c"},
        .flags = flags.items,
    });
    lib.installHeadersDirectoryOptions(.{
        .source_dir = upstream.path(""),
        .install_dir = .header,
        .install_subdir = "",
        .include_extensions = &.{".h"},
    });
    b.installArtifact(lib);
 }
--- a/pkg/utf8proc/build.zig.zon
+++ b/pkg/utf8proc/build.zig.zon
@ -0,0 +1,11 @@
 .{
    // Package name and version; the version matches the upstream tag
    // referenced in the dependency URL below.
    .name = "utf8proc",
    .version = "2.8.0",
    // NOTE(review): the empty path presumably includes the whole package
    // directory — confirm against the build.zig.zon documentation.
    .paths = .{""},
    .dependencies = .{
        // Upstream utf8proc source tarball for tag v2.8.0, with its hash.
        .utf8proc = .{
            .url = "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.8.0.tar.gz",
            .hash = "1220056ce228a8c58f1fa66ab778f5c8965e62f720c1d30603c7d534cb7d8a605ad7",
        },
    },
 }
--- a/pkg/utf8proc/c.zig
+++ b/pkg/utf8proc/c.zig
@ -0,0 +1,3 @@
 // Import the C declarations from utf8proc.h and re-export them publicly
 // from this file's namespace.
 pub usingnamespace @cImport({
    @cInclude("utf8proc.h");
 });
--- a/pkg/utf8proc/main.zig
+++ b/pkg/utf8proc/main.zig
@ -0,0 +1,20 @@
 pub const c = @import("c.zig");
 /// Returns the display width of `codepoint`, analogous to
 /// `wcwidth(codepoint)`, except that non-printable codepoints yield a
 /// width of 0 rather than the -1 that `wcwidth` reports.
 pub fn charwidth(codepoint: u21) u8 {
    const raw_width = c.utf8proc_charwidth(@intCast(codepoint));
    return @intCast(raw_width);
 }
 /// Reports whether a grapheme break is permitted between the consecutive
 /// codepoints `cp1` and `cp2`, following the extended grapheme cluster
 /// rules of UAX#29. `state` is forwarded to utf8proc as-is.
 pub fn graphemeBreakStateful(cp1: u21, cp2: u21, state: *i32) bool {
    const before: i32 = @intCast(cp1);
    const after: i32 = @intCast(cp2);
    return c.utf8proc_grapheme_break_stateful(before, after, state);
 }
 // Empty test block; it contains no assertions.
 test {}
--- a/src/bench/codepoint-width.sh
+++ b/src/bench/codepoint-width.sh
@ -27,8 +27,8 @@ hyperfine \
  "./zig-out/bin/bench-codepoint-width --mode=noop${ARGS} </tmp/ghostty_bench_data" \
  -n wcwidth \
  "./zig-out/bin/bench-codepoint-width --mode=wcwidth${ARGS} </tmp/ghostty_bench_data" \
-  -n ziglyph \
+  -n table \
-  "./zig-out/bin/bench-codepoint-width --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
+  "./zig-out/bin/bench-codepoint-width --mode=table${ARGS} </tmp/ghostty_bench_data" \
  -n simd \
  "./zig-out/bin/bench-codepoint-width --mode=simd${ARGS} </tmp/ghostty_bench_data"
--- a/src/bench/codepoint-width.zig
+++ b/src/bench/codepoint-width.zig
@ -17,6 +17,7 @@ const ArenaAllocator = std.heap.ArenaAllocator;
 const ziglyph = @import("ziglyph");
 const cli = @import("../cli.zig");
 const simd = @import("../simd/main.zig");
 const table = @import("../unicode/main.zig").table;
 const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
 const Args = struct {
@ -50,6 +51,9 @@ const Mode = enum {
    /// Our SIMD implementation.
    simd,
    /// Test our lookup table implementation.
    table,
 };
 pub const std_options = struct {
@ -78,6 +82,7 @@ pub fn main() !void {
        .wcwidth => try benchWcwidth(reader, buf),
        .ziglyph => try benchZiglyph(reader, buf),
        .simd => try benchSimd(reader, buf),
        .table => try benchTable(reader, buf),
    }
 }
@ -124,6 +129,32 @@ noinline fn benchWcwidth(
    }
 }
 /// Benchmark codepoint width lookups through the precomputed table.
 ///
 /// Reads from `reader` into `buf` until a read returns zero bytes,
 /// decodes the bytes as UTF-8 one at a time, and looks up the width of
 /// every completed codepoint.
 noinline fn benchTable(
    reader: anytype,
    buf: []u8,
 ) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) break;
        // Feed the decoder byte by byte: a naive scalar approach.
        for (buf[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            assert(consumed);
            // No complete codepoint yet; keep feeding bytes.
            const cp = maybe_cp orelse continue;
            // Same fast path as the terminal's print path: anything up to
            // 0xFF is treated as width 1 without consulting the table.
            const width = if (cp > 0xFF) table.get(@intCast(cp)).width else 1;
            // Store the result so the lookup cannot be optimized away.
            buf[0] = @intCast(width);
        }
    }
 }
 noinline fn benchZiglyph(
    reader: anytype,
    buf: []u8,
--- a/src/main_ghostty.zig
+++ b/src/main_ghostty.zig
@ -308,6 +308,7 @@ test {
    _ = @import("terminal/main.zig");
    _ = @import("terminfo/main.zig");
    _ = @import("simd/main.zig");
    _ = @import("unicode/main.zig");
    // TODO
    _ = @import("blocking_queue.zig");
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@ -11,6 +11,7 @@ const testing = std.testing;
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const simd = @import("../simd/main.zig");
 const unicode = @import("../unicode/main.zig");
 const ansi = @import("ansi.zig");
 const modes = @import("modes.zig");
@ -869,14 +870,10 @@ pub fn print(self: *Terminal, c: u21) !void {
    }
    // Determine the width of this character so we can handle
-    // non-single-width characters properly.
+    // non-single-width characters properly. We have a fast-path for
-    const width: usize = @intCast(simd.codepointWidth(c));
+    // byte-sized characters since they're so common. We can ignore
-
+    // control characters because they're always filtered prior.
-    // Old implementation, 3x slower on ASCII, 2x slower on CJK, etc.
+    const width: usize = if (c <= 0xFF) 1 else @intCast(unicode.table.get(c).width);
    // const width: usize = @intCast(@min(
    //     @max(0, ziglyph.display_width.codePointWidth(c, .half)),
    //     2,
    // ));
    // Note: it is possible to have a width of "3" and a width of "-1"
    // from ziglyph. We should look into those cases and handle them
--- a/src/unicode/lut.zig
+++ b/src/unicode/lut.zig
@ -0,0 +1,179 @@
 const std = @import("std");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 // This whole file is based on the algorithm described here:
 // https://here-be-braces.com/fast-lookup-of-unicode-properties/
 /// Creates a type that is able to generate a 3-level lookup table
 /// from a Unicode codepoint to a mapping of type Elem. The lookup table
 /// generally is expected to be codegen'd and then reloaded, although it
 /// can in theory be generated at runtime.
 ///
 /// Context must have two functions:
 ///   - `get(Context, u21) !Elem`: returns the mapping for a given codepoint.
 ///     The return type must be an error union because `generate` calls it
 ///     with `try`.
 ///   - `eql(Context, Elem, Elem) bool`: returns true if two mappings are equal
 ///
 pub fn Generator(
    comptime Elem: type,
    comptime Context: type,
 ) type {
    return struct {
        const Self = @This();
        /// Number of consecutive codepoints covered by one block.
        const block_size = 256;
        /// One block: a stage3 index for every codepoint it covers.
        const Block = [block_size]u16;
        /// Mapping of a block to its index in the stage2 array.
        const BlockMap = std.HashMap(
            Block,
            u16,
            struct {
                pub fn hash(ctx: @This(), k: Block) u64 {
                    _ = ctx;
                    var hasher = std.hash.Wyhash.init(0);
                    std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
                    return hasher.final();
                }
                pub fn eql(ctx: @This(), a: Block, b: Block) bool {
                    _ = ctx;
                    return std.mem.eql(u16, &a, &b);
                }
            },
            std.hash_map.default_max_load_percentage,
        );
        ctx: Context = undefined,
        /// Generate the lookup tables. The arrays in the return value
        /// are owned by the caller and must be freed.
        ///
        /// Fails with `error.BlockTooLarge` when a stage3 index does not fit
        /// in a u16, with `error.Stage2TooLarge` when a stage2 offset does
        /// not fit in a u16, and with any error returned by `Context.get`
        /// or by the allocator.
        pub fn generate(self: *const Self, alloc: Allocator) !Tables(Elem) {
            // Maps block => stage2 index
            var blocks_map = BlockMap.init(alloc);
            defer blocks_map.deinit();
            // Our stages
            var stage1 = std.ArrayList(u16).init(alloc);
            defer stage1.deinit();
            var stage2 = std.ArrayList(u16).init(alloc);
            defer stage2.deinit();
            var stage3 = std.ArrayList(Elem).init(alloc);
            defer stage3.deinit();
            var block: Block = undefined;
            var block_len: u16 = 0;
            for (0..std.math.maxInt(u21) + 1) |cp| {
                // Get our block value and find the matching result value
                // in our list of possible values in stage3. This way, each
                // possible mapping only gets one entry in stage3.
                const elem = try self.ctx.get(@as(u21, @intCast(cp)));
                const block_idx = block_idx: {
                    for (stage3.items, 0..) |item, i| {
                        if (self.ctx.eql(item, elem)) break :block_idx i;
                    }
                    const idx = stage3.items.len;
                    try stage3.append(elem);
                    break :block_idx idx;
                };
                // The block stores the mapping to the stage3 index
                block[block_len] = std.math.cast(u16, block_idx) orelse return error.BlockTooLarge;
                block_len += 1;
                // If we still have space and we're not done with codepoints,
                // we keep building up the block. Conversely: we finalize this
                // block if we've filled it or are out of codepoints.
                if (block_len < block_size and cp != std.math.maxInt(u21)) continue;
                // Zero the unused tail so the hash and equality checks never
                // read undefined entries.
                if (block_len < block_size) @memset(block[block_len..block_size], 0);
                // Look for the stage2 index for this block. If it doesn't exist
                // we add it to stage2 and update the mapping.
                const gop = try blocks_map.getOrPut(block);
                if (!gop.found_existing) {
                    gop.value_ptr.* = std.math.cast(
                        u16,
                        stage2.items.len,
                    ) orelse return error.Stage2TooLarge;
                    try stage2.appendSlice(block[0..block_len]);
                }
                // Map stage1 => stage2 and reset our block
                try stage1.append(gop.value_ptr.*);
                block_len = 0;
            }
            // All of our lengths must fit in a u16 for this to work
            assert(stage1.items.len <= std.math.maxInt(u16));
            assert(stage2.items.len <= std.math.maxInt(u16));
            assert(stage3.items.len <= std.math.maxInt(u16));
            const stage1_owned = try stage1.toOwnedSlice();
            errdefer alloc.free(stage1_owned);
            const stage2_owned = try stage2.toOwnedSlice();
            errdefer alloc.free(stage2_owned);
            const stage3_owned = try stage3.toOwnedSlice();
            errdefer alloc.free(stage3_owned);
            return .{
                .stage1 = stage1_owned,
                .stage2 = stage2_owned,
                .stage3 = stage3_owned,
            };
        }
    };
 }
 /// Creates a type wrapping a 3-level lookup table. It can resolve the
 /// mapping for a codepoint and serialize the table as Zig source.
 pub fn Tables(comptime Elem: type) type {
    return struct {
        const Self = @This();
        stage1: []const u16,
        stage2: []const u16,
        stage3: []const Elem,
        /// Given a codepoint, returns the mapping for that codepoint.
        pub fn get(self: *const Self, cp: u21) Elem {
            // stage1 is indexed by the bits above the low byte and yields
            // an offset into stage2; the low byte selects the entry there.
            const block_start = self.stage1[cp >> 8];
            const elem_idx = self.stage2[block_start + (cp & 0xFF)];
            return self.stage3[elem_idx];
        }
        /// Writes the lookup table as Zig to the given writer. The
        /// written source declares a `Tables(Elem)` function whose result
        /// holds three constants: stage1, stage2, and stage3. These can be
        /// used to rebuild the lookup table in Zig.
        pub fn writeZig(self: *const Self, writer: anytype) !void {
            try writer.print(
                \\//! This file is auto-generated. Do not edit.
                \\
                \\pub fn Tables(comptime Elem: type) type {{
                \\    return struct {{
                \\pub const stage1: [{}]u16 = .{{
            , .{self.stage1.len});
            try writeEntries(writer, self.stage1);
            try writer.print(
                \\
                \\}};
                \\
                \\pub const stage2: [{}]u16 = .{{
            , .{self.stage2.len});
            try writeEntries(writer, self.stage2);
            try writer.writeAll("};");
            try writer.print(
                \\
                \\pub const stage3: [{}]Elem = .{{
            , .{self.stage3.len});
            try writeEntries(writer, self.stage3);
            try writer.writeAll(
                \\};
                \\    };
                \\}
            );
        }
        /// Writes every entry followed by a comma, with no other separator.
        fn writeEntries(writer: anytype, entries: anytype) !void {
            for (entries) |entry| try writer.print("{},", .{entry});
        }
    };
 }
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@ -0,0 +1,9 @@
 pub const lut = @import("lut.zig");
 const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
 // Reference every declaration in this file so that tests nested in the
 // referenced declarations are picked up and run.
 test {
    @import("std").testing.refAllDecls(@This());
 }
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@ -0,0 +1,105 @@
 const props = @This();
 const std = @import("std");
 const ziglyph = @import("ziglyph");
 const lut = @import("lut.zig");
 /// Ghostty's codepoint property lookup table, backed by the generated
 /// stage arrays.
 pub const table = tables: {
    // The "unicode_tables" module only exists once the Ghostty build.zig
    // has run main() below. Zig analyzes declarations lazily, so
    // referencing it here is still fine.
    const Generated = @import("unicode_tables").Tables(Properties);
    break :tables lut.Tables(Properties){
        .stage1 = &Generated.stage1,
        .stage2 = &Generated.stage2,
        .stage3 = &Generated.stage3,
    };
 };
 /// The per-codepoint properties Ghostty looks up.
 ///
 /// Extending this set makes more properties available, but may also make
 /// the lookup tables less efficient. After any change, run the benchmarks
 /// in src/bench to confirm there is no regression.
 pub const Properties = struct {
    /// Codepoint width, clamped to [0, 2]: Ghostty handles control
    /// characters itself, and wide characters max out at 2 (i.e. 3-em dash
    /// becomes a 2-em dash).
    width: u2 = 0,
    /// Field-wise equality; used by lut.Generator to deduplicate entries.
    pub fn eql(a: Properties, b: Properties) bool {
        const same_width = a.width == b.width;
        return same_width;
    }
    /// Formats the value as a Zig struct literal, which is how entries are
    /// emitted when the tables are written out as Zig source. The layout
    /// string and options are ignored.
    pub fn format(
        self: Properties,
        comptime layout: []const u8,
        opts: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = opts;
        _ = layout;
        return std.fmt.format(writer, ".{{ .width= {}, }}", .{self.width});
    }
 };
 /// Computes the properties of `cp` from ziglyph, clamping the reported
 /// display width into the range [0, 2].
 pub fn get(cp: u21) Properties {
    const raw_width = ziglyph.display_width.codePointWidth(cp, .half);
    const non_negative = @max(0, raw_width);
    const clamped = @min(2, non_negative);
    return .{ .width = @intCast(clamped) };
 }
 /// Runnable binary to generate the lookup tables and output to stdout.
 ///
 /// Output goes through a buffered writer: `writeZig` issues one small
 /// write per table entry, and sending each of those to stdout directly
 /// would be needlessly slow.
 pub fn main() !void {
    const alloc = std.heap.c_allocator;
    const gen: lut.Generator(
        Properties,
        struct {
            pub fn get(ctx: @This(), cp: u21) !Properties {
                _ = ctx;
                return props.get(cp);
            }
            pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
                _ = ctx;
                return a.eql(b);
            }
        },
    ) = .{};
    const t = try gen.generate(alloc);
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);
    defer alloc.free(t.stage3);
    var buffered = std.io.bufferedWriter(std.io.getStdOut().writer());
    try t.writeZig(buffered.writer());
    // Push any remaining buffered bytes out; a failed flush must fail the run
    // so a truncated table file is never silently accepted.
    try buffered.flush();
    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    //     t.stage1.len,
    //     t.stage2.len,
    //     t.stage3.len,
    // });
 }
 // This is not very fast in debug modes, so it's commented out by default.
 // IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
 // test "tables match ziglyph" {
 //     const testing = std.testing;
 //
 //     const min = 0xFF + 1; // start outside ascii
 //     for (min..std.math.maxInt(u21)) |cp| {
 //         const t = table.get(@intCast(cp));
 //         const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
 //         if (t.width != zg) {
 //             std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
 //             try testing.expect(false);
 //         }
 //     }
 // }