unicode: generate our own lookup tables

This commit is contained in:
Mitchell Hashimoto
2024-02-08 21:01:11 -08:00
parent 4ae41579da
commit 9755d0696e
7 changed files with 361 additions and 0 deletions

View File

@ -1172,6 +1172,7 @@ fn addDeps(
} }
try addHelp(b, step, config); try addHelp(b, step, config);
try addUnicodeTables(b, step);
return static_libs; return static_libs;
} }
@ -1218,6 +1219,42 @@ fn addHelp(
} }
} }
/// Generate the unicode fast lookup tables and expose them to `step_`.
///
/// A host executable named "unigen" is built from src/unicode/props.zig and
/// run; its captured stdout is the generated Zig source. That output is
/// attached to the given compile step as the anonymous import
/// "unicode_tables". When `step_` is null nothing is attached and the
/// generator executable is installed instead.
fn addUnicodeTables(
    b: *std.Build,
    step_: ?*std.Build.Step.Compile,
) !void {
    // Static storage shared by every call: the generator is only wired into
    // the build graph the first time through, later calls reuse its output.
    const Memo = struct {
        var tables: ?std.Build.LazyPath = null;
    };

    if (Memo.tables == null) {
        const unigen = b.addExecutable(.{
            .name = "unigen",
            .root_source_file = .{ .path = "src/unicode/props.zig" },
            .target = b.host,
        });
        if (step_ == null) b.installArtifact(unigen);

        const ziglyph_dep = b.dependency("ziglyph", .{
            .target = b.host,
        });
        unigen.root_module.addImport("ziglyph", ziglyph_dep.module("ziglyph"));

        const unigen_run = b.addRunArtifact(unigen);
        Memo.tables = unigen_run.captureStdOut();
    }
    const tables = Memo.tables.?;

    // Nothing to attach the tables to.
    const step = step_ orelse return;

    tables.addStepDependencies(&step.step);
    step.root_module.addAnonymousImport("unicode_tables", .{
        .root_source_file = tables,
    });
}
/// Generate documentation (manpages, etc.) from help strings /// Generate documentation (manpages, etc.) from help strings
fn buildDocumentation( fn buildDocumentation(
b: *std.Build, b: *std.Build,

View File

@ -31,6 +31,8 @@ hyperfine \
"./zig-out/bin/bench-codepoint-width --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \ "./zig-out/bin/bench-codepoint-width --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
-n ziglyph \ -n ziglyph \
"./zig-out/bin/bench-codepoint-width --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \ "./zig-out/bin/bench-codepoint-width --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
-n table \
"./zig-out/bin/bench-codepoint-width --mode=table${ARGS} </tmp/ghostty_bench_data" \
-n simd \ -n simd \
"./zig-out/bin/bench-codepoint-width --mode=simd${ARGS} </tmp/ghostty_bench_data" "./zig-out/bin/bench-codepoint-width --mode=simd${ARGS} </tmp/ghostty_bench_data"

View File

@ -17,6 +17,7 @@ const ArenaAllocator = std.heap.ArenaAllocator;
const ziglyph = @import("ziglyph"); const ziglyph = @import("ziglyph");
const cli = @import("../cli.zig"); const cli = @import("../cli.zig");
const simd = @import("../simd/main.zig"); const simd = @import("../simd/main.zig");
const table = @import("../unicode/main.zig").table;
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const Args = struct { const Args = struct {
@ -53,6 +54,8 @@ const Mode = enum {
/// Our SIMD implementation. /// Our SIMD implementation.
simd, simd,
table,
}; };
pub const std_options = struct { pub const std_options = struct {
@ -82,6 +85,7 @@ pub fn main() !void {
.utf8proc => try benchUtf8proc(reader, buf), .utf8proc => try benchUtf8proc(reader, buf),
.ziglyph => try benchZiglyph(reader, buf), .ziglyph => try benchZiglyph(reader, buf),
.simd => try benchSimd(reader, buf), .simd => try benchSimd(reader, buf),
.table => try benchTable(reader, buf),
} }
} }
@ -153,6 +157,30 @@ noinline fn benchUtf8proc(
} }
} }
/// Benchmark codepoint width lookups through the generated unicode lookup
/// table. Input is read from `reader` into `buf` until a read returns zero
/// bytes; every decoded codepoint is looked up in `table`.
noinline fn benchTable(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;

        // Feeding the decoder one byte at a time in a plain for loop is
        // a naive scalar approach.
        for (buf[0..len]) |byte| {
            const maybe_cp, const consumed = decoder.next(byte);
            assert(consumed);

            // No complete codepoint yet: keep feeding bytes.
            const cp = maybe_cp orelse continue;

            // Store the width into the buffer so the lookup can't be
            // compiled away.
            const props = table.get(@intCast(cp));
            buf[0] = @intCast(props.width);
        }
    }
}
noinline fn benchZiglyph( noinline fn benchZiglyph(
reader: anytype, reader: anytype,
buf: []u8, buf: []u8,

View File

@ -308,6 +308,7 @@ test {
_ = @import("terminal/main.zig"); _ = @import("terminal/main.zig");
_ = @import("terminfo/main.zig"); _ = @import("terminfo/main.zig");
_ = @import("simd/main.zig"); _ = @import("simd/main.zig");
_ = @import("unicode/main.zig");
// TODO // TODO
_ = @import("blocking_queue.zig"); _ = @import("blocking_queue.zig");

179
src/unicode/lut.zig Normal file
View File

@ -0,0 +1,179 @@
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
// This whole file is based on the algorithm described here:
// https://here-be-braces.com/fast-lookup-of-unicode-properties/
/// Creates a type that is able to generate a 3-level lookup table
/// from a Unicode codepoint to a mapping of type Elem. The lookup table
/// generally is expected to be codegen'd and then reloaded, although it
/// can in theory be generated at runtime.
///
/// Context must have two functions:
///   - `get(Context, u21) !Elem`: returns the mapping for a given codepoint.
///     The result is unwrapped with `try`, so the return type must be an
///     error union; any error aborts generation and is returned to the
///     caller of `generate`.
///   - `eql(Context, Elem, Elem) bool`: returns true if two mappings are equal
///
pub fn Generator(
    comptime Elem: type,
    comptime Context: type,
) type {
    return struct {
        const Self = @This();

        /// Number of consecutive codepoints grouped under one stage1 entry.
        const block_size = 256;

        /// One block of stage3 indices, one per codepoint in the block.
        const Block = [block_size]u16;

        /// Mapping of a block to its index in the stage2 array.
        const BlockMap = std.HashMap(
            Block,
            u16,
            struct {
                pub fn hash(ctx: @This(), k: Block) u64 {
                    _ = ctx;
                    var hasher = std.hash.Wyhash.init(0);
                    std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
                    return hasher.final();
                }

                pub fn eql(ctx: @This(), a: Block, b: Block) bool {
                    _ = ctx;
                    return std.mem.eql(u16, &a, &b);
                }
            },
            std.hash_map.default_max_load_percentage,
        );

        ctx: Context = undefined,

        /// Generate the lookup tables. The arrays in the return value
        /// are owned by the caller and must be freed.
        ///
        /// Fails with `error.BlockTooLarge` if the number of distinct
        /// mappings (stage3 entries) no longer fits in a u16 index, with
        /// `error.Stage2TooLarge` if a block's offset into stage2 no longer
        /// fits in a u16, with any error returned by `Context.get`, or with
        /// an allocation error.
        pub fn generate(self: *const Self, alloc: Allocator) !Tables(Elem) {
            // Maps block => stage2 index
            var blocks_map = BlockMap.init(alloc);
            defer blocks_map.deinit();

            // Our stages
            var stage1 = std.ArrayList(u16).init(alloc);
            defer stage1.deinit();
            var stage2 = std.ArrayList(u16).init(alloc);
            defer stage2.deinit();
            var stage3 = std.ArrayList(Elem).init(alloc);
            defer stage3.deinit();

            var block: Block = undefined;
            var block_len: u16 = 0;
            for (0..std.math.maxInt(u21) + 1) |cp| {
                // Get our block value and find the matching result value
                // in our list of possible values in stage3. This way, each
                // possible mapping only gets one entry in stage3.
                const elem = try self.ctx.get(@as(u21, @intCast(cp)));
                const block_idx = block_idx: {
                    for (stage3.items, 0..) |item, i| {
                        if (self.ctx.eql(item, elem)) break :block_idx i;
                    }

                    const idx = stage3.items.len;
                    try stage3.append(elem);
                    break :block_idx idx;
                };

                // The block stores the mapping to the stage3 index
                block[block_len] = std.math.cast(u16, block_idx) orelse return error.BlockTooLarge;
                block_len += 1;

                // If we still have space and we're not done with codepoints,
                // we keep building up the block. Conversely: we finalize this
                // block if we've filled it or are out of codepoints.
                if (block_len < block_size and cp != std.math.maxInt(u21)) continue;

                // Zero the unused tail of a partially filled final block so
                // the whole array is defined before it is used as a map key.
                if (block_len < block_size) @memset(block[block_len..block_size], 0);

                // Look for the stage2 index for this block. If it doesn't exist
                // we add it to stage2 and update the mapping.
                const gop = try blocks_map.getOrPut(block);
                if (!gop.found_existing) {
                    gop.value_ptr.* = std.math.cast(
                        u16,
                        stage2.items.len,
                    ) orelse return error.Stage2TooLarge;
                    try stage2.appendSlice(block[0..block_len]);
                }

                // Map stage1 => stage2 and reset our block
                try stage1.append(gop.value_ptr.*);
                block_len = 0;
            }

            // All of our lengths must fit in a u16 for this to work
            assert(stage1.items.len <= std.math.maxInt(u16));
            assert(stage2.items.len <= std.math.maxInt(u16));
            assert(stage3.items.len <= std.math.maxInt(u16));

            // Hand ownership to the caller; if a later conversion fails the
            // slices already taken are released again.
            const stage1_owned = try stage1.toOwnedSlice();
            errdefer alloc.free(stage1_owned);
            const stage2_owned = try stage2.toOwnedSlice();
            errdefer alloc.free(stage2_owned);
            const stage3_owned = try stage3.toOwnedSlice();
            errdefer alloc.free(stage3_owned);

            return .{
                .stage1 = stage1_owned,
                .stage2 = stage2_owned,
                .stage3 = stage3_owned,
            };
        }
    };
}
/// Builds the lookup-table type for mappings of type Elem. A value of this
/// type holds the three stages of the table and can resolve a codepoint to
/// its mapping or serialize itself as Zig source.
pub fn Tables(comptime Elem: type) type {
    return struct {
        const Self = @This();

        stage1: []const u16,
        stage2: []const u16,
        stage3: []const Elem,

        /// Given a codepoint, returns the mapping for that codepoint.
        pub fn get(self: *const Self, cp: u21) Elem {
            // The upper bits select the block, the low 8 bits the slot
            // inside that block.
            const block_start = self.stage1[cp >> 8];
            const slot = cp & 0xFF;
            const elem_idx = self.stage2[block_start + slot];
            return self.stage3[elem_idx];
        }

        /// Writes the lookup table as Zig to the given writer. The
        /// written source declares a `Tables(Elem)` function whose result
        /// exports three constants: stage1, stage2, and stage3. These can
        /// be used to rebuild the lookup table in Zig.
        pub fn writeZig(self: *const Self, writer: anytype) !void {
            // Header and start of the stage1 array.
            try writer.print(
                \\//! This file is auto-generated. Do not edit.
                \\
                \\pub fn Tables(comptime Elem: type) type {{
                \\ return struct {{
                \\pub const stage1: [{}]u16 = .{{
            , .{self.stage1.len});
            try writeEntries(writer, self.stage1);

            // Close stage1, open stage2.
            try writer.print(
                \\
                \\}};
                \\
                \\pub const stage2: [{}]u16 = .{{
            , .{self.stage2.len});
            try writeEntries(writer, self.stage2);
            try writer.writeAll("};");

            // stage3 holds the actual Elem values.
            try writer.print(
                \\
                \\pub const stage3: [{}]Elem = .{{
            , .{self.stage3.len});
            try writeEntries(writer, self.stage3);

            // Close stage3, the struct and the function.
            try writer.writeAll(
                \\};
                \\ };
                \\}
            );
        }

        /// Writes every entry followed by a comma.
        fn writeEntries(writer: anytype, entries: anytype) !void {
            for (entries) |entry| try writer.print("{},", .{entry});
        }
    };
}

9
src/unicode/main.zig Normal file
View File

@ -0,0 +1,9 @@
pub const lut = @import("lut.zig");
const props = @import("props.zig");
pub const table = props.table;
pub const Properties = props.Properties;
test {
    // Reference every declaration of this file so that it gets analyzed.
    const testing = @import("std").testing;
    testing.refAllDecls(@This());
}

105
src/unicode/props.zig Normal file
View File

@ -0,0 +1,105 @@
const props = @This();
const std = @import("std");
const ziglyph = @import("ziglyph");
const lut = @import("lut.zig");
/// The lookup tables for Ghostty, wrapping the generated stage arrays in
/// the `lut.Tables` lookup type.
pub const table = blk: {
    // The "unicode_tables" module only exists once main() below has been run
    // as part of the Ghostty build.zig. Because Zig analyzes declarations
    // lazily we can still refer to it from here.
    const stages = @import("unicode_tables").Tables(Properties);

    break :blk lut.Tables(Properties){
        .stage1 = &stages.stage1,
        .stage2 = &stages.stage2,
        .stage3 = &stages.stage3,
    };
};
/// The set of per-codepoint properties Ghostty looks up.
///
/// New fields make more properties available, but they can also make the
/// lookup tables less efficient. After any change here, run the benchmarks
/// in src/bench to make sure nothing regressed.
pub const Properties = struct {
    /// Codepoint width, clamped to [0, 2]: Ghostty handles control
    /// characters itself and wide characters max out at 2 (i.e. a 3-em
    /// dash becomes a 2-em dash).
    width: u2 = 0,

    /// Equality of two property sets. Needed for lut.Generator.
    pub fn eql(a: Properties, b: Properties) bool {
        const same_width = a.width == b.width;
        return same_width;
    }

    /// Formats the value as a Zig struct literal; the layout string and
    /// format options are ignored. Needed for lut.Generator.
    pub fn format(
        self: Properties,
        comptime layout: []const u8,
        opts: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = opts;
        _ = layout;

        const literal = ".{{ .width= {}, }}";
        return std.fmt.format(writer, literal, .{self.width});
    }
};
/// Computes the properties of a codepoint from ziglyph: the width is the
/// ziglyph display width (requested with the `.half` option) clamped
/// to [0, 2].
pub fn get(cp: u21) Properties {
    const raw_width = ziglyph.display_width.codePointWidth(cp, .half);
    const non_negative = @max(0, raw_width);
    const clamped = @min(2, non_negative);
    return .{ .width = @intCast(clamped) };
}
/// Runnable binary to generate the lookup tables and output to stdout.
///
/// The tables are generated with `props.get` as the per-codepoint mapping
/// and `Properties.eql` as the equality, then written as Zig source.
pub fn main() !void {
    const alloc = std.heap.c_allocator;

    const gen: lut.Generator(
        Properties,
        struct {
            pub fn get(ctx: @This(), cp: u21) !Properties {
                _ = ctx;
                return props.get(cp);
            }

            pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
                _ = ctx;
                return a.eql(b);
            }
        },
    ) = .{};

    const t = try gen.generate(alloc);
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);
    defer alloc.free(t.stage3);

    // writeZig issues one small print per table entry. Buffer stdout so these
    // are batched instead of each one hitting the file descriptor directly.
    // The flush is required: without it the tail of the output would be lost.
    var buffered = std.io.bufferedWriter(std.io.getStdOut().writer());
    try t.writeZig(buffered.writer());
    try buffered.flush();

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    // t.stage1.len,
    // t.stage2.len,
    // t.stage3.len,
    // });
}
// This is not very fast in debug modes, so it's commented out by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
// test "tables match ziglyph" {
// const testing = std.testing;
//
// const min = 0xFF + 1; // start outside ascii
// for (min..std.math.maxInt(u21)) |cp| {
// const t = table.get(@intCast(cp));
// const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
// if (t.width != zg) {
// std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
// try testing.expect(false);
// }
// }
// }