unicode: use packed struct for break state

This commit is contained in:
Mitchell Hashimoto
2024-02-09 20:29:36 -08:00
parent c47ad97f62
commit 132fbb3a46
6 changed files with 19 additions and 91 deletions

View File

@ -217,17 +217,6 @@ pub fn build(b: *std.Build) !void {
// Add our benchmarks // Add our benchmarks
try benchSteps(b, target, config, emit_bench); try benchSteps(b, target, config, emit_bench);
{
const exe = b.addExecutable(.{
.name = "grapheme-verify",
.root_source_file = .{ .path = "src/unicode/grapheme.zig" },
.target = target,
.optimize = .ReleaseFast,
});
b.installArtifact(exe);
_ = try addDeps(b, exe, config);
}
// We only build an exe if we have a runtime set. // We only build an exe if we have a runtime set.
const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{ const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{
.name = "ghostty", .name = "ghostty",
@ -1093,15 +1082,6 @@ fn addDeps(
step.linkLibrary(utfcpp_dep.artifact("utfcpp")); step.linkLibrary(utfcpp_dep.artifact("utfcpp"));
try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin()); try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin());
// utf8proc
const utf8proc_dep = b.dependency("utf8proc", .{
.target = target,
.optimize = optimize,
});
step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc"));
step.linkLibrary(utf8proc_dep.artifact("utf8proc"));
try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin());
// Spirv-Cross // Spirv-Cross
step.linkLibrary(spirv_cross_dep.artifact("spirv_cross")); step.linkLibrary(spirv_cross_dep.artifact("spirv_cross"));
try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin()); try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin());

View File

@ -39,7 +39,6 @@
.pixman = .{ .path = "./pkg/pixman" }, .pixman = .{ .path = "./pkg/pixman" },
.simdutf = .{ .path = "./pkg/simdutf" }, .simdutf = .{ .path = "./pkg/simdutf" },
.utfcpp = .{ .path = "./pkg/utfcpp" }, .utfcpp = .{ .path = "./pkg/utfcpp" },
.utf8proc = .{ .path = "./pkg/utf8proc" },
.zlib = .{ .path = "./pkg/zlib" }, .zlib = .{ .path = "./pkg/zlib" },
// Shader translation // Shader translation

View File

@ -27,8 +27,6 @@ hyperfine \
"./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \ "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \
-n ziglyph \ -n ziglyph \
"./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \ "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
-n utf8proc \
"./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
-n table \ -n table \
"./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data" "./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data"

View File

@ -46,8 +46,6 @@ const Mode = enum {
/// Ghostty's table-based approach. /// Ghostty's table-based approach.
table, table,
utf8proc,
}; };
pub const std_options = struct { pub const std_options = struct {
@ -75,7 +73,6 @@ pub fn main() !void {
.noop => try benchNoop(reader, buf), .noop => try benchNoop(reader, buf),
.ziglyph => try benchZiglyph(reader, buf), .ziglyph => try benchZiglyph(reader, buf),
.table => try benchTable(reader, buf), .table => try benchTable(reader, buf),
.utf8proc => try benchUtf8proc(reader, buf),
} }
} }
@ -101,7 +98,7 @@ noinline fn benchTable(
buf: []u8, buf: []u8,
) !void { ) !void {
var d: UTF8Decoder = .{}; var d: UTF8Decoder = .{};
var state: u3 = 0; var state: unicode.GraphemeBreakState = .{};
var cp1: u21 = 0; var cp1: u21 = 0;
while (true) { while (true) {
const n = try reader.read(buf); const n = try reader.read(buf);
@ -145,29 +142,3 @@ noinline fn benchZiglyph(
} }
} }
} }
noinline fn benchUtf8proc(
reader: anytype,
buf: []u8,
) !void {
const utf8proc = @import("utf8proc");
var d: UTF8Decoder = .{};
var state: i32 = 0;
var cp1: u21 = 0;
while (true) {
const n = try reader.read(buf);
if (n == 0) break;
// Using stream.next directly with a for loop applies a naive
// scalar approach.
for (buf[0..n]) |c| {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp2| {
const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state);
buf[0] = @intCast(@intFromBool(v));
cp1 = cp2;
}
}
}
}

View File

@ -18,19 +18,27 @@ const table = props.table;
/// line feeds, and carriage returns are expected to be filtered out before /// line feeds, and carriage returns are expected to be filtered out before
/// calling this function. This is because this function is tuned for /// calling this function. This is because this function is tuned for
/// Ghostty. /// Ghostty.
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool { pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
const gbc1 = table.get(cp1).grapheme_boundary_class; const gbc1 = table.get(cp1).grapheme_boundary_class;
const gbc2 = table.get(cp2).grapheme_boundary_class; const gbc2 = table.get(cp2).grapheme_boundary_class;
return graphemeBreakClass(gbc1, gbc2, state); return graphemeBreakClass(gbc1, gbc2, state);
} }
/// The state that must be maintained between calls to `graphemeBreak`.
pub const BreakState = packed struct(u2) {
extended_pictographic: bool = false,
regional_indicator: bool = false,
};
fn graphemeBreakClass( fn graphemeBreakClass(
gbc1: GraphemeBoundaryClass, gbc1: GraphemeBoundaryClass,
gbc2: GraphemeBoundaryClass, gbc2: GraphemeBoundaryClass,
state: *u3, state: *BreakState,
) bool { ) bool {
// GB11: Emoji Extend* ZWJ x Emoji // GB11: Emoji Extend* ZWJ x Emoji
if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state); if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
state.extended_pictographic = true;
}
// These two properties are ignored because they're not relevant to // These two properties are ignored because they're not relevant to
// Ghostty -- they're filtered out before checking grapheme boundaries. // Ghostty -- they're filtered out before checking grapheme boundaries.
@ -67,56 +75,27 @@ fn graphemeBreakClass(
// GB12, GB13: RI x RI // GB12, GB13: RI x RI
if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) { if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
if (hasRegional(state)) { if (state.regional_indicator) {
unsetRegional(state); state.regional_indicator = false;
return true; return true;
} else { } else {
setRegional(state); state.regional_indicator = true;
return false; return false;
} }
} }
// GB11: Emoji Extend* ZWJ x Emoji // GB11: Emoji Extend* ZWJ x Emoji
if (hasXpic(state) and if (state.extended_pictographic and
gbc1 == .zwj and gbc1 == .zwj and
gbc2 == .extended_pictographic) gbc2 == .extended_pictographic)
{ {
unsetXpic(state); state.extended_pictographic = false;
return false; return false;
} }
return true; return true;
} }
const State = packed struct(u2) {
extended_pictographic: bool = false,
regional_indicator: bool = false,
};
fn hasXpic(state: *const u3) bool {
return state.* & 1 == 1;
}
fn setXpic(state: *u3) void {
state.* |= 1;
}
fn unsetXpic(state: *u3) void {
state.* ^= 1;
}
fn hasRegional(state: *const u3) bool {
return state.* & 2 == 2;
}
fn setRegional(state: *u3) void {
state.* |= 2;
}
fn unsetRegional(state: *u3) void {
state.* ^= 2;
}
/// If you build this file as a binary, we will verify the grapheme break /// If you build this file as a binary, we will verify the grapheme break
/// implementation. This iterates over billions of codepoints so it is /// implementation. This iterates over billions of codepoints so it is
/// SLOW. It's not meant to be run in CI, but it's useful for debugging. /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@ -127,7 +106,7 @@ pub fn main() !void {
const min = 0; const min = 0;
const max = std.math.maxInt(u21) + 1; const max = std.math.maxInt(u21) + 1;
var state: u3 = 0; var state: BreakState = .{};
var zg_state: u3 = 0; var zg_state: u3 = 0;
for (min..max) |cp1| { for (min..max) |cp1| {
if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});

View File

@ -5,6 +5,7 @@ const props = @import("props.zig");
pub const table = props.table; pub const table = props.table;
pub const Properties = props.Properties; pub const Properties = props.Properties;
pub const graphemeBreak = grapheme.graphemeBreak; pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test { test {
@import("std").testing.refAllDecls(@This()); @import("std").testing.refAllDecls(@This());