Merge pull request #1494 from mitchellh/grapheme-break

Optimized grapheme break detection (6x speedup)
This commit is contained in:
Mitchell Hashimoto
2024-02-10 07:54:06 -08:00
committed by GitHub
8 changed files with 421 additions and 11 deletions

33
src/bench/grapheme-break.sh Executable file
View File

@ -0,0 +1,33 @@
#!/usr/bin/env bash
#
# This is a trivial helper script to help run the grapheme-break benchmark.
# You probably want to tweak this script depending on what you're
# trying to measure.

# Kind of input data to generate. Options:
# - "ascii", uniform random ASCII bytes
# - "utf8", uniform random unicode characters, encoded as utf8
# - "rand", pure random data, will contain many invalid code sequences.
DATA="utf8"

# Number of input bytes fed to every benchmark run.
SIZE="25000000"

# Additional arguments appended to every benchmark command line
# (e.g. "--buffer-size=8192"). Leave empty for none.
ARGS=""

# Generate the benchmark input ahead of time so it's not included in the time.
./zig-out/bin/bench-stream --mode="gen-${DATA}" | head -c "${SIZE}" > /tmp/ghostty_bench_data

# Uncomment to instead use the contents of a local text file as input.
#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data

# Uncomment to instead use the contents of `stream.txt` as input.
# yes $(cat ./stream.txt) | head -c "${SIZE}" > /tmp/ghostty_bench_data

# Each command reads the same pre-generated file on stdin. ARGS is separated
# from the mode flag by a space so it does not need to carry its own
# leading whitespace.
hyperfine \
  --warmup 10 \
  -n noop \
  "./zig-out/bin/bench-grapheme-break --mode=noop ${ARGS} </tmp/ghostty_bench_data" \
  -n ziglyph \
  "./zig-out/bin/bench-grapheme-break --mode=ziglyph ${ARGS} </tmp/ghostty_bench_data" \
  -n table \
  "./zig-out/bin/bench-grapheme-break --mode=table ${ARGS} </tmp/ghostty_bench_data"

View File

@ -0,0 +1,144 @@
//! This benchmark tests the throughput of grapheme break calculation.
//! This is a common operation in terminal character printing for terminals
//! that support grapheme clustering.
//!
//! This will consume all of the available stdin, so you should run it
//! with `head` in a pipe to restrict. For example, to test ASCII input:
//!
//! bench-stream --mode=gen-ascii | head -c 50M | bench-grapheme-break --mode=ziglyph
//!
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ziglyph = @import("ziglyph");
const cli = @import("../cli.zig");
const simd = @import("../simd/main.zig");
const unicode = @import("../unicode/main.zig");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
/// Command line arguments accepted by this benchmark binary.
const Args = struct {
    /// Which benchmark variant to run; defaults to the baseline.
    mode: Mode = .noop,

    /// Size in bytes of the read buffer. This rarely needs changing; it is
    /// a runtime value mainly so the compiler cannot optimize around a
    /// known constant.
    @"buffer-size": usize = 4096,

    /// Set by the CLI parser so that `deinit` can release it.
    _arena: ?ArenaAllocator = null,

    /// Releases the parser arena (when one was set) and invalidates `self`.
    pub fn deinit(self: *Args) void {
        // Poison the struct on every exit path, after the arena is gone.
        defer self.* = undefined;

        const arena = self._arena orelse return;
        arena.deinit();
    }
};
/// The benchmark variants that can be selected with `--mode`.
const Mode = enum {
    /// The baseline mode reads the data from the fd into a buffer and runs
    /// it through the UTF-8 decoder without computing any grapheme breaks.
    /// This shows the minimal overhead of reading and decoding the input
    /// and establishes a baseline for the other modes.
    noop,

    /// Use the ziglyph library to calculate the grapheme break between
    /// each pair of consecutive codepoints.
    ziglyph,

    /// Ghostty's table-based approach.
    table,
};
/// Overrides for `std` defaults; enables debug-level logging for this binary.
pub const std_options = struct {
    pub const log_level: std.log.Level = .debug;
};
/// Entry point: parses the CLI arguments, allocates the read buffer and
/// runs the benchmark selected by `--mode` over everything on stdin.
pub fn main() !void {
    // We want to use the c allocator because it is much faster than GPA.
    const alloc = std.heap.c_allocator;

    // Parse our args
    var args: Args = .{};
    defer args.deinit();
    {
        var iter = try std.process.argsWithAllocator(alloc);
        defer iter.deinit();
        try cli.args.parse(Args, alloc, &args, &iter);
    }

    const reader = std.io.getStdIn().reader();

    // The buffer size is only known at runtime, so the buffer lives on the
    // heap. Pair the allocation with its release right away.
    const buf = try alloc.alloc(u8, args.@"buffer-size");
    defer alloc.free(buf);

    // Dispatch to the selected benchmark; each one reads until stdin is
    // exhausted.
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
        .ziglyph => try benchZiglyph(reader, buf),
        .table => try benchTable(reader, buf),
    }
}
/// Baseline benchmark: reads `reader` to the end through `buf` and feeds
/// every byte to the UTF-8 decoder, discarding the decoded result.
noinline fn benchNoop(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};

    while (true) {
        const len = try reader.read(buf);
        // A zero-length read signals the end of the input.
        if (len == 0) return;

        // Decode one byte at a time: a plain scalar loop.
        for (buf[0..len]) |byte| _ = decoder.next(byte);
    }
}
/// Benchmarks Ghostty's table-based grapheme break check: reads `reader`
/// to the end through `buf`, decodes the bytes as UTF-8 and evaluates the
/// break between every pair of consecutive codepoints.
noinline fn benchTable(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    var break_state: unicode.GraphemeBreakState = .{};
    var prev_cp: u21 = 0;

    while (true) {
        const len = try reader.read(buf);
        // A zero-length read signals the end of the input.
        if (len == 0) return;

        // Decode one byte at a time: a plain scalar loop.
        for (buf[0..len]) |byte| {
            const decoded, const consumed = decoder.next(byte);
            // NOTE(review): this requires the decoder to consume every
            // byte; confirm it holds for input with invalid sequences.
            assert(consumed);

            // No complete codepoint yet; keep feeding bytes.
            const next_cp = decoded orelse continue;

            const is_break = unicode.graphemeBreak(prev_cp, @intCast(next_cp), &break_state);
            // Store the result in the buffer, presumably so the optimizer
            // cannot discard the computation.
            buf[0] = @intCast(@intFromBool(is_break));
            prev_cp = next_cp;
        }
    }
}
/// Benchmarks ziglyph's grapheme break check: reads `reader` to the end
/// through `buf`, decodes the bytes as UTF-8 and evaluates the break
/// between every pair of consecutive codepoints.
noinline fn benchZiglyph(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    var break_state: u3 = 0;
    var prev_cp: u21 = 0;

    while (true) {
        const len = try reader.read(buf);
        // A zero-length read signals the end of the input.
        if (len == 0) return;

        // Decode one byte at a time: a plain scalar loop.
        for (buf[0..len]) |byte| {
            const decoded, const consumed = decoder.next(byte);
            // NOTE(review): this requires the decoder to consume every
            // byte; confirm it holds for input with invalid sequences.
            assert(consumed);

            // No complete codepoint yet; keep feeding bytes.
            const next_cp = decoded orelse continue;

            const is_break = ziglyph.graphemeBreak(prev_cp, @intCast(next_cp), &break_state);
            // Store the result in the buffer, presumably so the optimizer
            // cannot discard the computation.
            buf[0] = @intCast(@intFromBool(is_break));
            prev_cp = next_cp;
        }
    }
}

View File

@ -141,4 +141,5 @@ pub const ExeEntrypoint = enum {
bench_parser, bench_parser,
bench_stream, bench_stream,
bench_codepoint_width, bench_codepoint_width,
bench_grapheme_break,
}; };

View File

@ -9,4 +9,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
.bench_parser => @import("bench/parser.zig"), .bench_parser => @import("bench/parser.zig"),
.bench_stream => @import("bench/stream.zig"), .bench_stream => @import("bench/stream.zig"),
.bench_codepoint_width => @import("bench/codepoint-width.zig"), .bench_codepoint_width => @import("bench/codepoint-width.zig"),
.bench_grapheme_break => @import("bench/grapheme-break.zig"),
}; };

View File

@ -6,7 +6,6 @@ const Terminal = @This();
const std = @import("std"); const std = @import("std");
const builtin = @import("builtin"); const builtin = @import("builtin");
const ziglyph = @import("ziglyph");
const testing = std.testing; const testing = std.testing;
const assert = std.debug.assert; const assert = std.debug.assert;
const Allocator = std.mem.Allocator; const Allocator = std.mem.Allocator;
@ -786,24 +785,19 @@ pub fn print(self: *Terminal, c: u21) !void {
if (prev.cell.char == 0) break :grapheme; if (prev.cell.char == 0) break :grapheme;
const grapheme_break = brk: { const grapheme_break = brk: {
var state: u3 = 0; var state: unicode.GraphemeBreakState = .{};
var cp1: u21 = @intCast(prev.cell.char); var cp1: u21 = @intCast(prev.cell.char);
if (prev.cell.attrs.grapheme) { if (prev.cell.attrs.grapheme) {
var it = row.codepointIterator(prev.x); var it = row.codepointIterator(prev.x);
while (it.next()) |cp2| { while (it.next()) |cp2| {
// log.debug("cp1={x} cp2={x}", .{ cp1, cp2 }); // log.debug("cp1={x} cp2={x}", .{ cp1, cp2 });
assert(!ziglyph.graphemeBreak( assert(!unicode.graphemeBreak(cp1, cp2, &state));
cp1,
cp2,
&state,
));
cp1 = cp2; cp1 = cp2;
} }
} }
// log.debug("cp1={x} cp2={x} end", .{ cp1, c }); // log.debug("cp1={x} cp2={x} end", .{ cp1, c });
break :brk ziglyph.graphemeBreak(cp1, c, &state); break :brk unicode.graphemeBreak(cp1, c, &state);
}; };
// If we can NOT break, this means that "c" is part of a grapheme // If we can NOT break, this means that "c" is part of a grapheme

183
src/unicode/grapheme.zig Normal file
View File

@ -0,0 +1,183 @@
const std = @import("std");
const props = @import("props.zig");
const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
const table = props.table;
/// Reports whether a grapheme break exists between `cp1` and `cp2`.
/// Calls must be made sequentially over the codepoint stream, passing the
/// same `state` each time so it carries over between calls.
///
/// Control characters are NOT supported. Control characters, line feeds
/// and carriage returns are expected to be filtered out by the caller
/// before this function is reached, because it is tuned for Ghostty.
///
/// The answer comes from a single lookup in the precomputed table, keyed
/// by both boundary classes and the incoming state.
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    const key: Precompute.Key = .{
        .gbc1 = table.get(cp1).grapheme_boundary_class,
        .gbc2 = table.get(cp2).grapheme_boundary_class,
        .state = state.*,
    };

    const entry = Precompute.data[key.index()];
    state.* = entry.state;
    return entry.result;
}
/// The state that must be maintained between calls to `graphemeBreak`.
/// Start from the default value (`.{}`); the two flags fit in a `u2`.
pub const BreakState = packed struct(u2) {
    /// Set once the left-hand codepoint of a pair is extended pictographic;
    /// cleared again when a ZWJ followed by an extended pictographic
    /// codepoint is joined (rule GB11).
    extended_pictographic: bool = false,

    /// Toggled on each pair of adjacent regional indicators so that they
    /// are joined two at a time (rules GB12/GB13).
    regional_indicator: bool = false,
};
/// This is all the structures and data for the precomputed lookup table
/// for all possible permutations of state and grapheme boundary classes.
/// A key is 10 bits wide, so there are 2^10 possible keys, each mapping to
/// a 3-bit value; the whole table therefore stays very small.
const Precompute = struct {
    /// Lookup key: the incoming state plus the boundary classes of the two
    /// codepoints, packed into 10 bits.
    const Key = packed struct(u10) {
        state: BreakState,
        gbc1: GraphemeBoundaryClass,
        gbc2: GraphemeBoundaryClass,

        /// The position of this key within `data`.
        fn index(self: Key) usize {
            return @intCast(@as(u10, @bitCast(self)));
        }
    };

    /// Lookup result: whether there is a break, plus the outgoing state.
    const Value = packed struct(u3) {
        result: bool,
        state: BreakState,
    };

    /// The table itself, computed at compile time by running
    /// `graphemeBreakClass` for every state and pair of declared classes.
    const data = precompute: {
        // One slot for every possible u10 key, i.e. maxInt + 1 entries, so
        // that no key can ever index out of bounds. Slots whose bits do not
        // correspond to a declared boundary class are never written below
        // and remain undefined.
        var result: [std.math.maxInt(u10) + 1]Value = undefined;

        @setEvalBranchQuota(2_000);
        const info = @typeInfo(GraphemeBoundaryClass).Enum;
        for (0..std.math.maxInt(u2) + 1) |state_init| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));

                    // Build the key from the state *before* the call below
                    // mutates it; the mutated state goes into the value.
                    const key: Key = .{
                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
                        .state = state,
                    };
                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

        break :precompute result;
    };
};
/// The grapheme break algorithm from utf8proc, expressed over boundary
/// classes. It is only used offline to precompute the lookup table.
///
/// Returns true when there is a break between `gbc1` and `gbc2`, and
/// updates `state` for the next pair.
fn graphemeBreakClass(
    gbc1: GraphemeBoundaryClass,
    gbc2: GraphemeBoundaryClass,
    state: *BreakState,
) bool {
    // GB11: Emoji Extend* ZWJ x Emoji
    // Remember that an emoji sequence has started.
    if (gbc1 == .extended_pictographic) state.extended_pictographic = true;

    // These two properties are ignored because they're not relevant to
    // Ghostty -- they're filtered out before checking grapheme boundaries.
    // GB3: CR x LF
    // GB4: Control

    // Hangul syllable sequences.
    switch (gbc1) {
        // GB6: Hangul L x (L|V|LV|LVT)
        .L => switch (gbc2) {
            .L, .V, .LV, .LVT => return false,
            else => {},
        },

        // GB7: Hangul (LV | V) x (V | T)
        .LV, .V => switch (gbc2) {
            .V, .T => return false,
            else => {},
        },

        // GB8: Hangul (LVT | T) x T
        .LVT, .T => if (gbc2 == .T) return false,

        else => {},
    }

    switch (gbc2) {
        // GB9: x (Extend | ZWJ)
        .extend, .zwj => return false,

        // GB9a: x SpacingMark
        .spacing_mark => return false,

        else => {},
    }

    // GB9b: Prepend x
    if (gbc1 == .prepend) return false;

    // GB12, GB13: RI x RI
    // Regional indicators join in pairs: the flag records whether the
    // left-hand indicator is already the second half of a pair.
    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
        const already_paired = state.regional_indicator;
        state.regional_indicator = !already_paired;
        return already_paired;
    }

    // GB11: Emoji Extend* ZWJ x Emoji
    const joins_emoji = state.extended_pictographic and
        gbc1 == .zwj and
        gbc2 == .extended_pictographic;
    if (joins_emoji) {
        state.extended_pictographic = false;
        return false;
    }

    // No rule prevented a break.
    return true;
}
/// When this file is built as a binary, this verifies the grapheme break
/// implementation against ziglyph for every pair of codepoints in the
/// configured range and logs each disagreement. That is an enormous number
/// of pairs, so it is SLOW. It's not meant to be run in CI, but it's
/// useful for debugging.
pub fn main() !void {
    const ziglyph = @import("ziglyph");

    // Set the min and max to control the test range.
    const min = 0;
    const max = std.math.maxInt(u21) + 1;

    // Both implementations carry their state across all pairs.
    var state: BreakState = .{};
    var zg_state: u3 = 0;

    for (min..max) |cp1| {
        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});

        // Skip codepoints that graphemeBreak does not support.
        if (cp1 == '\r' or cp1 == '\n') continue;
        if (ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;

        for (min..max) |cp2| {
            if (cp2 == '\r' or cp2 == '\n') continue;
            if (ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;

            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
            if (gb == zg_gb) continue;

            // The two implementations disagree; report the pair.
            std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
                cp1,
                cp2,
                gb,
                state,
                zg_gb,
                zg_state,
            });
        }
    }
}
/// Overrides for `std` defaults when this file is built as a binary;
/// sets the log level to info.
pub const std_options = struct {
    pub const log_level: std.log.Level = .info;
};

View File

@ -1,8 +1,11 @@
pub const lut = @import("lut.zig"); pub const lut = @import("lut.zig");
const grapheme = @import("grapheme.zig");
const props = @import("props.zig"); const props = @import("props.zig");
pub const table = props.table; pub const table = props.table;
pub const Properties = props.Properties; pub const Properties = props.Properties;
pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test { test {
@import("std").testing.refAllDecls(@This()); @import("std").testing.refAllDecls(@This());

View File

@ -27,9 +27,13 @@ pub const Properties = struct {
/// becomes a 2-em dash). /// becomes a 2-em dash).
width: u2 = 0, width: u2 = 0,
/// Grapheme boundary class.
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
// Needed for lut.Generator // Needed for lut.Generator
pub fn eql(a: Properties, b: Properties) bool { pub fn eql(a: Properties, b: Properties) bool {
return a.width == b.width; return a.width == b.width and
a.grapheme_boundary_class == b.grapheme_boundary_class;
} }
// Needed for lut.Generator // Needed for lut.Generator
@ -41,17 +45,64 @@ pub const Properties = struct {
) !void { ) !void {
_ = layout; _ = layout;
_ = opts; _ = opts;
try std.fmt.format(writer, ".{{ .width= {}, }}", .{ try std.fmt.format(writer,
\\.{{
\\ .width= {},
\\ .grapheme_boundary_class= .{s},
\\}}
, .{
self.width, self.width,
@tagName(self.grapheme_boundary_class),
}); });
} }
}; };
/// Possible grapheme boundary classes. This isn't an exhaustive list:
/// we omit control, CR, LF, etc. because in Ghostty's usage they are
/// impossible: they're handled by the terminal before grapheme
/// boundaries are ever checked.
pub const GraphemeBoundaryClass = enum(u4) {
    /// Any codepoint that doesn't fall into one of the classes below.
    invalid,
    L,
    V,
    T,
    LV,
    LVT,
    prepend,
    extend,
    zwj,
    spacing_mark,
    regional_indicator,
    extended_pictographic,

    /// Gets the grapheme boundary class for a codepoint. This is VERY
    /// SLOW. The use case for this is only in generating lookup tables.
    ///
    /// The checks run in order and the first match wins, so the emoji
    /// checks take precedence over the grapheme_break checks.
    pub fn init(cp: u21) GraphemeBoundaryClass {
        if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
        // Emoji modifiers are classified as extend.
        if (ziglyph.emoji.isEmojiModifier(cp)) return .extend;
        if (ziglyph.grapheme_break.isL(cp)) return .L;
        if (ziglyph.grapheme_break.isV(cp)) return .V;
        if (ziglyph.grapheme_break.isT(cp)) return .T;
        if (ziglyph.grapheme_break.isLv(cp)) return .LV;
        if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
        if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
        if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
        if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
        if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
        if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;

        // This is obviously not INVALID invalid, there is SOME grapheme
        // boundary class for every codepoint. But we don't care about
        // anything that doesn't fit into the above categories.
        return .invalid;
    }
};
pub fn get(cp: u21) Properties { pub fn get(cp: u21) Properties {
const zg_width = ziglyph.display_width.codePointWidth(cp, .half); const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
return .{ return .{
.width = @intCast(@min(2, @max(0, zg_width))), .width = @intCast(@min(2, @max(0, zg_width))),
.grapheme_boundary_class = GraphemeBoundaryClass.init(cp),
}; };
} }