ghostty/src/unicode/grapheme.zig
Mitchell Hashimoto 0f4d2bb237 Lots of 0.14 changes
2025-03-12 09:55:52 -07:00

209 lines
6.4 KiB
Zig

const std = @import("std");
const props = @import("props.zig");
const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
const table = props.table;
/// Reports whether a grapheme cluster boundary lies between `cp1` and the
/// codepoint `cp2` that follows it.
///
/// Calls must be made in sequence over the text, handing the same `state`
/// to every call; the state is read on entry and overwritten with the
/// updated value before returning.
///
/// Control characters, line feeds, and carriage returns are NOT supported.
/// They are expected to be filtered out before this function is called,
/// because this function is tuned for Ghostty.
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    // Classify both codepoints and combine them with the incoming state
    // into the key of the precomputed table.
    const key: Precompute.Key = .{
        .gbc1 = table.get(cp1).grapheme_boundary_class,
        .gbc2 = table.get(cp2).grapheme_boundary_class,
        .state = state.*,
    };

    // A single table lookup yields both the answer and the next state.
    const entry = Precompute.data[key.index()];
    state.* = entry.state;
    return entry.result;
}
/// The state that must be maintained between calls to `graphemeBreak`.
/// Start from the default value (`.{}`) and pass the same instance to
/// each successive call.
///
/// The field order determines the bit layout of the backing `u2`, which
/// the precomputed table depends on when it bitcasts to and from this type.
pub const BreakState = packed struct(u2) {
/// Set when the first codepoint of a pair is an extended pictographic and
/// cleared again once the GB11 rule (ZWJ followed by an extended
/// pictographic) has used it.
extended_pictographic: bool = false,
/// Flipped each time a regional indicator is followed by another regional
/// indicator (GB12, GB13): set on a pair that is not broken, cleared on
/// the pair that is broken.
regional_indicator: bool = false,
};
/// This is all the structures and data for the precomputed lookup table
/// for all possible permutations of state and grapheme boundary classes.
/// A key is 10 bits wide, so the table has 2^10 slots of 3-bit values
/// (one byte each in the array), which makes the whole table roughly 1KB.
const Precompute = struct {
    /// Lookup key: the incoming break state plus the boundary classes of
    /// the two codepoints. Its `u10` bit pattern is the index into `data`.
    const Key = packed struct(u10) {
        state: BreakState,
        gbc1: GraphemeBoundaryClass,
        gbc2: GraphemeBoundaryClass,

        /// Returns the position of this key inside `data`.
        fn index(self: Key) usize {
            return @intCast(@as(u10, @bitCast(self)));
        }
    };

    /// Table entry: whether the pair breaks, and the state to carry into
    /// the next call.
    const Value = packed struct(u3) {
        result: bool,
        state: BreakState,
    };

    /// The lookup table itself, built at compile time by running
    /// `graphemeBreakClass` for every state and every pair of declared
    /// boundary classes. Slots whose bit pattern does not correspond to
    /// declared enum fields are never written and stay undefined.
    const data = precompute: {
        // One slot for every possible 10-bit key, i.e. indices 0 through
        // maxInt(u10) inclusive. A length of just maxInt(u10) would leave
        // the all-ones key without a slot.
        var result: [std.math.maxInt(u10) + 1]Value = undefined;

        // The nested comptime loops below exceed the default branch quota.
        @setEvalBranchQuota(3_000);

        const info = @typeInfo(GraphemeBoundaryClass).@"enum";
        for (0..std.math.maxInt(u2) + 1) |state_init| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
                    // `state` starts as the incoming state (captured in the
                    // key) and is updated in place by graphemeBreakClass.
                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
                    const key: Key = .{
                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
                        .state = state,
                    };
                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

        break :precompute result;
    };
};
/// Grapheme break decision on boundary classes, following the algorithm
/// from utf8proc. This is only used offline, to fill the precomputed
/// lookup table.
///
/// Returns true when there is a break between a codepoint of class `gbc1`
/// and a following codepoint of class `gbc2`. `state` is read and updated
/// in place.
fn graphemeBreakClass(
    gbc1: GraphemeBoundaryClass,
    gbc2: GraphemeBoundaryClass,
    state: *BreakState,
) bool {
    // GB11 (Emoji Extend* ZWJ x Emoji): remember that the left side of a
    // pair was an extended pictographic.
    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
        state.extended_pictographic = true;
    }

    // GB3 (CR x LF) and GB4 (Control) are deliberately not implemented:
    // they are irrelevant to Ghostty because those codepoints are filtered
    // out before grapheme boundaries are checked.

    // Hangul syllable sequences.
    switch (gbc1) {
        // GB6: L x (L | V | LV | LVT)
        .L => switch (gbc2) {
            .L, .V, .LV, .LVT => return false,
            else => {},
        },

        // GB7: (LV | V) x (V | T)
        .LV, .V => switch (gbc2) {
            .V, .T => return false,
            else => {},
        },

        // GB8: (LVT | T) x T
        .LVT, .T => if (gbc2 == .T) return false,

        else => {},
    }

    // GB9: x (Extend | ZWJ), GB9a: x SpacingMark
    switch (gbc2) {
        .extend, .zwj, .spacing_mark => return false,
        else => {},
    }

    // GB9b: Prepend x
    if (gbc1 == .prepend) return false;

    // GB12, GB13: RI x RI. Regional indicators pair up, so the flag flips
    // on every RI/RI pair and a break is reported only when it was
    // already set.
    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
        const was_set = state.regional_indicator;
        state.regional_indicator = !was_set;
        return was_set;
    }

    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.extended_pictographic and
        gbc1 == .zwj and
        gbc2.isExtendedPictographic())
    {
        state.extended_pictographic = false;
        return false;
    }

    // UTS #51 emoji modifier sequences, which do not appear to be covered
    // by UAX #29: a modifier only attaches to an immediately preceding
    // modifier base.
    //
    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    //
    // Every other pair is a break.
    return !(gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base);
}
/// Entry point used when this file is built as a binary: cross-checks
/// `graphemeBreak` against ziglyph for every pair of codepoints in the
/// configured range and logs each disagreement. The pair space is huge,
/// so this is SLOW. It's not meant to be run in CI, but it's useful for
/// debugging.
pub fn main() !void {
    const ziglyph = @import("ziglyph");

    // Adjust these bounds to narrow the range of codepoints compared.
    const min = 0;
    const max = std.math.maxInt(u21) + 1;

    // Both implementations carry their own state across the whole run.
    var zg_state: u3 = 0;
    var state: BreakState = .{};

    for (min..max) |first| {
        if (first % 1000 == 0) std.log.warn("progress cp1={}", .{first});

        // graphemeBreak does not support CR, LF, or control characters.
        const skip_first = first == '\r' or first == '\n' or
            ziglyph.grapheme_break.isControl(@intCast(first));
        if (skip_first) continue;

        for (min..max) |second| {
            const skip_second = second == '\r' or second == '\n' or
                ziglyph.grapheme_break.isControl(@intCast(second));
            if (skip_second) continue;

            const ours = graphemeBreak(@intCast(first), @intCast(second), &state);
            const theirs = ziglyph.graphemeBreak(@intCast(first), @intCast(second), &zg_state);
            if (ours == theirs) continue;

            std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
                first,
                second,
                ours,
                state,
                theirs,
                zg_state,
            });
        }
    }
}
/// Standard library options that apply when this file is the root of a
/// binary (see `main`): log messages at `.info` level and above are
/// emitted.
///
/// This must be a value of type `std.Options`; the older form of a struct
/// type with nested declarations is not accepted by the Zig version this
/// file targets.
pub const std_options: std.Options = .{
    .log_level = .info,
};
test "grapheme break: emoji modifier" {
    const expect = std.testing.expect;

    // The emoji modifier used as the second codepoint in both cases.
    const modifier: u21 = 0x1F3FF;

    // An emoji followed by the modifier must stay in one cluster.
    {
        var st: BreakState = .{};
        const broke = graphemeBreak(0x261D, modifier, &st);
        try expect(!broke);
    }

    // A non-emoji followed by the modifier must be split.
    {
        var st: BreakState = .{};
        const broke = graphemeBreak(0x22, modifier, &st);
        try expect(broke);
    }
}