// ghostty/src/unicode/grapheme.zig

const std = @import("std");
const props = @import("props.zig");
const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
const table = props.table;

/// Reports whether a grapheme cluster boundary falls between `cp1` and
/// `cp2`. Calls must be made in sequence, passing the same `state` each
/// time; `state` is read and then overwritten on every call.
///
/// Control characters are NOT supported. Callers are expected to filter
/// out control characters, line feeds, and carriage returns before calling
/// this function, because it is tuned for Ghostty.
///
/// Returns true if there is a break between the two codepoints.
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    // Pack both boundary classes and the incoming state into a table key.
    const key: Precompute.Key = .{
        .gbc1 = table.get(cp1).grapheme_boundary_class,
        .gbc2 = table.get(cp2).grapheme_boundary_class,
        .state = state.*,
    };

    // A single lookup yields both the break decision and the next state.
    const entry = Precompute.data[key.index()];
    state.* = entry.state;
    return entry.result;
}

/// The state that must be maintained between calls to `graphemeBreak`.
/// It is two bits wide (`packed struct(u2)`) and is embedded in both the
/// key and the value of the precomputed lookup table.
pub const BreakState = packed struct(u2) {
    /// Set once the left-hand class of a pair is extended pictographic;
    /// cleared again when the GB11 rule (ZWJ x emoji) joins a pair.
    extended_pictographic: bool = false,
    /// Toggled by regional-indicator pairs: set when a pair is joined,
    /// cleared when the next regional-indicator pair is split.
    regional_indicator: bool = false,
};

/// This is all the structures and data for the precomputed lookup table
/// for all possible permutations of state and grapheme boundary classes.
/// A key is 10 bits wide, so the table has 2^10 slots holding 3-bit
/// values, which makes the whole table roughly 1KB.
const Precompute = struct {
    /// Lookup key: the incoming break state plus the boundary classes of
    /// the left (`gbc1`) and right (`gbc2`) codepoints, packed into 10 bits.
    const Key = packed struct(u10) {
        state: BreakState,
        gbc1: GraphemeBoundaryClass,
        gbc2: GraphemeBoundaryClass,

        /// The raw 10-bit pattern of the key, used as the table index.
        fn index(self: Key) usize {
            return @intCast(@as(u10, @bitCast(self)));
        }
    };

    /// Table entry: the break decision and the state to carry forward.
    const Value = packed struct(u3) {
        result: bool,
        state: BreakState,
    };

    /// Number of slots in the table: one for every value `Key.index()` can
    /// return, i.e. 0 through maxInt(u10) inclusive.
    const len = std.math.maxInt(u10) + 1;

    const data = precompute: {
        // Only slots reachable from declared enum fields are filled below;
        // any remaining slots stay undefined.
        var result: [len]Value = undefined;

        @setEvalBranchQuota(3_000);
        const info = @typeInfo(GraphemeBoundaryClass).@"enum";
        for (0..std.math.maxInt(u2) + 1) |state_init| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
                    // The key captures the state BEFORE the rules run; the
                    // stored value captures the state AFTER.
                    const key: Key = .{
                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
                        .state = state,
                    };
                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

        break :precompute result;
    };
};

/// Break rules expressed on boundary classes, following the algorithm from
/// utf8proc. This is only evaluated offline to build the precomputed
/// lookup table.
///
/// Returns true if there is a break between a codepoint of class `gbc1`
/// and a following codepoint of class `gbc2`. `state` is updated in place
/// for the rules that need history (GB11 and GB12/GB13); falling through
/// to the final break decision does not reset it.
fn graphemeBreakClass(
    gbc1: GraphemeBoundaryClass,
    gbc2: GraphemeBoundaryClass,
    state: *BreakState,
) bool {
    // GB11 bookkeeping: remember that an emoji appeared on the left side.
    state.extended_pictographic = state.extended_pictographic or
        gbc1.isExtendedPictographic();

    // GB3 (CR x LF) and GB4 (Control) are deliberately not handled here:
    // they are not relevant to Ghostty because those codepoints are
    // filtered out before grapheme boundaries are checked.

    // GB6, GB7, GB8: Hangul syllable sequences are kept together.
    const hangul_joined = switch (gbc1) {
        // GB6: L x (L | V | LV | LVT)
        .L => switch (gbc2) {
            .L, .V, .LV, .LVT => true,
            else => false,
        },
        // GB7: (LV | V) x (V | T)
        .LV, .V => gbc2 == .V or gbc2 == .T,
        // GB8: (LVT | T) x T
        .LVT, .T => gbc2 == .T,
        else => false,
    };
    if (hangul_joined) return false;

    switch (gbc2) {
        // GB9: x (Extend | ZWJ)
        // GB9a: x SpacingMark
        .extend, .zwj, .spacing_mark => return false,
        else => {},
    }

    // GB9b: Prepend x
    if (gbc1 == .prepend) return false;

    // GB12, GB13: RI x RI. Regional indicators join in pairs, so the flag
    // flips on every RI/RI pair and the pair splits only when the flag was
    // already set.
    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
        const split = state.regional_indicator;
        state.regional_indicator = !split;
        return split;
    }

    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.extended_pictographic and
        gbc1 == .zwj and
        gbc2.isExtendedPictographic())
    {
        state.extended_pictographic = false;
        return false;
    }

    // UTS #51. This isn't covered by UAX #29 as far as I can tell (but
    // I'm probably wrong). Emoji modifiers only attach to the preceding
    // codepoint when that codepoint is a modifier base:
    //
    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    //
    // Everything else breaks.
    return !(gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base);
}

/// Entry point used when this file is built as a binary: it cross-checks
/// `graphemeBreak` against ziglyph for every ordered pair of codepoints in
/// the configured range and logs each disagreement. This is SLOW, so it is
/// not meant to run in CI, but it is useful for debugging.
pub fn main() !void {
    const ziglyph = @import("ziglyph");

    // Half-open range [first, limit) of codepoints to test. Narrow it to
    // check a smaller window.
    const first = 0;
    const limit = std.math.maxInt(u21) + 1;

    // Both implementations carry their own state across all pairs.
    var ours: BreakState = .{};
    var theirs: u3 = 0;
    for (first..limit) |lhs| {
        if (lhs % 1000 == 0) std.log.warn("progress cp1={}", .{lhs});

        // CR, LF and control characters are outside graphemeBreak's
        // contract, so they are skipped on both sides of the pair.
        const skip_lhs = lhs == '\r' or lhs == '\n' or
            ziglyph.grapheme_break.isControl(@intCast(lhs));
        if (skip_lhs) continue;

        for (first..limit) |rhs| {
            const skip_rhs = rhs == '\r' or rhs == '\n' or
                ziglyph.grapheme_break.isControl(@intCast(rhs));
            if (skip_rhs) continue;

            const got = graphemeBreak(@intCast(lhs), @intCast(rhs), &ours);
            const want = ziglyph.graphemeBreak(@intCast(lhs), @intCast(rhs), &theirs);
            if (got == want) continue;

            std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
                lhs,
                rhs,
                got,
                ours,
                want,
                theirs,
            });
        }
    }
}

/// Standard library options picked up by `std` when this file is the root
/// of a binary: log messages at `.info` level and above. This must be a
/// `std.Options` value rather than a namespace struct.
pub const std_options: std.Options = .{
    .log_level = .info,
};

test "grapheme break: emoji modifier" {
    const testing = std.testing;

    // Each case is a codepoint pair and whether a break is expected
    // between the two codepoints.
    const cases = [_]struct { cp1: u21, cp2: u21, breaks: bool }{
        // Emoji and modifier
        .{ .cp1 = 0x261D, .cp2 = 0x1F3FF, .breaks = false },
        // Non-emoji and emoji modifier
        .{ .cp1 = 0x22, .cp2 = 0x1F3FF, .breaks = true },
    };

    for (cases) |case| {
        // Every case starts from a fresh state.
        var state: BreakState = .{};
        try testing.expect(graphemeBreak(case.cp1, case.cp2, &state) == case.breaks);
    }
}