terminal: replace utf8 decoding with custom decoder in stream.zig

(Completely removed utf8 handling from Parser.zig)
2025-07-15 00:06:09 +03:00 · 2024-02-05 23:20:47 -05:00
parent f165d36dd2
commit 846b3421e6
4 changed files with 82 additions and 170 deletions
--- a/src/terminal/Parser.zig
+++ b/src/terminal/Parser.zig
@ -28,9 +28,6 @@ pub const State = enum {
    dcs_ignore,
    osc_string,
    sos_pm_apc_string,
    // Custom states added that aren't present on vt100.net
    utf8,
 };
 /// Transition action is an action that can be taken during a state
@ -230,11 +227,6 @@ pub fn deinit(self: *Parser) void {
 /// Up to 3 actions may need to be executed -- in order -- representing
 /// the state exit, transition, and entry actions.
 pub fn next(self: *Parser, c: u8) [3]?Action {
    // If we're processing UTF-8, we handle this manually.
    if (self.state == .utf8) {
        return .{ self.next_utf8(c), null, null };
    }
    const effect = table[c][@intFromEnum(self.state)];
    // log.info("next: {x}", .{c});
@ -282,57 +274,11 @@ pub fn next(self: *Parser, c: u8) [3]?Action {
                },
            },
            .sos_pm_apc_string => Action{ .apc_start = {} },
            .utf8 => utf8: {
                // When entering the UTF8 state, we need to grab the
                // last intermediate as our first byte and reset
                // the intermediates, because prior actions (i.e. CSI)
                // can pollute the intermediates and we use it to build
                // our UTF-8 string.
                if (self.intermediates_idx > 1) {
                    const last = self.intermediates_idx - 1;
                    self.intermediates[0] = self.intermediates[last];
                    self.clear();
                    self.intermediates_idx = 1;
                }
                break :utf8 null;
            },
            else => null,
        },
    };
 }
 /// Feeds one more byte of a UTF-8 sequence to the parser. The lead byte is
 /// expected to already sit in intermediates[0], placed there by the state
 /// machine when it entered the utf8 state.
 ///
 /// Returns null while the sequence is still incomplete. Once enough bytes
 /// have been collected, returns a print action carrying the decoded
 /// codepoint, or U+FFFD when the collected bytes fail to decode.
 fn next_utf8(self: *Parser, c: u8) ?Action {
    self.collect(c);

    // The lead byte was admitted by the state machine, so it has to map to
    // a sequence length. A failure here is a state machine bug that we want
    // to chase down.
    const needed = std.unicode.utf8ByteSequenceLength(self.intermediates[0]) catch unreachable;

    // Still waiting on continuation bytes.
    if (self.intermediates_idx < needed) return null;

    const sequence = self.intermediates[0..needed];
    const codepoint = std.unicode.utf8Decode(sequence) catch invalid: {
        log.warn("invalid UTF-8 sequence: {any}", .{sequence});
        break :invalid 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
    };

    // Valid or not, the sequence is finished: drop the collected bytes and
    // go back to ground.
    self.state = .ground;
    self.intermediates_idx = 0;

    return Action{ .print = codepoint };
 }
 fn collect(self: *Parser, c: u8) void {
    if (self.intermediates_idx >= MAX_INTERMEDIATE) {
        log.warn("invalid intermediates count", .{});
@ -828,91 +774,35 @@ test "osc: 112 incomplete sequence" {
    }
 }
-test "print: utf8 2 byte" {
+// test "csi followed by utf8" {
-    var p = init();
+//     var p = init();
-    var a: [3]?Action = undefined;
+//     const prefix = &[_]u8{
-    for ("£") |c| a = p.next(c);
+//         // CSI sequence
-
+//         0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r',
-    try testing.expect(p.state == .ground);
+//
-    try testing.expect(a[0].? == .print);
+//         // UTF8 prefix (not complete)
-    try testing.expect(a[1] == null);
+//         0xe2,
-    try testing.expect(a[2] == null);
+//     };
-
+//     for (prefix) |char| {
-    const rune = a[0].?.print;
+//         _ = p.next(char);
-    try testing.expectEqual(try std.unicode.utf8Decode("£"), rune);
+//     }
-}
+//
-
+//     {
-test "print: utf8 3 byte" {
+//         const a = p.next(0x94);
-    var p = init();
+//         try testing.expect(p.state == .utf8);
-    var a: [3]?Action = undefined;
+//         try testing.expect(a[0] == null);
-    for ("€") |c| a = p.next(c);
+//         try testing.expect(a[1] == null);
-
+//         try testing.expect(a[2] == null);
-    try testing.expect(p.state == .ground);
+//     }
-    try testing.expect(a[0].? == .print);
+//
-    try testing.expect(a[1] == null);
+//     {
-    try testing.expect(a[2] == null);
+//         const a = p.next(0x94);
-
+//         try testing.expect(p.state == .ground);
-    const rune = a[0].?.print;
+//         try testing.expect(a[0].? == .print);
-    try testing.expectEqual(try std.unicode.utf8Decode("€"), rune);
+//         try testing.expect(a[1] == null);
-}
+//         try testing.expect(a[2] == null);
-
+//     }
-test "print: utf8 4 byte" {
+// }
    var p = init();
    var a: [3]?Action = undefined;
    for ("𐍈") |c| a = p.next(c);
    try testing.expect(p.state == .ground);
    try testing.expect(a[0].? == .print);
    try testing.expect(a[1] == null);
    try testing.expect(a[2] == null);
    const rune = a[0].?.print;
    try testing.expectEqual(try std.unicode.utf8Decode("𐍈"), rune);
 }
 // Feeds a two-byte lead (0xC3) followed by a byte that is not a
 // continuation byte (0x28) and expects a single print of U+FFFD with the
 // parser back in the ground state.
 test "print: utf8 invalid" {
    var p = init();
    var a: [3]?Action = undefined;
    for ("\xC3\x28") |c| a = p.next(c);

    try testing.expect(p.state == .ground);
    try testing.expect(a[0].? == .print);
    try testing.expect(a[1] == null);
    try testing.expect(a[2] == null);

    // The expected codepoint is written as an escape so the literal is the
    // three-byte encoding of U+FFFD regardless of how the file is rendered
    // or re-encoded.
    const rune = a[0].?.print;
    try testing.expectEqual(try std.unicode.utf8Decode("\u{FFFD}"), rune);
 }
 // A UTF-8 sequence that starts right after a CSI sequence must still be
 // collected and printed as one codepoint.
 test "csi followed by utf8" {
    var p = init();

    // A CSI sequence, a carriage return, and then only the lead byte (0xe2)
    // of a UTF-8 sequence.
    const prefix = [_]u8{ 0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r', 0xe2 };
    for (prefix) |byte| _ = p.next(byte);

    // First continuation byte: the parser is still in the utf8 state and
    // emits nothing.
    const pending = p.next(0x94);
    try testing.expect(p.state == .utf8);
    for (pending) |action| try testing.expect(action == null);

    // Second continuation byte: the sequence is complete, so the parser
    // returns to ground and emits exactly one print action.
    const done = p.next(0x94);
    try testing.expect(p.state == .ground);
    try testing.expect(done[0].? == .print);
    for (done[1..]) |action| try testing.expect(action == null);
 }
 test "csi: too many params" {
    var p = init();
--- a/src/terminal/UTF8Decoder.zig
+++ b/src/terminal/UTF8Decoder.zig
@ -6,6 +6,7 @@
 //!
 //! For details on Bjoern's DFA-based UTF-8 decoder, see
 //! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
 const UTF8Decoder = @This();
 const std = @import("std");
 const testing = std.testing;
@ -33,12 +34,18 @@ const transitions = [_]u8 {
 };
 //zig fmt: on
-// This is where we accumulate our current codepoint.
+// DFA states
 var accumulator: u21 = 0;
 // The internal state of the DFA.
 const ACCEPT_STATE = 0;
 const REJECT_STATE = 12;
-var state: u8 = ACCEPT_STATE;
+
 // This is where we accumulate our current codepoint.
 accumulator: u21 = 0,
 // The internal state of the DFA.
 state: u8 = ACCEPT_STATE,
 /// Returns a decoder with every field at its declared default.
 pub fn init() UTF8Decoder {
    return UTF8Decoder{};
 }
 /// Takes the next byte in the utf-8 sequence and emits a tuple of
 /// - The codepoint that was generated, if there is one.
@ -50,27 +57,27 @@ var state: u8 = ACCEPT_STATE;
 ///
 /// If the byte is not consumed, the caller is responsible for calling
 /// again with the same byte before continuing.
-pub inline fn next(byte: u8) struct { ?u21, bool } {
+pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } {
    const char_class = char_classes[byte];
-    const initial_state = state;
+    const initial_state = self.state;
-    if (state != ACCEPT_STATE) {
+    if (self.state != ACCEPT_STATE) {
-        accumulator <<= 6;
+        self.accumulator <<= 6;
-        accumulator |= (byte & 0x3F);
+        self.accumulator |= (byte & 0x3F);
    } else {
-        accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
+        self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
    }
-    state = transitions[state + char_class];
+    self.state = transitions[self.state + char_class];
-    if (state == ACCEPT_STATE) {
+    if (self.state == ACCEPT_STATE) {
-        defer { accumulator = 0; }
+        defer { self.accumulator = 0; }
        // Emit the fully decoded codepoint.
-        return .{ accumulator, true };
+        return .{ self.accumulator, true };
-    } else if (state == REJECT_STATE) {
+    } else if (self.state == REJECT_STATE) {
-        accumulator = 0;
+        self.accumulator = 0;
-        state = ACCEPT_STATE;
+        self.state = ACCEPT_STATE;
        // Emit a replacement character. If we rejected the first byte
        // in a sequence, then it was consumed, otherwise it was not.
        return .{ 0xFFFD, initial_state == ACCEPT_STATE };
@ -81,9 +88,10 @@ pub inline fn next(byte: u8) struct { ?u21, bool } {
 }
 test "ASCII" {
    var d = init();
    var out = std.mem.zeroes([13]u8);
    for ("Hello, World!", 0..) |byte, i| {
-        const res = next(byte);
+        const res = d.next(byte);
        try testing.expect(res[1]);
        if (res[0]) |codepoint| {
            out[i] = @intCast(codepoint);
@ -93,13 +101,14 @@ test "ASCII" {
 }
 test "Well formed utf-8" {
    var d = init();
    var out = std.mem.zeroes([4]u21);
    var i: usize = 0;
    // 4 bytes, 3 bytes, 2 bytes, 1 byte
    for ("😄✤ÁA") |byte| {
        var consumed = false;
        while (!consumed) {
-            const res = next(byte);
+            const res = d.next(byte);
            consumed = res[1];
            // There are no errors in this sequence, so
            // every byte should be consumed first try.
@ -114,13 +123,14 @@ test "Well formed utf-8" {
 }
 test "Partially invalid utf-8" {
    var d = init();
    var out = std.mem.zeroes([5]u21);
    var i: usize = 0;
    // Illegally terminated sequence, valid sequence, illegal surrogate pair.
    for ("\xF0\x9F😄\xED\xA0\x80") |byte| {
        var consumed = false;
        while (!consumed) {
-            const res = next(byte);
+            const res = d.next(byte);
            consumed = res[1];
            if (res[0]) |codepoint| {
                out[i] = codepoint;
--- a/src/terminal/parse_table.zig
+++ b/src/terminal/parse_table.zig
@ -5,10 +5,6 @@
 //! https://vt100.net/emu/dec_ansi_parser
 //! But has some modifications:
 //!
 //!   * utf8 state introduced to detect UTF8-encoded sequences. The
 //!     actual handling back OUT of the utf8 state is done manually in the
 //!     parser.
 //!
 //!   * csi_param accepts the colon character (':') since the SGR command
 //!     accepts colon as a valid parameter value.
 //!
@ -92,18 +88,11 @@ fn genTable() Table {
    // ground
    {
        const source = State.ground;
        // events
        single(&result, 0x19, .ground, .ground, .execute);
        range(&result, 0, 0x17, .ground, .ground, .execute);
        range(&result, 0x1C, 0x1F, .ground, .ground, .execute);
        range(&result, 0x20, 0x7F, .ground, .ground, .print);
        // => utf8
        range(&result, 0xC2, 0xDF, source, .utf8, .collect);
        range(&result, 0xE0, 0xEF, source, .utf8, .collect);
        range(&result, 0xF0, 0xF4, source, .utf8, .collect);
    }
    // escape_intermediate
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@ -9,6 +9,7 @@ const kitty = @import("kitty.zig");
 const modes = @import("modes.zig");
 const osc = @import("osc.zig");
 const sgr = @import("sgr.zig");
 const UTF8Decoder = @import("UTF8Decoder.zig");
 const MouseShape = @import("mouse_shape.zig").MouseShape;
 const log = std.log.scoped(.stream);
@ -37,6 +38,7 @@ pub fn Stream(comptime Handler: type) type {
        handler: Handler,
        parser: Parser = .{},
        utf8decoder: UTF8Decoder = .{},
        pub fn deinit(self: *Self) void {
            self.parser.deinit();
@ -50,6 +52,21 @@ pub fn Stream(comptime Handler: type) type {
        /// Process the next character and call any callbacks if necessary.
        pub fn next(self: *Self, c: u8) !void {
            // log.debug("char: {c}", .{c});
            // Fast path: while the parser is in the ground state and the
            // byte is not ESC (0x1B), feed the byte to the UTF-8 decoder
            // directly instead of going through the parser state machine.
            if (self.parser.state == .ground and c != 0x1B) {
                var consumed = false;
                // The decoder reports in the second tuple field whether it
                // consumed the byte. When it did not, the same byte has to
                // be offered again before moving on.
                while (!consumed) {
                    const res = self.utf8decoder.next(c);
                    consumed = res[1];
                    if (res[0]) |codepoint| {
                        // C0 controls span 0x00-0x1F and printable
                        // characters start at 0x20, matching the ground-state
                        // rows of the parse table. The previous `< 0xF`
                        // bound sent SI (0x0F) and every control above it to
                        // `print`.
                        // NOTE(review): assumes `execute` tolerates every C0
                        // value, as it had to when the parser table emitted
                        // execute actions for them -- confirm.
                        if (codepoint < 0x20) {
                            try self.execute(@intCast(codepoint));
                        } else {
                            try self.print(codepoint);
                        }
                    }
                }
                return;
            }
            const actions = self.parser.next(c);
            for (actions) |action_opt| {
                const action = action_opt orelse continue;
@ -101,6 +118,12 @@ pub fn Stream(comptime Handler: type) type {
            }
        }
        /// Forwards the codepoint `c` to the handler's `print`. Does nothing
        /// when `T` has no `print` declaration.
        pub fn print(self: *Self, c: u21) !void {
            // Known at compile time, so the call below is only analyzed
            // when the declaration exists.
            if (!@hasDecl(T, "print")) return;
            try self.handler.print(c);
        }
        pub fn execute(self: *Self, c: u8) !void {
            switch (@as(ansi.C0, @enumFromInt(c))) {
                // We ignore SOH/STX: https://github.com/microsoft/terminal/issues/10786