From f165d36dd226c613db99df44c29d026be1997f28 Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Mon, 5 Feb 2024 22:15:58 -0500 Subject: [PATCH 1/4] Add fast DFA utf-8 decoder implementation --- src/terminal/utf8.zig | 132 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 src/terminal/utf8.zig diff --git a/src/terminal/utf8.zig b/src/terminal/utf8.zig new file mode 100644 index 000000000..c45c7252c --- /dev/null +++ b/src/terminal/utf8.zig @@ -0,0 +1,132 @@ +//! DFA-based non-allocating error-replacing UTF-8 decoder. +//! +//! This implementation is based largely on the excellent work of +//! Bjoern Hoehrmann, with slight modifications to support error- +//! replacement. +//! +//! For details on Bjoern's DFA-based UTF-8 decoder, see +//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed) + +const std = @import("std"); +const testing = std.testing; + +const log = std.log.scoped(.utf8decoder); + +//zig fmt: off +const char_classes = [_]u4{ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, +}; + +const transitions = [_]u8 { + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; +//zig fmt: on + +// This is where we accumulate our current codepoint. 
+var accumulator: u21 = 0; +// The internal state of the DFA. +const ACCEPT_STATE = 0; +const REJECT_STATE = 12; +var state: u8 = ACCEPT_STATE; + +/// Takes the next byte in the utf-8 sequence and emits a tuple of +/// - The codepoint that was generated, if there is one. +/// - A boolean that indicates whether the provided byte was consumed. +/// +/// The only case where the byte is not consumed is if an ill-formed +/// sequence is reached, in which case a replacement character will be +/// emitted and the byte will not be consumed. +/// +/// If the byte is not consumed, the caller is responsible for calling +/// again with the same byte before continuing. +pub inline fn next(byte: u8) struct { ?u21, bool } { + const char_class = char_classes[byte]; + + const initial_state = state; + + if (state != ACCEPT_STATE) { + accumulator <<= 6; + accumulator |= (byte & 0x3F); + } else { + accumulator = (@as(u21, 0xFF) >> char_class) & (byte); + } + + state = transitions[state + char_class]; + + if (state == ACCEPT_STATE) { + defer { accumulator = 0; } + // Emit the fully decoded codepoint. + return .{ accumulator, true }; + } else if (state == REJECT_STATE) { + accumulator = 0; + state = ACCEPT_STATE; + // Emit a replacement character. If we rejected the first byte + // in a sequence, then it was consumed, otherwise it was not. + return .{ 0xFFFD, initial_state == ACCEPT_STATE }; + } else { + // Emit nothing, we're in the middle of a sequence. + return .{ null, true }; + } +} + +test "ASCII" { + var out = std.mem.zeroes([13]u8); + for ("Hello, World!", 0..) 
|byte, i| { + const res = next(byte); + try testing.expect(res[1]); + if (res[0]) |codepoint| { + out[i] = @intCast(codepoint); + } + } + try testing.expect(std.mem.eql(u8, &out, "Hello, World!")); +} + +test "Well formed utf-8" { + var out = std.mem.zeroes([4]u21); + var i: usize = 0; + // 4 bytes, 3 bytes, 2 bytes, 1 byte + for ("😄✤ÁA") |byte| { + var consumed = false; + while (!consumed) { + const res = next(byte); + consumed = res[1]; + // There are no errors in this sequence, so + // every byte should be consumed first try. + try testing.expect(consumed == true); + if (res[0]) |codepoint| { + out[i] = codepoint; + i += 1; + } + } + } + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 })); +} + +test "Partially invalid utf-8" { + var out = std.mem.zeroes([5]u21); + var i: usize = 0; + // Illegally terminated sequence, valid sequence, illegal surrogate pair. + for ("\xF0\x9F😄\xED\xA0\x80") |byte| { + var consumed = false; + while (!consumed) { + const res = next(byte); + consumed = res[1]; + if (res[0]) |codepoint| { + out[i] = codepoint; + i += 1; + } + } + } + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD })); +} From 846b3421e607aaac920101be132d8f54760da948 Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Mon, 5 Feb 2024 23:20:47 -0500 Subject: [PATCH 2/4] terminal: replace utf8 decoding with custom decoder in stream.zig (Completely removed utf8 handling from Parser.zig) --- src/terminal/Parser.zig | 168 ++++----------------- src/terminal/{utf8.zig => UTF8Decoder.zig} | 50 +++--- src/terminal/parse_table.zig | 11 -- src/terminal/stream.zig | 23 +++ 4 files changed, 82 insertions(+), 170 deletions(-) rename src/terminal/{utf8.zig => UTF8Decoder.zig} (81%) diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig index b242ba6fd..41cca7191 100644 --- a/src/terminal/Parser.zig +++ b/src/terminal/Parser.zig @@ -28,9 +28,6 @@ pub const State = enum { dcs_ignore, osc_string, 
sos_pm_apc_string, - - // Custom states added that aren't present on vt100.net - utf8, }; /// Transition action is an action that can be taken during a state @@ -230,11 +227,6 @@ pub fn deinit(self: *Parser) void { /// Up to 3 actions may need to be executed -- in order -- representing /// the state exit, transition, and entry actions. pub fn next(self: *Parser, c: u8) [3]?Action { - // If we're processing UTF-8, we handle this manually. - if (self.state == .utf8) { - return .{ self.next_utf8(c), null, null }; - } - const effect = table[c][@intFromEnum(self.state)]; // log.info("next: {x}", .{c}); @@ -282,57 +274,11 @@ pub fn next(self: *Parser, c: u8) [3]?Action { }, }, .sos_pm_apc_string => Action{ .apc_start = {} }, - .utf8 => utf8: { - // When entering the UTF8 state, we need to grab the - // last intermediate as our first byte and reset - // the intermediates, because prior actions (i.e. CSI) - // can pollute the intermediates and we use it to build - // our UTF-8 string. - if (self.intermediates_idx > 1) { - const last = self.intermediates_idx - 1; - self.intermediates[0] = self.intermediates[last]; - self.clear(); - self.intermediates_idx = 1; - } - break :utf8 null; - }, else => null, }, }; } -/// Processes the next byte in a UTF8 sequence. It is assumed that -/// intermediates[0] already has the first byte of a UTF8 sequence -/// (triggered via the state machine). -fn next_utf8(self: *Parser, c: u8) ?Action { - // Collect the byte into the intermediates array - self.collect(c); - - // Error is unreachable because the first byte comes from the state machine. - // If we get an error here, it is a bug in the state machine that we want - // to chase down. - const len = std.unicode.utf8ByteSequenceLength(self.intermediates[0]) catch unreachable; - - // We need to collect more - if (self.intermediates_idx < len) return null; - - // No matter what happens, we go back to ground since we know we have - // enough bytes for the UTF8 sequence. 
- defer { - self.state = .ground; - self.intermediates_idx = 0; - } - - // We have enough bytes, decode! - const bytes = self.intermediates[0..len]; - const rune = std.unicode.utf8Decode(bytes) catch rune: { - log.warn("invalid UTF-8 sequence: {any}", .{bytes}); - break :rune 0xFFFD; // ๏ฟฝ - }; - - return Action{ .print = rune }; -} - fn collect(self: *Parser, c: u8) void { if (self.intermediates_idx >= MAX_INTERMEDIATE) { log.warn("invalid intermediates count", .{}); @@ -828,91 +774,35 @@ test "osc: 112 incomplete sequence" { } } -test "print: utf8 2 byte" { - var p = init(); - var a: [3]?Action = undefined; - for ("ยฃ") |c| a = p.next(c); - - try testing.expect(p.state == .ground); - try testing.expect(a[0].? == .print); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - - const rune = a[0].?.print; - try testing.expectEqual(try std.unicode.utf8Decode("ยฃ"), rune); -} - -test "print: utf8 3 byte" { - var p = init(); - var a: [3]?Action = undefined; - for ("โ‚ฌ") |c| a = p.next(c); - - try testing.expect(p.state == .ground); - try testing.expect(a[0].? == .print); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - - const rune = a[0].?.print; - try testing.expectEqual(try std.unicode.utf8Decode("โ‚ฌ"), rune); -} - -test "print: utf8 4 byte" { - var p = init(); - var a: [3]?Action = undefined; - for ("๐ˆ") |c| a = p.next(c); - - try testing.expect(p.state == .ground); - try testing.expect(a[0].? == .print); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - - const rune = a[0].?.print; - try testing.expectEqual(try std.unicode.utf8Decode("๐ˆ"), rune); -} - -test "print: utf8 invalid" { - var p = init(); - var a: [3]?Action = undefined; - for ("\xC3\x28") |c| a = p.next(c); - - try testing.expect(p.state == .ground); - try testing.expect(a[0].? 
== .print); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - - const rune = a[0].?.print; - try testing.expectEqual(try std.unicode.utf8Decode("๏ฟฝ"), rune); -} - -test "csi followed by utf8" { - var p = init(); - const prefix = &[_]u8{ - // CSI sequence - 0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r', - - // UTF8 prefix (not complete) - 0xe2, - }; - for (prefix) |char| { - _ = p.next(char); - } - - { - const a = p.next(0x94); - try testing.expect(p.state == .utf8); - try testing.expect(a[0] == null); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - } - - { - const a = p.next(0x94); - try testing.expect(p.state == .ground); - try testing.expect(a[0].? == .print); - try testing.expect(a[1] == null); - try testing.expect(a[2] == null); - } -} +// test "csi followed by utf8" { +// var p = init(); +// const prefix = &[_]u8{ +// // CSI sequence +// 0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r', +// +// // UTF8 prefix (not complete) +// 0xe2, +// }; +// for (prefix) |char| { +// _ = p.next(char); +// } +// +// { +// const a = p.next(0x94); +// try testing.expect(p.state == .utf8); +// try testing.expect(a[0] == null); +// try testing.expect(a[1] == null); +// try testing.expect(a[2] == null); +// } +// +// { +// const a = p.next(0x94); +// try testing.expect(p.state == .ground); +// try testing.expect(a[0].? == .print); +// try testing.expect(a[1] == null); +// try testing.expect(a[2] == null); +// } +// } test "csi: too many params" { var p = init(); diff --git a/src/terminal/utf8.zig b/src/terminal/UTF8Decoder.zig similarity index 81% rename from src/terminal/utf8.zig rename to src/terminal/UTF8Decoder.zig index c45c7252c..c020549c7 100644 --- a/src/terminal/utf8.zig +++ b/src/terminal/UTF8Decoder.zig @@ -6,6 +6,7 @@ //! //! For details on Bjoern's DFA-based UTF-8 decoder, see //! 
http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed) +const UTF8Decoder = @This(); const std = @import("std"); const testing = std.testing; @@ -33,12 +34,18 @@ const transitions = [_]u8 { }; //zig fmt: on -// This is where we accumulate our current codepoint. -var accumulator: u21 = 0; -// The internal state of the DFA. +// DFA states const ACCEPT_STATE = 0; const REJECT_STATE = 12; -var state: u8 = ACCEPT_STATE; + +// This is where we accumulate our current codepoint. +accumulator: u21 = 0, +// The internal state of the DFA. +state: u8 = ACCEPT_STATE, + +pub fn init() UTF8Decoder { + return .{}; +} /// Takes the next byte in the utf-8 sequence and emits a tuple of /// - The codepoint that was generated, if there is one. @@ -50,27 +57,27 @@ var state: u8 = ACCEPT_STATE; /// /// If the byte is not consumed, the caller is responsible for calling /// again with the same byte before continuing. -pub inline fn next(byte: u8) struct { ?u21, bool } { +pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { const char_class = char_classes[byte]; - const initial_state = state; + const initial_state = self.state; - if (state != ACCEPT_STATE) { - accumulator <<= 6; - accumulator |= (byte & 0x3F); + if (self.state != ACCEPT_STATE) { + self.accumulator <<= 6; + self.accumulator |= (byte & 0x3F); } else { - accumulator = (@as(u21, 0xFF) >> char_class) & (byte); + self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte); } - state = transitions[state + char_class]; + self.state = transitions[self.state + char_class]; - if (state == ACCEPT_STATE) { - defer { accumulator = 0; } + if (self.state == ACCEPT_STATE) { + defer { self.accumulator = 0; } // Emit the fully decoded codepoint. 
- return .{ accumulator, true }; - } else if (state == REJECT_STATE) { - accumulator = 0; - state = ACCEPT_STATE; + return .{ self.accumulator, true }; + } else if (self.state == REJECT_STATE) { + self.accumulator = 0; + self.state = ACCEPT_STATE; // Emit a replacement character. If we rejected the first byte // in a sequence, then it was consumed, otherwise it was not. return .{ 0xFFFD, initial_state == ACCEPT_STATE }; @@ -81,9 +88,10 @@ pub inline fn next(byte: u8) struct { ?u21, bool } { } test "ASCII" { + var d = init(); var out = std.mem.zeroes([13]u8); for ("Hello, World!", 0..) |byte, i| { - const res = next(byte); + const res = d.next(byte); try testing.expect(res[1]); if (res[0]) |codepoint| { out[i] = @intCast(codepoint); @@ -93,13 +101,14 @@ test "ASCII" { } test "Well formed utf-8" { + var d = init(); var out = std.mem.zeroes([4]u21); var i: usize = 0; // 4 bytes, 3 bytes, 2 bytes, 1 byte for ("😄✤ÁA") |byte| { var consumed = false; while (!consumed) { - const res = next(byte); + const res = d.next(byte); consumed = res[1]; // There are no errors in this sequence, so // every byte should be consumed first try. @@ -114,13 +123,14 @@ test "Well formed utf-8" { } test "Partially invalid utf-8" { + var d = init(); var out = std.mem.zeroes([5]u21); var i: usize = 0; // Illegally terminated sequence, valid sequence, illegal surrogate pair. for ("\xF0\x9F😄\xED\xA0\x80") |byte| { var consumed = false; while (!consumed) { - const res = next(byte); + const res = d.next(byte); consumed = res[1]; if (res[0]) |codepoint| { out[i] = codepoint; diff --git a/src/terminal/parse_table.zig b/src/terminal/parse_table.zig index e7542b062..66c443783 100644 --- a/src/terminal/parse_table.zig +++ b/src/terminal/parse_table.zig @@ -5,10 +5,6 @@ //! https://vt100.net/emu/dec_ansi_parser //! But has some modifications: //! -//! * utf8 state introduced to detect UTF8-encoded sequences. The -//! actual handling back OUT of the utf8 state is done manually in the -//! parser. 
-//! //! * csi_param accepts the colon character (':') since the SGR command //! accepts colon as a valid parameter value. //! @@ -92,18 +88,11 @@ fn genTable() Table { // ground { - const source = State.ground; - // events single(&result, 0x19, .ground, .ground, .execute); range(&result, 0, 0x17, .ground, .ground, .execute); range(&result, 0x1C, 0x1F, .ground, .ground, .execute); range(&result, 0x20, 0x7F, .ground, .ground, .print); - - // => utf8 - range(&result, 0xC2, 0xDF, source, .utf8, .collect); - range(&result, 0xE0, 0xEF, source, .utf8, .collect); - range(&result, 0xF0, 0xF4, source, .utf8, .collect); } // escape_intermediate diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index f33f52942..ad884b201 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -9,6 +9,7 @@ const kitty = @import("kitty.zig"); const modes = @import("modes.zig"); const osc = @import("osc.zig"); const sgr = @import("sgr.zig"); +const UTF8Decoder = @import("UTF8Decoder.zig"); const MouseShape = @import("mouse_shape.zig").MouseShape; const log = std.log.scoped(.stream); @@ -37,6 +38,7 @@ pub fn Stream(comptime Handler: type) type { handler: Handler, parser: Parser = .{}, + utf8decoder: UTF8Decoder = .{}, pub fn deinit(self: *Self) void { self.parser.deinit(); @@ -50,6 +52,21 @@ pub fn Stream(comptime Handler: type) type { /// Process the next character and call any callbacks if necessary. 
pub fn next(self: *Self, c: u8) !void { // log.debug("char: {c}", .{c}); + if (self.parser.state == .ground and c != 0x1B) { + var consumed = false; + while (!consumed) { + const res = self.utf8decoder.next(c); + consumed = res[1]; + if (res[0]) |codepoint| { + if (codepoint < 0xF) { + try self.execute(@intCast(codepoint)); + } else { + try self.print(@intCast(codepoint)); + } + } + } + return; + } const actions = self.parser.next(c); for (actions) |action_opt| { const action = action_opt orelse continue; @@ -101,6 +118,12 @@ pub fn Stream(comptime Handler: type) type { } } + pub fn print(self: *Self, c: u21) !void { + if (@hasDecl(T, "print")) { + try self.handler.print(c); + } + } + pub fn execute(self: *Self, c: u8) !void { switch (@as(ansi.C0, @enumFromInt(c))) { // We ignore SOH/STX: https://github.com/microsoft/terminal/issues/10786 From cd570890f640d57745bb1723f1738fad1d468d75 Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Mon, 5 Feb 2024 23:32:47 -0500 Subject: [PATCH 3/4] remove commented out test --- src/terminal/Parser.zig | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig index 41cca7191..5746be065 100644 --- a/src/terminal/Parser.zig +++ b/src/terminal/Parser.zig @@ -774,36 +774,6 @@ test "osc: 112 incomplete sequence" { } } -// test "csi followed by utf8" { -// var p = init(); -// const prefix = &[_]u8{ -// // CSI sequence -// 0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r', -// -// // UTF8 prefix (not complete) -// 0xe2, -// }; -// for (prefix) |char| { -// _ = p.next(char); -// } -// -// { -// const a = p.next(0x94); -// try testing.expect(p.state == .utf8); -// try testing.expect(a[0] == null); -// try testing.expect(a[1] == null); -// try testing.expect(a[2] == null); -// } -// -// { -// const a = p.next(0x94); -// try testing.expect(p.state == .ground); -// try testing.expect(a[0].? 
== .print); -// try testing.expect(a[1] == null); -// try testing.expect(a[2] == null); -// } -// } - test "csi: too many params" { var p = init(); _ = p.next(0x1B); From 0c2a87e5fb8b2d2e0e581b7f8b80a7ee806cfb41 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Mon, 5 Feb 2024 21:20:20 -0800 Subject: [PATCH 4/4] terminal: small stylistic tweaks --- src/terminal/UTF8Decoder.zig | 26 +++++++++++++------------- src/terminal/stream.zig | 1 + 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/terminal/UTF8Decoder.zig b/src/terminal/UTF8Decoder.zig index c020549c7..6bb0d9815 100644 --- a/src/terminal/UTF8Decoder.zig +++ b/src/terminal/UTF8Decoder.zig @@ -13,7 +13,7 @@ const testing = std.testing; const log = std.log.scoped(.utf8decoder); -//zig fmt: off +// zig fmt: off const char_classes = [_]u4{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -32,7 +32,7 @@ const transitions = [_]u8 { 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,12,12,12,12,12, }; -//zig fmt: on +// zig fmt: on // DFA states const ACCEPT_STATE = 0; @@ -43,10 +43,6 @@ accumulator: u21 = 0, // The internal state of the DFA. state: u8 = ACCEPT_STATE, -pub fn init() UTF8Decoder { - return .{}; -} - /// Takes the next byte in the utf-8 sequence and emits a tuple of /// - The codepoint that was generated, if there is one. /// - A boolean that indicates whether the provided byte was consumed. @@ -72,7 +68,8 @@ pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { self.state = transitions[self.state + char_class]; if (self.state == ACCEPT_STATE) { - defer { self.accumulator = 0; } + defer self.accumulator = 0; + // Emit the fully decoded codepoint. 
return .{ self.accumulator, true }; } else if (self.state == REJECT_STATE) { @@ -88,8 +85,8 @@ pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } { } test "ASCII" { - var d = init(); - var out = std.mem.zeroes([13]u8); + var d: UTF8Decoder = .{}; + var out: [13]u8 = undefined; for ("Hello, World!", 0..) |byte, i| { const res = d.next(byte); try testing.expect(res[1]); @@ -97,12 +94,13 @@ test "ASCII" { out[i] = @intCast(codepoint); } } + try testing.expect(std.mem.eql(u8, &out, "Hello, World!")); } test "Well formed utf-8" { - var d = init(); - var out = std.mem.zeroes([4]u21); + var d: UTF8Decoder = .{}; + var out: [4]u21 = undefined; var i: usize = 0; // 4 bytes, 3 bytes, 2 bytes, 1 byte for ("😄✤ÁA") |byte| { @@ -119,12 +117,13 @@ test "Well formed utf-8" { } } } + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0x1F604, 0x2724, 0xC1, 0x41 })); } test "Partially invalid utf-8" { - var d = init(); - var out = std.mem.zeroes([5]u21); + var d: UTF8Decoder = .{}; + var out: [5]u21 = undefined; var i: usize = 0; // Illegally terminated sequence, valid sequence, illegal surrogate pair. for ("\xF0\x9F😄\xED\xA0\x80") |byte| { @@ -138,5 +137,6 @@ test "Partially invalid utf-8" { } } } + try testing.expect(std.mem.eql(u21, &out, &[_]u21{ 0xFFFD, 0x1F604, 0xFFFD, 0xFFFD, 0xFFFD })); } diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index ad884b201..fe4c5c53a 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -67,6 +67,7 @@ pub fn Stream(comptime Handler: type) type { } return; } + const actions = self.parser.next(c); for (actions) |action_opt| { const action = action_opt orelse continue;