mirror of
https://github.com/ghostty-org/ghostty.git
synced 2025-07-15 00:06:09 +03:00
terminal: replace utf8 decoding with custom decoder in stream.zig
(Completely removed utf8 handling from Parser.zig)
This commit is contained in:
@ -28,9 +28,6 @@ pub const State = enum {
|
|||||||
dcs_ignore,
|
dcs_ignore,
|
||||||
osc_string,
|
osc_string,
|
||||||
sos_pm_apc_string,
|
sos_pm_apc_string,
|
||||||
|
|
||||||
// Custom states added that aren't present on vt100.net
|
|
||||||
utf8,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Transition action is an action that can be taken during a state
|
/// Transition action is an action that can be taken during a state
|
||||||
@ -230,11 +227,6 @@ pub fn deinit(self: *Parser) void {
|
|||||||
/// Up to 3 actions may need to be executed -- in order -- representing
|
/// Up to 3 actions may need to be executed -- in order -- representing
|
||||||
/// the state exit, transition, and entry actions.
|
/// the state exit, transition, and entry actions.
|
||||||
pub fn next(self: *Parser, c: u8) [3]?Action {
|
pub fn next(self: *Parser, c: u8) [3]?Action {
|
||||||
// If we're processing UTF-8, we handle this manually.
|
|
||||||
if (self.state == .utf8) {
|
|
||||||
return .{ self.next_utf8(c), null, null };
|
|
||||||
}
|
|
||||||
|
|
||||||
const effect = table[c][@intFromEnum(self.state)];
|
const effect = table[c][@intFromEnum(self.state)];
|
||||||
|
|
||||||
// log.info("next: {x}", .{c});
|
// log.info("next: {x}", .{c});
|
||||||
@ -282,57 +274,11 @@ pub fn next(self: *Parser, c: u8) [3]?Action {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
.sos_pm_apc_string => Action{ .apc_start = {} },
|
.sos_pm_apc_string => Action{ .apc_start = {} },
|
||||||
.utf8 => utf8: {
|
|
||||||
// When entering the UTF8 state, we need to grab the
|
|
||||||
// last intermediate as our first byte and reset
|
|
||||||
// the intermediates, because prior actions (i.e. CSI)
|
|
||||||
// can pollute the intermediates and we use it to build
|
|
||||||
// our UTF-8 string.
|
|
||||||
if (self.intermediates_idx > 1) {
|
|
||||||
const last = self.intermediates_idx - 1;
|
|
||||||
self.intermediates[0] = self.intermediates[last];
|
|
||||||
self.clear();
|
|
||||||
self.intermediates_idx = 1;
|
|
||||||
}
|
|
||||||
break :utf8 null;
|
|
||||||
},
|
|
||||||
else => null,
|
else => null,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Processes the next byte in a UTF8 sequence. It is assumed that
|
|
||||||
/// intermediates[0] already has the first byte of a UTF8 sequence
|
|
||||||
/// (triggered via the state machine).
|
|
||||||
fn next_utf8(self: *Parser, c: u8) ?Action {
|
|
||||||
// Collect the byte into the intermediates array
|
|
||||||
self.collect(c);
|
|
||||||
|
|
||||||
// Error is unreachable because the first byte comes from the state machine.
|
|
||||||
// If we get an error here, it is a bug in the state machine that we want
|
|
||||||
// to chase down.
|
|
||||||
const len = std.unicode.utf8ByteSequenceLength(self.intermediates[0]) catch unreachable;
|
|
||||||
|
|
||||||
// We need to collect more
|
|
||||||
if (self.intermediates_idx < len) return null;
|
|
||||||
|
|
||||||
// No matter what happens, we go back to ground since we know we have
|
|
||||||
// enough bytes for the UTF8 sequence.
|
|
||||||
defer {
|
|
||||||
self.state = .ground;
|
|
||||||
self.intermediates_idx = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We have enough bytes, decode!
|
|
||||||
const bytes = self.intermediates[0..len];
|
|
||||||
const rune = std.unicode.utf8Decode(bytes) catch rune: {
|
|
||||||
log.warn("invalid UTF-8 sequence: {any}", .{bytes});
|
|
||||||
break :rune 0xFFFD; // <EFBFBD>
|
|
||||||
};
|
|
||||||
|
|
||||||
return Action{ .print = rune };
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(self: *Parser, c: u8) void {
|
fn collect(self: *Parser, c: u8) void {
|
||||||
if (self.intermediates_idx >= MAX_INTERMEDIATE) {
|
if (self.intermediates_idx >= MAX_INTERMEDIATE) {
|
||||||
log.warn("invalid intermediates count", .{});
|
log.warn("invalid intermediates count", .{});
|
||||||
@ -828,91 +774,35 @@ test "osc: 112 incomplete sequence" {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test "print: utf8 2 byte" {
|
// test "csi followed by utf8" {
|
||||||
var p = init();
|
// var p = init();
|
||||||
var a: [3]?Action = undefined;
|
// const prefix = &[_]u8{
|
||||||
for ("£") |c| a = p.next(c);
|
// // CSI sequence
|
||||||
|
// 0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r',
|
||||||
try testing.expect(p.state == .ground);
|
//
|
||||||
try testing.expect(a[0].? == .print);
|
// // UTF8 prefix (not complete)
|
||||||
try testing.expect(a[1] == null);
|
// 0xe2,
|
||||||
try testing.expect(a[2] == null);
|
// };
|
||||||
|
// for (prefix) |char| {
|
||||||
const rune = a[0].?.print;
|
// _ = p.next(char);
|
||||||
try testing.expectEqual(try std.unicode.utf8Decode("£"), rune);
|
// }
|
||||||
}
|
//
|
||||||
|
// {
|
||||||
test "print: utf8 3 byte" {
|
// const a = p.next(0x94);
|
||||||
var p = init();
|
// try testing.expect(p.state == .utf8);
|
||||||
var a: [3]?Action = undefined;
|
// try testing.expect(a[0] == null);
|
||||||
for ("€") |c| a = p.next(c);
|
// try testing.expect(a[1] == null);
|
||||||
|
// try testing.expect(a[2] == null);
|
||||||
try testing.expect(p.state == .ground);
|
// }
|
||||||
try testing.expect(a[0].? == .print);
|
//
|
||||||
try testing.expect(a[1] == null);
|
// {
|
||||||
try testing.expect(a[2] == null);
|
// const a = p.next(0x94);
|
||||||
|
// try testing.expect(p.state == .ground);
|
||||||
const rune = a[0].?.print;
|
// try testing.expect(a[0].? == .print);
|
||||||
try testing.expectEqual(try std.unicode.utf8Decode("€"), rune);
|
// try testing.expect(a[1] == null);
|
||||||
}
|
// try testing.expect(a[2] == null);
|
||||||
|
// }
|
||||||
test "print: utf8 4 byte" {
|
// }
|
||||||
var p = init();
|
|
||||||
var a: [3]?Action = undefined;
|
|
||||||
for ("𐍈") |c| a = p.next(c);
|
|
||||||
|
|
||||||
try testing.expect(p.state == .ground);
|
|
||||||
try testing.expect(a[0].? == .print);
|
|
||||||
try testing.expect(a[1] == null);
|
|
||||||
try testing.expect(a[2] == null);
|
|
||||||
|
|
||||||
const rune = a[0].?.print;
|
|
||||||
try testing.expectEqual(try std.unicode.utf8Decode("𐍈"), rune);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "print: utf8 invalid" {
|
|
||||||
var p = init();
|
|
||||||
var a: [3]?Action = undefined;
|
|
||||||
for ("\xC3\x28") |c| a = p.next(c);
|
|
||||||
|
|
||||||
try testing.expect(p.state == .ground);
|
|
||||||
try testing.expect(a[0].? == .print);
|
|
||||||
try testing.expect(a[1] == null);
|
|
||||||
try testing.expect(a[2] == null);
|
|
||||||
|
|
||||||
const rune = a[0].?.print;
|
|
||||||
try testing.expectEqual(try std.unicode.utf8Decode("<EFBFBD>"), rune);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "csi followed by utf8" {
|
|
||||||
var p = init();
|
|
||||||
const prefix = &[_]u8{
|
|
||||||
// CSI sequence
|
|
||||||
0x1b, 0x5b, 0x3f, 0x32, 0x30, 0x30, 0x34, 0x64, '\r',
|
|
||||||
|
|
||||||
// UTF8 prefix (not complete)
|
|
||||||
0xe2,
|
|
||||||
};
|
|
||||||
for (prefix) |char| {
|
|
||||||
_ = p.next(char);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const a = p.next(0x94);
|
|
||||||
try testing.expect(p.state == .utf8);
|
|
||||||
try testing.expect(a[0] == null);
|
|
||||||
try testing.expect(a[1] == null);
|
|
||||||
try testing.expect(a[2] == null);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const a = p.next(0x94);
|
|
||||||
try testing.expect(p.state == .ground);
|
|
||||||
try testing.expect(a[0].? == .print);
|
|
||||||
try testing.expect(a[1] == null);
|
|
||||||
try testing.expect(a[2] == null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "csi: too many params" {
|
test "csi: too many params" {
|
||||||
var p = init();
|
var p = init();
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
//!
|
//!
|
||||||
//! For details on Bjoern's DFA-based UTF-8 decoder, see
|
//! For details on Bjoern's DFA-based UTF-8 decoder, see
|
||||||
//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
|
//! http://bjoern.hoehrmann.de/utf-8/decoder/dfa (MIT licensed)
|
||||||
|
const UTF8Decoder = @This();
|
||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const testing = std.testing;
|
const testing = std.testing;
|
||||||
@ -33,12 +34,18 @@ const transitions = [_]u8 {
|
|||||||
};
|
};
|
||||||
//zig fmt: on
|
//zig fmt: on
|
||||||
|
|
||||||
// This is where we accumulate our current codepoint.
|
// DFA states
|
||||||
var accumulator: u21 = 0;
|
|
||||||
// The internal state of the DFA.
|
|
||||||
const ACCEPT_STATE = 0;
|
const ACCEPT_STATE = 0;
|
||||||
const REJECT_STATE = 12;
|
const REJECT_STATE = 12;
|
||||||
var state: u8 = ACCEPT_STATE;
|
|
||||||
|
// This is where we accumulate our current codepoint.
|
||||||
|
accumulator: u21 = 0,
|
||||||
|
// The internal state of the DFA.
|
||||||
|
state: u8 = ACCEPT_STATE,
|
||||||
|
|
||||||
|
pub fn init() UTF8Decoder {
|
||||||
|
return .{};
|
||||||
|
}
|
||||||
|
|
||||||
/// Takes the next byte in the utf-8 sequence and emits a tuple of
|
/// Takes the next byte in the utf-8 sequence and emits a tuple of
|
||||||
/// - The codepoint that was generated, if there is one.
|
/// - The codepoint that was generated, if there is one.
|
||||||
@ -50,27 +57,27 @@ var state: u8 = ACCEPT_STATE;
|
|||||||
///
|
///
|
||||||
/// If the byte is not consumed, the caller is responsible for calling
|
/// If the byte is not consumed, the caller is responsible for calling
|
||||||
/// again with the same byte before continuing.
|
/// again with the same byte before continuing.
|
||||||
pub inline fn next(byte: u8) struct { ?u21, bool } {
|
pub inline fn next(self: *UTF8Decoder, byte: u8) struct { ?u21, bool } {
|
||||||
const char_class = char_classes[byte];
|
const char_class = char_classes[byte];
|
||||||
|
|
||||||
const initial_state = state;
|
const initial_state = self.state;
|
||||||
|
|
||||||
if (state != ACCEPT_STATE) {
|
if (self.state != ACCEPT_STATE) {
|
||||||
accumulator <<= 6;
|
self.accumulator <<= 6;
|
||||||
accumulator |= (byte & 0x3F);
|
self.accumulator |= (byte & 0x3F);
|
||||||
} else {
|
} else {
|
||||||
accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
|
self.accumulator = (@as(u21, 0xFF) >> char_class) & (byte);
|
||||||
}
|
}
|
||||||
|
|
||||||
state = transitions[state + char_class];
|
self.state = transitions[self.state + char_class];
|
||||||
|
|
||||||
if (state == ACCEPT_STATE) {
|
if (self.state == ACCEPT_STATE) {
|
||||||
defer { accumulator = 0; }
|
defer { self.accumulator = 0; }
|
||||||
// Emit the fully decoded codepoint.
|
// Emit the fully decoded codepoint.
|
||||||
return .{ accumulator, true };
|
return .{ self.accumulator, true };
|
||||||
} else if (state == REJECT_STATE) {
|
} else if (self.state == REJECT_STATE) {
|
||||||
accumulator = 0;
|
self.accumulator = 0;
|
||||||
state = ACCEPT_STATE;
|
self.state = ACCEPT_STATE;
|
||||||
// Emit a replacement character. If we rejected the first byte
|
// Emit a replacement character. If we rejected the first byte
|
||||||
// in a sequence, then it was consumed, otherwise it was not.
|
// in a sequence, then it was consumed, otherwise it was not.
|
||||||
return .{ 0xFFFD, initial_state == ACCEPT_STATE };
|
return .{ 0xFFFD, initial_state == ACCEPT_STATE };
|
||||||
@ -81,9 +88,10 @@ pub inline fn next(byte: u8) struct { ?u21, bool } {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test "ASCII" {
|
test "ASCII" {
|
||||||
|
var d = init();
|
||||||
var out = std.mem.zeroes([13]u8);
|
var out = std.mem.zeroes([13]u8);
|
||||||
for ("Hello, World!", 0..) |byte, i| {
|
for ("Hello, World!", 0..) |byte, i| {
|
||||||
const res = next(byte);
|
const res = d.next(byte);
|
||||||
try testing.expect(res[1]);
|
try testing.expect(res[1]);
|
||||||
if (res[0]) |codepoint| {
|
if (res[0]) |codepoint| {
|
||||||
out[i] = @intCast(codepoint);
|
out[i] = @intCast(codepoint);
|
||||||
@ -93,13 +101,14 @@ test "ASCII" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test "Well formed utf-8" {
|
test "Well formed utf-8" {
|
||||||
|
var d = init();
|
||||||
var out = std.mem.zeroes([4]u21);
|
var out = std.mem.zeroes([4]u21);
|
||||||
var i: usize = 0;
|
var i: usize = 0;
|
||||||
// 4 bytes, 3 bytes, 2 bytes, 1 byte
|
// 4 bytes, 3 bytes, 2 bytes, 1 byte
|
||||||
for ("😄✤ÁA") |byte| {
|
for ("😄✤ÁA") |byte| {
|
||||||
var consumed = false;
|
var consumed = false;
|
||||||
while (!consumed) {
|
while (!consumed) {
|
||||||
const res = next(byte);
|
const res = d.next(byte);
|
||||||
consumed = res[1];
|
consumed = res[1];
|
||||||
// There are no errors in this sequence, so
|
// There are no errors in this sequence, so
|
||||||
// every byte should be consumed first try.
|
// every byte should be consumed first try.
|
||||||
@ -114,13 +123,14 @@ test "Well formed utf-8" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test "Partially invalid utf-8" {
|
test "Partially invalid utf-8" {
|
||||||
|
var d = init();
|
||||||
var out = std.mem.zeroes([5]u21);
|
var out = std.mem.zeroes([5]u21);
|
||||||
var i: usize = 0;
|
var i: usize = 0;
|
||||||
// Illegally terminated sequence, valid sequence, illegal surrogate pair.
|
// Illegally terminated sequence, valid sequence, illegal surrogate pair.
|
||||||
for ("\xF0\x9F😄\xED\xA0\x80") |byte| {
|
for ("\xF0\x9F😄\xED\xA0\x80") |byte| {
|
||||||
var consumed = false;
|
var consumed = false;
|
||||||
while (!consumed) {
|
while (!consumed) {
|
||||||
const res = next(byte);
|
const res = d.next(byte);
|
||||||
consumed = res[1];
|
consumed = res[1];
|
||||||
if (res[0]) |codepoint| {
|
if (res[0]) |codepoint| {
|
||||||
out[i] = codepoint;
|
out[i] = codepoint;
|
@ -5,10 +5,6 @@
|
|||||||
//! https://vt100.net/emu/dec_ansi_parser
|
//! https://vt100.net/emu/dec_ansi_parser
|
||||||
//! But has some modifications:
|
//! But has some modifications:
|
||||||
//!
|
//!
|
||||||
//! * utf8 state introduced to detect UTF8-encoded sequences. The
|
|
||||||
//! actual handling back OUT of the utf8 state is done manually in the
|
|
||||||
//! parser.
|
|
||||||
//!
|
|
||||||
//! * csi_param accepts the colon character (':') since the SGR command
|
//! * csi_param accepts the colon character (':') since the SGR command
|
||||||
//! accepts colon as a valid parameter value.
|
//! accepts colon as a valid parameter value.
|
||||||
//!
|
//!
|
||||||
@ -92,18 +88,11 @@ fn genTable() Table {
|
|||||||
|
|
||||||
// ground
|
// ground
|
||||||
{
|
{
|
||||||
const source = State.ground;
|
|
||||||
|
|
||||||
// events
|
// events
|
||||||
single(&result, 0x19, .ground, .ground, .execute);
|
single(&result, 0x19, .ground, .ground, .execute);
|
||||||
range(&result, 0, 0x17, .ground, .ground, .execute);
|
range(&result, 0, 0x17, .ground, .ground, .execute);
|
||||||
range(&result, 0x1C, 0x1F, .ground, .ground, .execute);
|
range(&result, 0x1C, 0x1F, .ground, .ground, .execute);
|
||||||
range(&result, 0x20, 0x7F, .ground, .ground, .print);
|
range(&result, 0x20, 0x7F, .ground, .ground, .print);
|
||||||
|
|
||||||
// => utf8
|
|
||||||
range(&result, 0xC2, 0xDF, source, .utf8, .collect);
|
|
||||||
range(&result, 0xE0, 0xEF, source, .utf8, .collect);
|
|
||||||
range(&result, 0xF0, 0xF4, source, .utf8, .collect);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// escape_intermediate
|
// escape_intermediate
|
||||||
|
@ -9,6 +9,7 @@ const kitty = @import("kitty.zig");
|
|||||||
const modes = @import("modes.zig");
|
const modes = @import("modes.zig");
|
||||||
const osc = @import("osc.zig");
|
const osc = @import("osc.zig");
|
||||||
const sgr = @import("sgr.zig");
|
const sgr = @import("sgr.zig");
|
||||||
|
const UTF8Decoder = @import("UTF8Decoder.zig");
|
||||||
const MouseShape = @import("mouse_shape.zig").MouseShape;
|
const MouseShape = @import("mouse_shape.zig").MouseShape;
|
||||||
|
|
||||||
const log = std.log.scoped(.stream);
|
const log = std.log.scoped(.stream);
|
||||||
@ -37,6 +38,7 @@ pub fn Stream(comptime Handler: type) type {
|
|||||||
|
|
||||||
handler: Handler,
|
handler: Handler,
|
||||||
parser: Parser = .{},
|
parser: Parser = .{},
|
||||||
|
utf8decoder: UTF8Decoder = .{},
|
||||||
|
|
||||||
pub fn deinit(self: *Self) void {
|
pub fn deinit(self: *Self) void {
|
||||||
self.parser.deinit();
|
self.parser.deinit();
|
||||||
@ -50,6 +52,21 @@ pub fn Stream(comptime Handler: type) type {
|
|||||||
/// Process the next character and call any callbacks if necessary.
|
/// Process the next character and call any callbacks if necessary.
|
||||||
pub fn next(self: *Self, c: u8) !void {
|
pub fn next(self: *Self, c: u8) !void {
|
||||||
// log.debug("char: {c}", .{c});
|
// log.debug("char: {c}", .{c});
|
||||||
|
if (self.parser.state == .ground and c != 0x1B) {
|
||||||
|
var consumed = false;
|
||||||
|
while (!consumed) {
|
||||||
|
const res = self.utf8decoder.next(c);
|
||||||
|
consumed = res[1];
|
||||||
|
if (res[0]) |codepoint| {
|
||||||
|
if (codepoint < 0xF) {
|
||||||
|
try self.execute(@intCast(codepoint));
|
||||||
|
} else {
|
||||||
|
try self.print(@intCast(codepoint));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
const actions = self.parser.next(c);
|
const actions = self.parser.next(c);
|
||||||
for (actions) |action_opt| {
|
for (actions) |action_opt| {
|
||||||
const action = action_opt orelse continue;
|
const action = action_opt orelse continue;
|
||||||
@ -101,6 +118,12 @@ pub fn Stream(comptime Handler: type) type {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn print(self: *Self, c: u21) !void {
|
||||||
|
if (@hasDecl(T, "print")) {
|
||||||
|
try self.handler.print(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn execute(self: *Self, c: u8) !void {
|
pub fn execute(self: *Self, c: u8) !void {
|
||||||
switch (@as(ansi.C0, @enumFromInt(c))) {
|
switch (@as(ansi.C0, @enumFromInt(c))) {
|
||||||
// We ignore SOH/STX: https://github.com/microsoft/terminal/issues/10786
|
// We ignore SOH/STX: https://github.com/microsoft/terminal/issues/10786
|
||||||
|
Reference in New Issue
Block a user