terminal: utf-8 decoding

This commit is contained in:
Mitchell Hashimoto
2022-05-16 09:31:07 -07:00
parent 75582cb5ec
commit ead4cec159
5 changed files with 132 additions and 20 deletions

View File

@ -501,7 +501,7 @@ fn renderTimerCallback(t: *libuv.Timer) void {
//------------------------------------------------------------------- //-------------------------------------------------------------------
// Stream Callbacks // Stream Callbacks
pub fn print(self: *Window, c: u8) !void { pub fn print(self: *Window, c: u21) !void {
try self.terminal.print(self.alloc, c); try self.terminal.print(self.alloc, c);
} }

View File

@ -5,6 +5,7 @@
const Parser = @This(); const Parser = @This();
const std = @import("std"); const std = @import("std");
const builtin = @import("builtin");
const testing = std.testing; const testing = std.testing;
const table = @import("parse_table.zig").table; const table = @import("parse_table.zig").table;
const osc = @import("osc.zig"); const osc = @import("osc.zig");
@ -28,6 +29,9 @@ pub const State = enum {
dcs_ignore, dcs_ignore,
osc_string, osc_string,
sos_pm_apc_string, sos_pm_apc_string,
// Custom states added that aren't present on vt100.net
utf8,
}; };
/// Transition action is an action that can be taken during a state /// Transition action is an action that can be taken during a state
@ -49,8 +53,8 @@ pub const TransitionAction = enum {
/// Action is the action that a caller of the parser is expected to /// Action is the action that a caller of the parser is expected to
/// take as a result of some input character. /// take as a result of some input character.
pub const Action = union(enum) { pub const Action = union(enum) {
/// Draw character to the screen. /// Draw character to the screen. This is a unicode codepoint.
print: u8, print: u21,
/// Execute the C0 or C1 function. /// Execute the C0 or C1 function.
execute: u8, execute: u8,
@ -97,8 +101,10 @@ const ParamSepState = enum(u8) {
mixed = 1, mixed = 1,
}; };
/// Maximum number of intermediate characters during parsing. /// Maximum number of intermediate characters during parsing. This is
const MAX_INTERMEDIATE = 2; /// 4 because we also use the intermediates array for UTF8 decoding which
/// can be at most 4 bytes.
const MAX_INTERMEDIATE = 4;
const MAX_PARAMS = 16; const MAX_PARAMS = 16;
/// Current state of the state machine /// Current state of the state machine
@ -126,6 +132,11 @@ pub fn init() Parser {
/// Up to 3 actions may need to be executed -- in order -- representing /// Up to 3 actions may need to be executed -- in order -- representing
/// the state exit, transition, and entry actions. /// the state exit, transition, and entry actions.
pub fn next(self: *Parser, c: u8) [3]?Action { pub fn next(self: *Parser, c: u8) [3]?Action {
// If we're processing UTF-8, we handle this manually.
if (self.state == .utf8) {
return .{ self.next_utf8(c), null, null };
}
const effect = effect: { const effect = effect: {
// First look up the transition in the anywhere table. // First look up the transition in the anywhere table.
const anywhere = table[c][@enumToInt(State.anywhere)]; const anywhere = table[c][@enumToInt(State.anywhere)];
@ -143,6 +154,13 @@ pub fn next(self: *Parser, c: u8) [3]?Action {
// After generating the actions, we set our next state. // After generating the actions, we set our next state.
defer self.state = next_state; defer self.state = next_state;
// In debug mode, we log bad state transitions.
if (builtin.mode == .Debug) {
if (next_state == .anywhere) {
log.warn("state transition to 'anywhere', likely bug: {x}", .{c});
}
}
// When going from one state to another, the actions take place in this order: // When going from one state to another, the actions take place in this order:
// //
// 1. exit action from old state // 1. exit action from old state
@ -183,21 +201,55 @@ pub fn next(self: *Parser, c: u8) [3]?Action {
}; };
} }
/// Feeds one more byte into an in-progress UTF-8 sequence.
///
/// The byte is appended to the intermediates array, and the total sequence
/// length is derived from intermediates[0], which is expected to already
/// hold the leading byte (placed there by the state machine transition
/// into the utf8 state).
///
/// Returns null while the sequence is still incomplete, and also when the
/// collected bytes fail to decode. Returns a print action carrying the
/// decoded codepoint otherwise. Once enough bytes have been collected the
/// parser always returns to the ground state and the intermediates index
/// is reset, whether or not decoding succeeded.
fn next_utf8(self: *Parser, c: u8) ?Action {
    self.collect(c);

    // A failure here means the state machine let a byte that is not a valid
    // UTF-8 leading byte start a sequence. That is a parser bug we want to
    // surface, not recover from.
    const lead = self.intermediates[0];
    const needed = std.unicode.utf8ByteSequenceLength(lead) catch unreachable;

    if (self.intermediates_idx >= needed) {
        const seq = self.intermediates[0..needed];
        const result: ?Action = if (std.unicode.utf8Decode(seq)) |codepoint|
            Action{ .print = codepoint }
        else |_| invalid: {
            log.warn("invalid UTF-8 sequence: {any}", .{seq});
            break :invalid null;
        };

        // The sequence is finished either way, so leave the utf8 state.
        self.state = .ground;
        self.intermediates_idx = 0;
        return result;
    }

    // Still waiting on continuation bytes.
    return null;
}
/// Appends `c` to the intermediates array and advances the write index.
/// If MAX_INTERMEDIATE entries are already stored, the byte is dropped
/// and a warning is logged instead.
fn collect(self: *Parser, c: u8) void {
    const idx = self.intermediates_idx;
    if (idx < MAX_INTERMEDIATE) {
        self.intermediates[idx] = c;
        self.intermediates_idx = idx + 1;
        return;
    }

    log.warn("invalid intermediates count", .{});
}
fn doAction(self: *Parser, action: TransitionAction, c: u8) ?Action { fn doAction(self: *Parser, action: TransitionAction, c: u8) ?Action {
return switch (action) { return switch (action) {
.none, .ignore => null, .none, .ignore => null,
.print => Action{ .print = c }, .print => Action{ .print = c },
.execute => Action{ .execute = c }, .execute => Action{ .execute = c },
.collect => collect: { .collect => collect: {
if (self.intermediates_idx >= MAX_INTERMEDIATE) { self.collect(c);
log.warn("invalid intermediates count", .{});
break :collect null;
}
self.intermediates[self.intermediates_idx] = c;
self.intermediates_idx += 1;
// The client is expected to perform no action.
break :collect null; break :collect null;
}, },
.param => param: { .param => param: {
@ -433,3 +485,56 @@ test "osc: change window title" {
try testing.expect(cmd == .change_window_title); try testing.expect(cmd == .change_window_title);
} }
} }
test "print: utf8 2 byte" {
    // Feed every byte of a two-byte character; only the actions produced
    // by the final byte are inspected.
    const input = "£";
    var parser = init();
    var actions: [3]?Action = undefined;
    for (input) |byte| {
        actions = parser.next(byte);
    }

    // The parser must be back in ground with exactly one print action.
    try testing.expect(parser.state == .ground);
    try testing.expect(actions[0].? == .print);
    try testing.expect(actions[1] == null);
    try testing.expect(actions[2] == null);

    const expected = try std.unicode.utf8Decode(input);
    try testing.expectEqual(expected, actions[0].?.print);
}
test "print: utf8 3 byte" {
    // U+20AC (euro sign) encodes to three bytes: E2 82 AC. It is written
    // as an escape so the literal cannot be lost to an encoding mishap,
    // which would leave the loop below with nothing to feed the parser
    // and `a` never assigned.
    const input = "\u{20AC}";

    var p = init();
    var a: [3]?Action = undefined;
    for (input) |c| a = p.next(c);

    // After the last byte the parser is back in ground and has produced
    // exactly one print action.
    try testing.expect(p.state == .ground);
    try testing.expect(a[0].? == .print);
    try testing.expect(a[1] == null);
    try testing.expect(a[2] == null);

    const rune = a[0].?.print;
    try testing.expectEqual(try std.unicode.utf8Decode(input), rune);
}
test "print: utf8 4 byte" {
    // Feed every byte of a four-byte character; only the actions produced
    // by the final byte are inspected.
    const input = "𐍈";
    var parser = init();
    var actions: [3]?Action = undefined;
    for (input) |byte| {
        actions = parser.next(byte);
    }

    // The parser must be back in ground with exactly one print action.
    try testing.expect(parser.state == .ground);
    try testing.expect(actions[0].? == .print);
    try testing.expect(actions[1] == null);
    try testing.expect(actions[2] == null);

    const expected = try std.unicode.utf8Decode(input);
    try testing.expectEqual(expected, actions[0].?.print);
}
test "print: utf8 invalid" {
    // 0xC3 starts a multi-byte sequence, but 0x28 is not a valid
    // continuation byte, so nothing should be printed.
    const input = "\xC3\x28";
    var parser = init();
    var actions: [3]?Action = undefined;
    for (input) |byte| {
        actions = parser.next(byte);
    }

    // The parser recovers to ground and emits no actions at all.
    try testing.expect(parser.state == .ground);
    for (actions) |action| {
        try testing.expect(action == null);
    }
}

View File

@ -164,7 +164,7 @@ pub fn setAttribute(self: *Terminal, attr: sgr.Attribute) !void {
} }
} }
pub fn print(self: *Terminal, alloc: Allocator, c: u8) !void { pub fn print(self: *Terminal, alloc: Allocator, c: u21) !void {
const tracy = trace(@src()); const tracy = trace(@src());
defer tracy.end(); defer tracy.end();

View File

@ -31,7 +31,7 @@ fn genTableType() type {
/// Function to generate the full state transition table for VT emulation. /// Function to generate the full state transition table for VT emulation.
fn genTable() Table { fn genTable() Table {
@setEvalBranchQuota(15000); @setEvalBranchQuota(20000);
var result: Table = undefined; var result: Table = undefined;
// Initialize everything so every state transition exists // Initialize everything so every state transition exists
@ -45,6 +45,8 @@ fn genTable() Table {
// ground // ground
{ {
const source = State.ground;
// anywhere => // anywhere =>
single(&result, 0x18, .anywhere, .ground, .execute); single(&result, 0x18, .anywhere, .ground, .execute);
single(&result, 0x1A, .anywhere, .ground, .execute); single(&result, 0x1A, .anywhere, .ground, .execute);
@ -55,6 +57,11 @@ fn genTable() Table {
range(&result, 0, 0x17, .ground, .ground, .execute); range(&result, 0, 0x17, .ground, .ground, .execute);
range(&result, 0x1C, 0x1F, .ground, .ground, .execute); range(&result, 0x1C, 0x1F, .ground, .ground, .execute);
range(&result, 0x20, 0x7F, .ground, .ground, .print); range(&result, 0x20, 0x7F, .ground, .ground, .print);
// => utf8
range(&result, 0xC2, 0xDF, source, .utf8, .collect);
range(&result, 0xE0, 0xEF, source, .utf8, .collect);
range(&result, 0xF0, 0xF4, source, .utf8, .collect);
} }
// escape_intermediate // escape_intermediate

View File

@ -45,7 +45,7 @@ pub fn Stream(comptime Handler: type) type {
const tracy = trace(@src()); const tracy = trace(@src());
defer tracy.end(); defer tracy.end();
//log.debug("char: {}", .{c}); //log.debug("char: {x}", .{c});
const actions = self.parser.next(c); const actions = self.parser.next(c);
for (actions) |action_opt| { for (actions) |action_opt| {
// if (action_opt) |action| log.info("action: {}", .{action}); // if (action_opt) |action| log.info("action: {}", .{action});
@ -324,16 +324,16 @@ pub fn Stream(comptime Handler: type) type {
test "stream: print" { test "stream: print" {
const H = struct { const H = struct {
c: ?u8 = 0, c: ?u21 = 0,
pub fn print(self: *@This(), c: u8) !void { pub fn print(self: *@This(), c: u21) !void {
self.c = c; self.c = c;
} }
}; };
var s: Stream(H) = .{ .handler = .{} }; var s: Stream(H) = .{ .handler = .{} };
try s.next('x'); try s.next('x');
try testing.expectEqual(@as(u8, 'x'), s.handler.c.?); try testing.expectEqual(@as(u21, 'x'), s.handler.c.?);
} }
test "stream: cursor right (CUF)" { test "stream: cursor right (CUF)" {