From f8c544c11978dbe34b70518f7adc66fc40f94821 Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Wed, 7 Feb 2024 00:12:37 -0500 Subject: [PATCH 1/3] terminal: stream/parser changes --- src/terminal/Parser.zig | 4 +-- src/terminal/stream.zig | 57 ++++++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig index 5746be065..bfb25f2fa 100644 --- a/src/terminal/Parser.zig +++ b/src/terminal/Parser.zig @@ -185,7 +185,7 @@ pub const Action = union(enum) { /// Keeps track of the parameter sep used for CSI params. We allow colons /// to be used ONLY by the 'm' CSI action. -const ParamSepState = enum(u8) { +pub const ParamSepState = enum(u8) { none = 0, semicolon = ';', colon = ':', @@ -279,7 +279,7 @@ pub fn next(self: *Parser, c: u8) [3]?Action { }; } -fn collect(self: *Parser, c: u8) void { +pub fn collect(self: *Parser, c: u8) void { if (self.intermediates_idx >= MAX_INTERMEDIATE) { log.warn("invalid intermediates count", .{}); return; diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index 982a8ab20..0362780c2 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -74,17 +74,14 @@ pub fn Stream(comptime Handler: type) type { if (offset >= input.len) return; try self.next(input[offset]); offset += 1; - } else if (self.parser.state != .ground) { - // If we're not in the ground state then we process until - // we are. This can happen if the last chunk of input put us - // in the middle of a control sequence. - for (input[offset..]) |single| { - try self.nextNonUtf8(single); - offset += 1; - if (self.parser.state == .ground) break; - } } + // If we're not in the ground state then we process until + // we are. This can happen if the last chunk of input put us + // in the middle of a control sequence. + offset += try self.consumeUntilGround(input[offset..]); + if (offset >= input.len) return; + // If we're in the ground state then we can use SIMD to process // input until we see an ESC (0x1B), since all other characters // up to that point are just UTF-8. @@ -112,26 +109,46 @@ pub fn Stream(comptime Handler: type) type { } // Process our control sequence. - for (input[offset..]) |single| { - try self.nextNonUtf8(single); - offset += 1; - if (self.parser.state == .ground) break; - } + self.parser.state = .escape; + offset += 1; + offset += try self.consumeUntilGround(input[offset..]); } } + /// Parses escape sequences until the parser reaches the ground state. + /// Returns the number of bytes consumed from the provided input. + inline fn consumeUntilGround(self: *Self, input: []const u8) !usize { + var offset: usize = 0; + while (self.parser.state != .ground) { + if (offset >= input.len) return input.len; + try self.nextNonUtf8(input[offset]); + offset += 1; + } + return offset; + } + /// Like nextSlice but takes one byte and is necessarilly a scalar /// operation that can't use SIMD. Prefer nextSlice if you can and /// try to get multiple bytes at once. pub fn next(self: *Self, c: u8) !void { // The scalar path can be responsible for decoding UTF-8. if (self.parser.state == .ground and c != 0x1B) { - var consumed = false; - while (!consumed) { - const res = self.utf8decoder.next(c); - consumed = res[1]; - if (res[0]) |codepoint| { - if (codepoint <= 0xF) { + const res = self.utf8decoder.next(c); + const consumed = res[1]; + if (res[0]) |codepoint| { + if (codepoint < 0xF) { + try self.execute(@intCast(codepoint)); + } else { + try self.print(@intCast(codepoint)); + } + } + if (!consumed) { + const retry = self.utf8decoder.next(c); + // It should be impossible for the decoder + // to not consume the byte twice in a row. + assert(retry[1] == true); + if (retry[0]) |codepoint| { + if (codepoint < 0xF) { try self.execute(@intCast(codepoint)); } else { try self.print(@intCast(codepoint)); From 68c0813397325236b8876fb70832d86a1e06209d Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Thu, 8 Feb 2024 21:49:58 -0500 Subject: [PATCH 2/3] terminal/stream: Added ESC parsing fast tracks --- src/bench/stream.sh | 4 +- src/terminal/Parser.zig | 2 +- src/terminal/stream.zig | 128 ++++++++++++++++++++++++++++++++-------- 3 files changed, 105 insertions(+), 29 deletions(-) diff --git a/src/bench/stream.sh b/src/bench/stream.sh index 41d62f234..5f2e4d311 100755 --- a/src/bench/stream.sh +++ b/src/bench/stream.sh @@ -17,8 +17,8 @@ SIZE="25000000" # Generate the benchmark input ahead of time so it's not included in the time. ./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data -# Uncomment to instead use the contents of `stream.txt` as input. -# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data +# Uncomment to instead use the contents of `stream.txt` as input. (Ignores SIZE) +# echo $(cat ./stream.txt) > /tmp/ghostty_bench_data hyperfine \ --warmup 10 \ diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig index bfb25f2fa..f160619e2 100644 --- a/src/terminal/Parser.zig +++ b/src/terminal/Parser.zig @@ -390,7 +390,7 @@ fn doAction(self: *Parser, action: TransitionAction, c: u8) ?Action { }; } -fn clear(self: *Parser) void { +pub fn clear(self: *Parser) void { self.intermediates_idx = 0; self.params_idx = 0; self.params_sep = .none; diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index 0362780c2..bdfcf4155 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -72,7 +72,7 @@ pub fn Stream(comptime Handler: type) type { // a code sequence, we continue until it's not. while (self.utf8decoder.state != 0) { if (offset >= input.len) return; - try self.next(input[offset]); + try self.nextUtf8(input[offset]); offset += 1; } @@ -80,6 +80,7 @@ pub fn Stream(comptime Handler: type) type { // we are. This can happen if the last chunk of input put us // in the middle of a control sequence. offset += try self.consumeUntilGround(input[offset..]); + offset += try self.consumeAllEscapes(input[offset..]); if (offset >= input.len) return; // If we're in the ground state then we can use SIMD to process @@ -94,9 +95,9 @@ pub fn Stream(comptime Handler: type) type { try self.print(@intCast(cp)); } } - // Consume the bytes we just processed. offset += res.consumed; + if (offset >= input.len) return; // If our offset is NOT an escape then we must have a @@ -104,20 +105,35 @@ pub fn Stream(comptime Handler: type) type { // to the scalar parser. if (input[offset] != 0x1B) { const rem = input[offset..]; - for (rem) |c| try self.next(c); + for (rem) |c| try self.nextUtf8(c); return; } - // Process our control sequence. + // Process control sequences until we run out. + offset += try self.consumeAllEscapes(input[offset..]); + } + } + + /// Parses back-to-back escape sequences until none are left. + /// Returns the number of bytes consumed from the provided input. + /// + /// Expects input to start with 0x1B, use consumeUntilGround first + /// if the stream may be in the middle of an escape sequence. + fn consumeAllEscapes(self: *Self, input: []const u8) !usize { + var offset: usize = 0; + while (input[offset] == 0x1B) { self.parser.state = .escape; + self.parser.clear(); offset += 1; offset += try self.consumeUntilGround(input[offset..]); + if (offset >= input.len) return input.len; } + return offset; } /// Parses escape sequences until the parser reaches the ground state. /// Returns the number of bytes consumed from the provided input. - inline fn consumeUntilGround(self: *Self, input: []const u8) !usize { + fn consumeUntilGround(self: *Self, input: []const u8) !usize { var offset: usize = 0; while (self.parser.state != .ground) { if (offset >= input.len) return input.len; @@ -133,33 +149,42 @@ pub fn Stream(comptime Handler: type) type { pub fn next(self: *Self, c: u8) !void { // The scalar path can be responsible for decoding UTF-8. if (self.parser.state == .ground and c != 0x1B) { - const res = self.utf8decoder.next(c); - const consumed = res[1]; - if (res[0]) |codepoint| { - if (codepoint < 0xF) { + try self.nextUtf8(c); + return; + } + + try self.nextNonUtf8(c); + } + + /// Process the next byte and print as necessary. + /// + /// This assumes we're in the UTF-8 decoding state. If we may not + /// be in the UTF-8 decoding state call nextSlice or next. + fn nextUtf8(self: *Self, c: u8) !void { + assert(self.parser.state == .ground and c != 0x1B); + + const res = self.utf8decoder.next(c); + const consumed = res[1]; + if (res[0]) |codepoint| { + if (codepoint <= 0xF) { + try self.execute(@intCast(codepoint)); + } else { + try self.print(@intCast(codepoint)); + } + } + if (!consumed) { + const retry = self.utf8decoder.next(c); + // It should be impossible for the decoder + // to not consume the byte twice in a row. + assert(retry[1] == true); + if (retry[0]) |codepoint| { + if (codepoint <= 0xF) { try self.execute(@intCast(codepoint)); } else { try self.print(@intCast(codepoint)); } } - if (!consumed) { - const retry = self.utf8decoder.next(c); - // It should be impossible for the decoder - // to not consume the byte twice in a row. - assert(retry[1] == true); - if (retry[0]) |codepoint| { - if (codepoint < 0xF) { - try self.execute(@intCast(codepoint)); - } else { - try self.print(@intCast(codepoint)); - } - } - } - - return; } - - try self.nextNonUtf8(c); } /// Process the next character and call any callbacks if necessary. @@ -169,6 +194,57 @@ pub fn Stream(comptime Handler: type) type { fn nextNonUtf8(self: *Self, c: u8) !void { assert(self.parser.state != .ground or c == 0x1B); + // Fast path for ESC + if (self.parser.state == .ground and c == 0x1B) { + self.parser.state = .escape; + self.parser.clear(); + return; + } + // Fast path for CSI entry. + if (self.parser.state == .escape and c == '[') { + self.parser.state = .csi_entry; + return; + } + // Fast path for CSI params. + if (self.parser.state == .csi_param) csi_param: { + switch (c) { + // A C0 escape (yes, this is valid): + 0x00...0x0F => try self.execute(c), + // We ignore C0 escapes > 0xF since execute + // doesn't have processing for them anyway: + 0x10...0x17, 0x19, 0x1C...0x1F => {}, + // We don't currently have any handling for + // 0x18 or 0x1A, but they should still move + // the parser state to ground. + 0x18, 0x1A => self.parser.state = .ground, + // A parameter digit: + '0'...'9' => if (self.parser.params_idx < 16) { + self.parser.param_acc *= 10; + self.parser.param_acc += c - '0'; + self.parser.param_acc_idx |= 1; + }, + // A parameter separator: + ':', ';' => if (self.parser.params_idx < 16) { + self.parser.params[self.parser.params_idx] = self.parser.param_acc; + self.parser.params_idx += 1; + + self.parser.param_acc = 0; + self.parser.param_acc_idx = 0; + + // Keep track of separator state. + const sep: Parser.ParamSepState = @enumFromInt(c); + if (self.parser.params_idx == 1) self.parser.params_sep = sep; + if (self.parser.params_sep != sep) self.parser.params_sep = .mixed; + }, + // Explicitly ignored: + 0x7F => {}, + // Defer to the state machine to + // handle any other characters: + else => break :csi_param, + } + return; + } + const actions = self.parser.next(c); for (actions) |action_opt| { const action = action_opt orelse continue; From 777ecffe6b45b46578e20d5d2b25107e7d6a3a58 Mon Sep 17 00:00:00 2001 From: Qwerasd Date: Thu, 8 Feb 2024 22:34:21 -0500 Subject: [PATCH 3/3] fix(terminal/stream): fix OOB read and integer overflow --- src/terminal/stream.zig | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index bdfcf4155..fc97d3685 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -75,13 +75,14 @@ pub fn Stream(comptime Handler: type) type { try self.nextUtf8(input[offset]); offset += 1; } + if (offset >= input.len) return; // If we're not in the ground state then we process until // we are. This can happen if the last chunk of input put us // in the middle of a control sequence. offset += try self.consumeUntilGround(input[offset..]); - offset += try self.consumeAllEscapes(input[offset..]); if (offset >= input.len) return; + offset += try self.consumeAllEscapes(input[offset..]); // If we're in the ground state then we can use SIMD to process // input until we see an ESC (0x1B), since all other characters @@ -219,8 +220,12 @@ pub fn Stream(comptime Handler: type) type { 0x18, 0x1A => self.parser.state = .ground, // A parameter digit: '0'...'9' => if (self.parser.params_idx < 16) { - self.parser.param_acc *= 10; - self.parser.param_acc += c - '0'; + self.parser.param_acc *|= 10; + self.parser.param_acc +|= c - '0'; + // The parser's CSI param action uses param_acc_idx + // to decide if there's a final param that needs to + // be consumed or not, but it doesn't matter really + // what it is as long as it's not 0. self.parser.param_acc_idx |= 1; }, // A parameter separator: