fix(terminal): Fix boundary utf-8 decoding crash

Get rid of completePartialUtf8 and simply use the scalar parse (`.next`) to resolve boundary conditions instead.
This commit is contained in:
Qwerasd
2024-02-06 02:51:04 -05:00
parent 7256c8e091
commit 5769bb16dd

View File

@ -73,9 +73,11 @@ pub fn Stream(comptime Handler: type) type {
var offset: usize = 0; var offset: usize = 0;
// If we have a partial UTF-8 sequence then we process manually. // If the scalar UTF-8 decoder was in the middle of processing
if (self.partial_utf8_len > 0) { // a code sequence, we continue until it's not.
offset += try self.completePartialUtf8(input); while (self.utf8decoder.state != 0) {
try self.next(input[offset]);
offset += 1;
} else if (self.parser.state != .ground) { } else if (self.parser.state != .ground) {
// If we're not in the ground state then we process until // If we're not in the ground state then we process until
// we are. This can happen if the last chunk of input put us // we are. This can happen if the last chunk of input put us
@ -105,13 +107,14 @@ pub fn Stream(comptime Handler: type) type {
if (offset >= input.len) return; if (offset >= input.len) return;
// If our offset is NOT an escape then we must have a // If our offset is NOT an escape then we must have a
// partial UTF-8 sequence. In that case, we save it and // partial UTF-8 sequence. In that case, we pass it off
// return. // to the scalar parser.
if (input[offset] != 0x1B) { if (input[offset] != 0x1B) {
const rem = input[offset..]; const rem = input[offset..];
assert(rem.len <= self.partial_utf8.len); assert(rem.len <= self.partial_utf8.len);
@memcpy(self.partial_utf8[0..rem.len], rem); for (rem) |c| {
self.partial_utf8_len = @intCast(rem.len); try self.next(c);
}
return; return;
} }
@ -124,53 +127,6 @@ pub fn Stream(comptime Handler: type) type {
} }
} }
// Complete a partial UTF-8 sequence from a prior input chunk.
// This processes the UTF-8 sequence and then returns the number
// of bytes consumed from the input.
fn completePartialUtf8(self: *Self, input: []const u8) !usize {
assert(self.partial_utf8_len > 0);
assert(self.parser.state == .ground);
// This cannot fail because the nature of partial utf8
// existing means we successfully processed it last time.
const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
unreachable;
// This is the length we need in the input in addition to
// our partial_utf8 to complete the sequence.
const input_len = len - self.partial_utf8_len;
// If we STILL don't have enough bytes, then we copy and continue.
// This is a really bizarre and stupid program that's running to
// send us incomplete UTF-8 sequences over multiple write() calls.
if (input_len > input.len) {
@memcpy(
self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
input,
);
self.partial_utf8_len += @intCast(input.len);
return input.len;
}
// Process the complete UTF-8 sequence.
@memcpy(
self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
input[0..input_len],
);
const cp = cp: {
if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
break :cp cp;
} else |err| {
log.warn("invalid UTF-8, ignoring err={}", .{err});
break :cp 0xFFFD; // replacement character
}
};
self.partial_utf8_len = 0;
try self.print(cp);
return input_len;
}
/// Like nextSlice but takes one byte and is necessarily a scalar /// Like nextSlice but takes one byte and is necessarily a scalar
/// operation that can't use SIMD. Prefer nextSlice if you can and /// operation that can't use SIMD. Prefer nextSlice if you can and
/// try to get multiple bytes at once. /// try to get multiple bytes at once.