diff --git a/src/simd/vt.zig b/src/simd/vt.zig
index 2e59957c1..dc1c0a511 100644
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@@ -119,3 +119,20 @@ test "decode invalid UTF-8" {
 
     try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
 }
+
+// This is testing our current behavior so that we know we have to handle
+// this case in terminal/stream.zig. If we change this behavior, we can
+// remove the special handling in terminal/stream.zig.
+test "decode invalid leading byte isn't consumed or replaced" {
+    const testing = std.testing;
+
+    var output: [64]u32 = undefined;
+
+    {
+        const str = "hello\xFF";
+        try testing.expectEqual(DecodeResult{
+            .consumed = 5,
+            .decoded = 5,
+        }, utf8DecodeUntilControlSeq(str, &output));
+    }
+}
diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig
index bb1c7b74e..5d016bffc 100644
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@@ -42,11 +42,6 @@ pub fn Stream(comptime Handler: type) type {
         parser: Parser = .{},
         utf8decoder: UTF8Decoder = .{},
 
-        /// Keep track of any partial UTF-8 sequences that we need to
-        /// process in the next call to nextAssumeUtf8.
-        partial_utf8: [4]u8 = undefined,
-        partial_utf8_len: u3 = 0,
-
         pub fn deinit(self: *Self) void {
             self.parser.deinit();
         }
@@ -73,9 +68,12 @@
 
             var offset: usize = 0;
 
-            // If we have a partial UTF-8 sequence then we process manually.
-            if (self.partial_utf8_len > 0) {
-                offset += try self.completePartialUtf8(input);
+            // If the scalar UTF-8 decoder was in the middle of processing
+            // a code sequence, we continue until it's not.
+            while (self.utf8decoder.state != 0) {
+                if (offset >= input.len) return;
+                try self.next(input[offset]);
+                offset += 1;
             } else if (self.parser.state != .ground) {
                 // If we're not in the ground state then we process until
                 // we are. This can happen if the last chunk of input put us
@@ -105,13 +103,11 @@
             if (offset >= input.len) return;
 
             // If our offset is NOT an escape then we must have a
-            // partial UTF-8 sequence. In that case, we save it and
-            // return.
+            // partial UTF-8 sequence. In that case, we pass it off
+            // to the scalar parser.
             if (input[offset] != 0x1B) {
                 const rem = input[offset..];
-                assert(rem.len <= self.partial_utf8.len);
-                @memcpy(self.partial_utf8[0..rem.len], rem);
-                self.partial_utf8_len = @intCast(rem.len);
+                for (rem) |c| try self.next(c);
                 return;
             }
 
@@ -124,53 +120,6 @@
             }
         }
-
-        // Complete a partial UTF-8 sequence from a prior input chunk.
-        // This processes the UTF-8 sequence and then returns the number
-        // of bytes consumed from the input.
-        fn completePartialUtf8(self: *Self, input: []const u8) !usize {
-            assert(self.partial_utf8_len > 0);
-            assert(self.parser.state == .ground);
-
-            // This cannot fail because the nature of partial utf8
-            // existing means we successfully processed it last time.
-            const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
-                unreachable;
-
-            // This is the length we need in the input in addition to
-            // our partial_utf8 to complete the sequence.
-            const input_len = len - self.partial_utf8_len;
-
-            // If we STILL don't have enough bytes, then we copy and continue.
-            // This is a really bizarre and stupid program thats running to
-            // send us incomplete UTF-8 sequences over multiple write() calls.
-            if (input_len > input.len) {
-                @memcpy(
-                    self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
-                    input,
-                );
-                self.partial_utf8_len += @intCast(input.len);
-                return input.len;
-            }
-
-            // Process the complete UTF-8 sequence.
-            @memcpy(
-                self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
-                input[0..input_len],
-            );
-            const cp = cp: {
-                if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
-                    break :cp cp;
-                } else |err| {
-                    log.warn("invalid UTF-8, ignoring err={}", .{err});
-                    break :cp 0xFFFD; // replacement character
-                }
-            };
-
-            self.partial_utf8_len = 0;
-            try self.print(cp);
-            return input_len;
-        }
-
         /// Like nextSlice but takes one byte and is necessarilly a scalar
         /// operation that can't use SIMD. Prefer nextSlice if you can and
         /// try to get multiple bytes at once.
@@ -1506,6 +1455,38 @@ test "stream: print" {
     try testing.expectEqual(@as(u21, 'x'), s.handler.c.?);
 }
+
+test "simd: print invalid utf-8" {
+    const H = struct {
+        c: ?u21 = 0,
+
+        pub fn print(self: *@This(), c: u21) !void {
+            self.c = c;
+        }
+    };
+
+    var s: Stream(H) = .{ .handler = .{} };
+    try s.nextSlice(&.{0xFF});
+    try testing.expectEqual(@as(u21, 0xFFFD), s.handler.c.?);
+}
+
+test "simd: complete incomplete utf-8" {
+    const H = struct {
+        c: ?u21 = null,
+
+        pub fn print(self: *@This(), c: u21) !void {
+            self.c = c;
+        }
+    };
+
+    var s: Stream(H) = .{ .handler = .{} };
+    try s.nextSlice(&.{0xE0}); // 3 byte
+    try testing.expect(s.handler.c == null);
+    try s.nextSlice(&.{0xA0}); // still incomplete
+    try testing.expect(s.handler.c == null);
+    try s.nextSlice(&.{0x80});
+    try testing.expectEqual(@as(u21, 0x800), s.handler.c.?);
+}
+
 test "stream: cursor right (CUF)" {
     const H = struct {
         amount: u16 = 0,