Merge pull request #1475 from qwerasd205/fix-invalid-boundary-crash

Fix boundary utf-8 decoding crash
2025-07-15 08:16:13 +03:00 · 2024-02-06 08:46:21 -08:00
parent 7256c8e091 fdc67a08f4
commit 238f86dda5
2 changed files with 58 additions and 60 deletions
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@ -119,3 +119,20 @@ test "decode invalid UTF-8" {

    try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
 }
+
+// This is testing our current behavior so that we know we have to handle
+// this case in terminal/stream.zig. If we change this behavior, we can
+// remove the special handling in terminal/stream.zig.
+test "decode invalid leading byte isn't consumed or replaced" {
+    const testing = std.testing;
+
+    var output: [64]u32 = undefined;
+
+    {
+        const str = "hello\xFF";
+        try testing.expectEqual(DecodeResult{
+            .consumed = 5,
+            .decoded = 5,
+        }, utf8DecodeUntilControlSeq(str, &output));
+    }
+}
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@ -42,11 +42,6 @@ pub fn Stream(comptime Handler: type) type {
        parser: Parser = .{},
        utf8decoder: UTF8Decoder = .{},

-        /// Keep track of any partial UTF-8 sequences that we need to
-        /// process in the next call to nextAssumeUtf8.
-        partial_utf8: [4]u8 = undefined,
-        partial_utf8_len: u3 = 0,
-
        pub fn deinit(self: *Self) void {
            self.parser.deinit();
        }
@ -73,9 +68,12 @@ pub fn Stream(comptime Handler: type) type {

            var offset: usize = 0;

-            // If we have a partial UTF-8 sequence then we process manually.
-            if (self.partial_utf8_len > 0) {
-                offset += try self.completePartialUtf8(input);
+            // If the scalar UTF-8 decoder was in the middle of processing
+            // a code sequence, we continue until it's not.
+            while (self.utf8decoder.state != 0) {
+                if (offset >= input.len) return;
+                try self.next(input[offset]);
+                offset += 1;
            } else if (self.parser.state != .ground) {
                // If we're not in the ground state then we process until
                // we are. This can happen if the last chunk of input put us
@ -105,13 +103,11 @@ pub fn Stream(comptime Handler: type) type {
                if (offset >= input.len) return;

                // If our offset is NOT an escape then we must have a
-                // partial UTF-8 sequence. In that case, we save it and
-                // return.
+                // partial UTF-8 sequence. In that case, we pass it off
+                // to the scalar parser.
                if (input[offset] != 0x1B) {
                    const rem = input[offset..];
-                    assert(rem.len <= self.partial_utf8.len);
-                    @memcpy(self.partial_utf8[0..rem.len], rem);
-                    self.partial_utf8_len = @intCast(rem.len);
+                    for (rem) |c| try self.next(c);
                    return;
                }

@ -124,53 +120,6 @@ pub fn Stream(comptime Handler: type) type {
            }
        }

-        // Complete a partial UTF-8 sequence from a prior input chunk.
-        // This processes the UTF-8 sequence and then returns the number
-        // of bytes consumed from the input.
-        fn completePartialUtf8(self: *Self, input: []const u8) !usize {
-            assert(self.partial_utf8_len > 0);
-            assert(self.parser.state == .ground);
-
-            // This cannot fail because the nature of partial utf8
-            // existing means we successfully processed it last time.
-            const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
-                unreachable;
-
-            // This is the length we need in the input in addition to
-            // our partial_utf8 to complete the sequence.
-            const input_len = len - self.partial_utf8_len;
-
-            // If we STILL don't have enough bytes, then we copy and continue.
-            // This is a really bizarre and stupid program thats running to
-            // send us incomplete UTF-8 sequences over multiple write() calls.
-            if (input_len > input.len) {
-                @memcpy(
-                    self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
-                    input,
-                );
-                self.partial_utf8_len += @intCast(input.len);
-                return input.len;
-            }
-
-            // Process the complete UTF-8 sequence.
-            @memcpy(
-                self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
-                input[0..input_len],
-            );
-            const cp = cp: {
-                if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
-                    break :cp cp;
-                } else |err| {
-                    log.warn("invalid UTF-8, ignoring err={}", .{err});
-                    break :cp 0xFFFD; // replacement character
-                }
-            };
-
-            self.partial_utf8_len = 0;
-            try self.print(cp);
-            return input_len;
-        }
-
        /// Like nextSlice but takes one byte and is necessarilly a scalar
        /// operation that can't use SIMD. Prefer nextSlice if you can and
        /// try to get multiple bytes at once.
@ -1506,6 +1455,38 @@ test "stream: print" {
    try testing.expectEqual(@as(u21, 'x'), s.handler.c.?);
 }

+test "simd: print invalid utf-8" {
+    const H = struct {
+        c: ?u21 = 0,
+
+        pub fn print(self: *@This(), c: u21) !void {
+            self.c = c;
+        }
+    };
+
+    var s: Stream(H) = .{ .handler = .{} };
+    try s.nextSlice(&.{0xFF});
+    try testing.expectEqual(@as(u21, 0xFFFD), s.handler.c.?);
+}
+
+test "simd: complete incomplete utf-8" {
+    const H = struct {
+        c: ?u21 = null,
+
+        pub fn print(self: *@This(), c: u21) !void {
+            self.c = c;
+        }
+    };
+
+    var s: Stream(H) = .{ .handler = .{} };
+    try s.nextSlice(&.{0xE0}); // 3 byte
+    try testing.expect(s.handler.c == null);
+    try s.nextSlice(&.{0xA0}); // still incomplete
+    try testing.expect(s.handler.c == null);
+    try s.nextSlice(&.{0x80});
+    try testing.expectEqual(@as(u21, 0x800), s.handler.c.?);
+}
+
 test "stream: cursor right (CUF)" {
    const H = struct {
        amount: u16 = 0,