From 351d9eb4021ed8b915dc79e8cfb3a75ed8be4fb2 Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <mitchell.hashimoto@gmail.com>
Date: Sun, 4 Feb 2024 10:59:27 -0800
Subject: [PATCH] terminal: use new VT simd to process slices

---
 src/terminal/stream.zig | 175 +++++++++++++++++-----------------------
 1 file changed, 73 insertions(+), 102 deletions(-)

diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig
index 7ac46ab5a..2e187c6db 100644
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@@ -52,38 +52,56 @@ pub fn Stream(comptime Handler: type) type {
         }
 
         /// Process a string of characters.
-        pub fn nextSlice(self: *Self, c: []const u8) !void {
-            // If we're not in the ground state then we process until we are.
+        pub fn nextSlice(self: *Self, input: []const u8) !void {
             var offset: usize = 0;
-            if (self.parser.state != .ground) {
-                for (c[offset..]) |single| {
+
+            // If we have a partial UTF-8 sequence then we process manually.
+            if (self.partial_utf8_len > 0) {
+                offset += try self.completePartialUtf8(input);
+            } else if (self.parser.state != .ground) {
+                // If we're not in the ground state then we process until
+                // we are. This can happen if the last chunk of input put us
+                // in the middle of a control sequence.
+                for (input[offset..]) |single| {
                     try self.next(single);
                     offset += 1;
                     if (self.parser.state == .ground) break;
                 }
             }
 
+            // TODO: replace this fixed-size codepoint buffer with something better
+            var cp_buf: [4096]u32 = undefined;
+
             // If we're in the ground state then we can use SIMD to process
             // input until we see an ESC (0x1B), since all other characters
             // up to that point are just UTF-8.
-            while (self.parser.state == .ground and offset < c.len) {
-                // Find the next ESC character to trigger a control sequence.
-                //const idx = std.mem.indexOfScalar(u8, c[offset..], 0x1B) orelse {
-                const idx = simd.index_of.Hwy.indexOf(c[offset..], 0x1B) orelse {
-                    // No ESC character, remainder is all UTF-8.
-                    try self.nextAssumeUtf8(c[offset..]);
+            while (self.parser.state == .ground and offset < input.len) {
+                const res = simd.vt.utf8DecodeUntilControlSeq(input[offset..], &cp_buf);
+                for (cp_buf[0..res.decoded]) |cp| {
+                    if (cp < 0xF) {
+                        try self.execute(@intCast(cp));
+                    } else {
+                        try self.print(@intCast(cp));
+                    }
+                }
+
+                // Consume the bytes we just processed.
+                offset += res.consumed;
+                if (offset >= input.len) return;
+
+                // If our offset is NOT an escape then we must have a
+                // partial UTF-8 sequence. In that case, we save it and
+                // return.
+                if (input[offset] != 0x1B) {
+                    const rem = input[offset..];
+                    assert(rem.len <= self.partial_utf8.len);
+                    @memcpy(self.partial_utf8[0..rem.len], rem);
+                    self.partial_utf8_len = @intCast(rem.len);
                     return;
-                };
+                }
 
-                // Process the UTF-8 characters up to the ESC character.
-                const next_offset = offset + idx;
-                if (idx > 0) try self.nextAssumeUtf8(c[offset..next_offset]);
-
-                // Process the control sequence and bail out once we reach
-                // the ground state which means we're looking for ESC again.
-                offset = next_offset;
-                assert(c[offset] == 0x1B);
-                for (c[offset..]) |single| {
+                // Process our control sequence.
+                for (input[offset..]) |single| {
                     try self.next(single);
                     offset += 1;
                     if (self.parser.state == .ground) break;
@@ -91,98 +109,51 @@ pub fn Stream(comptime Handler: type) type {
             }
         }
 
-        /// Process the data in "input" assuming it is all UTF-8. The UTF-8
-        /// may be invalid and we will replace any invalid sequences with
-        /// the replacement character (U+FFFD).
-        ///
-        /// The input may also be incomplete, i.e. it ends in the middle of
-        /// a UTF-8 sequence. In that case we will process as much as we can
-        /// and save the rest for the next call to nextAssumeUtf8.
-        fn nextAssumeUtf8(self: *Self, input: []const u8) !void {
-            var i: usize = 0;
+        /// Complete a partial UTF-8 sequence left over from a prior input
+        /// chunk: buffer more bytes if still incomplete, otherwise decode and
+        /// print it. Returns the number of bytes consumed from the input.
+        fn completePartialUtf8(self: *Self, input: []const u8) !usize {
+            assert(self.partial_utf8_len > 0);
+            assert(self.parser.state == .ground);
 
-            // If we have a partial UTF-8 sequence from the last call then
-            // we need to process that first.
-            if (self.partial_utf8_len > 0) {
-                // This cannot fail because the nature of partial utf8 existing
-                // means we successfully processed it last time.
-                const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
-                    unreachable;
+            // This cannot fail: a partial UTF-8 sequence only exists because
+            // we successfully processed its first byte last time.
+            const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
+                unreachable;
 
-                // This is the length we need in the input in addition to
-                // our partial_utf8 to complete the sequence.
-                const input_len = len - self.partial_utf8_len;
+            // This is the length we need in the input in addition to
+            // our partial_utf8 to complete the sequence.
+            const input_len = len - self.partial_utf8_len;
 
-                // If we STILL don't have enough bytes, then we copy and continue.
-                // This is a really bizarre and stupid program thats running to
-                // send us incomplete UTF-8 sequences over multiple write() calls.
-                if (input_len > input.len) {
-                    @memcpy(
-                        self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
-                        input,
-                    );
-                    self.partial_utf8_len += @intCast(input.len);
-                    return;
-                }
-
-                // Process the complete UTF-8 sequence.
+            // If we STILL don't have enough bytes, then we copy and return.
+            // This only happens with a really bizarre program that's sending
+            // us an incomplete UTF-8 sequence over multiple write() calls.
+            if (input_len > input.len) {
                 @memcpy(
-                    self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
-                    input[0..input_len],
+                    self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
+                    input,
                 );
-                const cp = cp: {
-                    if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
-                        break :cp cp;
-                    } else |err| {
-                        log.warn("invalid UTF-8, ignoring err={}", .{err});
-                        break :cp 0xFFFD; // replacement character
-                    }
-                };
-
-                self.partial_utf8_len = 0;
-                try self.print(cp);
-                i += input_len;
+                self.partial_utf8_len += @intCast(input.len);
+                return input.len;
             }
 
-            while (i < input.len) {
-                const len = std.unicode.utf8ByteSequenceLength(input[i]) catch |err| {
+            // Process the complete UTF-8 sequence.
+            @memcpy(
+                self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
+                input[0..input_len],
+            );
+            const cp = cp: {
+                if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
+                    break :cp cp;
+                } else |err| {
                     log.warn("invalid UTF-8, ignoring err={}", .{err});
-                    i += 1;
-                    try self.print(@intCast(input[i]));
-                    continue;
-                };
-
-                // If we have exactly one byte and its a control character,
-                // then process it directly.
-                if (len == 1 and input[i] < 0xF) {
-                    try self.execute(@intCast(input[i]));
-                    i += 1;
-                    continue;
+                    break :cp 0xFFFD; // replacement character
                 }
+            };
 
-                // If we have a partial UTF-8 sequence then we save it for
-                // the next call to nextAssumeUtf8.
-                if (i + len > input.len) {
-                    const remaining = input.len - i;
-                    @memcpy(self.partial_utf8[0..remaining], input[i..]);
-                    self.partial_utf8_len = @intCast(remaining);
-                    return;
-                }
-
-                // Decode the UTF-8 sequence and handle any errors by
-                // replacing the character with the replacement character.
-                const cp = cp: {
-                    if (std.unicode.utf8Decode(input[i .. i + len])) |cp| {
-                        break :cp cp;
-                    } else |err| {
-                        log.warn("invalid UTF-8, ignoring err={}", .{err});
-                        break :cp 0xFFFD; // replacement character
-                    }
-                };
-
-                try self.print(cp);
-                i += len;
-            }
+            self.partial_utf8_len = 0;
+            try self.print(cp);
+            return input_len;
         }
 
         /// Process the next character and call any callbacks if necessary.