diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig index fe4c5c53a..c562918f0 100644 --- a/src/terminal/stream.zig +++ b/src/terminal/stream.zig @@ -1,5 +1,7 @@ const std = @import("std"); +const assert = std.debug.assert; const testing = std.testing; +const simd = @import("../simd/main.zig"); const Parser = @import("Parser.zig"); const ansi = @import("ansi.zig"); const charsets = @import("charsets.zig"); @@ -40,18 +42,159 @@ pub fn Stream(comptime Handler: type) type { parser: Parser = .{}, utf8decoder: UTF8Decoder = .{}, + /// Keep track of any partial UTF-8 sequences that we need to + /// process in the next call to nextAssumeUtf8. + partial_utf8: [4]u8 = undefined, + partial_utf8_len: u3 = 0, + pub fn deinit(self: *Self) void { self.parser.deinit(); } /// Process a string of characters. pub fn nextSlice(self: *Self, c: []const u8) !void { - for (c) |single| try self.next(single); + // TODO: we only have a direct Neon implementation of the fast + // path right now, just for testing. + if (comptime !simd.isa.possible(.neon)) { + for (c) |single| try self.next(single); + return; + } + + // If we're not in the ground state then we process until we are. + var offset: usize = 0; + if (self.parser.state != .ground) { + for (c[offset..]) |single| { + try self.next(single); + offset += 1; + if (self.parser.state == .ground) break; + } + } + + // If we're in the ground state then we can use SIMD to process + // input until we see an ESC (0x1B), since all other characters + // up to that point are just UTF-8. + while (self.parser.state == .ground and offset < c.len) { + // Find the next ESC character to trigger a control sequence. + //const idx = std.mem.indexOfScalar(u8, c[offset..], 0x1B) orelse { + const idx = simd.index_of.Neon.indexOf(c[offset..], 0x1B) orelse { + // No ESC character, remainder is all UTF-8. + try self.nextAssumeUtf8(c[offset..]); + return; + }; + + // Process the UTF-8 characters up to the ESC character. 
+ const next_offset = offset + idx; + if (idx > 0) try self.nextAssumeUtf8(c[offset..next_offset]); + + // Process the control sequence and bail out once we reach + // the ground state which means we're looking for ESC again. + offset = next_offset; + assert(c[offset] == 0x1B); + for (c[offset..]) |single| { + try self.next(single); + offset += 1; + if (self.parser.state == .ground) break; + } + } + } + + /// Process the data in "input" assuming it is all UTF-8. The UTF-8 + /// may be invalid and we will replace any invalid sequences with + /// the replacement character (U+FFFD). + /// + /// The input may also be incomplete, i.e. it ends in the middle of + /// a UTF-8 sequence. In that case we will process as much as we can + /// and save the rest for the next call to nextAssumeUtf8. + fn nextAssumeUtf8(self: *Self, input: []const u8) !void { + var i: usize = 0; + + // If we have a partial UTF-8 sequence from the last call then + // we need to process that first. + if (self.partial_utf8_len > 0) { + // This cannot fail because the nature of partial utf8 existing + // means we successfully processed it last time. + const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch + unreachable; + + // This is the length we need in the input in addition to + // our partial_utf8 to complete the sequence. + const input_len = len - self.partial_utf8_len; + + // If we STILL don't have enough bytes, then we copy and continue. + // This is a really bizarre and stupid program thats running to + // send us incomplete UTF-8 sequences over multiple write() calls. + if (input_len > input.len) { + @memcpy( + self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len], + input, + ); + self.partial_utf8_len += @intCast(input.len); + return; + } + + // Process the complete UTF-8 sequence. + @memcpy( + self.partial_utf8[self.partial_utf8_len .. 
self.partial_utf8_len + input_len],
+                    input[0..input_len],
+                );
+                const cp = cp: {
+                    if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
+                        break :cp cp;
+                    } else |err| {
+                        log.warn("invalid UTF-8, ignoring err={}", .{err});
+                        break :cp 0xFFFD; // replacement character
+                    }
+                };
+
+                self.partial_utf8_len = 0;
+                try self.print(cp);
+                i += input_len;
+            }
+
+            while (i < input.len) {
+                const len = std.unicode.utf8ByteSequenceLength(input[i]) catch |err| {
+                    log.warn("invalid UTF-8, ignoring err={}", .{err});
+                    try self.print(0xFFFD); // replacement character, per this fn's contract
+                    i += 1; // advance AFTER printing; the old order printed the wrong byte and indexed out of bounds on a trailing invalid byte
+                    continue;
+                };
+
+                // If we have exactly one byte and it's a control character
+                // (C0 range, 0x00-0x1F), then process it directly.
+                if (len == 1 and input[i] < 0x20) { // was 0xF, which missed the 0x10-0x1F controls (CAN, SUB, DC1-DC4, ...)
+                    try self.execute(@intCast(input[i]));
+                    i += 1;
+                    continue;
+                }
+
+                // If we have a partial UTF-8 sequence then we save it for
+                // the next call to nextAssumeUtf8.
+                if (i + len > input.len) {
+                    const remaining = input.len - i;
+                    @memcpy(self.partial_utf8[0..remaining], input[i..]);
+                    self.partial_utf8_len = @intCast(remaining);
+                    return;
+                }
+
+                // Decode the UTF-8 sequence and handle any errors by
+                // replacing the character with the replacement character.
+                const cp = cp: {
+                    if (std.unicode.utf8Decode(input[i .. i + len])) |cp| {
+                        break :cp cp;
+                    } else |err| {
+                        log.warn("invalid UTF-8, ignoring err={}", .{err});
+                        break :cp 0xFFFD; // replacement character
+                    }
+                };
+
+                try self.print(cp);
+                i += len;
+            }
+        }
 
         /// Process the next character and call any callbacks if necessary.
pub fn next(self: *Self, c: u8) !void { - // log.debug("char: {c}", .{c}); + // log.debug("char: {x} {c}", .{ c, c }); if (self.parser.state == .ground and c != 0x1B) { var consumed = false; while (!consumed) { diff --git a/src/termio/Exec.zig b/src/termio/Exec.zig index 2ab631fe8..3a3988ae5 100644 --- a/src/termio/Exec.zig +++ b/src/termio/Exec.zig @@ -1617,44 +1617,8 @@ const ReadThread = struct { log.err("error processing terminal data: {}", .{err}); } } else { - // Process the terminal data. This is an extremely hot part of the - // terminal emulator, so we do some abstraction leakage to avoid - // function calls and unnecessary logic. - // - // The ground state is the only state that we can see and print/execute - // ASCII, so we only execute this hot path if we're already in the ground - // state. - // - // Empirically, this alone improved throughput of large text output by ~20%. - var i: usize = 0; - const end = buf.len; - if (ev.terminal_stream.parser.state == .ground) { - for (buf[i..end]) |ch| { - switch (terminal.parse_table.table[ch][@intFromEnum(terminal.Parser.State.ground)].action) { - // Print, call directly. - .print => ev.terminal_stream.handler.print(@intCast(ch)) catch |err| - log.err("error processing terminal data: {}", .{err}), - - // C0 execute, let our stream handle this one but otherwise - // continue since we're guaranteed to be back in ground. - .execute => ev.terminal_stream.execute(ch) catch |err| - log.err("error processing terminal data: {}", .{err}), - - // Otherwise, break out and go the slow path until we're - // back in ground. There is a slight optimization here where - // could try to find the next transition to ground but when - // I implemented that it didn't materially change performance. 
- else => break, - } - - i += 1; - } - } - - if (i < end) { - ev.terminal_stream.nextSlice(buf[i..end]) catch |err| - log.err("error processing terminal data: {}", .{err}); - } + ev.terminal_stream.nextSlice(buf) catch |err| + log.err("error processing terminal data: {}", .{err}); } // If our stream handling caused messages to be sent to the writer