From f8c544c11978dbe34b70518f7adc66fc40f94821 Mon Sep 17 00:00:00 2001
From: Qwerasd <qwerasd205@users.noreply.github.com>
Date: Wed, 7 Feb 2024 00:12:37 -0500
Subject: [PATCH 1/3] terminal: stream/parser changes

---
 src/terminal/Parser.zig |  4 +--
 src/terminal/stream.zig | 57 ++++++++++++++++++++++++++---------------
 2 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig
index 5746be065..bfb25f2fa 100644
--- a/src/terminal/Parser.zig
+++ b/src/terminal/Parser.zig
@@ -185,7 +185,7 @@ pub const Action = union(enum) {
 
 /// Keeps track of the parameter sep used for CSI params. We allow colons
 /// to be used ONLY by the 'm' CSI action.
-const ParamSepState = enum(u8) {
+pub const ParamSepState = enum(u8) {
     none = 0,
     semicolon = ';',
     colon = ':',
@@ -279,7 +279,7 @@ pub fn next(self: *Parser, c: u8) [3]?Action {
     };
 }
 
-fn collect(self: *Parser, c: u8) void {
+pub fn collect(self: *Parser, c: u8) void {
     if (self.intermediates_idx >= MAX_INTERMEDIATE) {
         log.warn("invalid intermediates count", .{});
         return;
diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig
index 982a8ab20..0362780c2 100644
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@@ -74,17 +74,14 @@ pub fn Stream(comptime Handler: type) type {
                 if (offset >= input.len) return;
                 try self.next(input[offset]);
                 offset += 1;
-            } else if (self.parser.state != .ground) {
-                // If we're not in the ground state then we process until
-                // we are. This can happen if the last chunk of input put us
-                // in the middle of a control sequence.
-                for (input[offset..]) |single| {
-                    try self.nextNonUtf8(single);
-                    offset += 1;
-                    if (self.parser.state == .ground) break;
-                }
             }
 
+            // If we're not in the ground state then we process until
+            // we are. This can happen if the last chunk of input put us
+            // in the middle of a control sequence.
+            offset += try self.consumeUntilGround(input[offset..]);
+            if (offset >= input.len) return;
+
             // If we're in the ground state then we can use SIMD to process
             // input until we see an ESC (0x1B), since all other characters
             // up to that point are just UTF-8.
@@ -112,26 +109,46 @@ pub fn Stream(comptime Handler: type) type {
                 }
 
                 // Process our control sequence.
-                for (input[offset..]) |single| {
-                    try self.nextNonUtf8(single);
-                    offset += 1;
-                    if (self.parser.state == .ground) break;
-                }
+                self.parser.state = .escape;
+                offset += 1;
+                offset += try self.consumeUntilGround(input[offset..]);
             }
         }
 
+        /// Parses escape sequences until the parser reaches the ground state.
+        /// Returns the number of bytes consumed from the provided input.
+        inline fn consumeUntilGround(self: *Self, input: []const u8) !usize {
+            var offset: usize = 0;
+            while (self.parser.state != .ground) {
+                if (offset >= input.len) return input.len;
+                try self.nextNonUtf8(input[offset]);
+                offset += 1;
+            }
+            return offset;
+        }
+
         /// Like nextSlice but takes one byte and is necessarilly a scalar
         /// operation that can't use SIMD. Prefer nextSlice if you can and
         /// try to get multiple bytes at once.
         pub fn next(self: *Self, c: u8) !void {
             // The scalar path can be responsible for decoding UTF-8.
             if (self.parser.state == .ground and c != 0x1B) {
-                var consumed = false;
-                while (!consumed) {
-                    const res = self.utf8decoder.next(c);
-                    consumed = res[1];
-                    if (res[0]) |codepoint| {
-                        if (codepoint <= 0xF) {
+                const res = self.utf8decoder.next(c);
+                const consumed = res[1];
+                if (res[0]) |codepoint| {
+                    if (codepoint < 0xF) {
+                        try self.execute(@intCast(codepoint));
+                    } else {
+                        try self.print(@intCast(codepoint));
+                    }
+                }
+                if (!consumed) {
+                    const retry = self.utf8decoder.next(c);
+                    // It should be impossible for the decoder
+                    // to not consume the byte twice in a row.
+                    assert(retry[1] == true);
+                    if (retry[0]) |codepoint| {
+                        if (codepoint < 0xF) {
                             try self.execute(@intCast(codepoint));
                         } else {
                             try self.print(@intCast(codepoint));

From 68c0813397325236b8876fb70832d86a1e06209d Mon Sep 17 00:00:00 2001
From: Qwerasd <qwerasd205@users.noreply.github.com>
Date: Thu, 8 Feb 2024 21:49:58 -0500
Subject: [PATCH 2/3] terminal/stream: Added ESC parsing fast tracks

---
 src/bench/stream.sh     |   4 +-
 src/terminal/Parser.zig |   2 +-
 src/terminal/stream.zig | 128 ++++++++++++++++++++++++++++++++--------
 3 files changed, 105 insertions(+), 29 deletions(-)

diff --git a/src/bench/stream.sh b/src/bench/stream.sh
index 41d62f234..5f2e4d311 100755
--- a/src/bench/stream.sh
+++ b/src/bench/stream.sh
@@ -17,8 +17,8 @@ SIZE="25000000"
 # Generate the benchmark input ahead of time so it's not included in the time.
 ./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data
 
-# Uncomment to instead use the contents of `stream.txt` as input.
-# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data
+# Uncomment to instead use the contents of `stream.txt` as input. (Ignores SIZE)
+# echo $(cat ./stream.txt) > /tmp/ghostty_bench_data
 
 hyperfine \
   --warmup 10 \
diff --git a/src/terminal/Parser.zig b/src/terminal/Parser.zig
index bfb25f2fa..f160619e2 100644
--- a/src/terminal/Parser.zig
+++ b/src/terminal/Parser.zig
@@ -390,7 +390,7 @@ fn doAction(self: *Parser, action: TransitionAction, c: u8) ?Action {
     };
 }
 
-fn clear(self: *Parser) void {
+pub fn clear(self: *Parser) void {
     self.intermediates_idx = 0;
     self.params_idx = 0;
     self.params_sep = .none;
diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig
index 0362780c2..bdfcf4155 100644
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@@ -72,7 +72,7 @@ pub fn Stream(comptime Handler: type) type {
             // a code sequence, we continue until it's not.
             while (self.utf8decoder.state != 0) {
                 if (offset >= input.len) return;
-                try self.next(input[offset]);
+                try self.nextUtf8(input[offset]);
                 offset += 1;
             }
 
@@ -80,6 +80,7 @@ pub fn Stream(comptime Handler: type) type {
             // we are. This can happen if the last chunk of input put us
             // in the middle of a control sequence.
             offset += try self.consumeUntilGround(input[offset..]);
+            offset += try self.consumeAllEscapes(input[offset..]);
             if (offset >= input.len) return;
 
             // If we're in the ground state then we can use SIMD to process
@@ -94,9 +95,9 @@ pub fn Stream(comptime Handler: type) type {
                         try self.print(@intCast(cp));
                     }
                 }
-
                 // Consume the bytes we just processed.
                 offset += res.consumed;
+
                 if (offset >= input.len) return;
 
                 // If our offset is NOT an escape then we must have a
@@ -104,20 +105,35 @@ pub fn Stream(comptime Handler: type) type {
                 // to the scalar parser.
                 if (input[offset] != 0x1B) {
                     const rem = input[offset..];
-                    for (rem) |c| try self.next(c);
+                    for (rem) |c| try self.nextUtf8(c);
                     return;
                 }
 
-                // Process our control sequence.
+                // Process control sequences until we run out.
+                offset += try self.consumeAllEscapes(input[offset..]);
+            }
+        }
+
+        /// Parses back-to-back escape sequences until none are left.
+        /// Returns the number of bytes consumed from the provided input.
+        ///
+        /// Expects input to start with 0x1B, use consumeUntilGround first
+        /// if the stream may be in the middle of an escape sequence.
+        fn consumeAllEscapes(self: *Self, input: []const u8) !usize {
+            var offset: usize = 0;
+            while (input[offset] == 0x1B) {
                 self.parser.state = .escape;
+                self.parser.clear();
                 offset += 1;
                 offset += try self.consumeUntilGround(input[offset..]);
+                if (offset >= input.len) return input.len;
             }
+            return offset;
         }
 
         /// Parses escape sequences until the parser reaches the ground state.
         /// Returns the number of bytes consumed from the provided input.
-        inline fn consumeUntilGround(self: *Self, input: []const u8) !usize {
+        fn consumeUntilGround(self: *Self, input: []const u8) !usize {
             var offset: usize = 0;
             while (self.parser.state != .ground) {
                 if (offset >= input.len) return input.len;
@@ -133,33 +149,42 @@ pub fn Stream(comptime Handler: type) type {
         pub fn next(self: *Self, c: u8) !void {
             // The scalar path can be responsible for decoding UTF-8.
             if (self.parser.state == .ground and c != 0x1B) {
-                const res = self.utf8decoder.next(c);
-                const consumed = res[1];
-                if (res[0]) |codepoint| {
-                    if (codepoint < 0xF) {
+                try self.nextUtf8(c);
+                return;
+            }
+
+            try self.nextNonUtf8(c);
+        }
+
+        /// Process the next byte and print as necessary.
+        ///
+        /// This assumes we're in the UTF-8 decoding state. If we may not
+        /// be in the UTF-8 decoding state call nextSlice or next.
+        fn nextUtf8(self: *Self, c: u8) !void {
+            assert(self.parser.state == .ground and c != 0x1B);
+
+            const res = self.utf8decoder.next(c);
+            const consumed = res[1];
+            if (res[0]) |codepoint| {
+                if (codepoint <= 0xF) {
+                    try self.execute(@intCast(codepoint));
+                } else {
+                    try self.print(@intCast(codepoint));
+                }
+            }
+            if (!consumed) {
+                const retry = self.utf8decoder.next(c);
+                // It should be impossible for the decoder
+                // to not consume the byte twice in a row.
+                assert(retry[1] == true);
+                if (retry[0]) |codepoint| {
+                    if (codepoint <= 0xF) {
                         try self.execute(@intCast(codepoint));
                     } else {
                         try self.print(@intCast(codepoint));
                     }
                 }
-                if (!consumed) {
-                    const retry = self.utf8decoder.next(c);
-                    // It should be impossible for the decoder
-                    // to not consume the byte twice in a row.
-                    assert(retry[1] == true);
-                    if (retry[0]) |codepoint| {
-                        if (codepoint < 0xF) {
-                            try self.execute(@intCast(codepoint));
-                        } else {
-                            try self.print(@intCast(codepoint));
-                        }
-                    }
-                }
-
-                return;
             }
-
-            try self.nextNonUtf8(c);
         }
 
         /// Process the next character and call any callbacks if necessary.
@@ -169,6 +194,57 @@ pub fn Stream(comptime Handler: type) type {
         fn nextNonUtf8(self: *Self, c: u8) !void {
             assert(self.parser.state != .ground or c == 0x1B);
 
+            // Fast path for ESC
+            if (self.parser.state == .ground and c == 0x1B) {
+                self.parser.state = .escape;
+                self.parser.clear();
+                return;
+            }
+            // Fast path for CSI entry.
+            if (self.parser.state == .escape and c == '[') {
+                self.parser.state = .csi_entry;
+                return;
+            }
+            // Fast path for CSI params.
+            if (self.parser.state == .csi_param) csi_param: {
+                switch (c) {
+                    // A C0 escape (yes, this is valid):
+                    0x00...0x0F => try self.execute(c),
+                    // We ignore C0 escapes > 0xF since execute
+                    // doesn't have processing for them anyway:
+                    0x10...0x17, 0x19, 0x1C...0x1F => {},
+                    // We don't currently have any handling for
+                    // 0x18 or 0x1A, but they should still move
+                    // the parser state to ground.
+                    0x18, 0x1A => self.parser.state = .ground,
+                    // A parameter digit:
+                    '0'...'9' => if (self.parser.params_idx < 16) {
+                        self.parser.param_acc *= 10;
+                        self.parser.param_acc += c - '0';
+                        self.parser.param_acc_idx |= 1;
+                    },
+                    // A parameter separator:
+                    ':', ';' => if (self.parser.params_idx < 16) {
+                        self.parser.params[self.parser.params_idx] = self.parser.param_acc;
+                        self.parser.params_idx += 1;
+
+                        self.parser.param_acc = 0;
+                        self.parser.param_acc_idx = 0;
+
+                        // Keep track of separator state.
+                        const sep: Parser.ParamSepState = @enumFromInt(c);
+                        if (self.parser.params_idx == 1) self.parser.params_sep = sep;
+                        if (self.parser.params_sep != sep) self.parser.params_sep = .mixed;
+                    },
+                    // Explicitly ignored:
+                    0x7F => {},
+                    // Defer to the state machine to
+                    // handle any other characters:
+                    else => break :csi_param,
+                }
+                return;
+            }
+
             const actions = self.parser.next(c);
             for (actions) |action_opt| {
                 const action = action_opt orelse continue;

From 777ecffe6b45b46578e20d5d2b25107e7d6a3a58 Mon Sep 17 00:00:00 2001
From: Qwerasd <qwerasd205@users.noreply.github.com>
Date: Thu, 8 Feb 2024 22:34:21 -0500
Subject: [PATCH 3/3] fix(terminal/stream): fix OOB read and integer overflow

---
 src/terminal/stream.zig | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/terminal/stream.zig b/src/terminal/stream.zig
index bdfcf4155..fc97d3685 100644
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@@ -75,13 +75,14 @@ pub fn Stream(comptime Handler: type) type {
                 try self.nextUtf8(input[offset]);
                 offset += 1;
             }
+            if (offset >= input.len) return;
 
             // If we're not in the ground state then we process until
             // we are. This can happen if the last chunk of input put us
             // in the middle of a control sequence.
             offset += try self.consumeUntilGround(input[offset..]);
-            offset += try self.consumeAllEscapes(input[offset..]);
             if (offset >= input.len) return;
+            offset += try self.consumeAllEscapes(input[offset..]);
 
             // If we're in the ground state then we can use SIMD to process
             // input until we see an ESC (0x1B), since all other characters
@@ -219,8 +220,12 @@ pub fn Stream(comptime Handler: type) type {
                     0x18, 0x1A => self.parser.state = .ground,
                     // A parameter digit:
                     '0'...'9' => if (self.parser.params_idx < 16) {
-                        self.parser.param_acc *= 10;
-                        self.parser.param_acc += c - '0';
+                        self.parser.param_acc *|= 10;
+                        self.parser.param_acc +|= c - '0';
+                        // The parser's CSI param action uses param_acc_idx
+                        // to decide if there's a final param that needs to
+                        // be consumed or not, but it doesn't matter really
+                        // what it is as long as it's not 0.
                         self.parser.param_acc_idx |= 1;
                     },
                     // A parameter separator: