terminal: use new VT simd to process slices

This commit is contained in:
Mitchell Hashimoto
2024-02-04 10:59:27 -08:00
parent 50e7159288
commit 351d9eb402

View File

@ -52,38 +52,56 @@ pub fn Stream(comptime Handler: type) type {
} }
/// Process a string of characters. /// Process a string of characters.
pub fn nextSlice(self: *Self, c: []const u8) !void { pub fn nextSlice(self: *Self, input: []const u8) !void {
// If we're not in the ground state then we process until we are.
var offset: usize = 0; var offset: usize = 0;
if (self.parser.state != .ground) {
for (c[offset..]) |single| { // If we have a partial UTF-8 sequence then we process manually.
if (self.partial_utf8_len > 0) {
offset += try self.completePartialUtf8(input);
} else if (self.parser.state != .ground) {
// If we're not in the ground state then we process until
// we are. This can happen if the last chunk of input put us
// in the middle of a control sequence.
for (input[offset..]) |single| {
try self.next(single); try self.next(single);
offset += 1; offset += 1;
if (self.parser.state == .ground) break; if (self.parser.state == .ground) break;
} }
} }
// TODO: do something better
var cp_buf: [4096]u32 = undefined;
// If we're in the ground state then we can use SIMD to process // If we're in the ground state then we can use SIMD to process
// input until we see an ESC (0x1B), since all other characters // input until we see an ESC (0x1B), since all other characters
// up to that point are just UTF-8. // up to that point are just UTF-8.
while (self.parser.state == .ground and offset < c.len) { while (self.parser.state == .ground and offset < input.len) {
// Find the next ESC character to trigger a control sequence. const res = simd.vt.utf8DecodeUntilControlSeq(input[offset..], &cp_buf);
//const idx = std.mem.indexOfScalar(u8, c[offset..], 0x1B) orelse { for (cp_buf[0..res.decoded]) |cp| {
const idx = simd.index_of.Hwy.indexOf(c[offset..], 0x1B) orelse { if (cp < 0xF) {
// No ESC character, remainder is all UTF-8. try self.execute(@intCast(cp));
try self.nextAssumeUtf8(c[offset..]); } else {
try self.print(@intCast(cp));
}
}
// Consume the bytes we just processed.
offset += res.consumed;
if (offset >= input.len) return;
// If our offset is NOT an escape then we must have a
// partial UTF-8 sequence. In that case, we save it and
// return.
if (input[offset] != 0x1B) {
const rem = input[offset..];
assert(rem.len <= self.partial_utf8.len);
@memcpy(self.partial_utf8[0..rem.len], rem);
self.partial_utf8_len = @intCast(rem.len);
return; return;
}; }
// Process the UTF-8 characters up to the ESC character. // Process our control sequence.
const next_offset = offset + idx; for (input[offset..]) |single| {
if (idx > 0) try self.nextAssumeUtf8(c[offset..next_offset]);
// Process the control sequence and bail out once we reach
// the ground state which means we're looking for ESC again.
offset = next_offset;
assert(c[offset] == 0x1B);
for (c[offset..]) |single| {
try self.next(single); try self.next(single);
offset += 1; offset += 1;
if (self.parser.state == .ground) break; if (self.parser.state == .ground) break;
@ -91,98 +109,51 @@ pub fn Stream(comptime Handler: type) type {
} }
} }
/// Process the data in "input" assuming it is all UTF-8. The UTF-8 // Complete a partial UTF-8 sequence from a prior input chunk.
/// may be invalid and we will replace any invalid sequences with // This processes the UTF-8 sequence and then returns the number
/// the replacement character (U+FFFD). // of bytes consumed from the input.
/// fn completePartialUtf8(self: *Self, input: []const u8) !usize {
/// The input may also be incomplete, i.e. it ends in the middle of assert(self.partial_utf8_len > 0);
/// a UTF-8 sequence. In that case we will process as much as we can assert(self.parser.state == .ground);
/// and save the rest for the next call to nextAssumeUtf8.
fn nextAssumeUtf8(self: *Self, input: []const u8) !void {
var i: usize = 0;
// If we have a partial UTF-8 sequence from the last call then // This cannot fail because the nature of partial utf8
// we need to process that first. // existing means we successfully processed it last time.
if (self.partial_utf8_len > 0) { const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
// This cannot fail because the nature of partial utf8 existing unreachable;
// means we successfully processed it last time.
const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch
unreachable;
// This is the length we need in the input in addition to // This is the length we need in the input in addition to
// our partial_utf8 to complete the sequence. // our partial_utf8 to complete the sequence.
const input_len = len - self.partial_utf8_len; const input_len = len - self.partial_utf8_len;
// If we STILL don't have enough bytes, then we copy and continue. // If we STILL don't have enough bytes, then we copy and continue.
// This is a really bizarre and stupid program thats running to // This is a really bizarre and stupid program thats running to
// send us incomplete UTF-8 sequences over multiple write() calls. // send us incomplete UTF-8 sequences over multiple write() calls.
if (input_len > input.len) { if (input_len > input.len) {
@memcpy(
self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
input,
);
self.partial_utf8_len += @intCast(input.len);
return;
}
// Process the complete UTF-8 sequence.
@memcpy( @memcpy(
self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len], self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
input[0..input_len], input,
); );
const cp = cp: { self.partial_utf8_len += @intCast(input.len);
if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| { return input.len;
break :cp cp;
} else |err| {
log.warn("invalid UTF-8, ignoring err={}", .{err});
break :cp 0xFFFD; // replacement character
}
};
self.partial_utf8_len = 0;
try self.print(cp);
i += input_len;
} }
while (i < input.len) { // Process the complete UTF-8 sequence.
const len = std.unicode.utf8ByteSequenceLength(input[i]) catch |err| { @memcpy(
self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
input[0..input_len],
);
const cp = cp: {
if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
break :cp cp;
} else |err| {
log.warn("invalid UTF-8, ignoring err={}", .{err}); log.warn("invalid UTF-8, ignoring err={}", .{err});
i += 1; break :cp 0xFFFD; // replacement character
try self.print(@intCast(input[i]));
continue;
};
// If we have exactly one byte and its a control character,
// then process it directly.
if (len == 1 and input[i] < 0xF) {
try self.execute(@intCast(input[i]));
i += 1;
continue;
} }
};
// If we have a partial UTF-8 sequence then we save it for self.partial_utf8_len = 0;
// the next call to nextAssumeUtf8. try self.print(cp);
if (i + len > input.len) { return input_len;
const remaining = input.len - i;
@memcpy(self.partial_utf8[0..remaining], input[i..]);
self.partial_utf8_len = @intCast(remaining);
return;
}
// Decode the UTF-8 sequence and handle any errors by
// replacing the character with the replacement character.
const cp = cp: {
if (std.unicode.utf8Decode(input[i .. i + len])) |cp| {
break :cp cp;
} else |err| {
log.warn("invalid UTF-8, ignoring err={}", .{err});
break :cp 0xFFFD; // replacement character
}
};
try self.print(cp);
i += len;
}
} }
/// Process the next character and call any callbacks if necessary. /// Process the next character and call any callbacks if necessary.