simd: convert indexOf, mess around with simdvt

src/simd/index_of.zig (path inferred from the simd.index_of references later in this commit):

@@ -9,17 +9,26 @@ const aarch64 = @import("aarch64.zig");
 // time of writing this comment, reimplements it using manual assembly. This is
 // so I can compare to Zig's @Vector lowering.
 
-const IndexOf = @TypeOf(indexOf);
+pub const IndexOf = fn ([]const u8, u8) ?usize;
 
-/// Returns the first index of `needle` in `input` or `null` if `needle`
-/// is not found.
-pub fn indexOf(input: []const u8, needle: u8) ?usize {
-    return indexOfNeon(input, needle);
-    //return indexOfScalar(input, needle);
+/// Returns the indexOf function for the given ISA.
+pub fn indexOfFunc(v: isa.ISA) *const IndexOf {
+    return isa.funcMap(IndexOf, v, .{
+        .{ .avx2, Scalar.indexOf }, // TODO: scalar stand-in until an AVX2 version exists
+        .{ .neon, Neon.indexOf },
+        .{ .scalar, Scalar.indexOf },
+    });
 }
 
-/// indexOf implementation using ARM NEON instructions.
-fn indexOfNeon(input: []const u8, needle: u8) ?usize {
+pub const Scalar = struct {
+    pub fn indexOf(input: []const u8, needle: u8) ?usize {
+        return std.mem.indexOfScalar(u8, input, needle);
+    }
+};
+
+pub const Neon = struct {
+    /// indexOf implementation using ARM NEON instructions.
+    pub fn indexOf(input: []const u8, needle: u8) ?usize {
         // This function is going to be commented in a lot of detail. SIMD is
         // complicated and nonintuitive, so I want to make sure I understand what's
         // going on. More importantly, I want to make sure when I look back on this
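
For orientation, a minimal caller sketch of the new dispatch API (my illustration, not code from this commit; it assumes src/simd/main.zig re-exports index_of, which the parser added below relies on):

const simd = @import("simd/main.zig"); // assumed import path

// `.scalar` is always a safe choice; a real caller would pass an ISA
// obtained from isa.detect(), as the updated test below does.
fn firstNewline(input: []const u8) ?usize {
    const f: *const simd.index_of.IndexOf = simd.index_of.indexOfFunc(.scalar);
    return f(input, '\n');
}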
@@ -34,31 +43,11 @@ fn indexOfNeon(input: []const u8, needle: u8) ?usize {
         // Iterate 16 bytes at a time, which is the max size of a vector register.
         var i: usize = 0;
         while (i + 16 <= input.len) : (i += 16) {
-            // Load the next 16 bytes into a vector register.
             const input_vec = aarch64.vld1q_u8(input[i..]);
-
-            // Compare the input vector to the needle vector. This will set
-            // all bits to "1" in the output vector for each matching byte.
-            const match_vec = aarch64.vceqq_u8(input_vec, needle_vec);
-
-            // This is a neat trick in order to efficiently find the index of
-            // the first matching byte. Details for this can be found here:
-            // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
-            const shift_vec = aarch64.vshrn_n_u16(@bitCast(match_vec), 4);
-            const shift_u64 = aarch64.vget_lane_u64(@bitCast(shift_vec));
-            if (shift_u64 == 0) {
-                // This means no matches were found.
-                continue;
-            }
-
-            // A match was found! Reverse the bits and divide by 4 to get the
-            // index of the first matching byte. The reversal is due to the
-            // bits being reversed in the shift operation, the division by 4
-            // is due to all data being repeated 4 times by vceqq.
-            const reversed = aarch64.rbit(u64, shift_u64);
-            const index = aarch64.clz(u64, reversed) >> 2;
-            return i + index;
+            if (indexOfVec(input_vec, needle_vec)) |index| {
+                return i + index;
+            }
         }
 
         // Handle the remaining bytes
         if (i < input.len) {
@@ -70,12 +59,31 @@ fn indexOfNeon(input: []const u8, needle: u8) ?usize {
         return null;
     }
 
-fn indexOfScalar(input: []const u8, needle: u8) ?usize {
-    // Note this actually uses vector operations if supported. See
-    // our comment at the top of the file.
-    return std.mem.indexOfScalar(u8, input, needle);
-}
+    pub fn indexOfVec(input_vec: @Vector(16, u8), needle_vec: @Vector(16, u8)) ?usize {
+        // Compare the input vector to the needle vector. This will set
+        // all bits to "1" in the output vector for each matching byte.
+        const match_vec = aarch64.vceqq_u8(input_vec, needle_vec);
+
+        // This is a neat trick in order to efficiently find the index of
+        // the first matching byte. Details for this can be found here:
+        // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+        const shift_vec = aarch64.vshrn_n_u16(@bitCast(match_vec), 4);
+        const shift_u64 = aarch64.vget_lane_u64(@bitCast(shift_vec));
+        if (shift_u64 == 0) {
+            // This means no matches were found.
+            return null;
+        }
+
+        // A match was found! Reverse the bits and divide by 4 to get the
+        // index of the first matching byte. The reversal is due to the
+        // bits being reversed in the shift operation, the division by 4
+        // is due to all data being repeated 4 times by vceqq.
+        const reversed = aarch64.rbit(u64, shift_u64);
+        const index = aarch64.clz(u64, reversed) >> 2;
+        return index;
+    }
+};
 
 /// Generic test function so we can test against multiple implementations.
 fn testIndexOf(func: *const IndexOf) !void {
     const testing = std.testing;
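
The vceqq → vshrn → rbit/clz pipeline above is the heart of this commit. Since the file's own comment says the goal is comparing against Zig's @Vector lowering, here is a portable @Vector sketch of the same nibble-mask trick (my illustration, not code from the commit; it relies on AArch64's little-endian lane order and uses @ctz where the NEON code needs rbit+clz, because AArch64 has no trailing-zero-count instruction):

const std = @import("std");

fn indexOfVecPortable(input_vec: @Vector(16, u8), needle_vec: @Vector(16, u8)) ?usize {
    // Lane-wise equality, widened to 0xFF / 0x00 per byte like vceqq_u8.
    const ones: @Vector(16, u8) = @splat(0xFF);
    const zeroes: @Vector(16, u8) = @splat(0);
    const match: @Vector(16, u8) = @select(u8, input_vec == needle_vec, ones, zeroes);

    // vshrn_n_u16(match, 4): view the 16 bytes as 8 u16 lanes, shift right
    // by 4, narrow back to u8. Each input byte contributes exactly one
    // nibble, collapsing the whole register into a single u64 mask.
    const wide: @Vector(8, u16) = @bitCast(match);
    const shift: @Vector(8, u4) = @splat(4);
    const narrowed: @Vector(8, u8) = @truncate(wide >> shift);
    const mask: u64 = @bitCast(narrowed);
    if (mask == 0) return null;

    // The first matching byte k sets bits 4k..4k+3, so ctz/4 is its index.
    return @ctz(mask) >> 2;
}

test "indexOfVecPortable" {
    const arr: [16]u8 = "abcdefghijklmnop".*;
    const input: @Vector(16, u8) = arr;
    const needle: @Vector(16, u8) = @splat('d');
    try std.testing.expectEqual(@as(?usize, 3), indexOfVecPortable(input, needle));
}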
@@ -91,8 +99,8 @@ fn testIndexOf(func: *const IndexOf) !void {
     , ' ').?);
 }
 
-test "indexOf neon" {
-    if (comptime !isa.possible(.neon)) return error.SkipZigTest;
-    const set = isa.detect();
-    if (set.contains(.neon)) try testIndexOf(&indexOfNeon);
+test "indexOf" {
+    const v = isa.detect();
+    var it = v.iterator();
+    while (it.next()) |isa_v| try testIndexOf(indexOfFunc(isa_v));
 }
src/simd/main.zig (path inferred from parser.zig's @import("../../simd/main.zig") below):

@@ -1,6 +1,9 @@
 const std = @import("std");
 
 pub const isa = @import("isa.zig");
+
+pub const aarch64 = @import("aarch64.zig");
+
 pub const utf8_count = @import("utf8_count.zig");
 pub const utf8_decode = @import("utf8_decode.zig");
 pub const utf8_validate = @import("utf8_validate.zig");
src/terminal/main.zig (path inferred; this is the "../main.zig" that parser.zig imports as terminal):

@@ -43,6 +43,11 @@ pub const EraseLine = csi.EraseLine;
 pub const TabClear = csi.TabClear;
 pub const Attribute = sgr.Attribute;
 
+// TODO: we only have a hardcoded Neon implementation for now
+pub usingnamespace if (builtin.target.cpu.arch == .aarch64) struct {
+    pub const simdvt = @import("simdvt.zig");
+} else struct {};
+
 /// If we're targeting wasm then we export some wasm APIs.
 pub usingnamespace if (builtin.target.isWasm()) struct {
     pub usingnamespace @import("wasm.zig");
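
Because simdvt now only exists in the terminal namespace on aarch64, callers must guard with the same comptime condition. A hypothetical consumer sketch (not from this commit; the import path and the scalar fallback are assumptions):

const builtin = @import("builtin");
const terminal = @import("terminal/main.zig"); // assumed import path

pub fn feedSimd(handler: anytype, input: []const u8) void {
    if (comptime builtin.target.cpu.arch == .aarch64) {
        // Parser is the namespace of the simdvt parser file, which exports Stream.
        var s = terminal.simdvt.Parser.Stream(@TypeOf(handler)).init(handler);
        s.feed(input);
    } else {
        // On other architectures, fall back to the scalar terminal.Stream
        // path (elided in this sketch).
        @panic("scalar fallback not shown");
    }
}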
src/terminal/simdvt.zig (new file, 5 lines):

@@ -0,0 +1,5 @@
+pub const Parser = @import("simdvt/Parser.zig");
+
+test {
+    @import("std").testing.refAllDecls(@This());
+}
src/terminal/simdvt/parser.zig (new file, 82 lines):

@@ -0,0 +1,82 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const ArenaAllocator = std.heap.ArenaAllocator;
+
+const terminal = @import("../main.zig");
+const ScalarStream = terminal.Stream;
+const simd = @import("../../simd/main.zig");
+const aarch64 = simd.aarch64;
+
+pub fn Stream(comptime Handler: type) type {
+    return struct {
+        const Self = @This();
+
+        handler: Handler,
+
+        pub fn init(h: Handler) Self {
+            return .{ .handler = h };
+        }
+
+        pub fn feed(self: *Self, input: []const u8) void {
+            // TODO: I want to do the UTF-8 decoding as we stream the input,
+            // but I don't want to deal with UTF-8 decode in SIMD right now.
+            // So for now we just go back over the input and decode using
+            // a scalar loop. Ugh.
+
+            // We search for ESC (0x1B) very frequently, since this is what triggers
+            // the start of a terminal escape sequence of any kind, so put this into
+            // a register immediately.
+            const esc_vec = aarch64.vdupq_n_u8(0x1B);
+
+            // Iterate 16 bytes at a time, which is the max size of a vector register.
+            var i: usize = 0;
+            while (i + 16 <= input.len) : (i += 16) {
+                // Load the next 16 bytes into a vector register.
+                const input_vec = aarch64.vld1q_u8(input[i..]);
+
+                // Check for ESC to determine if we should go to the next state.
+                if (simd.index_of.Neon.indexOfVec(input_vec, esc_vec)) |index| {
+                    _ = index;
+                    @panic("TODO");
+                }
+
+                // No ESC found, decode UTF-8.
+                // TODO(mitchellh): I don't have a UTF-8 decoder in SIMD yet, so
+                // for now we just use a scalar loop. This is slow.
+                const view = std.unicode.Utf8View.initUnchecked(input[i .. i + 16]);
+                var it = view.iterator();
+                while (it.nextCodepoint()) |cp| {
+                    self.handler.print(cp);
+                }
+            }
+
+            // Handle the remaining bytes
+            if (i < input.len) {
+                @panic("input must be a multiple of 16 bytes for now");
+            }
+        }
+    };
+}
+
+test "ascii" {
+    const testing = std.testing;
+    var arena = ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    const H = struct {
+        const Self = @This();
+        alloc: Allocator,
+        buf: std.ArrayListUnmanaged(u21) = .{},
+
+        pub fn print(self: *Self, c: u21) void {
+            self.buf.append(self.alloc, c) catch unreachable;
+        }
+    };
+
+    const str = "hello" ** 16;
+    var s = Stream(H).init(.{ .alloc = alloc });
+    s.feed(str);
+
+    try testing.expectEqual(str.len, s.handler.buf.items.len);
+}
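
The @panic on trailing bytes is a placeholder. A natural follow-up, sketched here as my own suggestion rather than anything in the commit, is to decode the final sub-16-byte tail with the same scalar UTF-8 loop feed() already uses:

const std = @import("std");

/// Hypothetical tail handler: feed() could call feedTail(&self.handler, input[i..])
/// instead of panicking when fewer than 16 bytes remain.
fn feedTail(handler: anytype, tail: []const u8) void {
    // Too short for a full vector register load; reuse the scalar loop.
    // initUnchecked skips validation, matching the main loop's assumption
    // that the input is valid UTF-8.
    const view = std.unicode.Utf8View.initUnchecked(tail);
    var it = view.iterator();
    while (it.nextCodepoint()) |cp| handler.print(cp);
}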