simd: remove old attempts

Mitchell Hashimoto
2024-02-04 11:19:19 -08:00
parent 449d3b49a4
commit 12885a445a
14 changed files with 13 additions and 5708 deletions

@@ -1007,9 +1007,6 @@ fn addDeps(
step.linkLibCpp();
step.addIncludePath(.{ .path = "src" });
step.addIncludePath(.{ .path = "src/simd" });
step.addCSourceFiles(.{ .files = &.{"src/simd/simdutf_c.cpp"} });
step.addIncludePath(.{ .path = "src/terminal/simdvt" });
step.addCSourceFiles(.{ .files = &.{"src/terminal/simdvt/example.cpp"} });
step.addCSourceFiles(.{ .files = &.{
"src/simd/index_of.cpp",
"src/simd/vt.cpp",

@@ -1,395 +0,0 @@
// https://developer.arm.com/architectures/instruction-sets/intrinsics
// https://llvm.org/docs/LangRef.html#inline-assembler-expressions
const std = @import("std");
const assert = std.debug.assert;
pub inline fn vaddlvq_u8(v: @Vector(16, u8)) u16 {
const result = asm (
\\ uaddlv %[ret:h], %[v].16b
: [ret] "=w" (-> @Vector(8, u16)),
: [v] "w" (v),
);
return result[0];
}
pub inline fn vaddvq_u8(v: @Vector(16, u8)) u8 {
const result = asm (
\\ addv %[ret:b], %[v].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [v] "w" (v),
);
return result[0];
}
pub inline fn vaddv_u8(v: @Vector(8, u8)) u8 {
const result = asm (
\\ addv %[ret:b], %[v].8b
: [ret] "=w" (-> @Vector(8, u8)),
: [v] "w" (v),
);
return result[0];
}
pub inline fn vandq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ and %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vandq_u16(a: @Vector(8, u16), b: @Vector(8, u16)) @Vector(8, u16) {
return asm (
\\ and %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(8, u16)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vandq_u32(a: @Vector(4, u32), b: @Vector(4, u32)) @Vector(4, u32) {
return asm (
\\ and %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(4, u32)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vbicq_u16(a: @Vector(8, u16), b: @Vector(8, u16)) @Vector(8, u16) {
return asm (
\\ bic %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(8, u16)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vbslq_u32(
a: @Vector(4, u32),
b: @Vector(4, u32),
c: @Vector(4, u32),
) @Vector(4, u32) {
return asm (
\\ mov %[ret].16b, %[a].16b
\\ bsl %[ret].16b, %[b].16b, %[c].16b
: [ret] "=&w" (-> @Vector(4, u32)),
: [a] "w" (a),
[b] "w" (b),
[c] "w" (c),
);
}
pub inline fn vceqq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ cmeq %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vcgeq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ cmhs %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vcgtq_s8(a: @Vector(16, i8), b: @Vector(16, i8)) @Vector(16, u8) {
return asm (
\\ cmgt %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vcgtq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ cmhi %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vcltq_s8(a: @Vector(16, i8), b: @Vector(16, i8)) @Vector(16, u8) {
return asm (
\\ cmgt %[ret].16b, %[b].16b, %[a].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vcnt_u8(v: @Vector(8, u8)) @Vector(8, u8) {
return asm (
\\ cnt %[ret].8b, %[v].8b
: [ret] "=w" (-> @Vector(8, u8)),
: [v] "w" (v),
);
}
pub inline fn vcreate_u8(v: u64) @Vector(8, u8) {
return asm (
\\ ins %[ret].D[0], %[value]
: [ret] "=w" (-> @Vector(8, u8)),
: [value] "r" (v),
);
}
pub inline fn vdupq_n_s8(v: i8) @Vector(16, i8) {
return asm (
\\ dup %[ret].16b, %[value:w]
: [ret] "=w" (-> @Vector(16, i8)),
: [value] "r" (v),
);
}
pub inline fn vdupq_n_u8(v: u8) @Vector(16, u8) {
return asm (
\\ dup %[ret].16b, %[value:w]
: [ret] "=w" (-> @Vector(16, u8)),
: [value] "r" (v),
);
}
pub inline fn veorq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ eor %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vextq_u8(a: @Vector(16, u8), b: @Vector(16, u8), n: u8) @Vector(16, u8) {
assert(n <= 16);
return asm (
\\ ext %[ret].16b, %[a].16b, %[b].16b, %[n]
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
[n] "I" (n),
);
}
pub inline fn vget_lane_u64(v: @Vector(1, u64)) u64 {
return asm (
\\ umov %[ret], %[v].d[0]
: [ret] "=r" (-> u64),
: [v] "w" (v),
);
}
pub inline fn vgetq_lane_u16(v: @Vector(8, u16), n: u3) u16 {
return asm (
\\ umov %[ret:w], %[v].h[%[n]]
: [ret] "=r" (-> u16),
: [v] "w" (v),
[n] "I" (n),
);
}
pub inline fn vgetq_lane_u64(v: @Vector(2, u64), n: u1) u64 {
return asm (
\\ umov %[ret], %[v].d[%[n]]
: [ret] "=r" (-> u64),
: [v] "w" (v),
[n] "I" (n),
);
}
pub inline fn vld1q_u8(v: []const u8) @Vector(16, u8) {
return asm (
\\ ld1 { %[ret].16b }, [%[value]]
: [ret] "=w" (-> @Vector(16, u8)),
: [value] "r" (v.ptr),
);
}
pub inline fn vmaxvq_u8(v: @Vector(16, u8)) u8 {
const result = asm (
\\ umaxv %[ret:b], %[v].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [v] "w" (v),
);
return result[0];
}
pub inline fn vmovq_n_u32(v: u32) @Vector(4, u32) {
return asm (
\\ dup %[ret].4s, %[value:w]
: [ret] "=w" (-> @Vector(4, u32)),
: [value] "r" (v),
);
}
pub inline fn vmovq_n_u16(v: u16) @Vector(8, u16) {
return asm (
\\ dup %[ret].8h, %[value:w]
: [ret] "=w" (-> @Vector(8, u16)),
: [value] "r" (v),
);
}
pub inline fn vorrq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ orr %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vpaddq_u8(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ addp %[ret].16b, %[a].16b, %[b].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
);
}
pub inline fn vqtbl1q_u8(t: @Vector(16, u8), idx: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ tbl %[ret].16b, { %[t].16b }, %[idx].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [idx] "w" (idx),
[t] "w" (t),
);
}
pub inline fn vqtbl2q_u8(t: [2]@Vector(16, u8), idx: @Vector(16, u8)) @Vector(16, u8) {
return asm (
\\ tbl %[ret].16b, { %[t0].16b, %[t1].16b }, %[idx].16b
: [ret] "=w" (-> @Vector(16, u8)),
: [idx] "w" (idx),
[t0] "w" (t[0]),
[t1] "w" (t[1]),
);
}
pub inline fn vshrn_n_u16(a: @Vector(8, u16), n: u4) @Vector(8, u8) {
assert(n <= 8);
return asm (
\\ shrn %[ret].8b, %[a].8h, %[n]
: [ret] "=w" (-> @Vector(8, u8)),
: [a] "w" (a),
[n] "I" (n),
);
}
pub inline fn vshrq_n_u8(a: @Vector(16, u8), n: u8) @Vector(16, u8) {
assert(n <= 8);
return asm (
\\ ushr %[ret].16b, %[a].16b, %[n]
: [ret] "=w" (-> @Vector(16, u8)),
: [a] "w" (a),
[n] "I" (n),
);
}
pub inline fn vshrq_n_u32(a: @Vector(4, u32), n: u8) @Vector(4, u32) {
assert(n <= 32);
return asm (
\\ ushr %[ret].4s, %[a].4s, %[n]
: [ret] "=w" (-> @Vector(4, u32)),
: [a] "w" (a),
[n] "I" (n),
);
}
pub inline fn vsraq_n_u8(a: @Vector(16, u8), b: @Vector(16, u8), n: u8) @Vector(16, u8) {
assert(n <= 8);
return asm (
\\ mov %[ret].16b, %[a].16b
\\ usra %[ret].16b, %[b].16b, %[n]
: [ret] "=&w" (-> @Vector(16, u8)),
: [a] "w" (a),
[b] "w" (b),
[n] "I" (n),
);
}
pub inline fn vsraq_n_u16(a: @Vector(8, u16), b: @Vector(8, u16), n: u4) @Vector(8, u16) {
assert(n <= 16);
// note: usra accumulates into its destination register; I couldn't find a
// safe way to express that constraint to the compiler without the extra mov.
return asm (
\\ mov %[ret].8h, %[a].8h
\\ usra %[ret].8h, %[b].8h, %[n]
: [ret] "=&w" (-> @Vector(8, u16)),
: [a] "w" (a),
[b] "w" (b),
[n] "I" (n),
);
}
pub inline fn vsraq_n_u32(a: @Vector(4, u32), b: @Vector(4, u32), n: u8) @Vector(4, u32) {
assert(n <= 32);
return asm (
\\ mov %[ret].4s, %[a].4s
\\ usra %[ret].4s, %[b].4s, %[n]
: [ret] "=&w" (-> @Vector(4, u32)),
: [a] "w" (a),
[b] "w" (b),
[n] "I" (n),
);
}
pub inline fn vst1q_u8(out: [*]u8, a: @Vector(16, u8)) void {
asm volatile (
\\ st1 { %[a].16b }, [%[out]]
:
: [out] "r" (out),
[a] "w" (a),
);
}
pub inline fn vst1q_u32(out: [*]u32, a: @Vector(4, u32)) void {
asm volatile (
\\ st1 { %[a].4s }, [%[out]]
:
: [out] "r" (out),
[a] "w" (a),
);
}
pub inline fn vst2q_u16(out: [*]u16, vs: [2]@Vector(8, u16)) void {
asm volatile (
\\ st2 { %[v1].8h - %[v2].8h }, [%[out]]
:
: [out] "r" (out),
[v1] "w" (vs[0]),
[v2] "w" (vs[1]),
);
}
pub inline fn rbit(comptime T: type, v: T) T {
assert(T == u32 or T == u64);
return asm (
\\ rbit %[ret], %[v]
: [ret] "=r" (-> T),
: [v] "r" (v),
);
}
pub inline fn clz(comptime T: type, v: T) T {
assert(T == u32 or T == u64);
return asm (
\\ clz %[ret], %[v]
: [ret] "=r" (-> T),
: [v] "r" (v),
);
}
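For comparison with Zig's @Vector lowering mentioned in index_of.zig, a few of the wrappers above have straightforward portable equivalents. This is an illustrative sketch only (hypothetical helpers, assuming a recent Zig with single-argument @splat), not code from the repository:

const std = @import("std");

// vdupq_n_u8: broadcast a byte across a 16-byte vector.
fn dupq(n: u8) @Vector(16, u8) {
    return @splat(n);
}

// vceqq_u8: 0xFF where the lanes are equal, 0x00 otherwise.
fn ceqq(a: @Vector(16, u8), b: @Vector(16, u8)) @Vector(16, u8) {
    const ones: @Vector(16, u8) = @splat(0xFF);
    const zeros: @Vector(16, u8) = @splat(0);
    return @select(u8, a == b, ones, zeros);
}

// vmaxvq_u8: horizontal max across the vector.
fn maxvq(v: @Vector(16, u8)) u8 {
    return @reduce(.Max, v);
}

test "portable equivalents" {
    const v = dupq(3);
    try std.testing.expectEqual(@as(u8, 3), maxvq(v));
    try std.testing.expectEqual(@as(u8, 0xFF), ceqq(v, dupq(3))[0]);
}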

@@ -1,120 +1,27 @@
const std = @import("std");
const builtin = @import("builtin");
const isa = @import("isa.zig");
const aarch64 = @import("aarch64.zig");
// Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
// version is already SIMD-optimized but not using runtime ISA detection. This
// expands the stdlib version to use runtime ISA detection. This also, at the
// time of writing this comment, reimplements it using manual assembly. This is
// so I can compare to Zig's @Vector lowering.
extern "c" fn ghostty_simd_index_of(
needle: u8,
input: [*]const u8,
count: usize,
) usize;
pub const IndexOf = fn ([]const u8, u8) ?usize;
/// Returns the indexOf function for the given ISA.
pub fn indexOfFunc(v: isa.ISA) *const IndexOf {
return isa.funcMap(IndexOf, v, .{
.{ .avx2, Scalar.indexOf }, // todo
.{ .neon, Neon.indexOf },
.{ .scalar, Scalar.indexOf },
});
pub fn indexOf(input: []const u8, needle: u8) ?usize {
const result = ghostty_simd_index_of(needle, input.ptr, input.len);
return if (result == input.len) null else result;
}
pub const Scalar = struct {
pub fn indexOf(input: []const u8, needle: u8) ?usize {
return std.mem.indexOfScalar(u8, input, needle);
}
};
pub const Neon = struct {
/// indexOf implementation using ARM NEON instructions.
pub fn indexOf(input: []const u8, needle: u8) ?usize {
// This function is commented in a lot of detail. SIMD is complicated and
// nonintuitive, so I want to make sure I understand what's going on now and
// can still follow this code when I come back to it in the future.
// Load our needle into a vector register. This duplicates the needle 16
// times, once for each byte in the 128-bit vector register.
const needle_vec = aarch64.vdupq_n_u8(needle);
// note(mitchellh): benchmark to see if we should align to 16 bytes here
// Iterate 16 bytes at a time, which is the max size of a vector register.
var i: usize = 0;
while (i + 16 <= input.len) : (i += 16) {
const input_vec = aarch64.vld1q_u8(input[i..]);
if (indexOfVec(input_vec, needle_vec)) |index| {
return i + index;
}
}
// Handle the remaining bytes
if (i < input.len) {
while (i < input.len) : (i += 1) {
if (input[i] == needle) return i;
}
}
return null;
}
pub fn indexOfVec(input_vec: @Vector(16, u8), needle_vec: @Vector(16, u8)) ?usize {
// Compare the input vector to the needle vector. This will set
// all bits to "1" in the output vector for each matching byte.
const match_vec = aarch64.vceqq_u8(input_vec, needle_vec);
// This is a neat trick in order to efficiently find the index of
// the first matching byte. Details for this can be found here:
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
const shift_vec = aarch64.vshrn_n_u16(@bitCast(match_vec), 4);
const shift_u64 = aarch64.vget_lane_u64(@bitCast(shift_vec));
if (shift_u64 == 0) {
// This means no matches were found.
return null;
}
// A match was found! Reverse the bits and count the leading zeros to get
// the index of the first matching byte. The bit reversal turns clz into a
// count of trailing zeros, and the shift by 2 (divide by 4) is because the
// narrowing shift above keeps 4 mask bits per input byte.
const reversed = aarch64.rbit(u64, shift_u64);
const index = aarch64.clz(u64, reversed) >> 2;
return index;
}
};
/// Generic test function so we can test against multiple implementations.
fn testIndexOf(func: *const IndexOf) !void {
test "indexOf" {
const testing = std.testing;
try testing.expect(func("hello", ' ') == null);
try testing.expectEqual(@as(usize, 2), func("hi lo", ' ').?);
try testing.expectEqual(@as(usize, 5), func(
try testing.expect(indexOf("hello", ' ') == null);
try testing.expectEqual(@as(usize, 2), indexOf("hi lo", ' ').?);
try testing.expectEqual(@as(usize, 5), indexOf(
\\XXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
\\XXXXXXXXXXXX XXXXXXXXXXX XXXXXXXXXXXXXXX
, ' ').?);
try testing.expectEqual(@as(usize, 53), func(
try testing.expectEqual(@as(usize, 53), indexOf(
\\XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
\\XXXXXXXXXXXX XXXXXXXXXXX XXXXXXXXXXXXXXX
, ' ').?);
}
pub const Hwy = struct {
extern "c" fn ghostty_simd_index_of(
needle: u8,
input: [*]const u8,
count: usize,
) usize;
pub fn indexOf(input: []const u8, needle: u8) ?usize {
const result = ghostty_simd_index_of(needle, input.ptr, input.len);
return if (result == input.len) null else result;
}
};
test "indexOf" {
const v = isa.detect();
var it = v.iterator();
while (it.next()) |isa_v| try testIndexOf(indexOfFunc(isa_v));
try testIndexOf(&Hwy.indexOf);
}
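The rbit/clz step in Neon.indexOfVec above has a simple scalar analogue: the narrowed 64-bit mask carries 4 bits per input byte, so the index of the first match is the count of trailing zero bits divided by 4. A hypothetical helper illustrating just that recovery step (not from the repository):

const std = @import("std");

// mask is the u64 produced by cmeq + shrn #4 (4 mask bits per input byte).
fn firstMatch(mask: u64) ?usize {
    if (mask == 0) return null;
    // rbit + clz on aarch64 is simply a count of trailing zeros.
    return @as(usize, @ctz(mask)) >> 2;
}

test "firstMatch" {
    try std.testing.expect(firstMatch(0) == null);
    // Byte 2 matched: bits 8..11 of the mask are set.
    try std.testing.expectEqual(@as(usize, 2), firstMatch(0xF00).?);
}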

@@ -1,204 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const x86_64 = @import("x86_64.zig");
/// Raw comptime entry of a possible ISA. The arch list is the set of
/// architectures the ISA can even exist on (e.g. neon is only possible on
/// aarch64), but the actual ISA may still be unavailable at runtime.
const Entry = struct {
name: [:0]const u8,
arch: []const std.Target.Cpu.Arch = &.{},
};
const entries: []const Entry = &.{
.{ .name = "scalar" },
.{ .name = "neon", .arch = &.{.aarch64} },
.{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
};
/// Enum of possible ISAs for our SIMD operations. Note that these are
/// coarse-grained because they match possible implementations rather than
/// a fine-grained packed struct of available CPU features.
pub const ISA = isa: {
const EnumField = std.builtin.Type.EnumField;
var fields: [entries.len]EnumField = undefined;
for (entries, 0..) |entry, i| {
fields[i] = .{ .name = entry.name, .value = i };
}
break :isa @Type(.{ .Enum = .{
.tag_type = std.math.IntFittingRange(0, entries.len - 1),
.fields = &fields,
.decls = &.{},
.is_exhaustive = true,
} });
};
/// A set of ISAs.
pub const Set = std.EnumSet(ISA);
/// Check if the given ISA is possible on the current target. This is
/// available at comptime to help prevent invalid architectures from
/// being used.
pub fn possible(comptime isa: ISA) bool {
inline for (entries) |entry| {
if (std.mem.eql(u8, entry.name, @tagName(isa))) {
for (entry.arch) |arch| {
if (arch == builtin.cpu.arch) return true;
}
// If the entry has no arch restrictions then it's always possible.
return entry.arch.len == 0;
}
}
unreachable;
}
/// Detect all possible ISAs at runtime.
pub fn detect() Set {
var set: Set = .{};
set.insert(.scalar);
switch (builtin.cpu.arch) {
// Neon is mandatory on aarch64. No runtime checks necessary.
.aarch64 => set.insert(.neon),
.x86_64 => detectX86(&set),
else => {},
}
return set;
}
/// Returns the preferred ISA to use from those available in the given set.
pub fn preferred(set: Set) ISA {
const order: []const ISA = &.{ .avx2, .neon, .scalar };
// We should have all of our ISAs present in order
comptime {
for (@typeInfo(ISA).Enum.fields) |field| {
const v = @field(ISA, field.name);
assert(std.mem.indexOfScalar(ISA, order, v) != null);
}
}
inline for (order) |isa| {
if (comptime possible(isa)) {
if (set.contains(isa)) return isa;
}
}
return .scalar;
}
fn detectX86(set: *Set) void {
// NOTE: this is boilerplate to detect AVX2 and ONLY AVX2 right now. We
// can probably support earlier forms of SIMD such as plain SSE, and we
// can definitely take advantage of later forms.
// If we support less than 7 for the maximum leaf level then we
// don't support any AVX instructions.
var leaf = x86_64.cpuid(0, 0);
if (leaf.eax < 7) return;
// If we don't have xsave or avx, then we don't support anything.
leaf = x86_64.cpuid(1, 0);
const has_xsave = hasBit(leaf.ecx, 27);
const has_avx = hasBit(leaf.ecx, 28);
if (!has_xsave or !has_avx) return;
// We require AVX save state in order to use AVX instructions.
const xcr0_eax = x86_64.getXCR0(); // requires xsave+avx
const has_avx_save = hasMask(xcr0_eax, x86_64.XCR0_XMM | x86_64.XCR0_YMM);
if (!has_avx_save) return;
// Check for AVX2.
leaf = x86_64.cpuid(7, 0);
const has_avx2 = hasBit(leaf.ebx, 5);
if (has_avx2) set.insert(.avx2);
}
/// Check if a bit is set at the given offset
inline fn hasBit(input: u32, offset: u5) bool {
return (input >> offset) & 1 != 0;
}
/// Checks if a mask exactly matches the input
inline fn hasMask(input: u32, mask: u32) bool {
return (input & mask) == mask;
}
/// This is a helper to provide a runtime lookup map for the ISA to
/// the proper function implementation. Func is the function type,
/// and map is an array of tuples of the form (ISA, Struct) where
/// Struct has a decl named `name` that is a Func.
///
/// The slightly awkward parameters are to ensure that functions
/// are only analyzed for possible ISAs for the target.
///
/// This will ensure that impossible ISAs for the build target are
/// not included so they're not analyzed. For example, a NEON implementation
/// will not be included on x86_64.
pub fn funcMap(
comptime Func: type,
v: ISA,
comptime map: anytype,
) *const Func {
switch (v) {
inline else => |tag| {
// If this tag isn't possible, compile no code for this case.
if (comptime !possible(tag)) unreachable;
// Find the entry for this tag and return the function.
inline for (map) |entry| {
if (entry[0] == tag) {
// If we return &entry[1] directly the compiler crashes:
// https://github.com/ziglang/zig/issues/18754
const func = entry[1];
return &func;
}
} else unreachable;
},
}
}
test "detect" {
const testing = std.testing;
const set = detect();
try testing.expect(set.contains(.scalar));
switch (builtin.cpu.arch) {
.aarch64 => {
// Neon is always available on aarch64
try testing.expect(set.contains(.neon));
try testing.expect(!set.contains(.avx2));
},
else => {},
}
}
test "preferred" {
_ = preferred(detect());
}
test "possible" {
const testing = std.testing;
try testing.expect(possible(.scalar)); // always possible
// hardcode some other common realities
switch (builtin.cpu.arch) {
.aarch64 => {
try testing.expect(possible(.neon));
try testing.expect(!possible(.avx2));
},
.x86, .x86_64 => {
try testing.expect(!possible(.neon));
try testing.expect(possible(.avx2));
},
else => {},
}
}
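A typical call site for the removed dispatch machinery would combine detect, preferred, and one of the per-module *Func helpers. An illustrative sketch (hypothetical function, using declarations from the removed files):

const isa = @import("isa.zig");
const index_of = @import("index_of.zig");

pub fn firstEsc(haystack: []const u8) ?usize {
    // Detect what the CPU supports, pick the best ISA, then fetch the
    // matching implementation once and reuse it for the call.
    const set = isa.detect();
    const best = isa.preferred(set);
    const indexOf = index_of.indexOfFunc(best);
    return indexOf(haystack, 0x1B);
}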

@@ -1,24 +1,8 @@
const std = @import("std");
pub const isa = @import("isa.zig");
pub const aarch64 = @import("aarch64.zig");
pub const utf8_count = @import("utf8_count.zig");
pub const utf8_decode = @import("utf8_decode.zig");
pub const utf8_validate = @import("utf8_validate.zig");
pub const index_of = @import("index_of.zig");
pub const vt = @import("vt.zig");
// TODO: temporary, only for zig build simd to inspect disasm easily
// pub fn main() !void {
// //std.log.warn("ISA={}", .{isa.ISA.detect()});
// const input = "1234567\x1b1234567\x1b";
// //const input = "1234567812345678";
// std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)});
// std.log.warn("result={any}", .{utf8.utf8Validate(input)});
// }
test {
@import("std").testing.refAllDecls(@This());
}

@@ -1,11 +0,0 @@
#include "simdutf.cpp"
// This is just the C API we need from Zig. This is manually maintained
// because the surface area is so small.
extern "C" {
size_t simdutf_convert_utf8_to_utf32(const char *src, size_t len, char32_t *dst) {
return simdutf::convert_utf8_to_utf32(src, len, dst);
}
}

@@ -1,72 +0,0 @@
const std = @import("std");
const isa = @import("isa.zig");
const aarch64 = @import("aarch64.zig");
/// Count the number of UTF-8 codepoints in the given string. The string
/// is assumed to be valid UTF-8. Invalid UTF-8 will result in undefined
/// (and probably incorrect) behaviour.
pub const Count = fn ([]const u8) usize;
/// Returns the count function for the given ISA.
pub fn countFunc(v: isa.ISA) *const Count {
return isa.funcMap(Count, v, .{
.{ .avx2, Scalar.count }, // todo
.{ .neon, Neon.count },
.{ .scalar, Scalar.count },
});
}
pub const Scalar = struct {
pub fn count(input: []const u8) usize {
return std.unicode.utf8CountCodepoints(input) catch unreachable;
}
};
/// Arm NEON implementation of the count function.
pub const Neon = struct {
pub fn count(input: []const u8) usize {
var result: usize = 0;
var i: usize = 0;
while (i + 16 <= input.len) : (i += 16) {
const input_vec = aarch64.vld1q_u8(input[i..]);
result += @intCast(process(input_vec));
}
if (i < input.len) result += Scalar.count(input[i..]);
return result;
}
pub fn process(v: @Vector(16, u8)) u8 {
// Find all the bytes whose signed value is greater than -65 (0b10111111),
// i.e. everything except 0b10xxxxxx continuation bytes; these are the
// leading bytes of UTF-8 codepoints (including ASCII). This sets the
// resulting vector to 0xFF for all leading bytes and 0x00 for all others.
const mask = aarch64.vcgtq_s8(@bitCast(v), aarch64.vdupq_n_s8(-65));
// Shift to turn 0xFF to 0x01.
const mask_shift = aarch64.vshrq_n_u8(mask, 7);
// Sum across the vector
const sum = aarch64.vaddvq_u8(mask_shift);
// std.log.warn("mask={}", .{mask});
// std.log.warn("mask_shift={}", .{mask_shift});
// std.log.warn("sum={}", .{sum});
return sum;
}
};
/// Generic test function so we can test against multiple implementations.
/// This is initially copied from the Zig stdlib but may be expanded.
fn testCount(func: *const Count) !void {
const testing = std.testing;
try testing.expectEqual(@as(usize, 16), func("hello friends!!!"));
try testing.expectEqual(@as(usize, 10), func("abcdefghij"));
try testing.expectEqual(@as(usize, 10), func("äåéëþüúíóö"));
try testing.expectEqual(@as(usize, 5), func("こんにちは"));
}
test "count" {
const v = isa.detect();
var it = v.iterator();
while (it.next()) |isa_v| try testCount(countFunc(isa_v));
}
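Neon.process above relies on the fact that UTF-8 leading bytes (and ASCII) are exactly the bytes whose signed value is greater than -65, i.e. everything except 0b10xxxxxx continuation bytes. A scalar sketch of the same counting rule (hypothetical helper, not from the repository):

const std = @import("std");

// Count codepoints in (assumed valid) UTF-8 by counting every byte that is
// not a continuation byte.
fn countScalar(input: []const u8) usize {
    var n: usize = 0;
    for (input) |b| {
        // Same predicate as vcgtq_s8(v, vdupq_n_s8(-65)).
        if (@as(i8, @bitCast(b)) > -65) n += 1;
    }
    return n;
}

test "countScalar" {
    try std.testing.expectEqual(@as(usize, 10), countScalar("äåéëþüúíóö"));
    try std.testing.expectEqual(@as(usize, 5), countScalar("こんにちは"));
}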

@@ -1,93 +0,0 @@
const std = @import("std");
const assert = std.debug.assert;
const isa = @import("isa.zig");
const aarch64 = @import("aarch64.zig");
const utf_tables = @import("utf_tables.zig");
/// Decode UTF-8 codepoints to UTF-32. Returns the slice of `out` holding the
/// decoded codepoints. The output buffer must be large enough to hold them;
/// the worst case is one codepoint per input byte.
///
/// This also assumes the UTF-8 is valid. If it may not be, you should
/// validate first.
pub const Decode = fn ([]u32, []const u8) []const u32;
/// Returns the function for the given ISA.
pub fn decodeFunc(v: isa.ISA) *const Decode {
return isa.funcMap(Decode, v, .{
.{ .scalar, Simdutf.decode },
.{ .neon, Simdutf.decode },
.{ .avx2, Simdutf.decode }, // todo
});
}
pub const Stdlib = struct {
pub fn decode(out: []u32, in: []const u8) []const u32 {
const view = std.unicode.Utf8View.initUnchecked(in);
var it = view.iterator();
var i: usize = 0;
while (it.nextCodepoint()) |cp| {
out[i] = cp;
i += 1;
}
return out[0..i];
}
};
/// Uses the simdutf project
pub const Simdutf = struct {
pub fn decode(out: []u32, in: []const u8) []const u32 {
const len = simdutf_convert_utf8_to_utf32(
in.ptr,
in.len,
out.ptr,
);
return out[0..len];
}
extern "c" fn simdutf_convert_utf8_to_utf32(
[*]const u8,
usize,
[*]u32,
) usize;
};
/// Generic test function so we can test against multiple implementations.
fn testDecode(func: *const Decode) !void {
const testing = std.testing;
// This is pitifully small, but it's enough to test the basic logic.
// simdutf is extremely well tested, so we don't need to test the
// edge cases so much.
const inputs: []const []const u8 = &.{
"hello friends!!!",
"hello friends!!!",
"abc",
"abc\xdf\xbf",
"Ж",
"ЖЖ",
"брэд-ЛГТМ",
"☺☻☹",
"a\u{fffdb}",
"\xf4\x8f\xbf\xbf",
};
inline for (inputs) |input_raw| {
const input = if (input_raw.len >= 64) input_raw else input_raw ++ ("hello" ** 15);
assert(input.len >= 64);
var buf: [1024]u32 = undefined;
var buf2: [1024]u32 = undefined;
const scalar = Stdlib.decode(&buf, input);
const actual = func(&buf2, input);
try testing.expectEqualSlices(u32, scalar, actual);
}
}
test "decode" {
const v = isa.detect();
var it = v.iterator();
while (it.next()) |isa_v| try testDecode(decodeFunc(isa_v));
}
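A minimal usage sketch for the decode path above, using the Stdlib fallback so no C dependency is needed (hypothetical function; the output buffer is sized for the worst case of one codepoint per input byte):

const std = @import("std");
const utf8_decode = @import("utf8_decode.zig");

pub fn decodeExample() void {
    const in = "hello, 世界";
    // Worst case: one decoded codepoint (u32) per input byte.
    var out: [in.len]u32 = undefined;
    const cps = utf8_decode.Stdlib.decode(&out, in);
    std.debug.print("decoded {d} codepoints\n", .{cps.len});
}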

@@ -1,301 +0,0 @@
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const isa = @import("isa.zig");
const aarch64 = @import("aarch64.zig");
// All of the work in this file is based heavily on the work of
// Daniel Lemire and John Keiser. Their original work can be found here:
// - https://arxiv.org/pdf/2010.03090.pdf
// - https://simdutf.github.io/simdutf/ (MIT License)
pub const Validate = fn ([]const u8) bool;
pub fn validateFunc(v: isa.ISA) *const Validate {
return isa.funcMap(Validate, v, .{
.{ .avx2, Scalar.validate }, // todo
.{ .neon, Neon.validate },
.{ .scalar, Scalar.validate },
});
}
pub const Scalar = struct {
pub fn validate(input: []const u8) bool {
return std.unicode.utf8ValidateSlice(input);
}
};
pub const Neon = struct {
/// The previous input in a vector. This is required because checking the
/// validity of a UTF-8 byte sometimes needs the previous state, e.g. when
/// the first byte of a chunk is a continuation byte.
prev_input: @Vector(16, u8),
/// The current error status. Once an error is set, it is never unset.
prev_error: @Vector(16, u8),
/// The current incomplete status. This is non-zero if the last chunk
/// requires more bytes to be valid UTF-8.
prev_incomplete: @Vector(16, u8),
pub fn init() Neon {
return .{
.prev_input = aarch64.vdupq_n_u8(0),
.prev_error = aarch64.vdupq_n_u8(0),
.prev_incomplete = aarch64.vdupq_n_u8(0),
};
}
pub fn validate(input: []const u8) bool {
var neon = Neon.init();
neon.feed(input);
neon.finalize();
return !neon.hasErrors();
}
/// Validate a chunk of UTF-8 data. This function is designed to be
/// called multiple times with successive chunks of data. When the
/// data is complete, you must call `finalize` to check for any
/// remaining errors.
pub fn feed(self: *Neon, input: []const u8) void {
// Break up our input into 16 byte chunks, and process each chunk
// separately. The size of a Neon register is 16 bytes.
var i: usize = 0;
while (i + 16 <= input.len) : (i += 16) {
const input_vec = aarch64.vld1q_u8(input[i..]);
self.process(input_vec);
}
// If we have any data remaining, we pad it with zeroes since that
// is valid UTF-8, and then treat it like a normal block.
if (i < input.len) {
const remaining = input.len - i;
assert(remaining < 16);
var buf: [16]u8 = undefined;
@memcpy(buf[0..remaining], input[i..]);
@memset(buf[remaining..], 0);
const input_vec = aarch64.vld1q_u8(&buf);
self.process(input_vec);
}
}
/// Call to finalize the validation (EOF is reached).
pub fn finalize(self: *Neon) void {
// It's possible for our last chunk to end expecting more
// continuation bytes.
self.prev_error = aarch64.vorrq_u8(self.prev_error, self.prev_incomplete);
}
/// Returns true if there are any errors.
pub fn hasErrors(self: *Neon) bool {
return aarch64.vmaxvq_u8(self.prev_error) != 0;
}
/// Process a single vector of input.
///
/// This function generally isn't called directly, but it is very useful
/// if you want to compose this validation with other SIMD operations
/// and already have your data in a SIMD register.
pub fn process(self: *Neon, input_vec: @Vector(16, u8)) void {
// If all we have is ASCII, then we can skip the rest.
if (aarch64.vmaxvq_u8(input_vec) < 0b10000000) {
self.prev_error = aarch64.vorrq_u8(self.prev_error, self.prev_incomplete);
return;
}
const prev1 = aarch64.vextq_u8(self.prev_input, input_vec, 15);
const prev1_shr4 = aarch64.vshrq_n_u8(prev1, 4);
const prev1_lownibs = aarch64.vandq_u8(prev1, aarch64.vdupq_n_u8(0x0F));
const input_highnibs = aarch64.vshrq_n_u8(input_vec, 4);
const byte_1_high = aarch64.vqtbl1q_u8(byte1HighTable(), prev1_shr4);
const byte_2_low = aarch64.vqtbl1q_u8(byte2LowTable(), prev1_lownibs);
const byte_2_high = aarch64.vqtbl1q_u8(byte2HighTable(), input_highnibs);
const special_cases = aarch64.vandq_u8(
byte_1_high,
aarch64.vandq_u8(byte_2_low, byte_2_high),
);
const prev2 = aarch64.vextq_u8(self.prev_input, input_vec, 14);
const prev3 = aarch64.vextq_u8(self.prev_input, input_vec, 13);
const is_third_byte = aarch64.vcgeq_u8(prev2, aarch64.vdupq_n_u8(0xE0));
const is_fourth_byte = aarch64.vcgeq_u8(prev3, aarch64.vdupq_n_u8(0xF0));
const must23 = aarch64.veorq_u8(is_third_byte, is_fourth_byte);
const must23_80 = aarch64.vandq_u8(must23, aarch64.vdupq_n_u8(0x80));
const multibyte_len = aarch64.veorq_u8(must23_80, special_cases);
self.prev_error = aarch64.vorrq_u8(self.prev_error, multibyte_len);
self.prev_input = input_vec;
self.prev_incomplete = aarch64.vcgtq_u8(input_vec, incomplete: {
var bytes: [16]u8 = .{255} ** 16;
bytes[15] = 0b11000000 - 1;
bytes[14] = 0b11100000 - 1;
bytes[13] = 0b11110000 - 1;
break :incomplete aarch64.vld1q_u8(&bytes);
});
// Debug all the vector registers:
// std.log.warn("input={}", .{input_vec});
// std.log.warn("prev_input={}", .{self.prev_input});
// std.log.warn("prev1={}", .{prev1});
// std.log.warn("prev1_shr4={}", .{prev1_shr4});
// std.log.warn("prev1_lownibs={}", .{prev1_lownibs});
// std.log.warn("input_highnibs={}", .{input_highnibs});
// std.log.warn("byte_1_high={}", .{byte_1_high});
// std.log.warn("byte_2_low={}", .{byte_2_low});
// std.log.warn("byte_2_high={}", .{byte_2_high});
// std.log.warn("special_cases={}", .{special_cases});
// std.log.warn("prev2={}", .{prev2});
// std.log.warn("prev3={}", .{prev3});
// std.log.warn("is_third_byte={}", .{is_third_byte});
// std.log.warn("is_fourth_byte={}", .{is_fourth_byte});
// std.log.warn("must23={}", .{must23});
// std.log.warn("must23_80={}", .{must23_80});
// std.log.warn("multibyte_len={}", .{multibyte_len});
// std.log.warn("error={}", .{self.prev_error});
// std.log.warn("incomplete={}", .{self.prev_incomplete});
}
inline fn byte1HighTable() @Vector(16, u8) {
// zig fmt: off
return aarch64.vld1q_u8(&.{
// 0_______ ________ <ASCII in byte 1>
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
// 10______ ________ <continuation in byte 1>
TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
// 1100____ ________ <two byte lead in byte 1>
TOO_SHORT | OVERLONG_2,
// 1101____ ________ <two byte lead in byte 1>
TOO_SHORT,
// 1110____ ________ <three byte lead in byte 1>
TOO_SHORT | OVERLONG_3 | SURROGATE,
// 1111____ ________ <four+ byte lead in byte 1>
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
});
// zig fmt: on
}
inline fn byte2LowTable() @Vector(16, u8) {
// zig fmt: off
return aarch64.vld1q_u8(&.{
// ____0000 ________
CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
// ____0001 ________
CARRY | OVERLONG_2,
// ____001_ ________
CARRY,
CARRY,
// ____0100 ________
CARRY | TOO_LARGE,
// ____0101 ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____011_ ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____1___ ________
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000,
// ____1101 ________
CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
CARRY | TOO_LARGE | TOO_LARGE_1000,
CARRY | TOO_LARGE | TOO_LARGE_1000
});
// zig fmt: on
}
inline fn byte2HighTable() @Vector(16, u8) {
// zig fmt: off
return aarch64.vld1q_u8(&.{
// ________ 0_______ <ASCII in byte 2>
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
// ________ 1000____
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
// ________ 1001____
TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
// ________ 101_____
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
// ________ 11______
TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
});
// zig fmt: on
}
};
// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
// Bit 1 = Too Long (ASCII followed by continuation)
// Bit 2 = Overlong 3-byte
// Bit 4 = Surrogate
// Bit 5 = Overlong 2-byte
// Bit 7 = Two Continuations
const TOO_SHORT: u8 = 1 << 0; // 11______ 0_______
// 11______ 11______
const TOO_LONG: u8 = 1 << 1; // 0_______ 10______
const OVERLONG_3: u8 = 1 << 2; // 11100000 100_____
const SURROGATE: u8 = 1 << 4; // 11101101 101_____
const OVERLONG_2: u8 = 1 << 5; // 1100000_ 10______
const TWO_CONTS: u8 = 1 << 7; // 10______ 10______
const TOO_LARGE: u8 = 1 << 3; // 11110100 1001____
// 11110100 101_____
// 11110101 1001____
// 11110101 101_____
// 1111011_ 1001____
// 1111011_ 101_____
// 11111___ 1001____
// 11111___ 101_____
const TOO_LARGE_1000: u8 = 1 << 6;
// 11110101 1000____
// 1111011_ 1000____
// 11111___ 1000____
const OVERLONG_4: u8 = 1 << 6; // 11110000 1000____
const CARRY: u8 = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
/// Generic test function so we can test against multiple implementations.
/// This is initially copied from the Zig stdlib but may be expanded.
fn testValidate(func: *const Validate) !void {
const testing = std.testing;
try testing.expect(func("hello friends!!!"));
try testing.expect(func("abc"));
try testing.expect(func("abc\xdf\xbf"));
try testing.expect(func(""));
try testing.expect(func("a"));
try testing.expect(func("abc"));
try testing.expect(func("Ж"));
try testing.expect(func("ЖЖ"));
try testing.expect(func("брэд-ЛГТМ"));
try testing.expect(func("☺☻☹"));
try testing.expect(func("a\u{fffdb}"));
try testing.expect(func("\xf4\x8f\xbf\xbf"));
try testing.expect(func("abc\xdf\xbf"));
try testing.expect(!func("abc\xc0"));
try testing.expect(!func("abc\xc0abc"));
try testing.expect(!func("aa\xe2"));
try testing.expect(!func("\x42\xfa"));
try testing.expect(!func("\x42\xfa\x43"));
try testing.expect(!func("abc\xc0"));
try testing.expect(!func("abc\xc0abc"));
try testing.expect(!func("\xf4\x90\x80\x80"));
try testing.expect(!func("\xf7\xbf\xbf\xbf"));
try testing.expect(!func("\xfb\xbf\xbf\xbf\xbf"));
try testing.expect(!func("\xc0\x80"));
try testing.expect(!func("\xed\xa0\x80"));
try testing.expect(!func("\xed\xbf\xbf"));
}
test "validate" {
const v = isa.detect();
var it = v.iterator();
while (it.next()) |isa_v| try testValidate(validateFunc(isa_v));
}
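For streaming input, the removed Neon validator is fed in chunks and finalized at end of input. An illustrative sketch of that flow (hypothetical function; intermediate chunks are assumed to be multiples of 16 bytes because feed zero-pads any trailing partial block):

const utf8_validate = @import("utf8_validate.zig");

pub fn validateChunks(chunks: []const []const u8) bool {
    var v = utf8_validate.Neon.init();
    // Feed successive chunks, then fold in any pending "incomplete" state.
    for (chunks) |chunk| v.feed(chunk);
    v.finalize();
    return !v.hasErrors();
}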

File diff suppressed because it is too large

@@ -1,44 +0,0 @@
pub const XCR0_XMM = 0x02;
pub const XCR0_YMM = 0x04;
pub const XCR0_MASKREG = 0x20;
pub const XCR0_ZMM0_15 = 0x40;
pub const XCR0_ZMM16_31 = 0x80;
pub const CpuidLeaf = packed struct {
eax: u32,
ebx: u32,
ecx: u32,
edx: u32,
};
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
/// and feature information. This is explicitly and specifically only
/// for x86 and x86_64.
pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
var eax: u32 = undefined;
var ebx: u32 = undefined;
var ecx: u32 = undefined;
var edx: u32 = undefined;
asm volatile ("cpuid"
: [_] "={eax}" (eax),
[_] "={ebx}" (ebx),
[_] "={ecx}" (ecx),
[_] "={edx}" (edx),
: [_] "{eax}" (leaf_id),
[_] "{ecx}" (subid),
);
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
}
// Read the extended control register 0 (XCR0). Used to detect features such as AVX.
pub fn getXCR0() u32 {
return asm volatile (
\\ xor %%ecx, %%ecx
\\ xgetbv
: [_] "={eax}" (-> u32),
:
: "edx", "ecx"
);
}
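The cpuid/getXCR0 pair above is what isa.zig's detectX86 builds on; a condensed sketch of the same AVX2 check (hypothetical function, bit positions as in the removed detectX86):

const x86_64 = @import("x86_64.zig");

// AVX2 is usable only if the CPU reports it and the OS saves XMM/YMM state.
pub fn hasAvx2() bool {
    if (x86_64.cpuid(0, 0).eax < 7) return false;

    const leaf1 = x86_64.cpuid(1, 0);
    const has_xsave = (leaf1.ecx >> 27) & 1 != 0;
    const has_avx = (leaf1.ecx >> 28) & 1 != 0;
    if (!has_xsave or !has_avx) return false;

    const mask: u32 = x86_64.XCR0_XMM | x86_64.XCR0_YMM;
    if (x86_64.getXCR0() & mask != mask) return false;

    return (x86_64.cpuid(7, 0).ebx >> 5) & 1 != 0;
}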

@@ -43,11 +43,6 @@ pub const EraseLine = csi.EraseLine;
pub const TabClear = csi.TabClear;
pub const Attribute = sgr.Attribute;
// TODO: we only have a hardcoded Neon implementation for now
pub usingnamespace if (builtin.target.cpu.arch == .aarch64) struct {
pub const simdvt = @import("simdvt.zig");
} else struct {};
/// If we're targeting wasm then we export some wasm APIs.
pub usingnamespace if (builtin.target.isWasm()) struct {
pub usingnamespace @import("wasm.zig");

@@ -1,63 +0,0 @@
// Generates code for every target that this compiler can support.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "example.cpp" // this file
#include <hwy/foreach_target.h> // must come before highway.h
#include <hwy/highway.h>
HWY_BEFORE_NAMESPACE();
namespace ghostty {
namespace HWY_NAMESPACE { // required: unique per target
// Can skip hn:: prefixes if already inside hwy::HWY_NAMESPACE.
namespace hn = hwy::HWY_NAMESPACE;
using T = float;
// Alternative to per-function HWY_ATTR: see HWY_BEFORE_NAMESPACE
void MulAddLoop(const T* HWY_RESTRICT mul_array,
const T* HWY_RESTRICT add_array,
const size_t size, T* HWY_RESTRICT x_array) {
const hn::ScalableTag<T> d;
for (size_t i = 0; i < size; i += hn::Lanes(d)) {
const auto mul = hn::Load(d, mul_array + i);
const auto add = hn::Load(d, add_array + i);
auto x = hn::Load(d, x_array + i);
x = hn::MulAdd(mul, x, add);
hn::Store(x, d, x_array + i);
}
}
} // namespace HWY_NAMESPACE
} // namespace ghostty
HWY_AFTER_NAMESPACE();
// The table of pointers to the various implementations in HWY_NAMESPACE must
// be compiled only once (foreach_target #includes this file multiple times).
// HWY_ONCE is true for only one of these 'compilation passes'.
#if HWY_ONCE
namespace ghostty {
// This macro declares a static array used for dynamic dispatch.
HWY_EXPORT(MulAddLoop);
void CallMulAddLoop(const float* HWY_RESTRICT mul_array,
const float* HWY_RESTRICT add_array,
const size_t size, float* HWY_RESTRICT x_array) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
// For static dispatch, use HWY_STATIC_DISPATCH.
return HWY_DYNAMIC_DISPATCH(MulAddLoop)(mul_array, add_array, size, x_array);
}
} // namespace ghostty
extern "C" float example() {
float mul_array[] {1, 2, 3, 4, 5};
float add_array[] {2, 3, 4, 5, 6};
float x_array[] {0, 0, 0, 0, 0};
ghostty::CallMulAddLoop(mul_array, add_array, 5, x_array);
return x_array[0];
}
#endif // HWY_ONCE

@@ -1,82 +0,0 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const terminal = @import("../main.zig");
const ScalarStream = terminal.Stream;
const simd = @import("../../simd/main.zig");
const aarch64 = simd.aarch64;
pub fn Stream(comptime Handler: type) type {
return struct {
const Self = @This();
handler: Handler,
pub fn init(h: Handler) Self {
return .{ .handler = h };
}
pub fn feed(self: *Self, input: []const u8) void {
// TODO: I want to do the UTF-8 decoding as we stream the input,
// but I don't want to deal with UTF-8 decode in SIMD right now.
// So for now we just go back over the input and decode using
// a scalar loop. Ugh.
// We search for ESC (0x1B) very frequently, since this is what triggers
// the start of a terminal escape sequence of any kind, so put this into
// a register immediately.
const esc_vec = aarch64.vdupq_n_u8(0x1B);
// Iterate 16 bytes at a time, which is the max size of a vector register.
var i: usize = 0;
while (i + 16 <= input.len) : (i += 16) {
// Load the next 16 bytes into a vector register.
const input_vec = aarch64.vld1q_u8(input[i..]);
// Check for ESC to determine if we should go to the next state.
if (simd.index_of.Neon.indexOfVec(input_vec, esc_vec)) |index| {
_ = index;
@panic("TODO");
}
// No ESC found, decode UTF-8.
// TODO(mitchellh): I don't have a UTF-8 decoder in SIMD yet, so
// for now we just use a scalar loop. This is slow.
const view = std.unicode.Utf8View.initUnchecked(input[i .. i + 16]);
var it = view.iterator();
while (it.nextCodepoint()) |cp| {
self.handler.print(cp);
}
}
// Handle the remaining bytes
if (i < input.len) {
@panic("input must be a multiple of 16 bytes for now");
}
}
};
}
test "ascii" {
const testing = std.testing;
var arena = ArenaAllocator.init(testing.allocator);
defer arena.deinit();
const alloc = arena.allocator();
const H = struct {
const Self = @This();
alloc: Allocator,
buf: std.ArrayListUnmanaged(u21) = .{},
pub fn print(self: *Self, c: u21) void {
self.buf.append(self.alloc, c) catch unreachable;
}
};
const str = "hello" ** 16;
var s = Stream(H).init(.{ .alloc = alloc });
s.feed(str);
try testing.expectEqual(str.len, s.handler.buf.items.len);
}