src/simd: improve isa detection

2025-07-22 03:36:14 +03:00 · 2024-01-29 14:23:11 -08:00
parent 7feba12eab
commit dc041f86fd
4 changed files with 209 additions and 104 deletions
--- a/src/simd/index_of.zig
+++ b/src/simd/index_of.zig
@ -1,5 +1,6 @@
 const std = @import("std");
 const builtin = @import("builtin");
 const isa = @import("isa.zig");
 const aarch64 = @import("aarch64.zig");
 // Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
@ -91,7 +92,7 @@ fn testIndexOf(func: *const IndexOf) !void {
 }
 test "indexOf neon" {
-    // TODO: use ISA detection here
+    if (comptime !isa.possible(.neon)) return error.SkipZigTest;
-    if (comptime builtin.cpu.arch != .aarch64) return error.SkipZigTest;
+    const set = isa.detect();
-    try testIndexOf(&indexOfNeon);
+    if (set.contains(.neon)) try testIndexOf(&indexOfNeon);
 }
--- a/src/simd/isa.zig
+++ b/src/simd/isa.zig
@ -1,112 +1,170 @@
 const std = @import("std");
 const builtin = @import("builtin");
 const assert = std.debug.assert;
 const x86_64 = @import("x86_64.zig");
-/// Possible instruction set architectures for SIMD operations. These are
+/// Raw comptime entry of possible ISA. The arch is the arch that the
-/// coarse grained and are targeted specifically so we can detect exactly
+/// ISA is even possible on (e.g. neon is only possible on aarch64) but
-/// what is available to us in Ghostty.
+/// the actual ISA may not be available at runtime.
-pub const ISA = enum {
+const Entry = struct {
-    scalar,
+    name: [:0]const u8,
-    neon,
+    arch: []const std.Target.Cpu.Arch = &.{},
    avx2,
    /// Detect the available ISA at runtime. This will use comptime information
    /// as well to minimize the number of runtime checks.
    pub fn detect() ISA {
        return switch (builtin.cpu.arch) {
            // Neon is mandatory on aarch64. No runtime checks necessary.
            .aarch64 => .neon,
            .x86_64 => detectX86(),
            else => .scalar,
        };
    }
    fn detectX86() ISA {
        // NOTE: this is just some boilerplate to detect AVX2. We
        // can probably support earlier forms of SIMD such as plain
        // SSE, and we can definitely take advantage of later forms. This
        // is just some boilerplate to ONLY detect AVX2 right now.
        // If we support less than 7 for the maximum leaf level then we
        // don't support any AVX instructions.
        var leaf = X86.cpuid(0, 0);
        if (leaf.eax < 7) return .scalar;
        // If we don't have xsave or avx, then we don't support anything.
        leaf = X86.cpuid(1, 0);
        const has_xsave = hasBit(leaf.ecx, 27);
        const has_avx = hasBit(leaf.ecx, 28);
        if (!has_xsave or !has_avx) return .scalar;
        // We require AVX save state in order to use AVX instructions.
        const xcr0_eax = X86.getXCR0(); // requires xsave+avx
        const has_avx_save = hasMask(xcr0_eax, X86.XCR0_XMM | X86.XCR0_YMM);
        if (!has_avx_save) return .scalar;
        // Check for AVX2.
        leaf = X86.cpuid(7, 0);
        const has_avx2 = hasBit(leaf.ebx, 5);
        if (has_avx2) return .avx2;
        return .scalar;
    }
 };
-/// Constants and functions related to x86 and x86_64. Reference for this
+const entries: []const Entry = &.{
-/// can be found in the Intel Architectures Software Developer's Manual,
+    .{ .name = "scalar" },
-/// mostly around the cpuid instruction.
+    .{ .name = "neon", .arch = &.{.aarch64} },
-const X86 = struct {
+    .{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
    const XCR0_XMM = 0x02;
    const XCR0_YMM = 0x04;
    const XCR0_MASKREG = 0x20;
    const XCR0_ZMM0_15 = 0x40;
    const XCR0_ZMM16_31 = 0x80;
    const CpuidLeaf = packed struct {
        eax: u32,
        ebx: u32,
        ecx: u32,
        edx: u32,
    };
    /// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
    /// and feature information. This is explicitly and specifically only
    /// for x86 and x86_64.
    fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
        var eax: u32 = undefined;
        var ebx: u32 = undefined;
        var ecx: u32 = undefined;
        var edx: u32 = undefined;
        asm volatile ("cpuid"
            : [_] "={eax}" (eax),
              [_] "={ebx}" (ebx),
              [_] "={ecx}" (ecx),
              [_] "={edx}" (edx),
            : [_] "{eax}" (leaf_id),
              [_] "{ecx}" (subid),
        );
        return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
    }
    // Read control register 0 (XCR0). Used to detect features such as AVX.
    fn getXCR0() u32 {
        return asm volatile (
            \\ xor %%ecx, %%ecx
            \\ xgetbv
            : [_] "={eax}" (-> u32),
            :
            : "edx", "ecx"
        );
    }
 };
 /// The public ISA enum, generated at comptime from `entries`: one tag per
 /// entry, named after the entry, with the entry's index as its integer
 /// value. The tags are deliberately coarse-grained: each one stands for a
 /// SIMD implementation we can select rather than a fine-grained packed
 /// struct of available CPU features.
 pub const ISA = blk: {
    var tags: [entries.len]std.builtin.Type.EnumField = undefined;
    for (&tags, entries, 0..) |*tag, entry, idx| {
        tag.* = .{ .name = entry.name, .value = idx };
    }
    break :blk @Type(.{ .Enum = .{
        // Integer type sized to hold indices 0..entries.len - 1.
        .tag_type = std.math.IntFittingRange(0, entries.len - 1),
        .fields = &tags,
        .decls = &.{},
        .is_exhaustive = true,
    } });
 };
 /// A set of ISAs, built on `std.EnumSet` keyed by `ISA`. This is the type
 /// returned by `detect` and accepted by `preferred`.
 pub const Set = std.EnumSet(ISA);
 /// Report whether `isa` can exist at all on the architecture we are
 /// compiling for. The answer depends only on `builtin.cpu.arch` and the
 /// `entries` table, so callers can evaluate it at comptime to keep code
 /// for impossible ISAs out of the build. It says nothing about whether
 /// the ISA is actually available at runtime.
 pub fn possible(comptime isa: ISA) bool {
    const wanted = @tagName(isa);
    inline for (entries) |candidate| {
        if (std.mem.eql(u8, candidate.name, wanted)) {
            // An entry without an arch list is unrestricted.
            if (candidate.arch.len == 0) return true;
            for (candidate.arch) |supported| {
                if (supported == builtin.cpu.arch) return true;
            }
            return false;
        }
    }
    // Every ISA tag is generated from an entry, so a match always exists.
    unreachable;
 }
 /// Detect the ISAs usable on the current machine. The returned set always
 /// contains `.scalar`; further members are added based on the target
 /// architecture and, on x86 targets, on runtime CPUID probing.
 pub fn detect() Set {
    var set: Set = .{};
    set.insert(.scalar);
    switch (builtin.cpu.arch) {
        // Neon is mandatory on aarch64. No runtime checks necessary.
        .aarch64 => set.insert(.neon),
        // avx2 is listed as possible on both x86 and x86_64 and the cpuid
        // wrapper serves both, so probe on either. Probing only x86_64
        // would make avx2 "possible" but never detectable on 32-bit x86.
        .x86, .x86_64 => detectX86(&set),
        else => {},
    }
    return set;
 }
 /// Pick the best ISA contained in `set`. Candidates are tried in ranking
 /// order (avx2, then neon, then scalar); the first one that is both
 /// possible on this target and present in `set` wins. Falls back to
 /// `.scalar` when nothing in the ranking matches.
 pub fn preferred(set: Set) ISA {
    const ranking: []const ISA = &.{ .avx2, .neon, .scalar };
    // Compile-time guard: every ISA tag has to appear in the ranking,
    // otherwise a newly added ISA could never be selected.
    comptime {
        for (@typeInfo(ISA).Enum.fields) |field| {
            const tag = @field(ISA, field.name);
            const listed = std.mem.indexOfScalar(ISA, ranking, tag) != null;
            assert(listed);
        }
    }
    inline for (ranking) |candidate| {
        // ISAs that cannot exist on this target are dropped at comptime,
        // so no runtime membership check is emitted for them.
        if (comptime !possible(candidate)) continue;
        if (set.contains(candidate)) return candidate;
    }
    return .scalar;
 }
 /// Probe the CPU through `cpuid` and XCR0 and insert `.avx2` into `set`
 /// when every check passes. `set` is left untouched otherwise.
 fn detectX86(set: *Set) void {
    // NOTE: this is just some boilerplate to detect AVX2. Earlier forms
    // of SIMD such as plain SSE could probably be supported, and later
    // forms could definitely be taken advantage of, but right now ONLY
    // AVX2 is detected.
    // Leaf 0 reports the maximum supported leaf in eax. AVX2 lives in
    // leaf 7, so anything lower means no AVX instructions for us.
    const max_leaf = x86_64.cpuid(0, 0).eax;
    if (max_leaf < 7) return;
    // Leaf 1, ecx: bit 27 (xsave) and bit 28 (avx) are both required.
    const basic = x86_64.cpuid(1, 0);
    const xsave_ok = hasBit(basic.ecx, 27);
    const avx_ok = hasBit(basic.ecx, 28);
    if (!(xsave_ok and avx_ok)) return;
    // We require AVX save state (XMM and YMM bits in XCR0) in order to
    // use AVX instructions.
    const xcr0 = x86_64.getXCR0(); // requires xsave+avx
    if (!hasMask(xcr0, x86_64.XCR0_XMM | x86_64.XCR0_YMM)) return;
    // Leaf 7, ebx bit 5 is the AVX2 flag.
    const extended = x86_64.cpuid(7, 0);
    if (hasBit(extended.ebx, 5)) set.insert(.avx2);
 }
 /// Check if a bit is set at the given offset
-inline fn hasBit(input: u32, offset: u5) bool {
+pub inline fn hasBit(input: u32, offset: u5) bool {
    return (input >> offset) & 1 != 0;
 }
 /// Checks if every bit set in the mask is also set in the input
-inline fn hasMask(input: u32, mask: u32) bool {
+pub inline fn hasMask(input: u32, mask: u32) bool {
    return (input & mask) == mask;
 }
 test "detect" {
    const expect = std.testing.expect;
    const detected = detect();
    // Scalar is inserted unconditionally, so it must always be present.
    try expect(detected.contains(.scalar));
    if (builtin.cpu.arch == .aarch64) {
        // Neon is always available on aarch64, and avx2 never is.
        try expect(detected.contains(.neon));
        try expect(!detected.contains(.avx2));
    }
 }
 test "preferred" {
    // Smoke test: only checks that selection compiles and runs on the
    // host; the chosen ISA depends on the machine, so it is discarded.
    const detected = detect();
    _ = preferred(detected);
 }
 test "possible" {
    const expect = std.testing.expect;
    // Scalar carries no architecture restriction, so it is always possible.
    try expect(possible(.scalar));
    // Spot-check the architectures we know the answer for.
    const arch = builtin.cpu.arch;
    if (arch == .aarch64) {
        try expect(possible(.neon));
        try expect(!possible(.avx2));
    } else if (arch == .x86 or arch == .x86_64) {
        try expect(!possible(.neon));
        try expect(possible(.avx2));
    }
 }
--- a/src/simd/main.zig
+++ b/src/simd/main.zig
@ -1,15 +1,17 @@
 const std = @import("std");
-const isa = @import("isa.zig");
+pub const isa = @import("isa.zig");
 const index_of = @import("index_of.zig");
 pub usingnamespace isa;
 pub usingnamespace index_of;
 // const utf8 = @import("utf8.zig");
 // pub usingnamespace utf8;
 pub fn main() !void {
    //std.log.warn("ISA={}", .{isa.ISA.detect()});
    const input = "1234567\x1b1234567\x1b";
    //const input = "1234567812345678";
-    _ = index_of.indexOf(input, 0x1B);
+    std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)});
 }
 test {
--- a/src/simd/x86_64.zig
+++ b/src/simd/x86_64.zig
@ -0,0 +1,44 @@
 // Bit masks for XCR0, the register read by `getXCR0` in this file.
 // NOTE(review): the per-bit meanings are implied by the names only
 // (XMM/YMM/mask/ZMM register state); confirm against the Intel SDM
 // before relying on the mask and ZMM values.
 pub const XCR0_XMM = 0x02;
 pub const XCR0_YMM = 0x04;
 pub const XCR0_MASKREG = 0x20;
 pub const XCR0_ZMM0_15 = 0x40;
 pub const XCR0_ZMM16_31 = 0x80;
 /// The four 32-bit register values produced by a single `cpuid`
 /// invocation, as returned by the `cpuid` wrapper in this file.
 pub const CpuidLeaf = packed struct {
    eax: u32,
    ebx: u32,
    ecx: u32,
    edx: u32,
 };
 /// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
 /// and feature information. This is explicitly and specifically only
 /// for x86 and x86_64.
 ///
 /// `leaf_id` is placed in eax and `subid` in ecx before the instruction
 /// runs; the resulting eax/ebx/ecx/edx values are returned as a
 /// `CpuidLeaf`.
 pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
    // Left undefined on purpose: each one is written by an asm output below.
    var eax: u32 = undefined;
    var ebx: u32 = undefined;
    var ecx: u32 = undefined;
    var edx: u32 = undefined;
    // Outputs bind the four result registers; inputs select leaf/subleaf.
    asm volatile ("cpuid"
        : [_] "={eax}" (eax),
          [_] "={ebx}" (ebx),
          [_] "={ecx}" (ecx),
          [_] "={edx}" (edx),
        : [_] "{eax}" (leaf_id),
          [_] "{ecx}" (subid),
    );
    return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
 }
 /// Read control register 0 (XCR0). Used to detect features such as AVX.
 /// Only the value left in eax is returned; edx and ecx are declared as
 /// clobbered and their contents are discarded.
 pub fn getXCR0() u32 {
    // ecx is zeroed first so that `xgetbv` reads register index 0.
    return asm volatile (
        \\ xor %%ecx, %%ecx
        \\ xgetbv
        : [_] "={eax}" (-> u32),
        :
        : "edx", "ecx"
    );
 }