diff --git a/src/simd/index_of.zig b/src/simd/index_of.zig index 6630615a4..b66349847 100644 --- a/src/simd/index_of.zig +++ b/src/simd/index_of.zig @@ -1,5 +1,6 @@ const std = @import("std"); const builtin = @import("builtin"); +const isa = @import("isa.zig"); const aarch64 = @import("aarch64.zig"); // Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib @@ -91,7 +92,7 @@ fn testIndexOf(func: *const IndexOf) !void { } test "indexOf neon" { - // TODO: use ISA detection here - if (comptime builtin.cpu.arch != .aarch64) return error.SkipZigTest; - try testIndexOf(&indexOfNeon); + if (comptime !isa.possible(.neon)) return error.SkipZigTest; + const set = isa.detect(); + if (set.contains(.neon)) try testIndexOf(&indexOfNeon); } diff --git a/src/simd/isa.zig b/src/simd/isa.zig index fd1bf6a65..8f42be53c 100644 --- a/src/simd/isa.zig +++ b/src/simd/isa.zig @@ -1,112 +1,170 @@ const std = @import("std"); const builtin = @import("builtin"); +const assert = std.debug.assert; +const x86_64 = @import("x86_64.zig"); -/// Possible instruction set architectures for SIMD operations. These are -/// coarse grained and are targeted specifically so we can detect exactly -/// what is available to us in Ghostty. -pub const ISA = enum { - scalar, - neon, - avx2, - - /// Detect the available ISA at runtime. This will use comptime information - /// as well to minimize the number of runtime checks. - pub fn detect() ISA { - return switch (builtin.cpu.arch) { - // Neon is mandatory on aarch64. No runtime checks necessary. - .aarch64 => .neon, - .x86_64 => detectX86(), - else => .scalar, - }; - } - - fn detectX86() ISA { - // NOTE: this is just some boilerplate to detect AVX2. We - // can probably support earlier forms of SIMD such as plain - // SSE, and we can definitely take advtange of later forms. This - // is just some boilerplate to ONLY detect AVX2 right now. 
- - // If we support less than 7 for the maximum leaf level then we - // don't support any AVX instructions. - var leaf = X86.cpuid(0, 0); - if (leaf.eax < 7) return .scalar; - - // If we don't have xsave or avx, then we don't support anything. - leaf = X86.cpuid(1, 0); - const has_xsave = hasBit(leaf.ecx, 27); - const has_avx = hasBit(leaf.ecx, 28); - if (!has_xsave or !has_avx) return .scalar; - - // We require AVX save state in order to use AVX instructions. - const xcr0_eax = X86.getXCR0(); // requires xsave+avx - const has_avx_save = hasMask(xcr0_eax, X86.XCR0_XMM | X86.XCR0_YMM); - if (!has_avx_save) return .scalar; - - // Check for AVX2. - leaf = X86.cpuid(7, 0); - const has_avx2 = hasBit(leaf.ebx, 5); - if (has_avx2) return .avx2; - - return .scalar; - } +/// Raw comptime entry of possible ISA. The arch is the arch that the +/// ISA is even possible on (e.g. neon is only possible on aarch64) but +/// the actual ISA may not be available at runtime. +const Entry = struct { + name: [:0]const u8, + arch: []const std.Target.Cpu.Arch = &.{}, }; -/// Constants and functions related to x86 and x86_64. Reference for this -/// can be found in the Intel Architectures Software Developer's Manual, -/// mostly around the cpuid instruction. -const X86 = struct { - const XCR0_XMM = 0x02; - const XCR0_YMM = 0x04; - const XCR0_MASKREG = 0x20; - const XCR0_ZMM0_15 = 0x40; - const XCR0_ZMM16_31 = 0x80; - - const CpuidLeaf = packed struct { - eax: u32, - ebx: u32, - ecx: u32, - edx: u32, - }; - - /// Wrapper around x86 and x86_64 `cpuid` in order to gather processor - /// and feature information. This is explicitly and specifically only - /// for x86 and x86_64. 
- fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf { - var eax: u32 = undefined; - var ebx: u32 = undefined; - var ecx: u32 = undefined; - var edx: u32 = undefined; - - asm volatile ("cpuid" - : [_] "={eax}" (eax), - [_] "={ebx}" (ebx), - [_] "={ecx}" (ecx), - [_] "={edx}" (edx), - : [_] "{eax}" (leaf_id), - [_] "{ecx}" (subid), - ); - - return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx }; - } - - // Read control register 0 (XCR0). Used to detect features such as AVX. - fn getXCR0() u32 { - return asm volatile ( - \\ xor %%ecx, %%ecx - \\ xgetbv - : [_] "={eax}" (-> u32), - : - : "edx", "ecx" - ); - } +const entries: []const Entry = &.{ + .{ .name = "scalar" }, + .{ .name = "neon", .arch = &.{.aarch64} }, + .{ .name = "avx2", .arch = &.{ .x86, .x86_64 } }, }; +/// Enum of possible ISAs for our SIMD operations. Note that these are +/// coarse-grained because they match possible implementations rather than +/// a fine-grained packed struct of available CPU features. +pub const ISA = isa: { + const EnumField = std.builtin.Type.EnumField; + var fields: [entries.len]EnumField = undefined; + for (entries, 0..) |entry, i| { + fields[i] = .{ .name = entry.name, .value = i }; + } + + break :isa @Type(.{ .Enum = .{ + .tag_type = std.math.IntFittingRange(0, entries.len - 1), + .fields = &fields, + .decls = &.{}, + .is_exhaustive = true, + } }); +}; + +/// A set of ISAs. +pub const Set = std.EnumSet(ISA); + +/// Check if the given ISA is possible on the current target. This is +/// available at comptime to help prevent invalid architectures from +/// being used. +pub fn possible(comptime isa: ISA) bool { + inline for (entries) |entry| { + if (std.mem.eql(u8, entry.name, @tagName(isa))) { + for (entry.arch) |arch| { + if (arch == builtin.cpu.arch) return true; + } + + // If we have no valid archs then it's always valid. + return entry.arch.len == 0; + } + } + + unreachable; +} + +/// Detect all possible ISAs at runtime. 
+pub fn detect() Set { + var set: Set = .{}; + set.insert(.scalar); + switch (builtin.cpu.arch) { + // Neon is mandatory on aarch64. No runtime checks necessary. + .aarch64 => set.insert(.neon), + .x86_64 => detectX86(&set), + else => {}, + } + + return set; +} + +/// Returns the preferred ISA to use that is available. +pub fn preferred(set: Set) ISA { + const order: []const ISA = &.{ .avx2, .neon, .scalar }; + + // We should have all of our ISAs present in order + comptime { + for (@typeInfo(ISA).Enum.fields) |field| { + const v = @field(ISA, field.name); + assert(std.mem.indexOfScalar(ISA, order, v) != null); + } + } + + inline for (order) |isa| { + if (comptime possible(isa)) { + if (set.contains(isa)) return isa; + } + } + + return .scalar; +} + +fn detectX86(set: *Set) void { + // NOTE: this is just some boilerplate to detect AVX2. We + // can probably support earlier forms of SIMD such as plain + // SSE, and we can definitely take advantage of later forms. This + // is just some boilerplate to ONLY detect AVX2 right now. + + // If we support less than 7 for the maximum leaf level then we + // don't support any AVX instructions. + var leaf = x86_64.cpuid(0, 0); + if (leaf.eax < 7) return; + + // If we don't have xsave or avx, then we don't support anything. + leaf = x86_64.cpuid(1, 0); + const has_xsave = hasBit(leaf.ecx, 27); + const has_avx = hasBit(leaf.ecx, 28); + if (!has_xsave or !has_avx) return; + + // We require AVX save state in order to use AVX instructions. + const xcr0_eax = x86_64.getXCR0(); // requires xsave+avx + const has_avx_save = hasMask(xcr0_eax, x86_64.XCR0_XMM | x86_64.XCR0_YMM); + if (!has_avx_save) return; + + // Check for AVX2. 
+ leaf = x86_64.cpuid(7, 0); + const has_avx2 = hasBit(leaf.ebx, 5); + if (has_avx2) set.insert(.avx2); +} + /// Check if a bit is set at the given offset -inline fn hasBit(input: u32, offset: u5) bool { +pub inline fn hasBit(input: u32, offset: u5) bool { return (input >> offset) & 1 != 0; } /// Checks if a mask exactly matches the input -inline fn hasMask(input: u32, mask: u32) bool { +pub inline fn hasMask(input: u32, mask: u32) bool { return (input & mask) == mask; } + +test "detect" { + const testing = std.testing; + const set = detect(); + try testing.expect(set.contains(.scalar)); + + switch (builtin.cpu.arch) { + .aarch64 => { + // Neon is always available on aarch64 + try testing.expect(set.contains(.neon)); + try testing.expect(!set.contains(.avx2)); + }, + + else => {}, + } +} + +test "preferred" { + _ = preferred(detect()); +} + +test "possible" { + const testing = std.testing; + try testing.expect(possible(.scalar)); // always possible + + // hardcode some other common realities + switch (builtin.cpu.arch) { + .aarch64 => { + try testing.expect(possible(.neon)); + try testing.expect(!possible(.avx2)); + }, + + .x86, .x86_64 => { + try testing.expect(!possible(.neon)); + try testing.expect(possible(.avx2)); + }, + + else => {}, + } +} diff --git a/src/simd/main.zig b/src/simd/main.zig index dabae2950..c75bea5eb 100644 --- a/src/simd/main.zig +++ b/src/simd/main.zig @@ -1,15 +1,17 @@ const std = @import("std"); -const isa = @import("isa.zig"); +pub const isa = @import("isa.zig"); const index_of = @import("index_of.zig"); -pub usingnamespace isa; pub usingnamespace index_of; +// const utf8 = @import("utf8.zig"); +// pub usingnamespace utf8; + pub fn main() !void { //std.log.warn("ISA={}", .{isa.ISA.detect()}); const input = "1234567\x1b1234567\x1b"; //const input = "1234567812345678"; - _ = index_of.indexOf(input, 0x1B); + std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)}); } test { diff --git a/src/simd/x86_64.zig b/src/simd/x86_64.zig new 
file mode 100644 index 000000000..3601f58bb --- /dev/null +++ b/src/simd/x86_64.zig @@ -0,0 +1,44 @@ +pub const XCR0_XMM = 0x02; +pub const XCR0_YMM = 0x04; +pub const XCR0_MASKREG = 0x20; +pub const XCR0_ZMM0_15 = 0x40; +pub const XCR0_ZMM16_31 = 0x80; + +pub const CpuidLeaf = packed struct { + eax: u32, + ebx: u32, + ecx: u32, + edx: u32, +}; + +/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor +/// and feature information. This is explicitly and specifically only +/// for x86 and x86_64. +pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf { + var eax: u32 = undefined; + var ebx: u32 = undefined; + var ecx: u32 = undefined; + var edx: u32 = undefined; + + asm volatile ("cpuid" + : [_] "={eax}" (eax), + [_] "={ebx}" (ebx), + [_] "={ecx}" (ecx), + [_] "={edx}" (edx), + : [_] "{eax}" (leaf_id), + [_] "{ecx}" (subid), + ); + + return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx }; +} + +// Read control register 0 (XCR0). Used to detect features such as AVX. +pub fn getXCR0() u32 { + return asm volatile ( + \\ xor %%ecx, %%ecx + \\ xgetbv + : [_] "={eax}" (-> u32), + : + : "edx", "ecx" + ); +}