src/simd: improve isa detection

This commit is contained in:
Mitchell Hashimoto
2024-01-29 14:23:11 -08:00
parent 7feba12eab
commit dc041f86fd
4 changed files with 209 additions and 104 deletions

View File

@ -1,5 +1,6 @@
const std = @import("std"); const std = @import("std");
const builtin = @import("builtin"); const builtin = @import("builtin");
const isa = @import("isa.zig");
const aarch64 = @import("aarch64.zig"); const aarch64 = @import("aarch64.zig");
// Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib // Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
@ -91,7 +92,7 @@ fn testIndexOf(func: *const IndexOf) !void {
} }
test "indexOf neon" { test "indexOf neon" {
// TODO: use ISA detection here if (comptime !isa.possible(.neon)) return error.SkipZigTest;
if (comptime builtin.cpu.arch != .aarch64) return error.SkipZigTest; const set = isa.detect();
try testIndexOf(&indexOfNeon); if (set.contains(.neon)) try testIndexOf(&indexOfNeon);
} }

View File

@ -1,112 +1,170 @@
const std = @import("std"); const std = @import("std");
const builtin = @import("builtin"); const builtin = @import("builtin");
const assert = std.debug.assert;
const x86_64 = @import("x86_64.zig");
/// Possible instruction set architectures for SIMD operations. These are /// Raw comptime entry of possible ISA. The arch is the arch that the
/// coarse grained and are targeted specifically so we can detect exactly /// ISA is even possible on (e.g. neon is only possible on aarch64) but
/// what is available to us in Ghostty. /// the actual ISA may not be available at runtime.
pub const ISA = enum { const Entry = struct {
scalar, name: [:0]const u8,
neon, arch: []const std.Target.Cpu.Arch = &.{},
avx2,
/// Detect the available ISA at runtime. This will use comptime information
/// as well to minimize the number of runtime checks.
pub fn detect() ISA {
return switch (builtin.cpu.arch) {
// Neon is mandatory on aarch64. No runtime checks necessary.
.aarch64 => .neon,
.x86_64 => detectX86(),
else => .scalar,
};
}
fn detectX86() ISA {
// NOTE: this is just some boilerplate to detect AVX2. We
// can probably support earlier forms of SIMD such as plain
// SSE, and we can definitely take advantage of later forms. This
// is just some boilerplate to ONLY detect AVX2 right now.
// If we support less than 7 for the maximum leaf level then we
// don't support any AVX instructions.
var leaf = X86.cpuid(0, 0);
if (leaf.eax < 7) return .scalar;
// If we don't have xsave or avx, then we don't support anything.
leaf = X86.cpuid(1, 0);
const has_xsave = hasBit(leaf.ecx, 27);
const has_avx = hasBit(leaf.ecx, 28);
if (!has_xsave or !has_avx) return .scalar;
// We require AVX save state in order to use AVX instructions.
const xcr0_eax = X86.getXCR0(); // requires xsave+avx
const has_avx_save = hasMask(xcr0_eax, X86.XCR0_XMM | X86.XCR0_YMM);
if (!has_avx_save) return .scalar;
// Check for AVX2.
leaf = X86.cpuid(7, 0);
const has_avx2 = hasBit(leaf.ebx, 5);
if (has_avx2) return .avx2;
return .scalar;
}
}; };
/// Constants and functions related to x86 and x86_64. Reference for this const entries: []const Entry = &.{
/// can be found in the Intel Architectures Software Developer's Manual, .{ .name = "scalar" },
/// mostly around the cpuid instruction. .{ .name = "neon", .arch = &.{.aarch64} },
const X86 = struct { .{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
const XCR0_XMM = 0x02;
const XCR0_YMM = 0x04;
const XCR0_MASKREG = 0x20;
const XCR0_ZMM0_15 = 0x40;
const XCR0_ZMM16_31 = 0x80;
const CpuidLeaf = packed struct {
eax: u32,
ebx: u32,
ecx: u32,
edx: u32,
};
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
/// and feature information. This is explicitly and specifically only
/// for x86 and x86_64.
fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
var eax: u32 = undefined;
var ebx: u32 = undefined;
var ecx: u32 = undefined;
var edx: u32 = undefined;
asm volatile ("cpuid"
: [_] "={eax}" (eax),
[_] "={ebx}" (ebx),
[_] "={ecx}" (ecx),
[_] "={edx}" (edx),
: [_] "{eax}" (leaf_id),
[_] "{ecx}" (subid),
);
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
}
// Read control register 0 (XCR0). Used to detect features such as AVX.
fn getXCR0() u32 {
return asm volatile (
\\ xor %%ecx, %%ecx
\\ xgetbv
: [_] "={eax}" (-> u32),
:
: "edx", "ecx"
);
}
}; };
/// Enum of the ISAs our SIMD operations can be implemented for. The enum is
/// generated at comptime from `entries`: each entry contributes one tag with
/// the entry's name, and the tag's integer value is the entry's index in
/// `entries`. The tags are intentionally coarse-grained (one per possible
/// implementation) rather than a fine-grained set of CPU feature bits.
pub const ISA = isa: {
    var fields: [entries.len]std.builtin.Type.EnumField = undefined;
    for (&fields, entries, 0..) |*field, entry, index| {
        field.* = .{ .name = entry.name, .value = index };
    }

    // Use the smallest integer type that can hold every entry index.
    const Tag = std.math.IntFittingRange(0, entries.len - 1);

    break :isa @Type(.{ .Enum = .{
        .tag_type = Tag,
        .fields = &fields,
        .decls = &.{},
        .is_exhaustive = true,
    } });
};
/// A set of ISAs, implemented as a `std.EnumSet` over `ISA`. This is the
/// type returned by `detect` and accepted by `preferred`.
pub const Set = std.EnumSet(ISA);
/// Report whether `isa` can exist at all on the target being compiled for.
/// The answer comes from the `arch` list of the matching row in `entries`
/// compared against `builtin.cpu.arch`, so it can be evaluated at comptime
/// to keep implementations for impossible architectures out of the build.
/// A `true` result does not mean the ISA is available at runtime; use
/// `detect` for that.
pub fn possible(comptime isa: ISA) bool {
    const wanted = @tagName(isa);
    inline for (entries) |entry| {
        if (std.mem.eql(u8, entry.name, wanted)) {
            // If we have no valid archs then it's always valid.
            if (entry.arch.len == 0) return true;

            for (entry.arch) |candidate| {
                if (candidate == builtin.cpu.arch) return true;
            }
            return false;
        }
    }

    // Every ISA tag is generated from a row in `entries`, so a match must
    // have been found above.
    unreachable;
}
/// Detect all ISAs that are usable on the running CPU.
///
/// The returned set always contains `.scalar`. Comptime knowledge of the
/// target architecture is used to limit runtime probing:
///   - aarch64: `.neon` is inserted unconditionally.
///   - x86 and x86_64: `detectX86` probes the CPU and may insert `.avx2`.
///     Both architectures are handled because `entries` declares avx2 as
///     possible on both; probing only x86_64 would make avx2 undetectable
///     on 32-bit x86 even though `possible(.avx2)` is true there.
///   - anything else: only `.scalar`.
pub fn detect() Set {
    var set: Set = .{};
    set.insert(.scalar);
    switch (builtin.cpu.arch) {
        // Neon is mandatory on aarch64. No runtime checks necessary.
        .aarch64 => set.insert(.neon),
        .x86, .x86_64 => detectX86(&set),
        else => {},
    }

    return set;
}
/// Pick the single ISA to use out of `set`. Candidates are tried in a fixed
/// priority order (avx2, then neon, then scalar) and the first one that is
/// both possible on this target and present in `set` is returned. If none
/// matches, `.scalar` is returned.
pub fn preferred(set: Set) ISA {
    // Highest priority first.
    const priority: []const ISA = &.{ .avx2, .neon, .scalar };

    // Compile-time guard: every ISA tag must appear in the priority list,
    // so adding an entry without ranking it fails the build.
    comptime {
        for (@typeInfo(ISA).Enum.fields) |field| {
            const tag = @field(ISA, field.name);
            assert(std.mem.indexOfScalar(ISA, priority, tag) != null);
        }
    }

    inline for (priority) |candidate| {
        // The comptime check drops candidates that cannot exist on this
        // target; only the remaining ones are tested against the set.
        if (comptime possible(candidate)) {
            if (set.contains(candidate)) return candidate;
        }
    }

    return .scalar;
}
/// Probe the CPU through `cpuid` and `getXCR0` and insert `.avx2` into `set`
/// when every check passes. `set` is left untouched otherwise.
fn detectX86(set: *Set) void {
    // NOTE: this is just some boilerplate to detect AVX2. We can probably
    // support earlier forms of SIMD such as plain SSE, and we can
    // definitely take advantage of later forms. Right now this ONLY
    // detects AVX2.

    // Leaf 0 reports the maximum supported leaf in eax. Below 7 we don't
    // support any AVX instructions.
    const max_leaf = x86_64.cpuid(0, 0).eax;
    if (max_leaf < 7) return;

    // Leaf 1, ecx: bit 27 is treated as xsave and bit 28 as avx. Without
    // both we don't support anything.
    // NOTE(review): Intel documents bit 27 as OSXSAVE (XSAVE enabled by the
    // OS) — confirm the intended naming.
    const feature_bits = x86_64.cpuid(1, 0).ecx;
    if (!(hasBit(feature_bits, 27) and hasBit(feature_bits, 28))) return;

    // We require AVX save state in order to use AVX instructions. Reading
    // XCR0 requires the xsave+avx checks above to have passed.
    const avx_state = x86_64.XCR0_XMM | x86_64.XCR0_YMM;
    if (!hasMask(x86_64.getXCR0(), avx_state)) return;

    // Leaf 7, ebx bit 5: AVX2.
    const extended_bits = x86_64.cpuid(7, 0).ebx;
    if (hasBit(extended_bits, 5)) set.insert(.avx2);
}
/// Check if a bit is set at the given offset /// Check if a bit is set at the given offset
inline fn hasBit(input: u32, offset: u5) bool { pub inline fn hasBit(input: u32, offset: u5) bool {
return (input >> offset) & 1 != 0; return (input >> offset) & 1 != 0;
} }
/// Checks if a mask exactly matches the input /// Checks if a mask exactly matches the input
inline fn hasMask(input: u32, mask: u32) bool { pub inline fn hasMask(input: u32, mask: u32) bool {
return (input & mask) == mask; return (input & mask) == mask;
} }
test "detect" {
    const expect = std.testing.expect;
    const detected = detect();

    // The scalar fallback is inserted on every architecture.
    try expect(detected.contains(.scalar));

    if (builtin.cpu.arch == .aarch64) {
        // Neon is always available on aarch64, and avx2 never is.
        try expect(detected.contains(.neon));
        try expect(!detected.contains(.avx2));
    }
}
// Smoke test: runs `preferred` on whatever `detect` reports for the host.
// The result is discarded, so this only checks that the call compiles and
// returns.
test "preferred" {
_ = preferred(detect());
}
test "possible" {
    const expect = std.testing.expect;

    // scalar has no architecture restriction, so it is always possible
    try expect(possible(.scalar));

    // hardcode some other common realities
    const arch = builtin.cpu.arch;
    if (arch == .aarch64) {
        try expect(possible(.neon));
        try expect(!possible(.avx2));
    } else if (arch == .x86 or arch == .x86_64) {
        try expect(!possible(.neon));
        try expect(possible(.avx2));
    }
}

View File

@ -1,15 +1,17 @@
const std = @import("std"); const std = @import("std");
const isa = @import("isa.zig"); pub const isa = @import("isa.zig");
const index_of = @import("index_of.zig"); const index_of = @import("index_of.zig");
pub usingnamespace isa;
pub usingnamespace index_of; pub usingnamespace index_of;
// const utf8 = @import("utf8.zig");
// pub usingnamespace utf8;
pub fn main() !void { pub fn main() !void {
//std.log.warn("ISA={}", .{isa.ISA.detect()}); //std.log.warn("ISA={}", .{isa.ISA.detect()});
const input = "1234567\x1b1234567\x1b"; const input = "1234567\x1b1234567\x1b";
//const input = "1234567812345678"; //const input = "1234567812345678";
_ = index_of.indexOf(input, 0x1B); std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)});
} }
test { test {

44
src/simd/x86_64.zig Normal file
View File

@ -0,0 +1,44 @@
// Bit masks intended to be tested against the value returned by `getXCR0`.
// NOTE(review): the names suggest the XMM, YMM, mask-register and ZMM state
// bits of XCR0 — confirm the exact bit assignments against the Intel
// Architectures Software Developer's Manual.
pub const XCR0_XMM = 0x02;
pub const XCR0_YMM = 0x04;
pub const XCR0_MASKREG = 0x20;
pub const XCR0_ZMM0_15 = 0x40;
pub const XCR0_ZMM16_31 = 0x80;
/// The four 32-bit register values (eax, ebx, ecx, edx) captured after one
/// `cpuid` instruction. This is the return type of `cpuid`.
pub const CpuidLeaf = packed struct {
eax: u32,
ebx: u32,
ecx: u32,
edx: u32,
};
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
/// and feature information. This is explicitly and specifically only
/// for x86 and x86_64.
///
/// `leaf_id` is loaded into eax and `subid` into ecx before the instruction
/// executes. The values left in eax, ebx, ecx and edx afterwards are
/// returned as a `CpuidLeaf`.
pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
// These are left undefined because each one is bound as an output of the
// asm statement below and is written before it is read.
var eax: u32 = undefined;
var ebx: u32 = undefined;
var ecx: u32 = undefined;
var edx: u32 = undefined;
// Outputs: all four registers. Inputs: leaf in eax, sub-leaf in ecx.
asm volatile ("cpuid"
: [_] "={eax}" (eax),
[_] "={ebx}" (ebx),
[_] "={ecx}" (ecx),
[_] "={edx}" (edx),
: [_] "{eax}" (leaf_id),
[_] "{ecx}" (subid),
);
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
}
/// Read control register 0 (XCR0). Used to detect features such as AVX.
///
/// ecx is zeroed and then `xgetbv` is executed; only the value left in eax
/// is returned. edx and ecx are declared as clobbered. The call site in
/// isa.zig notes that this requires xsave+avx to have been confirmed via
/// `cpuid` first.
pub fn getXCR0() u32 {
return asm volatile (
\\ xor %%ecx, %%ecx
\\ xgetbv
: [_] "={eax}" (-> u32),
:
: "edx", "ecx"
);
}