mirror of
https://github.com/ghostty-org/ghostty.git
synced 2025-07-22 03:36:14 +03:00
src/simd: improve isa detection
This commit is contained in:
@ -1,5 +1,6 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const isa = @import("isa.zig");
|
||||
const aarch64 = @import("aarch64.zig");
|
||||
|
||||
// Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
|
||||
@ -91,7 +92,7 @@ fn testIndexOf(func: *const IndexOf) !void {
|
||||
}
|
||||
|
||||
test "indexOf neon" {
|
||||
// TODO: use ISA detection here
|
||||
if (comptime builtin.cpu.arch != .aarch64) return error.SkipZigTest;
|
||||
try testIndexOf(&indexOfNeon);
|
||||
if (comptime !isa.possible(.neon)) return error.SkipZigTest;
|
||||
const set = isa.detect();
|
||||
if (set.contains(.neon)) try testIndexOf(&indexOfNeon);
|
||||
}
|
||||
|
222
src/simd/isa.zig
222
src/simd/isa.zig
@ -1,26 +1,97 @@
|
||||
const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const assert = std.debug.assert;
|
||||
const x86_64 = @import("x86_64.zig");
|
||||
|
||||
/// Possible instruction set architectures for SIMD operations. These are
|
||||
/// coarse grained and are targeted specifically so we can detect exactly
|
||||
/// what is available to us in Ghostty.
|
||||
pub const ISA = enum {
|
||||
scalar,
|
||||
neon,
|
||||
avx2,
|
||||
/// Raw comptime entry of a possible ISA. The arch is the arch that the
|
||||
/// ISA is even possible on (e.g. neon is only possible on aarch64) but
|
||||
/// the actual ISA may not be available at runtime.
|
||||
const Entry = struct {
|
||||
name: [:0]const u8,
|
||||
arch: []const std.Target.Cpu.Arch = &.{},
|
||||
};
|
||||
|
||||
/// Detect the available ISA at runtime. This will use comptime information
|
||||
/// as well to minimize the number of runtime checks.
|
||||
pub fn detect() ISA {
|
||||
return switch (builtin.cpu.arch) {
|
||||
// Neon is mandatory on aarch64. No runtime checks necessary.
|
||||
.aarch64 => .neon,
|
||||
.x86_64 => detectX86(),
|
||||
else => .scalar,
|
||||
};
|
||||
const entries: []const Entry = &.{
|
||||
.{ .name = "scalar" },
|
||||
.{ .name = "neon", .arch = &.{.aarch64} },
|
||||
.{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
|
||||
};
|
||||
|
||||
/// Enum of possible ISAs for our SIMD operations. Note that these are
|
||||
/// coarse-grained because they match possible implementations rather than
|
||||
/// a fine-grained packed struct of available CPU features.
|
||||
pub const ISA = isa: {
|
||||
const EnumField = std.builtin.Type.EnumField;
|
||||
var fields: [entries.len]EnumField = undefined;
|
||||
for (entries, 0..) |entry, i| {
|
||||
fields[i] = .{ .name = entry.name, .value = i };
|
||||
}
|
||||
|
||||
fn detectX86() ISA {
|
||||
break :isa @Type(.{ .Enum = .{
|
||||
.tag_type = std.math.IntFittingRange(0, entries.len - 1),
|
||||
.fields = &fields,
|
||||
.decls = &.{},
|
||||
.is_exhaustive = true,
|
||||
} });
|
||||
};
|
||||
|
||||
/// A set of ISAs.
|
||||
pub const Set = std.EnumSet(ISA);
|
||||
|
||||
/// Check if the given ISA is possible on the current target. This is
|
||||
/// available at comptime to help prevent invalid architectures from
|
||||
/// being used.
///
/// The ISA is looked up by tag name in `entries`. The result is true when
/// `builtin.cpu.arch` appears in that entry's `arch` list, or when the
/// list is empty (the ISA is not tied to any architecture). A true result
/// only means the ISA can exist on this architecture; it says nothing
/// about whether the running CPU actually provides it.
|
||||
pub fn possible(comptime isa: ISA) bool {
|
||||
inline for (entries) |entry| {
|
||||
if (std.mem.eql(u8, entry.name, @tagName(isa))) {
|
||||
for (entry.arch) |arch| {
|
||||
if (arch == builtin.cpu.arch) return true;
|
||||
}
|
||||
|
||||
// If the entry lists no archs then the ISA is valid on every arch.
// Otherwise the loop above found no match, so it is not possible.
|
||||
return entry.arch.len == 0;
|
||||
}
|
||||
}
|
||||
|
||||
// The ISA enum fields are generated from `entries`, so the name lookup
// above always finds a match and returns before reaching this point.
unreachable;
|
||||
}
|
||||
|
||||
/// Detect all possible ISAs at runtime.
///
/// The returned set always contains `.scalar`. On aarch64 `.neon` is
/// added unconditionally; on x86_64 the set is handed to `detectX86`,
/// which may add further entries. Every other architecture reports
/// `.scalar` only.
///
/// NOTE(review): `entries` marks avx2 as possible on `.x86` as well, but
/// only `.x86_64` is probed here, so a 32-bit x86 target never reports
/// avx2. Confirm that this is intended.
|
||||
pub fn detect() Set {
|
||||
var set: Set = .{};
|
||||
set.insert(.scalar);
|
||||
switch (builtin.cpu.arch) {
|
||||
// Neon is mandatory on aarch64. No runtime checks necessary.
|
||||
.aarch64 => set.insert(.neon),
|
||||
.x86_64 => detectX86(&set),
|
||||
else => {},
|
||||
}
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
/// Returns the preferred ISA to use that is available.
///
/// `set` is the set of ISAs considered available (for example the result
/// of `detect`). The first entry of the fixed preference list that is
/// both possible on the compile target and contained in `set` is
/// returned. If none matches, the result is `.scalar`.
|
||||
pub fn preferred(set: Set) ISA {
|
||||
// Preference order, most preferred first.
const order: []const ISA = &.{ .avx2, .neon, .scalar };
|
||||
|
||||
// Every ISA enum field must appear in `order`. This is checked at
// comptime, so adding an ISA without ranking it here trips the assert
// during compilation.
|
||||
comptime {
|
||||
for (@typeInfo(ISA).Enum.fields) |field| {
|
||||
const v = @field(ISA, field.name);
|
||||
assert(std.mem.indexOfScalar(ISA, order, v) != null);
|
||||
}
|
||||
}
|
||||
|
||||
// The loop is unrolled at comptime; ISAs that are not possible on this
// target are filtered by the comptime condition, so only the remaining
// candidates are tested against `set` at runtime.
inline for (order) |isa| {
|
||||
if (comptime possible(isa)) {
|
||||
if (set.contains(isa)) return isa;
|
||||
}
|
||||
}
|
||||
|
||||
// Nothing in `order` matched `set`; fall back to the scalar path.
return .scalar;
|
||||
}
|
||||
|
||||
fn detectX86(set: *Set) void {
|
||||
// NOTE: this is just some boilerplate to detect AVX2. We
|
||||
// can probably support earlier forms of SIMD such as plain
|
||||
// SSE, and we can definitely take advantage of later forms. This
|
||||
@ -28,85 +99,72 @@ pub const ISA = enum {
|
||||
|
||||
// If we support less than 7 for the maximum leaf level then we
|
||||
// don't support any AVX instructions.
|
||||
var leaf = X86.cpuid(0, 0);
|
||||
if (leaf.eax < 7) return .scalar;
|
||||
var leaf = x86_64.cpuid(0, 0);
|
||||
if (leaf.eax < 7) return;
|
||||
|
||||
// If we don't have xsave or avx, then we don't support anything.
|
||||
leaf = X86.cpuid(1, 0);
|
||||
leaf = x86_64.cpuid(1, 0);
|
||||
const has_xsave = hasBit(leaf.ecx, 27);
|
||||
const has_avx = hasBit(leaf.ecx, 28);
|
||||
if (!has_xsave or !has_avx) return .scalar;
|
||||
if (!has_xsave or !has_avx) return;
|
||||
|
||||
// We require AVX save state in order to use AVX instructions.
|
||||
const xcr0_eax = X86.getXCR0(); // requires xsave+avx
|
||||
const has_avx_save = hasMask(xcr0_eax, X86.XCR0_XMM | X86.XCR0_YMM);
|
||||
if (!has_avx_save) return .scalar;
|
||||
const xcr0_eax = x86_64.getXCR0(); // requires xsave+avx
|
||||
const has_avx_save = hasMask(xcr0_eax, x86_64.XCR0_XMM | x86_64.XCR0_YMM);
|
||||
if (!has_avx_save) return;
|
||||
|
||||
// Check for AVX2.
|
||||
leaf = X86.cpuid(7, 0);
|
||||
leaf = x86_64.cpuid(7, 0);
|
||||
const has_avx2 = hasBit(leaf.ebx, 5);
|
||||
if (has_avx2) return .avx2;
|
||||
|
||||
return .scalar;
|
||||
}
|
||||
};
|
||||
|
||||
/// Constants and functions related to x86 and x86_64. Reference for this
|
||||
/// can be found in the Intel Architectures Software Developer's Manual,
|
||||
/// mostly around the cpuid instruction.
|
||||
const X86 = struct {
|
||||
const XCR0_XMM = 0x02;
|
||||
const XCR0_YMM = 0x04;
|
||||
const XCR0_MASKREG = 0x20;
|
||||
const XCR0_ZMM0_15 = 0x40;
|
||||
const XCR0_ZMM16_31 = 0x80;
|
||||
|
||||
const CpuidLeaf = packed struct {
|
||||
eax: u32,
|
||||
ebx: u32,
|
||||
ecx: u32,
|
||||
edx: u32,
|
||||
};
|
||||
|
||||
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
|
||||
/// and feature information. This is explicitly and specifically only
|
||||
/// for x86 and x86_64.
|
||||
fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
|
||||
var eax: u32 = undefined;
|
||||
var ebx: u32 = undefined;
|
||||
var ecx: u32 = undefined;
|
||||
var edx: u32 = undefined;
|
||||
|
||||
asm volatile ("cpuid"
|
||||
: [_] "={eax}" (eax),
|
||||
[_] "={ebx}" (ebx),
|
||||
[_] "={ecx}" (ecx),
|
||||
[_] "={edx}" (edx),
|
||||
: [_] "{eax}" (leaf_id),
|
||||
[_] "{ecx}" (subid),
|
||||
);
|
||||
|
||||
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
|
||||
}
|
||||
|
||||
// Read control register 0 (XCR0). Used to detect features such as AVX.
|
||||
fn getXCR0() u32 {
|
||||
return asm volatile (
|
||||
\\ xor %%ecx, %%ecx
|
||||
\\ xgetbv
|
||||
: [_] "={eax}" (-> u32),
|
||||
:
|
||||
: "edx", "ecx"
|
||||
);
|
||||
}
|
||||
};
|
||||
if (has_avx2) set.insert(.avx2);
|
||||
}
|
||||
|
||||
/// Check if a bit is set at the given offset
|
||||
inline fn hasBit(input: u32, offset: u5) bool {
|
||||
pub inline fn hasBit(input: u32, offset: u5) bool {
|
||||
return (input >> offset) & 1 != 0;
|
||||
}
|
||||
|
||||
/// Checks if every bit set in the mask is also set in the input
|
||||
inline fn hasMask(input: u32, mask: u32) bool {
|
||||
pub inline fn hasMask(input: u32, mask: u32) bool {
|
||||
return (input & mask) == mask;
|
||||
}
|
||||
|
||||
test "detect" {
|
||||
const testing = std.testing;
|
||||
const set = detect();
|
||||
try testing.expect(set.contains(.scalar));
|
||||
|
||||
switch (builtin.cpu.arch) {
|
||||
.aarch64 => {
|
||||
// Neon is always available on aarch64
|
||||
try testing.expect(set.contains(.neon));
|
||||
try testing.expect(!set.contains(.avx2));
|
||||
},
|
||||
|
||||
else => {},
|
||||
}
|
||||
}
|
||||
|
||||
test "preferred" {
|
||||
_ = preferred(detect());
|
||||
}
|
||||
|
||||
test "possible" {
|
||||
const testing = std.testing;
|
||||
try testing.expect(possible(.scalar)); // always possible
|
||||
|
||||
// hardcode some other common realities
|
||||
switch (builtin.cpu.arch) {
|
||||
.aarch64 => {
|
||||
try testing.expect(possible(.neon));
|
||||
try testing.expect(!possible(.avx2));
|
||||
},
|
||||
|
||||
.x86, .x86_64 => {
|
||||
try testing.expect(!possible(.neon));
|
||||
try testing.expect(possible(.avx2));
|
||||
},
|
||||
|
||||
else => {},
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,17 @@
|
||||
const std = @import("std");
|
||||
|
||||
const isa = @import("isa.zig");
|
||||
pub const isa = @import("isa.zig");
|
||||
const index_of = @import("index_of.zig");
|
||||
pub usingnamespace isa;
|
||||
pub usingnamespace index_of;
|
||||
|
||||
// const utf8 = @import("utf8.zig");
|
||||
// pub usingnamespace utf8;
|
||||
|
||||
pub fn main() !void {
|
||||
//std.log.warn("ISA={}", .{isa.ISA.detect()});
|
||||
const input = "1234567\x1b1234567\x1b";
|
||||
//const input = "1234567812345678";
|
||||
_ = index_of.indexOf(input, 0x1B);
|
||||
std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)});
|
||||
}
|
||||
|
||||
test {
|
||||
|
44
src/simd/x86_64.zig
Normal file
44
src/simd/x86_64.zig
Normal file
@ -0,0 +1,44 @@
|
||||
// Bit masks for the value returned by `getXCR0`. The ISA detection code
// combines them (e.g. `XCR0_XMM | XCR0_YMM`) to test which register state
// is enabled.
// NOTE(review): going by the names, these are the XMM, YMM, opmask and
// ZMM state-component bits of XCR0; confirm the values against the Intel
// SDM before relying on the AVX-512 ones, which nothing visible here uses.
pub const XCR0_XMM = 0x02;
|
||||
pub const XCR0_YMM = 0x04;
|
||||
pub const XCR0_MASKREG = 0x20;
|
||||
pub const XCR0_ZMM0_15 = 0x40;
|
||||
pub const XCR0_ZMM16_31 = 0x80;
|
||||
|
||||
/// The four 32-bit register values produced by a single `cpuid`
/// invocation, named after the register each one was read from.
pub const CpuidLeaf = packed struct {
|
||||
eax: u32,
|
||||
ebx: u32,
|
||||
ecx: u32,
|
||||
edx: u32,
|
||||
};
|
||||
|
||||
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
|
||||
/// and feature information. This is explicitly and specifically only
|
||||
/// for x86 and x86_64.
///
/// `leaf_id` is placed in eax and `subid` in ecx before the instruction
/// executes. The values left in eax, ebx, ecx and edx afterwards are
/// returned unmodified in a `CpuidLeaf`.
|
||||
pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
|
||||
// Left undefined on purpose: each one is written by the asm outputs
// below before it is read.
var eax: u32 = undefined;
|
||||
var ebx: u32 = undefined;
|
||||
var ecx: u32 = undefined;
|
||||
var edx: u32 = undefined;
|
||||
|
||||
// First operand list: outputs, one per result register.
// Second operand list: inputs (leaf in eax, sub-leaf in ecx).
asm volatile ("cpuid"
|
||||
: [_] "={eax}" (eax),
|
||||
[_] "={ebx}" (ebx),
|
||||
[_] "={ecx}" (ecx),
|
||||
[_] "={edx}" (edx),
|
||||
: [_] "{eax}" (leaf_id),
|
||||
[_] "{ecx}" (subid),
|
||||
);
|
||||
|
||||
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
|
||||
}
|
||||
|
||||
/// Read extended control register 0 (XCR0). Used to detect features such
/// as AVX.
///
/// Clears ecx (selecting register index 0) and executes `xgetbv`. Only
/// the value left in eax is returned; edx and ecx are declared as
/// clobbered and their contents are discarded.
///
/// NOTE(review): the call site in the ISA detection code is annotated
/// "requires xsave+avx", so callers are expected to have verified those
/// CPUID bits before calling this. Confirm that no other caller skips
/// the check.
|
||||
pub fn getXCR0() u32 {
|
||||
return asm volatile (
|
||||
\\ xor %%ecx, %%ecx
|
||||
\\ xgetbv
|
||||
: [_] "={eax}" (-> u32),
|
||||
:
|
||||
: "edx", "ecx"
|
||||
);
|
||||
}
|
Reference in New Issue
Block a user