mirror of
https://github.com/ghostty-org/ghostty.git
synced 2025-07-22 11:46:11 +03:00
src/simd: improve isa detection
This commit is contained in:
@ -1,5 +1,6 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const builtin = @import("builtin");
|
const builtin = @import("builtin");
|
||||||
|
const isa = @import("isa.zig");
|
||||||
const aarch64 = @import("aarch64.zig");
|
const aarch64 = @import("aarch64.zig");
|
||||||
|
|
||||||
// Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
|
// Note this is a reimplementation of std.mem.indexOfScalar. The Zig stdlib
|
||||||
@ -91,7 +92,7 @@ fn testIndexOf(func: *const IndexOf) !void {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test "indexOf neon" {
|
test "indexOf neon" {
|
||||||
// TODO: use ISA detection here
|
if (comptime !isa.possible(.neon)) return error.SkipZigTest;
|
||||||
if (comptime builtin.cpu.arch != .aarch64) return error.SkipZigTest;
|
const set = isa.detect();
|
||||||
try testIndexOf(&indexOfNeon);
|
if (set.contains(.neon)) try testIndexOf(&indexOfNeon);
|
||||||
}
|
}
|
||||||
|
222
src/simd/isa.zig
222
src/simd/isa.zig
@ -1,26 +1,97 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const builtin = @import("builtin");
|
const builtin = @import("builtin");
|
||||||
|
const assert = std.debug.assert;
|
||||||
|
const x86_64 = @import("x86_64.zig");
|
||||||
|
|
||||||
/// Possible instruction set architectures for SIMD operations. These are
|
/// Raw comptime entry of a possible ISA. The arch is the arch that the
|
||||||
/// coarse grained and are targeted specifically so we can detect exactly
|
/// ISA is even possible on (e.g. neon is only possible on aarch64) but
|
||||||
/// what is available to us in Ghostty.
|
/// the actual ISA may not be available at runtime.
|
||||||
pub const ISA = enum {
|
const Entry = struct {
|
||||||
scalar,
|
name: [:0]const u8,
|
||||||
neon,
|
arch: []const std.Target.Cpu.Arch = &.{},
|
||||||
avx2,
|
};
|
||||||
|
|
||||||
/// Detect the available ISA at runtime. This will use comptime information
|
const entries: []const Entry = &.{
|
||||||
/// as well to minimize the number of runtime checks.
|
.{ .name = "scalar" },
|
||||||
pub fn detect() ISA {
|
.{ .name = "neon", .arch = &.{.aarch64} },
|
||||||
return switch (builtin.cpu.arch) {
|
.{ .name = "avx2", .arch = &.{ .x86, .x86_64 } },
|
||||||
// Neon is mandatory on aarch64. No runtime checks necessary.
|
};
|
||||||
.aarch64 => .neon,
|
|
||||||
.x86_64 => detectX86(),
|
/// Enum of possible ISAs for our SIMD operations. Note that these are
|
||||||
else => .scalar,
|
/// coarse-grained because they match possible implementations rather than
|
||||||
};
|
/// a fine-grained packed struct of available CPU features.
|
||||||
|
pub const ISA = isa: {
|
||||||
|
const EnumField = std.builtin.Type.EnumField;
|
||||||
|
var fields: [entries.len]EnumField = undefined;
|
||||||
|
for (entries, 0..) |entry, i| {
|
||||||
|
fields[i] = .{ .name = entry.name, .value = i };
|
||||||
}
|
}
|
||||||
|
|
||||||
fn detectX86() ISA {
|
break :isa @Type(.{ .Enum = .{
|
||||||
|
.tag_type = std.math.IntFittingRange(0, entries.len - 1),
|
||||||
|
.fields = &fields,
|
||||||
|
.decls = &.{},
|
||||||
|
.is_exhaustive = true,
|
||||||
|
} });
|
||||||
|
};
|
||||||
|
|
||||||
|
/// A set of ISAs.
|
||||||
|
pub const Set = std.EnumSet(ISA);
|
||||||
|
|
||||||
|
/// Check if the given ISA is possible on the current target. This is
|
||||||
|
/// available at comptime to help prevent invalid architectures from
|
||||||
|
/// being used.
|
||||||
|
pub fn possible(comptime isa: ISA) bool {
|
||||||
|
inline for (entries) |entry| {
|
||||||
|
if (std.mem.eql(u8, entry.name, @tagName(isa))) {
|
||||||
|
for (entry.arch) |arch| {
|
||||||
|
if (arch == builtin.cpu.arch) return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the entry lists no archs then it's valid on every arch.
|
||||||
|
return entry.arch.len == 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unreachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect all possible ISAs at runtime.
|
||||||
|
pub fn detect() Set {
|
||||||
|
var set: Set = .{};
|
||||||
|
set.insert(.scalar);
|
||||||
|
switch (builtin.cpu.arch) {
|
||||||
|
// Neon is mandatory on aarch64. No runtime checks necessary.
|
||||||
|
.aarch64 => set.insert(.neon),
|
||||||
|
.x86_64 => detectX86(&set),
|
||||||
|
else => {},
|
||||||
|
}
|
||||||
|
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the preferred ISA to use that is available.
|
||||||
|
pub fn preferred(set: Set) ISA {
|
||||||
|
const order: []const ISA = &.{ .avx2, .neon, .scalar };
|
||||||
|
|
||||||
|
// We should have all of our ISAs present in order
|
||||||
|
comptime {
|
||||||
|
for (@typeInfo(ISA).Enum.fields) |field| {
|
||||||
|
const v = @field(ISA, field.name);
|
||||||
|
assert(std.mem.indexOfScalar(ISA, order, v) != null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline for (order) |isa| {
|
||||||
|
if (comptime possible(isa)) {
|
||||||
|
if (set.contains(isa)) return isa;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return .scalar;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn detectX86(set: *Set) void {
|
||||||
// NOTE: this is just some boilerplate to detect AVX2. We
|
// NOTE: this is just some boilerplate to detect AVX2. We
|
||||||
// can probably support earlier forms of SIMD such as plain
|
// can probably support earlier forms of SIMD such as plain
|
||||||
// SSE, and we can definitely take advantage of later forms. This
|
// SSE, and we can definitely take advantage of later forms. This
|
||||||
@ -28,85 +99,72 @@ pub const ISA = enum {
|
|||||||
|
|
||||||
// If we support less than 7 for the maximum leaf level then we
|
// If we support less than 7 for the maximum leaf level then we
|
||||||
// don't support any AVX instructions.
|
// don't support any AVX instructions.
|
||||||
var leaf = X86.cpuid(0, 0);
|
var leaf = x86_64.cpuid(0, 0);
|
||||||
if (leaf.eax < 7) return .scalar;
|
if (leaf.eax < 7) return;
|
||||||
|
|
||||||
// If we don't have xsave or avx, then we don't support anything.
|
// If we don't have xsave or avx, then we don't support anything.
|
||||||
leaf = X86.cpuid(1, 0);
|
leaf = x86_64.cpuid(1, 0);
|
||||||
const has_xsave = hasBit(leaf.ecx, 27);
|
const has_xsave = hasBit(leaf.ecx, 27);
|
||||||
const has_avx = hasBit(leaf.ecx, 28);
|
const has_avx = hasBit(leaf.ecx, 28);
|
||||||
if (!has_xsave or !has_avx) return .scalar;
|
if (!has_xsave or !has_avx) return;
|
||||||
|
|
||||||
// We require AVX save state in order to use AVX instructions.
|
// We require AVX save state in order to use AVX instructions.
|
||||||
const xcr0_eax = X86.getXCR0(); // requires xsave+avx
|
const xcr0_eax = x86_64.getXCR0(); // requires xsave+avx
|
||||||
const has_avx_save = hasMask(xcr0_eax, X86.XCR0_XMM | X86.XCR0_YMM);
|
const has_avx_save = hasMask(xcr0_eax, x86_64.XCR0_XMM | x86_64.XCR0_YMM);
|
||||||
if (!has_avx_save) return .scalar;
|
if (!has_avx_save) return;
|
||||||
|
|
||||||
// Check for AVX2.
|
// Check for AVX2.
|
||||||
leaf = X86.cpuid(7, 0);
|
leaf = x86_64.cpuid(7, 0);
|
||||||
const has_avx2 = hasBit(leaf.ebx, 5);
|
const has_avx2 = hasBit(leaf.ebx, 5);
|
||||||
if (has_avx2) return .avx2;
|
if (has_avx2) set.insert(.avx2);
|
||||||
|
}
|
||||||
return .scalar;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Constants and functions related to x86 and x86_64. Reference for this
|
|
||||||
/// can be found in the Intel Architectures Software Developer's Manual,
|
|
||||||
/// mostly around the cpuid instruction.
|
|
||||||
const X86 = struct {
|
|
||||||
const XCR0_XMM = 0x02;
|
|
||||||
const XCR0_YMM = 0x04;
|
|
||||||
const XCR0_MASKREG = 0x20;
|
|
||||||
const XCR0_ZMM0_15 = 0x40;
|
|
||||||
const XCR0_ZMM16_31 = 0x80;
|
|
||||||
|
|
||||||
const CpuidLeaf = packed struct {
|
|
||||||
eax: u32,
|
|
||||||
ebx: u32,
|
|
||||||
ecx: u32,
|
|
||||||
edx: u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
|
|
||||||
/// and feature information. This is explicitly and specifically only
|
|
||||||
/// for x86 and x86_64.
|
|
||||||
fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
|
|
||||||
var eax: u32 = undefined;
|
|
||||||
var ebx: u32 = undefined;
|
|
||||||
var ecx: u32 = undefined;
|
|
||||||
var edx: u32 = undefined;
|
|
||||||
|
|
||||||
asm volatile ("cpuid"
|
|
||||||
: [_] "={eax}" (eax),
|
|
||||||
[_] "={ebx}" (ebx),
|
|
||||||
[_] "={ecx}" (ecx),
|
|
||||||
[_] "={edx}" (edx),
|
|
||||||
: [_] "{eax}" (leaf_id),
|
|
||||||
[_] "{ecx}" (subid),
|
|
||||||
);
|
|
||||||
|
|
||||||
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read control register 0 (XCR0). Used to detect features such as AVX.
|
|
||||||
fn getXCR0() u32 {
|
|
||||||
return asm volatile (
|
|
||||||
\\ xor %%ecx, %%ecx
|
|
||||||
\\ xgetbv
|
|
||||||
: [_] "={eax}" (-> u32),
|
|
||||||
:
|
|
||||||
: "edx", "ecx"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Check if a bit is set at the given offset
|
/// Check if a bit is set at the given offset
|
||||||
inline fn hasBit(input: u32, offset: u5) bool {
|
pub inline fn hasBit(input: u32, offset: u5) bool {
|
||||||
return (input >> offset) & 1 != 0;
|
return (input >> offset) & 1 != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks if every bit set in the mask is also set in the input
|
/// Checks if every bit set in the mask is also set in the input
|
||||||
inline fn hasMask(input: u32, mask: u32) bool {
|
pub inline fn hasMask(input: u32, mask: u32) bool {
|
||||||
return (input & mask) == mask;
|
return (input & mask) == mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test "detect" {
|
||||||
|
const testing = std.testing;
|
||||||
|
const set = detect();
|
||||||
|
try testing.expect(set.contains(.scalar));
|
||||||
|
|
||||||
|
switch (builtin.cpu.arch) {
|
||||||
|
.aarch64 => {
|
||||||
|
// Neon is always available on aarch64
|
||||||
|
try testing.expect(set.contains(.neon));
|
||||||
|
try testing.expect(!set.contains(.avx2));
|
||||||
|
},
|
||||||
|
|
||||||
|
else => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test "preferred" {
|
||||||
|
_ = preferred(detect());
|
||||||
|
}
|
||||||
|
|
||||||
|
test "possible" {
|
||||||
|
const testing = std.testing;
|
||||||
|
try testing.expect(possible(.scalar)); // always possible
|
||||||
|
|
||||||
|
// hardcode some other common realities
|
||||||
|
switch (builtin.cpu.arch) {
|
||||||
|
.aarch64 => {
|
||||||
|
try testing.expect(possible(.neon));
|
||||||
|
try testing.expect(!possible(.avx2));
|
||||||
|
},
|
||||||
|
|
||||||
|
.x86, .x86_64 => {
|
||||||
|
try testing.expect(!possible(.neon));
|
||||||
|
try testing.expect(possible(.avx2));
|
||||||
|
},
|
||||||
|
|
||||||
|
else => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,15 +1,17 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
|
|
||||||
const isa = @import("isa.zig");
|
pub const isa = @import("isa.zig");
|
||||||
const index_of = @import("index_of.zig");
|
const index_of = @import("index_of.zig");
|
||||||
pub usingnamespace isa;
|
|
||||||
pub usingnamespace index_of;
|
pub usingnamespace index_of;
|
||||||
|
|
||||||
|
// const utf8 = @import("utf8.zig");
|
||||||
|
// pub usingnamespace utf8;
|
||||||
|
|
||||||
pub fn main() !void {
|
pub fn main() !void {
|
||||||
//std.log.warn("ISA={}", .{isa.ISA.detect()});
|
//std.log.warn("ISA={}", .{isa.ISA.detect()});
|
||||||
const input = "1234567\x1b1234567\x1b";
|
const input = "1234567\x1b1234567\x1b";
|
||||||
//const input = "1234567812345678";
|
//const input = "1234567812345678";
|
||||||
_ = index_of.indexOf(input, 0x1B);
|
std.log.warn("result={any}", .{index_of.indexOf(input, 0x1B)});
|
||||||
}
|
}
|
||||||
|
|
||||||
test {
|
test {
|
||||||
|
44
src/simd/x86_64.zig
Normal file
44
src/simd/x86_64.zig
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
pub const XCR0_XMM = 0x02;
|
||||||
|
pub const XCR0_YMM = 0x04;
|
||||||
|
pub const XCR0_MASKREG = 0x20;
|
||||||
|
pub const XCR0_ZMM0_15 = 0x40;
|
||||||
|
pub const XCR0_ZMM16_31 = 0x80;
|
||||||
|
|
||||||
|
pub const CpuidLeaf = packed struct {
|
||||||
|
eax: u32,
|
||||||
|
ebx: u32,
|
||||||
|
ecx: u32,
|
||||||
|
edx: u32,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Wrapper around x86 and x86_64 `cpuid` in order to gather processor
|
||||||
|
/// and feature information. This is explicitly and specifically only
|
||||||
|
/// for x86 and x86_64.
|
||||||
|
pub fn cpuid(leaf_id: u32, subid: u32) CpuidLeaf {
|
||||||
|
var eax: u32 = undefined;
|
||||||
|
var ebx: u32 = undefined;
|
||||||
|
var ecx: u32 = undefined;
|
||||||
|
var edx: u32 = undefined;
|
||||||
|
|
||||||
|
asm volatile ("cpuid"
|
||||||
|
: [_] "={eax}" (eax),
|
||||||
|
[_] "={ebx}" (ebx),
|
||||||
|
[_] "={ecx}" (ecx),
|
||||||
|
[_] "={edx}" (edx),
|
||||||
|
: [_] "{eax}" (leaf_id),
|
||||||
|
[_] "{ecx}" (subid),
|
||||||
|
);
|
||||||
|
|
||||||
|
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read extended control register 0 (XCR0). Used to detect features such as AVX.
|
||||||
|
pub fn getXCR0() u32 {
|
||||||
|
return asm volatile (
|
||||||
|
\\ xor %%ecx, %%ecx
|
||||||
|
\\ xgetbv
|
||||||
|
: [_] "={eax}" (-> u32),
|
||||||
|
:
|
||||||
|
: "edx", "ecx"
|
||||||
|
);
|
||||||
|
}
|
Reference in New Issue
Block a user