Merge pull request #1472 from mitchellh/simd

SIMD Beginnings
2025-07-22 11:46:11 +03:00 · 2024-02-05 21:46:32 -08:00
parent 4362eeaedc dc6c52fac1
commit 7256c8e091
32 changed files with 38120 additions and 46 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,182 @@
 ---
 Language: Cpp
 # BasedOnStyle:  Chromium
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveMacros: false
 AlignConsecutiveAssignments: false
 AlignConsecutiveBitFields: false
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlines: Left
 AlignOperands: Align
 AlignTrailingComments: true
 AllowAllArgumentsOnNextLine: true
 AllowAllConstructorInitializersOnNextLine: true
 AllowAllParametersOfDeclarationOnNextLine: false
 AllowShortEnumsOnASingleLine: true
 AllowShortBlocksOnASingleLine: Never
 AllowShortCaseLabelsOnASingleLine: false
 AllowShortFunctionsOnASingleLine: Inline
 AllowShortLambdasOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: Never
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
 BinPackArguments: true
 BinPackParameters: false
 BraceWrapping:
  AfterCaseLabel: false
  AfterClass: false
  AfterControlStatement: Never
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
 BreakBeforeInheritanceComma: false
 BreakInheritanceList: BeforeColon
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
 BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
 ColumnLimit: 80
 CommentPragmas: "^ IWYU pragma:"
 CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: true
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
 DeriveLineEnding: true
 DerivePointerAlignment: false
 DisableFormat: false
 ExperimentalAutoDetectBinPacking: false
 FixNamespaceComments: true
 ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
 IncludeBlocks: Preserve
 IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
    SortPriority: 0
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: "^<.*"
    Priority: 2
    SortPriority: 0
  - Regex: ".*"
    Priority: 3
    SortPriority: 0
 IncludeIsMainRegex: "([-_](test|unittest))?$"
 IncludeIsMainSourceRegex: ""
 IndentCaseLabels: true
 IndentCaseBlocks: false
 IndentGotoLabels: true
 IndentPPDirectives: None
 IndentExternBlock: AfterExternBlock
 IndentWidth: 2
 IndentWrappedFunctionNames: false
 InsertTrailingCommas: None
 JavaScriptQuotes: Leave
 JavaScriptWrapImports: true
 KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ""
 MacroBlockEnd: ""
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 ObjCBinPackProtocolList: Never
 ObjCBlockIndentWidth: 2
 ObjCBreakBeforeNestedBlockParam: true
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: true
 PenaltyBreakAssignment: 2
 PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
 PenaltyBreakString: 1000
 PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Left
 RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - "c++"
      - "C++"
    CanonicalDelimiter: ""
    BasedOnStyle: google
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
      - ParseTestProto
      - ParsePartialTestProto
    CanonicalDelimiter: ""
    BasedOnStyle: google
 ReflowComments: true
 SortIncludes: true
 SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterLogicalNot: false
 SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeCpp11BracedList: false
 SpaceBeforeCtorInitializerColon: true
 SpaceBeforeInheritanceColon: true
 SpaceBeforeParens: ControlStatements
 SpaceBeforeRangeBasedForLoopColon: true
 SpaceInEmptyBlock: false
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 2
 SpacesInAngles: false
 SpacesInConditionalStatement: false
 SpacesInContainerLiterals: true
 SpacesInCStyleCastParentheses: false
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
 SpaceBeforeSquareBrackets: false
 Standard: Auto
 StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
 TabWidth: 8
 UseCRLF: false
 UseTab: Never
 WhitespaceSensitiveMacros:
  - STRINGIZE
  - PP_STRINGIZE
  - BOOST_PP_STRINGIZE
 ---
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,5 @@
 vendor/** linguist-vendored
 website/** linguist-documentation
 pkg/cimgui/vendor/** linguist-vendored
 pkg/simdutf/vendor/** linguist-vendored
 src/terminal/res/** linguist-vendored
--- a/build.zig
+++ b/build.zig
@ -202,7 +202,7 @@ pub fn build(b: *std.Build) !void {
    if (emit_helpgen) try addHelp(b, null, config);
    // Add our benchmarks
-    try benchSteps(b, target, optimize, config, emit_bench);
+    try benchSteps(b, target, config, emit_bench);
    // We only build an exe if we have a runtime set.
    const exe_: ?*std.Build.Step.Compile = if (config.app_runtime != .none) b.addExecutable(.{
@ -925,6 +925,18 @@ fn addDeps(
        .target = target,
        .optimize = optimize,
    });
    const highway_dep = b.dependency("highway", .{
        .target = target,
        .optimize = optimize,
    });
    const simdutf_dep = b.dependency("simdutf", .{
        .target = target,
        .optimize = optimize,
    });
    const utfcpp_dep = b.dependency("utfcpp", .{
        .target = target,
        .optimize = optimize,
    });
    const libpng_dep = b.dependency("libpng", .{
        .target = target,
        .optimize = optimize,
@ -977,6 +989,14 @@ fn addDeps(
    step.addIncludePath(.{ .path = "src/stb" });
    step.addCSourceFiles(.{ .files = &.{"src/stb/stb.c"} });
    // C++ files
    step.linkLibCpp();
    step.addIncludePath(.{ .path = "src" });
    step.addCSourceFiles(.{ .files = &.{
        "src/simd/index_of.cpp",
        "src/simd/vt.cpp",
    } });
    // If we're building a lib we have some different deps
    const lib = step.kind == .lib;
@ -1027,6 +1047,18 @@ fn addDeps(
    step.linkLibrary(glslang_dep.artifact("glslang"));
    try static_libs.append(glslang_dep.artifact("glslang").getEmittedBin());
    // Highway
    step.linkLibrary(highway_dep.artifact("highway"));
    try static_libs.append(highway_dep.artifact("highway").getEmittedBin());
    // simdutf
    step.linkLibrary(simdutf_dep.artifact("simdutf"));
    try static_libs.append(simdutf_dep.artifact("simdutf").getEmittedBin());
    // utfcpp
    step.linkLibrary(utfcpp_dep.artifact("utfcpp"));
    try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin());
    // Spirv-Cross
    step.linkLibrary(spirv_cross_dep.artifact("spirv_cross"));
    try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin());
@ -1231,7 +1263,6 @@ fn buildDocumentation(
 fn benchSteps(
    b: *std.Build,
    target: std.Build.ResolvedTarget,
    optimize: std.builtin.OptimizeMode,
    config: BuildConfig,
    install: bool,
 ) !void {
@ -1259,8 +1290,11 @@ fn benchSteps(
            .name = bin_name,
            .root_source_file = .{ .path = "src/main.zig" },
            .target = target,
-            .optimize = optimize,
+
            // We always want our benchmarks to be in release mode.
            .optimize = .ReleaseFast,
        });
        c_exe.linkLibC();
        if (install) b.installArtifact(c_exe);
        _ = try addDeps(b, c_exe, config: {
            var copy = config;
--- a/build.zig.zon
+++ b/build.zig.zon
@ -31,11 +31,14 @@
        .fontconfig = .{ .path = "./pkg/fontconfig" },
        .freetype = .{ .path = "./pkg/freetype" },
        .harfbuzz = .{ .path = "./pkg/harfbuzz" },
        .highway = .{ .path = "./pkg/highway" },
        .libpng = .{ .path = "./pkg/libpng" },
        .macos = .{ .path = "./pkg/macos" },
        .oniguruma = .{ .path = "./pkg/oniguruma" },
        .opengl = .{ .path = "./pkg/opengl" },
        .pixman = .{ .path = "./pkg/pixman" },
        .simdutf = .{ .path = "./pkg/simdutf" },
        .utfcpp = .{ .path = "./pkg/utfcpp" },
        .zlib = .{ .path = "./pkg/zlib" },
        // Shader translation
--- a/nix/devShell.nix
+++ b/nix/devShell.nix
@ -12,6 +12,7 @@
  parallel,
  pkg-config,
  python3,
  qemu,
  scdoc,
  tracy,
  valgrind,
@ -110,6 +111,10 @@ in
        # by default so we have to include this.
        bashInteractive
        # Used for testing SIMD codegen. This is Linux only because the macOS
        # build only has the qemu-system files.
        qemu
        gdb
        valgrind
        wraptest
--- a/nix/zigCacheHash.nix
+++ b/nix/zigCacheHash.nix
@ -1,3 +1,3 @@
 # This file is auto-generated! check build-support/check-zig-cache-hash.sh for
 # more details.
-"sha256-YXSgZynCPCwahbV4cQx05IrtzOaUxG75715dc+j+8/c="
+"sha256-5QLmMiZFiWFTtEKCPn3ruXo2vkCVU870mPbKmmKqLvs="
--- a/pkg/highway/bridge.cpp
+++ b/pkg/highway/bridge.cpp
@ -0,0 +1,9 @@
 #include <hwy/targets.h>
 #include <stdint.h>
 // C ABI bridge so non-C++ callers (see the extern declaration in main.zig)
 // can read Highway's HWY_SUPPORTED_TARGETS bitfield.
 extern "C" int64_t hwy_supported_targets() {
  const int64_t targets = HWY_SUPPORTED_TARGETS;
  return targets;
 }
--- a/pkg/highway/build.zig
+++ b/pkg/highway/build.zig
@ -0,0 +1,111 @@
 const std = @import("std");
 /// Build script for the Highway package. Produces a static library named
 /// "highway" (the local bridge.cpp plus a set of upstream hwy/*.cc sources),
 /// a Zig module "highway" rooted at main.zig, and a "test" step that runs
 /// main.zig's tests linked against the library.
 pub fn build(b: *std.Build) !void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
    // Upstream Highway sources, resolved through build.zig.zon.
    const upstream = b.dependency("highway", .{});
    const module = b.addModule("highway", .{
        .root_source_file = .{ .path = "main.zig" },
        .target = target,
        .optimize = optimize,
    });
    const lib = b.addStaticLibrary(.{
        .name = "highway",
        .target = target,
        .optimize = optimize,
    });
    lib.linkLibCpp();
    // The upstream root is the include root so `#include <hwy/...>` resolves.
    lib.addIncludePath(upstream.path(""));
    module.addIncludePath(upstream.path(""));
    if (target.result.isDarwin()) {
        // Darwin targets get extra search paths from the apple_sdk package.
        const apple_sdk = @import("apple_sdk");
        try apple_sdk.addPaths(b, &lib.root_module);
        try apple_sdk.addPaths(b, module);
    }
    // Compiler flags shared by bridge.cpp and the upstream sources.
    var flags = std.ArrayList([]const u8).init(b.allocator);
    defer flags.deinit();
    try flags.appendSlice(&.{
        // Avoid changing binaries based on the current time and date.
        "-Wno-builtin-macro-redefined",
        "-D__DATE__=\"redacted\"",
        "-D__TIMESTAMP__=\"redacted\"",
        "-D__TIME__=\"redacted\"",
        // Optimizations
        "-fmerge-all-constants",
        // Warnings
        "-Wall",
        "-Wextra",
        // These are not included in Wall nor Wextra:
        "-Wconversion",
        "-Wsign-conversion",
        "-Wvla",
        "-Wnon-virtual-dtor",
        "-Wfloat-overflow-conversion",
        "-Wfloat-zero-conversion",
        "-Wfor-loop-analysis",
        "-Wgnu-redeclared-enum",
        "-Winfinite-recursion",
        "-Wself-assign",
        "-Wstring-conversion",
        "-Wtautological-overlap-compare",
        "-Wthread-safety-analysis",
        "-Wundefined-func-template",
        "-fno-cxx-exceptions",
        "-fno-slp-vectorize",
        "-fno-vectorize",
    });
    // These two flags are only added for non-Windows targets.
    if (target.result.os.tag != .windows) {
        try flags.appendSlice(&.{
            "-fmath-errno",
            "-fno-exceptions",
        });
    }
    // Local C bridge first, then the upstream translation units.
    lib.addCSourceFiles(.{ .flags = flags.items, .files = &.{"bridge.cpp"} });
    lib.addCSourceFiles(.{
        .dependency = upstream,
        .flags = flags.items,
        .files = &.{
            "hwy/aligned_allocator.cc",
            "hwy/nanobenchmark.cc",
            "hwy/per_target.cc",
            "hwy/print.cc",
            "hwy/targets.cc",
            "hwy/timer.cc",
        },
    });
    // Install the upstream .h files under "hwy/" alongside the library.
    lib.installHeadersDirectoryOptions(.{
        .source_dir = upstream.path("hwy"),
        .install_dir = .header,
        .install_subdir = "hwy",
        .include_extensions = &.{".h"},
    });
    b.installArtifact(lib);
    {
        // Test step: build main.zig's tests against the static library and
        // forward every import registered on the module.
        const test_exe = b.addTest(.{
            .name = "test",
            .root_source_file = .{ .path = "main.zig" },
            .target = target,
            .optimize = optimize,
        });
        test_exe.linkLibrary(lib);
        var it = module.import_table.iterator();
        while (it.next()) |entry| test_exe.root_module.addImport(entry.key_ptr.*, entry.value_ptr.*);
        const tests_run = b.addRunArtifact(test_exe);
        const test_step = b.step("test", "Run tests");
        test_step.dependOn(&tests_run.step);
    }
 }
--- a/pkg/highway/build.zig.zon
+++ b/pkg/highway/build.zig.zon
@ -0,0 +1,13 @@
 .{
    .name = "highway",
    .version = "1.0.7",
    .paths = .{""},
    .dependencies = .{
        .highway = .{
            .url = "https://github.com/google/highway/archive/refs/tags/1.0.7.tar.gz",
            .hash = "122060ea43a9403ad53b4a33e19416c0e9949fb3e175035791bd2b7462091079d5a2",
        },
        .apple_sdk = .{ .path = "../apple-sdk" },
    },
 }
--- a/pkg/highway/main.zig
+++ b/pkg/highway/main.zig
@ -0,0 +1,57 @@
 extern "c" fn hwy_supported_targets() i64;
 /// Bitfield view of the value returned by `hwy_supported_targets`, intended
 /// to mirror the `HWY_*` target bits in Highway's `hwy/detect_targets.h`.
 ///
 /// Fields of a packed struct are laid out from bit 0 upward in declaration
 /// order, so each reserved gap must be exactly as wide as the bit range in
 /// its name; otherwise every later flag is read from the wrong bit.
 pub const Targets = packed struct(i64) {
    // x86_64 (bits 0..14)
    _reserved: u4 = 0, // bits 0..3
    avx3_spr: bool = false, // bit 4
    _reserved_5: u1 = 0,
    avx3_zen4: bool = false, // bit 6
    avx3_dl: bool = false, // bit 7
    avx3: bool = false, // bit 8
    avx2: bool = false, // bit 9
    _reserved_10: u1 = 0,
    sse4: bool = false, // bit 11
    ssse3: bool = false, // bit 12
    _reserved_13: u1 = 0, // SSE3 reserved
    sse2: bool = false, // bit 14
    _reserved_15_23: u9 = 0,
    // aarch64 (bits 24..29)
    sve2_128: bool = false, // bit 24
    sve_256: bool = false, // bit 25
    sve2: bool = false, // bit 26
    sve: bool = false, // bit 27
    neon: bool = false, // bit 28
    neon_without_aes: bool = false, // bit 29
    // Bits 30..36 inclusive are seven bits. This was previously u6, which
    // shifted rvv and everything up to wasm one bit too low.
    _reserved_30_36: u7 = 0,
    // risc-v
    rvv: bool = false, // bit 37
    _reserved_38_46: u9 = 0,
    // IBM Power
    ppc10: bool = false, // bit 47
    ppc9: bool = false, // bit 48
    ppc8: bool = false, // bit 49
    z15: bool = false, // bit 50
    z14: bool = false, // bit 51
    _reserved_52_57: u6 = 0,
    // WebAssembly
    wasm_emu256: bool = false, // bit 58
    wasm: bool = false, // bit 59
    // Only bit 60 is unused here. The old two-bit gap compensated for the
    // missing bit in `_reserved_30_36`.
    _reserved_60: u1 = 0,
    // Emulation
    emu128: bool = false, // bit 61
    scalar: bool = false, // bit 62
    _reserved_63: u1 = 0,
 };
 /// Calls the C bridge and reinterprets the returned 64-bit value as a
 /// `Targets` bitfield.
 pub fn supported_targets() Targets {
    const raw: i64 = hwy_supported_targets();
    return @bitCast(raw);
 }
 test {
    // Smoke test: only verifies that the bridge function links and can be
    // called; the returned bitfield is not inspected.
    _ = supported_targets();
 }
--- a/pkg/simdutf/build.zig
+++ b/pkg/simdutf/build.zig
@ -0,0 +1,54 @@
 const std = @import("std");
 /// Builds the vendored simdutf source (vendor/simdutf.cpp) as a static C++
 /// library named "simdutf" and installs the vendored .h files with it.
 pub fn build(b: *std.Build) !void {
    const resolved_target = b.standardTargetOptions(.{});
    const mode = b.standardOptimizeOption(.{});
    const simdutf = b.addStaticLibrary(.{
        .name = "simdutf",
        .target = resolved_target,
        .optimize = mode,
    });
    simdutf.linkLibCpp();
    simdutf.addIncludePath(.{ .path = "vendor" });
    // Darwin targets get extra search paths from the apple_sdk package.
    if (resolved_target.result.isDarwin()) {
        try @import("apple_sdk").addPaths(b, &simdutf.root_module);
    }
    // No extra compiler flags are set today; the list is the single place to
    // add them.
    var cxx_flags = std.ArrayList([]const u8).init(b.allocator);
    defer cxx_flags.deinit();
    try cxx_flags.appendSlice(&.{});
    simdutf.addCSourceFiles(.{
        .files = &.{"vendor/simdutf.cpp"},
        .flags = cxx_flags.items,
    });
    simdutf.installHeadersDirectoryOptions(.{
        .source_dir = .{ .path = "vendor" },
        .install_dir = .header,
        .install_subdir = "",
        .include_extensions = &.{".h"},
    });
    b.installArtifact(simdutf);
    // NOTE: there is no "test" step here. This package defines no Zig module
    // or main.zig for a test executable to be rooted at.
 }
--- a/pkg/simdutf/build.zig.zon
+++ b/pkg/simdutf/build.zig.zon
@ -0,0 +1,8 @@
 .{
    .name = "simdutf",
    .version = "4.0.9",
    .paths = .{""},
    .dependencies = .{
        .apple_sdk = .{ .path = "../apple-sdk" },
    },
 }
--- a/pkg/simdutf/vendor/simdutf.cpp
+++ b/pkg/simdutf/vendor/simdutf.cpp
--- a/pkg/simdutf/vendor/simdutf.h
+++ b/pkg/simdutf/vendor/simdutf.h
--- a/pkg/utfcpp/build.zig
+++ b/pkg/utfcpp/build.zig
@ -0,0 +1,54 @@
 const std = @import("std");
 /// Packages utfcpp as a static library named "utfcpp". Only the local stub
 /// empty.cc is compiled; the upstream headers under "source" are installed
 /// alongside the artifact.
 pub fn build(b: *std.Build) !void {
    const resolved_target = b.standardTargetOptions(.{});
    const mode = b.standardOptimizeOption(.{});
    const upstream = b.dependency("utfcpp", .{});
    const utfcpp = b.addStaticLibrary(.{
        .name = "utfcpp",
        .target = resolved_target,
        .optimize = mode,
    });
    utfcpp.linkLibCpp();
    utfcpp.addIncludePath(upstream.path(""));
    // Darwin targets get extra search paths from the apple_sdk package.
    if (resolved_target.result.isDarwin()) {
        try @import("apple_sdk").addPaths(b, &utfcpp.root_module);
    }
    // No extra compiler flags are set today; the list is the single place to
    // add them.
    var cxx_flags = std.ArrayList([]const u8).init(b.allocator);
    defer cxx_flags.deinit();
    try cxx_flags.appendSlice(&.{});
    utfcpp.addCSourceFiles(.{
        .files = &.{"empty.cc"},
        .flags = cxx_flags.items,
    });
    utfcpp.installHeadersDirectoryOptions(.{
        .source_dir = upstream.path("source"),
        .install_dir = .header,
        .install_subdir = "",
        .include_extensions = &.{".h"},
    });
    b.installArtifact(utfcpp);
    // NOTE: there is no "test" step here. This package defines no Zig module
    // or main.zig for a test executable to be rooted at.
 }
--- a/pkg/utfcpp/build.zig.zon
+++ b/pkg/utfcpp/build.zig.zon
@ -0,0 +1,13 @@
 .{
    .name = "utfcpp",
    .version = "4.0.5",
    .paths = .{""},
    .dependencies = .{
        .utfcpp = .{
            .url = "https://github.com/nemtrif/utfcpp/archive/refs/tags/v4.0.5.tar.gz",
            .hash = "1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641",
        },
        .apple_sdk = .{ .path = "../apple-sdk" },
    },
 }
--- a/pkg/utfcpp/empty.cc
+++ b/pkg/utfcpp/empty.cc
@ -0,0 +1,2 @@
 // Needed for Zig build to be happy: this is the only source file the utfcpp
 // package's build.zig compiles, so the static library has one object in it.
 // The function is an intentionally empty placeholder.
 void ghostty_utfcpp_stub() {}
--- a/src/bench/stream.sh
+++ b/src/bench/stream.sh
@ -0,0 +1,20 @@
 #!/usr/bin/env bash
 #
 # This is a trivial helper script to help run the stream benchmark.
 # You probably want to tweak this script depending on what you're
 # trying to measure.
 #
 # It compares three bench-stream consumer modes (noop, scalar, simd) with
 # hyperfine, each fed the same generated input truncated to SIZE.
 # Input generator to use: the bench-stream mode is "gen-${DATA}".
 DATA="ascii"
 # Amount of generated input handed to each run (passed to `head -c`).
 SIZE="25M"
 # Extra arguments appended to every consumer invocation. Defined explicitly
 # so an unrelated ARGS variable inherited from the caller's environment
 # cannot leak into the benchmark commands.
 ARGS=""
 # Uncomment to test with an active terminal state.
 #ARGS=" --terminal"
 hyperfine \
  --warmup 10 \
  -n memcpy \
  "./zig-out/bin/bench-stream --mode=gen-${DATA} | head -c ${SIZE} | ./zig-out/bin/bench-stream --mode=noop${ARGS}" \
  -n scalar \
  "./zig-out/bin/bench-stream --mode=gen-${DATA} | head -c ${SIZE} | ./zig-out/bin/bench-stream --mode=scalar${ARGS}" \
  -n simd \
  "./zig-out/bin/bench-stream --mode=gen-${DATA} | head -c ${SIZE} | ./zig-out/bin/bench-stream --mode=simd${ARGS}"
--- a/src/bench/stream.zig
+++ b/src/bench/stream.zig
@ -0,0 +1,215 @@
 //! This benchmark tests the throughput of the VT stream. It has a few
 //! modes in order to test different methods of stream processing. It
 //! provides a "noop" mode to give us the `memcpy` speed.
 //!
 //! This will consume all of the available stdin, so you should run it
 //! with `head` in a pipe to restrict. For example, to test ASCII input:
 //!
 //!   bench-stream --mode=gen-ascii | head -c 50M | bench-stream --mode=simd
 //!
 const std = @import("std");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const ArenaAllocator = std.heap.ArenaAllocator;
 const ziglyph = @import("ziglyph");
 const cli = @import("../cli.zig");
 const terminal = @import("../terminal/main.zig");
 /// Command-line options for the stream benchmark, filled in by the CLI
 /// parser in `main`.
 const Args = struct {
    /// Which generator or benchmark to run.
    mode: Mode = .noop,
    /// Feed the stream into a real terminal instead of a no-op handler.
    /// Expect this to be MUCH slower than the other configurations since
    /// terminal state has to be maintained, but the numbers are more
    /// realistic.
    terminal: bool = false,
    /// Terminal dimensions used when `terminal` is set.
    @"terminal-rows": usize = 80,
    @"terminal-cols": usize = 120,
    /// Read buffer size in bytes. Rarely needs changing; it exists mainly
    /// so the size is runtime-known and the compiler cannot optimize
    /// around a constant.
    @"buffer-size": usize = 4096,
    /// This is set by the CLI parser for deinit.
    _arena: ?ArenaAllocator = null,
    /// Frees the parser's arena (if any) and invalidates `self`.
    pub fn deinit(self: *Args) void {
        if (self._arena) |arena| {
            arena.deinit();
        }
        self.* = undefined;
    }
 };
 // The operation selected with --mode.
 const Mode = enum {
    // Do nothing, just read from stdin into the read buffer (which main
    // heap-allocates with the runtime-known --buffer-size).
    // This is used to benchmark our base-case: it gives us our maximum
    // throughput on a basic read.
    noop,
    // These benchmark the throughput of the terminal stream parsing
    // with and without SIMD. The "simd" option will use whatever is best
    // for the running platform.
    //
    // Note that these run through the full VT parser but do not apply
    // the operations to terminal state unless --terminal is set, so by
    // default there is no terminal state overhead.
    scalar,
    simd,
    // Generate an infinite stream of random printable ASCII characters.
    @"gen-ascii",
    // Generate an infinite stream of repeated UTF-8 characters. We don't
    // currently do random generation because trivial implementations are
    // too slow and I'm a simple man.
    @"gen-utf8",
 };
 // Standard-library options for this entrypoint: sets the log level to
 // debug (benchNoop reports its byte total through std.log.info).
 pub const std_options = struct {
    pub const log_level: std.log.Level = .debug;
 };
 /// Entry point of the stream benchmark: parses the CLI flags into `Args`,
 /// then either generates test data on stdout or consumes stdin with the
 /// selected benchmark mode.
 pub fn main() !void {
    // We want to use the c allocator because it is much faster than GPA.
    const alloc = std.heap.c_allocator;
    // Parse our args
    var args: Args = .{};
    defer args.deinit();
    {
        var iter = try std.process.argsWithAllocator(alloc);
        defer iter.deinit();
        try cli.args.parse(Args, alloc, &args, &iter);
    }
    const reader = std.io.getStdIn().reader();
    const writer = std.io.getStdOut().writer();
    // The read buffer is sized at runtime, so it lives on the heap; release
    // it on every exit path like the other resources above.
    const buf = try alloc.alloc(u8, args.@"buffer-size");
    defer alloc.free(buf);
    // Handle the modes that do not depend on terminal state first.
    switch (args.mode) {
        .@"gen-ascii" => try genAscii(writer),
        .@"gen-utf8" => try genUtf8(writer),
        .noop => try benchNoop(reader, buf),
        // Handle the ones that depend on terminal state next
        inline .scalar,
        .simd,
        => |tag| {
            if (args.terminal) {
                const TerminalStream = terminal.Stream(*TerminalHandler);
                var t = try terminal.Terminal.init(
                    alloc,
                    args.@"terminal-cols",
                    args.@"terminal-rows",
                );
                // Tear the terminal down once the benchmark has finished.
                defer t.deinit(alloc);
                var handler: TerminalHandler = .{ .t = &t };
                var stream: TerminalStream = .{ .handler = &handler };
                // `tag` is comptime-known because of the inline prong, so
                // only the matching call is compiled.
                switch (tag) {
                    .scalar => try benchScalar(reader, &stream, buf),
                    .simd => try benchSimd(reader, &stream, buf),
                    else => @compileError("missing case"),
                }
            } else {
                var stream: terminal.Stream(NoopHandler) = .{ .handler = .{} };
                switch (tag) {
                    .scalar => try benchScalar(reader, &stream, buf),
                    .simd => try benchSimd(reader, &stream, buf),
                    else => @compileError("missing case"),
                }
            }
        },
    }
 }
 /// Writes an endless stream of random printable ASCII characters to
 /// `writer`. The alphabet contains no control characters.
 fn genAscii(writer: anytype) !void {
    try genData(
        writer,
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+-=[]{}|;':\\\",./<>?`~",
    );
 }
 /// Writes an endless stream of bytes drawn at random from `alphabet` to
 /// `writer`, 1024 bytes per write. Returns normally once the writer reports
 /// a broken pipe; any other write error is propagated.
 fn genData(writer: anytype, alphabet: []const u8) !void {
    // Fixed seed: the generated stream is the same on every run.
    var rng_state = std.rand.DefaultPrng.init(0x12345678);
    const random = rng_state.random();
    var chunk: [1024]u8 = undefined;
    while (true) {
        for (0..chunk.len) |i| {
            const pick = random.uintLessThanBiased(usize, alphabet.len);
            chunk[i] = alphabet[pick];
        }
        writer.writeAll(&chunk) catch |err| {
            if (err == error.BrokenPipe) return; // stdout closed
            return err;
        };
    }
 }
 /// Writes the `random_utf8` sample to `writer` over and over. Returns
 /// normally once the writer reports a broken pipe; any other write error is
 /// propagated.
 fn genUtf8(writer: anytype) !void {
    while (true) {
        writer.writeAll(random_utf8) catch |err| {
            if (err == error.BrokenPipe) return; // stdout closed
            return err;
        };
    }
 }
 /// Baseline benchmark: drains `reader` through `buf` without looking at the
 /// data, then logs the number of bytes read.
 noinline fn benchNoop(reader: anytype, buf: []u8) !void {
    var bytes_seen: usize = 0;
    // readAll returning zero bytes means the input is exhausted.
    var n = try reader.readAll(buf);
    while (n != 0) : (n = try reader.readAll(buf)) {
        bytes_seen += n;
    }
    std.log.info("total bytes len={}", .{bytes_seen});
 }
 /// Scalar benchmark: reads `reader` to exhaustion through `buf` and hands
 /// every byte to `stream.next` one at a time.
 noinline fn benchScalar(
    reader: anytype,
    stream: anytype,
    buf: []u8,
 ) !void {
    var n = try reader.read(buf);
    while (n != 0) : (n = try reader.read(buf)) {
        // Calling stream.next per byte is the naive scalar approach.
        const chunk = buf[0..n];
        for (chunk) |byte| {
            try stream.next(byte);
        }
    }
 }
 /// Slice benchmark: reads `reader` to exhaustion through `buf` and hands
 /// each filled portion of the buffer to `stream.nextSlice` in one call.
 noinline fn benchSimd(
    reader: anytype,
    stream: anytype,
    buf: []u8,
 ) !void {
    var n = try reader.read(buf);
    while (n != 0) : (n = try reader.read(buf)) {
        const chunk = buf[0..n];
        try stream.nextSlice(chunk);
    }
 }
 /// Stream handler that discards every printed codepoint, so the benchmark
 /// does no terminal-state work.
 const NoopHandler = struct {
    pub fn print(_: NoopHandler, _: u21) !void {}
 };
 /// Stream handler that forwards every printed codepoint to a real terminal.
 const TerminalHandler = struct {
    /// The terminal that receives the output; not owned by the handler.
    t: *terminal.Terminal,
    pub fn print(self: *TerminalHandler, cp: u21) !void {
        return self.t.print(cp);
    }
 };
 /// Offline-generated random UTF-8 bytes, because generating them at runtime
 /// was too slow for our benchmarks. We should replace this if we can come
 /// up with something that doesn't bottleneck our benchmark.
 const random_utf8 = "⨴⭬∎⯀Ⳟ⳨⍈♍⒄⣹⇚ⱎ⯡⯴↩ⵆ⼳ⶦ⑑⦥➍Ⲡ⽉❞⹀⢧€⣁ⶐ⸲⣷⏝⣶⫿▝⨽⬃ↁ↵⯙ⶵ╡∾⭡′⫼↼┫⮡ↅ⍞‡▱⺁⿒⽛⎭☜Ⱝ⣘✬⢟⁴⟹⪝ℌ❓␆╣┳⽑⴩⺄✽ⳗ␮ⵍ⦵ⱍ⭑⛒ⅉ⛠➌₯ⵔⷋ⹶❷ⱳ⣖⭐⮋ₒ⥚ⷃ╶⌈⸣❥⑎⦿⪶₮╋⅌ⳬⴛ⥚♇╬❜⺷⡬⏠⧥┺⃻❼⏲↍Ⓙ⽕╶⾉⺪⁑⎕⅕⼧⊀ⲡ⊺⪭⟾Ⅵ⍌⛄⠻⃽⣻₮ⰹⴺ⪂⃾∖⊹⤔⵫⦒⽳⫄⍮↷⣌⩐⨼⯂⵺◺⍙⭺⟂⎯ⱼ⴬⫺⹦∌⡉ⳅ⛲⡏⃘⺃⵬ⴜ⾩⭦ⷭ⨟☌⍃⧪⮧ⓛ⃄♮ⲓ∘⣝⤐⎭ⷺⰫⶔ☎⾨⾐≦␢⋔⢟ⶐ⏁⚄⦡⾞✊⾾⫿⴩⪨⮰ⓙ⌽⭲⫬⒈⊻⸣⌳⋡ⱄⲛ⓬➼⌧⟮⹖♞ℚⷱ⭥⚣⏳⟾❠☏⦻⑽−∪ⅆ☁⿑⦣⵽Ⱳ⺧⺊Ⓞ⫽⦀⃐⚽⎌⥰⚪⢌⛗⸋⛂⾽Ⰳ⍧⛗◁❠↺≍‸ⴣ⭰‾⡸⩛⭷ⵒ⵼⚉❚⨳⑫⹾⷟∇┬⚌⨙╘ℹ⢱⏴∸⴨⾀⌟⡄⺣⦦ⱏ⼚⿇├⌮⸿⯔₮—⥟╖◡⻵ⶕ┧⒞⏖⏧⟀❲➚‏➳Ⰼ┸⬖⸓⁃⹚⫣┭↜〈☶≍☨╟⿹ⳙ⺽⸡⵵⛞⚟⯓⥟┞⿄⮖⃫⭒⠤ⓣ⬱⃅⓼ⱒ⥖✜⛘⠶ⰽ⿉⾣➌⣋⚨⒯◱⢃◔ⱕ⫡⓱⅌Ⱨ⧵⯾┰⁠ⱌ⼳♠⨽⪢⸳⠹⩡Ⓨ⡪⭞⼰⡧ⓖ⤘⽶⵶ⴺ ⨨▅⏟⊕ⴡⴰ␌⚯⦀⫭⨔⬯⨢ⱽ⟓⥫⑤⊘⟧❐▜⵸℅⋣⚏⇭⽁⪂ⲡ⯊⦥⭳⠾⹫⠮℞⒡Ⰼ⦈⭅≉⋆☈▓⺑⡻▷Ⱑ⋖⬜┃ⵍ←⣢ↁ☚⟴⦡⨍⼡◝⯤❓◢⌡⏿⭲✏⎑⧊⼤⪠⋂⚜┯▤⑘⟾⬬Ⓜ⨸⥪ⱘ⳷⷟⒖⋐⡈⏌∠⏁⓳Ⲟ⦽⢯┏Ⲹ⍰ⅹ⚏⍐⟍⣩␖⛂∜❆⤗⒨⓽";
--- a/src/build_config.zig
+++ b/src/build_config.zig
@ -139,4 +139,5 @@ pub const ExeEntrypoint = enum {
    mdgen_ghostty_1,
    mdgen_ghostty_5,
    bench_parser,
    bench_stream,
 };
--- a/src/main.zig
+++ b/src/main.zig
@ -7,4 +7,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) {
    .mdgen_ghostty_1 => @import("build/mdgen/main_ghostty_1.zig"),
    .mdgen_ghostty_5 => @import("build/mdgen/main_ghostty_5.zig"),
    .bench_parser => @import("bench/parser.zig"),
    .bench_stream => @import("bench/stream.zig"),
 };
--- a/src/main_ghostty.zig
+++ b/src/main_ghostty.zig
@ -307,6 +307,7 @@ test {
    _ = @import("inspector/main.zig");
    _ = @import("terminal/main.zig");
    _ = @import("terminfo/main.zig");
    _ = @import("simd/main.zig");
    // TODO
    _ = @import("blocking_queue.zig");
--- a/src/simd/index_of.cpp
+++ b/src/simd/index_of.cpp
@ -0,0 +1,53 @@
 // Generates code for every target that this compiler can support.
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "simd/index_of.cpp"  // this file
 #include <hwy/foreach_target.h>                 // must come before highway.h
 #include <hwy/highway.h>
 #include <simd/index_of.h>
 #include <optional>
 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 size_t IndexOf(const uint8_t needle,
               const uint8_t* HWY_RESTRICT input,
               size_t count) {
  const hn::ScalableTag<uint8_t> d;
  return IndexOfImpl(d, needle, input, count);
 }
 }  // namespace HWY_NAMESPACE
 }  // namespace ghostty
 HWY_AFTER_NAMESPACE();
 // HWY_ONCE is true for only one of the target passes
 #if HWY_ONCE
 namespace ghostty {
 // This macro declares a static array used for dynamic dispatch.
 HWY_EXPORT(IndexOf);
 // Public entry point: forwards to the per-target IndexOf selected by
 // Highway's dynamic dispatch. Returns `count` when `needle` is not found
 // (see IndexOfImpl in index_of.h).
 size_t IndexOf(const uint8_t needle,
               const uint8_t* HWY_RESTRICT input,
               size_t count) {
  return HWY_DYNAMIC_DISPATCH(IndexOf)(needle, input, count);
 }
 }  // namespace ghostty
 // C ABI wrapper around ghostty::IndexOf; this is the symbol declared as an
 // extern in src/simd/index_of.zig.
 extern "C" size_t ghostty_simd_index_of(const uint8_t needle,
                                        const uint8_t* HWY_RESTRICT input,
                                        size_t count) {
  const size_t index = ghostty::IndexOf(needle, input, count);
  return index;
 }
 #endif  // HWY_ONCE
--- a/src/simd/index_of.h
+++ b/src/simd/index_of.h
@ -0,0 +1,96 @@
 #if defined(GHOSTTY_SIMD_INDEX_OF_H_) == defined(HWY_TARGET_TOGGLE)
 #ifdef GHOSTTY_SIMD_INDEX_OF_H_
 #undef GHOSTTY_SIMD_INDEX_OF_H_
 #else
 #define GHOSTTY_SIMD_INDEX_OF_H_
 #endif
 #include <hwy/highway.h>
 #include <optional>
 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 // Return the index of the first occurrence of `needle` in `input`, where
 // the input and needle are already loaded into vectors.
 template <class D, typename T = hn::TFromD<D>>
 std::optional<size_t> IndexOfChunk(D d,
                                   hn::Vec<D> needle_vec,
                                   hn::Vec<D> input_vec) {
  // Compare the input vector with the needle vector. This produces
  // a vector where each lane is 0xFF if the corresponding lane in
  // `input_vec` is equal to the corresponding lane in `needle_vec`.
  const hn::Mask<D> eq_mask = hn::Eq(needle_vec, input_vec);
  // Find the index within the vector where the first true value is.
  const intptr_t pos = hn::FindFirstTrue(d, eq_mask);
  // If we found a match, return the index into the input.
  if (pos >= 0) {
    return std::optional<size_t>(static_cast<size_t>(pos));
  } else {
    return std::nullopt;
  }
 }
 // Return the index of the first occurrence of `needle` in `input` or
 // `count` if not found.
 template <class D, typename T = hn::TFromD<D>>
 size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) {
  // Note: due to the simplicity of this operation and the general complexity
  // of SIMD, I'm going to overly comment this function to help explain the
  // implementation for future maintainers.
  // The number of lanes in the vector type.
  const size_t N = hn::Lanes(d);
  // Create a vector with all lanes set to `needle` so we can do a lane-wise
  // comparison with the input.
  const hn::Vec<D> needle_vec = Set(d, needle);
  // Compare N elements at a time.
  size_t i = 0;
  for (; i + N <= count; i += N) {
    // Load the N elements from our input into a vector and check the chunk.
    const hn::Vec<D> input_vec = hn::LoadU(d, input + i);
    if (auto pos = IndexOfChunk(d, needle_vec, input_vec)) {
      return i + pos.value();
    }
  }
  // Since we compare N elements at a time, we may have some elements left
  // if count modulo N != 0. We need to scan the remaining elements. To
  // be simple, we search one element at a time.
  if (i != count) {
    // Create a new vector with only one relevant lane.
    const hn::CappedTag<T, 1> d1;
    using D1 = decltype(d1);
    // Get an equally sized needle vector with only one lane.
    const hn::Vec<D1> needle1 = Set(d1, GetLane(needle_vec));
    // Go through the remaining elements and do similar logic to
    // the previous loop to find any matches.
    for (; i < count; ++i) {
      const hn::Vec<D1> input_vec = hn::LoadU(d1, input + i);
      const hn::Mask<D1> eq_mask = hn::Eq(needle1, input_vec);
      if (hn::AllTrue(d1, eq_mask))
        return i;
    }
  }
  return count;
 }
 size_t IndexOf(const uint8_t needle,
               const uint8_t* HWY_RESTRICT input,
               size_t count);
 }  // namespace HWY_NAMESPACE
 }  // namespace ghostty
 HWY_AFTER_NAMESPACE();
 #endif  // GHOSTTY_SIMD_INDEX_OF_H_
--- a/src/simd/index_of.zig
+++ b/src/simd/index_of.zig
@ -0,0 +1,27 @@
 const std = @import("std");
 const builtin = @import("builtin");
 /// C-ABI search routine implemented outside of this file.
 ///
 /// Takes the byte to look for, a pointer to the bytes to search and the
 /// number of bytes available behind that pointer. `indexOf` below treats a
 /// return value equal to `count` as "not found".
 extern "c" fn ghostty_simd_index_of(
     needle: u8,
     input: [*]const u8,
     count: usize,
 ) usize;
 /// Returns the index of `needle` within `input`, or null when it does not
 /// occur. The search itself is delegated to `ghostty_simd_index_of`, which
 /// signals "not found" by returning the length of the input.
 pub fn indexOf(input: []const u8, needle: u8) ?usize {
     const idx = ghostty_simd_index_of(needle, input.ptr, input.len);
     if (idx == input.len) return null;
     return idx;
 }
 // Exercises `indexOf` with a miss, a hit in a short string, and hits in
 // longer multi-line inputs.
 test "indexOf" {
     const testing = std.testing;
     // No space present: the wrapper must translate "not found" to null.
     try testing.expect(indexOf("hello", ' ') == null);
     // Short input with a single space at index 2.
     try testing.expectEqual(@as(usize, 2), indexOf("hi lo", ' ').?);
     // Several spaces present; the expected result is the earliest one.
     try testing.expectEqual(@as(usize, 5), indexOf(
         \\XXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         \\XXXXXXXXXXXX XXXXXXXXXXX XXXXXXXXXXXXXXX
     , ' ').?);
     // First line has no space, so the match lies on the second line of the
     // literal (the newline joining the lines counts as one byte).
     try testing.expectEqual(@as(usize, 53), indexOf(
         \\XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         \\XXXXXXXXXXXX XXXXXXXXXXX XXXXXXXXXXXXXXX
     , ' ').?);
 }
--- a/src/simd/main.zig
+++ b/src/simd/main.zig
@ -0,0 +1,8 @@
 const std = @import("std");
 pub const index_of = @import("index_of.zig");
 pub const vt = @import("vt.zig");
 // Reference every declaration of this file (including the imported
 // `index_of` and `vt` modules) so they are analyzed as part of the test
 // build.
 test {
     @import("std").testing.refAllDecls(@This());
 }
--- a/src/simd/vt.cpp
+++ b/src/simd/vt.cpp
@ -0,0 +1,169 @@
 // Generates code for every target that this compiler can support.
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "simd/vt.cpp"  // this file
 #include <hwy/foreach_target.h>           // must come before highway.h
 #include <hwy/highway.h>
 #include <simdutf.h>
 #include <utf8.h>
 #include <vector>
 #include <simd/index_of.h>
 #include <simd/vt.h>
 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 using T = uint8_t;
 // Decodes `count` bytes of UTF-8 from `input` into UTF-32 codepoints stored
 // in `output` and returns how many codepoints were written. The caller is
 // responsible for `output` being large enough.
 //
 // Malformed sequences do not abort decoding: they are replaced by U+FFFD
 // and decoding continues, so the whole input is always consumed.
 size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
                   size_t count,
                   char32_t* output) {
  // It's possible for our input to be empty since DecodeUTF8UntilControlSeq
  // doesn't check for this.
  if (count == 0) {
    return 0;
  }

  // Fast path: optimistically convert assuming the input is valid.
  const char* const bytes = reinterpret_cast<const char*>(input);
  const size_t fast_count =
      simdutf::convert_utf8_to_utf32(bytes, count, output);
  if (fast_count != 0) {
    return fast_count;
  }

  // Slow path: a zero result for non-empty input is treated as "the input
  // contains errors". simdutf has no decode-with-replacement API
  // (https://github.com/simdutf/simdutf/issues/147), so a second library is
  // used to produce a sanitized copy in which every invalid sequence has
  // been swapped for U+FFFD. This allocates on the heap and is much slower
  // than the fast path; it exists so that there is something that works
  // and is meant to be replaced.
  std::vector<char> sanitized;
  utf8::replace_invalid(input, input + count, std::back_inserter(sanitized),
                        0xFFFD);

  // Decode the sanitized copy through the same entry point.
  const uint8_t* const sanitized_bytes =
      reinterpret_cast<const uint8_t*>(sanitized.data());
  return DecodeUTF8(sanitized_bytes, sanitized.size(), output);
 }
 /// Decode the UTF-8 text in input into output until an escape
 /// character is found. This returns the number of bytes consumed
 /// from input and writes the number of decoded characters into
 /// output_count.
 ///
 /// This may return a value less than count even with no escape
 /// character if the input ends with an incomplete UTF-8 sequence.
 /// The caller should check the next byte manually to determine
 /// if it is incomplete.
 template <class D>
 size_t DecodeUTF8UntilControlSeqImpl(D d,
                                     const T* HWY_RESTRICT input,
                                     size_t count,
                                     char32_t* output,
                                     size_t* output_count) {
  const size_t N = hn::Lanes(d);
  // Create a vector containing ESC since that denotes a control sequence.
  const hn::Vec<D> esc_vec = Set(d, 0x1B);
  // Compare N elements at a time.
  size_t i = 0;
  for (; i + N <= count; i += N) {
    // Load the N elements from our input into a vector.
    const hn::Vec<D> input_vec = hn::LoadU(d, input + i);
    // If we don't have any escapes we keep going. We want to accumulate
    // the largest possible valid UTF-8 sequence before decoding.
    // TODO(mitchellh): benchmark this vs decoding every time
    const auto esc_idx = IndexOfChunk(d, esc_vec, input_vec);
    if (!esc_idx) {
      continue;
    }
    // We have an ESC char, decode up to this point. We start by assuming
    // a valid UTF-8 sequence and slow-path into error handling if we find
    // an invalid sequence.
    *output_count = DecodeUTF8(input, i + esc_idx.value(), output);
    return i + esc_idx.value();
  }
  // If we have leftover input then we decode it one byte at a time (slow!)
  // using pretty much the same logic as above.
  if (i != count) {
    const hn::CappedTag<T, 1> d1;
    using D1 = decltype(d1);
    const hn::Vec<D1> esc1 = Set(d1, GetLane(esc_vec));
    for (; i < count; ++i) {
      const hn::Vec<D1> input_vec = hn::LoadU(d1, input + i);
      const auto esc_idx = IndexOfChunk(d1, esc1, input_vec);
      if (!esc_idx) {
        continue;
      }
      *output_count = DecodeUTF8(input, i + esc_idx.value(), output);
      return i + esc_idx.value();
    }
  }
  // If we reached this point, its possible for our input to have an
  // incomplete sequence because we're consuming the full input. We need
  // to trim any incomplete sequences from the end of the input.
  const size_t trimmed_len =
      simdutf::trim_partial_utf8(reinterpret_cast<const char*>(input), i);
  *output_count = DecodeUTF8(input, trimmed_len, output);
  return trimmed_len;
 }
 size_t DecodeUTF8UntilControlSeq(const uint8_t* HWY_RESTRICT input,
                                 size_t count,
                                 char32_t* output,
                                 size_t* output_count) {
  const hn::ScalableTag<uint8_t> d;
  return DecodeUTF8UntilControlSeqImpl(d, input, count, output, output_count);
 }
 }  // namespace HWY_NAMESPACE
 }  // namespace ghostty
 HWY_AFTER_NAMESPACE();
 // HWY_ONCE is true for only one of the target passes
 #if HWY_ONCE
 namespace ghostty {
 // NOTE(review): per Highway's dynamic-dispatch convention this makes the
 // per-target versions of DecodeUTF8UntilControlSeq available to
 // HWY_DYNAMIC_DISPATCH below -- confirm against the Highway documentation.
 HWY_EXPORT(DecodeUTF8UntilControlSeq);
 // Target-independent entry point. Forwards all arguments unchanged to the
 // implementation selected by HWY_DYNAMIC_DISPATCH and returns its result
 // (bytes consumed); the decoded codepoint count is written through
 // `output_count` by the callee.
 size_t DecodeUTF8UntilControlSeq(const uint8_t* HWY_RESTRICT input,
                                  size_t count,
                                  char32_t* output,
                                  size_t* output_count) {
  return HWY_DYNAMIC_DISPATCH(DecodeUTF8UntilControlSeq)(input, count, output,
                                                         output_count);
 }
 }  // namespace ghostty
 extern "C" {
 // C-linkage wrapper around ghostty::DecodeUTF8UntilControlSeq so the
 // function can be called from non-C++ code. Arguments and return value are
 // passed through unchanged.
 size_t ghostty_simd_decode_utf8_until_control_seq(const uint8_t* HWY_RESTRICT
                                                       input,
                                                   size_t count,
                                                   char32_t* output,
                                                   size_t* output_count) {
  return ghostty::DecodeUTF8UntilControlSeq(input, count, output, output_count);
 }
 }  // extern "C"
 #endif  // HWY_ONCE
--- a/src/simd/vt.h
+++ b/src/simd/vt.h
@ -0,0 +1,30 @@
 #if defined(GHOSTTY_SIMD_VT_H_) == defined(HWY_TARGET_TOGGLE)
 #ifdef GHOSTTY_SIMD_VT_H_
 #undef GHOSTTY_SIMD_VT_H_
 #else
 #define GHOSTTY_SIMD_VT_H_
 #endif
 #include <hwy/highway.h>
 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 }  // namespace HWY_NAMESPACE
 }  // namespace ghostty
 HWY_AFTER_NAMESPACE();
 #if HWY_ONCE
 namespace ghostty {
 // Pointer to a function that receives a run of `count` char32_t values
 // starting at `chars` and returns nothing.
 // NOTE(review): not referenced anywhere in the visible part of this
 // change -- confirm whether it is still needed.
 typedef void (*PrintFunc)(const char32_t* chars, size_t count);
 }  // namespace ghostty
 #endif  // HWY_ONCE
 #endif  // GHOSTTY_SIMD_VT_H_
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@ -0,0 +1,121 @@
 const std = @import("std");
 // vt.cpp
 //
 // C-ABI decoder: reads up to `count` bytes from `input`, writes decoded
 // codepoints to `output`, stores the number of codepoints written in
 // `output_count` and returns the number of input bytes consumed. No output
 // length is passed, so the caller must provide a large enough buffer.
 extern "c" fn ghostty_simd_decode_utf8_until_control_seq(
     input: [*]const u8,
     count: usize,
     output: [*]u32,
     output_count: *usize,
 ) usize;
 /// Outcome of a single `utf8DecodeUntilControlSeq` call.
 const DecodeResult = struct {
     /// Number of input bytes consumed (the C function's return value).
     consumed: usize,
     /// Number of codepoints written to the output buffer.
     decoded: usize,
 };
 /// Decodes UTF-8 from `input` into `output` via the C implementation and
 /// reports how many bytes were consumed and how many codepoints were
 /// produced.
 ///
 /// Only `output.ptr` is handed to the C side, so `output` must be large
 /// enough for everything that may be decoded from `input`.
 pub fn utf8DecodeUntilControlSeq(
     input: []const u8,
     output: []u32,
 ) DecodeResult {
     var result: DecodeResult = .{ .consumed = 0, .decoded = 0 };
     // The callee fills in `decoded` through the pointer and hands back the
     // consumed byte count as its return value.
     result.consumed = ghostty_simd_decode_utf8_until_control_seq(
         input.ptr,
         input.len,
         output.ptr,
         &result.decoded,
     );
     return result;
 }
 // Pure ASCII input without any ESC byte: every byte is consumed and each
 // one yields exactly one codepoint.
 test "decode no escape" {
     const testing = std.testing;
     var output: [1024]u32 = undefined;
     // TODO: many more test cases
     {
         // 640 bytes of ASCII, well within the 1024-entry output buffer.
         const str = "hello" ** 128;
         try testing.expectEqual(DecodeResult{
             .consumed = str.len,
             .decoded = str.len,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
 }
 // ASCII text followed by ESC and more text: decoding must stop right in
 // front of the ESC, leaving it and everything after it unconsumed.
 test "decode ASCII to escape" {
     const testing = std.testing;
     var output: [1024]u32 = undefined;
     // TODO: many more test cases
     {
         const prefix = "hello" ** 64;
         const str = prefix ++ "\x1b" ++ ("world" ** 64);
         // Only the prefix counts as consumed/decoded.
         try testing.expectEqual(DecodeResult{
             .consumed = prefix.len,
             .decoded = prefix.len,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
 }
 // Input that begins with ESC: nothing precedes the escape, so nothing is
 // consumed and nothing is decoded.
 test "decode immediate esc sequence" {
     const testing = std.testing;
     var output: [64]u32 = undefined;
     const str = "\x1b[?5s";
     try testing.expectEqual(DecodeResult{
         .consumed = 0,
         .decoded = 0,
     }, utf8DecodeUntilControlSeq(str, &output));
 }
 // Inputs ending in the start of a multi-byte sequence: only the five ASCII
 // bytes in front of the unfinished sequence are consumed and decoded; the
 // trailing bytes are left for the caller.
 test "decode incomplete UTF-8" {
     const testing = std.testing;
     var output: [64]u32 = undefined;
     // 2-byte
     {
         // Lead byte of a 2-byte sequence with no continuation byte.
         const str = "hello\xc2";
         try testing.expectEqual(DecodeResult{
             .consumed = 5,
             .decoded = 5,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
     // 3-byte
     {
         // Lead byte of a 3-byte sequence followed by a single byte.
         const str = "hello\xe0\x00";
         try testing.expectEqual(DecodeResult{
             .consumed = 5,
             .decoded = 5,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
     // 4-byte
     {
         // Lead byte of a 4-byte sequence followed by a single byte.
         const str = "hello\xf0\x90";
         try testing.expectEqual(DecodeResult{
             .consumed = 5,
             .decoded = 5,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
 }
 // Invalid (as opposed to merely incomplete) input: the whole input is
 // consumed and the offending byte shows up as U+FFFD in the output.
 test "decode invalid UTF-8" {
     const testing = std.testing;
     var output: [64]u32 = undefined;
     // Invalid leading 1s
     {
         // 0xC2 is followed by 0x00 instead of a continuation byte; all 7
         // bytes are consumed and 7 codepoints are produced.
         const str = "hello\xc2\x00";
         try testing.expectEqual(DecodeResult{
             .consumed = 7,
             .decoded = 7,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
     // The codepoint at the position of the bad byte is the replacement
     // character.
     try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
 }
--- a/src/terminal/simdvt.zig
+++ b/src/terminal/simdvt.zig
@ -0,0 +1,5 @@
 pub usingnamespace @import("simdvt/parser.zig");
 // Reference every declaration re-exported by this file so they are
 // analyzed as part of the test build.
 test {
     @import("std").testing.refAllDecls(@This());
 }
--- a/src/terminal/stream.zig
+++ b/src/terminal/stream.zig
@ -1,5 +1,7 @@
 const std = @import("std");
 const assert = std.debug.assert;
 const testing = std.testing;
 const simd = @import("../simd/main.zig");
 const Parser = @import("Parser.zig");
 const ansi = @import("ansi.zig");
 const charsets = @import("charsets.zig");
@ -40,18 +42,140 @@ pub fn Stream(comptime Handler: type) type {
        parser: Parser = .{},
        utf8decoder: UTF8Decoder = .{},
        /// Keep track of any partial UTF-8 sequences that we need to
        /// process in the next call to nextAssumeUtf8.
        partial_utf8: [4]u8 = undefined,
        partial_utf8_len: u3 = 0,
        /// Tears down the stream by deinitializing its parser.
        pub fn deinit(self: *Self) void {
            self.parser.deinit();
        }
        /// Process a string of characters.
        ///
        /// The input is handed to `nextSliceCapped` in pieces of at most
        /// `cp_buf.len` bytes so that the decoded codepoints of one piece
        /// always fit into the stack buffer. Errors from processing a piece
        /// are returned to the caller immediately.
-        pub fn nextSlice(self: *Self, c: []const u8) !void {
+        pub fn nextSlice(self: *Self, input: []const u8) !void {
-            for (c) |single| try self.next(single);
+            // This is the maximum number of codepoints we can decode
            // at one time for this function call. This is somewhat arbitrary
            // so if someone can demonstrate a better number then we can switch.
            var cp_buf: [4096]u32 = undefined;
            // Split the input into chunks that fit into cp_buf.
            var i: usize = 0;
            // The body runs at least once, so an empty input still results
            // in a single nextSliceCapped call with an empty slice.
            while (true) {
                // One input byte is budgeted per cp_buf entry.
                const len = @min(cp_buf.len, input.len - i);
                try self.nextSliceCapped(input[i .. i + len], &cp_buf);
                i += len;
                if (i >= input.len) break;
            }
        }
-        /// Process the next character and call any callbacks if necessary.
+        fn nextSliceCapped(self: *Self, input: []const u8, cp_buf: []u32) !void {
            // Processes one piece of input whose length does not exceed
            // `cp_buf.len`. `cp_buf` is scratch space for decoded codepoints.
            //
            // The function alternates between two modes until the input is
            // used up: bulk UTF-8 decoding while the parser is in the ground
            // state, and byte-wise parsing while a control sequence is being
            // read. An unfinished UTF-8 sequence at the end of the input is
            // stashed in `partial_utf8` for the next call.
            assert(input.len <= cp_buf.len);

            var offset: usize = 0;

            // If we have a partial UTF-8 sequence then we process manually.
            if (self.partial_utf8_len > 0) {
                offset += try self.completePartialUtf8(input);
            } else if (self.parser.state != .ground) {
                // If we're not in the ground state then we process until
                // we are. This can happen if the last chunk of input put us
                // in the middle of a control sequence.
                for (input[offset..]) |single| {
                    try self.nextNonUtf8(single);
                    offset += 1;
                    if (self.parser.state == .ground) break;
                }
            }

            // If we're in the ground state then we can use SIMD to process
            // input until we see an ESC (0x1B), since all other characters
            // up to that point are just UTF-8.
            while (self.parser.state == .ground and offset < input.len) {
                const res = simd.vt.utf8DecodeUntilControlSeq(input[offset..], cp_buf);
                for (cp_buf[0..res.decoded]) |cp| {
                    // Codepoints 0x00 through 0x0F are handed to `execute`
                    // rather than printed. The bound is inclusive: 0x0F (SI)
                    // is itself a control character and must not be printed.
                    // NOTE(review): codepoints 0x10..0x1F still go to
                    // `print`; confirm that this is intended.
                    if (cp <= 0xF) {
                        try self.execute(@intCast(cp));
                    } else {
                        try self.print(@intCast(cp));
                    }
                }

                // Consume the bytes we just processed.
                offset += res.consumed;
                if (offset >= input.len) return;

                // If our offset is NOT an escape then we must have a
                // partial UTF-8 sequence. In that case, we save it and
                // return.
                if (input[offset] != 0x1B) {
                    const rem = input[offset..];
                    assert(rem.len <= self.partial_utf8.len);
                    @memcpy(self.partial_utf8[0..rem.len], rem);
                    self.partial_utf8_len = @intCast(rem.len);
                    return;
                }

                // Process our control sequence.
                for (input[offset..]) |single| {
                    try self.nextNonUtf8(single);
                    offset += 1;
                    if (self.parser.state == .ground) break;
                }
            }
        }
        // Complete a partial UTF-8 sequence from a prior input chunk.
        // This processes the UTF-8 sequence and then returns the number
        // of bytes consumed from the input.
        //
        // Possible outcomes:
        //   - The stored first byte is not a valid UTF-8 start byte: the
        //     stored bytes are dropped, U+FFFD is printed and 0 is returned
        //     so the caller processes `input` from its beginning.
        //   - `input` is too short to finish the sequence: all of `input`
        //     is appended to the stored bytes and `input.len` is returned.
        //   - Otherwise the sequence is completed and printed (U+FFFD if it
        //     turns out to be invalid) and the number of bytes taken from
        //     `input` is returned.
        fn completePartialUtf8(self: *Self, input: []const u8) !usize {
            assert(self.partial_utf8_len > 0);
            assert(self.parser.state == .ground);

            // The stored bytes are whatever the decoder left unconsumed, and
            // they are not guaranteed to begin with a byte that
            // utf8ByteSequenceLength accepts.
            // NOTE(review): simdutf's partial-sequence trimming appears to
            // treat any trailing byte >= 0xC0 as a sequence start, which
            // includes 0xF8..0xFF -- confirm. Such a byte must not reach
            // `unreachable`, so it is handled like any other invalid UTF-8.
            const len = std.unicode.utf8ByteSequenceLength(self.partial_utf8[0]) catch |err| {
                log.warn("invalid UTF-8, ignoring err={}", .{err});
                self.partial_utf8_len = 0;
                try self.print(0xFFFD); // replacement character
                return 0;
            };

            // This is the length we need in the input in addition to
            // our partial_utf8 to complete the sequence.
            const input_len = len - self.partial_utf8_len;

            // If we STILL don't have enough bytes, then we copy and continue.
            // This is a really bizarre and stupid program that's running to
            // send us incomplete UTF-8 sequences over multiple write() calls.
            if (input_len > input.len) {
                @memcpy(
                    self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input.len],
                    input,
                );
                self.partial_utf8_len += @intCast(input.len);
                return input.len;
            }

            // Process the complete UTF-8 sequence.
            @memcpy(
                self.partial_utf8[self.partial_utf8_len .. self.partial_utf8_len + input_len],
                input[0..input_len],
            );
            const cp = cp: {
                if (std.unicode.utf8Decode(self.partial_utf8[0..len])) |cp| {
                    break :cp cp;
                } else |err| {
                    log.warn("invalid UTF-8, ignoring err={}", .{err});
                    break :cp 0xFFFD; // replacement character
                }
            };

            // The stored bytes have been used up either way.
            self.partial_utf8_len = 0;
            try self.print(cp);
            return input_len;
        }
        /// Like nextSlice but takes one byte and is necessarily a scalar
        /// operation that can't use SIMD. Prefer nextSlice if you can and
        /// try to get multiple bytes at once.
        pub fn next(self: *Self, c: u8) !void {
-            // log.debug("char: {c}", .{c});
+            // The scalar path can be responsible for decoding UTF-8.
            if (self.parser.state == .ground and c != 0x1B) {
                var consumed = false;
                while (!consumed) {
@ -65,9 +189,20 @@ pub fn Stream(comptime Handler: type) type {
                        }
                    }
                }
                return;
            }
            try self.nextNonUtf8(c);
        }
        /// Process the next character and call any callbacks if necessary.
        ///
        /// This assumes that we're not in the UTF-8 decoding state. If
        /// we may be in the UTF-8 decoding state call nextSlice or next.
        fn nextNonUtf8(self: *Self, c: u8) !void {
            assert(self.parser.state != .ground or c == 0x1B);
            const actions = self.parser.next(c);
            for (actions) |action_opt| {
                const action = action_opt orelse continue;
--- a/src/termio/Exec.zig
+++ b/src/termio/Exec.zig
@ -1617,45 +1617,9 @@ const ReadThread = struct {
                    log.err("error processing terminal data: {}", .{err});
            }
        } else {
-            // Process the terminal data. This is an extremely hot part of the
+            ev.terminal_stream.nextSlice(buf) catch |err|
            // terminal emulator, so we do some abstraction leakage to avoid
            // function calls and unnecessary logic.
            //
            // The ground state is the only state that we can see and print/execute
            // ASCII, so we only execute this hot path if we're already in the ground
            // state.
            //
            // Empirically, this alone improved throughput of large text output by ~20%.
            var i: usize = 0;
            const end = buf.len;
            if (ev.terminal_stream.parser.state == .ground) {
                for (buf[i..end]) |ch| {
                    switch (terminal.parse_table.table[ch][@intFromEnum(terminal.Parser.State.ground)].action) {
                        // Print, call directly.
                        .print => ev.terminal_stream.handler.print(@intCast(ch)) catch |err|
                            log.err("error processing terminal data: {}", .{err}),
                        // C0 execute, let our stream handle this one but otherwise
                        // continue since we're guaranteed to be back in ground.
                        .execute => ev.terminal_stream.execute(ch) catch |err|
                            log.err("error processing terminal data: {}", .{err}),
                        // Otherwise, break out and go the slow path until we're
                        // back in ground. There is a slight optimization here where
                        // could try to find the next transition to ground but when
                        // I implemented that it didn't materially change performance.
                        else => break,
                    }
                    i += 1;
                }
            }
            if (i < end) {
                ev.terminal_stream.nextSlice(buf[i..end]) catch |err|
                log.err("error processing terminal data: {}", .{err});
        }
        }
        // If our stream handling caused messages to be sent to the writer
        // thread, then we need to wake it up so that it processes them.
		`@ -0,0 +1,2 @@`
							`// Needed for Zig build to be happy`
							`void ghostty_utfcpp_stub() {}`