add utf8proc back for bench

2025-08-02 14:57:31 +03:00 · 2024-02-08 13:21:36 -08:00
parent 7da82688b8
commit 4ae41579da
8 changed files with 112 additions and 0 deletions
--- a/build.zig
+++ b/build.zig
@ -1082,6 +1082,15 @@ fn addDeps(
    step.linkLibrary(utfcpp_dep.artifact("utfcpp"));
    try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin());

+    // utf8proc
+    const utf8proc_dep = b.dependency("utf8proc", .{
+        .target = target,
+        .optimize = optimize,
+    });
+    step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc"));
+    step.linkLibrary(utf8proc_dep.artifact("utf8proc"));
+    try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin());
+
    // Spirv-Cross
    step.linkLibrary(spirv_cross_dep.artifact("spirv_cross"));
    try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin());
--- a/build.zig.zon
+++ b/build.zig.zon
@ -38,6 +38,7 @@
        .opengl = .{ .path = "./pkg/opengl" },
        .pixman = .{ .path = "./pkg/pixman" },
        .simdutf = .{ .path = "./pkg/simdutf" },
+        .utf8proc = .{ .path = "./pkg/utf8proc" },
        .utfcpp = .{ .path = "./pkg/utfcpp" },
        .zlib = .{ .path = "./pkg/zlib" },

--- a/pkg/utf8proc/build.zig
+++ b/pkg/utf8proc/build.zig
@ -0,0 +1,37 @@
+const std = @import("std");
+
+/// Build script for the utf8proc package.
+///
+/// Exposes a `utf8proc` Zig module rooted at `main.zig` and a static library
+/// named `utf8proc` compiled from the upstream `utf8proc.c` source. The
+/// upstream headers are installed alongside the library artifact.
+pub fn build(b: *std.Build) !void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    const module = b.addModule("utf8proc", .{ .root_source_file = .{ .path = "main.zig" } });
+
+    const upstream = b.dependency("utf8proc", .{});
+    const lib = b.addStaticLibrary(.{
+        .name = "utf8proc",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
+
+    // Both the C library and the Zig module (which `@cImport`s `utf8proc.h`)
+    // need the upstream root on their include path.
+    lib.addIncludePath(upstream.path(""));
+    module.addIncludePath(upstream.path(""));
+
+    // Register the cleanup immediately after acquisition so that it also runs
+    // when the fallible `append` below returns an error.
+    var flags = std.ArrayList([]const u8).init(b.allocator);
+    defer flags.deinit();
+    try flags.append("-DUTF8PROC_EXPORTS");
+    lib.addCSourceFiles(.{
+        .dependency = upstream,
+        .files = &.{"utf8proc.c"},
+        .flags = flags.items,
+    });
+
+    // Install every `.h` file found in the upstream root as a public header.
+    lib.installHeadersDirectoryOptions(.{
+        .source_dir = upstream.path(""),
+        .install_dir = .header,
+        .install_subdir = "",
+        .include_extensions = &.{".h"},
+    });
+
+    b.installArtifact(lib);
+}
--- a/pkg/utf8proc/build.zig.zon
+++ b/pkg/utf8proc/build.zig.zon
@ -0,0 +1,11 @@
+.{
+    .name = "utf8proc",
+    .version = "2.8.0",
+    .paths = .{""},
+    .dependencies = .{
+        .utf8proc = .{
+            .url = "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.8.0.tar.gz",
+            .hash = "1220056ce228a8c58f1fa66ab778f5c8965e62f720c1d30603c7d534cb7d8a605ad7",
+        },
+    },
+}
--- a/pkg/utf8proc/c.zig
+++ b/pkg/utf8proc/c.zig
@ -0,0 +1,3 @@
+pub usingnamespace @cImport({
+    @cInclude("utf8proc.h");
+});
--- a/pkg/utf8proc/main.zig
+++ b/pkg/utf8proc/main.zig
@ -0,0 +1,20 @@
+pub const c = @import("c.zig");
+
+/// Returns the display width of `codepoint` as reported by utf8proc.
+///
+/// The result is analogous to `wcwidth(codepoint)`, except that
+/// non-printable codepoints yield a width of 0 rather than -1.
+pub fn charwidth(codepoint: u21) u8 {
+    // Widen the codepoint to the C parameter type, then narrow the C result
+    // back down to a byte-sized width.
+    const raw_width = c.utf8proc_charwidth(@intCast(codepoint));
+    return @intCast(raw_width);
+}
+
+/// Reports whether a grapheme break is permitted between the consecutive
+/// codepoints `cp1` and `cp2`, following the extended grapheme cluster rules
+/// of UAX#29.
+///
+/// `state` is passed through to utf8proc unchanged; see the upstream
+/// `utf8proc_grapheme_break_stateful` documentation for how callers are
+/// expected to initialize and reuse it.
+pub fn graphemeBreakStateful(cp1: u21, cp2: u21, state: *i32) bool {
+    const first: i32 = @intCast(cp1);
+    const second: i32 = @intCast(cp2);
+    return c.utf8proc_grapheme_break_stateful(first, second, state);
+}
+
+test {}
--- a/src/bench/codepoint-width.sh
+++ b/src/bench/codepoint-width.sh
@ -27,6 +27,8 @@ hyperfine \
  "./zig-out/bin/bench-codepoint-width --mode=noop${ARGS} </tmp/ghostty_bench_data" \
  -n wcwidth \
  "./zig-out/bin/bench-codepoint-width --mode=wcwidth${ARGS} </tmp/ghostty_bench_data" \
+  -n utf8proc \
+  "./zig-out/bin/bench-codepoint-width --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
  -n ziglyph \
  "./zig-out/bin/bench-codepoint-width --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
  -n simd \
--- a/src/bench/codepoint-width.zig
+++ b/src/bench/codepoint-width.zig
@ -45,6 +45,9 @@ const Mode = enum {
    /// libc wcwidth
    wcwidth,

+    /// Use utf8proc library to calculate the display width of each codepoint.
+    utf8proc,
+
    /// Use ziglyph library to calculate the display width of each codepoint.
    ziglyph,

@ -76,6 +79,7 @@ pub fn main() !void {
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
        .wcwidth => try benchWcwidth(reader, buf),
+        .utf8proc => try benchUtf8proc(reader, buf),
        .ziglyph => try benchZiglyph(reader, buf),
        .simd => try benchSimd(reader, buf),
    }
@ -124,6 +128,31 @@ noinline fn benchWcwidth(
    }
 }

+/// Benchmark body for the `utf8proc` mode: reads chunks from `reader` into
+/// `buf`, decodes them one byte at a time and asks utf8proc for the width of
+/// every completed codepoint. Returns once `reader` reports zero bytes read.
+noinline fn benchUtf8proc(
+    reader: anytype,
+    buf: []u8,
+) !void {
+    const utf8proc = @import("utf8proc");
+    var decoder: UTF8Decoder = .{};
+    while (true) {
+        const len = try reader.read(buf);
+        if (len == 0) return;
+
+        // Feed the decoder byte by byte: a deliberately naive scalar approach.
+        for (buf[0..len]) |byte| {
+            const result = decoder.next(byte);
+            assert(result[1]);
+
+            // No codepoint is complete yet; keep feeding bytes.
+            const cp = result[0] orelse continue;
+
+            // Store the width into the buffer so the call is not compiled away.
+            buf[0] = @intCast(utf8proc.charwidth(cp));
+        }
+    }
+}
+
 noinline fn benchZiglyph(
    reader: anytype,
    buf: []u8,