From 4ae41579da37dad141697f3f8624aeaa63728b30 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Thu, 8 Feb 2024 13:21:36 -0800 Subject: [PATCH] add utf8proc back for bench --- build.zig | 9 +++++++++ build.zig.zon | 1 + pkg/utf8proc/build.zig | 37 +++++++++++++++++++++++++++++++++++ pkg/utf8proc/build.zig.zon | 11 +++++++++++ pkg/utf8proc/c.zig | 3 +++ pkg/utf8proc/main.zig | 20 +++++++++++++++++++ src/bench/codepoint-width.sh | 2 ++ src/bench/codepoint-width.zig | 29 +++++++++++++++++++++++++++ 8 files changed, 112 insertions(+) create mode 100644 pkg/utf8proc/build.zig create mode 100644 pkg/utf8proc/build.zig.zon create mode 100644 pkg/utf8proc/c.zig create mode 100644 pkg/utf8proc/main.zig diff --git a/build.zig b/build.zig index 1b234f15e..dfc6b3404 100644 --- a/build.zig +++ b/build.zig @@ -1082,6 +1082,15 @@ fn addDeps( step.linkLibrary(utfcpp_dep.artifact("utfcpp")); try static_libs.append(utfcpp_dep.artifact("utfcpp").getEmittedBin()); + // utf8proc + const utf8proc_dep = b.dependency("utf8proc", .{ + .target = target, + .optimize = optimize, + }); + step.root_module.addImport("utf8proc", utf8proc_dep.module("utf8proc")); + step.linkLibrary(utf8proc_dep.artifact("utf8proc")); + try static_libs.append(utf8proc_dep.artifact("utf8proc").getEmittedBin()); + // Spirv-Cross step.linkLibrary(spirv_cross_dep.artifact("spirv_cross")); try static_libs.append(spirv_cross_dep.artifact("spirv_cross").getEmittedBin()); diff --git a/build.zig.zon b/build.zig.zon index a694562ea..c3b958591 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -38,6 +38,7 @@ .opengl = .{ .path = "./pkg/opengl" }, .pixman = .{ .path = "./pkg/pixman" }, .simdutf = .{ .path = "./pkg/simdutf" }, + .utf8proc = .{ .path = "./pkg/utf8proc" }, .utfcpp = .{ .path = "./pkg/utfcpp" }, .zlib = .{ .path = "./pkg/zlib" }, diff --git a/pkg/utf8proc/build.zig b/pkg/utf8proc/build.zig new file mode 100644 index 000000000..a29716983 --- /dev/null +++ b/pkg/utf8proc/build.zig @@ -0,0 +1,37 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) !void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const module = b.addModule("utf8proc", .{ .root_source_file = .{ .path = "main.zig" } }); + + const upstream = b.dependency("utf8proc", .{}); + const lib = b.addStaticLibrary(.{ + .name = "utf8proc", + .target = target, + .optimize = optimize, + }); + lib.linkLibC(); + + lib.addIncludePath(upstream.path("")); + module.addIncludePath(upstream.path("")); + + var flags = std.ArrayList([]const u8).init(b.allocator); + try flags.append("-DUTF8PROC_EXPORTS"); + defer flags.deinit(); + lib.addCSourceFiles(.{ + .dependency = upstream, + .files = &.{"utf8proc.c"}, + .flags = flags.items, + }); + + lib.installHeadersDirectoryOptions(.{ + .source_dir = upstream.path(""), + .install_dir = .header, + .install_subdir = "", + .include_extensions = &.{".h"}, + }); + + b.installArtifact(lib); +} diff --git a/pkg/utf8proc/build.zig.zon b/pkg/utf8proc/build.zig.zon new file mode 100644 index 000000000..cfb62de55 --- /dev/null +++ b/pkg/utf8proc/build.zig.zon @@ -0,0 +1,11 @@ +.{ + .name = "utf8proc", + .version = "2.8.0", + .paths = .{""}, + .dependencies = .{ + .utf8proc = .{ + .url = "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.8.0.tar.gz", + .hash = "1220056ce228a8c58f1fa66ab778f5c8965e62f720c1d30603c7d534cb7d8a605ad7", + }, + }, +} diff --git a/pkg/utf8proc/c.zig b/pkg/utf8proc/c.zig new file mode 100644 index 000000000..adeb226b0 --- /dev/null +++ b/pkg/utf8proc/c.zig @@ -0,0 +1,3 @@ +pub usingnamespace @cImport({ + @cInclude("utf8proc.h"); +}); diff --git a/pkg/utf8proc/main.zig b/pkg/utf8proc/main.zig new file mode 100644 index 000000000..a351fff4b --- /dev/null +++ b/pkg/utf8proc/main.zig @@ -0,0 +1,20 @@ +pub const c = @import("c.zig"); + +/// Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, +/// except that a width of 0 is returned for non-printable codepoints +/// instead of -1 as in `wcwidth`. +pub fn charwidth(codepoint: u21) u8 { + return @intCast(c.utf8proc_charwidth(@intCast(codepoint))); +} + +/// Given a pair of consecutive codepoints, return whether a grapheme break is +/// permitted between them (as defined by the extended grapheme clusters in UAX#29). +pub fn graphemeBreakStateful(cp1: u21, cp2: u21, state: *i32) bool { + return c.utf8proc_grapheme_break_stateful( + @intCast(cp1), + @intCast(cp2), + state, + ); +} + +test {} diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index d0692bc43..8278370e3 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -27,6 +27,8 @@ hyperfine \ "./zig-out/bin/bench-codepoint-width --mode=noop${ARGS} try benchNoop(reader, buf), .wcwidth => try benchWcwidth(reader, buf), + .utf8proc => try benchUtf8proc(reader, buf), .ziglyph => try benchZiglyph(reader, buf), .simd => try benchSimd(reader, buf), } @@ -124,6 +128,31 @@ noinline fn benchWcwidth( } } +noinline fn benchUtf8proc( + reader: anytype, + buf: []u8, +) !void { + const utf8proc = @import("utf8proc"); + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + const width = utf8proc.charwidth(cp); + + // Write the width to the buffer to avoid it being compiled away + buf[0] = @intCast(width); + } + } + } +} + noinline fn benchZiglyph( reader: anytype, buf: []u8,