From d4fa0fcabf314fd8fe066369e7db890753f06e90 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Tue, 6 Feb 2024 17:06:58 -0800 Subject: [PATCH 01/12] bench/codepoint-width --- build.zig | 6 +- src/bench/codepoint-width.sh | 29 +++++++++ src/bench/codepoint-width.zig | 114 ++++++++++++++++++++++++++++++++++ src/build_config.zig | 1 + src/main.zig | 1 + 5 files changed, 150 insertions(+), 1 deletion(-) create mode 100755 src/bench/codepoint-width.sh create mode 100644 src/bench/codepoint-width.zig diff --git a/build.zig b/build.zig index 3b109ee4e..ca768bd51 100644 --- a/build.zig +++ b/build.zig @@ -1322,10 +1322,14 @@ fn benchSteps( var copy = config; copy.static = true; + var enum_name: [64]u8 = undefined; + @memcpy(enum_name[0..name.len], name); + std.mem.replaceScalar(u8, enum_name[0..name.len], '-', '_'); + var buf: [64]u8 = undefined; copy.exe_entrypoint = std.meta.stringToEnum( build_config.ExeEntrypoint, - try std.fmt.bufPrint(&buf, "bench_{s}", .{name}), + try std.fmt.bufPrint(&buf, "bench_{s}", .{enum_name[0..name.len]}), ).?; break :config copy; diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh new file mode 100755 index 000000000..6aa3548c5 --- /dev/null +++ b/src/bench/codepoint-width.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# +# This is a trivial helper script to help run the codepoint-width benchmark. +# You probably want to tweak this script depending on what you're +# trying to measure. + +# Options: +# - "ascii", uniform random ASCII bytes +# - "utf8", uniform random unicode characters, encoded as utf8 +# - "rand", pure random data, will contain many invalid code sequences. +DATA="utf8" +SIZE="25000000" + +# Add additional arguments +ARGS="" + +# Generate the benchmark input ahead of time so it's not included in the time. +./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data + +# Uncomment to instead use the contents of `stream.txt` as input. +# yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data + +hyperfine \ + --warmup 10 \ + -n baseline \ + "./zig-out/bin/bench-codepoint-width --mode=baseline${ARGS} try benchBaseline(reader, buf), + .ziglyph => try benchZiglyph(reader, buf), + } +} + +noinline fn benchBaseline( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + _ = d.next(c); + } + } +} + +noinline fn benchZiglyph( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + const width = ziglyph.display_width.codePointWidth(cp, .half); + + // Write the width to the buffer to avoid it being compiled away + buf[0] = @intCast(width); + } + } + } +} diff --git a/src/build_config.zig b/src/build_config.zig index bfb4699d3..32dee925a 100644 --- a/src/build_config.zig +++ b/src/build_config.zig @@ -140,4 +140,5 @@ pub const ExeEntrypoint = enum { mdgen_ghostty_5, bench_parser, bench_stream, + bench_codepoint_width, }; diff --git a/src/main.zig b/src/main.zig index 393ddd541..46a6d7d3d 100644 --- a/src/main.zig +++ b/src/main.zig @@ -8,4 +8,5 @@ pub usingnamespace switch (build_config.exe_entrypoint) { .mdgen_ghostty_5 => @import("build/mdgen/main_ghostty_5.zig"), .bench_parser => @import("bench/parser.zig"), .bench_stream => @import("bench/stream.zig"), + .bench_codepoint_width => @import("bench/codepoint-width.zig"), }; From 88d81602fa1e5039520ab87506f8fa0ad3119696 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Tue, 6 Feb 2024 22:28:26 -0800 Subject: [PATCH 02/12] simd/codepoint-width: wip --- build.zig | 1 + src/bench/codepoint-width.sh | 5 +- src/bench/codepoint-width.zig | 29 ++++++ src/simd/codepoint_width.cpp | 178 ++++++++++++++++++++++++++++++++++ src/simd/codepoint_width.zig | 45 +++++++++ src/simd/main.zig | 1 + 6 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 src/simd/codepoint_width.cpp create mode 100644 src/simd/codepoint_width.zig diff --git a/build.zig b/build.zig index ca768bd51..1b234f15e 100644 --- a/build.zig +++ b/build.zig @@ -1015,6 +1015,7 @@ fn addDeps( step.linkLibCpp(); step.addIncludePath(.{ .path = "src" }); step.addCSourceFiles(.{ .files = &.{ + "src/simd/codepoint_width.cpp", "src/simd/index_of.cpp", "src/simd/vt.cpp", } }); diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index 6aa3548c5..d7e70f48d 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -16,6 +16,7 @@ ARGS="" # Generate the benchmark input ahead of time so it's not included in the time. ./zig-out/bin/bench-stream --mode=gen-$DATA | head -c $SIZE > /tmp/ghostty_bench_data +#cat ~/Downloads/JAPANESEBIBLE.txt > /tmp/ghostty_bench_data # Uncomment to instead use the contents of `stream.txt` as input. # yes $(cat ./stream.txt) | head -c $SIZE > /tmp/ghostty_bench_data @@ -25,5 +26,7 @@ hyperfine \ -n baseline \ "./zig-out/bin/bench-codepoint-width --mode=baseline${ARGS} try benchBaseline(reader, buf), .ziglyph => try benchZiglyph(reader, buf), + .simd => try benchSimd(reader, buf), } } @@ -112,3 +117,27 @@ noinline fn benchZiglyph( } } } + +noinline fn benchSimd( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + const width = simd.codepointWidth(cp); + + // Write the width to the buffer to avoid it being compiled away + buf[0] = @intCast(width); + } + } + } +} diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp new file mode 100644 index 000000000..7e20424d7 --- /dev/null +++ b/src/simd/codepoint_width.cpp @@ -0,0 +1,178 @@ +// Generates code for every target that this compiler can support. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "simd/vt.cpp" // this file +#include // must come before highway.h +#include +#include + +#include + +HWY_BEFORE_NAMESPACE(); +namespace ghostty { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +using T = uint32_t; + +extern "C" int8_t ghostty_ziglyph_codepoint_width(uint32_t); + +HWY_ALIGN T eaw_gte[] = { + 0x3000, 0xff01, 0xffe0, 0x1100, 0x231a, 0x2329, 0x232a, 0x23e9, + 0x23f0, 0x25f3, 0x25fd, 0x2614, 0x2648, 0x267f, 0x2693, 0x26a1, + 0x26aa, 0x26bd, 0x26c4, 0x26ce, 0x26d4, 0x26ea, 0x26f2, 0x26f5, + 0x26fa, 0x26fd, 0x2705, 0x270a, 0x2728, 0x274c, 0x274e, 0x2753, + 0x2757, 0x2795, 0x27b0, 0x27bf, 0x2b1b, 0x2b50, 0x2b55, 0x2e80, + 0x2e9b, 0x2f00, 0x2ff0, 0x3001, 0x302e, 0x3041, 0x309b, 0x309d, + 0x309f, 0x30a0, 0x30a1, 0x30fb, 0x30fc, 0x30ff, 0x3105, 0x3131, + 0x3190, 0x3192, 0x3196, 0x31a0, 0x31c0, 0x31f0, 0x3200, 0x3220, + 0x322a, 0x3250, 0x3251, 0x3260, 0x3280, 0x328a, 0x32b1, 0x32c0, + 0x3400, 0x4e00, 0xa015, 0xa016, 0xa490, 0xa960, 0xac00, 0xf900, + 0xfa70, 0xfe10, 0xfe30, 0xfe54, 0xfe68, 0x16fe0, 0x16ff0, 0x17000, + 0x18800, 0x18d00, 0x1aff0, 0x1aff5, 0x1affd, 0x1b000, 0x1b132, 0x1b150, + 0x1b155, 0x1b164, 0x1b170, 0x1f004, 0x1f0cf, 0x1f18e, 0x1f191, 0x1f200, + 0x1f210, 0x1f240, 0x1f250, 0x1f260, 0x1f300, 0x1f32d, 0x1f337, 0x1f37e, + 0x1f3a0, 0x1f3cf, 0x1f3e0, 0x1f3f4, 0x1f3f8, 0x1f3fb, 0x1f400, 0x1f440, + 0x1f442, 0x1f4ff, 0x1f54b, 0x1f550, 0x1f57a, 0x1f595, 0x1f5a4, 0x1f5fb, + 0x1f680, 0x1f6cc, 0x1f6d0, 0x1f6d5, 0x1f6dc, 0x1f6eb, 0x1f6f4, 0x1f7e0, + 0x1f7f0, 0x1f90c, 0x1f93c, 0x1f947, 0x1fa70, 0x1fa80, 0x1fa90, 0x1fabf, + 0x1face, 0x1fae0, 0x1faf0, 0x20000, 0x2a700, 0x2b740, 0x2b820, 0x2ceb0, + 0x2f800, 0x30000, 0x31350, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN T eaw_lte[] = { + 0x3000, 0xff60, 0xffe6, 0x115f, 0x231b, 0x2329, 0x232a, 0x23ec, + 0x23f0, 0x23f3, 0x25fe, 0x2615, 0x2653, 0x267f, 0x2693, 0x26a1, + 0x26ab, 0x26be, 0x26c5, 0x26ce, 0x26d4, 0x26ea, 0x26f3, 0x26f5, + 0x26fa, 0x26fd, 0x2705, 0x270b, 0x2728, 0x274c, 0x274e, 0x2755, + 0x2757, 0x2797, 0x27b0, 0x27bf, 0x2b1c, 0x2b50, 0x2b55, 0x2e99, + 0x2ef3, 0x2fd5, 0x2ffb, 0x3029, 0x303e, 0x3096, 0x309c, 0x309e, + 0x309f, 0x30a0, 0x30fa, 0x30fb, 0x30fe, 0x30ff, 0x312f, 0x318e, + 0x3191, 0x3195, 0x319f, 0x31bf, 0x31e3, 0x31ff, 0x321e, 0x3229, + 0x3247, 0x3250, 0x325f, 0x327f, 0x3289, 0x32b0, 0x32bf, 0x33ff, + 0x4bdf, 0xa014, 0xa015, 0xa48c, 0xa4c6, 0xa97c, 0xd7a3, 0xfa6d, + 0xfad9, 0xfe19, 0xfe52, 0xfe66, 0xfe6b, 0x16fe3, 0x16ff1, 0x187f7, + 0x18cd5, 0x18d08, 0x1aff3, 0x1affb, 0x1affe, 0x1b122, 0x1b132, 0x1b152, + 0x1b155, 0x1b167, 0x1b2fb, 0x1f004, 0x1f0cf, 0x1f18e, 0x1f19a, 0x1f202, + 0x1f23b, 0x1f248, 0x1f251, 0x1f265, 0x1f320, 0x1f335, 0x1f37c, 0x1f393, + 0x1f3ca, 0x1f3d3, 0x1f3f0, 0x1f3f4, 0x1f3fa, 0x1f3ff, 0x1f43e, 0x1f440, + 0x1f4fc, 0x1f53d, 0x1f54e, 0x1f567, 0x1f57a, 0x1f596, 0x1f5a4, 0x1f64f, + 0x1f6c5, 0x1f6cc, 0x1f6d2, 0x1f6d7, 0x1f6df, 0x1f6ec, 0x1f6fc, 0x1f7eb, + 0x1f7f0, 0x1f93a, 0x1f945, 0x1f9ff, 0x1fa7c, 0x1fa88, 0x1fabd, 0x1fac5, + 0x1fadb, 0x1fae8, 0x1faf8, 0x2a6df, 0x2b739, 0x2b81d, 0x2cea1, 0x2ebe0, + 0x2fa1d, 0x3134a, 0x323af, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +template +int8_t CodepointWidthImpl(D d, T input) { + // If the input is ASCII, then we return 1. We do NOT check for + // control characters because we assume that the input has already + // been checked for that case. + if (input < 0xFF) { + return 1; + } + + // Its not ASCII, so lets move to vector ops to figure out the width. + const size_t N = hn::Lanes(d); + const hn::Vec input_vec = Set(d, input); + + { + // Thes are the ranges (inclusive) of the codepoints that are DEFINITELY + // width 2. We will check as many in parallel as possible. + // + // The zero padding is so that we can always load aligned directly into + // a vector register of any size up to 16 bytes (AVX512). + // + // Ranges: two-em dash, gbp.isRegionalIndicator, CJK... + HWY_ALIGN T gte_keys[] = { + 0x2E3A, 0x1f1e6, 0x3400, 0x4E00, 0xF900, 0x20000, 0x30000, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + HWY_ALIGN T lte_keys[] = { + 0x2E3A, 0x1f1ff, 0x4DBF, 0x9FFF, 0xFAFF, 0x2FFFD, 0x3FFFD, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + size_t i = 0; + for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, lte_keys + i); + const hn::Vec gte_vec = hn::Load(d, gte_keys + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 2; + } + } + assert(i >= 7); // We should have checked all the ranges. + } + + { + // Definitely width 0 + HWY_ALIGN T gte_keys[] = { + 0x1160, 0x2060, 0xFFF0, 0xE0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + HWY_ALIGN T lte_keys[] = { + 0x11FF, 0x206F, 0xFFF8, 0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + size_t i = 0; + for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, lte_keys + i); + const hn::Vec gte_vec = hn::Load(d, gte_keys + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } + } + } + + if (input >= eaw_lte[0] && input <= 0x323af) { + size_t i = 0; + for (; i + N <= std::size(eaw_lte) && eaw_lte[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, eaw_lte + i); + const hn::Vec gte_vec = hn::Load(d, eaw_gte + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 2; + } + } + } + + return ghostty_ziglyph_codepoint_width(input); +} + +int8_t CodepointWidth(T input) { + const hn::ScalableTag d; + return CodepointWidthImpl(d, input); +} + +} // namespace HWY_NAMESPACE +} // namespace ghostty +HWY_AFTER_NAMESPACE(); + +// HWY_ONCE is true for only one of the target passes +#if HWY_ONCE + +namespace ghostty { + +HWY_EXPORT(CodepointWidth); + +int8_t CodepointWidth(uint32_t cp) { + return HWY_DYNAMIC_DISPATCH(CodepointWidth)(cp); +} + +} // namespace ghostty + +extern "C" { + +int8_t ghostty_simd_codepoint_width(uint32_t cp) { + return ghostty::CodepointWidth(cp); +} + +} // extern "C" + +#endif // HWY_ONCE diff --git a/src/simd/codepoint_width.zig b/src/simd/codepoint_width.zig new file mode 100644 index 000000000..c9feb4e63 --- /dev/null +++ b/src/simd/codepoint_width.zig @@ -0,0 +1,45 @@ +const std = @import("std"); + +// vt.cpp +extern "c" fn ghostty_simd_codepoint_width(u32) i8; + +pub fn codepointWidth(cp: u32) i8 { + //return @import("ziglyph").display_width.codePointWidth(@intCast(cp), .half); + return ghostty_simd_codepoint_width(cp); +} + +test "codepointWidth basic" { + const testing = std.testing; + try testing.expectEqual(@as(i8, 1), codepointWidth('a')); + try testing.expectEqual(@as(i8, 1), codepointWidth(0x100)); // Ā + try testing.expectEqual(@as(i8, 2), codepointWidth(0x3400)); // 㐀 + try testing.expectEqual(@as(i8, 2), codepointWidth(0x2E3A)); // ⸺ + try testing.expectEqual(@as(i8, 2), codepointWidth(0x1F1E6)); // 🇦 + try testing.expectEqual(@as(i8, 2), codepointWidth(0x4E00)); // 一 + try testing.expectEqual(@as(i8, 2), codepointWidth(0xF900)); // 豈 + try testing.expectEqual(@as(i8, 2), codepointWidth(0x20000)); // 𠀀 + try testing.expectEqual(@as(i8, 2), codepointWidth(0x30000)); // 𠀀 + // try testing.expectEqual(@as(i8, 1), @import("ziglyph").display_width.codePointWidth(0x100, .half)); +} + +pub export fn ghostty_ziglyph_codepoint_width(cp: u32) callconv(.C) i8 { + return @import("ziglyph").display_width.codePointWidth(@intCast(cp), .half); +} + +test "codepointWidth matches ziglyph" { + const testing = std.testing; + const ziglyph = @import("ziglyph"); + + // try testing.expect(ziglyph.general_category.isNonspacingMark(0x16fe4)); + // if (true) return; + + const min = 0xFF + 1; // start outside ascii + for (min..std.math.maxInt(u21)) |cp| { + const simd = codepointWidth(@intCast(cp)); + const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half); + if (simd != zg) { + std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg }); + try testing.expect(false); + } + } +} diff --git a/src/simd/main.zig b/src/simd/main.zig index 283439695..c7ced250d 100644 --- a/src/simd/main.zig +++ b/src/simd/main.zig @@ -1,5 +1,6 @@ const std = @import("std"); +pub usingnamespace @import("codepoint_width.zig"); pub const index_of = @import("index_of.zig"); pub const vt = @import("vt.zig"); From d949f1bd8476a54cfd4222ee9d564a2519a449e4 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:11:02 -0800 Subject: [PATCH 03/12] simd/codepoint-width: match ziglyph --- src/simd/codepoint_width.cpp | 200 ++++++++++++++++++++++++++++++++--- src/simd/codepoint_width.zig | 7 +- 2 files changed, 191 insertions(+), 16 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 7e20424d7..5d04b94e2 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -17,9 +17,10 @@ using T = uint32_t; extern "C" int8_t ghostty_ziglyph_codepoint_width(uint32_t); -HWY_ALIGN T eaw_gte[] = { +// East Asian Width +HWY_ALIGN constexpr T eaw_gte[] = { 0x3000, 0xff01, 0xffe0, 0x1100, 0x231a, 0x2329, 0x232a, 0x23e9, - 0x23f0, 0x25f3, 0x25fd, 0x2614, 0x2648, 0x267f, 0x2693, 0x26a1, + 0x23f0, 0x23f3, 0x25fd, 0x2614, 0x2648, 0x267f, 0x2693, 0x26a1, 0x26aa, 0x26bd, 0x26c4, 0x26ce, 0x26d4, 0x26ea, 0x26f2, 0x26f5, 0x26fa, 0x26fd, 0x2705, 0x270a, 0x2728, 0x274c, 0x274e, 0x2753, 0x2757, 0x2795, 0x27b0, 0x27bf, 0x2b1b, 0x2b50, 0x2b55, 0x2e80, @@ -42,7 +43,7 @@ HWY_ALIGN T eaw_gte[] = { 0, 0, 0, 0, 0, 0, 0, 0, }; -HWY_ALIGN T eaw_lte[] = { +HWY_ALIGN constexpr T eaw_lte[] = { 0x3000, 0xff60, 0xffe6, 0x115f, 0x231b, 0x2329, 0x232a, 0x23ec, 0x23f0, 0x23f3, 0x25fe, 0x2615, 0x2653, 0x267f, 0x2693, 0x26a1, 0x26ab, 0x26be, 0x26c5, 0x26ce, 0x26d4, 0x26ea, 0x26f3, 0x26f5, @@ -67,6 +68,127 @@ HWY_ALIGN T eaw_lte[] = { 0, 0, 0, 0, 0, 0, 0, 0, }; +/// These are the ranges of codepoints that are DEFINITELY width 0. +HWY_ALIGN constexpr T zero_gte[] = { + 0xad, 0x70f, 0x890, 0x180e, 0x200b, 0x202a, 0x2060, 0x2066, + 0xfeff, 0xfff9, 0x110bd, 0x110cd, 0x13430, 0x1bca0, 0x1d173, 0xe0001, + 0xe0020, 0x488, 0x1abe, 0x20dd, 0x20e2, 0xa670, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN constexpr T zero_lte[] = { + 0xad, 0x70f, 0x891, 0x180e, 0x200f, 0x202e, 0x2064, 0x206f, + 0xfeff, 0xfffb, 0x110bd, 0x110cd, 0x1343f, 0x1bca3, 0x1d17a, 0xe0001, + 0xe007f, 0x489, 0x1abe, 0x20e0, 0x20e4, 0xa672, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, +}; + +/// Non-spacing marks +HWY_ALIGN constexpr T nsm_gte[] = { + 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, + 0x64b, 0x670, 0x6d6, 0x6df, 0x6e7, 0x6ea, 0x711, 0x730, + 0x7a6, 0x7eb, 0x7fd, 0x816, 0x81b, 0x825, 0x829, 0x859, + 0x898, 0x8ca, 0x8e3, 0x93a, 0x93c, 0x941, 0x94d, 0x951, + 0x962, 0x981, 0x9bc, 0x9c1, 0x9cd, 0x9e2, 0x9fe, 0xa01, + 0xa3c, 0xa41, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, + 0xabc, 0xac1, 0xac7, 0xacd, 0xae2, 0xafa, 0xb01, 0xb3c, + 0xb3f, 0xb41, 0xb4d, 0xb55, 0xb62, 0xb82, 0xbc0, 0xbcd, + 0xc00, 0xc04, 0xc3c, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, + 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccc, 0xce2, 0xd00, 0xd3b, + 0xd41, 0xd4d, 0xd62, 0xd81, 0xdca, 0xdd2, 0xdd6, 0xe31, + 0xe34, 0xe47, 0xeb1, 0xeb4, 0xec8, 0xf18, 0xf35, 0xf37, + 0xf39, 0xf71, 0xf80, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102d, + 0x1032, 0x1039, 0x103d, 0x1058, 0x105e, 0x1071, 0x1082, 0x1085, + 0x108d, 0x109d, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4, + 0x17b7, 0x17c6, 0x17c9, 0x17dd, 0x180b, 0x180f, 0x1885, 0x18a9, + 0x1920, 0x1927, 0x1932, 0x1939, 0x1a17, 0x1a1b, 0x1a56, 0x1a58, + 0x1a60, 0x1a62, 0x1a65, 0x1a73, 0x1a7f, 0x1ab0, 0x1abf, 0x1b00, + 0x1b34, 0x1b36, 0x1b3c, 0x1b42, 0x1b6b, 0x1b80, 0x1ba2, 0x1ba8, + 0x1bab, 0x1be6, 0x1be8, 0x1bed, 0x1bef, 0x1c2c, 0x1c36, 0x1cd0, + 0x1cd4, 0x1ce2, 0x1ced, 0x1cf4, 0x1cf8, 0x1dc0, 0x20d0, 0x20e1, + 0x20e5, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674, + 0xa69e, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa825, 0xa82c, 0xa8c4, + 0xa8e0, 0xa8ff, 0xa926, 0xa947, 0xa980, 0xa9b3, 0xa9b6, 0xa9bc, + 0xa9e5, 0xaa29, 0xaa31, 0xaa35, 0xaa43, 0xaa4c, 0xaa7c, 0xaab0, + 0xaab2, 0xaab7, 0xaabe, 0xaac1, 0xaaec, 0xaaf6, 0xabe5, 0xabe8, + 0xabed, 0xfb1e, 0xfe00, 0xfe20, 0x101fd, 0x102e0, 0x10376, 0x10a01, + 0x10a05, 0x10a0c, 0x10a38, 0x10a3f, 0x10ae5, 0x10d24, 0x10eab, 0x10efd, + 0x10f46, 0x10f82, 0x11001, 0x11038, 0x11070, 0x11073, 0x1107f, 0x110b3, + 0x110b9, 0x110c2, 0x11100, 0x11127, 0x1112d, 0x11173, 0x11180, 0x111b6, + 0x111c9, 0x111cf, 0x1122f, 0x11234, 0x11236, 0x1123e, 0x11241, 0x112df, + 0x112e3, 0x11300, 0x1133b, 0x11340, 0x11366, 0x11370, 0x11438, 0x11442, + 0x11446, 0x1145e, 0x114b3, 0x114ba, 0x114bf, 0x114c2, 0x115b2, 0x115bc, + 0x115bf, 0x115dc, 0x11633, 0x1163d, 0x1163f, 0x116ab, 0x116ad, 0x116b0, + 0x116b7, 0x1171d, 0x11722, 0x11727, 0x1182f, 0x11839, 0x1193b, 0x1193e, + 0x11943, 0x119d4, 0x119da, 0x119e0, 0x11a01, 0x11a33, 0x11a3b, 0x11a47, + 0x11a51, 0x11a59, 0x11a8a, 0x11a98, 0x11c30, 0x11c38, 0x11c3f, 0x11c92, + 0x11caa, 0x11cb2, 0x11cb5, 0x11d31, 0x11d3a, 0x11d3c, 0x11d3f, 0x11d47, + 0x11d90, 0x11d95, 0x11d97, 0x11ef3, 0x11f00, 0x11f36, 0x11f40, 0x11f42, + 0x13440, 0x13447, 0x16af0, 0x16b30, 0x16f4f, 0x16f8f, 0x16fe4, 0x1bc9d, + 0x1cf00, 0x1cf30, 0x1d167, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0x1da00, + 0x1da3b, 0x1da75, 0x1da84, 0x1da9b, 0x1daa1, 0x1e000, 0x1e008, 0x1e01b, + 0x1e023, 0x1e026, 0x1e08f, 0x1e130, 0x1e2ae, 0x1e2ec, 0x1e4ec, 0x1e8d0, + 0x1e944, 0xe0100, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, +}; + +HWY_ALIGN constexpr T nsm_lte[] = { + 0x36f, 0x487, 0x5bd, 0x5bf, 0x5c2, 0x5c5, 0x5c7, 0x61a, + 0x65f, 0x670, 0x6dc, 0x6e4, 0x6e8, 0x6ed, 0x711, 0x74a, + 0x7b0, 0x7f3, 0x7fd, 0x819, 0x823, 0x827, 0x82d, 0x85b, + 0x89f, 0x8e1, 0x902, 0x93a, 0x93c, 0x948, 0x94d, 0x957, + 0x963, 0x981, 0x9bc, 0x9c4, 0x9cd, 0x9e3, 0x9fe, 0xa02, + 0xa3c, 0xa42, 0xa48, 0xa4d, 0xa51, 0xa71, 0xa75, 0xa82, + 0xabc, 0xac5, 0xac8, 0xacd, 0xae3, 0xaff, 0xb01, 0xb3c, + 0xb3f, 0xb44, 0xb4d, 0xb56, 0xb63, 0xb82, 0xbc0, 0xbcd, + 0xc00, 0xc04, 0xc3c, 0xc40, 0xc48, 0xc4d, 0xc56, 0xc63, + 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccd, 0xce3, 0xd01, 0xd3c, + 0xd44, 0xd4d, 0xd63, 0xd81, 0xdca, 0xdd4, 0xdd6, 0xe31, + 0xe3a, 0xe4e, 0xeb1, 0xebc, 0xece, 0xf19, 0xf35, 0xf37, + 0xf39, 0xf7e, 0xf84, 0xf87, 0xf97, 0xfbc, 0xfc6, 0x1030, + 0x1037, 0x103a, 0x103e, 0x1059, 0x1060, 0x1074, 0x1082, 0x1086, + 0x108d, 0x109d, 0x135f, 0x1714, 0x1733, 0x1753, 0x1773, 0x17b5, + 0x17bd, 0x17c6, 0x17d3, 0x17dd, 0x180d, 0x180f, 0x1886, 0x18a9, + 0x1922, 0x1928, 0x1932, 0x193b, 0x1a18, 0x1a1b, 0x1a56, 0x1a5e, + 0x1a60, 0x1a62, 0x1a6c, 0x1a7c, 0x1a7f, 0x1abd, 0x1ace, 0x1b03, + 0x1b34, 0x1b3a, 0x1b3c, 0x1b42, 0x1b73, 0x1b81, 0x1ba5, 0x1ba9, + 0x1bad, 0x1be6, 0x1be9, 0x1bed, 0x1bf1, 0x1c33, 0x1c37, 0x1cd2, + 0x1ce0, 0x1ce8, 0x1ced, 0x1cf4, 0x1cf9, 0x1dff, 0x20dc, 0x20e1, + 0x20f0, 0x2cf1, 0x2d7f, 0x2dff, 0x302d, 0x309a, 0xa66f, 0xa67d, + 0xa69f, 0xa6f1, 0xa802, 0xa806, 0xa80b, 0xa826, 0xa82c, 0xa8c5, + 0xa8f1, 0xa8ff, 0xa92d, 0xa951, 0xa982, 0xa9b3, 0xa9b9, 0xa9bd, + 0xa9e5, 0xaa2e, 0xaa32, 0xaa36, 0xaa43, 0xaa4c, 0xaa7c, 0xaab0, + 0xaab4, 0xaab8, 0xaabf, 0xaac1, 0xaaed, 0xaaf6, 0xabe5, 0xabe8, + 0xabed, 0xfb1e, 0xfe0f, 0xfe2f, 0x101fd, 0x102e0, 0x1037a, 0x10a03, + 0x10a06, 0x10a0f, 0x10a3a, 0x10a3f, 0x10ae6, 0x10d27, 0x10eac, 0x10eff, + 0x10f50, 0x10f85, 0x11001, 0x11046, 0x11070, 0x11074, 0x11081, 0x110b6, + 0x110ba, 0x110c2, 0x11102, 0x1112b, 0x11134, 0x11173, 0x11181, 0x111be, + 0x111cc, 0x111cf, 0x11231, 0x11234, 0x11237, 0x1123e, 0x11241, 0x112df, + 0x112ea, 0x11301, 0x1133c, 0x11340, 0x1136c, 0x11374, 0x1143f, 0x11444, + 0x11446, 0x1145e, 0x114b8, 0x114ba, 0x114c0, 0x114c3, 0x115b5, 0x115bd, + 0x115c0, 0x115dd, 0x1163a, 0x1163d, 0x11640, 0x116ab, 0x116ad, 0x116b5, + 0x116b7, 0x1171f, 0x11725, 0x1172b, 0x11837, 0x1183a, 0x1193c, 0x1193e, + 0x11943, 0x119d7, 0x119db, 0x119e0, 0x11a0a, 0x11a38, 0x11a3e, 0x11a47, + 0x11a56, 0x11a5b, 0x11a96, 0x11a99, 0x11c36, 0x11c3d, 0x11c3f, 0x11ca7, + 0x11cb0, 0x11cb3, 0x11cb6, 0x11d36, 0x11d3a, 0x11d3d, 0x11d45, 0x11d47, + 0x11d91, 0x11d95, 0x11d97, 0x11ef4, 0x11f01, 0x11f3a, 0x11f40, 0x11f42, + 0x13440, 0x13455, 0x16af4, 0x16b36, 0x16f4f, 0x16f92, 0x16fe4, 0x1bc9e, + 0x1cf2d, 0x1cf46, 0x1d169, 0x1d182, 0x1d18b, 0x1d1ad, 0x1d244, 0x1da36, + 0x1da6c, 0x1da75, 0x1da84, 0x1da9f, 0x1daaf, 0x1e006, 0x1e018, 0x1e021, + 0x1e024, 0x1e02a, 0x1e08f, 0x1e136, 0x1e2ae, 0x1e2ef, 0x1e4ef, 0x1e8d6, + 0x1e94a, 0xe01ef, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, +}; + +// All our tables must be identically sized +static_assert(std::size(eaw_gte) == std::size(eaw_lte)); +static_assert(std::size(zero_gte) == std::size(zero_lte)); +static_assert(std::size(nsm_gte) == std::size(nsm_lte)); + template int8_t CodepointWidthImpl(D d, T input) { // If the input is ASCII, then we return 1. We do NOT check for @@ -88,12 +210,15 @@ int8_t CodepointWidthImpl(D d, T input) { // a vector register of any size up to 16 bytes (AVX512). // // Ranges: two-em dash, gbp.isRegionalIndicator, CJK... + // + // NOTE: 0x2E3B is technically width 3 but for our terminal we only + // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN T gte_keys[] = { - 0x2E3A, 0x1f1e6, 0x3400, 0x4E00, 0xF900, 0x20000, 0x30000, 0, + 0x2E3A, 0x1f1e6, 0x3400, 0x4E00, 0xF900, 0x20000, 0x30000, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN T lte_keys[] = { - 0x2E3A, 0x1f1ff, 0x4DBF, 0x9FFF, 0xFAFF, 0x2FFFD, 0x3FFFD, 0, + 0x2E3A, 0x1f1ff, 0x4DBF, 0x9FFF, 0xFAFF, 0x2FFFD, 0x3FFFD, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; @@ -129,20 +254,65 @@ int8_t CodepointWidthImpl(D d, T input) { } } - if (input >= eaw_lte[0] && input <= 0x323af) { - size_t i = 0; - for (; i + N <= std::size(eaw_lte) && eaw_lte[i] != 0; i += N) { - const hn::Vec lte_vec = hn::Load(d, eaw_lte + i); - const hn::Vec gte_vec = hn::Load(d, eaw_gte + i); - const intptr_t idx = hn::FindFirstTrue( - d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); - if (idx >= 0) { - return 2; + { + constexpr T zero_gte_min = + *std::min_element(zero_gte, zero_gte + std::size(zero_gte)); + constexpr T zero_lte_max = + *std::max_element(zero_lte, zero_lte + std::size(zero_lte)); + if (input >= zero_gte_min && input <= zero_lte_max) { + size_t i = 0; + for (; i + N <= std::size(zero_gte) && zero_gte[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, zero_lte + i); + const hn::Vec gte_vec = hn::Load(d, zero_gte + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } } } } - return ghostty_ziglyph_codepoint_width(input); + { + constexpr T eaw_gte_min = + *std::min_element(eaw_gte, eaw_gte + std::size(eaw_gte)); + constexpr T eaw_lte_max = + *std::max_element(eaw_lte, eaw_lte + std::size(eaw_lte)); + if (input >= eaw_gte_min && input <= eaw_lte_max) { + size_t i = 0; + for (; i + N <= std::size(eaw_lte) && eaw_lte[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, eaw_lte + i); + const hn::Vec gte_vec = hn::Load(d, eaw_gte + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 2; + } + } + } + } + + { + constexpr T nsm_gte_min = + *std::min_element(nsm_gte, nsm_gte + std::size(nsm_gte)); + constexpr T nsm_lte_max = + *std::max_element(nsm_lte, nsm_lte + std::size(nsm_lte)); + if (input >= nsm_gte_min && input <= nsm_lte_max) { + size_t i = 0; + for (; i + N <= std::size(nsm_lte) && nsm_lte[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, nsm_lte + i); + const hn::Vec gte_vec = hn::Load(d, nsm_gte + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } + } + } + } + + return 1; + // return ghostty_ziglyph_codepoint_width(input); } int8_t CodepointWidth(T input) { diff --git a/src/simd/codepoint_width.zig b/src/simd/codepoint_width.zig index c9feb4e63..2be02462f 100644 --- a/src/simd/codepoint_width.zig +++ b/src/simd/codepoint_width.zig @@ -37,7 +37,12 @@ test "codepointWidth matches ziglyph" { for (min..std.math.maxInt(u21)) |cp| { const simd = codepointWidth(@intCast(cp)); const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half); - if (simd != zg) { + if (simd != zg) mismatch: { + if (cp == 0x2E3B) { + try testing.expectEqual(@as(i8, 2), simd); + break :mismatch; + } + std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg }); try testing.expect(false); } From 5692d39067b3f6fd6c179fcce8fe0972e4b4246d Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:17:26 -0800 Subject: [PATCH 04/12] bench/codepoint-width: add wcwidth --- src/bench/codepoint-width.sh | 4 +++- src/bench/codepoint-width.zig | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index d7e70f48d..1b94e7717 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -8,7 +8,7 @@ # - "ascii", uniform random ASCII bytes # - "utf8", uniform random unicode characters, encoded as utf8 # - "rand", pure random data, will contain many invalid code sequences. -DATA="utf8" +DATA="ascii" SIZE="25000000" # Add additional arguments @@ -25,6 +25,8 @@ hyperfine \ --warmup 10 \ -n baseline \ "./zig-out/bin/bench-codepoint-width --mode=baseline${ARGS} try benchBaseline(reader, buf), + .wcwidth => try benchWcwidth(reader, buf), .ziglyph => try benchZiglyph(reader, buf), .simd => try benchSimd(reader, buf), } @@ -94,6 +98,32 @@ noinline fn benchBaseline( } } +extern "c" fn wcwidth(c: u32) c_int; + +noinline fn benchWcwidth( + reader: anytype, + buf: []u8, +) !void { + var d: UTF8Decoder = .{}; + while (true) { + const n = try reader.read(buf); + if (n == 0) break; + + // Using stream.next directly with a for loop applies a naive + // scalar approach. + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + const width = wcwidth(cp); + + // Write the width to the buffer to avoid it being compiled away + buf[0] = @intCast(width); + } + } + } +} + noinline fn benchZiglyph( reader: anytype, buf: []u8, From 697fbe21ec2dbf55458bb9d6f178f537f3d3db44 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:18:02 -0800 Subject: [PATCH 05/12] simd/codepoint-width: reinclude correct file --- src/simd/codepoint_width.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 5d04b94e2..22b06c3a4 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -1,7 +1,7 @@ // Generates code for every target that this compiler can support. #undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "simd/vt.cpp" // this file -#include // must come before highway.h +#define HWY_TARGET_INCLUDE "simd/codepoint_width.cpp" // this file +#include // must come before highway.h #include #include From 46a887578ae996c9b551e8d0b4898273ac298010 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:21:33 -0800 Subject: [PATCH 06/12] simd: remove ziglyph fallback --- src/simd/codepoint_width.cpp | 11 +++------ src/simd/codepoint_width.zig | 47 ++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 22b06c3a4..5481d90c7 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -15,8 +15,6 @@ namespace hn = hwy::HWY_NAMESPACE; using T = uint32_t; -extern "C" int8_t ghostty_ziglyph_codepoint_width(uint32_t); - // East Asian Width HWY_ALIGN constexpr T eaw_gte[] = { 0x3000, 0xff01, 0xffe0, 0x1100, 0x231a, 0x2329, 0x232a, 0x23e9, @@ -74,7 +72,7 @@ HWY_ALIGN constexpr T zero_gte[] = { 0xfeff, 0xfff9, 0x110bd, 0x110cd, 0x13430, 0x1bca0, 0x1d173, 0xe0001, 0xe0020, 0x488, 0x1abe, 0x20dd, 0x20e2, 0xa670, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T zero_lte[] = { @@ -82,7 +80,7 @@ HWY_ALIGN constexpr T zero_lte[] = { 0xfeff, 0xfffb, 0x110bd, 0x110cd, 0x1343f, 0x1bca3, 0x1d17a, 0xe0001, 0xe007f, 0x489, 0x1abe, 0x20e0, 0x20e4, 0xa672, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, }; /// Non-spacing marks @@ -132,7 +130,7 @@ HWY_ALIGN constexpr T nsm_gte[] = { 0x1e023, 0x1e026, 0x1e08f, 0x1e130, 0x1e2ae, 0x1e2ec, 0x1e4ec, 0x1e8d0, 0x1e944, 0xe0100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T nsm_lte[] = { @@ -181,7 +179,7 @@ HWY_ALIGN constexpr T nsm_lte[] = { 0x1e024, 0x1e02a, 0x1e08f, 0x1e136, 0x1e2ae, 0x1e2ef, 0x1e4ef, 0x1e8d6, 0x1e94a, 0xe01ef, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, }; // All our tables must be identically sized @@ -312,7 +310,6 @@ int8_t CodepointWidthImpl(D d, T input) { } return 1; - // return ghostty_ziglyph_codepoint_width(input); } int8_t CodepointWidth(T input) { diff --git a/src/simd/codepoint_width.zig b/src/simd/codepoint_width.zig index 2be02462f..aab4bdd95 100644 --- a/src/simd/codepoint_width.zig +++ b/src/simd/codepoint_width.zig @@ -22,29 +22,24 @@ test "codepointWidth basic" { // try testing.expectEqual(@as(i8, 1), @import("ziglyph").display_width.codePointWidth(0x100, .half)); } -pub export fn ghostty_ziglyph_codepoint_width(cp: u32) callconv(.C) i8 { - return @import("ziglyph").display_width.codePointWidth(@intCast(cp), .half); -} - -test "codepointWidth matches ziglyph" { - const testing = std.testing; - const ziglyph = @import("ziglyph"); - - // try testing.expect(ziglyph.general_category.isNonspacingMark(0x16fe4)); - // if (true) return; - - const min = 0xFF + 1; // start outside ascii - for (min..std.math.maxInt(u21)) |cp| { - const simd = codepointWidth(@intCast(cp)); - const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half); - if (simd != zg) mismatch: { - if (cp == 0x2E3B) { - try testing.expectEqual(@as(i8, 2), simd); - break :mismatch; - } - - std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg }); - try testing.expect(false); - } - } -} +// This is not very fast in debug modes, so its commented by default. +// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES. +// test "codepointWidth matches ziglyph" { +// const testing = std.testing; +// const ziglyph = @import("ziglyph"); +// +// const min = 0xFF + 1; // start outside ascii +// for (min..std.math.maxInt(u21)) |cp| { +// const simd = codepointWidth(@intCast(cp)); +// const zg = ziglyph.display_width.codePointWidth(@intCast(cp), .half); +// if (simd != zg) mismatch: { +// if (cp == 0x2E3B) { +// try testing.expectEqual(@as(i8, 2), simd); +// break :mismatch; +// } +// +// std.log.warn("mismatch cp=U+{x} simd={} zg={}", .{ cp, simd, zg }); +// try testing.expect(false); +// } +// } +// } From 3c31217f3cf66950e638f59f421beb40453e247f Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:28:56 -0800 Subject: [PATCH 07/12] simd: minor tweaks --- src/bench/codepoint-width.sh | 4 ++-- src/bench/codepoint-width.zig | 8 ++++---- src/simd/codepoint_width.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index 1b94e7717..df84be796 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -23,8 +23,8 @@ ARGS="" hyperfine \ --warmup 10 \ - -n baseline \ - "./zig-out/bin/bench-codepoint-width --mode=baseline${ARGS} try benchBaseline(reader, buf), + .noop => try benchNoop(reader, buf), .wcwidth => try benchWcwidth(reader, buf), .ziglyph => try benchZiglyph(reader, buf), .simd => try benchSimd(reader, buf), } } -noinline fn benchBaseline( +noinline fn benchNoop( reader: anytype, buf: []u8, ) !void { diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 5481d90c7..795eb4a92 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -211,11 +211,11 @@ int8_t CodepointWidthImpl(D d, T input) { // // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. - HWY_ALIGN T gte_keys[] = { + HWY_ALIGN constexpr T gte_keys[] = { 0x2E3A, 0x1f1e6, 0x3400, 0x4E00, 0xF900, 0x20000, 0x30000, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, }; - HWY_ALIGN T lte_keys[] = { + HWY_ALIGN constexpr T lte_keys[] = { 0x2E3A, 0x1f1ff, 0x4DBF, 0x9FFF, 0xFAFF, 0x2FFFD, 0x3FFFD, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, }; @@ -234,10 +234,10 @@ int8_t CodepointWidthImpl(D d, T input) { { // Definitely width 0 - HWY_ALIGN T gte_keys[] = { + HWY_ALIGN constexpr T gte_keys[] = { 0x1160, 0x2060, 0xFFF0, 0xE0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - HWY_ALIGN T lte_keys[] = { + HWY_ALIGN constexpr T lte_keys[] = { 0x11FF, 0x206F, 0xFFF8, 0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; From 17dc64053ea8aac281a47c0f12f81753d45c69bf Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 09:38:17 -0800 Subject: [PATCH 08/12] terminal: swap codepointwidth implementations --- src/bench/codepoint-width.sh | 2 +- src/simd/codepoint_width.cpp | 14 ++++---------- src/terminal/Terminal.zig | 37 ++++++++---------------------------- 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/src/bench/codepoint-width.sh b/src/bench/codepoint-width.sh index df84be796..d0692bc43 100755 --- a/src/bench/codepoint-width.sh +++ b/src/bench/codepoint-width.sh @@ -8,7 +8,7 @@ # - "ascii", uniform random ASCII bytes # - "utf8", uniform random unicode characters, encoded as utf8 # - "rand", pure random data, will contain many invalid code sequences. -DATA="ascii" +DATA="utf8" SIZE="25000000" # Add additional arguments diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 795eb4a92..9bfe37a2f 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -187,12 +187,15 @@ static_assert(std::size(eaw_gte) == std::size(eaw_lte)); static_assert(std::size(zero_gte) == std::size(zero_lte)); static_assert(std::size(nsm_gte) == std::size(nsm_lte)); +/// Vectorized implementation of Unicode display width. Determining width +/// unfortunately requires many small range checks, so we test some fast paths +/// and otherwise try to do N (vector lane width) range checks at a time. template int8_t CodepointWidthImpl(D d, T input) { // If the input is ASCII, then we return 1. We do NOT check for // control characters because we assume that the input has already // been checked for that case. - if (input < 0xFF) { + if (input <= 0xFF) { return 1; } @@ -201,14 +204,6 @@ int8_t CodepointWidthImpl(D d, T input) { const hn::Vec input_vec = Set(d, input); { - // Thes are the ranges (inclusive) of the codepoints that are DEFINITELY - // width 2. We will check as many in parallel as possible. - // - // The zero padding is so that we can always load aligned directly into - // a vector register of any size up to 16 bytes (AVX512). - // - // Ranges: two-em dash, gbp.isRegionalIndicator, CJK... - // // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN constexpr T gte_keys[] = { @@ -233,7 +228,6 @@ int8_t CodepointWidthImpl(D d, T input) { } { - // Definitely width 0 HWY_ALIGN constexpr T gte_keys[] = { 0x1160, 0x2060, 0xFFF0, 0xE0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index 66643fbd9..5110a8d2a 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -10,6 +10,7 @@ const ziglyph = @import("ziglyph"); const testing = std.testing; const assert = std.debug.assert; const Allocator = std.mem.Allocator; +const simd = @import("../simd/main.zig"); const ansi = @import("ansi.zig"); const modes = @import("modes.zig"); @@ -869,10 +870,13 @@ pub fn print(self: *Terminal, c: u21) !void { // Determine the width of this character so we can handle // non-single-width characters properly. - const width: usize = @intCast(@min( - @max(0, ziglyph.display_width.codePointWidth(c, .half)), - 2, - )); + const width: usize = @intCast(simd.codepointWidth(c)); + + // Old implementation, 3x slower on ASCII, 2x slower on CJK, etc. + // const width: usize = @intCast(@min( + // @max(0, ziglyph.display_width.codePointWidth(c, .half)), + // 2, + // )); // Note: it is possible to have a width of "3" and a width of "-1" // from ziglyph. We should look into those cases and handle them @@ -2304,31 +2308,6 @@ test "Terminal: print over wide spacer tail" { } } -test "Terminal: zero width chars with grapheme clustering can be put in their own cell" { - var t = try init(testing.allocator, 5, 5); - defer t.deinit(testing.allocator); - - // Enable grapheme clustering - t.modes.set(.grapheme_cluster, true); - - try t.print('x'); - try t.print(0x7F); // zero-width control character - - { - const str = try t.plainString(testing.allocator); - defer testing.allocator.free(str); - try testing.expectEqualStrings("x", str); - } - - const row = t.screen.getRow(.{ .screen = 0 }); - { - const cell = row.getCell(0); - try testing.expectEqual(@as(u32, 'x'), cell.char); - try testing.expect(!cell.attrs.wide); - try testing.expect(!cell.attrs.grapheme); - } -} - test "Terminal: VS15 to make narrow character" { var t = try init(testing.allocator, 5, 5); defer t.deinit(testing.allocator); From a69d3c7f1c9178e2c9520840beb30600305c215f Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 13:59:45 -0800 Subject: [PATCH 09/12] simd/codepoint-width: split by 16-bit --- src/simd/codepoint_width.cpp | 512 ++++++++++++++++++++++------------- 1 file changed, 326 insertions(+), 186 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 9bfe37a2f..bf8cae370 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -13,193 +13,207 @@ namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -using T = uint32_t; - // East Asian Width -HWY_ALIGN constexpr T eaw_gte[] = { - 0x3000, 0xff01, 0xffe0, 0x1100, 0x231a, 0x2329, 0x232a, 0x23e9, - 0x23f0, 0x23f3, 0x25fd, 0x2614, 0x2648, 0x267f, 0x2693, 0x26a1, - 0x26aa, 0x26bd, 0x26c4, 0x26ce, 0x26d4, 0x26ea, 0x26f2, 0x26f5, - 0x26fa, 0x26fd, 0x2705, 0x270a, 0x2728, 0x274c, 0x274e, 0x2753, - 0x2757, 0x2795, 0x27b0, 0x27bf, 0x2b1b, 0x2b50, 0x2b55, 0x2e80, - 0x2e9b, 0x2f00, 0x2ff0, 0x3001, 0x302e, 0x3041, 0x309b, 0x309d, - 0x309f, 0x30a0, 0x30a1, 0x30fb, 0x30fc, 0x30ff, 0x3105, 0x3131, - 0x3190, 0x3192, 0x3196, 0x31a0, 0x31c0, 0x31f0, 0x3200, 0x3220, - 0x322a, 0x3250, 0x3251, 0x3260, 0x3280, 0x328a, 0x32b1, 0x32c0, - 0x3400, 0x4e00, 0xa015, 0xa016, 0xa490, 0xa960, 0xac00, 0xf900, - 0xfa70, 0xfe10, 0xfe30, 0xfe54, 0xfe68, 0x16fe0, 0x16ff0, 0x17000, - 0x18800, 0x18d00, 0x1aff0, 0x1aff5, 0x1affd, 0x1b000, 0x1b132, 0x1b150, - 0x1b155, 0x1b164, 0x1b170, 0x1f004, 0x1f0cf, 0x1f18e, 0x1f191, 0x1f200, - 0x1f210, 0x1f240, 0x1f250, 0x1f260, 0x1f300, 0x1f32d, 0x1f337, 0x1f37e, - 0x1f3a0, 0x1f3cf, 0x1f3e0, 0x1f3f4, 0x1f3f8, 0x1f3fb, 0x1f400, 0x1f440, - 0x1f442, 0x1f4ff, 0x1f54b, 0x1f550, 0x1f57a, 0x1f595, 0x1f5a4, 0x1f5fb, - 0x1f680, 0x1f6cc, 0x1f6d0, 0x1f6d5, 0x1f6dc, 0x1f6eb, 0x1f6f4, 0x1f7e0, - 0x1f7f0, 0x1f90c, 0x1f93c, 0x1f947, 0x1fa70, 0x1fa80, 0x1fa90, 0x1fabf, - 0x1face, 0x1fae0, 0x1faf0, 0x20000, 0x2a700, 0x2b740, 0x2b820, 0x2ceb0, - 0x2f800, 0x30000, 0x31350, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t eaw_gte32[] = { + 0x16fe0, 0x16ff0, 0x17000, 0x18800, 0x18d00, 0x1aff0, 0x1aff5, 0x1affd, + 0x1b000, 0x1b132, 0x1b150, 0x1b155, 0x1b164, 0x1b170, 0x1f004, 0x1f0cf, + 0x1f18e, 0x1f191, 0x1f200, 0x1f210, 0x1f240, 0x1f250, 0x1f260, 0x1f300, + 0x1f32d, 0x1f337, 0x1f37e, 0x1f3a0, 0x1f3cf, 0x1f3e0, 0x1f3f4, 0x1f3f8, + 0x1f3fb, 0x1f400, 0x1f440, 0x1f442, 0x1f4ff, 0x1f54b, 0x1f550, 0x1f57a, + 0x1f595, 0x1f5a4, 0x1f5fb, 0x1f680, 0x1f6cc, 0x1f6d0, 0x1f6d5, 0x1f6dc, + 0x1f6eb, 0x1f6f4, 0x1f7e0, 0x1f7f0, 0x1f90c, 0x1f93c, 0x1f947, 0x1fa70, + 0x1fa80, 0x1fa90, 0x1fabf, 0x1face, 0x1fae0, 0x1faf0, 0x20000, 0x2a700, + 0x2b740, 0x2b820, 0x2ceb0, 0x2f800, 0x30000, 0x31350, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -HWY_ALIGN constexpr T eaw_lte[] = { - 0x3000, 0xff60, 0xffe6, 0x115f, 0x231b, 0x2329, 0x232a, 0x23ec, - 0x23f0, 0x23f3, 0x25fe, 0x2615, 0x2653, 0x267f, 0x2693, 0x26a1, - 0x26ab, 0x26be, 0x26c5, 0x26ce, 0x26d4, 0x26ea, 0x26f3, 0x26f5, - 0x26fa, 0x26fd, 0x2705, 0x270b, 0x2728, 0x274c, 0x274e, 0x2755, - 0x2757, 0x2797, 0x27b0, 0x27bf, 0x2b1c, 0x2b50, 0x2b55, 0x2e99, - 0x2ef3, 0x2fd5, 0x2ffb, 0x3029, 0x303e, 0x3096, 0x309c, 0x309e, - 0x309f, 0x30a0, 0x30fa, 0x30fb, 0x30fe, 0x30ff, 0x312f, 0x318e, - 0x3191, 0x3195, 0x319f, 0x31bf, 0x31e3, 0x31ff, 0x321e, 0x3229, - 0x3247, 0x3250, 0x325f, 0x327f, 0x3289, 0x32b0, 0x32bf, 0x33ff, - 0x4bdf, 0xa014, 0xa015, 0xa48c, 0xa4c6, 0xa97c, 0xd7a3, 0xfa6d, - 0xfad9, 0xfe19, 0xfe52, 0xfe66, 0xfe6b, 0x16fe3, 0x16ff1, 0x187f7, - 0x18cd5, 0x18d08, 0x1aff3, 0x1affb, 0x1affe, 0x1b122, 0x1b132, 0x1b152, - 0x1b155, 0x1b167, 0x1b2fb, 0x1f004, 0x1f0cf, 0x1f18e, 0x1f19a, 0x1f202, - 0x1f23b, 0x1f248, 0x1f251, 0x1f265, 0x1f320, 0x1f335, 0x1f37c, 0x1f393, - 0x1f3ca, 0x1f3d3, 0x1f3f0, 0x1f3f4, 0x1f3fa, 0x1f3ff, 0x1f43e, 0x1f440, - 0x1f4fc, 0x1f53d, 0x1f54e, 0x1f567, 0x1f57a, 0x1f596, 0x1f5a4, 0x1f64f, - 0x1f6c5, 0x1f6cc, 0x1f6d2, 0x1f6d7, 0x1f6df, 0x1f6ec, 0x1f6fc, 0x1f7eb, - 0x1f7f0, 0x1f93a, 0x1f945, 0x1f9ff, 0x1fa7c, 0x1fa88, 0x1fabd, 0x1fac5, - 0x1fadb, 0x1fae8, 0x1faf8, 0x2a6df, 0x2b739, 0x2b81d, 0x2cea1, 0x2ebe0, - 0x2fa1d, 0x3134a, 0x323af, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t eaw_lte32[] = { + 0x16fe3, 0x16ff1, 0x187f7, 0x18cd5, 0x18d08, 0x1aff3, 0x1affb, 0x1affe, + 0x1b122, 0x1b132, 0x1b152, 0x1b155, 0x1b167, 0x1b2fb, 0x1f004, 0x1f0cf, + 0x1f18e, 0x1f19a, 0x1f202, 0x1f23b, 0x1f248, 0x1f251, 0x1f265, 0x1f320, + 0x1f335, 0x1f37c, 0x1f393, 0x1f3ca, 0x1f3d3, 0x1f3f0, 0x1f3f4, 0x1f3fa, + 0x1f3ff, 0x1f43e, 0x1f440, 0x1f4fc, 0x1f53d, 0x1f54e, 0x1f567, 0x1f57a, + 0x1f596, 0x1f5a4, 0x1f64f, 0x1f6c5, 0x1f6cc, 0x1f6d2, 0x1f6d7, 0x1f6df, + 0x1f6ec, 0x1f6fc, 0x1f7eb, 0x1f7f0, 0x1f93a, 0x1f945, 0x1f9ff, 0x1fa7c, + 0x1fa88, 0x1fabd, 0x1fac5, 0x1fadb, 0x1fae8, 0x1faf8, 0x2a6df, 0x2b739, + 0x2b81d, 0x2cea1, 0x2ebe0, 0x2fa1d, 0x3134a, 0x323af, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +HWY_ALIGN constexpr uint16_t eaw_gte16[] = { + 0x3000, 0xff01, 0xffe0, 0x1100, 0x231a, 0x2329, 0x232a, 0x23e9, 0x23f0, + 0x23f3, 0x25fd, 0x2614, 0x2648, 0x267f, 0x2693, 0x26a1, 0x26aa, 0x26bd, + 0x26c4, 0x26ce, 0x26d4, 0x26ea, 0x26f2, 0x26f5, 0x26fa, 0x26fd, 0x2705, + 0x270a, 0x2728, 0x274c, 0x274e, 0x2753, 0x2757, 0x2795, 0x27b0, 0x27bf, + 0x2b1b, 0x2b50, 0x2b55, 0x2e80, 0x2e9b, 0x2f00, 0x2ff0, 0x3001, 0x302e, + 0x3041, 0x309b, 0x309d, 0x309f, 0x30a0, 0x30a1, 0x30fb, 0x30fc, 0x30ff, + 0x3105, 0x3131, 0x3190, 0x3192, 0x3196, 0x31a0, 0x31c0, 0x31f0, 0x3200, + 0x3220, 0x322a, 0x3250, 0x3251, 0x3260, 0x3280, 0x328a, 0x32b1, 0x32c0, + 0x3400, 0x4e00, 0xa015, 0xa016, 0xa490, 0xa960, 0xac00, 0xf900, 0xfa70, + 0xfe10, 0xfe30, 0xfe54, 0xfe68, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN constexpr uint16_t eaw_lte16[] = { + 0x3000, 0xff60, 0xffe6, 0x115f, 0x231b, 0x2329, 0x232a, 0x23ec, 0x23f0, + 0x23f3, 0x25fe, 0x2615, 0x2653, 0x267f, 0x2693, 0x26a1, 0x26ab, 0x26be, + 0x26c5, 0x26ce, 0x26d4, 0x26ea, 0x26f3, 0x26f5, 0x26fa, 0x26fd, 0x2705, + 0x270b, 0x2728, 0x274c, 0x274e, 0x2755, 0x2757, 0x2797, 0x27b0, 0x27bf, + 0x2b1c, 0x2b50, 0x2b55, 0x2e99, 0x2ef3, 0x2fd5, 0x2ffb, 0x3029, 0x303e, + 0x3096, 0x309c, 0x309e, 0x309f, 0x30a0, 0x30fa, 0x30fb, 0x30fe, 0x30ff, + 0x312f, 0x318e, 0x3191, 0x3195, 0x319f, 0x31bf, 0x31e3, 0x31ff, 0x321e, + 0x3229, 0x3247, 0x3250, 0x325f, 0x327f, 0x3289, 0x32b0, 0x32bf, 0x33ff, + 0x4bdf, 0xa014, 0xa015, 0xa48c, 0xa4c6, 0xa97c, 0xd7a3, 0xfa6d, 0xfad9, + 0xfe19, 0xfe52, 0xfe66, 0xfe6b, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + /// These are the ranges of codepoints that are DEFINITELY width 0. -HWY_ALIGN constexpr T zero_gte[] = { - 0xad, 0x70f, 0x890, 0x180e, 0x200b, 0x202a, 0x2060, 0x2066, - 0xfeff, 0xfff9, 0x110bd, 0x110cd, 0x13430, 0x1bca0, 0x1d173, 0xe0001, - 0xe0020, 0x488, 0x1abe, 0x20dd, 0x20e2, 0xa670, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t zero_gte32[] = { + 0x110bd, 0x110cd, 0x13430, 0x1bca0, 0x1d173, 0xe0001, 0xe0020, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -HWY_ALIGN constexpr T zero_lte[] = { - 0xad, 0x70f, 0x891, 0x180e, 0x200f, 0x202e, 0x2064, 0x206f, - 0xfeff, 0xfffb, 0x110bd, 0x110cd, 0x1343f, 0x1bca3, 0x1d17a, 0xe0001, - 0xe007f, 0x489, 0x1abe, 0x20e0, 0x20e4, 0xa672, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t zero_lte32[] = { + 0x110bd, 0x110cd, 0x1343f, 0x1bca3, 0x1d17a, 0xe0001, 0xe007f, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN constexpr uint16_t zero_gte16[] = { + 0xad, 0x70f, 0x890, 0x180e, 0x200b, 0x202a, 0x2060, 0x2066, 0xfeff, + 0xfff9, 0x488, 0x1abe, 0x20dd, 0x20e2, 0xa670, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN constexpr uint16_t zero_lte16[] = { + 0xad, 0x70f, 0x891, 0x180e, 0x200f, 0x202e, 0x2064, 0x206f, 0xfeff, + 0xfffb, 0x489, 0x1abe, 0x20e0, 0x20e4, 0xa672, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /// Non-spacing marks -HWY_ALIGN constexpr T nsm_gte[] = { - 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, - 0x64b, 0x670, 0x6d6, 0x6df, 0x6e7, 0x6ea, 0x711, 0x730, - 0x7a6, 0x7eb, 0x7fd, 0x816, 0x81b, 0x825, 0x829, 0x859, - 0x898, 0x8ca, 0x8e3, 0x93a, 0x93c, 0x941, 0x94d, 0x951, - 0x962, 0x981, 0x9bc, 0x9c1, 0x9cd, 0x9e2, 0x9fe, 0xa01, - 0xa3c, 0xa41, 0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, - 0xabc, 0xac1, 0xac7, 0xacd, 0xae2, 0xafa, 0xb01, 0xb3c, - 0xb3f, 0xb41, 0xb4d, 0xb55, 0xb62, 0xb82, 0xbc0, 0xbcd, - 0xc00, 0xc04, 0xc3c, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, - 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccc, 0xce2, 0xd00, 0xd3b, - 0xd41, 0xd4d, 0xd62, 0xd81, 0xdca, 0xdd2, 0xdd6, 0xe31, - 0xe34, 0xe47, 0xeb1, 0xeb4, 0xec8, 0xf18, 0xf35, 0xf37, - 0xf39, 0xf71, 0xf80, 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102d, - 0x1032, 0x1039, 0x103d, 0x1058, 0x105e, 0x1071, 0x1082, 0x1085, - 0x108d, 0x109d, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, 0x17b4, - 0x17b7, 0x17c6, 0x17c9, 0x17dd, 0x180b, 0x180f, 0x1885, 0x18a9, - 0x1920, 0x1927, 0x1932, 0x1939, 0x1a17, 0x1a1b, 0x1a56, 0x1a58, - 0x1a60, 0x1a62, 0x1a65, 0x1a73, 0x1a7f, 0x1ab0, 0x1abf, 0x1b00, - 0x1b34, 0x1b36, 0x1b3c, 0x1b42, 0x1b6b, 0x1b80, 0x1ba2, 0x1ba8, - 0x1bab, 0x1be6, 0x1be8, 0x1bed, 0x1bef, 0x1c2c, 0x1c36, 0x1cd0, - 0x1cd4, 0x1ce2, 0x1ced, 0x1cf4, 0x1cf8, 0x1dc0, 0x20d0, 0x20e1, - 0x20e5, 0x2cef, 0x2d7f, 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674, - 0xa69e, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa825, 0xa82c, 0xa8c4, - 0xa8e0, 0xa8ff, 0xa926, 0xa947, 0xa980, 0xa9b3, 0xa9b6, 0xa9bc, - 0xa9e5, 0xaa29, 0xaa31, 0xaa35, 0xaa43, 0xaa4c, 0xaa7c, 0xaab0, - 0xaab2, 0xaab7, 0xaabe, 0xaac1, 0xaaec, 0xaaf6, 0xabe5, 0xabe8, - 0xabed, 0xfb1e, 0xfe00, 0xfe20, 0x101fd, 0x102e0, 0x10376, 0x10a01, - 0x10a05, 0x10a0c, 0x10a38, 0x10a3f, 0x10ae5, 0x10d24, 0x10eab, 0x10efd, - 0x10f46, 0x10f82, 0x11001, 0x11038, 0x11070, 0x11073, 0x1107f, 0x110b3, - 0x110b9, 0x110c2, 0x11100, 0x11127, 0x1112d, 0x11173, 0x11180, 0x111b6, - 0x111c9, 0x111cf, 0x1122f, 0x11234, 0x11236, 0x1123e, 0x11241, 0x112df, - 0x112e3, 0x11300, 0x1133b, 0x11340, 0x11366, 0x11370, 0x11438, 0x11442, - 0x11446, 0x1145e, 0x114b3, 0x114ba, 0x114bf, 0x114c2, 0x115b2, 0x115bc, - 0x115bf, 0x115dc, 0x11633, 0x1163d, 0x1163f, 0x116ab, 0x116ad, 0x116b0, - 0x116b7, 0x1171d, 0x11722, 0x11727, 0x1182f, 0x11839, 0x1193b, 0x1193e, - 0x11943, 0x119d4, 0x119da, 0x119e0, 0x11a01, 0x11a33, 0x11a3b, 0x11a47, - 0x11a51, 0x11a59, 0x11a8a, 0x11a98, 0x11c30, 0x11c38, 0x11c3f, 0x11c92, - 0x11caa, 0x11cb2, 0x11cb5, 0x11d31, 0x11d3a, 0x11d3c, 0x11d3f, 0x11d47, - 0x11d90, 0x11d95, 0x11d97, 0x11ef3, 0x11f00, 0x11f36, 0x11f40, 0x11f42, - 0x13440, 0x13447, 0x16af0, 0x16b30, 0x16f4f, 0x16f8f, 0x16fe4, 0x1bc9d, - 0x1cf00, 0x1cf30, 0x1d167, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0x1da00, - 0x1da3b, 0x1da75, 0x1da84, 0x1da9b, 0x1daa1, 0x1e000, 0x1e008, 0x1e01b, - 0x1e023, 0x1e026, 0x1e08f, 0x1e130, 0x1e2ae, 0x1e2ec, 0x1e4ec, 0x1e8d0, - 0x1e944, 0xe0100, 0, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t nsm_gte32[] = { + 0x101fd, 0x102e0, 0x10376, 0x10a01, 0x10a05, 0x10a0c, 0x10a38, 0x10a3f, + 0x10ae5, 0x10d24, 0x10eab, 0x10efd, 0x10f46, 0x10f82, 0x11001, 0x11038, + 0x11070, 0x11073, 0x1107f, 0x110b3, 0x110b9, 0x110c2, 0x11100, 0x11127, + 0x1112d, 0x11173, 0x11180, 0x111b6, 0x111c9, 0x111cf, 0x1122f, 0x11234, + 0x11236, 0x1123e, 0x11241, 0x112df, 0x112e3, 0x11300, 0x1133b, 0x11340, + 0x11366, 0x11370, 0x11438, 0x11442, 0x11446, 0x1145e, 0x114b3, 0x114ba, + 0x114bf, 0x114c2, 0x115b2, 0x115bc, 0x115bf, 0x115dc, 0x11633, 0x1163d, + 0x1163f, 0x116ab, 0x116ad, 0x116b0, 0x116b7, 0x1171d, 0x11722, 0x11727, + 0x1182f, 0x11839, 0x1193b, 0x1193e, 0x11943, 0x119d4, 0x119da, 0x119e0, + 0x11a01, 0x11a33, 0x11a3b, 0x11a47, 0x11a51, 0x11a59, 0x11a8a, 0x11a98, + 0x11c30, 0x11c38, 0x11c3f, 0x11c92, 0x11caa, 0x11cb2, 0x11cb5, 0x11d31, + 0x11d3a, 0x11d3c, 0x11d3f, 0x11d47, 0x11d90, 0x11d95, 0x11d97, 0x11ef3, + 0x11f00, 0x11f36, 0x11f40, 0x11f42, 0x13440, 0x13447, 0x16af0, 0x16b30, + 0x16f4f, 0x16f8f, 0x16fe4, 0x1bc9d, 0x1cf00, 0x1cf30, 0x1d167, 0x1d17b, + 0x1d185, 0x1d1aa, 0x1d242, 0x1da00, 0x1da3b, 0x1da75, 0x1da84, 0x1da9b, + 0x1daa1, 0x1e000, 0x1e008, 0x1e01b, 0x1e023, 0x1e026, 0x1e08f, 0x1e130, + 0x1e2ae, 0x1e2ec, 0x1e4ec, 0x1e8d0, 0x1e944, 0xe0100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -HWY_ALIGN constexpr T nsm_lte[] = { - 0x36f, 0x487, 0x5bd, 0x5bf, 0x5c2, 0x5c5, 0x5c7, 0x61a, - 0x65f, 0x670, 0x6dc, 0x6e4, 0x6e8, 0x6ed, 0x711, 0x74a, - 0x7b0, 0x7f3, 0x7fd, 0x819, 0x823, 0x827, 0x82d, 0x85b, - 0x89f, 0x8e1, 0x902, 0x93a, 0x93c, 0x948, 0x94d, 0x957, - 0x963, 0x981, 0x9bc, 0x9c4, 0x9cd, 0x9e3, 0x9fe, 0xa02, - 0xa3c, 0xa42, 0xa48, 0xa4d, 0xa51, 0xa71, 0xa75, 0xa82, - 0xabc, 0xac5, 0xac8, 0xacd, 0xae3, 0xaff, 0xb01, 0xb3c, - 0xb3f, 0xb44, 0xb4d, 0xb56, 0xb63, 0xb82, 0xbc0, 0xbcd, - 0xc00, 0xc04, 0xc3c, 0xc40, 0xc48, 0xc4d, 0xc56, 0xc63, - 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccd, 0xce3, 0xd01, 0xd3c, - 0xd44, 0xd4d, 0xd63, 0xd81, 0xdca, 0xdd4, 0xdd6, 0xe31, - 0xe3a, 0xe4e, 0xeb1, 0xebc, 0xece, 0xf19, 0xf35, 0xf37, - 0xf39, 0xf7e, 0xf84, 0xf87, 0xf97, 0xfbc, 0xfc6, 0x1030, - 0x1037, 0x103a, 0x103e, 0x1059, 0x1060, 0x1074, 0x1082, 0x1086, - 0x108d, 0x109d, 0x135f, 0x1714, 0x1733, 0x1753, 0x1773, 0x17b5, - 0x17bd, 0x17c6, 0x17d3, 0x17dd, 0x180d, 0x180f, 0x1886, 0x18a9, - 0x1922, 0x1928, 0x1932, 0x193b, 0x1a18, 0x1a1b, 0x1a56, 0x1a5e, - 0x1a60, 0x1a62, 0x1a6c, 0x1a7c, 0x1a7f, 0x1abd, 0x1ace, 0x1b03, - 0x1b34, 0x1b3a, 0x1b3c, 0x1b42, 0x1b73, 0x1b81, 0x1ba5, 0x1ba9, - 0x1bad, 0x1be6, 0x1be9, 0x1bed, 0x1bf1, 0x1c33, 0x1c37, 0x1cd2, - 0x1ce0, 0x1ce8, 0x1ced, 0x1cf4, 0x1cf9, 0x1dff, 0x20dc, 0x20e1, - 0x20f0, 0x2cf1, 0x2d7f, 0x2dff, 0x302d, 0x309a, 0xa66f, 0xa67d, - 0xa69f, 0xa6f1, 0xa802, 0xa806, 0xa80b, 0xa826, 0xa82c, 0xa8c5, - 0xa8f1, 0xa8ff, 0xa92d, 0xa951, 0xa982, 0xa9b3, 0xa9b9, 0xa9bd, - 0xa9e5, 0xaa2e, 0xaa32, 0xaa36, 0xaa43, 0xaa4c, 0xaa7c, 0xaab0, - 0xaab4, 0xaab8, 0xaabf, 0xaac1, 0xaaed, 0xaaf6, 0xabe5, 0xabe8, - 0xabed, 0xfb1e, 0xfe0f, 0xfe2f, 0x101fd, 0x102e0, 0x1037a, 0x10a03, - 0x10a06, 0x10a0f, 0x10a3a, 0x10a3f, 0x10ae6, 0x10d27, 0x10eac, 0x10eff, - 0x10f50, 0x10f85, 0x11001, 0x11046, 0x11070, 0x11074, 0x11081, 0x110b6, - 0x110ba, 0x110c2, 0x11102, 0x1112b, 0x11134, 0x11173, 0x11181, 0x111be, - 0x111cc, 0x111cf, 0x11231, 0x11234, 0x11237, 0x1123e, 0x11241, 0x112df, - 0x112ea, 0x11301, 0x1133c, 0x11340, 0x1136c, 0x11374, 0x1143f, 0x11444, - 0x11446, 0x1145e, 0x114b8, 0x114ba, 0x114c0, 0x114c3, 0x115b5, 0x115bd, - 0x115c0, 0x115dd, 0x1163a, 0x1163d, 0x11640, 0x116ab, 0x116ad, 0x116b5, - 0x116b7, 0x1171f, 0x11725, 0x1172b, 0x11837, 0x1183a, 0x1193c, 0x1193e, - 0x11943, 0x119d7, 0x119db, 0x119e0, 0x11a0a, 0x11a38, 0x11a3e, 0x11a47, - 0x11a56, 0x11a5b, 0x11a96, 0x11a99, 0x11c36, 0x11c3d, 0x11c3f, 0x11ca7, - 0x11cb0, 0x11cb3, 0x11cb6, 0x11d36, 0x11d3a, 0x11d3d, 0x11d45, 0x11d47, - 0x11d91, 0x11d95, 0x11d97, 0x11ef4, 0x11f01, 0x11f3a, 0x11f40, 0x11f42, - 0x13440, 0x13455, 0x16af4, 0x16b36, 0x16f4f, 0x16f92, 0x16fe4, 0x1bc9e, - 0x1cf2d, 0x1cf46, 0x1d169, 0x1d182, 0x1d18b, 0x1d1ad, 0x1d244, 0x1da36, - 0x1da6c, 0x1da75, 0x1da84, 0x1da9f, 0x1daaf, 0x1e006, 0x1e018, 0x1e021, - 0x1e024, 0x1e02a, 0x1e08f, 0x1e136, 0x1e2ae, 0x1e2ef, 0x1e4ef, 0x1e8d6, - 0x1e94a, 0xe01ef, 0, 0, 0, 0, 0, 0, +HWY_ALIGN constexpr uint32_t nsm_lte32[] = { + 0x101fd, 0x102e0, 0x1037a, 0x10a03, 0x10a06, 0x10a0f, 0x10a3a, 0x10a3f, + 0x10ae6, 0x10d27, 0x10eac, 0x10eff, 0x10f50, 0x10f85, 0x11001, 0x11046, + 0x11070, 0x11074, 0x11081, 0x110b6, 0x110ba, 0x110c2, 0x11102, 0x1112b, + 0x11134, 0x11173, 0x11181, 0x111be, 0x111cc, 0x111cf, 0x11231, 0x11234, + 0x11237, 0x1123e, 0x11241, 0x112df, 0x112ea, 0x11301, 0x1133c, 0x11340, + 0x1136c, 0x11374, 0x1143f, 0x11444, 0x11446, 0x1145e, 0x114b8, 0x114ba, + 0x114c0, 0x114c3, 0x115b5, 0x115bd, 0x115c0, 0x115dd, 0x1163a, 0x1163d, + 0x11640, 0x116ab, 0x116ad, 0x116b5, 0x116b7, 0x1171f, 0x11725, 0x1172b, + 0x11837, 0x1183a, 0x1193c, 0x1193e, 0x11943, 0x119d7, 0x119db, 0x119e0, + 0x11a0a, 0x11a38, 0x11a3e, 0x11a47, 0x11a56, 0x11a5b, 0x11a96, 0x11a99, + 0x11c36, 0x11c3d, 0x11c3f, 0x11ca7, 0x11cb0, 0x11cb3, 0x11cb6, 0x11d36, + 0x11d3a, 0x11d3d, 0x11d45, 0x11d47, 0x11d91, 0x11d95, 0x11d97, 0x11ef4, + 0x11f01, 0x11f3a, 0x11f40, 0x11f42, 0x13440, 0x13455, 0x16af4, 0x16b36, + 0x16f4f, 0x16f92, 0x16fe4, 0x1bc9e, 0x1cf2d, 0x1cf46, 0x1d169, 0x1d182, + 0x1d18b, 0x1d1ad, 0x1d244, 0x1da36, 0x1da6c, 0x1da75, 0x1da84, 0x1da9f, + 0x1daaf, 0x1e006, 0x1e018, 0x1e021, 0x1e024, 0x1e02a, 0x1e08f, 0x1e136, + 0x1e2ae, 0x1e2ef, 0x1e4ef, 0x1e8d6, 0x1e94a, 0xe01ef, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +HWY_ALIGN constexpr uint16_t nsm_gte16[] = { + 0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, + 0x670, 0x6d6, 0x6df, 0x6e7, 0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, + 0x7fd, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x898, 0x8ca, 0x8e3, + 0x93a, 0x93c, 0x941, 0x94d, 0x951, 0x962, 0x981, 0x9bc, 0x9c1, + 0x9cd, 0x9e2, 0x9fe, 0xa01, 0xa3c, 0xa41, 0xa47, 0xa4b, 0xa51, + 0xa70, 0xa75, 0xa81, 0xabc, 0xac1, 0xac7, 0xacd, 0xae2, 0xafa, + 0xb01, 0xb3c, 0xb3f, 0xb41, 0xb4d, 0xb55, 0xb62, 0xb82, 0xbc0, + 0xbcd, 0xc00, 0xc04, 0xc3c, 0xc3e, 0xc46, 0xc4a, 0xc55, 0xc62, + 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccc, 0xce2, 0xd00, 0xd3b, 0xd41, + 0xd4d, 0xd62, 0xd81, 0xdca, 0xdd2, 0xdd6, 0xe31, 0xe34, 0xe47, + 0xeb1, 0xeb4, 0xec8, 0xf18, 0xf35, 0xf37, 0xf39, 0xf71, 0xf80, + 0xf86, 0xf8d, 0xf99, 0xfc6, 0x102d, 0x1032, 0x1039, 0x103d, 0x1058, + 0x105e, 0x1071, 0x1082, 0x1085, 0x108d, 0x109d, 0x135d, 0x1712, 0x1732, + 0x1752, 0x1772, 0x17b4, 0x17b7, 0x17c6, 0x17c9, 0x17dd, 0x180b, 0x180f, + 0x1885, 0x18a9, 0x1920, 0x1927, 0x1932, 0x1939, 0x1a17, 0x1a1b, 0x1a56, + 0x1a58, 0x1a60, 0x1a62, 0x1a65, 0x1a73, 0x1a7f, 0x1ab0, 0x1abf, 0x1b00, + 0x1b34, 0x1b36, 0x1b3c, 0x1b42, 0x1b6b, 0x1b80, 0x1ba2, 0x1ba8, 0x1bab, + 0x1be6, 0x1be8, 0x1bed, 0x1bef, 0x1c2c, 0x1c36, 0x1cd0, 0x1cd4, 0x1ce2, + 0x1ced, 0x1cf4, 0x1cf8, 0x1dc0, 0x20d0, 0x20e1, 0x20e5, 0x2cef, 0x2d7f, + 0x2de0, 0x302a, 0x3099, 0xa66f, 0xa674, 0xa69e, 0xa6f0, 0xa802, 0xa806, + 0xa80b, 0xa825, 0xa82c, 0xa8c4, 0xa8e0, 0xa8ff, 0xa926, 0xa947, 0xa980, + 0xa9b3, 0xa9b6, 0xa9bc, 0xa9e5, 0xaa29, 0xaa31, 0xaa35, 0xaa43, 0xaa4c, + 0xaa7c, 0xaab0, 0xaab2, 0xaab7, 0xaabe, 0xaac1, 0xaaec, 0xaaf6, 0xabe5, + 0xabe8, 0xabed, 0xfb1e, 0xfe00, 0xfe20, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +HWY_ALIGN constexpr uint16_t nsm_lte16[] = { + 0x36f, 0x487, 0x5bd, 0x5bf, 0x5c2, 0x5c5, 0x5c7, 0x61a, 0x65f, + 0x670, 0x6dc, 0x6e4, 0x6e8, 0x6ed, 0x711, 0x74a, 0x7b0, 0x7f3, + 0x7fd, 0x819, 0x823, 0x827, 0x82d, 0x85b, 0x89f, 0x8e1, 0x902, + 0x93a, 0x93c, 0x948, 0x94d, 0x957, 0x963, 0x981, 0x9bc, 0x9c4, + 0x9cd, 0x9e3, 0x9fe, 0xa02, 0xa3c, 0xa42, 0xa48, 0xa4d, 0xa51, + 0xa71, 0xa75, 0xa82, 0xabc, 0xac5, 0xac8, 0xacd, 0xae3, 0xaff, + 0xb01, 0xb3c, 0xb3f, 0xb44, 0xb4d, 0xb56, 0xb63, 0xb82, 0xbc0, + 0xbcd, 0xc00, 0xc04, 0xc3c, 0xc40, 0xc48, 0xc4d, 0xc56, 0xc63, + 0xc81, 0xcbc, 0xcbf, 0xcc6, 0xccd, 0xce3, 0xd01, 0xd3c, 0xd44, + 0xd4d, 0xd63, 0xd81, 0xdca, 0xdd4, 0xdd6, 0xe31, 0xe3a, 0xe4e, + 0xeb1, 0xebc, 0xece, 0xf19, 0xf35, 0xf37, 0xf39, 0xf7e, 0xf84, + 0xf87, 0xf97, 0xfbc, 0xfc6, 0x1030, 0x1037, 0x103a, 0x103e, 0x1059, + 0x1060, 0x1074, 0x1082, 0x1086, 0x108d, 0x109d, 0x135f, 0x1714, 0x1733, + 0x1753, 0x1773, 0x17b5, 0x17bd, 0x17c6, 0x17d3, 0x17dd, 0x180d, 0x180f, + 0x1886, 0x18a9, 0x1922, 0x1928, 0x1932, 0x193b, 0x1a18, 0x1a1b, 0x1a56, + 0x1a5e, 0x1a60, 0x1a62, 0x1a6c, 0x1a7c, 0x1a7f, 0x1abd, 0x1ace, 0x1b03, + 0x1b34, 0x1b3a, 0x1b3c, 0x1b42, 0x1b73, 0x1b81, 0x1ba5, 0x1ba9, 0x1bad, + 0x1be6, 0x1be9, 0x1bed, 0x1bf1, 0x1c33, 0x1c37, 0x1cd2, 0x1ce0, 0x1ce8, + 0x1ced, 0x1cf4, 0x1cf9, 0x1dff, 0x20dc, 0x20e1, 0x20f0, 0x2cf1, 0x2d7f, + 0x2dff, 0x302d, 0x309a, 0xa66f, 0xa67d, 0xa69f, 0xa6f1, 0xa802, 0xa806, + 0xa80b, 0xa826, 0xa82c, 0xa8c5, 0xa8f1, 0xa8ff, 0xa92d, 0xa951, 0xa982, + 0xa9b3, 0xa9b9, 0xa9bd, 0xa9e5, 0xaa2e, 0xaa32, 0xaa36, 0xaa43, 0xaa4c, + 0xaa7c, 0xaab0, 0xaab4, 0xaab8, 0xaabf, 0xaac1, 0xaaed, 0xaaf6, 0xabe5, + 0xabe8, 0xabed, 0xfb1e, 0xfe0f, 0xfe2f, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + // All our tables must be identically sized -static_assert(std::size(eaw_gte) == std::size(eaw_lte)); -static_assert(std::size(zero_gte) == std::size(zero_lte)); -static_assert(std::size(nsm_gte) == std::size(nsm_lte)); +static_assert(std::size(eaw_gte32) == std::size(eaw_lte32)); +static_assert(std::size(eaw_gte16) == std::size(eaw_lte16)); +static_assert(std::size(zero_gte32) == std::size(zero_lte32)); +static_assert(std::size(zero_gte16) == std::size(zero_lte16)); +static_assert(std::size(nsm_gte32) == std::size(nsm_lte32)); +static_assert(std::size(nsm_gte16) == std::size(nsm_lte16)); -/// Vectorized implementation of Unicode display width. Determining width -/// unfortunately requires many small range checks, so we test some fast paths -/// and otherwise try to do N (vector lane width) range checks at a time. -template -int8_t CodepointWidthImpl(D d, T input) { - // If the input is ASCII, then we return 1. We do NOT check for - // control characters because we assume that the input has already - // been checked for that case. - if (input <= 0xFF) { - return 1; - } +/// Handles 16-bit codepoints. +template +int8_t CodepointWidth16(D d, uint16_t input) { + assert(input > 0xFF); + assert(input <= 0xFFFF); - // Its not ASCII, so lets move to vector ops to figure out the width. const size_t N = hn::Lanes(d); const hn::Vec input_vec = Set(d, input); @@ -207,12 +221,12 @@ int8_t CodepointWidthImpl(D d, T input) { // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN constexpr T gte_keys[] = { - 0x2E3A, 0x1f1e6, 0x3400, 0x4E00, 0xF900, 0x20000, 0x30000, 0x2E3B, - 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T lte_keys[] = { - 0x2E3A, 0x1f1ff, 0x4DBF, 0x9FFF, 0xFAFF, 0x2FFFD, 0x3FFFD, 0x2E3B, - 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x4DBF, 0x9FFF, 0xFAFF, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { @@ -224,15 +238,15 @@ int8_t CodepointWidthImpl(D d, T input) { return 2; } } - assert(i >= 7); // We should have checked all the ranges. + assert(i >= 5); // We should have checked all the ranges. } { HWY_ALIGN constexpr T gte_keys[] = { - 0x1160, 0x2060, 0xFFF0, 0xE0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x1160, 0x2060, 0xFFF0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T lte_keys[] = { - 0x11FF, 0x206F, 0xFFF8, 0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x11FF, 0x206F, 0xFFF8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { @@ -248,14 +262,14 @@ int8_t CodepointWidthImpl(D d, T input) { { constexpr T zero_gte_min = - *std::min_element(zero_gte, zero_gte + std::size(zero_gte)); + *std::min_element(zero_gte16, zero_gte16 + std::size(zero_gte16)); constexpr T zero_lte_max = - *std::max_element(zero_lte, zero_lte + std::size(zero_lte)); + *std::max_element(zero_lte16, zero_lte16 + std::size(zero_lte16)); if (input >= zero_gte_min && input <= zero_lte_max) { size_t i = 0; - for (; i + N <= std::size(zero_gte) && zero_gte[i] != 0; i += N) { - const hn::Vec lte_vec = hn::Load(d, zero_lte + i); - const hn::Vec gte_vec = hn::Load(d, zero_gte + i); + for (; i + N <= std::size(zero_gte16) && zero_gte16[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, zero_lte16 + i); + const hn::Vec gte_vec = hn::Load(d, zero_gte16 + i); const intptr_t idx = hn::FindFirstTrue( d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); if (idx >= 0) { @@ -267,14 +281,14 @@ int8_t CodepointWidthImpl(D d, T input) { { constexpr T eaw_gte_min = - *std::min_element(eaw_gte, eaw_gte + std::size(eaw_gte)); + *std::min_element(eaw_gte16, eaw_gte16 + std::size(eaw_gte16)); constexpr T eaw_lte_max = - *std::max_element(eaw_lte, eaw_lte + std::size(eaw_lte)); + *std::max_element(eaw_lte16, eaw_lte16 + std::size(eaw_lte16)); if (input >= eaw_gte_min && input <= eaw_lte_max) { size_t i = 0; - for (; i + N <= std::size(eaw_lte) && eaw_lte[i] != 0; i += N) { - const hn::Vec lte_vec = hn::Load(d, eaw_lte + i); - const hn::Vec gte_vec = hn::Load(d, eaw_gte + i); + for (; i + N <= std::size(eaw_lte16) && eaw_lte16[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, eaw_lte16 + i); + const hn::Vec gte_vec = hn::Load(d, eaw_gte16 + i); const intptr_t idx = hn::FindFirstTrue( d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); if (idx >= 0) { @@ -286,14 +300,14 @@ int8_t CodepointWidthImpl(D d, T input) { { constexpr T nsm_gte_min = - *std::min_element(nsm_gte, nsm_gte + std::size(nsm_gte)); + *std::min_element(nsm_gte16, nsm_gte16 + std::size(nsm_gte16)); constexpr T nsm_lte_max = - *std::max_element(nsm_lte, nsm_lte + std::size(nsm_lte)); + *std::max_element(nsm_lte16, nsm_lte16 + std::size(nsm_lte16)); if (input >= nsm_gte_min && input <= nsm_lte_max) { size_t i = 0; - for (; i + N <= std::size(nsm_lte) && nsm_lte[i] != 0; i += N) { - const hn::Vec lte_vec = hn::Load(d, nsm_lte + i); - const hn::Vec gte_vec = hn::Load(d, nsm_gte + i); + for (; i + N <= std::size(nsm_lte16) && nsm_lte16[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, nsm_lte16 + i); + const hn::Vec gte_vec = hn::Load(d, nsm_gte16 + i); const intptr_t idx = hn::FindFirstTrue( d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); if (idx >= 0) { @@ -306,9 +320,135 @@ int8_t CodepointWidthImpl(D d, T input) { return 1; } -int8_t CodepointWidth(T input) { - const hn::ScalableTag d; - return CodepointWidthImpl(d, input); +/// Handles codepoints larger than 16-bit. +template +int8_t CodepointWidth32(D d, T input) { + assert(input > 0xFFFF); + + const size_t N = hn::Lanes(d); + const hn::Vec input_vec = Set(d, input); + + { + // NOTE: 0x2E3B is technically width 3 but for our terminal we only + // handle up to width 2 as wide so we will treat it as width 2. + HWY_ALIGN constexpr T gte_keys[] = { + 0x1f1e6, 0x20000, 0x30000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + HWY_ALIGN constexpr T lte_keys[] = { + 0x1f1ff, 0x2FFFD, 0x3FFFD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + size_t i = 0; + for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, lte_keys + i); + const hn::Vec gte_vec = hn::Load(d, gte_keys + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 2; + } + } + assert(i >= 2); // We should have checked all the ranges. + } + + { + HWY_ALIGN constexpr T gte_keys[] = { + 0xE0000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + HWY_ALIGN constexpr T lte_keys[] = { + 0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + size_t i = 0; + for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, lte_keys + i); + const hn::Vec gte_vec = hn::Load(d, gte_keys + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } + } + } + + { + constexpr T zero_gte_min = + *std::min_element(zero_gte32, zero_gte32 + std::size(zero_gte32)); + constexpr T zero_lte_max = + *std::max_element(zero_lte32, zero_lte32 + std::size(zero_lte32)); + if (input >= zero_gte_min && input <= zero_lte_max) { + size_t i = 0; + for (; i + N <= std::size(zero_gte32) && zero_gte32[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, zero_lte32 + i); + const hn::Vec gte_vec = hn::Load(d, zero_gte32 + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } + } + } + } + + { + constexpr T eaw_gte_min = + *std::min_element(eaw_gte32, eaw_gte32 + std::size(eaw_gte32)); + constexpr T eaw_lte_max = + *std::max_element(eaw_lte32, eaw_lte32 + std::size(eaw_lte32)); + if (input >= eaw_gte_min && input <= eaw_lte_max) { + size_t i = 0; + for (; i + N <= std::size(eaw_lte32) && eaw_lte32[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, eaw_lte32 + i); + const hn::Vec gte_vec = hn::Load(d, eaw_gte32 + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 2; + } + } + } + } + + { + constexpr T nsm_gte_min = + *std::min_element(nsm_gte32, nsm_gte32 + std::size(nsm_gte32)); + constexpr T nsm_lte_max = + *std::max_element(nsm_lte32, nsm_lte32 + std::size(nsm_lte32)); + if (input >= nsm_gte_min && input <= nsm_lte_max) { + size_t i = 0; + for (; i + N <= std::size(nsm_lte32) && nsm_lte32[i] != 0; i += N) { + const hn::Vec lte_vec = hn::Load(d, nsm_lte32 + i); + const hn::Vec gte_vec = hn::Load(d, nsm_gte32 + i); + const intptr_t idx = hn::FindFirstTrue( + d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); + if (idx >= 0) { + return 0; + } + } + } + } + + return 1; +} + +/// Vectorized implementation of Unicode display width. Determining width +/// unfortunately requires many small range checks, so we test some fast paths +/// and otherwise try to do N (vector lane width) range checks at a time. +int8_t CodepointWidth(uint32_t input) { + // If the input is ASCII, then we return 1. We do NOT check for + // control characters because we assume that the input has already + // been checked for that case. + if (input <= 0xFF) { + return 1; + } + + // We handle 16-bit codepoints separately because they are more common + // and we can fit double the number of lanes in a vector. + if (input <= 0xFFFF) { + hn::ScalableTag d; + return CodepointWidth16(d, input); + } + + hn::ScalableTag d; + return CodepointWidth32(d, input); } } // namespace HWY_NAMESPACE From ae741172693fa773e7c0bc80f3e4e0cd72fcc22b Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 14:31:06 -0800 Subject: [PATCH 10/12] simd: use less vector ops for 16-bit --- src/simd/codepoint_width.cpp | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index bf8cae370..6e2ced205 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -221,12 +221,12 @@ int8_t CodepointWidth16(D d, uint16_t input) { // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN constexpr T gte_keys[] = { - 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0x1160, 0x2060, 0xFFF0, + 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T lte_keys[] = { - 0x2E3A, 0x4DBF, 0x9FFF, 0xFAFF, 0x2E3B, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x4DBF, 0x9FFF, 0xFAFF, 0x2E3B, 0x11FF, 0x206F, 0xFFF8, + 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { @@ -234,30 +234,16 @@ int8_t CodepointWidth16(D d, uint16_t input) { const hn::Vec gte_vec = hn::Load(d, gte_keys + i); const intptr_t idx = hn::FindFirstTrue( d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); - if (idx >= 0) { + + // We organize the data above to split 0 and 2-width codepoints since + // we can probably do all the comparisons in one go. + if (idx >= 5) { + return 0; + } else if (idx >= 0) { return 2; } } - assert(i >= 5); // We should have checked all the ranges. - } - - { - HWY_ALIGN constexpr T gte_keys[] = { - 0x1160, 0x2060, 0xFFF0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; - HWY_ALIGN constexpr T lte_keys[] = { - 0x11FF, 0x206F, 0xFFF8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; - size_t i = 0; - for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { - const hn::Vec lte_vec = hn::Load(d, lte_keys + i); - const hn::Vec gte_vec = hn::Load(d, gte_keys + i); - const intptr_t idx = hn::FindFirstTrue( - d, hn::And(hn::Le(input_vec, lte_vec), hn::Ge(input_vec, gte_vec))); - if (idx >= 0) { - return 0; - } - } + assert(i >= 7); // We should have checked all the ranges. } { From e5c7d4e3ee57d1dea4145f401b2f94aee0c4c2a9 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 14:43:08 -0800 Subject: [PATCH 11/12] simd: increase padding for avx --- src/simd/codepoint_width.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 6e2ced205..f55e2b198 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -55,6 +55,8 @@ HWY_ALIGN constexpr uint16_t eaw_gte16[] = { 0xfe10, 0xfe30, 0xfe54, 0xfe68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr uint16_t eaw_lte16[] = { @@ -70,6 +72,8 @@ HWY_ALIGN constexpr uint16_t eaw_lte16[] = { 0xfe19, 0xfe52, 0xfe66, 0xfe6b, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /// These are the ranges of codepoints that are DEFINITELY width 0. @@ -88,6 +92,8 @@ HWY_ALIGN constexpr uint16_t zero_gte16[] = { 0xfff9, 0x488, 0x1abe, 0x20dd, 0x20e2, 0xa670, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr uint16_t zero_lte16[] = { @@ -95,6 +101,8 @@ HWY_ALIGN constexpr uint16_t zero_lte16[] = { 0xfffb, 0x489, 0x1abe, 0x20e0, 0x20e4, 0xa672, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /// Non-spacing marks @@ -169,6 +177,8 @@ HWY_ALIGN constexpr uint16_t nsm_gte16[] = { 0xabe8, 0xabed, 0xfb1e, 0xfe00, 0xfe20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr uint16_t nsm_lte16[] = { @@ -198,6 +208,8 @@ HWY_ALIGN constexpr uint16_t nsm_lte16[] = { 0xabe8, 0xabed, 0xfb1e, 0xfe0f, 0xfe2f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, }; // All our tables must be identically sized @@ -221,12 +233,15 @@ int8_t CodepointWidth16(D d, uint16_t input) { // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN constexpr T gte_keys[] = { - 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0x1160, 0x2060, 0xFFF0, - 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0x1160, 0x2060, 0xFFF0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T lte_keys[] = { - 0x2E3A, 0x4DBF, 0x9FFF, 0xFAFF, 0x2E3B, 0x11FF, 0x206F, 0xFFF8, - 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x4DBF, 0x9FFF, 0xFAFF, 0x2E3B, 0x11FF, 0x206F, 0xFFF8, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { From 4e2502c11eb4b471bbfefa0fc6137b1cc0d44c2e Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 7 Feb 2024 15:23:43 -0800 Subject: [PATCH 12/12] simd/codepoint-width: assertions for avx512 padding --- src/simd/codepoint_width.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index f55e2b198..4eb7da66d 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -233,8 +233,9 @@ int8_t CodepointWidth16(D d, uint16_t input) { // NOTE: 0x2E3B is technically width 3 but for our terminal we only // handle up to width 2 as wide so we will treat it as width 2. HWY_ALIGN constexpr T gte_keys[] = { - 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0x1160, 0x2060, 0xFFF0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x2E3A, 0x3400, 0x4E00, 0xF900, 0x2E3B, 0x1160, 0x2060, 0xFFF0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; HWY_ALIGN constexpr T lte_keys[] = { @@ -243,6 +244,8 @@ int8_t CodepointWidth16(D d, uint16_t input) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + static_assert(std::size(gte_keys) == std::size(lte_keys)); + static_assert(std::size(gte_keys) >= 32); size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { const hn::Vec lte_vec = hn::Load(d, lte_keys + i); @@ -338,6 +341,8 @@ int8_t CodepointWidth32(D d, T input) { HWY_ALIGN constexpr T lte_keys[] = { 0x1f1ff, 0x2FFFD, 0x3FFFD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + static_assert(std::size(gte_keys) == std::size(lte_keys)); + static_assert(std::size(gte_keys) >= 16); size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { const hn::Vec lte_vec = hn::Load(d, lte_keys + i); @@ -358,6 +363,8 @@ int8_t CodepointWidth32(D d, T input) { HWY_ALIGN constexpr T lte_keys[] = { 0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; + static_assert(std::size(gte_keys) == std::size(lte_keys)); + static_assert(std::size(gte_keys) >= 16); size_t i = 0; for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) { const hn::Vec lte_vec = hn::Load(d, lte_keys + i);