ghostty/src/synthetic/Utf8.zig
Mitchell Hashimoto f1c42c9f8c synthetic package
This introduces a new package `src/synthetic` for generating synthetic
data, currently primarily for benchmarking but other use cases can
emerge.

The synthetic package exports a runtime-dispatched type `Generator` that
can generate data of various types. To start, we have a bytes, utf8,
and OSC generator. The goal of each generator is to expose knobs to tune the
probabilities of various outcomes. For example, the UTF-8 generator has
a knob to tune the probability of generating 1, 2, 3, or 4-byte UTF-8
sequences.

Ultimately, the goal is to be able to collect probability data
empirically that we can then use for benchmarks so we can optimize
various parts of the codebase on real-world data shape distributions.
2025-05-21 10:20:09 -07:00

104 lines
3.5 KiB
Zig

/// Generates UTF-8.
///
/// This doesn't yet generate multi-codepoint graphemes, but it
/// has the ability to generate a custom distribution of UTF-8
/// encoding lengths (1, 2, 3, or 4 bytes).
const Utf8 = @This();
const std = @import("std");
const assert = std.debug.assert;
const Generator = @import("Generator.zig");
/// The encoded byte lengths a single UTF-8 sequence can have.
///
/// Each tag's integer value is the number of bytes in the sequence, so
/// `@intFromEnum` can be used directly as a byte count.
pub const Utf8Len = enum(u3) {
    // Only the first tag is numbered; the remaining tags auto-increment
    // to 2, 3 and 4.
    one = 1,
    two,
    three,
    four,
};
/// Random number generator. `next` draws the output length, the
/// encoding length of each code point, and the code points themselves
/// from it.
rand: std.Random,
/// Inclusive lower bound of the randomly drawn output length, in bytes.
/// The drawn length is afterwards capped to the length of the buffer
/// passed to `next`, so the output is shorter than this when the buffer
/// is smaller.
min_len: usize = 1,
/// Inclusive upper bound of the randomly drawn output length, in bytes.
/// The drawn length is capped to the length of the buffer passed to
/// `next` if the buffer length is smaller.
max_len: usize = std.math.maxInt(usize),
/// Probability of a specific UTF-8 encoding length being generated,
/// indexed by `Utf8Len`. The probabilities are weighted relative to each
/// other, so they do not need to sum to 1.0 and can sum greater than
/// 1.0. A length of weight 1.0 and a length of weight 2.0 will have a
/// 2:1 chance of the latter being selected. By default every length has
/// the same weight.
///
/// If a UTF-8 encoding of a chosen length can't fit into the remaining
/// buffer, a smaller length will be chosen. For small buffers this may
/// skew the distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),
/// Returns a `Generator` built from this instance and its `next`
/// function.
///
/// NOTE(review): the generator is constructed from the `self` pointer,
/// so presumably `self` must outlive the returned value; confirm
/// against `Generator.init`.
pub fn generator(self: *Utf8) Generator {
    return Generator.init(self, next);
}
/// Fills a prefix of `buf` with randomly generated, valid UTF-8 and
/// returns that prefix.
///
/// The output length is drawn from `[min_len, max_len]` and then capped
/// to `buf.len`, so an empty buffer yields an empty slice. The encoded
/// length of each code point is picked using the `p_length` weights; if
/// the picked length does not fit into the bytes that remain, the next
/// smaller length is used instead so the output is always filled
/// exactly.
///
/// Every length pick emits exactly one code point. Three-byte code
/// points are sampled only from the non-surrogate part of
/// U+0800..U+FFFF, so a pick is never thrown away and re-rolled, which
/// would otherwise under-represent three-byte sequences relative to
/// their configured weight.
pub fn next(self: *Utf8, buf: []u8) Generator.Error![]const u8 {
    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );
    const result = buf[0..len];

    // Number of bytes at the tail of `result` that are still unwritten.
    var rem: usize = len;
    while (rem > 0) {
        // Pick a UTF-8 byte count to generate.
        const utf8_len: Utf8Len = pick: {
            const Indexer = @TypeOf(self.p_length).Indexer;
            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
            var candidate = Indexer.keyForIndex(idx);
            // `rem >= 1` here, so a candidate larger than `rem` is at
            // least 2 and stepping down never goes below `.one`.
            while (@intFromEnum(candidate) > rem) {
                // If the chosen length can't fit into the remaining
                // buffer, choose a smaller length.
                candidate = @enumFromInt(@intFromEnum(candidate) - 1);
            }
            break :pick candidate;
        };

        // Generate a code point that encodes to exactly this length.
        const cp: u21 = switch (utf8_len) {
            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => three: {
                // U+D800..U+DFFF are surrogates and cannot be encoded.
                // Sample from a range shortened by those 0x800 values
                // and shift everything at or above U+D800 past the gap,
                // giving a uniform pick over U+0800..U+D7FF and
                // U+E000..U+FFFF.
                const raw = self.rand.intRangeAtMostBiased(
                    u21,
                    0x800,
                    0xFFFF - 0x800,
                );
                break :three if (raw >= 0xD800) raw + 0x800 else raw;
            },
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
        };
        const seq_len = std.unicode.utf8CodepointSequenceLength(
            cp,
        ) catch unreachable;
        assert(seq_len == @intFromEnum(utf8_len));

        rem -= std.unicode.utf8Encode(
            cp,
            result[result.len - rem ..],
        ) catch |err| switch (err) {
            // Impossible because our generation above is hardcoded to
            // produce a valid range. If not, a bug.
            error.CodepointTooLarge => unreachable,
            // Impossible because the three-byte range above skips the
            // surrogate block. If not, a bug.
            error.Utf8CannotEncodeSurrogateHalf => unreachable,
        };
    }

    return result;
}
// Smoke test: with a fixed seed and default settings, the generator
// produces non-empty output that validates as UTF-8.
test "utf8" {
    var prng: std.Random.DefaultPrng = .init(0);
    var storage: [256]u8 = undefined;
    var utf8_gen: Utf8 = .{ .rand = prng.random() };

    const dispatcher = utf8_gen.generator();
    const output = try dispatcher.next(&storage);

    try std.testing.expect(output.len > 0);
    try std.testing.expect(std.unicode.utf8ValidateSlice(output));
}