mirror of
https://github.com/ghostty-org/ghostty.git
synced 2025-07-19 18:26:13 +03:00

This introduces a new package `src/synthetic` for generating synthetic data, currently primarily for benchmarking but other use cases can emerge. The synthetic package exports a runtime-dispatched type `Generator` that can generate data of various types. To start, we have a bytes, utf8, and OSC generator. The goal of each generator is to expose knobs to tune the probabilities of various outcomes. For example, the UTF-8 generator has a knob to tune the probability of generating 1, 2, 3, or 4-byte UTF-8 sequences. Ultimately, the goal is to be able to collect probability data empirically that we can then use for benchmarks so we can optimize various parts of the codebase on real-world data shape distributions.
104 lines
3.5 KiB
Zig
104 lines
3.5 KiB
Zig
/// Generates UTF-8.
|
|
///
|
|
/// This doesn't yet generate multi-codepoint graphemes, but it
|
|
/// has the ability to generate a custom distribution of UTF-8
|
|
/// encoding lengths (1, 2, 3, or 4 bytes).
|
|
const Utf8 = @This();
|
|
|
|
const std = @import("std");
|
|
const assert = std.debug.assert;
|
|
const Generator = @import("Generator.zig");
|
|
|
|
/// The number of bytes one codepoint can occupy once encoded as UTF-8.
///
/// The integer value of every tag is that byte count, so
/// `@intFromEnum` can be used directly as a length.
pub const Utf8Len = enum(u3) {
    /// Single-byte encoding (U+0000 through U+007F).
    one = 1,

    /// Two-byte encoding (U+0080 through U+07FF).
    two = 2,

    /// Three-byte encoding (U+0800 through U+FFFF).
    three = 3,

    /// Four-byte encoding (U+10000 through U+10FFFF).
    four = 4,
};
|
|
|
|
/// Source of randomness used for every length and codepoint choice.
rand: std.Random,

/// Lower bound (inclusive) on the number of bytes produced by one call
/// to `next`. The drawn length is capped to the length of the buffer
/// passed in, so a buffer shorter than this yields fewer bytes.
min_len: usize = 1,

/// Upper bound (inclusive) on the number of bytes produced by one call
/// to `next`. It is capped to the length of the buffer passed in when
/// that buffer is smaller.
max_len: usize = std.math.maxInt(usize),

/// Relative weight of each UTF-8 encoding length.
///
/// Weights are only meaningful in relation to one another, so they do
/// not have to sum to 1.0: a length weighted 2.0 is selected twice as
/// often as a length weighted 1.0. By default all lengths are weighted
/// equally.
///
/// When an encoding of the selected length does not fit into what is
/// left of the buffer, a shorter length is used instead. With small
/// buffers this can skew the resulting distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = std.enums.EnumArray(Utf8Len, f64).initFill(1.0),
|
|
|
|
/// Returns a `Generator` built from this instance and its `next`
/// function, so the UTF-8 generator can be used wherever a `Generator`
/// is expected.
///
/// NOTE(review): the `Generator` is handed the `self` pointer, so `self`
/// presumably has to outlive the returned value -- confirm against
/// `Generator.init`.
pub fn generator(self: *Utf8) Generator {
    return Generator.init(self, next);
}
|
|
|
|
/// Fills a prefix of `buf` with randomly generated, valid UTF-8 and
/// returns that prefix. The returned slice aliases `buf`.
///
/// The number of bytes produced is drawn from `min_len` through
/// `max_len` and then capped to `buf.len`; an empty `buf` therefore
/// yields an empty slice.
///
/// The encoded length of every codepoint is drawn according to the
/// weights in `p_length`. If the drawn length does not fit into the
/// bytes that remain it is reduced until it does, so the output always
/// ends on a codepoint boundary.
pub fn next(self: *Utf8, buf: []u8) Generator.Error![]const u8 {
    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );

    const result = buf[0..len];
    var rem: usize = len;
    while (rem > 0) {
        // Pick a utf8 byte count to generate.
        const utf8_len: Utf8Len = pick: {
            const Indexer = @TypeOf(self.p_length).Indexer;
            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
            var chosen = Indexer.keyForIndex(idx);
            assert(rem > 0);
            while (@intFromEnum(chosen) > rem) {
                // If the chosen length can't fit into the remaining
                // buffer, choose a smaller length. Because rem >= 1 this
                // never goes below `.one`.
                chosen = @enumFromInt(@intFromEnum(chosen) - 1);
            }
            break :pick chosen;
        };

        // Generate a codepoint that encodes to exactly this length.
        const cp: u21 = switch (utf8_len) {
            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => three: {
                // The three-byte range contains the UTF-16 surrogate
                // halves (U+D800 through U+DFFF), which cannot be encoded
                // as UTF-8. Resample only the codepoint when one is
                // drawn. Restarting the whole iteration would also
                // re-roll the length and thereby under-represent
                // three-byte sequences relative to `p_length`.
                while (true) {
                    const candidate = self.rand.intRangeAtMostBiased(
                        u21,
                        0x800,
                        0xFFFF,
                    );
                    switch (candidate) {
                        0xD800...0xDFFF => {},
                        else => break :three candidate,
                    }
                }
            },
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
        };

        const encoded_len = std.unicode.utf8CodepointSequenceLength(
            cp,
        ) catch unreachable;
        assert(encoded_len == @intFromEnum(utf8_len));

        const written = std.unicode.utf8Encode(
            cp,
            result[result.len - rem ..],
        ) catch |err| switch (err) {
            // Impossible because our generation above is hardcoded to
            // produce a valid range. If not, a bug.
            error.CodepointTooLarge => unreachable,

            // Impossible because surrogate halves are filtered out when
            // the three-byte codepoint is drawn above.
            error.Utf8CannotEncodeSurrogateHalf => unreachable,
        };
        rem -= written;
    }

    return result;
}
|
|
|
|
// Smoke test: with default settings and a fixed seed, one call through
// the `Generator` interface must produce a non-empty, valid UTF-8 slice.
test "utf8" {
    var prng: std.Random.DefaultPrng = .init(0);
    var storage: [256]u8 = undefined;

    var synth: Utf8 = .{ .rand = prng.random() };
    const gen = synth.generator();

    const produced = try gen.next(&storage);
    try std.testing.expect(produced.len > 0);
    try std.testing.expect(std.unicode.utf8ValidateSlice(produced));
}
|