/// Generates UTF-8.
///
/// This doesn't yet generate multi-codepoint graphemes, but it
/// has the ability to generate a custom distribution of UTF-8
/// encoding lengths (1, 2, 3, or 4 bytes).
const Utf8 = @This();

const std = @import("std");
const assert = std.debug.assert;
const Generator = @import("Generator.zig");

/// Possible UTF-8 encoding lengths. The integer value of each tag is
/// the number of bytes in the encoded sequence.
pub const Utf8Len = enum(u3) {
    one = 1,
    two = 2,
    three = 3,
    four = 4,
};

/// Random number generator.
rand: std.Random,

/// The minimum and maximum length of the generated bytes. The maximum
/// length will be capped to the length of the buffer passed in if the
/// buffer length is smaller.
min_len: usize = 1,
max_len: usize = std.math.maxInt(usize),

/// Probability of a specific UTF-8 encoding length being generated.
/// The probabilities are weighted relative to each other, so they
/// can sum greater than 1.0. A length of weight 1.0 and a length
/// of weight 2.0 will have a 2:1 chance of the latter being
/// selected.
///
/// If a UTF-8 encoding of a chosen length can't fit into the remaining
/// buffer, a smaller length will be chosen. For small buffers this may
/// skew the distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),

/// Wraps this value in the `Generator` interface, using `next` as the
/// callback. The returned generator is built from `self`, so `self`
/// presumably has to outlive it (see `Generator.init`).
pub fn generator(self: *Utf8) Generator {
    return .init(self, next);
}

/// Fills a prefix of `buf` with randomly generated, valid UTF-8 and
/// returns that prefix.
///
/// The number of bytes produced is drawn from `min_len..max_len`
/// (inclusive) and then capped to `buf.len`. The returned slice is
/// always filled completely: codepoints are appended until no bytes
/// remain, shrinking the encoding length when the tail is too short
/// for the length that was picked.
///
/// Nothing in this function returns an error; the error union is part
/// of the signature handed to `Generator.init`.
pub fn next(self: *Utf8, buf: []u8) Generator.Error![]const u8 {
    // The UTF-16 surrogate block sits inside the three-byte range and
    // cannot be encoded as UTF-8, so it has to be skipped.
    const surrogate_first: u21 = 0xD800;
    const surrogate_count: u21 = 0x800;

    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );
    const result = buf[0..len];

    // Number of bytes at the end of `result` that are still unwritten.
    var rem: usize = len;
    while (rem > 0) {
        // Pick a utf8 byte count to generate.
        const utf8_len: Utf8Len = pick: {
            const Indexer = @TypeOf(self.p_length).Indexer;
            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
            var candidate = Indexer.keyForIndex(idx);

            // If the chosen length can't fit into the remaining buffer,
            // choose a smaller length. `rem >= 1` here, so this never
            // steps below `.one`.
            while (@intFromEnum(candidate) > rem) {
                candidate = @enumFromInt(@intFromEnum(candidate) - 1);
            }

            break :pick candidate;
        };

        // Generate a codepoint that encodes to exactly this length.
        const cp: u21 = switch (utf8_len) {
            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => three: {
                // Sample from a range shortened by the size of the
                // surrogate block, then shift everything at or above
                // the block past it. This keeps the draw uniform over
                // the encodable three-byte codepoints and means a
                // three-byte pick is never thrown away, so `p_length`
                // is honored.
                const v = self.rand.intRangeAtMostBiased(
                    u21,
                    0x800,
                    0xFFFF - surrogate_count,
                );
                break :three if (v >= surrogate_first) v + surrogate_count else v;
            },
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
        };

        assert(std.unicode.utf8CodepointSequenceLength(
            cp,
        ) catch unreachable == @intFromEnum(utf8_len));

        rem -= std.unicode.utf8Encode(
            cp,
            result[result.len - rem ..],
        ) catch |err| switch (err) {
            // Both are impossible: every range above stays at or below
            // 0x10FFFF and the three-byte range skips the surrogates.
            // If either triggers, it is a bug in the generation above.
            error.CodepointTooLarge,
            error.Utf8CannotEncodeSurrogateHalf,
            => unreachable,
        };
    }

    return result;
}

test "utf8" {
    const testing = std.testing;
    var prng = std.Random.DefaultPrng.init(0);
    var buf: [256]u8 = undefined;
    var v: Utf8 = .{ .rand = prng.random() };
    const gen = v.generator();
    const result = try gen.next(&buf);
    try testing.expect(result.len > 0);
    try testing.expect(std.unicode.utf8ValidateSlice(result));
}