diff --git a/src/simd/utf8.zig b/src/simd/utf8.zig index 87ef4601b..ac8f7b53b 100644 --- a/src/simd/utf8.zig +++ b/src/simd/utf8.zig @@ -22,8 +22,16 @@ pub fn utf8ValidateNeon(input: []const u8) bool { } pub const Neon = struct { + /// The previous input in a vector. This is required because to check + /// the validity of a UTF-8 byte, we need to sometimes know previous + /// state if it the first byte is a continuation byte. prev_input: @Vector(16, u8), + + /// The current error status. Once an error is set, it is never unset. prev_error: @Vector(16, u8), + + /// The current incomplete status. This is non-zero if the last chunk + /// requires more bytes to be valid UTF-8. prev_incomplete: @Vector(16, u8), pub fn init() Neon { @@ -34,13 +42,21 @@ pub const Neon = struct { }; } + /// Validate a chunk of UTF-8 data. This function is designed to be + /// called multiple times with successive chunks of data. When the + /// data is complete, you must call `finalize` to check for any + /// remaining errors. pub fn validate(self: *Neon, input: []const u8) void { + // Break up our input into 16 byte chunks, and process each chunk + // separately. The size of a Neon register is 16 bytes. var i: usize = 0; while (i + 16 <= input.len) : (i += 16) { const input_vec = aarch64.vld1q_u8(input[i..]); self.next(input_vec); } + // If we have any data remaining, we pad it with zeroes since that + // is valid UTF-8, and then treat it like a normal block. if (i < input.len) { const remaining = input.len - i; assert(remaining < 16); @@ -54,11 +70,24 @@ pub const Neon = struct { } } + /// Call to finalize the validation (EOF is reached). + pub fn finalize(self: *Neon) void { + // Its possible for our last chunk to end expecting more + // continuation bytes. + self.prev_error = aarch64.vorrq_u8(self.prev_error, self.prev_incomplete); + } + + /// Returns true if there are any errors. pub fn hasErrors(self: *Neon) bool { return aarch64.vmaxvq_u8(self.prev_error) != 0; } - pub fn next(self: *Neon, input_vec: @Vector(16, u8)) void { + /// Process a single vector of input. + /// + /// This function generally isn't called directly, but it is very useful + /// if you want to compose this validation with other SIMD operations + /// and already have your data in a SIMD register. + pub fn process(self: *Neon, input_vec: @Vector(16, u8)) void { // If all we have is ASCII, then we can skip the rest. if (aarch64.vmaxvq_u8(input_vec) <= 0b10000000) { self.prev_error = aarch64.vorrq_u8(self.prev_error, self.prev_incomplete); @@ -95,6 +124,7 @@ pub const Neon = struct { break :incomplete aarch64.vld1q_u8(&bytes); }); + // Debug all the vector registers: // std.log.warn("input={}", .{input_vec}); // std.log.warn("prev_input={}", .{self.prev_input}); // std.log.warn("prev1={}", .{prev1}); @@ -222,6 +252,7 @@ const OVERLONG_4: u8 = 1 << 6; const CARRY: u8 = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . /// Generic test function so we can test against multiple implementations. +/// This is initially copied from the Zig stdlib but may be expanded. fn testValidate(func: *const Validate) !void { const testing = std.testing; try testing.expect(func("hello friends!!!"));