//! This benchmark tests the throughput of codepoint width calculation.
//! This is a common operation in terminal character printing and the
//! motivating factor to write this benchmark was discovering that our
//! codepoint width function was 30% of the runtime of every character
//! print.
//!
//! This will consume all of the available stdin, so you should run it
//! with `head` in a pipe to limit the input size. For example, to test
//! ASCII input:
//!
//!   bench-stream --mode=gen-ascii | head -c 50M | bench-codepoint-width --mode=ziglyph
//!

const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ziglyph = @import("ziglyph");
const cli = @import("../cli.zig");
const simd = @import("../simd/main.zig");
const table = @import("../unicode/main.zig").table;
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");

/// Command-line options accepted by this benchmark.
const Args = struct {
    /// Which benchmark loop to run; `.noop` unless overridden.
    mode: Mode = .noop,

    /// Length in bytes of the buffer used for reads. Rarely worth changing;
    /// it is a runtime value mainly so the compiler cannot optimize around
    /// a known size.
    @"buffer-size": usize = 4096,

    /// Set by the CLI parser so that `deinit` can release it.
    _arena: ?ArenaAllocator = null,

    /// Releases the arena, if one was set, and leaves `self` undefined.
    pub fn deinit(self: *Args) void {
        // Runs on both exit paths below, after the arena (if any) is freed.
        defer self.* = undefined;

        const arena = self._arena orelse return;
        arena.deinit();
    }
};

/// Selects which benchmark loop `main` runs.
const Mode = enum {
    /// The baseline mode reads the data from the fd into a buffer and runs
    /// the UTF-8 decoder over it without computing any width. This is used
    /// to show the minimal overhead shared by the other modes and
    /// establishes a baseline for them.
    noop,

    /// libc wcwidth
    wcwidth,

    /// Use ziglyph library to calculate the display width of each codepoint.
    ziglyph,

    /// Our SIMD implementation.
    simd,

    /// Test our lookup table implementation.
    table,
};

/// Standard library options for this program: log at debug level.
pub const std_options = std.Options{ .log_level = .debug };

/// Entry point: parses the CLI arguments, allocates the read buffer, and
/// runs the benchmark selected by `--mode` over all of stdin.
///
/// Returns any error produced by argument parsing, buffer allocation, or
/// reading from stdin.
pub fn main() !void {
    // We want to use the c allocator because it is much faster than GPA.
    const alloc = std.heap.c_allocator;

    // Parse our args
    var args: Args = .{};
    defer args.deinit();
    {
        var iter = try std.process.argsWithAllocator(alloc);
        defer iter.deinit();
        try cli.args.parse(Args, alloc, &args, &iter);
    }

    const reader = std.io.getStdIn().reader();

    // The buffer size comes from the args at runtime. Pair the allocation
    // with its release so it is freed on every exit path, including errors
    // propagated out of the benchmark loops.
    const buf = try alloc.alloc(u8, args.@"buffer-size");
    defer alloc.free(buf);

    // Dispatch to the benchmark loop for the selected mode.
    switch (args.mode) {
        .noop => try benchNoop(reader, buf),
        .wcwidth => try benchWcwidth(reader, buf),
        .ziglyph => try benchZiglyph(reader, buf),
        .simd => try benchSimd(reader, buf),
        .table => try benchTable(reader, buf),
    }
}

/// Baseline loop: drains `reader` through `buf` and feeds every byte to a
/// UTF-8 decoder, discarding whatever the decoder returns. No width is
/// computed. Returns once a read yields zero bytes; read errors propagate.
noinline fn benchNoop(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;

        // Plain scalar loop: one decoder step per input byte.
        const chunk = buf[0..len];
        for (chunk) |byte| {
            _ = decoder.next(byte);
        }
    }
}

/// Binding to `wcwidth` from the C library (`extern "c"`).
/// NOTE(review): the C prototype takes a `wchar_t`; this declaration uses
/// `u32` — confirm the ABI matches on every target platform.
extern "c" fn wcwidth(c: u32) c_int;

/// Benchmarks libc `wcwidth`: drains `reader` through `buf`, decodes the
/// bytes as UTF-8 one at a time, and calls `wcwidth` on every codepoint the
/// decoder produces. Returns once a read yields zero bytes; read errors
/// propagate.
noinline fn benchWcwidth(
    reader: anytype,
    buf: []u8,
) !void {
    var d: UTF8Decoder = .{};
    while (true) {
        const n = try reader.read(buf);
        if (n == 0) break;

        // Calling d.next directly in a for loop applies a naive
        // scalar approach.
        for (buf[0..n]) |c| {
            const cp_, const consumed = d.next(c);
            assert(consumed);
            if (cp_) |cp| {
                const width = wcwidth(cp);

                // Write the width to the buffer to avoid it being compiled
                // away. wcwidth reports -1 for non-printable codepoints
                // (e.g. control characters), which cannot be represented
                // in a u8, so clamp negative results to zero before the
                // cast instead of invoking illegal behavior.
                buf[0] = @intCast(@max(0, width));
            }
        }
    }
}

/// Benchmarks the lookup-table implementation: drains `reader` through
/// `buf`, decodes the bytes as UTF-8 one at a time, and looks up the width
/// of every codepoint the decoder produces. Returns once a read yields zero
/// bytes; read errors propagate.
noinline fn benchTable(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;

        // Plain scalar loop: one decoder step per input byte.
        for (buf[0..len]) |byte| {
            const maybe_cp, const was_consumed = decoder.next(byte);
            assert(was_consumed);

            // Nothing to measure until the decoder yields a codepoint.
            const cp = maybe_cp orelse continue;

            // Same shortcut as terminal.zig: codepoints up to 0xFF are
            // treated as width 1 without consulting the table.
            const width = if (cp <= 0xFF) 1 else table.get(@intCast(cp)).width;

            // Store the result so the lookup is not optimized out.
            buf[0] = @intCast(width);
        }
    }
}

/// Benchmarks ziglyph's display width: drains `reader` through `buf`,
/// decodes the bytes as UTF-8 one at a time, and calls
/// `ziglyph.display_width.codePointWidth` on every codepoint the decoder
/// produces. Returns once a read yields zero bytes; read errors propagate.
noinline fn benchZiglyph(
    reader: anytype,
    buf: []u8,
) !void {
    var d: UTF8Decoder = .{};
    while (true) {
        const n = try reader.read(buf);
        if (n == 0) break;

        // Calling d.next directly in a for loop applies a naive
        // scalar approach.
        for (buf[0..n]) |c| {
            const cp_, const consumed = d.next(c);
            assert(consumed);
            if (cp_) |cp| {
                const width = ziglyph.display_width.codePointWidth(cp, .half);

                // Write the width to the buffer to avoid it being compiled
                // away. ziglyph returns a signed width that can be negative
                // for some control characters (NOTE(review): confirm
                // against the pinned ziglyph version); a negative value
                // cannot be cast to u8, so clamp to zero first.
                buf[0] = @intCast(@max(0, width));
            }
        }
    }
}

/// Benchmarks the SIMD implementation: drains `reader` through `buf`,
/// decodes the bytes as UTF-8 one at a time, and calls
/// `simd.codepointWidth` on every codepoint the decoder produces. Returns
/// once a read yields zero bytes; read errors propagate.
noinline fn benchSimd(
    reader: anytype,
    buf: []u8,
) !void {
    var decoder: UTF8Decoder = .{};
    while (true) {
        const len = try reader.read(buf);
        if (len == 0) return;

        // Plain scalar loop: one decoder step per input byte.
        for (buf[0..len]) |byte| {
            const maybe_cp, const was_consumed = decoder.next(byte);
            assert(was_consumed);

            // Nothing to measure until the decoder yields a codepoint.
            const cp = maybe_cp orelse continue;
            const width = simd.codepointWidth(cp);

            // Store the result so the call is not optimized out.
            // NOTE(review): if codepointWidth can return a negative value,
            // this cast is illegal behavior — confirm its return range.
            buf[0] = @intCast(width);
        }
    }
}