Merge pull request #531 from mitchellh/codepoint-map

Specify font for specific codepoint ranges
2025-07-25 05:06:24 +03:00 · 2023-09-24 21:06:51 -07:00
parent f25a3ce87d 7a9a36ecb0
commit 601ca5e7de
6 changed files with 440 additions and 8 deletions
--- a/src/Surface.zig
+++ b/src/Surface.zig
@ -224,6 +224,11 @@ pub fn init(
        var group = try font.Group.init(alloc, font_lib, font_size);
        errdefer group.deinit();

+        // If we have codepoint mappings, set those.
+        if (config.@"font-codepoint-map".map.list.len > 0) {
+            group.codepoint_map = config.@"font-codepoint-map".map;
+        }
+
        // Search for fonts
        if (font.Discover != void) discover: {
            const disco = try app.fontDiscover() orelse {
--- a/src/config/Config.zig
+++ b/src/config/Config.zig
@ -91,6 +91,24 @@ const c = @cImport({
@"font-variation-italic": RepeatableFontVariation = .{},
@"font-variation-bold-italic": RepeatableFontVariation = .{},

+/// Force one or a range of Unicode codepoints to map to a specific named
+/// font. This is useful if you want to support special symbols or if you
+/// want to use specific glyphs that render better for your specific font.
+///
+/// The syntax is "codepoint=fontname" where "codepoint" is either a
+/// single codepoint or a range. Codepoints must be specified as full
+/// Unicode hex values, such as "U+ABCD". Codepoints ranges are specified
+/// as "U+ABCD-U+DEFG". You can specify multiple ranges for the same font
+/// separated by commas, such as "U+ABCD-U+DEFG,U+1234-U+5678=fontname".
+/// The font name is the same value as you would use for "font-family".
+///
+/// This configuration can be repeated multiple times to specify multiple
+/// codepoint mappings.
+///
+/// Changing this configuration at runtime will only affect new terminals,
+/// i.e. new windows, tabs, etc.
+@"font-codepoint-map": RepeatableCodepointMap = .{},
+
 /// Draw fonts with a thicker stroke, if supported. This is only supported
 /// currently on macOS.
@"font-thicken": bool = false,
@ -1507,6 +1525,186 @@ pub const Keybinds = struct {
    }
 };

+/// See "font-codepoint-map" for documentation.
+pub const RepeatableCodepointMap = struct {
+    const Self = @This();
+
+    map: fontpkg.CodepointMap = .{},
+
+    /// Parse one "codepoints=fontname" value and append an entry to the map
+    /// for every codepoint range listed before the first '='.
+    ///
+    /// Returns error.ValueRequired when no value is given and
+    /// error.InvalidValue when there is no '=' or a range fails to parse.
+    /// The font name is duplicated once with `alloc` and shared by every
+    /// entry added from this call.
+    pub fn parseCLI(self: *Self, alloc: Allocator, input_: ?[]const u8) !void {
+        const raw = input_ orelse return error.ValueRequired;
+
+        // Everything before the first '=' is the range list, everything
+        // after it is the font family. Both sides tolerate padding.
+        const split_at = std.mem.indexOfScalar(u8, raw, '=') orelse
+            return error.InvalidValue;
+        const ranges_text = std.mem.trim(u8, raw[0..split_at], " \t");
+        const family_text = std.mem.trim(u8, raw[split_at + 1 ..], " \t");
+        const family = try alloc.dupeZ(u8, family_text);
+
+        var parser: UnicodeRangeParser = .{ .input = ranges_text };
+        while (true) {
+            const range = (try parser.next()) orelse break;
+            try self.map.add(alloc, .{
+                .range = range,
+                .descriptor = .{
+                    .family = family,
+                    // Any font may be mapped, not only monospace ones.
+                    .monospace = false,
+                },
+            });
+        }
+    }
+
+    /// Copy of the struct. Required by Config.
+    ///
+    /// The entry list is duplicated with `alloc`; the entries themselves are
+    /// copied by value, so any memory they point at is shared with `self`.
+    pub fn clone(self: *const Self, alloc: Allocator) !Self {
+        const list_copy = try self.map.list.clone(alloc);
+        return .{ .map = .{ .list = list_copy } };
+    }
+
+    /// Compare if two of our values are equal. Required by Config.
+    ///
+    /// Two maps are equal when they hold the same number of entries and
+    /// every pair of entries at the same position compares equal with
+    /// `std.meta.eql`.
+    pub fn equal(self: Self, other: Self) bool {
+        const lhs = self.map.list.slice();
+        const rhs = other.map.list.slice();
+        if (lhs.len != rhs.len) return false;
+
+        var idx: usize = 0;
+        while (idx < lhs.len) : (idx += 1) {
+            if (!std.meta.eql(lhs.get(idx), rhs.get(idx))) return false;
+        }
+
+        return true;
+    }
+
+    /// Parses the list of Unicode codepoint ranges. Every codepoint must be
+    /// written with the "U+" prefix, including the end of a range. Spaces and
+    /// tabs are allowed around '-' and ','. Valid syntax:
+    ///
+    ///   "" (empty returns null)
+    ///   U+1234
+    ///   U+1234-U+5678
+    ///   U+1234,U+5678
+    ///   U+1234-U+5678,U+9ABC
+    ///   U+1234,U+5678-U+9ABC
+    ///
+    /// etc.
+    const UnicodeRangeParser = struct {
+        /// The text being parsed.
+        input: []const u8,
+
+        /// Index of the next unread byte of `input`.
+        i: usize = 0,
+
+        /// Returns the next inclusive `{ start, end }` range, or null once
+        /// the input is exhausted. A single codepoint is returned as a range
+        /// whose start and end are the same value.
+        ///
+        /// Returns error.InvalidValue on malformed input, on a range whose
+        /// start is greater than its end, and on a comma that follows a
+        /// single codepoint without another codepoint after it.
+        pub fn next(self: *UnicodeRangeParser) !?[2]u21 {
+            // Once we're EOF then we're done without an error.
+            if (self.eof()) return null;
+
+            // One codepoint no matter what.
+            const start = try self.parseCodepoint();
+
+            // Whitespace may follow the codepoint. The EOF check has to come
+            // after skipping it, otherwise input that ends in whitespace
+            // would be indexed out of bounds by the switch below.
+            self.consumeWhitespace();
+            if (self.eof()) return .{ start, start };
+
+            // Otherwise we expect either a range or a comma.
+            switch (self.input[self.i]) {
+                // Comma means we have another codepoint but in a different
+                // range so we return our current codepoint.
+                ',' => {
+                    self.advance();
+                    self.consumeWhitespace();
+                    if (self.eof()) return error.InvalidValue;
+                    return .{ start, start };
+                },
+
+                // Hyphen means we have a range.
+                '-' => {
+                    self.advance();
+                    self.consumeWhitespace();
+                    if (self.eof()) return error.InvalidValue;
+                    const end = try self.parseCodepoint();
+                    self.consumeWhitespace();
+
+                    // Only a comma (or the end of input) may follow a range.
+                    if (!self.eof()) {
+                        if (self.input[self.i] != ',') return error.InvalidValue;
+                        self.advance();
+                        self.consumeWhitespace();
+                    }
+
+                    if (start > end) return error.InvalidValue;
+                    return .{ start, end };
+                },
+
+                else => return error.InvalidValue,
+            }
+        }
+
+        /// Skips over any run of spaces and tabs at the current position.
+        fn consumeWhitespace(self: *UnicodeRangeParser) void {
+            while (!self.eof()) {
+                switch (self.input[self.i]) {
+                    ' ', '\t' => self.advance(),
+                    else => return,
+                }
+            }
+        }
+
+        /// Parses "U+" followed by one or more hex digits at the current
+        /// position and returns the value. Returns error.InvalidValue if the
+        /// prefix or digits are missing or the value does not fit in a u21.
+        fn parseCodepoint(self: *UnicodeRangeParser) !u21 {
+            // Guard the first index so the function is safe on its own.
+            if (self.eof()) return error.InvalidValue;
+            if (self.input[self.i] != 'U') return error.InvalidValue;
+            self.advance();
+            if (self.eof()) return error.InvalidValue;
+            if (self.input[self.i] != '+') return error.InvalidValue;
+            self.advance();
+            if (self.eof()) return error.InvalidValue;
+
+            const start_i = self.i;
+            while (!self.eof() and isHexDigit(self.input[self.i])) {
+                self.advance();
+            }
+
+            // If we didn't consume a single character, we have an error.
+            if (start_i == self.i) return error.InvalidValue;
+
+            return std.fmt.parseInt(u21, self.input[start_i..self.i], 16) catch
+                return error.InvalidValue;
+        }
+
+        /// True for ASCII 0-9, A-F and a-f.
+        fn isHexDigit(ch: u8) bool {
+            return switch (ch) {
+                '0'...'9', 'A'...'F', 'a'...'f' => true,
+                else => false,
+            };
+        }
+
+        /// Moves the read position forward by one byte.
+        fn advance(self: *UnicodeRangeParser) void {
+            self.i += 1;
+        }
+
+        /// True once the read position is at or past the end of the input.
+        fn eof(self: *const UnicodeRangeParser) bool {
+            return self.i >= self.input.len;
+        }
+    };
+
+    test "parseCLI" {
+        const testing = std.testing;
+        var arena = ArenaAllocator.init(testing.allocator);
+        defer arena.deinit();
+        const alloc = arena.allocator();
+
+        var list: Self = .{};
+        try list.parseCLI(alloc, "U+ABCD=Comic Sans");
+        try list.parseCLI(alloc, "U+0001 - U+0005=Verdana");
+        try list.parseCLI(alloc, "U+0006-U+0009, U+ABCD=Courier");
+
+        try testing.expectEqual(@as(usize, 4), list.map.list.len);
+        {
+            const entry = list.map.list.get(0);
+            try testing.expectEqual([2]u21{ 0xABCD, 0xABCD }, entry.range);
+            try testing.expectEqualStrings("Comic Sans", entry.descriptor.family.?);
+        }
+        {
+            const entry = list.map.list.get(1);
+            try testing.expectEqual([2]u21{ 1, 5 }, entry.range);
+            try testing.expectEqualStrings("Verdana", entry.descriptor.family.?);
+        }
+        {
+            const entry = list.map.list.get(2);
+            try testing.expectEqual([2]u21{ 6, 9 }, entry.range);
+            try testing.expectEqualStrings("Courier", entry.descriptor.family.?);
+        }
+        {
+            const entry = list.map.list.get(3);
+            try testing.expectEqual([2]u21{ 0xABCD, 0xABCD }, entry.range);
+            try testing.expectEqualStrings("Courier", entry.descriptor.family.?);
+        }
+    }
+};
+
 /// Options for copy on select behavior.
 pub const CopyOnSelect = enum {
    /// Disables copy on select entirely.
--- a/src/font/CodepointMap.zig
+++ b/src/font/CodepointMap.zig
@ -0,0 +1,81 @@
+/// CodepointMap is a map of codepoints to a discovery descriptor of a font
+/// to use for that codepoint. If the descriptor doesn't return any matching
+/// font, the codepoint is rendered using the default font.
+const CodepointMap = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const discovery = @import("discovery.zig");
+
+pub const Entry = struct {
+    /// Unicode codepoint range. Asserts range[0] <= range[1].
+    range: [2]u21,
+
+    /// The discovery descriptor of the font to use for this range.
+    descriptor: discovery.Descriptor,
+};
+
+/// The list of entries. We use a multiarraylist because Descriptors are
+/// quite large and we will very rarely match, so we'd rather pack our
+/// ranges together to make everything more cache friendly for lookups.
+///
+/// Note: we just do a linear search because we expect to always have very
+/// few entries, so the overhead of a binary search is not worth it. This is
+/// possible to defeat with some pathological inputs, but there is no realistic
+/// scenario where this will be a problem except people trying to fuck around.
+list: std.MultiArrayList(Entry) = .{},
+
+pub fn deinit(self: *CodepointMap, alloc: Allocator) void {
+    self.list.deinit(alloc);
+}
+
+/// Add an entry to the map.
+///
+/// For conflicting codepoints, entries added later take priority over
+/// entries added earlier.
+pub fn add(self: *CodepointMap, alloc: Allocator, entry: Entry) !void {
+    assert(entry.range[0] <= entry.range[1]);
+    try self.list.append(alloc, entry);
+}
+
+/// Get a descriptor for a codepoint.
+///
+/// Entries are searched from the most recently added to the oldest, so when
+/// several ranges contain `cp` the one added last wins. Returns null when no
+/// range contains `cp`.
+pub fn get(self: *const CodepointMap, cp: u21) ?discovery.Descriptor {
+    const ranges = self.list.items(.range);
+
+    // Walk backwards and use the same index for both the range that is
+    // tested and the descriptor that is returned.
+    var i: usize = ranges.len;
+    while (i > 0) {
+        i -= 1;
+        const range = ranges[i];
+        if (range[0] <= cp and cp <= range[1]) {
+            const descs = self.list.items(.descriptor);
+            return descs[i];
+        }
+    }
+
+    return null;
+}
+
+test "codepointmap" {
+    const testing = std.testing;
+    const alloc = testing.allocator;
+
+    var m: CodepointMap = .{};
+    defer m.deinit(alloc);
+
+    // Exact range
+    try testing.expect(m.get(1) == null);
+    try m.add(alloc, .{ .range = .{ 1, 1 }, .descriptor = .{ .family = "A" } });
+    {
+        const d = m.get(1).?;
+        try testing.expectEqualStrings("A", d.family.?);
+    }
+
+    // Later entry takes priority
+    try m.add(alloc, .{ .range = .{ 1, 2 }, .descriptor = .{ .family = "B" } });
+    {
+        const d = m.get(1).?;
+        try testing.expectEqualStrings("B", d.family.?);
+    }
+
+    // Non-matching
+    try testing.expect(m.get(0) == null);
+    try testing.expect(m.get(3) == null);
+}
--- a/src/font/Group.zig
+++ b/src/font/Group.zig
@ -32,6 +32,33 @@ const log = std.log.scoped(.font_group);
 // most important memory efficiency we can look for. This is totally opaque
 // to the user so we can change this later.
 const StyleArray = std.EnumArray(Style, std.ArrayListUnmanaged(GroupFace));
+
+/// Map of descriptors to faces. This is used with manual codepoint maps
+/// to ensure that we don't load the same font multiple times.
+///
+/// Note that the current implementation will load the same font multiple
+/// times if the font used for a codepoint map is identical to a font used
+/// for a regular style. That's just an inefficient choice made now because
+/// the implementation is simpler and codepoint maps matching a regular
+/// font is a rare case.
+///
+/// The value type is an optional font index, so a descriptor can be present
+/// in the map while holding null.
+const DescriptorCache = std.HashMapUnmanaged(
+    font.discovery.Descriptor,
+    ?FontIndex,
+    struct {
+        const KeyType = font.discovery.Descriptor;
+
+        // Delegates to the descriptor's own hash function.
+        pub fn hash(ctx: @This(), k: KeyType) u64 {
+            _ = ctx;
+            return k.hash();
+        }
+
+        // Two keys are treated as equal exactly when their hashes are equal;
+        // the descriptor fields are not compared directly, so descriptors
+        // whose hashes collide are indistinguishable to this map.
+        pub fn eql(ctx: @This(), a: KeyType, b: KeyType) bool {
+            return ctx.hash(a) == ctx.hash(b);
+        }
+    },
+    std.hash_map.default_max_load_percentage,
+);
+
 /// The allocator for this group
 alloc: Allocator,

@ -49,6 +76,15 @@ faces: StyleArray,
 /// the codepoint. This can be set after initialization.
 discover: ?*font.Discover = null,

+/// A map of codepoints to font requests for codepoint-level overrides.
+/// The memory associated with the map is owned by the caller and is not
+/// modified or freed by Group.
+codepoint_map: ?font.CodepointMap = null,
+
+/// The descriptor cache is used to cache the descriptor to font face
+/// mapping for codepoint maps.
+descriptor_cache: DescriptorCache = .{},
+
 /// Set this to a non-null value to enable sprite glyph drawing. If this
 /// isn't enabled we'll just fall through to trying to use regular fonts
 /// to render sprite glyphs. But more than likely, if this isn't set then
@ -86,11 +122,15 @@ pub fn init(
 }

 pub fn deinit(self: *Group) void {
-    var it = self.faces.iterator();
-    while (it.next()) |entry| {
-        for (entry.value.items) |*item| item.deinit();
-        entry.value.deinit(self.alloc);
+    {
+        var it = self.faces.iterator();
+        while (it.next()) |entry| {
+            for (entry.value.items) |*item| item.deinit();
+            entry.value.deinit(self.alloc);
+        }
    }
+
+    self.descriptor_cache.deinit(self.alloc);
 }

 /// Add a face to the list for the given style. This face will be added as
@ -172,9 +212,12 @@ pub fn setSize(self: *Group, size: font.face.DesiredSize) !void {
 }

 /// This represents a specific font in the group.
-pub const FontIndex = packed struct(u8) {
+pub const FontIndex = packed struct(FontIndex.Backing) {
+    const Backing = u16;
+    const backing_bits = @typeInfo(Backing).Int.bits;
+
    /// The number of bits we use for the index.
-    const idx_bits = 8 - @typeInfo(@typeInfo(Style).Enum.tag_type).Int.bits;
+    const idx_bits = backing_bits - @typeInfo(@typeInfo(Style).Enum.tag_type).Int.bits;
    pub const IndexInt = @Type(.{ .Int = .{ .signedness = .unsigned, .bits = idx_bits } });

    /// The special-case fonts that we support.
@ -195,7 +238,7 @@ pub const FontIndex = packed struct(u8) {
    }

    /// Convert to int
-    pub fn int(self: FontIndex) u8 {
+    pub fn int(self: FontIndex) Backing {
        return @bitCast(self);
    }

@ -211,7 +254,11 @@ pub const FontIndex = packed struct(u8) {
        // We never want to take up more than a byte since font indexes are
        // everywhere so if we increase the size of this we'll dramatically
        // increase our memory usage.
-        try std.testing.expectEqual(@sizeOf(u8), @sizeOf(FontIndex));
+        try std.testing.expectEqual(@sizeOf(Backing), @sizeOf(FontIndex));
+
+        // Just so we're aware when this changes. The current maximum number
+        // of fonts for a style is 13 bits or 8192 fonts.
+        try std.testing.expectEqual(13, idx_bits);
    }
 };

@ -231,6 +278,13 @@ pub fn indexForCodepoint(
    style: Style,
    p: ?Presentation,
 ) ?FontIndex {
+    // Codepoint overrides.
+    if (self.indexForCodepointOverride(cp)) |idx_| {
+        if (idx_) |idx| return idx;
+    } else |err| {
+        log.warn("codepoint override failed codepoint={} err={}", .{ cp, err });
+    }
+
    // If we have sprite drawing enabled, check if our sprite face can
    // handle this.
    if (self.sprite) |sprite| {
@ -306,6 +360,60 @@ fn indexForCodepointExact(self: Group, cp: u32, style: Style, p: ?Presentation)
    return null;
 }

+/// Checks if the codepoint is in the map of codepoint overrides,
+/// finds the override font, and returns it.
+///
+/// Returns null when font discovery is compiled out, when no codepoint map
+/// or discoverer is set, when `cp` does not fit in a u21 or has no mapping,
+/// when no font matches the mapped descriptor, or when the font that was
+/// found does not report having `cp`.
+///
+/// Errors from discovery, from adding the face, and from inserting into the
+/// descriptor cache are returned to the caller.
+fn indexForCodepointOverride(self: *Group, cp: u32) !?FontIndex {
+    // Without a discovery backend there is nothing to look fonts up with.
+    if (comptime font.Discover == void) return null;
+    const map = self.codepoint_map orelse return null;
+
+    // If we have a codepoint too large or isn't in the map, then we
+    // don't have an override.
+    const cp_u21 = std.math.cast(u21, cp) orelse return null;
+    const desc = map.get(cp_u21) orelse return null;
+
+    // Fast path: the descriptor is already loaded.
+    // The cache lookup yields an optional of an optional: the outer level
+    // says whether the descriptor was seen before, the inner level whether
+    // a font was found for it. `orelse` only unwraps the outer level.
+    const idx_: ?FontIndex = self.descriptor_cache.get(desc) orelse idx: {
+        // Slow path: we have to find this descriptor and load the font
+        const discover = self.discover orelse return null;
+        var disco_it = try discover.discover(desc);
+        defer disco_it.deinit();
+
+        // Only the first result of the discovery iterator is used.
+        const face = (try disco_it.next()) orelse {
+            log.warn(
+                "font lookup for codepoint map failed codepoint={} err=FontNotFound",
+                .{cp},
+            );
+
+            // Add null to the cache so we don't do a lookup again later.
+            try self.descriptor_cache.put(self.alloc, desc, null);
+            return null;
+        };
+
+        // Add the font to our list of fonts so we can get an index for it,
+        // and ensure the index is stored in the descriptor cache for next time.
+        // The face is always registered under the regular style.
+        const idx = try self.addFace(.regular, .{ .deferred = face });
+        try self.descriptor_cache.put(self.alloc, desc, idx);
+
+        break :idx idx;
+    };
+
+    // The descriptor cache will populate null if the descriptor is not found
+    // to avoid expensive discoveries later.
+    const idx = idx_ orelse return null;
+
+    // We need to verify that this index has the codepoint we want.
+    if (self.hasCodepoint(idx, cp, null)) {
+        log.debug("codepoint override based on config codepoint={} family={s}", .{
+            cp,
+            desc.family orelse "",
+        });
+
+        return idx;
+    }
+
+    // The mapped font exists but does not have this codepoint.
+    return null;
+}
+
 /// Check if a specific font index has a specific codepoint. This does not
 /// necessarily force the font to load.
 pub fn hasCodepoint(self: *Group, index: FontIndex, cp: u32, p: ?Presentation) bool {
--- a/src/font/discovery.zig
+++ b/src/font/discovery.zig
@ -56,6 +56,30 @@ pub const Descriptor = struct {
    /// will be preferred, but not guaranteed.
    variations: []const Variation = &.{},

+    /// Returns a hash code that can be used to identify this descriptor.
+    ///
+    /// The hash covers every field of the descriptor. String fields are
+    /// hashed by content, so two descriptors naming the same family hash
+    /// the same even when the strings live at different addresses.
+    pub fn hash(self: Descriptor) u64 {
+        const autoHash = std.hash.autoHash;
+        const autoHashStrat = std.hash.autoHashStrat;
+        var hasher = std.hash.Wyhash.init(0);
+
+        // `autoHash` on an optional slice hashes the pointer and length,
+        // not the bytes. Use the deep strategy so the content is hashed.
+        // NOTE(review): `style` is assumed to be string-like as well; the
+        // deep strategy is harmless if it is a plain value type.
+        autoHashStrat(&hasher, self.family, .Deep);
+        autoHashStrat(&hasher, self.style, .Deep);
+        autoHash(&hasher, self.codepoint);
+        autoHash(&hasher, self.size);
+        autoHash(&hasher, self.bold);
+        autoHash(&hasher, self.italic);
+        autoHash(&hasher, self.monospace);
+        autoHash(&hasher, self.variations.len);
+        for (self.variations) |variation| {
+            autoHash(&hasher, variation.id);
+
+            // Hash the raw bytes of the value. Converting the float to an
+            // unsigned integer is illegal for negative or non-finite values
+            // and would also discard the fractional part.
+            hasher.update(std.mem.asBytes(&variation.value));
+        }
+
+        return hasher.final();
+    }
+
    /// Convert to Fontconfig pattern to use for lookup. The pattern does
    /// not have defaults filled/substituted (Fontconfig thing) so callers
    /// must still do this.
@ -350,6 +374,21 @@ pub const CoreText = struct {
    };
 };

+test "descriptor hash" {
+    const testing = std.testing;
+
+    var d: Descriptor = .{};
+    try testing.expect(d.hash() != 0);
+}
+
+test "descriptor hash familiy names" {
+    const testing = std.testing;
+
+    var d1: Descriptor = .{ .family = "A" };
+    var d2: Descriptor = .{ .family = "B" };
+    try testing.expect(d1.hash() != d2.hash());
+}
+
 test "fontconfig" {
    if (options.backend != .fontconfig_freetype) return error.SkipZigTest;

--- a/src/font/main.zig
+++ b/src/font/main.zig
@ -5,6 +5,7 @@ const build_config = @import("../build_config.zig");
 pub const Atlas = @import("Atlas.zig");
 pub const discovery = @import("discovery.zig");
 pub const face = @import("face.zig");
+pub const CodepointMap = @import("CodepointMap.zig");
 pub const DeferredFace = @import("DeferredFace.zig");
 pub const Face = face.Face;
 pub const Group = @import("Group.zig");