Merge pull request #1108 from cespare/url-boundary-regex

config: improve link matching for URLs ending in . or )
2025-07-14 15:56:13 +03:00 · 2023-12-16 19:28:02 -08:00
parent fd80889f9a ae94af37c0
commit 2da5ec2c28
2 changed files with 61 additions and 8 deletions
--- a/src/config/url.zig
+++ b/src/config/url.zig
@ -5,22 +5,75 @@ const oni = @import("oniguruma");
 /// This is here in the config package because one day the matchers will be
 /// configurable and this will be a default.
 ///
-/// This is taken from the Alacritty project.
-pub const regex = "(ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file:|git://|ssh:|ftp://)[^\u{0000}-\u{001F}\u{007F}-\u{009F}<>\x22\\s{-}\\^⟨⟩\x60]+";
+/// This is adapted from a regex used in the Alacritty project.
+///
+/// This regex is liberal in what it accepts after the scheme, with exceptions
+/// for URLs ending with . or ). Although such URLs are perfectly valid, it is
+/// common for text to contain URLs surrounded by parentheses (such as in
+/// Markdown links) or at the end of sentences. Therefore, this regex excludes
+/// them as follows:
+///
+/// 1. Do not match regexes ending with .
+/// 2. Do not match regexes ending with ), except for ones which contain a (
+///    without a subsequent )
+///
+/// Rule 2 means that that we handle the following two cases:
+///
+///   "https://en.wikipedia.org/wiki/Rust_(video_game)" (include parens)
+///   "(https://example.com)" (do not include the parens)
+///
+/// There are many complicated cases where these heuristics break down, but
+/// handling them well requires a non-regex approach.
+pub const regex = "(?:" ++ url_scheme ++ ")(?:[^" ++ url_exclude ++ "]*[^" ++ url_exclude ++ ").]|[^" ++ url_exclude ++ "(]*\\([^" ++ url_exclude ++ ")]*\\))";
+const url_scheme = "ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file:|git://|ssh:|ftp://";
+const url_exclude = "\u{0000}-\u{001F}\u{007F}-\u{009F}<>\x22\\s{-}\\^⟨⟩\x60";

 test "url regex" {
+    const testing = std.testing;
+
    try oni.testing.ensureInit();
-    var re = try oni.Regex.init(regex, .{}, oni.Encoding.utf8, oni.Syntax.default, null);
+    var re = try oni.Regex.init(
+        regex,
+        .{ .find_longest = true },
+        oni.Encoding.utf8,
+        oni.Syntax.default,
+        null,
+    );
    defer re.deinit();

-    // The URL cases to test that our regex matches. Feel free to add to this
+    // The URL cases to test what our regex matches. Feel free to add to this
    // as we find bugs or just want more coverage.
-    const cases: []const []const u8 = &.{
-        "https://example.com",
+    const cases = [_]struct {
+        input: []const u8,
+        expect: []const u8,
+    }{
+        .{
+            .input = "hello https://example.com world",
+            .expect = "https://example.com",
+        },
+        .{
+            .input = "https://example.com/foo(bar) more",
+            .expect = "https://example.com/foo(bar)",
+        },
+        .{
+            .input = "https://example.com/foo(bar)baz more",
+            .expect = "https://example.com/foo(bar)baz",
+        },
+        .{
+            .input = "Link inside (https://example.com) parens",
+            .expect = "https://example.com",
+        },
+        .{
+            .input = "Link period https://example.com. More text.",
+            .expect = "https://example.com",
+        },
    };

    for (cases) |case| {
-        var reg = try re.search(case, .{});
+        var reg = try re.search(case.input, .{});
        defer reg.deinit();
+        try testing.expectEqual(@as(usize, 1), reg.count());
+        const match = case.input[@intCast(reg.starts()[0])..@intCast(reg.ends()[0])];
+        try testing.expectEqualStrings(case.expect, match);
    }
 }
--- a/src/input/Link.zig
+++ b/src/input/Link.zig
@ -36,7 +36,7 @@ pub const Highlight = union(enum) {
 pub fn oniRegex(self: *const Link) !oni.Regex {
    return try oni.Regex.init(
        self.regex,
-        .{},
+        .{ .find_longest = true },
        oni.Encoding.utf8,
        oni.Syntax.default,
        null,