diff --git a/src/config/url.zig b/src/config/url.zig index 4cbfacdd4..b0820bb43 100644 --- a/src/config/url.zig +++ b/src/config/url.zig @@ -5,22 +5,75 @@ const oni = @import("oniguruma"); /// This is here in the config package because one day the matchers will be /// configurable and this will be a default. /// -/// This is taken from the Alacritty project. -pub const regex = "(ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file:|git://|ssh:|ftp://)[^\u{0000}-\u{001F}\u{007F}-\u{009F}<>\x22\\s{-}\\^⟨⟩\x60]+"; +/// This is adapted from a regex used in the Alacritty project. +/// +/// This regex is liberal in what it accepts after the scheme, with exceptions +/// for URLs ending with . or ). Although such URLs are perfectly valid, it is +/// common for text to contain URLs surrounded by parentheses (such as in +/// Markdown links) or at the end of sentences. Therefore, this regex excludes +/// them as follows: +/// +/// 1. Do not match regexes ending with . +/// 2. Do not match regexes ending with ), except for ones which contain a ( +/// without a subsequent ) +/// +/// Rule 2 means that that we handle the following two cases: +/// +/// "https://en.wikipedia.org/wiki/Rust_(video_game)" (include parens) +/// "(https://example.com)" (do not include the parens) +/// +/// There are many complicated cases where these heuristics break down, but +/// handling them well requires a non-regex approach. +pub const regex = "(?:" ++ url_scheme ++ ")(?:[^" ++ url_exclude ++ "]*[^" ++ url_exclude ++ ").]|[^" ++ url_exclude ++ "(]*\\([^" ++ url_exclude ++ ")]*\\))"; +const url_scheme = "ipfs:|ipns:|magnet:|mailto:|gemini://|gopher://|https://|http://|news:|file:|git://|ssh:|ftp://"; +const url_exclude = "\u{0000}-\u{001F}\u{007F}-\u{009F}<>\x22\\s{-}\\^⟨⟩\x60"; test "url regex" { + const testing = std.testing; + try oni.testing.ensureInit(); - var re = try oni.Regex.init(regex, .{}, oni.Encoding.utf8, oni.Syntax.default, null); + var re = try oni.Regex.init( + regex, + .{ .find_longest = true }, + oni.Encoding.utf8, + oni.Syntax.default, + null, + ); defer re.deinit(); - // The URL cases to test that our regex matches. Feel free to add to this + // The URL cases to test what our regex matches. Feel free to add to this // as we find bugs or just want more coverage. - const cases: []const []const u8 = &.{ - "https://example.com", + const cases = [_]struct { + input: []const u8, + expect: []const u8, + }{ + .{ + .input = "hello https://example.com world", + .expect = "https://example.com", + }, + .{ + .input = "https://example.com/foo(bar) more", + .expect = "https://example.com/foo(bar)", + }, + .{ + .input = "https://example.com/foo(bar)baz more", + .expect = "https://example.com/foo(bar)baz", + }, + .{ + .input = "Link inside (https://example.com) parens", + .expect = "https://example.com", + }, + .{ + .input = "Link period https://example.com. More text.", + .expect = "https://example.com", + }, }; for (cases) |case| { - var reg = try re.search(case, .{}); + var reg = try re.search(case.input, .{}); defer reg.deinit(); + try testing.expectEqual(@as(usize, 1), reg.count()); + const match = case.input[@intCast(reg.starts()[0])..@intCast(reg.ends()[0])]; + try testing.expectEqualStrings(case.expect, match); } } diff --git a/src/input/Link.zig b/src/input/Link.zig index 4fb9eadcf..55390d4be 100644 --- a/src/input/Link.zig +++ b/src/input/Link.zig @@ -36,7 +36,7 @@ pub const Highlight = union(enum) { pub fn oniRegex(self: *const Link) !oni.Regex { return try oni.Regex.init( self.regex, - .{}, + .{ .find_longest = true }, oni.Encoding.utf8, oni.Syntax.default, null,