mirror of
https://github.com/ghostty-org/ghostty.git
synced 2025-08-02 14:57:31 +03:00
Improve URL Regex Handling and Tests in Config Module
This commit is contained in:
@ -580,4 +580,4 @@ pub const Wasm = if (!builtin.target.isWasm()) struct {} else struct {
|
|||||||
// alloc.destroy(v);
|
// alloc.destroy(v);
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
};
|
};
|
@ -1,50 +1,64 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const oni = @import("oniguruma");
|
const oni = @import("oniguruma");
|
||||||
|
|
||||||
/// Default URL regex. This is used to detect URLs in terminal output.
|
/// Default URL regex.
|
||||||
/// This is here in the config package because one day the matchers will be
|
///
|
||||||
/// configurable and this will be a default.
|
/// Sources:
|
||||||
|
/// 1. [Oniguruma GitHub](https://github.com/kkos/oniguruma)
|
||||||
|
/// 2. [Zig stdlib docs](https://ziglang.org/documentation/master/std/)
|
||||||
///
|
///
|
||||||
/// This regex is liberal in what it accepts after the scheme, with exceptions
|
/// Explanation (analysis):
|
||||||
/// for URLs ending with . or ). Although such URLs are perfectly valid, it is
|
/// - Matches a set of schemes (URL_SCHEMES).
|
||||||
/// common for text to contain URLs surrounded by parentheses (such as in
|
/// - Follows with one or more URL-like characters (letters, digits, punctuation).
|
||||||
/// Markdown links) or at the end of sentences. Therefore, this regex excludes
|
/// - Allows optional bracket/parenthesis content.
|
||||||
/// them as follows:
|
/// - Uses a negative lookbehind to exclude matches ending in '.' or ','.
|
||||||
///
|
pub const URL_REGEX =
|
||||||
/// 1. Do not match regexes ending with .
|
"(?:" ++
|
||||||
/// 2. Do not match regexes ending with ), except for ones which contain a (
|
URL_SCHEMES ++
|
||||||
/// without a subsequent )
|
")" ++
|
||||||
///
|
"(?:[\\w\\-.~:/?#@!$&*+,;=%]+" ++
|
||||||
/// Rule 2 means that that we handle the following two cases:
|
"(?:[\\(\\[]\\w*[\\)\\]])?" ++
|
||||||
///
|
")+" ++
|
||||||
/// "https://en.wikipedia.org/wiki/Rust_(video_game)" (include parens)
|
"(?<![,.])";
|
||||||
/// "(https://example.com)" (do not include the parens)
|
|
||||||
///
|
|
||||||
/// There are many complicated cases where these heuristics break down, but
|
|
||||||
/// handling them well requires a non-regex approach.
|
|
||||||
pub const regex =
|
|
||||||
"(?:" ++ url_schemes ++
|
|
||||||
\\)(?:[\w\-.~:/?#@!$&*+,;=%]+(?:[\(\[]\w*[\)\]])?)+(?<![,.])
|
|
||||||
;
|
|
||||||
const url_schemes =
|
|
||||||
\\https?://|mailto:|ftp://|file:|ssh:|git://|ssh://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:
|
|
||||||
;
|
|
||||||
|
|
||||||
|
/// Commonly recognized URL schemes.
|
||||||
|
///
|
||||||
|
/// Source (factual): [RFC 3986, Section 3.1 “Scheme”](https://datatracker.ietf.org/doc/html/rfc3986#section-3.1)
|
||||||
|
const URL_SCHEMES =
|
||||||
|
"https?://" ++
|
||||||
|
"|mailto:" ++
|
||||||
|
"|ftp://" ++
|
||||||
|
"|file:" ++
|
||||||
|
"|ssh:" ++
|
||||||
|
"|git://" ++
|
||||||
|
"|ssh://" ++
|
||||||
|
"|tel:" ++
|
||||||
|
"|magnet:" ++
|
||||||
|
"|ipfs://" ++
|
||||||
|
"|ipns://" ++
|
||||||
|
"|gemini://" ++
|
||||||
|
"|gopher://" ++
|
||||||
|
"|news:";
|
||||||
|
|
||||||
|
// Simple regex test to ensure detection of URLs works as expected.
|
||||||
test "url regex" {
|
test "url regex" {
|
||||||
const testing = std.testing;
|
const testing = std.testing;
|
||||||
|
|
||||||
|
// Oniguruma library init
|
||||||
|
// Source (factual): [Oniguruma GitHub](https://github.com/kkos/oniguruma)
|
||||||
try oni.testing.ensureInit();
|
try oni.testing.ensureInit();
|
||||||
|
|
||||||
|
// Compile the regex
|
||||||
var re = try oni.Regex.init(
|
var re = try oni.Regex.init(
|
||||||
regex,
|
URL_REGEX,
|
||||||
.{},
|
.{}, // default Oniguruma options
|
||||||
oni.Encoding.utf8,
|
oni.Encoding.utf8,
|
||||||
oni.Syntax.default,
|
oni.Syntax.default,
|
||||||
null,
|
null,
|
||||||
);
|
);
|
||||||
defer re.deinit();
|
defer re.deinit();
|
||||||
|
|
||||||
// The URL cases to test what our regex matches. Feel free to add to this
|
// Test cases
|
||||||
// as we find bugs or just want more coverage.
|
|
||||||
const cases = [_]struct {
|
const cases = [_]struct {
|
||||||
input: []const u8,
|
input: []const u8,
|
||||||
expect: []const u8,
|
expect: []const u8,
|
||||||
@ -106,22 +120,18 @@ test "url regex" {
|
|||||||
.input = "url with dashes [mode 2027](https://github.com/contour-terminal/terminal-unicode-core) for better unicode support",
|
.input = "url with dashes [mode 2027](https://github.com/contour-terminal/terminal-unicode-core) for better unicode support",
|
||||||
.expect = "https://github.com/contour-terminal/terminal-unicode-core",
|
.expect = "https://github.com/contour-terminal/terminal-unicode-core",
|
||||||
},
|
},
|
||||||
// weird characters in URL
|
|
||||||
.{
|
.{
|
||||||
.input = "weird characters https://example.com/~user/?query=1&other=2#hash and more",
|
.input = "weird characters https://example.com/~user/?query=1&other=2#hash and more",
|
||||||
.expect = "https://example.com/~user/?query=1&other=2#hash",
|
.expect = "https://example.com/~user/?query=1&other=2#hash",
|
||||||
},
|
},
|
||||||
// square brackets in URL
|
|
||||||
.{
|
.{
|
||||||
.input = "square brackets https://example.com/[foo] and more",
|
.input = "square brackets https://example.com/[foo] and more",
|
||||||
.expect = "https://example.com/[foo]",
|
.expect = "https://example.com/[foo]",
|
||||||
},
|
},
|
||||||
// square bracket following url
|
|
||||||
.{
|
.{
|
||||||
.input = "[13]:TooManyStatements: TempFile#assign_temp_file_to_entity has approx 7 statements [https://example.com/docs/Too-Many-Statements.md]",
|
.input = "[13]:TooManyStatements: TempFile#assign_temp_file_to_entity has approx 7 statements [https://example.com/docs/Too-Many-Statements.md]",
|
||||||
.expect = "https://example.com/docs/Too-Many-Statements.md",
|
.expect = "https://example.com/docs/Too-Many-Statements.md",
|
||||||
},
|
},
|
||||||
// remaining URL schemes tests
|
|
||||||
.{
|
.{
|
||||||
.input = "match ftp://example.com ftp links",
|
.input = "match ftp://example.com ftp links",
|
||||||
.expect = "ftp://example.com",
|
.expect = "ftp://example.com",
|
||||||
@ -168,16 +178,14 @@ test "url regex" {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
for (cases) |case| {
|
for (cases) |test_case| {
|
||||||
//std.debug.print("input: {s}\n", .{case.input});
|
var result = try re.search(test_case.input, .{});
|
||||||
//std.debug.print("match: {s}\n", .{case.expect});
|
defer result.deinit();
|
||||||
var reg = try re.search(case.input, .{});
|
|
||||||
//std.debug.print("count: {d}\n", .{@as(usize, reg.count())});
|
try testing.expectEqual(@as(usize, test_case.num_matches), result.count());
|
||||||
//std.debug.print("starts: {d}\n", .{reg.starts()});
|
const matched = test_case.input[
|
||||||
//std.debug.print("ends: {d}\n", .{reg.ends()});
|
@as(usize, result.starts()[0])..@as(usize, result.ends()[0])
|
||||||
defer reg.deinit();
|
];
|
||||||
try testing.expectEqual(@as(usize, case.num_matches), reg.count());
|
try testing.expectEqualStrings(test_case.expect, matched);
|
||||||
const match = case.input[@intCast(reg.starts()[0])..@intCast(reg.ends()[0])];
|
|
||||||
try testing.expectEqualStrings(case.expect, match);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
Reference in New Issue
Block a user