mirror of
https://github.com/curl/curl.git
synced 2026-01-18 17:21:26 +01:00
mdlinkcheck: detect and check "raw" links
- URLs specified outside of the markdown []() syntax are now extracted and checked
- also check the TODO, FAQ and KNOWN_BUGS files
- more aggressively avoid checking links to github.com/curl/curl, all uses of example domains and some more established URLs on the curl.se site
- list all errors at the end to make them easier to spot in CI logs

Closes #19848
This commit is contained in:
@@ -27,7 +27,10 @@ use strict;
|
||||
use warnings;
|
||||
|
||||
my %whitelist = (
|
||||
'https://curl.se' => 1,
|
||||
'https://curl.se/' => 1,
|
||||
'https://curl.se/bug/' => 1,
|
||||
'https://curl.se/bug/view.cgi' => 1,
|
||||
'https://curl.se/changes.html' => 1,
|
||||
'https://curl.se/dev/advisory.html' => 1,
|
||||
'https://curl.se/dev/builds.html' => 1,
|
||||
@@ -40,19 +43,25 @@ my %whitelist = (
|
||||
'https://curl.se/docs/bugbounty.html' => 1,
|
||||
'https://curl.se/docs/caextract.html' => 1,
|
||||
'https://curl.se/docs/copyright.html' => 1,
|
||||
'https://curl.se/docs/http-cookies.html' => 1,
|
||||
'https://curl.se/docs/install.html' => 1,
|
||||
'https://curl.se/docs/knownbugs.html' => 1,
|
||||
'https://curl.se/docs/manpage.html' => 1,
|
||||
'https://curl.se/docs/releases.html' => 1,
|
||||
'https://curl.se/docs/security.html' => 1,
|
||||
'https://curl.se/docs/ssl-ciphers.html' => 1,
|
||||
'https://curl.se/docs/ssl-compared.html' => 1,
|
||||
'https://curl.se/docs/sslcerts.html' => 1,
|
||||
'https://curl.se/docs/thanks.html' => 1,
|
||||
'https://curl.se/docs/todo.html' => 1,
|
||||
'https://curl.se/docs/vulnerabilities.html' => 1,
|
||||
'https://curl.se/download.html' => 1,
|
||||
'https://curl.se/libcurl/' => 1,
|
||||
'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1,
|
||||
'https://curl.se/libcurl/c/CURLOPT_SSL_CIPHER_LIST.html' => 1,
|
||||
'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1,
|
||||
'https://curl.se/libcurl/c/CURLOPT_TLS13_CIPHERS.html' => 1,
|
||||
'https://curl.se/libcurl/c/libcurl.html' => 1,
|
||||
'https://curl.se/libcurl/c/threadsafe.html' => 1,
|
||||
'https://curl.se/logo/curl-logo.svg' => 1,
|
||||
'https://curl.se/mail/' => 1,
|
||||
'https://curl.se/mail/etiquette.html' => 1,
|
||||
@@ -62,14 +71,15 @@ my %whitelist = (
|
||||
'https://curl.se/rfc/rfc2255.txt' => 1,
|
||||
'https://curl.se/sponsors.html' => 1,
|
||||
'https://curl.se/support.html' => 1,
|
||||
'https://curl.se/windows' => 1,
|
||||
'https://curl.se/windows/' => 1,
|
||||
|
||||
'https://testclutch.curl.se/' => 1,
|
||||
|
||||
'https://github.com/curl/curl' => 1,
|
||||
'https://github.com/curl/curl-fuzzer' => 1,
|
||||
'https://github.com/curl/curl-www' => 1,
|
||||
'https://github.com/curl/curl/discussions' => 1,
|
||||
'https://github.com/curl/curl/issues' => 1,
|
||||
'https://github.com/curl/curl/labels/help%20wanted' => 1,
|
||||
'https://github.com/curl/curl/pulls' => 1,
|
||||
'https://github.com/curl/curl.git' => 1,
|
||||
'https://github.com/curl/curl/wcurl' => 1,
|
||||
|
||||
);
|
||||
|
||||
@@ -77,7 +87,7 @@ my %url;
|
||||
my %flink;
|
||||
|
||||
# list all .md files in the repo
|
||||
my @files=`git ls-files '**.md'`;
|
||||
my @files=`git ls-files '**.md' docs/TODO docs/KNOWN_BUGS docs/FAQ`;
|
||||
|
||||
sub storelink {
|
||||
my ($f, $line, $link) = @_;
|
||||
@@ -91,7 +101,29 @@ sub storelink {
|
||||
$link =~ s:\#.*\z::;
|
||||
|
||||
if($link =~ /^(https|http):/) {
|
||||
$url{$link} .= "$f:$line ";
|
||||
if($whitelist{$link}) {
|
||||
#print "-- whitelisted: $link\n";
|
||||
}
|
||||
# example.com is just example
|
||||
elsif($link =~ /^https:\/\/(.*)example.(com|org|net)/) {
|
||||
#print "-- example: $link\n";
|
||||
}
|
||||
# so is using the .example TLD
|
||||
elsif($link =~ /^https:\/\/(.*)\.example(\/|$|:)/) {
|
||||
#print "-- .example: $link\n";
|
||||
}
|
||||
# so is using anything on localhost
|
||||
elsif($link =~ /^http(s|):\/\/localhost/) {
|
||||
#print "-- localhost: $link\n";
|
||||
}
|
||||
# ignore all links to curl's github repo
|
||||
elsif($link =~ /^https:\/\/github.com\/curl\/curl(\/|$)/) {
|
||||
#print "-- curl github repo: $link\n";
|
||||
}
|
||||
else {
|
||||
#print "ADD '$link'\n";
|
||||
$url{$link} .= "$f:$line ";
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -119,11 +151,19 @@ sub findlinks {
|
||||
return;
|
||||
|
||||
while(<F>) {
|
||||
chomp;
|
||||
if(/\]\(([^)]*)/) {
|
||||
my $link = $1;
|
||||
#print "$f:$line $link\n";
|
||||
storelink($f, $line, $link);
|
||||
}
|
||||
# ignore trailing: dot, quote, asterisk, hash, comma, question mark,
|
||||
# colon, closing parenthesis, closing angle bracket, whitespace, pipe,
|
||||
# backtick, semicolon
|
||||
elsif(/(https:\/\/[a-z0-9.\/:%_-]+[^."*\#,?:\)> \t|`;])/i) {
|
||||
#print "RAW ";
|
||||
storelink($f, $line, $1);
|
||||
}
|
||||
$line++;
|
||||
}
|
||||
close(F);
|
||||
@@ -133,11 +173,10 @@ sub checkurl {
|
||||
my ($url) = @_;
|
||||
|
||||
if($whitelist{$url}) {
|
||||
#print "$url is whitelisted\n";
|
||||
#print STDERR "$url is whitelisted\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
print "check $url\n";
|
||||
$url =~ s/\+/%2B/g;
|
||||
my @content;
|
||||
if(open(my $fh, '-|', 'curl', '-ILfsm10', '--retry', '2', '--retry-delay', '5',
|
||||
@@ -146,9 +185,10 @@ sub checkurl {
|
||||
close $fh;
|
||||
}
|
||||
if(!$content[0]) {
|
||||
print STDERR "FAIL\n";
|
||||
print "FAIL: $url\n";
|
||||
return 1; # fail
|
||||
}
|
||||
print "OK: $url\n";
|
||||
return 0; # ok
|
||||
}
|
||||
|
||||
@@ -157,14 +197,19 @@ for my $f (@files) {
|
||||
findlinks($f);
|
||||
}
|
||||
|
||||
my $error;
|
||||
#for my $u (sort keys %url) {
|
||||
# print "$u\n";
|
||||
#}
|
||||
#exit;
|
||||
|
||||
my $error;
|
||||
my @errlist;
|
||||
for my $u (sort keys %url) {
|
||||
my $r = checkurl($u);
|
||||
|
||||
if($r) {
|
||||
for my $f (split(/ /, $url{$u})) {
|
||||
printf "%s ERROR links to missing URL %s\n", $f, $u;
|
||||
push @errlist, sprintf "%s ERROR links to missing URL %s\n", $f, $u;
|
||||
$error++;
|
||||
}
|
||||
}
|
||||
@@ -173,10 +218,17 @@ for my $u (sort keys %url) {
|
||||
for my $l (sort keys %flink) {
|
||||
if(! -r $l) {
|
||||
for my $f (split(/ /, $flink{$l})) {
|
||||
printf "%s ERROR links to missing file %s\n", $f, $l;
|
||||
push @errlist, sprintf "%s ERROR links to missing file %s\n", $f, $l;
|
||||
$error++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf "Checked %d URLs\n", scalar(keys %url);
|
||||
if($error) {
|
||||
print "$error URLs had problems:\n";
|
||||
for(@errlist) {
|
||||
print $_;
|
||||
}
|
||||
}
|
||||
exit 1 if($error);
|
||||
|
||||
Reference in New Issue
Block a user