mirror of
https://github.com/curl/curl.git
synced 2026-01-18 17:21:26 +01:00
It finds debug outputs in source code otherwise. Output the whitelist "warnings" to stderr to better allow us to count URLs with `./mdlinkcheck --dry-run | wc -l`. Closes #19938
256 lines
7.2 KiB
Perl
Executable File
256 lines
7.2 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#***************************************************************************
|
|
# _ _ ____ _
|
|
# Project ___| | | | _ \| |
|
|
# / __| | | | |_) | |
|
|
# | (__| |_| | _ <| |___
|
|
# \___|\___/|_| \_\_____|
|
|
#
|
|
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
|
|
#
|
|
# This software is licensed as described in the file COPYING, which
|
|
# you should have received as part of this distribution. The terms
|
|
# are also available at https://curl.se/docs/copyright.html.
|
|
#
|
|
# You may opt to use, copy, modify, merge, publish, distribute and/or sell
|
|
# copies of the Software, and permit persons to whom the Software is
|
|
# furnished to do so, under the terms of the COPYING file.
|
|
#
|
|
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
|
# KIND, either express or implied.
|
|
#
|
|
# SPDX-License-Identifier: curl
|
|
#
|
|
###########################################################################
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
# URLs that are known to be fine and must not be probed over the network.
# Every value starts at 1 and is bumped each time the URL is seen during
# the scan, so an entry still at 1 afterwards is reported as unused.
my %whitelist = (
    'https://curl.se' => 1,
    'https://curl.se/' => 1,
    'https://curl.se/bug/' => 1,
    'https://curl.se/bug/view.cgi' => 1,
    'https://curl.se/changes.html' => 1,
    'https://curl.se/dev/advisory.html' => 1,
    'https://curl.se/dev/builds.html' => 1,
    'https://curl.se/dev/code-style.html' => 1,
    'https://curl.se/dev/contribute.html' => 1,
    'https://curl.se/dev/internals.html' => 1,
    'https://curl.se/dev/secprocess.html' => 1,
    'https://curl.se/dev/sourceactivity.html' => 1,
    'https://curl.se/docs/' => 1,
    'https://curl.se/docs/bugbounty.html' => 1,
    'https://curl.se/docs/caextract.html' => 1,
    'https://curl.se/docs/copyright.html' => 1,
    'https://curl.se/docs/http-cookies.html' => 1,
    'https://curl.se/docs/install.html' => 1,
    'https://curl.se/docs/knownbugs.html' => 1,
    'https://curl.se/docs/manpage.html' => 1,
    'https://curl.se/docs/releases.html' => 1,
    'https://curl.se/docs/security.html' => 1,
    'https://curl.se/docs/ssl-ciphers.html' => 1,
    'https://curl.se/docs/ssl-compared.html' => 1,
    'https://curl.se/docs/sslcerts.html' => 1,
    'https://curl.se/docs/thanks.html' => 1,
    'https://curl.se/docs/todo.html' => 1,
    'https://curl.se/docs/vulnerabilities.html' => 1,
    'https://curl.se/download.html' => 1,
    'https://curl.se/libcurl/' => 1,
    'https://curl.se/libcurl/c/CURLOPT_SSL_CIPHER_LIST.html' => 1,
    'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1,
    'https://curl.se/libcurl/c/CURLOPT_TLS13_CIPHERS.html' => 1,
    'https://curl.se/libcurl/c/libcurl.html' => 1,
    'https://curl.se/libcurl/c/threadsafe.html' => 1,
    'https://curl.se/logo/curl-logo.svg' => 1,
    'https://curl.se/mail/' => 1,
    'https://curl.se/mail/etiquette.html' => 1,
    'https://curl.se/mail/list.cgi?list=curl-distros' => 1,
    'https://curl.se/mail/list.cgi?list=curl-library' => 1,
    'https://curl.se/rfc/cookie_spec.html' => 1,
    'https://curl.se/rfc/rfc2255.txt' => 1,
    'https://curl.se/sponsors.html' => 1,
    'https://curl.se/support.html' => 1,
    'https://curl.se/windows/' => 1,

    'https://testclutch.curl.se/' => 1,

    'https://github.com/curl/curl-fuzzer' => 1,
    'https://github.com/curl/curl-www' => 1,
    'https://github.com/curl/curl/wcurl' => 1,

    );
|
|
|
|
# Results of the scan:
my %url;    # remote URL => space-separated list of "file:line" references
my %flink;  # local file link => space-separated list of "file:line" references

# With --dry-run, only list the URLs that would be checked; never probe them.
my $dry;
if(defined $ARGV[0] && $ARGV[0] eq "--dry-run") {
    $dry = 1;
    shift @ARGV;
}

# list all files to scan for links (only git-tracked files in these trees)
my @files=`git ls-files docs include lib scripts src`;
|
|
|
|
# Classify one link found in a source file.
#
# $f is the scanned file name, $line its line number, $link the raw link
# text. Remote http(s) URLs end up in %url keyed on the URL (unless
# whitelisted or deliberately ignored); anything else is treated as a
# link to a local file and stored in %flink, resolved relative to $f.
sub storelink {
    my ($f, $line, $link) = @_;

    if($link =~ /^\#/) {
        # ignore local-only links (pure anchors within the same document)
        return;
    }
    # cut off any anchor
    $link =~ s:\#.*\z::;

    if($link =~ /^(https|http):/) {
        if($whitelist{$link}) {
            # bump the counter so unused whitelist entries can be reported
            $whitelist{$link}++;
        }
        # example.com is just example (dot escaped so that only the
        # literal domains match, not e.g. "examplescom")
        elsif($link =~ /^https:\/\/(.*)example\.(com|org|net)/) {
            # ignored
        }
        # so is using the .example TLD
        elsif($link =~ /^https:\/\/(.*)\.example(\/|$|:)/) {
            # ignored
        }
        # so is using anything on localhost
        elsif($link =~ /^http(s|):\/\/localhost/) {
            # ignored
        }
        # ignore all links to curl's github repo
        elsif($link =~ /^https:\/\/github\.com\/curl\/curl(\/|$)/) {
            # ignored
        }
        # a bare IPv4 address is not a probe-able host name
        elsif($link =~ /^(https|http):\/\/[0-9.]+(\/|$)/) {
            # ignored
        }
        else {
            # remember where this URL was referenced
            $url{$link} .= "$f:$line ";
        }
        return;
    }

    # a file link: resolve it relative to the directory $f lives in
    my $dir = $f;
    $dir =~ s:([^/]*\z)::;

    if($link =~ s/(^\/)//) {
        # link starts with a slash, now removed: relative to the repo root
        $dir = "";
    }
    else {
        # strip one directory level for each leading "../"
        while($link =~ s:^\.\.\/::) {
            $dir =~ s:([^/]*)\/\z::;
        }
    }

    $flink{"./$dir$link"} .= "$f:$line ";
}
|
|
|
|
# Scan a single file for links and feed each one to storelink().
#
# Markdown files (by extension) are scanned for "](link)" style markdown
# links; every file is additionally scanned for raw https:// URLs.
# Unreadable files are silently skipped.
sub findlinks {
    my ($f) = @_;
    my $line = 1;
    # lexical filehandle instead of the bareword F so the handle cannot
    # leak into other scopes; :crlf normalizes CRLF line endings
    open(my $fh, "<:crlf", $f) ||
        return;

    # is it a markdown extension?
    my $md = ($f =~ /\.md$/i);

    while(<$fh>) {
        chomp;
        if($md && /\]\(([^)]*)/) {
            my $link = $1;
            storelink($f, $line, $link);
        }
        # ignore trailing: dot, double quote, single quote, asterisk, hash,
        # comma, question mark, colon, closing parenthesis, backslash,
        # closing angle bracket, whitespace, pipe, backtick, semicolon
        elsif(/(https:\/\/[a-z0-9.\/:%_+@-]+[^."'*\#,?:\)> \t|`;\\])/i) {
            storelink($f, $line, $1);
        }
        $line++;
    }
    close($fh);
}
|
|
|
|
# Probe a single URL with curl. Returns 0 when the URL is whitelisted or
# the probe succeeds, 1 when the probe fails. Prints "OK:"/"FAIL:" to
# stdout as it goes.
sub checkurl {
    my ($url) = @_;

    if($whitelist{$url}) {
        #print STDERR "$url is whitelisted\n";
        return 0;
    }

    # percent-encode '+' so it is not decoded to a space by the server
    $url =~ s/\+/%2B/g;
    my @content;
    # list-form pipe open: no shell is involved, so the URL cannot inject
    # shell metacharacters. -I = HEAD only, -L = follow redirects,
    # -f = fail on HTTP errors, -s = silent, -m10 = 10 second max time.
    if(open(my $fh, '-|', 'curl', '-ILfsm10', '--retry', '2', '--retry-delay', '5',
            '-A', 'Mozilla/curl.se link-probe', $url)) {
        @content = <$fh>;
        close $fh;
    }
    # no response headers at all means the transfer failed
    if(!$content[0]) {
        print "FAIL: $url\n";
        return 1; # fail
    }
    print "OK: $url\n";
    return 0; # ok
}
|
|
|
|
# Scan every tracked file, skipping this script itself.
for my $f (@files) {
    chomp $f;
    next if $f =~ m{/mdlinkcheck$};
    findlinks($f);
}
|
|
|
|
# Report whitelist entries that no scanned file actually used, so stale
# entries can be pruned. Goes to stderr so a URL count on stdout (e.g.
# `./mdlinkcheck --dry-run | wc -l`) is not polluted by these warnings.
for my $u (sort keys %whitelist) {
    if($whitelist{$u} == 1) {
        # use print, not printf: a '%' in a URL must not be interpreted
        # as a format directive
        print STDERR "warning: unused whitelist entry: '$u'\n";
    }
}
|
|
|
|
# In dry-run mode, just list the URLs that would be checked and quit.
if($dry) {
    print "$_\n" for sort keys %url;
    exit;
}
|
|
|
|
# Probe every collected URL. For each failing URL, record one error line
# per source location that references it.
my $error;
my @errlist;
for my $u (sort keys %url) {
    next if !checkurl($u);
    for my $ref (split(/ /, $url{$u})) {
        push @errlist, sprintf "%s ERROR links to missing URL %s\n", $ref, $u;
        $error++;
    }
}
|
|
|
|
# Verify every local file link points at a readable file; record one
# error line per source location referencing a missing target.
for my $l (sort keys %flink) {
    next if -r $l;
    for my $ref (split(/ /, $flink{$l})) {
        push @errlist, sprintf "%s ERROR links to missing file %s\n", $ref, $l;
        $error++;
    }
}
|
|
|
|
# Summarize and exit non-zero when anything was broken.
printf "Checked %d URLs\n", scalar(keys %url);
if($error) {
    # NOTE(review): $error counts referencing locations, not distinct
    # URLs, so this number can exceed the number of failing URLs
    print "$error URLs had problems:\n";
    print $_ for @errlist;
}
exit 1 if($error);