#!/usr/local/bin/perl -wT
#
# W3C Link Checker
# by Hugo Haas
# (c) 1999-2011 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
# This program is licensed under the W3C(r) Software License:
#     http://www.w3.org/Consortium/Legal/copyright-software
#
# The documentation is at:
#     http://validator.w3.org/docs/checklink.html
#
# See the Mercurial interface at:
#     http://dvcs.w3.org/hg/link-checker/
#
# An online version is available at:
#     http://validator.w3.org/checklink
#
# Comments and suggestions should be sent to the www-validator mailing list:
#     www-validator@w3.org (with 'checklink' in the subject)
#     http://lists.w3.org/Archives/Public/www-validator/ (archives)
#
# Small modifications in March 2020 by Karl Berry
# (contributed under the same license, or public domain if you prefer).
# I started from https://metacpan.org/release/W3C-LinkChecker, version 4.81.
# - (&simple_request) ignore "Argument isn't numeric" warnings.
# - (%Opts, &check_uri) new option --exclude-url-file; see --help message.
# - (&parse_arguments) allow multiple -X options.
# - (&check_uri) missing argument to hprintf.
# - (&hprintf) avoid useless warnings when undef is returned.
# The ideas are (1) to avoid rechecking every url during development,
# and (2) to make the exclude list easier to maintain,
# and (3) to eliminate useless warnings from the code.
#
# For GNU Automake, this program is used by the checklinkx target
# in doc/local.mk to check the (html output of) automake manual.

use strict;
use 5.008;

# Get rid of potentially unsafe and unneeded environment variables.
delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
$ENV{PATH} = undef;

# ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib,
# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
use Config qw(%Config);
use lib map { /(.*)/ }
    defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB})
    : defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB})
    :                          ();

# -----------------------------------------------------------------------------

package W3C::UserAgent;

use LWP::RobotUA 1.19 qw();
use LWP::UserAgent qw();
use Net::HTTP::Methods 5.833 qw();    # >= 5.833 for 4kB cookies (#6678)

# if 0, ignore robots exclusion (useful for testing)
use constant USE_ROBOT_UA => 1;

if (USE_ROBOT_UA) {
    @W3C::UserAgent::ISA = qw(LWP::RobotUA);
}
else {
    @W3C::UserAgent::ISA = qw(LWP::UserAgent);
}

sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my ($name, $from, $rules) = @_;

    # For security/privacy reasons, if $from was not given, do not send it.
    # Cheat by defining something for the constructor, and resetting it later.
my $from_ok = $from; $from ||= 'www-validator@w3.org'; my $self; if (USE_ROBOT_UA) { $self = $class->SUPER::new($name, $from, $rules); } else { my %cnf; @cnf{qw(agent from)} = ($name, $from); $self = LWP::UserAgent->new(%cnf); $self = bless $self, $class; } $self->from(undef) unless $from_ok; $self->env_proxy(); $self->allow_private_ips(1); $self->protocols_forbidden([qw(mailto javascript)]); return $self; } sub allow_private_ips { my $self = shift; if (@_) { $self->{Checklink_allow_private_ips} = shift; if (!$self->{Checklink_allow_private_ips}) { # Pull in dependencies require Net::IP; require Socket; require Net::hostent; } } return $self->{Checklink_allow_private_ips}; } sub redirect_progress_callback { my $self = shift; $self->{Checklink_redirect_callback} = shift if @_; return $self->{Checklink_redirect_callback}; } sub simple_request { my $self = shift; my $response = $self->ip_disallowed($_[0]->uri()); # RFC 2616, section 15.1.3 $_[0]->remove_header("Referer") if ($_[0]->referer() && (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure())); $response ||= do { local $SIG{__WARN__} = sub { # Suppress RobotRules warnings, rt.cpan.org #18902 # Suppress "Argument isn't numeric" warnings, see below. warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/ && $_[0] !~ /^Argument .* isn't numeric.*Response\.pm/ ); }; # @@@ Why not just $self->SUPER::simple_request? [--unknown] # --- Indeed. Further, why use simple_request in the first place? # It is not part of the UserAgent UI. I believe this can result # in warnings like: # Argument "0, 0, 0, 0" isn't numeric in numeric gt (>) at # /usr/local/lib/perl5/site_perl/5.30.2/HTTP/Response.pm line 261. # when checking, e.g., # https://metacpan.org/pod/distribution/Test-Harness/bin/prove # For testing, here is a three-line html file to check that url: # X #
#   <html><body>
#   <a href="https://metacpan.org/pod/distribution/Test-Harness/bin/prove">prove</a>
#   </body></html>
# # I have been unable to reproduce the warning with a test program # checking that url using $ua->request(), or other UserAgent # functions, even after carefully reproducing all the headers # that checklink sends in the request. --karl@freefriends.org. $self->W3C::UserAgent::SUPER::simple_request(@_); }; if (!defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message() || '(no message)'; } return $response; } sub redirect_ok { my ($self, $request, $response) = @_; if (my $callback = $self->redirect_progress_callback()) { # @@@ TODO: when an LWP internal robots.txt request gets redirected, # this will a bit confusingly fire for it too. Would need a robust # way to determine whether the request is such a LWP "internal # robots.txt" one. &$callback($request->method(), $request->uri()); } return 0 unless $self->SUPER::redirect_ok($request, $response); if (my $res = $self->ip_disallowed($request->uri())) { $response->previous($response->clone()); $response->request($request); $response->code($res->code()); $response->message($res->message()); return 0; } return 1; } # # Checks whether we're allowed to retrieve the document based on its IP # address. Takes an URI object and returns a HTTP::Response containing the # appropriate status and error message if the IP was disallowed, 0 # otherwise. URIs without hostname or IP address are always allowed, # including schemes where those make no sense (eg. data:, often javascript:). # sub ip_disallowed { my ($self, $uri) = @_; return 0 if $self->allow_private_ips(); # Short-circuit my $hostname = undef; eval { $hostname = $uri->host() }; # Not all URIs implement host()... return 0 unless $hostname; my $addr = my $iptype = my $resp = undef; if (my $host = Net::hostent::gethostbyname($hostname)) { $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { $resp = HTTP::Response->new(403, 'Checking non-public IP address disallowed by link checker configuration' ); $resp->header('Client-Warning', 'Internal response'); } return $resp; } # ----------------------------------------------------------------------------- package W3C::LinkChecker; use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION $DocType $Head $Accept $ContentTypes %Cfg $CssUrl); use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes use CSS::DOM::Constants qw(:rule); use CSS::DOM::Style qw(); use CSS::DOM::Util qw(); use Encode qw(); use HTML::Entities qw(); use HTML::Parser 3.40 qw(); # >= 3.40 for utf8_mode() use HTTP::Headers::Util qw(); use HTTP::Message 5.827 qw(); # >= 5.827 for content_charset() use HTTP::Request 5.814 qw(); # >= 5.814 for accept_decodable() use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() use Time::HiRes qw(); use URI 1.53 qw(); # >= 1.53 for secure() use URI::Escape qw(); use URI::Heuristic qw(); # @@@ Needs also W3C::UserAgent but can't use() it here. use constant RC_ROBOTS_TXT => -1; use constant RC_DNS_ERROR => -2; use constant RC_IP_DISALLOWED => -3; use constant RC_PROTOCOL_DISALLOWED => -4; use constant LINE_UNKNOWN => -1; use constant MP2 => (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); # Tag=>attribute mapping of things we treat as links. # Note: meta/@http-equiv gets special treatment, see start() for details. 
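# (Illustrative reading of the table below, using entries that are already
#  there: img => ['longdesc', 'src'] means that both the longdesc and src
#  attribute values of <img> are extracted and checked as links.)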
use constant LINK_ATTRS => { a => ['href'], # base/@href intentionally not checked # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi area => ['href'], audio => ['src'], blockquote => ['cite'], body => ['background'], command => ['icon'], # button/@formaction not checked (side effects) del => ['cite'], # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary embed => ['href', 'pluginspage', 'pluginurl', 'src'], # form/@action not checked (side effects) frame => ['longdesc', 'src'], html => ['manifest'], iframe => ['longdesc', 'src'], img => ['longdesc', 'src'], # input/@action, input/@formaction not checked (side effects) input => ['src'], ins => ['cite'], link => ['href'], object => ['data'], q => ['cite'], script => ['src'], source => ['src'], track => ['src'], video => ['src', 'poster'], }; # Tag=>[separator, attributes] mapping of things we treat as lists of links. use constant LINK_LIST_ATTRS => { a => [qr/\s+/, ['ping']], applet => [qr/[\s,]+/, ['archive']], area => [qr/\s+/, ['ping']], head => [qr/\s+/, ['profile']], object => [qr/\s+/, ['archive']], }; # TBD/TODO: # - applet/@code? # - bgsound/@src? # - object/@classid? # - isindex/@action? # - layer/@background,@src? # - ilayer/@background? # - table,tr,td,th/@background? # - xmp/@href? @W3C::LinkChecker::ISA = qw(HTML::Parser); BEGIN { # Version info $PACKAGE = 'W3C Link Checker'; $PROGRAM = 'W3C-checklink'; $VERSION = '4.81'; $REVISION = sprintf('version %s (c) 1999-2011 W3C', $VERSION); $AGENT = sprintf( '%s/%s %s', $PROGRAM, $VERSION, ( W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() : LWP::UserAgent->_agent() ) ); # Pull in mod_perl modules if applicable. eval { local $SIG{__DIE__} = undef; require Apache2::RequestUtil; } if MP2(); my @content_types = qw( text/html application/xhtml+xml;q=0.9 application/vnd.wap.xhtml+xml;q=0.6 ); $Accept = join(', ', @content_types, '*/*;q=0.5'); push(@content_types, 'text/css', 'text/html-sandboxed'); my $re = join('|', map { s/;.*//; quotemeta } @content_types); $ContentTypes = qr{\b(?:$re)\b}io; # Regexp for matching URL values in CSS. $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/; # # Read configuration. If the W3C_CHECKLINK_CFG environment variable has # been set or the default contains a non-empty file, read it. Otherwise, # skip silently. # my $defaultconfig = '/etc/w3c/checklink.conf'; if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { require Config::General; Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; eval { my %config_opts = ( -ConfigFile => $conffile, -SplitPolicy => 'equalsign', -AllowMultiOptions => 'no', ); %Cfg = Config::General->new(%config_opts)->getall(); }; if ($@) { die <<"EOF"; Failed to read configuration from '$conffile': $@ EOF } } $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s'; $Cfg{CSS_Validator_URI} ||= 'http://jigsaw.w3.org/css-validator/validator?uri=%s'; $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html'; # Untaint config params that are used as the format argument to (s)printf(), # Perl 5.10 does not want to see that in taint mode. 
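# (For illustration: these format strings are later expanded with sprintf(),
#  e.g. sprintf($Cfg{Markup_Validator_URI}, $esc_uri) in check_uri(), and a
#  tainted format string would make taint mode complain at that point.)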
($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/); ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/); $DocType = ''; my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI}); my $js_url = URI->new_abs('linkchecker.js', $Cfg{Doc_URI}); $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url); EOF # Trusted environment variables that need laundering in taint mode. for (qw(NNTPSERVER NEWSHOST)) { ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; } # Use passive FTP by default, see Net::FTP(3). $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); } # Autoflush $| = 1; # Different options specified by the user my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); my %Opts = ( Command_Line => $cmdline, Quiet => 0, Summary_Only => 0, Verbose => 0, Progress => 0, HTML => 0, Timeout => 30, Redirects => 1, Dir_Redirects => 1, Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, Cookies => undef, No_Referer => 0, Hide_Same_Realm => 0, Depth => 0, # < 0 means unlimited recursion. Sleep_Time => 1, Connection_Cache_Size => 2, Max_Documents => 150, # For the online version. User => undef, Password => undef, Base_Locations => [], Exclude => undef, Exclude_Docs => undef, Exclude_Url_File => undef, Suppress_Redirect => [], Suppress_Redirect_Prefix => [], Suppress_Redirect_Regexp => [], Suppress_Temp_Redirects => 1, Suppress_Broken => [], Suppress_Fragment => [], Masquerade => 0, Masquerade_From => '', Masquerade_To => '', Trusted => $Cfg{Trusted}, Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ? $Cfg{Allow_Private_IPs} : $cmdline, ); undef $cmdline; # Global variables # What URI's did we process? (used for recursive mode) my %processed; # Result of the HTTP query my %results; # List of redirects my %redirects; # Count of the number of documents checked my $doc_count = 0; # Time stamp my $timestamp = &get_timestamp(); # Per-document header; undefined if already printed. See print_doc_header(). 
my $doc_header; &parse_arguments() if $Opts{Command_Line}; my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address $ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}}); if ($ua->can('delay')) { $ua->delay($Opts{Sleep_Time} / 60); } $ua->timeout($Opts{Timeout}); # Set up cookie stash if requested if (defined($Opts{Cookies})) { require HTTP::Cookies; my $cookie_file = $Opts{Cookies}; if ($cookie_file eq 'tmp') { $cookie_file = undef; } elsif ($cookie_file =~ /^(.*)$/) { $cookie_file = $1; # untaint } $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1)); } eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); }; if ($@) { die <<"EOF"; Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and Net::hostent modules: $@ EOF } # Add configured forbidden protocols if ($Cfg{Forbidden_Protocols}) { my $forbidden = $ua->protocols_forbidden(); push(@$forbidden, split(/[,\s]+/, lc($Cfg{Forbidden_Protocols}))); $ua->protocols_forbidden($forbidden); } if ($Opts{Command_Line}) { require Text::Wrap; Text::Wrap->import('wrap'); require URI::file; &usage(1) unless scalar(@ARGV); $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output &ask_password() if ($Opts{User} && !$Opts{Password}); if (!$Opts{Summary_Only}) { printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; } else { $Opts{Verbose} = 0; $Opts{Progress} = 0; } # Populate data for print_form() my %params = ( summary => $Opts{Summary_Only}, hide_redirects => !$Opts{Redirects}, hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all', no_accept_language => !( defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto' ), no_referer => $Opts{No_Referer}, recursive => ($Opts{Depth} != 0), depth => $Opts{Depth}, ); my $check_num = 1; my @bases = @{$Opts{Base_Locations}}; for my $uri (@ARGV) { # Reset base locations so that previous URI's given on the command line # won't affect the recursion scope for this URI (see check_uri()) @{$Opts{Base_Locations}} = @bases; # Transform the parameter into a URI $uri = &urize($uri); $params{uri} = $uri; &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1); $check_num++; } undef $check_num; if ($Opts{HTML}) { &html_footer(); } elsif ($doc_count > 0 && !$Opts{Summary_Only}) { printf("\n%s\n", &global_stats()); } } else { require CGI; require CGI::Carp; CGI::Carp->import(qw(fatalsToBrowser)); require CGI::Cookie; # file: URIs are not allowed in CGI mode my $forbidden = $ua->protocols_forbidden(); push(@$forbidden, 'file'); $ua->protocols_forbidden($forbidden); my $query = CGI->new(); for my $param ($query->param()) { my @values = map { Encode::decode_utf8($_) } $query->param($param); $query->param($param, @values); } # Set a few parameters in CGI mode $Opts{Verbose} = 0; $Opts{Progress} = 0; $Opts{HTML} = 1; $Opts{_Self_URI} = $query->url(-relative => 1); # Backwards compatibility my $uri = undef; if ($uri = $query->param('url')) { $query->param('uri', $uri) unless $query->param('uri'); $query->delete('url'); } $uri = $query->param('uri'); if (!$uri) { &html_header('', undef); # Set cookie only from results page. 
my %cookies = CGI::Cookie->fetch(); &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1); &html_footer(); exit; } # Backwards compatibility if ($query->param('hide_dir_redirects')) { $query->param('hide_redirects', 'on'); $query->param('hide_type', 'dir'); $query->delete('hide_dir_redirects'); } $Opts{Summary_Only} = 1 if $query->param('summary'); if ($query->param('hide_redirects')) { $Opts{Dir_Redirects} = 0; if (my $type = $query->param('hide_type')) { $Opts{Redirects} = 0 if ($type ne 'dir'); } else { $Opts{Redirects} = 0; } } $Opts{Accept_Language} = undef if $query->param('no_accept_language'); $Opts{No_Referer} = $query->param('no_referer'); $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); if (my $depth = $query->param('depth')) { # @@@ Ignore invalid depth silently for now. $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); } # Save, clear or leave cookie as is. my $cookie = undef; if (my $action = $query->param('cookie')) { if ($action eq 'clear') { # Clear the cookie. $cookie = CGI::Cookie->new(-name => $PROGRAM); $cookie->value({clear => 1}); $cookie->expires('-1M'); } elsif ($action eq 'set') { # Set the options. $cookie = CGI::Cookie->new(-name => $PROGRAM); my %options = $query->Vars(); delete($options{$_}) for qw(url uri check cookie); # Non-persistent. $cookie->value(\%options); } } if (!$cookie) { my %cookies = CGI::Cookie->fetch(); $cookie = $cookies{$PROGRAM}; } # Always refresh cookie expiration time. $cookie->expires('+1M') if ($cookie && !$cookie->expires()); # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. # If we're under mod_perl, there is a way around it... eval { local $SIG{__DIE__} = undef; my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; $ENV{HTTP_AUTHORIZATION} = $auth if $auth; } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); $uri =~ s/^\s+//g; if ($uri =~ /:/) { $uri = URI->new($uri); } else { if ($uri =~ m|^//|) { $uri = URI->new("http:$uri"); } else { local $ENV{URL_GUESS_PATTERN} = ''; my $guess = URI::Heuristic::uf_uri($uri); if ($guess->scheme() && $ua->is_protocol_supported($guess)) { $uri = $guess; } else { $uri = URI->new("http://$uri"); } } } $uri = $uri->canonical(); $query->param("uri", $uri); &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie); undef $query; # Not needed any more. &html_footer(); } ############################################################################### ################################ # Command line and usage stuff # ################################ sub parse_arguments () { require Encode::Locale; Encode::Locale::decode_argv(); require Getopt::Long; Getopt::Long->require_version(2.17); Getopt::Long->import('GetOptions'); Getopt::Long::Configure('bundling', 'no_ignore_case'); my $masq = ''; my @locs = (); GetOptions( 'help|h|?' 
=> sub { usage(0) }, 'q|quiet' => sub { $Opts{Quiet} = 1; $Opts{Summary_Only} = 1; }, 's|summary' => \$Opts{Summary_Only}, 'b|broken' => sub { $Opts{Redirects} = 0; $Opts{Dir_Redirects} = 0; }, 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, 'v|verbose' => \$Opts{Verbose}, 'i|indicator' => \$Opts{Progress}, 'H|html' => \$Opts{HTML}, 'r|recursive' => sub { $Opts{Depth} = -1 if $Opts{Depth} == 0; }, 'l|location=s' => \@locs, 'X|exclude=s@' => \@{$Opts{Exclude}}, 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, 'exclude-url-file=s' => \$Opts{Exclude_Url_File}, 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects}, 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, 'u|user=s' => \$Opts{User}, 'p|password=s' => \$Opts{Password}, 't|timeout=i' => \$Opts{Timeout}, 'C|connection-cache=i' => \$Opts{Connection_Cache_Size}, 'S|sleep=i' => \$Opts{Sleep_Time}, 'L|languages=s' => \$Opts{Accept_Language}, 'c|cookies=s' => \$Opts{Cookies}, 'R|no-referer' => \$Opts{No_Referer}, 'D|depth=i' => sub { $Opts{Depth} = $_[1] unless $_[1] == 0; }, 'd|domain=s' => \$Opts{Trusted}, 'masquerade=s' => \$masq, 'hide-same-realm' => \$Opts{Hide_Same_Realm}, 'V|version' => \&version, ) || usage(1); if ($masq) { $Opts{Masquerade} = 1; my @masq = split(/\s+/, $masq); if (scalar(@masq) != 2 || !defined($masq[0]) || $masq[0] !~ /\S/ || !defined($masq[1]) || $masq[1] !~ /\S/) { usage(1, "Error: --masquerade takes two whitespace separated URIs."); } else { require URI::file; $Opts{Masquerade_From} = $masq[0]; my $u = URI->new($masq[1]); $Opts{Masquerade_To} = $u->scheme() ? $u : URI::file->new_abs($masq[1]); } } if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { $Opts{Accept_Language} = &guess_language(); } if (($Opts{Sleep_Time} || 0) < 1) { warn( "*** Warning: minimum allowed sleep time is 1 second, resetting.\n" ); $Opts{Sleep_Time} = 1; } push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); for my $i (0 .. $#{$Opts{Exclude_Docs}}) { eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; &usage(1, "Error in exclude-docs regexp: $@") if $@; } if (defined($Opts{Trusted})) { eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; &usage(1, "Error in trusted domains regexp: $@") if $@; } # Sanity-check error-suppression arguments for my $i (0 .. $#{$Opts{Suppress_Redirect}}) { ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/; my $sr_arg = ${$Opts{Suppress_Redirect}}[$i]; if ($sr_arg !~ /.->./) { &usage(1, "Bad suppress-redirect argument, should contain \"->\": $sr_arg" ); } } for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) { my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i]; $srp_arg =~ s/ /->/; if ($srp_arg !~ /^(.*)->(.*)$/) { &usage(1, "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg" ); } # Turn prefixes into a regexp. ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism; } for my $i (0 .. 
$#{$Opts{Suppress_Broken}}) { ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/; my $sb_arg = ${$Opts{Suppress_Broken}}[$i]; if ($sb_arg !~ /^(-1|[0-9]+):./) { &usage(1, "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg" ); } } for my $sf_arg (@{$Opts{Suppress_Fragment}}) { if ($sf_arg !~ /.#./) { &usage(1, "Bad suppress-fragment argument, should contain \"#\": $sf_arg" ); } } if ($#{$Opts{Exclude}} > 0) { # convert $Opts{Exclude} array into regexp by parenthesizing # each and inserting alternations between. my $exclude_rx = join("|", map { "($_)" } @{$Opts{Exclude}}); # # For the sake of the rest of the program, pretend the option # was that string all along. $Opts{Exclude} = $exclude_rx; } if ($Opts{Exclude_Url_File}) { # The idea is that if the specified file exists, we read it and # treat it as a list of excludes. If the file doesn't exist, we # write it with all the urls that were successful. That way, we # can avoid re-checking them on every run, and it can be removed # externally (from cron) to get re-updated. # # We distinguish the cases here, and either add to # $Opts{Exclude} if reading, or setting Exclude_File_Write in # %Opts if writing (even though it is not really an option, # but it's the most convenient place). if (-s $Opts{Exclude_Url_File}) { open (my $xf, "$Opts{Exclude_Url_File}") || &usage(1, "Could not open $Opts{Exclude_Url_File}" . " for reading: $!"); my @xf = (); while (<$xf>) { chomp; # the file is urls, not regexps, so quotemeta. push (@xf, "(" . quotemeta($_) . ")"); } my $xf_rx = join ("|", @xf); if ($Opts{Exclude}) { $Opts{Exclude} .= "|$xf_rx"; } else { $Opts{Exclude} = $xf_rx; } } else { open ($Opts{Exclude_File_Write}, ">$Opts{Exclude_Url_File}") || &usage(1, "Could not open $Opts{Exclude_Url_File} for writing: $!"); # we write on a successful retrieve, and don't bother closing. } } # Precompile/error-check final list of regular expressions if (defined($Opts{Exclude})) { eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; &usage(1, "Error in exclude regexp $Opts{Exclude}: $@") if $@; } return; } sub version () { print "$PACKAGE $REVISION\n"; exit 0; } sub usage () { my ($exitval, $msg) = @_; $exitval = 0 unless defined($exitval); $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg; die($msg) unless $Opts{Command_Line}; my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; select(STDERR) if $exitval; print "$msg$PACKAGE $REVISION Usage: checklink Options: -s, --summary Result summary only. -b, --broken Show only the broken links, not the redirects. -e, --directory Hide directory redirects, for example http://www.w3.org/TR -> http://www.w3.org/TR/ -r, --recursive Check the documents linked from the first one. -D, --depth N Check the documents linked from the first one to depth N (implies --recursive). -l, --location URI Scope of the documents checked in recursive mode (implies --recursive). Can be specified multiple times. If not specified, the default eg. for http://www.w3.org/TR/html4/Overview.html would be http://www.w3.org/TR/html4/ -X, --exclude REGEXP Do not check links whose full, canonical URIs match REGEXP; also limits recursion the same way as --exclude-docs with the same regexp would. This option may be specified multiple times. --exclude-docs REGEXP In recursive mode, do not check links in documents whose full, canonical URIs match REGEXP. This option may be specified multiple times. 
--exclude-url-file FILE If FILE exists, treat each line as a string specifying another exclude; quotemeta is called to make them regexps. If FILE does not exist, open it for writing and write each checked url which gets a 200 response to it. --suppress-redirect URI->URI Do not report a redirect from the first to the second URI. This option may be specified multiple times. --suppress-redirect-prefix URI->URI Do not report a redirect from a child of the first URI to the same child of the second URI. This option may be specified multiple times. --suppress-temp-redirects Suppress warnings about temporary redirects. --suppress-broken CODE:URI Do not report a broken link with the given CODE. CODE is HTTP response, or -1 for robots exclusion. This option may be specified multiple times. --suppress-fragment URI Do not report the given broken fragment URI. A fragment URI contains \"#\". This option may be specified multiple times. -L, --languages LANGS Accept-Language header to send. The special value 'auto' causes autodetection from the environment. -c, --cookies FILE Use cookies, load/save them in FILE. The special value 'tmp' causes non-persistent use of cookies. -R, --no-referer Do not send the Referer HTTP header. -q, --quiet No output if no errors are found (implies -s). -v, --verbose Verbose mode. -i, --indicator Show percentage of lines processed while parsing. -u, --user USERNAME Specify a username for authentication. -p, --password PASSWORD Specify a password. --hide-same-realm Hide 401's that are in the same realm as the document checked. -S, --sleep SECS Sleep SECS seconds between requests to each server (default and minimum: 1 second). -t, --timeout SECS Timeout for requests in seconds (default: 30). -d, --domain DOMAIN Regular expression describing the domain to which authentication information will be sent (default: $trust). --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2. See the manual page for more information. -H, --html HTML output. -?, -h, --help Show this message and exit. -V, --version Output version information and exit. See \"perldoc LWP\" for information about proxy server support, \"perldoc Net::FTP\" for information about various environment variables affecting FTP connections and \"perldoc Net::NNTP\" for setting a default NNTP server for news: URIs. The W3C_CHECKLINK_CFG environment variable can be used to set the configuration file to use. See details in the full manual page, it can be displayed with: perldoc checklink More documentation at: $Cfg{Doc_URI} Please send bug reports and comments to the www-validator mailing list: www-validator\@w3.org (with 'checklink' in the subject) Archives are at: http://lists.w3.org/Archives/Public/www-validator/ "; exit $exitval; } sub ask_password () { eval { local $SIG{__DIE__} = undef; require Term::ReadKey; Term::ReadKey->require_version(2.00); Term::ReadKey->import(qw(ReadMode)); }; if ($@) { warn('Warning: Term::ReadKey 2.00 or newer not available, ' . 
"password input disabled.\n"); return; } printf(STDERR 'Enter the password for user %s: ', $Opts{User}); ReadMode('noecho', *STDIN); chomp($Opts{Password} = ); ReadMode('restore', *STDIN); print(STDERR "ok.\n"); return; } ############################################################################### ########################################################################### # Guess an Accept-Language header based on the $LANG environment variable # ########################################################################### sub guess_language () { my $lang = $ENV{LANG} or return; $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); my $res = undef; eval { require Locale::Language; if (my $tmp = Locale::Language::language2code($lang)) { $lang = $tmp; } if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { if (Locale::Language::code2language($l)) { $res = $l; if ($c) { require Locale::Country; $res .= "-$c" if Locale::Country::code2country($c); } } } }; return $res; } ############################ # Transform foo into a URI # ############################ sub urize ($) { my $arg = shift; my $uarg = URI::Escape::uri_unescape($arg); my $uri; if (-d $uarg) { # look for an "index" file in dir, return it if found require File::Spec; for my $index (map { File::Spec->catfile($uarg, $_) } qw(index.html index.xhtml index.htm index.xhtm)) { if (-e $index) { $uri = URI::file->new_abs($index); last; } } # return dir itself if an index file was not found $uri ||= URI::file->new_abs($uarg); } elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) { $uri = URI::file->new_abs($uarg); } else { my $newuri = URI->new($arg); if ($newuri->scheme()) { $uri = $newuri; } else { local $ENV{URL_GUESS_PATTERN} = ''; $uri = URI::Heuristic::uf_uri($arg); $uri = URI::file->new_abs($uri) unless $uri->scheme(); } } return $uri->canonical(); } ######################################## # Check for broken links in a resource # ######################################## sub check_uri (\%\$$$$;\$$) { my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_; $is_start ||= ($check_num == 1); my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); # Get and parse the document my $response = &get_document( 'GET', $uri, $doc_count, \%redirects, $referer, $cookie, $params, $check_num, $is_start ); # Can we check the resource? If not, we exit here... return if defined($response->{Stop}); if ($Opts{HTML}) { &html_header($uri, $cookie) if ($check_num == 1); &print_form($params, $cookie, $check_num) if $is_start; } if ($is_start) { # Starting point of a new check, eg. from the command line # Use the first URI as the recursion base unless specified otherwise. push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) unless @{$Opts{Base_Locations}}; } else { # Before fetching the document, we don't know if we'll be within the # recursion scope or not (think redirects). if (!&in_recursion_scope($response->{absolute_uri})) { hprintf("Not in recursion scope: %s\n", $response->{absolute_uri}) if ($Opts{Verbose}); $response->content(""); return; } } # Define the document header, and perhaps print it. # (It might still be defined if the previous document had no errors; # just redefine it in that case.) if ($check_num != 1) { if ($Opts{HTML}) { $doc_header = "\n
\n"; } else { $doc_header = "\n" . ('-' x 40) . "\n"; } } if ($Opts{HTML}) { $doc_header .= ("

\nProcessing\t" . &show_url($response->{absolute_uri}) . "\n

\n\n"); } else { $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n"; } if (!$Opts{Quiet}) { print_doc_header(); } # We are checking a new document $doc_count++; my $result_anchor = 'results' . $doc_count; if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) { my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; my $acclang = $Opts{Accept_Language} || '(not sent)'; my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; my $cookies = 'not used'; if (defined($Opts{Cookies})) { $cookies = 'used, '; if ($Opts{Cookies} eq 'tmp') { $cookies .= 'non-persistent'; } else { $cookies .= "file $Opts{Cookies}"; } } printf( <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s); Settings used: - Accept: %s - Accept-Language: %s - Referer: %s - Cookies: %s - Sleeping %d second%s between requests to each server EOF printf("- Excluding links matching %s\n", $Opts{Exclude}) if defined($Opts{Exclude}); printf("- Excluding links in documents whose URIs match %s\n", join(', ', @{$Opts{Exclude_Docs}})) if @{$Opts{Exclude_Docs}}; } if ($Opts{HTML}) { if (!$Opts{Summary_Only}) { my $accept = &encode($Accept); my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; printf( <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
Settings used:
EOF printf("

Go to the results.

\n", $result_anchor); my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri}, "^A-Za-z0-9."); print "

For reliable link checking results, check "; if (!$response->{IsCss}) { printf("HTML validity and ", &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri))); } printf( "CSS validity first.

Back to the link checker.

\n", &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), &encode($Opts{_Self_URI}) ); printf(<<'EOF', $result_anchor);

Status:

EOF
        }
    }

    if ($Opts{Summary_Only} && !$Opts{Quiet}) {
        print '

' if $Opts{HTML}; print 'This may take some time'; print "... (why?)

" if $Opts{HTML}; print " if the document has many links to check.\n" unless $Opts{HTML}; } # Record that we have processed this resource $processed{$response->{absolute_uri}} = 1; # Parse the document my $p = &parse_document($uri, $response->base(), $response, 1, ($depth != 0)); my $base = URI->new($p->{base}); # Check anchors ############### print "Checking anchors...\n" unless $Opts{Summary_Only}; my %errors; while (my ($anchor, $lines) = each(%{$p->{Anchors}})) { if (!length($anchor)) { # Empty IDREF's are not allowed $errors{$anchor} = 1; } else { my $times = 0; $times += $_ for values(%$lines); # They should appear only once $errors{$anchor} = 1 if ($times > 1); } } print " done.\n" unless $Opts{Summary_Only}; # Check links ############# &hprintf("Recording all the links found: %d\n", scalar(keys %{$p->{Links}})) if ($Opts{Verbose}); my %links; my %hostlinks; # Record all the links found while (my ($link, $lines) = each(%{$p->{Links}})) { my $link_uri = URI->new($link); my $abs_link_uri = URI->new_abs($link_uri, $base); if ($Opts{Masquerade}) { if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) { print_doc_header(); printf("processing %s in base %s\n", $abs_link_uri, $Opts{Masquerade_To}); my $nlink = $abs_link_uri; $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|; $abs_link_uri = URI->new($nlink); } } my $canon_uri = URI->new($abs_link_uri->canonical()); my $fragment = $canon_uri->fragment(undef); if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) { if (!exists($links{$canon_uri})) { my $hostport; $hostport = $canon_uri->host_port() if $canon_uri->can('host_port'); $hostport = '' unless defined $hostport; push(@{$hostlinks{$hostport}}, $canon_uri); } for my $line_num (keys(%$lines)) { if (!defined($fragment) || !length($fragment)) { # Document without fragment $links{$canon_uri}{location}{$line_num} = 1; } else { # Resource with a fragment $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1; } } } else { hprintf("excluded via options: %s\n", $canon_uri) if ($Opts{Verbose}); } } my @order = &distribute_links(\%hostlinks); undef %hostlinks; # Build the list of broken URI's my $nlinks = scalar(@order); &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) if ($Opts{Verbose}); my %broken; my $link_num = 0; for my $u (@order) { my $ulinks = $links{$u}; if ($Opts{Summary_Only}) { # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); } else { &hprintf("\nChecking link %s\n", $u); my $progress = ($link_num / $nlinks) * 100; printf( '', $result_anchor, &encode($u), $progress) if (!$Opts{Command_Line} && $Opts{HTML} && !$Opts{Summary_Only}); } $link_num++; # Check that a link is valid &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)), \%links, \%redirects); &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) if ($Opts{Verbose}); if ($Opts{Exclude_File_Write} && $results{$u}{location}{code} == 200) { my $fh = $Opts{Exclude_File_Write}; print $fh ("$u\n"); } if ($results{$u}{location}{success}) { # Even though it was not broken, we might want to display it # on the results page (e.g. 
because it required authentication) $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400); # List the broken fragments while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) { my $fragment_ok = $results{$u}{fragments}{$fragment}; if ($Opts{Verbose}) { my @line_nums = sort { $a <=> $b } keys(%$lines); &hprintf( "\t\t%s %s - Line%s: %s\n", $fragment, $fragment_ok ? 'OK' : 'Not found', (scalar(@line_nums) > 1) ? 's' : '', join(', ', @line_nums) ); } # A broken fragment? $broken{$u}{fragments}{$fragment} += 2 unless $fragment_ok; } } elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) { # Couldn't find the document $broken{$u}{location} = 1; # All the fragments associated are hence broken for my $fragment (keys %{$ulinks->{fragments}}) { $broken{$u}{fragments}{$fragment}++; } } } &hprintf( "\nProcessed in %s seconds.\n", &time_diff($start, &get_timestamp()) ) unless $Opts{Summary_Only}; printf( '', $result_anchor, &time_diff($start, &get_timestamp())) if ($Opts{HTML} && !$Opts{Summary_Only}); # Display results if ($Opts{HTML} && !$Opts{Summary_Only}) { print("
\n
\n"); printf("

Results

\n", $result_anchor); } print "\n" unless $Opts{Quiet}; &links_summary(\%links, \%results, \%broken, \%redirects); &anchors_summary($p->{Anchors}, \%errors); # Do we want to process other documents? if ($depth != 0) { for my $u (map { URI->new($_) } keys %links) { next unless $results{$u}{location}{success}; # Broken link? next unless &in_recursion_scope($u); # Do we understand its content type? next unless ($results{$u}{location}{type} =~ $ContentTypes); # Have we already processed this URI? next if &already_processed($u, $uri); # Do the job print "\n" unless $Opts{Quiet}; if ($Opts{HTML}) { if (!$Opts{Command_Line}) { if ($doc_count == $Opts{Max_Documents}) { print( "
\n

Maximum number of documents ($Opts{Max_Documents}) reached!

\n" ); } if ($doc_count >= $Opts{Max_Documents}) { $doc_count++; print("

Not checking $u

\n"); $processed{$u} = 1; next; } } } # This is an inherently recursive algorithm, so Perl's warning is not # helpful. You may wish to comment this out when debugging, though. no warnings 'recursion'; if ($depth < 0) { &check_uri($params, $u, 0, -1, $cookie, $uri); } else { &check_uri($params, $u, 0, $depth - 1, $cookie, $uri); } } } return; } ############################################################### # Distribute links based on host:port to avoid RobotUA delays # ############################################################### sub distribute_links(\%) { my $hostlinks = shift; # Hosts ordered by weight (number of links), descending my @order = sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) } keys %$hostlinks; # All link list flattened into one, in host weight order my @all; push(@all, @{$hostlinks->{$_}}) for @order; return @all if (scalar(@order) < 2); # Indexes and chunk size for "zipping" the end result list my $num = scalar(@{$hostlinks->{$order[0]}}); my @indexes = map { $_ * $num } (0 .. $num - 1); # Distribute them my @result; while (my @chunk = splice(@all, 0, $num)) { @result[@indexes] = @chunk; @indexes = map { $_ + 1 } @indexes; } # Weed out undefs @result = grep(defined, @result); return @result; } ########################################## # Decode Content-Encodings in a response # ########################################## sub decode_content ($) { my $response = shift; my $error = undef; my $docref = $response->decoded_content(ref => 1); if (defined($docref)) { utf8::encode($$docref); $response->content_ref($docref); # Remove Content-Encoding so it won't be decoded again later. $response->remove_header('Content-Encoding'); } else { my $ce = $response->header('Content-Encoding'); $ce = defined($ce) ? "'$ce'" : 'undefined'; my $ct = $response->header('Content-Type'); $ct = defined($ct) ? "'$ct'" : 'undefined'; my $request_uri = $response->request->url; my $cs = $response->content_charset(); $cs = defined($cs) ? "'$cs'" : 'unknown'; $error = "Error decoding document at <$request_uri>, Content-Type $ct, " . "Content-Encoding $ce, content charset $cs: '$@'"; } return $error; } ####################################### # Get and parse a resource to process # ####################################### sub get_document ($\$$;\%\$$$$$) { my ($method, $uri, $in_recursion, $redirects, $referer, $cookie, $params, $check_num, $is_start ) = @_; # $method contains the HTTP method the use (GET or HEAD) # $uri object contains the identifier of the resource # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least # the second resource checked) # $redirects is a pointer to the hash containing the map of the redirects # $referer is the URI object of the referring document # $cookie, $params, $check_num, and $is_start are for printing HTTP headers # and the form if $in_recursion == 0 and not authenticating # Get the resource my $response; if (defined($results{$uri}{response}) && !($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) { $response = $results{$uri}{response}; } else { $response = &get_uri($method, $uri, $referer); &record_results($uri, $method, $response, $referer); &record_redirects($redirects, $response); } if (!$response->is_success()) { if (!$in_recursion) { # Is it too late to request authentication? if ($response->code() == 401) { &authentication($response, $cookie, $params, $check_num, $is_start); } else { if ($Opts{HTML}) { &html_header($uri, $cookie) if ($check_num == 1); &print_form($params, $cookie, $check_num) if $is_start; print "

", &status_icon($response->code()); } &hprintf("\nError: %d %s\n", $response->code(), $response->message() || '(no message)'); print "

\n" if $Opts{HTML}; } } $response->{Stop} = 1; $response->content(""); return ($response); } # What is the URI of the resource that we are processing by the way? my $base_uri = $response->base(); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $request_uri->abs($base_uri); # Can we parse the document? my $failed_reason; my $ct = $response->header('Content-Type'); if (!$ct || $ct !~ $ContentTypes) { $failed_reason = "Content-Type for <$request_uri> is " . (defined($ct) ? "'$ct'" : 'undefined'); } else { $failed_reason = decode_content($response); } if ($failed_reason) { # No, there is a problem... if (!$in_recursion) { if ($Opts{HTML}) { &html_header($uri, $cookie) if ($check_num == 1); &print_form($params, $cookie, $check_num) if $is_start; print "

", &status_icon(406); } &hprintf("Can't check links: %s.\n", $failed_reason); print "

\n" if $Opts{HTML}; } $response->{Stop} = 1; $response->content(""); } # Ok, return the information return ($response); } ######################################################### # Check whether a URI is within the scope of recursion. # ######################################################### sub in_recursion_scope (\$) { my ($uri) = @_; return 0 unless $uri; my $candidate = $uri->canonical(); return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude}); for my $excluded_doc (@{$Opts{Exclude_Docs}}) { return 0 if ($candidate =~ $excluded_doc); } for my $base (@{$Opts{Base_Locations}}) { my $rel = $candidate->rel($base); next if ($candidate eq $rel); # Relative path not possible? next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? return 1; } return 0; # We always have at least one base location, but none matched. } ################################# # Check for content type match. # ################################# sub is_content_type ($$) { my ($candidate, $type) = @_; return 0 unless ($candidate && $type); my @v = HTTP::Headers::Util::split_header_words($candidate); return scalar(@v) ? $type eq lc($v[0]->[0]) : 0; } ################################################## # Check whether a URI has already been processed # ################################################## sub already_processed (\$\$) { my ($uri, $referer) = @_; # Don't be verbose for that part... my $summary_value = $Opts{Summary_Only}; $Opts{Summary_Only} = 1; # Do a GET: if it fails, we stop, if not, the results are cached my $response = &get_document('GET', $uri, 1, undef, $referer); # ... but just for that part $Opts{Summary_Only} = $summary_value; # Can we process the resource? return -1 if defined($response->{Stop}); # Have we already processed it? return 1 if defined($processed{$response->{absolute_uri}->as_string()}); # It's not processed yet and it is processable: return 0 return 0; } ############################ # Get the content of a URI # ############################ sub get_uri ($\$;\$$\%$$$$) { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message, $auth ) = @_; # $method contains the method used # $uri object contains the target of the request # $referer is the URI object of the referring document # $start is a timestamp (not defined the first time the function is called) # $redirects is a map of redirects # $code is the first HTTP return code # $realm is the realm of the request # $message is the HTTP message received # $auth equals 1 if we want to send out authentication information # For timing purposes $start = &get_timestamp() unless defined($start); # Prepare the query # Do we want printouts of progress? my $verbose_progress = !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); &hprintf("%s %s ", $method, $uri) if $verbose_progress; my $request = HTTP::Request->new($method, $uri); $request->header('Accept-Language' => $Opts{Accept_Language}) if $Opts{Accept_Language}; $request->header('Accept', $Accept); $request->accept_decodable(); # Are we providing authentication info? if ($auth && $request->url()->host() =~ $Opts{Trusted}) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } elsif (defined($Opts{User}) && defined($Opts{Password})) { $request->authorization_basic($Opts{User}, $Opts{Password}); } } # Tell the user agent if we want progress reports for redirects or not. 
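    # (Example with a hypothetical redirect from http://example.org/a to
    #  http://example.org/b: the callback prints a line of the form
    #  "-> GET http://example.org/b " between the initial "GET ..." progress
    #  line and the final "fetched in ... seconds" line.)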
$ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) if $verbose_progress; # Set referer $request->referer($referer) if (!$Opts{No_Referer} && $referer); # Telling caches in the middle we want a fresh copy (Bug 4998) $request->header(Cache_Control => "max-age=0"); # Do the query my $response = $ua->request($request); # Get the results # Record the very first response if (!defined($code)) { ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); } # Authentication requested? if ($response->code() == 401 && !defined($auth) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($Opts{User}) && defined($Opts{Password}))) ) { # Set host as trusted domain unless we already have one. if (!$Opts{Trusted}) { my $re = sprintf('^%s$', quotemeta($response->base()->host())); $Opts{Trusted} = qr/$re/io; } # Deal with authentication and avoid loops if (!defined($realm) && $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { $realm = $1; } print "\n" if $verbose_progress; return &get_uri($method, $response->request()->url(), $referer, $start, $redirects, $code, $realm, $message, 1); } # @@@ subtract robot delay from the "fetched in" time? &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp())) if $verbose_progress; $response->{IsCss} = is_content_type($response->content_type(), "text/css"); $response->{Realm} = $realm if defined($realm); return $response; } ######################################### # Record the results of an HTTP request # ######################################### sub record_results (\$$$$) { my ($uri, $method, $response, $referer) = @_; $results{$uri}{referer} = $referer; $results{$uri}{response} = $response; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{code} = RC_ROBOTS_TXT() if ($results{$uri}{location}{code} == 403 && $response->message() =~ /Forbidden by robots\.txt/); $results{$uri}{location}{code} = RC_IP_DISALLOWED() if ($results{$uri}{location}{code} == 403 && $response->message() =~ /non-public IP/); $results{$uri}{location}{code} = RC_DNS_ERROR() if ($results{$uri}{location}{code} == 500 && $response->message() =~ /Bad hostname '[^\']*'/); $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED() if ($results{$uri}{location}{code} == 500 && $response->message() =~ /Access to '[^\']*' URIs has been disabled/); $results{$uri}{location}{type} = $response->header('Content-type'); $results{$uri}{location}{display} = $results{$uri}{location}{code}; # Rewind, check for the original code and message. for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { $results{$uri}{location}{orig} = $tmp->code(); $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)'; } $results{$uri}{location}{success} = $response->is_success(); # If a suppressed broken link, fill the data structure like a typical success. # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; if (!$results{$uri}{location}{success}) { my $code = $results{$uri}{location}{code}; my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; if ($match) { $results{$uri}{location}{success} = 1; $results{$uri}{location}{code} = 100; $results{$uri}{location}{display} = 100; } } # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; } # What type of broken link is it? 
(stored in {record} - the {display} # information is just for visual use only) if ($results{$uri}{location}{display} == 401 && $results{$uri}{location}{code} == 404) { $results{$uri}{location}{record} = 404; } else { $results{$uri}{location}{record} = $results{$uri}{location}{display}; } # Did it fail? $results{$uri}{location}{message} = $response->message() || '(no message)'; if (!$results{$uri}{location}{success}) { &hprintf( "Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message} ) if ($Opts{Verbose}); } return; } #################### # Parse a document # #################### sub parse_document (\$\$$$$) { my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_; print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n") if $Opts{Verbose}; my $p; if (defined($results{$uri}{parsing})) { # We have already done the job. Woohoo! $p->{base} = $results{$uri}{parsing}{base}; $p->{Anchors} = $results{$uri}{parsing}{Anchors}; $p->{Links} = $results{$uri}{parsing}{Links}; return $p; } $p = W3C::LinkChecker->new(); $p->{base} = $base_uri; my $stype = $response->header("Content-Style-Type"); $p->{style_is_css} = !$stype || is_content_type($stype, "text/css"); my $start; if (!$Opts{Summary_Only}) { $start = &get_timestamp(); print("Parsing...\n"); } # Content-Encoding etc already decoded in get_document(). my $docref = $response->content_ref(); # Count lines beforehand if needed (for progress indicator, or CSS while # we don't get any line context out of the parser). In case of HTML, the # actual final number of lines processed shown is populated by our # end_document handler. $p->{Total} = ($$docref =~ tr/\n//) if ($response->{IsCss} || $Opts{Progress}); # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $rec_needs_links); if ($response->{IsCss}) { # Parse as CSS $p->parse_css($$docref, LINE_UNKNOWN()); } else { # Parse as HTML # Transform into for parsing # Processing instructions are not parsed by process, but in this case # it should be. It's expensive, it's horrible, but it's the easiest way # for right now. $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors}; $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); $p->parse($$docref)->eof(); } $response->content(""); if (!$Opts{Summary_Only}) { my $stop = &get_timestamp(); print "\r" if $Opts{Progress}; &hprintf(" done (%d lines in %s seconds).\n", $p->{Total}, &time_diff($start, $stop)); } # Save the results before exiting $results{$uri}{parsing}{base} = $p->{base}; $results{$uri}{parsing}{Anchors} = $p->{Anchors}; $results{$uri}{parsing}{Links} = $p->{Links}; return $p; } #################################### # Constructor for W3C::LinkChecker # #################################### sub new { my $p = HTML::Parser::new(@_, api_version => 3); $p->utf8_mode(1); # Set up handlers $p->handler(start => 'start', 'self, tagname, attr, line'); $p->handler(end => 'end', 'self, tagname, line'); $p->handler(text => 'text', 'self, dtext, line'); $p->handler( declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, 'self, text, line' ); $p->handler(end_document => 'end_document', 'self, line'); if ($Opts{Progress}) { $p->handler(default => 'parse_progress', 'self, line'); $p->{last_percentage} = 0; } # Check ? $p->{check_name} = 1; # Check <[..] id="..">? 
$p->{check_id} = 1; # Don't interpret comment loosely $p->strict_comment(1); return $p; } ################################################# # Record or return the doctype of the document # ################################################# sub doctype { my ($self, $dc) = @_; return $self->{doctype} unless $dc; $_ = $self->{doctype} = $dc; # What to look for depending on the doctype # Check for ? $self->{check_name} = 0 if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %; # Check for <* id="...">? $self->{check_id} = 0 if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%); # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...) $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%); return; } ################################### # Print parse progress indication # ################################### sub parse_progress { my ($self, $line) = @_; return unless defined($line) && $line > 0 && $self->{Total} > 0; my $percentage = int($line / $self->{Total} * 100); if ($percentage != $self->{last_percentage}) { printf("\r%4d%%", $percentage); $self->{last_percentage} = $percentage; } return; } ############################# # Extraction of the anchors # ############################# sub get_anchor { my ($self, $tag, $attr) = @_; my $anchor = $self->{check_id} ? $attr->{id} : undef; if ($self->{check_name} && ($tag eq 'a')) { # @@@@ In XHTML, is mandatory # Force an error if it's not the case (or if id's and name's values # are different) # If id is defined, name if defined must have the same value $anchor ||= $attr->{name}; } return $anchor; } ############################# # W3C::LinkChecker handlers # ############################# sub add_link { my ($self, $uri, $base, $line) = @_; if (defined($uri)) { # Remove repeated slashes after the . or .. in relative links, to avoid # duplicated checking or infinite recursion. $uri =~ s|^(\.\.?/)/+|$1|o; $uri = Encode::decode_utf8($uri); $uri = URI->new_abs($uri, $base) if defined($base); $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++; } return; } sub start { my ($self, $tag, $attr, $line) = @_; $line = LINE_UNKNOWN() unless defined($line); # Anchors my $anchor = $self->get_anchor($tag, $attr); $self->{Anchors}{$anchor}{$line}++ if defined($anchor); # Links if (!$self->{only_anchors}) { my $tag_local_base = undef; # Special case: base/@href # @@@TODO: The reason for handling ourselves is that LWP's # head parsing magic fails at least for responses that have # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361 if ($tag eq 'base') { # Ignore with missing/empty href. $self->{base} = $attr->{href} if (defined($attr->{href}) && length($attr->{href})); } # Special case: meta[@http-equiv=Refresh]/@content elsif ($tag eq 'meta') { if ($attr->{'http-equiv'} && lc($attr->{'http-equiv'}) eq 'refresh') { my $content = $attr->{content}; if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) { $self->add_link($1, undef, $line); } } } # Special case: tags that have "local base" elsif ($tag eq 'applet' || $tag eq 'object') { if (my $codebase = $attr->{codebase}) { # Applet codebases are directories, append trailing slash # if it's not there so that new_abs does the right thing. $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|); # TODO: HTML 4 spec says applet/@codebase may only point to # subdirs of the directory containing the current document. # Should we do something about that? 
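    # (Worked example with made-up URIs: codebase="classes" on an <applet>
    #  in http://example.com/dir/page.html gets a trailing slash appended
    #  above, and new_abs() below then yields the local base
    #  http://example.com/dir/classes/ against which its links resolve.)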
$tag_local_base = URI->new_abs($codebase, $self->{base}); } } # Link attributes: if (my $link_attrs = LINK_ATTRS()->{$tag}) { for my $la (@$link_attrs) { $self->add_link($attr->{$la}, $tag_local_base, $line); } } # List of links attributes: if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) { my ($sep, $attrs) = @$link_attrs; for my $la (@$attrs) { if (defined(my $value = $attr->{$la})) { for my $link (split($sep, $value)) { $self->add_link($link, $tag_local_base, $line); } } } } # Inline CSS: delete $self->{csstext}; if ($tag eq 'style') { $self->{csstext} = '' if ((!$attr->{type} && $self->{style_is_css}) || is_content_type($attr->{type}, "text/css")); } elsif ($self->{style_is_css} && (my $style = $attr->{style})) { $style = CSS::DOM::Style::parse($style); $self->parse_style($style, $line); } } $self->parse_progress($line) if $Opts{Progress}; return; } sub end { my ($self, $tagname, $line) = @_; $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style'); delete $self->{csstext}; $self->parse_progress($line) if $Opts{Progress}; return; } sub parse_css { my ($self, $css, $line) = @_; return unless $css; my $sheet = CSS::DOM::parse($css); for my $rule (@{$sheet->cssRules()}) { if ($rule->type() == IMPORT_RULE()) { $self->add_link($rule->href(), $self->{base}, $line); } elsif ($rule->type == STYLE_RULE()) { $self->parse_style($rule->style(), $line); } } return; } sub parse_style { my ($self, $style, $line) = @_; return unless $style; for (my $i = 0, my $len = $style->length(); $i < $len; $i++) { my $prop = $style->item($i); my $val = $style->getPropertyValue($prop); while ($val =~ /$CssUrl/go) { my $url = CSS::DOM::Util::unescape($2); $self->add_link($url, $self->{base}, $line); } } return; } sub declaration { my ($self, $text, $line) = @_; # Extract the doctype my @declaration = split(/\s+/, $text, 4); if ($#declaration >= 3 && $declaration[0] eq 'DOCTYPE' && lc($declaration[1]) eq 'html') { # Parse the doctype declaration if ($text =~ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i ) { # Store the doctype $self->doctype($1) if $1; # If there is a link to the DTD, record it $self->add_link($3, undef, $line) if (!$self->{only_anchors} && $3); } } $self->text($text) unless $self->{only_anchors}; return; } sub text { my ($self, $text, $line) = @_; $self->{csstext} .= $text if defined($self->{csstext}); $self->parse_progress($line) if $Opts{Progress}; return; } sub end_document { my ($self, $line) = @_; $self->{Total} = $line; delete $self->{csstext}; return; } ################################ # Check the validity of a link # ################################ sub check_validity (\$\$$\%\%) { my ($referer, $uri, $want_links, $links, $redirects) = @_; # $referer is the URI object of the document checked # $uri is the URI object of the target that we are verifying # $want_links is true if we're interested in links in the target doc # $links is a hash of the links in the documents checked # $redirects is a map of the redirects encountered # Get the document with the appropriate method: GET if there are # fragments to check or links are wanted, HEAD is enough otherwise. my $fragments = $links->{$uri}{fragments} || {}; my $method = ($want_links || %$fragments) ? 
'GET' : 'HEAD'; my $response; my $being_processed = 0; if (!defined($results{$uri}) || ($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) { $being_processed = 1; $response = &get_uri($method, $uri, $referer); # Get the information back from get_uri() &record_results($uri, $method, $response, $referer); # Record the redirects &record_redirects($redirects, $response); } elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) { my $ref = $results{$uri}{referer}; &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : "."); } # We got the response of the HTTP request. Stop here if it was a HEAD. return if ($method eq 'HEAD'); # There are fragments. Parse the document. my $p; if ($being_processed) { # Can we really parse the document? if (!defined($results{$uri}{location}{type}) || $results{$uri}{location}{type} !~ $ContentTypes) { &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", $uri, $results{$uri}{location}{type}) if ($Opts{Verbose}); $response->content(""); return; } # Do it then if (my $error = decode_content($response)) { &hprintf("%s\n.", $error); } # @@@TODO: this isn't the best thing to do if a decode error occurred $p = &parse_document($uri, $response->base(), $response, 0, $want_links); } else { # We already had the information $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } # Check that the fragments exist for my $fragment (keys %$fragments) { if (defined($p->{Anchors}{$fragment}) || &escape_match($fragment, $p->{Anchors}) || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; } } return; } sub escape_match ($\%) { my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); for my $b (keys %$hash) { return 1 if ($a eq URI::Escape::uri_unescape($b)); } return 0; } ########################## # Ask for authentication # ########################## sub authentication ($;$$$$) { my ($response, $cookie, $params, $check_num, $is_start) = @_; my $realm = ''; if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { $realm = $1; } if ($Opts{Command_Line}) { printf STDERR <<'EOF', $response->request()->url(), $realm; Authentication is required for %s. The realm is "%s". Use the -u and -p options to specify a username and password and the -d option to specify trusted domains. EOF } else { printf( "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", $response->www_authenticate(), $cookie ? "Set-Cookie: $cookie\n" : "", ); printf( "%s W3C Link Checker: 401 Authorization Required %s ", $DocType, $Head ); &banner(': 401 Authorization Required'); &print_form($params, $cookie, $check_num) if $is_start; printf( '

%s You need "%s" access to %s to perform link checking.
', &status_icon(401), &encode($realm), (&encode($response->request()->url())) x 2 ); my $host = $response->request()->url()->host(); if ($Opts{Trusted} && $host !~ $Opts{Trusted}) { printf <<'EOF', &encode($Opts{Trusted}), &encode($host); This service has been configured to send authentication only to hostnames matching the regular expression %s, but the hostname %s does not match it. EOF } print "

\n"; } return; } ################## # Get statistics # ################## sub get_timestamp () { return pack('LL', Time::HiRes::gettimeofday()); } sub time_diff ($$) { my @start = unpack('LL', $_[0]); my @stop = unpack('LL', $_[1]); for ($start[1], $stop[1]) { $_ /= 1_000_000; } return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1]))); } ######################## # Handle the redirects # ######################## # Record the redirects in a hash sub record_redirects (\%$) { my ($redirects, $response) = @_; for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { # Check for redirect match. my $from = $prev->request()->url(); my $to = $response->request()->url(); # same on every loop iteration my $from_to = $from . '->' . $to; my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; # print STDERR "Result $match of redirect checking $from_to\n"; if ($match) { next; } $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}}; # print STDERR "Result $match of regexp checking $from_to\n"; if ($match) { next; } my $c = $prev->code(); if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) { next; } $redirects->{$prev->request()->url()} = $response->request()->url(); } return; } # Determine if a request is redirected sub is_redirected ($%) { my ($uri, %redirects) = @_; return (defined($redirects{$uri})); } # Get a list of redirects for a URI sub get_redirects ($%) { my ($uri, %redirects) = @_; my @history = ($uri); my %seen = ($uri => 1); # for tracking redirect loops my $loop = 0; while ($redirects{$uri}) { $uri = $redirects{$uri}; push(@history, $uri); if ($seen{$uri}) { $loop = 1; last; } else { $seen{$uri}++; } } return ($loop, @history); } #################################################### # Tool for sorting the unique elements of an array # #################################################### sub sort_unique (@) { my %saw; @saw{@_} = (); return (sort { $a <=> $b } keys %saw); } ##################### # Print the results # ##################### sub line_number ($) { my $line = shift; return $line if ($line >= 0); return "(N/A)"; } sub http_rc ($) { my $rc = shift; return $rc if ($rc >= 0); return "(N/A)"; } # returns true if the given code is informational sub informational ($) { my $rc = shift; return $rc == RC_ROBOTS_TXT() || $rc == RC_IP_DISALLOWED() || $rc == RC_PROTOCOL_DISALLOWED(); } sub anchors_summary (\%\%) { my ($anchors, $errors) = @_; # Number of anchors found. my $n = scalar(keys(%$anchors)); if (!$Opts{Quiet}) { if ($Opts{HTML}) { print("

Anchors

\n

"); } else { print("Anchors\n\n"); } &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's'); print("

\n") if $Opts{HTML}; } # List of the duplicates, if any. my @errors = keys %{$errors}; if (!scalar(@errors)) { print("

Valid anchors!

\n") if (!$Opts{Quiet} && $Opts{HTML} && $n); return; } undef $n; print_doc_header(); print('

') if $Opts{HTML}; print('List of duplicate and empty anchors'); print <<'EOF' if $Opts{HTML};

EOF print("\n"); for my $anchor (@errors) { my $format; my @unique = &sort_unique( map { line_number($_) } keys %{$anchors->{$anchor}} ); if ($Opts{HTML}) { $format = "\n"; } else { my $s = (scalar(@unique) > 1) ? 's' : ''; $format = "\t%s\tLine$s: %s\n"; } printf($format, &encode(length($anchor) ? $anchor : 'Empty anchor'), join(', ', @unique)); } print("\n
Anchor Lines
%s%s
\n") if $Opts{HTML}; return; } sub show_link_report (\%\%\%\%\@;$\%) { my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; print("\n
") if $Opts{HTML}; print("\n") if (!$Opts{Quiet}); # Process each URL my ($c, $previous_c); for my $u (@$urls) { my @fragments = keys %{$broken->{$u}{fragments}}; # Did we get a redirect? my $redirected = &is_redirected($u, %$redirects); # List of lines my @total_lines; push(@total_lines, keys(%{$links->{$u}{location}})); for my $f (@fragments) { push(@total_lines, keys(%{$links->{$u}{fragments}{$f}})) unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); } my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); my $currloc = $results->{$u}{location}; # Error type $c = &code_shown($u, $results); # What to do my $whattodo; my $redirect_too; if ($todo) { if ($u =~ m/^javascript:/) { if ($Opts{HTML}) { $whattodo = 'You must change this link: people using a browser without JavaScript support will not be able to follow this link. See the Web Content Accessibility Guidelines on the use of scripting on the Web and the techniques on how to solve this.'; } else { $whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; } } elsif ($c == RC_ROBOTS_TXT()) { $whattodo = 'The link was not checked due to robots exclusion ' . 'rules. Check the link manually.'; } elsif ($redirect_loop) { $whattodo = 'Retrieving the URI results in a redirect loop, that should be ' . 'fixed. Examine the redirect sequence to see where the loop ' . 'occurs.'; } else { $whattodo = $todo->{$c}; } } elsif (defined($redirects{$u})) { # Redirects if (($u . '/') eq $redirects{$u}) { $whattodo = 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.'; } elsif ($c == 307 || $c == 302) { $whattodo = 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.'; } elsif ($c == 301) { $whattodo = 'This is a permanent redirect. The link should be updated.'; } } my @unique = &sort_unique(map { line_number($_) } @total_lines); my $lines_list = join(', ', @unique); my $s = (scalar(@unique) > 1) ? 's' : ''; undef @unique; my @http_codes = ($currloc->{code}); unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; @http_codes = map { http_rc($_) } @http_codes; if ($Opts{HTML}) { # Style stuff my $idref = ''; if ($codes && (!defined($previous_c) || ($c != $previous_c))) { $idref = ' id="d' . $doc_count . 'code_' . $c . '"'; $previous_c = $c; } # Main info for (@redirects_urls) { $_ = &show_url($_); } # HTTP message my $http_message; if ($currloc->{message}) { $http_message = &encode($currloc->{message}); if ($c == 404 || $c == 500) { $http_message = '' . $http_message . ''; } } my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; printf(" %s Line%s: %s %s
Status: %s %s %s

%s %s

\n", # Anchor for return codes $idref, # Color &status_icon($c), $s, # List of lines $lines_list, # List of redirects $redirected ? join(' redirected to ', @redirects_urls) . $redirmsg : &show_url($u), # Realm defined($currloc->{realm}) ? sprintf('Realm: %s
', &encode($currloc->{realm})) : '', # HTTP original message # defined($currloc->{orig_message}) # ? &encode($currloc->{orig_message}). # ' -> ' # : '', # Response code chain join( ' -> ', map { &encode($_) } @http_codes), # HTTP final message $http_message, # What to do $whattodo, # Redirect too? $redirect_too ? sprintf(' %s', &bgcolor(301), $redirect_too) : '', ); if ($#fragments >= 0) { printf("
Broken fragments:
    \n"); } } else { my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; printf( "\n%s\t%s\n Code: %s %s\n%s\n", # List of redirects $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, # List of lines $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '', # Response code chain join(' -> ', @http_codes), # HTTP message $currloc->{message} || '', # What to do wrap(' To do: ', ' ', $whattodo) ); if ($#fragments >= 0) { if ($currloc->{code} == 200) { print("The following fragments need to be fixed:\n"); } else { print("Fragments:\n"); } } } # Fragments for my $f (@fragments) { my @unique_lines = &sort_unique(keys %{$links->{$u}{fragments}{$f}}); my $plural = (scalar(@unique_lines) > 1) ? 's' : ''; my $unique_lines = join(', ', @unique_lines); if ($Opts{HTML}) { printf("
  • %s#%s (line%s %s)
  • \n", &encode($u), &encode($f), $plural, $unique_lines); } else { printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines); } } print("
\n") if ($Opts{HTML} && scalar(@fragments)); } # End of the table print("
\n") if $Opts{HTML}; return; } sub code_shown ($$) { my ($u, $results) = @_; if ($results->{$u}{location}{record} == 200) { return $results->{$u}{location}{orig} || $results->{$u}{location}{record}; } else { return $results->{$u}{location}{record}; } } sub links_summary (\%\%\%\%) { # Advice on how to fix the problems my %todo = ( 200 => 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).', 300 => 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.', 301 => 'This is a permanent redirect. The link should be updated to point to the more recent URI.', 302 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 303 => 'This rare status code points to a "See Other" resource. There is generally nothing to be done.', 307 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 400 => 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.', 401 => "The link is not public and the actual resource is only available behind authentication. If you have not done so already, you may want to provide the credentials.", 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 404 => 'The link is broken. Double-check that you have not made any typo or copy-paste mistake. If the link points to a resource that no longer exists, you may want to remove or fix the link.', 405 => 'The server does not allow HTTP HEAD requests, which prevents the Link Checker from checking the link automatically. Check the link manually.', 406 => "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.", 407 => 'Access to the link requires proxy authentication.', 408 => 'The request timed out.', 410 => 'The resource is gone. You should remove this link.', 415 => 'The media type is not supported.', 500 => 'This is a server-side problem. Check the URI.', 501 => 'Could not check this link: method not implemented or scheme not supported.', 503 => 'The server cannot service the request, for some unknown reason.', # Non-HTTP codes: RC_ROBOTS_TXT() => sprintf( 'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.', $Opts{HTML} ? ( '', '', "", '' ) : ('') x 4 ), RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.', RC_IP_DISALLOWED() => sprintf( 'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured not to access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.', $Opts{HTML} ?
('', '') : ('') x 2), RC_PROTOCOL_DISALLOWED() => 'Accessing links with this URI scheme has been disabled in this link checker.', ); my %priority = ( 410 => 1, 404 => 2, 403 => 5, 200 => 10, 300 => 15, 401 => 20 ); my ($links, $results, $broken, $redirects) = @_; # List of the broken links my @urls = keys %{$broken}; my @dir_redirect_urls = (); if ($Opts{Redirects}) { # Add the redirected URIs to the report for my $l (keys %$redirects) { next unless (defined($results->{$l}) && defined($links->{$l}) && !defined($broken->{$l})); # Check whether we have a "directory redirect", # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); if ($#redirects == 1) { push(@dir_redirect_urls, $l); next; } push(@urls, $l); } } # Broken links and redirects if ($#urls < 0) { if (!$Opts{Quiet}) { print_doc_header(); if ($Opts{HTML}) { print "

Links

\n

Valid links!

\n"; } else { print "\nValid links.\n"; } } } else { print_doc_header(); print('

') if $Opts{HTML}; print("\nList of broken links and other issues"); #print(' and redirects') if $Opts{Redirects}; # Sort the URIs by HTTP code my %code_summary; my @idx; for my $u (@urls) { if (defined($results->{$u}{location}{record})) { my $c = &code_shown($u, $results); $code_summary{$c}++; push(@idx, $c); } } my @sorted = @urls[ sort { defined($priority{$idx[$a]}) ? defined($priority{$idx[$b]}) ? $priority{$idx[$a]} <=> $priority{$idx[$b]} : -1 : defined($priority{$idx[$b]}) ? 1 : $idx[$a] <=> $idx[$b] } 0 .. $#idx ]; @urls = @sorted; undef(@sorted); undef(@idx); if ($Opts{HTML}) { # Print a summary print <<'EOF';

There are issues with the URLs listed below. The table summarizes the issues and suggested actions by HTTP response status code.

EOF for my $code (sort(keys(%code_summary))) { printf('', &bgcolor($code)); printf('', $doc_count, $code, http_rc($code)); printf('', $code_summary{$code}); printf('', $todo{$code}); print "\n"; } print "\n
Code Occurrences What to do
%s%s%s
\n"; } else { print(':'); } &show_link_report($links, $results, $broken, $redirects, \@urls, 1, \%todo); } # Show directory redirects if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { print_doc_header(); print('

') if $Opts{HTML}; print("\nList of redirects"); print( "

\n

The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.

" ) if $Opts{HTML}; &show_link_report($links, $results, $broken, $redirects, \@dir_redirect_urls); } return; } ############################################################################### ################ # Global stats # ################ sub global_stats () { my $stop = &get_timestamp(); my $n_docs = ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents}; return sprintf( 'Checked %d document%s in %s seconds.', $n_docs, ($n_docs == 1) ? '' : 's', &time_diff($timestamp, $stop) ); } ################## # HTML interface # ################## sub html_header ($$) { my ($uri, $cookie) = @_; my $title = defined($uri) ? $uri : ''; $title = ': ' . $title if ($title =~ /\S/); my $headers = ''; if (!$Opts{Command_Line}) { $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri; $headers .= "Content-Type: text/html; charset=utf-8\n"; $headers .= "Set-Cookie: $cookie\n" if $cookie; # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same # print() statement as the last header $headers .= "Content-Language: en\n\n"; } my $onload = $uri ? '' : ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"'; print $headers, $DocType, " W3C Link Checker", &encode($title), " ", $Head, " '; &banner($title); return; } sub banner ($) { my $tagline = "Check links and anchors in Web pages or full Web sites"; printf( <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
EOF return; } sub status_icon($) { my ($code) = @_; my $icon_type; my $r = HTTP::Response->new($code); if ($r->is_success()) { $icon_type = 'error'; # if it is a success but still reported, it's because of broken fragments => error } elsif (&informational($code)) { $icon_type = 'info'; } elsif ($code == 300) { $icon_type = 'info'; } elsif ($code == 401) { $icon_type = 'error'; } elsif ($r->is_redirect()) { $icon_type = 'warning'; } elsif ($r->is_error()) { $icon_type = 'error'; } else { $icon_type = 'error'; } return sprintf('%s', URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}), $icon_type); } sub bgcolor ($) { my ($code) = @_; my $class; my $r = HTTP::Response->new($code); if ($r->is_success()) { return ''; } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) { $class = 'dubious'; } elsif ($code == 300) { $class = 'multiple'; } elsif ($code == 401) { $class = 'unauthorized'; } elsif ($r->is_redirect()) { $class = 'redirect'; } elsif ($r->is_error()) { $class = 'broken'; } else { $class = 'broken'; } return (' class="' . $class . '"'); } sub show_url ($) { my ($url) = @_; return sprintf('%s', (&encode($url)) x 2); } sub html_footer () { printf("

%s

\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet}); if (!$doc_count) { print <<'EOF';

This Link Checker looks for issues in links, anchors and referenced objects in a Web page, CSS style sheet, or recursively on a whole Web site. For best results, it is recommended to first ensure that the documents checked use Valid (X)HTML Markup and CSS. The Link Checker is part of the W3C's validators and Quality Web tools.

EOF } printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
%s
%s
EOF return; } sub print_form (\%$$) { my ($params, $cookie, $check_num) = @_; # Split params on \0, see CGI's docs on Vars() while (my ($key, $value) = each(%$params)) { if ($value) { my @vals = split(/\0/, $value, 2); $params->{$key} = $vals[0]; } } # Override undefined values from the cookie, if we got one. my $valid_cookie = 0; if ($cookie) { my %cookie_values = $cookie->value(); if (!$cookie_values{clear}) { # XXX no easy way to check if cookie expired? $valid_cookie = 1; while (my ($key, $value) = each(%cookie_values)) { $params->{$key} = $value unless defined($params->{$key}); } } } my $chk = ' checked="checked"'; $params->{hide_type} = 'all' unless $params->{hide_type}; my $requested_uri = &encode($params->{uri} || ''); my $sum = $params->{summary} ? $chk : ''; my $red = $params->{hide_redirects} ? $chk : ''; my $all = ($params->{hide_type} ne 'dir') ? $chk : ''; my $dir = $all ? '' : $chk; my $acc = $params->{no_accept_language} ? $chk : ''; my $ref = $params->{no_referer} ? $chk : ''; my $rec = $params->{recursive} ? $chk : ''; my $dep = &encode($params->{depth} || ''); my $cookie_options = ''; if ($valid_cookie) { $cookie_options = " "; } else { $cookie_options = " "; } print "

More Options





,

", $cookie_options, "

"; return; } sub encode (@) { return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; } sub hprintf (@) { print_doc_header(); if (!$Opts{HTML}) { # can have undef values here; avoid useless warning. E.g., # Error: -1 Forbidden by robots.txt # Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245. # and # Error: 404 File `/u/karl/gnu/src/akarl/doc/dejagnu.html' does not exist # Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245. my @args = (); for my $a (@_) { push (@args, defined $a ? $a : ""), } printf(@args); } else { print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1])); } return; } # Print the document header, if it hasn't been printed already. # This is invoked before most other output operations, in order # to enable quiet processing that doesn't clutter the output with # "Processing..." messages when nothing else will be reported. sub print_doc_header () { if (defined($doc_header)) { print $doc_header; undef($doc_header); } } # Local Variables: # mode: perl # indent-tabs-mode: nil # cperl-indent-level: 4 # cperl-continued-statement-offset: 4 # cperl-brace-offset: -4 # perl-indent-level: 4 # End: # ex: ts=4 sw=4 et