File checkbot-1.80-webserver.patch of Package perl-checkbot
diff -ruN checkbot-1.80-orig/checkbot checkbot-1.80/checkbot
--- checkbot-1.80-orig/checkbot 2008-10-15 14:55:01.000000000 +0200
+++ checkbot-1.80/checkbot 2010-10-28 18:15:44.000000000 +0200
@@ -33,7 +33,7 @@
=head1 NAME
-Checkbot - WWW Link Verifier
+Checkbot - WWW Link Verifier. Modified for the Novell Webconsole
=head1 SYNOPSIS
@@ -51,6 +51,8 @@
[B<--enable-virtual>]
[B<--language> language code]
[B<--suppress> suppression file]
+ [B<--username> Username for Novell Webconsole]
+ [B<--password> Password for Novell Webconsole]
[start URLs]
=head1 DESCRIPTION
@@ -307,6 +309,28 @@
403 /http:\/\/wikipedia.org\/.*/
+=item --username <username>
+
+Username for the Novell Webconsole.
+
+Before starting the link check, an account for the Webconsole must
+already exist so that all available links can be reached.
+Checkbot therefore first obtains a login cookie by posting the given
+username and password to <hostname>/accounts/login.
+
+Do NOT forget to enable cookies with the --cookies option.
+
+=item --password <password>
+
+Password for the Novell Webconsole.
+
+Before starting the link check, an account for the Webconsole must
+already exist so that all available links can be reached.
+Checkbot therefore first obtains a login cookie by posting the given
+username and password to <hostname>/accounts/login.
+
+Do NOT forget to enable cookies with the --cookies option.
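+
+For example (this mirrors the example in the --help output; substitute
+your own host name and credentials):
+
+  checkbot --cookies --url=http://webconsole.suse.de \
+    --username=schubi --password=system \
+    --ignore=http://webconsole.suse.de/accounts/logout
+
+The logout URL is ignored here, presumably so that following it does
+not end the login session.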
+
=back
Deprecated options which will disappear in a future release:
@@ -482,7 +506,7 @@
# Get command-line arguments
use Getopt::Long;
- my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));
+ my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s password=s username=s));
# Handle arguments, some are mandatory, some have defaults
&print_help if (($main::opt_help && $main::opt_help)
@@ -661,9 +685,42 @@
open(CURRENT, $main::cur_queue)
|| die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";
+  ## Webconsole-specific initialization: log in first
+ if (defined($main::opt_url))
+ {
+ my $urlstring = $main::opt_url . "/accounts/login";
+
+ my %argument = ();
+ if (defined($main::opt_username))
+ {
+ $argument{"user_login"} = $main::opt_username;
+ }
+ if (defined($main::opt_password))
+ {
+ $argument{"user_password"} = $main::opt_password;
+ }
+
+ # receiving session cookie
+ handle_url($main::opt_url);
+
+ # login
+ my $root_url = URI->new($urlstring);
+
+ my $response = $main::ua->post($root_url, \%argument);
+ if ($response->is_success || $response->is_redirect) {
+ print "Login succeeded\n";
+ print $response->content;
+ }
+ else {
+          print "Login failed\n";
+ print STDERR $response->status_line, "\n";
+ }
+ }
+
do {
# Read a line from the queue, and process it
while (defined ($line = <CURRENT>) ) {
+ print "checking $line";
chomp($line);
&handle_url($line);
&check_point();
@@ -727,6 +784,8 @@
my $response;
my $type;
+# print "Checking URL: $urlstr \n";
+
$stats{'todo'}--;
# Add this URL to the ones we've seen already, return if it is a
@@ -800,16 +859,16 @@
if ($response->is_redirect and is_internal($url->as_string)) {
if ($response->code == 300) { # multiple choices, but no redirection available
- output 'Multiple choices', 2;
+        print "Multiple choices\n";
} else {
my $baseURI = URI->new($url);
if (defined $response->header('Location')) {
my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
- output "Redirected to $redir_url", 2;
+ print "Redirected to $redir_url\n";
add_to_queue($redir_url, $urlparent);
$stats{'todo'}++;
} else {
- output 'Location header missing from redirect response', 2;
+          print "Location header missing from redirect response\n";
}
}
}
@@ -984,6 +1043,8 @@
print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n";
print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter;
+ print OUT "<tr><th align=\"left\">--username</th><td class='text'>Username</td><td class='value' id='username'>$main::opt_username</td></tr>\n" if defined $main::opt_username;
+ print OUT "<tr><th align=\"left\">--password</th><td class='text'>Password</td><td class='value' id='password'>set</td></tr>\n" if defined $main::opt_password;
print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy;
print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress;
@@ -1183,12 +1244,14 @@
my $content = $response->decoded_content || $response->content;
$p->parse($content);
$p->eof;
-
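+  # debug output: show the base URL of the document being parsed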
+ my $string = $response->base->as_string;
+ print ( "URL $string \n");
# Deal with the links we found in this document
my @links = $p->links();
foreach (@links) {
my ($tag, %l) = @{$_};
foreach (keys %l) {
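+      # debug output: show each link found (URL, attribute name, tag)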
+ print (" $l{$_} $_ $tag\n");
# Get the canonical URL, so we don't need to worry about base, case, etc.
my $url = $l{$_}->canonical;
@@ -1473,6 +1536,12 @@
--dontwarn codes Do not write warnings for these HTTP response codes
--enable-virtual Use only virtual names, not IP numbers for servers
--language Specify 2-letter language code for language negotiation
+ --username Username for Novell Webconsole
+ --password Password for Novell Webconsole
+
+Example for checking a Novell Webconsole:
+
+  checkbot --cookies --url=http://webconsole.suse.de --username=schubi \\
+    --password=system --ignore=http://webconsole.suse.de/accounts/logout
Options --match, --exclude, and --ignore can take a perl regular expression
as their argument\n
diff -ruN checkbot-1.80-orig/checkbot.orig checkbot-1.80/checkbot.orig
--- checkbot-1.80-orig/checkbot.orig 1970-01-01 01:00:00.000000000 +0100
+++ checkbot-1.80/checkbot.orig 2008-10-15 14:55:01.000000000 +0200
@@ -0,0 +1,1516 @@
+#!/usr/bin/perl -w
+#
+# checkbot - A perl5 script to check validity of links in www document trees
+#
+# Hans de Graaff <hans@degraaff.org>, 1994-2005.
+# Based on Dimitri Tischenko, Delft University of Technology, 1994
+# Based on the testlinks script by Roy Fielding
+# With contributions from Bruce Speyer <bruce.speyer@elecomm.com>
+#
+# This application is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# Info-URL: http://degraaff.org/checkbot/
+#
+# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $
+# (Log information can be found at the end of the script)
+
+require 5.004;
+use strict;
+
+require LWP;
+use File::Basename;
+
+BEGIN {
+ eval "use Time::Duration qw(duration)";
+ $main::useduration = ($@ ? 0 : 1);
+}
+
+# Version information
+my
+$VERSION = '1.80';
+
+
+=head1 NAME
+
+Checkbot - WWW Link Verifier
+
+=head1 SYNOPSIS
+
+checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>]
+ [B<--mailto> email addresses] [B<--noproxy> list of domains]
+ [B<--verbose>]
+ [B<--url> start URL]
+ [B<--match> match string] [B<--exclude> exclude string]
+ [B<--proxy> proxy URL] [B<--internal-only>]
+ [B<--ignore> ignore string]
+ [B<--filter> substitution regular expression]
+ [B<--style> style file URL]
+ [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
+         [B<--interval> seconds] [B<--dontwarn> HTTP response codes]
+ [B<--enable-virtual>]
+ [B<--language> language code]
+ [B<--suppress> suppression file]
+ [start URLs]
+
+=head1 DESCRIPTION
+
+Checkbot verifies the links in a specific portion of the World Wide
+Web. It creates HTML pages with diagnostics.
+
+Checkbot uses LWP to find URLs on pages and to check them. It supports
+the same schemes as LWP does, and finds the same links that
+HTML::LinkExtor will find.
+
+Checkbot considers links to be either 'internal' or
+'external'. Internal links are links within the web space that needs
+to be checked. If an internal link points to a web document this
+document is retrieved, and its links are extracted and
+processed. External links are only checked to be working. Checkbot
+checks links as it finds them, so internal and external links are
+checked at the same time, even though they are treated differently.
+
+Options for Checkbot are:
+
+=over 4
+
+=item --cookies
+
+Accept cookies from the server and offer them again at later
+requests. This may be useful for servers that use cookies to handle
+sessions. By default Checkbot does not accept any cookies.
+
+=item --debug
+
+Enable debugging mode. Not really supported anymore, but it will keep
+some files around that otherwise would be deleted.
+
+=item --file <file name>
+
+Use the file I<file name> as the basis for the summary file names. The
+summary page will get the I<file name> given, and the server pages are
+based on the I<file name> without the .html extension. For example,
+setting this option to C<index.html> will create a summary page called
+index.html and server pages called index-server1.html and
+index-server2.html.
+
+The default value for this option is C<checkbot.html>.
+
+=item --help
+
+Shows brief help message on the standard output.
+
+=item --mailto <email address>[,<email address>]
+
+Send mail to the I<email address> when Checkbot is done checking. You
+can give more than one address separated by commas. The notification
+email includes a small summary of the results. As of Checkbot 1.76
+email is only sent if problems have been found during the Checkbot
+run.
+
+=item --noproxy <list of domains>
+
+Do not proxy requests to the given domains. The list of domains must
+be a comma-separated list. For example, to avoid using the proxy for
+localhost and someserver.xyz, you can use C<--noproxy
+localhost,someserver.xyz>.
+
+=item --verbose
+
+Show verbose output while running. Includes all links checked, results
+from the checks, etc.
+
+
+
+
+
+=item --url <start URL>
+
+Set the start URL. Checkbot starts checking at this URL, and then
+recursively checks all links found on this page. The start URL takes
+precedence over additional URLs specified on the command line.
+
+If no scheme is specified for the URL, the file protocol is assumed.
+
+=item --match <match string>
+
+This option selects which pages Checkbot considers local. If the
+I<match string> is contained within the URL, then Checkbot considers
+the page local, retrieves it, and will check all the links contained
+on it. Otherwise the page is considered external and it is only
+checked with a HEAD request.
+
+If no explicit I<match string> is given, the start URLs (See option
+C<--url>) will be used as a match string instead. In this case the
+last page name, if any, will be trimmed. For example, a start URL like
+C<http://some.site/index.html> will result in a default I<match
+string> of C<http://some.site/>.
+
+The I<match string> can be a perl regular expression. For example, to
+check the main server page and all HTML pages directly underneath it,
+but not the HTML pages in the subdirectories of the server, the
+I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>.
+
+=item --exclude <exclude string>
+
+URLs matching the I<exclude string> are considered to be external,
+even if they happen to match the I<match string> (See option
+C<--match>). URLs matching the --exclude string are still being
+checked and will be reported if problems are found, but they will not
+be checked for further links into the site.
+
+The I<exclude string> can be a perl regular expression. For example,
+to consider all URLs with a query string external, use C<[=\?]>. This
+can be useful when a URL with a query string unlocks the path to a
+huge database which will be checked.
+
+=item --filter <filter string>
+
+This option defines a I<filter string>, which is a perl regular
+expression. This filter is run on each URL found, thus rewriting the
+URL before it enters the queue to be checked. It can be used to remove
+elements from a URL. This option can be useful when symbolic links
+point to the same directory, or when a content management system adds
+session IDs to URLs.
+
+For example C</old/new/> would replace occurrences of 'old' with 'new'
+in each URL.
+
+=item --ignore <ignore string>
+
+URLs matching the I<ignore string> are not checked at all, they are
+completely ignored by Checkbot. This can be useful to ignore known
+problem links, or to ignore links leading into databases. The I<ignore
+string> is matched after the I<filter string> has been applied.
+
+The I<ignore string> can be a perl regular expression.
+
+For example C<www.server.com\/(one|two)> would match all URLs starting
+with either www.server.com/one or www.server.com/two.
+
+
+=item --proxy <proxy URL>
+
+This attribute specifies the URL of a proxy server. Only the HTTP and
+FTP requests will be sent to that proxy server.
+
+=item --internal-only
+
+Skip the checking of external links at the end of the Checkbot
+run. Only matching links are checked. Note that some redirections may
+still cause external links to be checked.
+
+=item --note <note>
+
+The I<note> is included verbatim in the mail message (See option
+C<--mailto>). This can be useful to include the URL of the summary HTML page
+for easy reference, for instance.
+
+Only meaningful in combination with the C<--mailto> option.
+
+=item --sleep <seconds>
+
+Number of I<seconds> to sleep in between requests. Default is 0
+seconds, i.e. do not sleep at all between requests. Setting this
+option can be useful to keep the load on the web server down while
+running Checkbot. This option can also be set to a fractional number,
+i.e. a value of 0.1 will sleep one tenth of a second between requests.
+
+=item --timeout <timeout>
+
+Default timeout for the requests, specified in seconds. The default is
+2 minutes.
+
+=item --interval <seconds>
+
+The maximum interval between updates of the results web pages in
+seconds. Default is 3 hours (10800 seconds). Checkbot will start the
+interval at one minute, and gradually extend it towards the maximum
+interval.
+
+=item --style <URL of style file>
+
+When this option is used, Checkbot embeds this URL as a link to a
+style file on each page it writes. This makes it easy to customize the
+layout of pages generated by Checkbot.
+
+=item --dontwarn <HTTP response codes regular expression>
+
+Do not include warnings on the result pages for those HTTP response
+codes which match the regular expression. For instance, --dontwarn
+"(301|404)" would not include 301 and 404 response codes.
+
+Checkbot uses the response codes generated by the server, even if this
+response code is not defined in RFC 2616 (HTTP/1.1). In addition to
+the normal HTTP response code, Checkbot defines a few response codes
+for situations which are not technically a problem, but which cause
+problems in many cases anyway. These codes are:
+
+ 901 Host name expected but not found
+     In this case the URL supports a host name, but none was found
+ in the URL. This usually indicates a mistake in the URL. An
+ exception is that this check is not applied to news: URLs.
+
+ 902 Unqualified host name found
+ In this case the host name does not contain the domain part.
+ This usually means that the pages work fine when viewed within
+ the original domain, but not when viewed from outside it.
+
+ 903 Double slash in URL path
+ The URL has a double slash in it. This is legal, but some web
+ servers cannot handle it very well and may cause Checkbot to
+ run away. See also the comments below.
+
+ 904 Unknown scheme in URL
+ The URL starts with a scheme that Checkbot does not know
+ about. This is often caused by mistyping the scheme of the URL,
+ but the scheme can also be a legal one. In that case please let
+ me know so that it can be added to Checkbot.
+
+=item --enable-virtual
+
+This option enables dealing with virtual servers. Checkbot then
+assumes that all hostnames for internal servers are unique, even
+though their IP addresses may be the same. Normally Checkbot uses the
+IP address to distinguish servers. This has the advantage that if a
+server has two names (e.g. www and bamboozle) its pages only get
+checked once. When you want to check multiple virtual servers this
+causes problems, which this feature works around by using the hostname
+to distinguish the server.
+
+=item --language
+
+The argument for this option is a two-letter language code. Checkbot
+will use language negotiation to request files in that language. The
+default is to request English language (language code 'en').
+
+=item --suppress
+
+The argument for this option is a file which contains combinations of
+error codes and URLs for which to suppress warnings. This can be used
+to avoid reporting of known and unfixable URL errors or warnings.
+
+The format of the suppression file is a simple whitespace delimited
+format, first listing the error code followed by the URL. Each error
+code and URL combination is listed on a new line. Comments can be
+added to the file by starting the line with a C<#> character.
+
+ # 301 Moved Permanently
+ 301 http://www.w3.org/P3P
+
+ # 403 Forbidden
+ 403 http://www.herring.com/
+
+For further flexibility a regular expression can be used instead of a
+normal URL. The regular expression must be enclosed with forward
+slashes. For example, to suppress all 403 errors on wikipedia:
+
+ 403 /http:\/\/wikipedia.org\/.*/
+
+=back
+
+Deprecated options which will disappear in a future release:
+
+=over
+
+=item --allow-simple-hosts (deprecated)
+
+This option turns off warnings about URLs which contain unqualified
+host names. This is useful for intranet sites which often use just a
+simple host name or even C<localhost> in their links.
+
+Use of this option is deprecated. Please use the --dontwarn mechanism
+for error 902 instead.
+
+=back
+
+
+=head1 HINTS AND TIPS
+
+=over
+
+=item Problems with checking FTP links
+
+Some users may experience consistent problems with checking FTP
+links. In these cases it may be useful to instruct Net::FTP to use
+passive FTP mode to check files. This can be done by setting the
+environment variable FTP_PASSIVE to 1. For example, using the bash
+shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation
+for more details.
+
+=item Run-away Checkbot
+
+In some cases Checkbot literally takes forever to finish. There are two
+common causes for this problem.
+
+First, there might be a database application as part of the web site
+which generates a new page based on links on another page. Since
+Checkbot tries to travel through all links this will create an
+infinite number of pages. This kind of run-away effect is usually predictable. It can be avoided by using the --exclude option.
+
+Second, a server configuration problem can cause a loop in generating
+URLs for pages that really do not exist. This will result in URLs of
+the form http://some.server/images/images/images/logo.png, with ever
+more 'images' included. Checkbot cannot check for this because the
+server should have indicated that the requested pages do not
+exist. There is no easy way to solve this other than fixing the
+offending web server or the broken links.
+
+=item Problems with https:// links
+
+The error message
+
+ Can't locate object method "new" via package "LWP::Protocol::https::Socket"
+
+usually means that the current installation of LWP does not support
+checking of SSL links (i.e. links starting with https://). This
+problem can be solved by installing the Crypt::SSLeay module.
+
+=back
+
+=head1 EXAMPLES
+
+The most simple use of Checkbot is to check a set of pages on a
+server. To check my checkbot pages I would use:
+
+ checkbot http://degraaff.org/checkbot/
+
+Checkbot runs can take some time so Checkbot can send a notification
+mail when the run is done:
+
+ checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/
+
+It is possible to check a set of local files without using a web
+server. This only works for static files but may be useful in some
+cases.
+
+ checkbot file:///var/www/documents/
+
+=head1 PREREQUISITES
+
+This script uses the C<LWP> modules.
+
+=head1 COREQUISITES
+
+This script can send mail when C<Mail::Send> is present.
+
+=head1 AUTHOR
+
+Hans de Graaff <hans@degraaff.org>
+
+=pod OSNAMES
+
+any
+
+=cut
+
+# Declare some global variables, avoids ugly use of main:: all around
+my %checkbot_errors = ('901' => 'Host name expected but not found',
+ '902' => 'Unqualified host name in URL',
+ '903' => 'URL contains double slash in URL',
+ '904' => 'Unknown scheme in URL',
+ );
+
+my @starturls = ();
+
+# Two hashes to store the response to a URL, and all the parents of the URL
+my %url_error = ();
+my %url_parent = ();
+
+# Hash for storing the title of a URL for use in reports. TODO: remove
+# this and store title as part of queue.
+my %url_title = ();
+
+# Hash for suppressions, which are defined as a combination of code and URL
+my %suppression = ();
+
+# Hash to store statistics on link checking
+my %stats = ('todo' => 0,
+ 'link' => 0,
+ 'problem' => 0 );
+
+# Options hash (to be filled by GetOptions)
+my %options = ();
+
+# Keep track of start time so that we can use it in reports
+my $start_time = time();
+
+# If on a Mac we should ask for the arguments through some MacPerl stuff
+if ($^O eq 'MacOS') {
+ $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')";
+ push(@ARGV, split(' ', $main::mac_answer));
+}
+
+# Prepare
+check_options();
+init_modules();
+init_globals();
+init_suppression();
+
+# Start actual application
+check_links();
+
+# Finish up
+create_page(1);
+send_mail() if defined $main::opt_mailto and $stats{problem} > 0;
+
+exit 0;
+
+# output prints stuff on stderr if --verbose, and takes care of proper
+# indentation
+sub output {
+ my ($line, $level) = @_;
+
+ return unless $main::opt_verbose;
+
+ chomp $line;
+
+ my $indent = '';
+
+ if (defined $level) {
+ while ($level-- > 0) {
+ $indent .= ' ';
+ }
+ }
+
+ print STDERR $indent, $line, "\n";
+}
+
+### Initialization and setup routines
+
+sub check_options {
+
+ # Get command-line arguments
+ use Getopt::Long;
+ my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));
+
+ # Handle arguments, some are mandatory, some have defaults
+ &print_help if (($main::opt_help && $main::opt_help)
+ || (!$main::opt_url && $#ARGV == -1));
+ $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout);
+ $main::opt_verbose = 0 unless $main::opt_verbose;
+ $main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep);
+ $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
+ $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
+ $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;
+ # Set the default language and make sure it is a two letter, lowercase code
+ $main::opt_language = 'en' unless defined $main::opt_language;
+ $main::opt_language = lc(substr($main::opt_language, 0, 2));
+ $main::opt_language =~ tr/a-z//cd;
+ if ($main::opt_language !~ /[a-z][a-z]/) {
+ warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n";
+ $main::opt_language = 'en';
+ }
+ $main::opt_allow_simple_hosts = 0
+ unless $main::opt_allow_simple_hosts;
+ output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts;
+
+ # The default for opt_match will be set later, because we might want
+ # to muck with opt_url first.
+
+ # Display messages about the options
+ output "*** Starting Checkbot $VERSION in verbose mode";
+ output 'Will skip checking of external links', 1
+ if $main::opt_internal_only;
+ output "Allowing unqualified host names", 1
+ if $main::opt_allow_simple_hosts;
+ output "Not using optional Time::Duration module: not found", 1
+ unless $main::useduration;
+}
+
+sub init_modules {
+
+ use URI;
+ # Prepare the user agent to be used:
+ use LWP::UserAgent;
+ use LWP::MediaTypes;
+ #use LWP::Debug qw(- +debug);
+ use HTML::LinkExtor;
+ $main::ua = new LWP::UserAgent;
+ $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version);
+ $main::ua->timeout($main::opt_timeout);
+ # Add a proxy to the user agent, if defined
+ $main::ua->proxy(['http', 'ftp'], $main::opt_proxy)
+ if defined($main::opt_proxy);
+ $main::ua->no_proxy(split(',', $main::opt_noproxy))
+ if defined $main::opt_noproxy;
+ # Add a cookie jar to the UA if requested by the user
+ $main::ua->cookie_jar( {} )
+ if defined $main::opt_cookies or $main::opt_cookies;
+
+ require Mail::Send if defined $main::opt_mailto;
+
+ use HTTP::Status;
+}
+
+sub init_globals {
+ my $url;
+
+ # Directory and files for output
+ if ($main::opt_file) {
+ $main::file = $main::opt_file;
+ $main::file =~ /(.*)\./;
+ $main::server_prefix = $1;
+ } else {
+ $main::file = "checkbot.html";
+ $main::server_prefix = "checkbot";
+ }
+ $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$";
+
+ $main::cur_queue = $main::tmpdir . "/queue";
+ $main::new_queue = $main::tmpdir . "/queue-new";
+
+ # Make sure we catch signals so that we can clean up temporary files
+ $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal;
+
+ # Set up hashes to be used
+ %main::checked = ();
+ %main::servers = ();
+ %main::servers_get_only = ();
+
+ # Initialize the start URLs. --url takes precedence. Otherwise
+ # just process URLs in order as they appear on the command line.
+ unshift(@ARGV, $main::opt_url) if $main::opt_url;
+ foreach (@ARGV) {
+ $url = URI->new($_);
+ # If no scheme is defined we will assume file is used, so that
+ # it becomes easy to check a single file.
+ $url->scheme('file') unless defined $url->scheme;
+ $url->host('localhost') if $url->scheme eq 'file';
+ if (!defined $url->host) {
+ warn "No host specified in URL $url, ignoring it.\n";
+ next;
+ }
+ push(@starturls, $url);
+ }
+ die "There are no valid starting URLs to begin checking with!\n"
+ if scalar(@starturls) == -1;
+
+ # Set the automatic matching expression to a concatenation of the starturls
+ if (!defined $main::opt_match) {
+ my @matchurls;
+ foreach my $url (@starturls) {
+ # Remove trailing files from the match, e.g. remove index.html
+ # stuff so that we match on the host and/or directory instead,
+ # but only if there is a path component in the first place.
+ my $matchurl = $url->as_string;
+ $matchurl =~ s!/[^/]+$!/! unless $url->path eq '';
+ push(@matchurls, quotemeta $matchurl);
+ }
+ $main::opt_match = '^(' . join('|', @matchurls) . ')';
+ output "--match defaults to $main::opt_match";
+ }
+
+ # Initialize statistics hash with number of start URLs
+ $stats{'todo'} = scalar(@starturls);
+
+ # We write out our status every now and then.
+ $main::cp_int = 1;
+ $main::cp_last = 0;
+}
+
+sub init_suppression {
+ return if not defined $main::opt_suppress;
+
+ die "Suppression file \"$main::opt_suppress\" is in fact a directory"
+ if -d $main::opt_suppress;
+
+ open(SUPPRESSIONS, $main::opt_suppress)
+ or die "Unable to open $main::opt_suppress for reading: $!\n";
+ while (my $line = <SUPPRESSIONS>) {
+ chomp $line;
+ next if $line =~ /^#/ or $line =~ /^\s*$/;
+
+ if ($line !~ /^\s*(\d+)\s+(\S+)/) {
+ output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n $line\n";
+ } else {
+ output "Suppressed: $1 $2\n" if $main::opt_verbose;
+ $suppression{$1}{$2} = $2;
+ }
+ }
+ close SUPPRESSIONS;
+}
+
+
+
+
+### Main application code
+
+sub check_links {
+ my $line;
+
+ mkdir $main::tmpdir, 0755
+ || die "$0: unable to create directory $main::tmpdir: $!\n";
+
+ # Explicitly set the record separator. I had the problem that this
+ # was not defined under my perl 5.00502. This should fix that, and
+ # not cause problems for older versions of perl.
+ $/ = "\n";
+
+ open(CURRENT, ">$main::cur_queue")
+ || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n";
+ open(QUEUE, ">$main::new_queue")
+ || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n";
+
+ # Prepare CURRENT queue with starting URLs
+ foreach (@starturls) {
+ print CURRENT $_->as_string . "|\n";
+ }
+ close CURRENT;
+
+ open(CURRENT, $main::cur_queue)
+ || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";
+
+ do {
+ # Read a line from the queue, and process it
+ while (defined ($line = <CURRENT>) ) {
+ chomp($line);
+ &handle_url($line);
+ &check_point();
+ }
+
+ # Move queues around, and try again, but only if there are still
+ # things to do
+ output "*** Moving queues around, " . $stats{'todo'} . " links to do.";
+ close CURRENT
+ or warn "Error while closing CURRENT filehandle: $!\n";
+ close QUEUE;
+
+ # TODO: should check whether these succeed
+ unlink($main::cur_queue);
+ rename($main::new_queue, $main::cur_queue);
+
+ open(CURRENT, "$main::cur_queue")
+ || die "$0: Unable to open $main::cur_queue for reading: $!\n";
+ open(QUEUE, ">$main::new_queue")
+ || die "$0: Unable to open $main::new_queue for writing: $!\n";
+
+ } while (not -z $main::cur_queue);
+
+ close CURRENT;
+ close QUEUE;
+
+ unless (defined($main::opt_debug)) {
+ clean_up();
+ }
+}
+
+sub clean_up {
+ unlink $main::cur_queue, $main::new_queue;
+ rmdir $main::tmpdir;
+ output "Removed temporary directory $main::tmpdir and its contents.\n", 1;
+}
+
+sub got_signal {
+ my ($signalname) = @_;
+
+ clean_up() unless defined $main::opt_debug;
+
+ print STDERR "Caught SIG$signalname.\n";
+ exit 1;
+}
+
+# Whether URL is 'internal' or 'external'
+sub is_internal ($) {
+ my ($url) = @_;
+
+ return ( $url =~ /$main::opt_match/o
+ and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o));
+}
+
+
+sub handle_url {
+ my ($line) = @_;
+ my ($urlstr, $urlparent) = split(/\|/, $line);
+
+ my $reqtype;
+ my $response;
+ my $type;
+
+ $stats{'todo'}--;
+
+ # Add this URL to the ones we've seen already, return if it is a
+ # duplicate.
+ return if add_checked($urlstr);
+
+ $stats{'link'}++;
+
+ # Is this an external URL and we only check internal stuff?
+ return if defined $main::opt_internal_only
+ and not is_internal($urlstr);
+
+ my $url = URI->new($urlstr);
+
+ # Perhaps this is a URL we are not interested in checking...
+ if (not defined($url->scheme)
+ or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) {
+ # Ignore URLs which we know we can ignore, create error for others
+ if ($url->scheme =~ /^(news|mailto|javascript|mms)$/o) {
+ output "Ignore $url", 1;
+ } else {
+ add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: "
+ . $url->scheme);
+ }
+ return;
+ }
+
+ # Guess/determine the type of document we might retrieve from this
+ # URL. We do this because we only want to use a full GET for HTML
+ # document. No need to retrieve images, etc.
+ if ($url->path =~ /\/$/o || $url->path eq "") {
+ $type = 'text/html';
+ } else {
+ $type = guess_media_type($url->path);
+ }
+ # application/octet-stream is the fallback of LWP's guess stuff, so
+ # if we get this then we ask the server what we got just to be sure.
+ if ($type eq 'application/octet-stream') {
+ $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language);
+ $type = $response->content_type;
+ }
+
+ # Determine if this is a URL we should GET fully or partially (using HEAD)
+ if ($type =~ /html/o
+ && $url->scheme =~ /^(https?|file|ftp|gopher)$/o
+ and is_internal($url->as_string)
+ && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) {
+ $reqtype = 'GET';
+ } else {
+ $reqtype = 'HEAD';
+ }
+
+ # Get the document, unless we already did while determining the type
+ $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language)
+ unless defined($response) and $reqtype eq 'HEAD';
+
+ # Ok, we got something back from checking, let's see what it is
+ if ($response->is_success) {
+ select(undef, undef, undef, $main::opt_sleep)
+ unless $main::opt_debug || $url->scheme eq 'file';
+
+ # Internal HTML documents need to be given to handle_doc for processing
+ if ($reqtype eq 'GET' and is_internal($url->as_string)) {
+ handle_doc($response, $urlstr);
+ }
+ } else {
+
+    # Right, so it wasn't the smashing success we hoped for, so bring
+ # the bad news and store the pertinent information for later
+ add_error($url, $urlparent, $response->code, $response->message);
+
+ if ($response->is_redirect and is_internal($url->as_string)) {
+ if ($response->code == 300) { # multiple choices, but no redirection available
+ output 'Multiple choices', 2;
+ } else {
+ my $baseURI = URI->new($url);
+ if (defined $response->header('Location')) {
+ my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
+ output "Redirected to $redir_url", 2;
+ add_to_queue($redir_url, $urlparent);
+ $stats{'todo'}++;
+ } else {
+ output 'Location header missing from redirect response', 2;
+ }
+ }
+ }
+ }
+ # Done with this URL
+}
+
+sub performRequest {
+ my ($reqtype, $url, $urlparent, $type, $language) = @_;
+
+ my ($response);
+
+ # A better solution here would be to use GET exclusively. Here is how
+ # to do that. We would have to set this max_size thing in
+ # check_external, I guess...
+ # Set $ua->max_size(1) and then try a normal GET request. However,
+ # that doesn't always work as evidenced by an FTP server that just
+ # hangs in this case... Needs more testing to see if the timeout
+ # catches this.
+
+ # Normally, we would only need to do a HEAD, but given the way LWP
+ # handles gopher requests, we need to do a GET on those to get at
+ # least a 500 and 501 error. We would need to parse the document
+ # returned by LWP to find out if we had problems finding the
+ # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org>
+
+ # We also need to do GET instead of HEAD if we know the remote
+ # server won't accept it. The standard way for an HTTP server to
+ # indicate this is by returning a 405 ("Method Not Allowed") or 501
+ # ("Not Implemented"). Other circumstances may also require sending
+ # GETs instead of HEADs to a server. Details are documented below.
+ # -- Larry Gilbert <larry@n2h2.com>
+
+ # Normally we try a HEAD request first, then a GET request if
+ # needed. There may be circumstances in which we skip doing a HEAD
+ # (e.g. when we should be getting the whole document).
+ foreach my $try ('HEAD', 'GET') {
+
+ # Skip trying HEAD when we know we need to do a GET or when we
+ # know only a GET will work anyway.
+ next if $try eq 'HEAD' and
+ ($reqtype eq 'GET'
+ or $url->scheme eq 'gopher'
+ or (defined $url->authority and $main::servers_get_only{$url->authority}));
+
+ # Output what we are going to do with this link
+ output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1);
+
+ # Create the request with all appropriate headers
+ my %header_hash = ( 'Referer' => $urlparent );
+ if (defined($language) && ($language ne '')) {
+ $header_hash{'Accept-Language'} = $language;
+ }
+ my $ref_header = new HTTP::Headers(%header_hash);
+ my $request = new HTTP::Request($try, $url, $ref_header);
+ $response = $main::ua->simple_request($request);
+
+ # If we are doing a HEAD request we need to make sure nothing
+ # fishy happened. we use some heuristics to see if we are ok, or
+ # if we should try again with a GET request.
+ if ($try eq 'HEAD') {
+
+ # 400, 405, 406 and 501 are standard indications that HEAD
+ # shouldn't be used
+ # We used to check for 403 here also, but according to the HTTP spec
+ # a 403 indicates that the server understood us fine but really does
+ # not want us to see the page, so we SHOULD NOT retry.
+ if ($response->code =~ /^(400|405|406|501)$/o) {
+ output "Server does not seem to like HEAD requests; retrying", 2;
+ $main::servers_get_only{$url->authority}++;
+ next;
+ };
+
+ # There are many servers out there that have real trouble with
+ # HEAD, so if we get a 500 Internal Server error just retry with
+      # a GET request to get an authoritative answer. We used to do this
+ # only for special cases, but the list got big and some
+ # combinations (e.g. Zope server behind Apache proxy) can't
+ # easily be detected from the headers.
+ if ($response->code =~ /^500$/o) {
+ output "Internal server error on HEAD request; retrying with GET", 2;
+ $main::servers_get_only{$url->authority}++ if defined $url->authority;
+ next;
+ }
+
+ # If we know the server we can try some specific heuristics
+ if (defined $response->server) {
+
+ # Netscape Enterprise has been seen returning 500 and even 404
+ # (yes, 404!!) in response to HEAD requests
+ if ($response->server =~ /^Netscape-Enterprise/o
+ and $response->code =~ /^404$/o) {
+ output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2;
+ $main::servers_get_only{$url->authority}++;
+ next;
+ };
+ }
+
+ # If a HEAD request resulted in nothing noteworthy, no need for
+ # any further attempts using GET, we are done.
+ last;
+ }
+ }
+
+ return $response;
+}
+
+
+# This routine creates a (temporary) WWW page based on the current
+# findings This allows somebody to monitor the process, but is also
+# convenient when this program crashes or waits because of diskspace
+# or memory problems
+
+sub create_page {
+ my($final_page) = @_;
+
+ my $path = "";
+ my $prevpath = "";
+ my $prevcode = 0;
+ my $prevmessage = "";
+
+ output "*** Start writing results page";
+
+ open(OUT, ">$main::file.new")
+ || die "$0: Unable to open $main::file.new for writing:\n";
+ print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
+ print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
+ print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
+ print OUT "<head>\n";
+ if (!$final_page) {
+ printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
+ int($main::cp_int * 60 / 2 - 5);
+ }
+
+ print OUT "<title>Checkbot report</title>\n";
+ print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
+ print OUT "</head>\n";
+ print OUT "<body>\n";
+ print OUT "<h1><em>Checkbot</em>: main report</h1>\n";
+
+ # Show the status of this checkbot session
+ print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>";
+ if ($final_page) {
+ print OUT "Done.<br />\n";
+ print OUT 'Run started on ' . localtime($start_time) . ".<br />\n";
+ print OUT 'Run duration ', duration(time() - $start_time), ".\n"
+ if $main::useduration;
+ } else {
+ print OUT "Running since " . localtime($start_time) . ".<br />\n";
+ print OUT "Last update at ". localtime() . ".<br />\n";
+ print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n";
+ }
+ print OUT "</td></tr></table>\n\n";
+
+ # Summary (very brief overview of key statistics)
+ print OUT "<hr /><h2 class='summary'>Report summary</h2>\n";
+
+ print OUT "<table summary=\"Report summary\" class='summary'>\n";
+ print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n";
+ print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n";
+ print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n";
+ print OUT "</table>\n";
+
+ # Server information
+ printAllServers($final_page);
+
+ # Checkbot session parameters
+ print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n";
+ print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n";
+ print OUT "<tr><th align=\"left\">--url &<br/> <command line urls></th><td class='text'>Start URL(s)</td><td class='value' id='url'>",
+ join(',', @starturls), "</td></tr>\n";
+ print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n";
+ print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
+ print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter;
+ print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy;
+ print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
+ print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress;
+ print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx';
+ print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual;
+ print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only;
+ print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies;
+ print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0);
+ print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n";
+ print OUT "</table>\n";
+
+ # Statistics for types of links
+
+ print OUT signature();
+
+ close(OUT);
+
+ rename($main::file, $main::file . ".bak");
+ rename($main::file . ".new", $main::file);
+
+ unlink $main::file . ".bak" unless $main::opt_debug;
+
+ output "*** Done writing result page";
+}
+
+# Create a list of all the servers, and create the corresponding table
+# and subpages. We use the servers overview for this. This can result
+# in strange effects when the same server (e.g. IP address) has
+# several names, because several entries will appear. However, when
+# using the IP address there are also a number of tricky situations,
+# e.g. with virtual hosting. Given that likely the servers have
+# different names for a reason, I think it is better to have
+# duplicate entries in some cases, instead of working off of the IP
+# addresses.
+
+sub printAllServers {
+ my ($finalPage) = @_;
+
+ my $server;
+ print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n";
+ print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n";
+
+ foreach $server (sort keys %main::servers) {
+ print_server($server, $finalPage);
+ }
+ print OUT "</table>\n\n";
+}
+
+sub get_server_type {
+ my($server) = @_;
+
+ my $result;
+
+ if ( ! defined($main::server_type{$server})) {
+ if ($server eq 'localhost') {
+ $result = 'Direct access through filesystem';
+ } else {
+ my $request = new HTTP::Request('HEAD', "http://$server/");
+ my $response = $main::ua->simple_request($request);
+ $result = $response->header('Server');
+ }
+ $result = "Unknown server type" if ! defined $result or $result eq "";
+ output "=== Server $server is a $result";
+ $main::server_type{$server} = $result;
+ }
+ $main::server_type{$server};
+}
+
+sub add_checked {
+ my($urlstr) = @_;
+ my $item;
+ my $result = 0;
+
+ if (is_internal($urlstr) and not $main::opt_enable_virtual) {
+ # Substitute hostname with IP-address. This keeps us from checking
+ # the same pages for each name of the server, wasting time & resources.
+ # Only do this if we are not dealing with virtual servers. Also, we
+ # only do this for internal servers, because it makes no sense for
+ # external links.
+ my $url = URI->new($urlstr);
+ $url->host(ip_address($url->host)) if $url->can('host');
+ $urlstr = $url->as_string;
+ }
+
+ if (defined $main::checked{$urlstr}) {
+ $result = 1;
+ $main::checked{$urlstr}++;
+ } else {
+ $main::checked{$urlstr} = 1;
+ }
+
+ return $result;
+}
+
+# Has this URL already been checked?
+sub is_checked {
+ my ($urlstr) = @_;
+
+ if (is_internal($urlstr) and not $main::opt_enable_virtual) {
+ # Substitute hostname with IP-address. This keeps us from checking
+ # the same pages for each name of the server, wasting time & resources.
+ # Only do this if we are not dealing with virtual servers. Also, we
+ # only do this for internal servers, because it makes no sense for
+ # external links.
+ my $url = URI->new($urlstr);
+ $url->host(ip_address($url->host)) if $url->can('host');
+ $urlstr = $url->as_string;
+ }
+
+ return defined $main::checked{$urlstr};
+}
+
+sub add_error ($$$$) {
+ my ($url, $urlparent, $code, $status) = @_;
+
+ # Check for the quick eliminations first
+ return if $code =~ /$main::opt_dontwarn/o
+ or defined $suppression{$code}{$url};
+
+  # Check for matches on the regular expressions in the suppression file
+ if (defined $suppression{$code}) {
+ foreach my $item ( %{$suppression{$code}} ) {
+ if ($item =~ /^\/(.*)\/$/) {
+ my $regexp = $1;
+ if ($url =~ $regexp) {
+ output "Supressing error $code for $url due to regular expression match on $regexp", 2;
+ return;
+ }
+ }
+ }
+ }
+
+ $status = checkbot_status_message($code) if not defined $status;
+
+ output "$code $status", 2;
+
+ $url_error{$url}{'code'} = $code;
+ $url_error{$url}{'status'} = $status;
+ push @{$url_parent{$url}}, $urlparent;
+ $stats{'problem'}++;
+}
+
+# Parse document, and get the links
+sub handle_doc {
+ my ($response, $urlstr) = @_;
+
+ my $num_links = 0;
+ my $new_links = 0;
+
+  # TODO: we are making an assumption here that the $response->base is
+ # valid, which might not always be true! This needs to be fixed, but
+ # first let's try to find out why this stuff is sometimes not
+ # valid... Aha. a simple <base href="news:"> will do the trick. It is
+ # not clear what the right fix for this is.
+
+ # We use the URL we used to retrieve this document as the URL to
+ # attach the problem reports to, even though this may not be the
+ # proper base url.
+ my $baseurl = URI->new($urlstr);
+
+ # When we received the document we can add a notch to its server
+ $main::servers{$baseurl->authority}++;
+
+ # Retrieve useful information from this document.
+ # TODO: using a regexp is NOT how this should be done, but it is
+ # easy. The right way would be to write a HTML::Parser or to use
+ # XPath on the document DOM provided that the document is easily
+ # parsed as XML. Either method is a lot of overhead.
+ if ($response->content =~ /title\>(.*?)\<\/title/si) {
+
+ # TODO: using a general hash that stores titles for all pages may
+ # consume too much memory. It would be better to only store the
+ # titles for requests that had problems. That requires passing them
+ # down to the queue. Take the easy way out for now.
+ $url_title{$baseurl} = $1;
+ }
+
+ # Check if this document has a Robots META tag. If so, check if
+ # Checkbot is allowed to FOLLOW the links on this page. Note that we
+ # ignore the INDEX directive because Checkbot is not an indexing
+ # robot. See http://www.robotstxt.org/wc/meta-user.html
+ # TODO: one more reason (see title) to properly parse this document...
+ if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) {
+ output "Obeying robots meta tag $&, skipping document", 2;
+ return;
+ }
+
+
+ # Parse the document just downloaded, using the base url as defined
+ # in the response, otherwise we won't get the same behavior as
+ # browsers and miss things like a BASE url in pages.
+ my $p = HTML::LinkExtor->new(undef, $response->base);
+
+ # If charset information is missing then decoded_content doesn't
+ # work. Fall back to content in this case, even though that may lead
+ # to charset warnings. See bug 1665075 for reference.
+ my $content = $response->decoded_content || $response->content;
+ $p->parse($content);
+ $p->eof;
+
+ # Deal with the links we found in this document
+ my @links = $p->links();
+ foreach (@links) {
+ my ($tag, %l) = @{$_};
+ foreach (keys %l) {
+ # Get the canonical URL, so we don't need to worry about base, case, etc.
+ my $url = $l{$_}->canonical;
+
+ # Remove fragments, if any
+ $url->fragment(undef);
+
+ # Determine in which tag this URL was found
+ # Ignore <base> tags because they need not point to a valid URL
+ # in order to work (e.g. when directory indexing is turned off).
+ next if $tag eq 'base';
+
+ # Skip some 'links' that are not required to link to an actual
+ # live link but which LinkExtor returns as links anyway.
+ next if $tag eq 'applet' and $_ eq 'code';
+ next if $tag eq 'object' and $_ eq 'classid';
+
+ # Run filter on the URL if defined
+ if (defined $main::opt_filter) {
+ die "Filter supplied with --filter option contains errors!\n$@\n"
+ unless defined eval '$url =~ s' . $main::opt_filter
+ }
+
+ # Should we ignore this URL?
+ if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) {
+ output "--ignore: $url", 1;
+ next;
+ }
+
+ # Check whether URL has fully-qualified hostname
+ if ($url->can('host') and $url->scheme ne 'news') {
+ if (! defined $url->host) {
+ add_error($url, $baseurl->as_string, '901',
+ $checkbot_errors{'901'});
+ } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) {
+ add_error($url, $baseurl->as_string, '902',
+ $checkbot_errors{'902'});
+ }
+ }
+
+ # Some servers do not process // correctly in requests for relative
+ # URLs. We should flag them here. Note that // in a URL path is
+ # actually valid per RFC 2396, and that they should not be removed
+ # when processing relative URLs as per RFC 1808. See
+ # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>.
+ # Thanks to Randal Schwartz and Reinier Post for their explanations.
+ if ($url =~ /^http:\/\/.*\/\//) {
+ add_error($url, $baseurl->as_string, '903',
+ $checkbot_errors{'903'});
+ }
+
+ # We add all URLs found to the queue, unless we already checked
+ # it earlier
+ if (is_checked($url)) {
+
+ # If an error has already been logged for this URL we add the
+ # current parent to the list of parents on which this URL
+ # appears.
+ if (defined $url_error{$url}) {
+ push @{$url_parent{$url}}, $baseurl->as_string;
+ $stats{'problem'}++;
+ }
+
+ $stats{'link'}++;
+ } else {
+ add_to_queue($url, $baseurl);
+ $stats{'todo'}++;
+ $new_links++;
+ }
+ $num_links++;
+ }
+ }
+ output "Got $num_links links ($new_links new) from document", 2;
+}
+
+
+sub add_to_queue {
+ my ($url, $parent) = @_;
+
+ print QUEUE $url . '|' . $parent . "\n";
+}
+
+sub checkbot_status_message ($) {
+ my ($code) = @_;
+
+ my $result = status_message($code) || $checkbot_errors{$code}
+ || '(Undefined status)';
+}
+
+sub print_server ($$) {
+ my($server, $final_page) = @_;
+
+ my $host = $server;
+ $host =~ s/(.*):\d+/$1/;
+
+ output "Writing server $server (really " . ip_address($host) . ")", 1;
+
+ my $server_problem = count_problems($server);
+ my $filename = "$main::server_prefix-$server.html";
+ $filename =~ s/:/-/o;
+
+ print OUT "<tr><td class='server'>";
+ print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0;
+ print OUT "$server";
+ print OUT "</a>" if $server_problem > 0;
+ print OUT "</td>";
+ print OUT "<td class='servertype'>" . get_server_type($server) . "</td>";
+ printf OUT "<td class='unique' align=\"right\">%d</td>",
+ $main::servers{$server} + $server_problem;
+ if ($server_problem) {
+ printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>",
+ $server_problem;
+ } else {
+ printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>",
+ $server_problem;
+ }
+
+ my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100;
+ print OUT "<td class='ratio' align=\"right\">";
+ print OUT "<strong>" unless $ratio < 0.5;
+ printf OUT "%4d%%", $ratio;
+ print OUT "</strong>" unless $ratio < 0.5;
+ print OUT "</td>";
+ print OUT "</tr>\n";
+
+ # Create this server file
+ open(SERVER, ">$filename")
+ || die "Unable to open server file $filename for writing: $!";
+ print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
+ print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
+ print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
+ print SERVER "<head>\n";
+ if (!$final_page) {
+ printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
+ int($main::cp_int * 60 / 2 - 5);
+ }
+ print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
+ print SERVER "<title>Checkbot: output for server $server</title></head>\n";
+ print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n";
+ print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>";
+
+ printServerProblems($server, $final_page);
+
+ print SERVER "\n";
+ print SERVER signature();
+
+ close SERVER;
+}
+
+# Return a string containing Checkbot's signature for HTML pages
+sub signature {
+ return "<hr />\n<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n".
+ "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>".
+ "</body></html>";
+}
+
+# Loop through all possible problems, select relevant ones for this server
+# and display them in a meaningful way.
+sub printServerProblems ($$) {
+ my ($server, $final_page) = @_;
+ $server = quotemeta $server;
+
+ my $separator = "<hr />\n";
+
+ my %thisServerList = ();
+
+ # First we find all the problems for this particular server
+ foreach my $url (keys %url_parent) {
+ foreach my $parent (@{$url_parent{$url}}) {
+ next if $parent !~ $server;
+ chomp $parent;
+ $thisServerList{$url_error{$url}{'code'}}{$parent}{$url}
+ = $url_error{$url}{'status'};
+ }
+ }
+
+ # Do a run to find all error codes on this page, and include a table
+ # of contents to the actual report
+ foreach my $code (sort keys %thisServerList) {
+ print SERVER ", <a href=\"#rc$code\">$code ";
+ print SERVER checkbot_status_message($code);
+ print SERVER "</a>";
+ }
+ print SERVER ".</p>\n";
+
+
+ # Now run through this list and print the errors
+ foreach my $code (sort keys %thisServerList) {
+ my $codeOut = '';
+
+ foreach my $parent (sort keys %{ $thisServerList{$code} }) {
+ my $urlOut = '';
+ foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) {
+ my $status = $thisServerList{$code}{$parent}{$url};
+ $urlOut .= "<li><a href=\"$url\">$url</a><br/>\n";
+ $urlOut .= "$status"
+ if defined $status and $status ne checkbot_status_message($code);
+ $urlOut .= "</li>\n";
+ }
+ if ($urlOut ne '') {
+ $codeOut .= "<dt><a href=\"$parent\">$parent</a>";
+ $codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent};
+ $codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n";
+ }
+ }
+
+ if ($codeOut ne '') {
+ print SERVER $separator if $separator;
+ $separator = '';
+ print SERVER "<h4 id=\"rc$code\">$code ";
+ print SERVER checkbot_status_message($code);
+ print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n";
+ }
+ }
+}
+
+sub check_point {
+ if ( ($main::cp_last + 60 * $main::cp_int < time())
+ || ($main::opt_debug && $main::opt_verbose)) {
+ &create_page(0);
+ $main::cp_last = time();
+    # Increase the interval from one snapshot to the next by 25%
+ # until we have reached the maximum.
+ $main::cp_int *= 1.25 unless $main::opt_debug;
+ $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval;
+ }
+}
+
+sub send_mail {
+ my $msg = new Mail::Send;
+ my $sub = 'Checkbot results for ';
+ $sub .= join(', ', @starturls);
+ $sub .= ': ' . $stats{'problem'} . ' errors';
+
+ $msg->to($main::opt_mailto);
+ $msg->subject($sub);
+
+ my $fh = $msg->open;
+
+ print $fh "Checkbot results for:\n " . join("\n ", @starturls) . "\n\n";
+ print $fh "User-supplied note: $main::opt_note\n\n"
+ if defined $main::opt_note;
+
+ print $fh $stats{'link'}, " links were checked, and ";
+ print $fh $stats{'problem'}, " problems were detected.\n";
+
+ print $fh 'Run started on ' . localtime($start_time) . "\n";
+ print $fh 'Run duration ', duration(time() - $start_time), "\n"
+ if $main::useduration;
+
+
+ print $fh "\n-- \nCheckbot $VERSION\n";
+ print $fh "<URL:http://degraaff.org/checkbot/>\n";
+
+ $fh->close;
+}
+
+sub print_help {
+ print <<"__EOT__";
+Checkbot $VERSION command line options:
+
+ --cookies Accept cookies from the server
+ --debug Debugging mode: No pauses, stop after 25 links.
+ --file file Use file as basis for output file names.
+ --help Provide this message.
+ --mailto address Mail brief synopsis to address when done.
+ --noproxy domains Do not proxy requests to given domains.
+ --verbose Verbose mode: display many messages about progress.
+ --url url Start URL
+ --match match Check pages only if URL matches `match'
+ If no match is given, the start URL is used as a match
+ --exclude exclude Exclude pages if the URL matches 'exclude'
+ --filter regexp Run regexp on each URL found
+ --ignore ignore Ignore URLs matching 'ignore'
+ --suppress file Use contents of 'file' to suppress errors in output
+ --note note Include Note (e.g. URL to report) along with Mail message.
+ --proxy URL URL of proxy server for HTTP and FTP requests.
+ --internal-only Only check internal links, skip checking external links.
+ --sleep seconds Sleep this many seconds between requests (default 0)
+ --style url Reference the style sheet at this URL.
+ --timeout seconds Timeout for http requests in seconds (default 120)
+ --interval seconds Maximum time interval between updates (default 10800)
+ --dontwarn codes Do not write warnings for these HTTP response codes
+ --enable-virtual Use only virtual names, not IP numbers for servers
+ --language Specify 2-letter language code for language negotiation
+
+Options --match, --exclude, and --ignore can take a perl regular expression
+as their argument\n
+Use 'perldoc checkbot' for more verbose documentation.
+Checkbot WWW page : http://degraaff.org/checkbot/
+Mail bugs and problems: checkbot\@degraaff.org
+__EOT__
+
+ exit 0;
+}
+
+sub ip_address {
+ my($host) = @_;
+
+ return $main::ip_cache{$host} if defined $main::ip_cache{$host};
+
+ my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host);
+ if (defined $addrs[0]) {
+ my($n1,$n2,$n3,$n4) = unpack ('C4',$addrs[0]);
+ $main::ip_cache{$host} = "$n1.$n2.$n3.$n4";
+ } else {
+ # Whee! No IP-address found for this host. Just keep whatever we
+ # got for the host. If this really is some kind of error it will
+ # be found later on.
+ $main::ip_cache{$host} = $host;
+ }
+}
+
+sub count_problems {
+ my ($server) = @_;
+ $server = quotemeta $server;
+ my $count = 0;
+
+ foreach my $url (sort keys %url_parent) {
+ foreach my $parent (@{ $url_parent{$url} }) {
+ $count++ if $parent =~ m/$server/;
+ }
+ }
+ return $count;
+}
+