#!/usr/bin/perl # Copyright Iain Georgeson 2003 # This program is free software; you may redistribute it and/or # modify it under the same terms as Perl itself. use subs qw(main handle_comment handle_entries handle_link handle_text process_chunk show_issues); use strict; use warnings; use Data::Dumper; use HTML::LinkExtor; use Lingua::Ispell qw(spellcheck); use LWP::Parallel::UserAgent qw(C_ENDALL C_ENDCON); $Data::Dumper::Indent = 1; $Lingua::Ispell::path = '/usr/bin/ispell'; my %spell; my %parsers; my %url_map; my %issues; my $ua; my $start_url = shift; my $do_spelling = 1; sub main { $start_url or die "Where do I start?\n"; $ua = new LWP::Parallel::UserAgent (env_proxy => 1, redirect => 0); $ua->max_hosts(5); $ua->max_req(5); $ua->register(new HTTP::Request(GET => $start_url), \&process_chunk, 4096); my $entries = $ua->wait(5); # Go go gadget LWP! print "\n"; my $issues = handle_entries($entries); show_issues($issues); } sub show_issues { my $issues = shift; for my $page (sort keys %$issues) { print "$page:\n", map " $_\n" => @{$issues->{$page}}; } } sub handle_entries { my $entries = shift; for my $entry (keys %$entries) { $entries->{$entry}{response}->code eq 200 and next; for my $from_url (@{$url_map{$entries->{$entry}{response}->base->canonical}}) { push @{$issues{$from_url}}, join "" => $entries->{$entry}{response}->base->canonical, ": ", $entries->{$entry}{response}->code, " ", $entries->{$entry}{response}->message } } $do_spelling or return \%issues; for my $page (keys %spell) { for my $localword (@{$spell{$page}{localwords}}) { delete $spell{$page}{errors}{$localword}; } for my $error (sort keys %{$spell{$page}{errors}}) { push @{$issues{$page}}, "$spell{$page}{errors}{$error} x $error". (exists $spell{$page}{guesses}{$error} ? ' ['. join(' ' => @{$spell{$page}{guesses}{$error}}). ']' : ""); } } return \%issues; } sub process_chunk { my ($data, $response, $protocol) = @_; $response->content_type eq 'text/html' or return C_ENDCON; unless($parsers{$response}) { print STDERR "[". $response->base->canonical. "]\n"; $parsers{$response} = new HTML::LinkExtor(sub {handle_link ($response, @_)}, $response->base->canonical); if($response->base->canonical =~ /^$start_url/) { $parsers{$response}->handler (comment => sub {handle_comment ($response, @_)}, 'text'); $parsers{$response}->handler (text => sub {handle_text ($response, @_)}, 'dtext'); } } $parsers{$response}->parse($data); return undef; } sub handle_comment { my ($response, $text) = @_; if($text =~ s/$/$1/s) { my @hitop_warnings = split /\n/ => $text; pop @hitop_warnings; # Drop the "\d microseconds" line @hitop_warnings or return; push @{$issues{$response->base->canonical}}, @hitop_warnings; } if($text =~ s/$/$1/s) { push @{$spell{$response->base->canonical}{localwords}} => split /\s+/ => $1; } } sub handle_text { my ($response, $text) = @_; my @results; for my $line (split /\n/ => $text) { for my $error (spellcheck $line) { $spell{$response->base->canonical}{errors}{$error->{term}}++; exists $error->{guesses} and push @{$spell{$response->base->canonical}{guesses}{$error->{term}}} => @{$error->{guesses}}; exists $error->{misses} and push @{$spell{$response->base->canonical}{guesses}{$error->{term}}} => @{$error->{misses}}; } } } sub handle_link { my ($response, $tag, $attr, $url) = @_; my $from_url = $response->base->canonical; my $to_url = $url->canonical; $from_url =~ /^$start_url/ or return; $to_url =~ /^mailto:/ and return; my $seen_url = exists $url_map{$to_url}; push @{$url_map{$to_url}}, "$from_url"; $seen_url and return; $ua->register (new HTTP::Request(GET => $url), \&process_chunk) and die "Can't register it"; } main