Listing 1 Perl-based bookmark validator
=1= #!/usr/bin/perl
=2= use strict;
=3= use warnings;
=4=
=5= use HTML::Parser;
=6= use LWP::Parallel::UserAgent;
=7= use HTTP::Request::Common;
=8=
=9= $^I = "~";
=10= @ARGV = "-" unless @ARGV; # act as filter if no names specified
=11= while (@ARGV) {
=12= $_ = do { local $/; <> };
=13=
=14= my $urls = extract_links($_);
=15=
=16= validate_links($urls);
=17=
=18= rewrite_html($_, $urls);
=19= }
=20=
=21= sub extract_links {
=22= my $html = shift;
=23=
=24= my %urls;
=25=
=26= my $p = HTML::Parser->new
=27= (
=28= start_h =>
=29= [sub {
=30= my ($tagname, $attr) = @_;
=31= return unless $tagname eq "a" and my $href = $attr->{href};
=32= $urls{$href} = "";
=33= }, "tagname, attr"],
=34= ) or die;
=35=
=36= $p->parse($html);
=37= $p->eof;
=38=
=39= return \%urls;
=40= }
=41=
=42= sub validate_links {
=43= my $urls = shift; # hashref
=44=
=45= my $pua = LWP::Parallel::UserAgent->new(max_size => 1);
=46=
=47= while (my ($url) = each %$urls) {
=48= $pua->register(GET $url);
=49= }
=50=
=51= for my $entry (values %{$pua->wait(30)}) {
=52= my $url = $entry->request->url;
=53= my $success = $entry->response->is_success;
=54= warn +($urls->{$url} = $success ? "LIVE" : "DEAD"), ": $url\n";
=55= }
=56=
=57= # return void
=58= }
=59=
=60= sub rewrite_html {
=61= my $html = shift;
=62= my $urls = shift; # hashref
=63=
=64= my $dead = 0; # mark the next text as "DEAD -"
=65=
=66= my $p = HTML::Parser->new
=67= (
=68= start_h =>
=69= [sub {
=70= my ($text, $tagname, $attr) = @_;
=71= if ($tagname eq "a" and my $href = $attr->{href}) {
=72= $dead = 1 if $urls->{$href} eq "DEAD";
=73= }
=74= print $text;
=75= }, "text, tagname, attr"],
=76= text_h =>
=77= [sub {
=78= my ($text) = @_;
=79= if ($dead) {
=80= $text = "DEAD - $text" unless $text =~ /DEAD -/;
=81= $dead = 0;
=82= }
=83= print $text;
=84= }, "text"],
=85= default_h =>
=86= [sub { print shift }, 'text'],
=87= ) or die;
=88=
=89= $p->parse($html);
=90= $p->eof;
=91= # return void
=92= }
|