Article Listing 1 jan2005.tar

Listing 1 Perl-based bookmark validator

=1=    #!/usr/bin/perl
=2=    use strict;
=3=    use warnings;
=4=    
=5=    use HTML::Parser;
=6=    use LWP::Parallel::UserAgent;
=7=    use HTTP::Request::Common;
=8=    
=9=    $^I = "~";
=10=    @ARGV = "-" unless @ARGV;       # act as filter if no names specified
=11=    while (@ARGV) {
=12=      $_ = do { local $/; <> };
=13=    
=14=      my $urls = extract_links($_);
=15=    
=16=      validate_links($urls);
=17=    
=18=      rewrite_html($_, $urls);
=19=    }
=20=    
=21=    sub extract_links {
=22=      my $html = shift;
=23=    
=24=      my %urls;
=25=    
=26=      my $p = HTML::Parser->new
=27=        (
=28=         start_h =>
=29=         [sub {
=30=            my ($tagname, $attr) = @_;
=31=            return unless $tagname eq "a" and my $href = $attr->{href};
=32=            $urls{$href} = "";
=33=          }, "tagname, attr"],
=34=        ) or die;
=35=    
=36=      $p->parse($html);
=37=      $p->eof;
=38=    
=39=      return \%urls;
=40=    }
=41=    
=42=    sub validate_links {
=43=      my $urls = shift;             # hashref
=44=    
=45=      my $pua = LWP::Parallel::UserAgent->new(max_size => 1);
=46=    
=47=      while (my ($url) = each %$urls) {
=48=        $pua->register(GET $url);
=49=      }
=50=    
=51=      for my $entry (values %{$pua->wait(30)}) {
=52=        my $url = $entry->request->url;
=53=        my $success = $entry->response->is_success;
=54=        warn +($urls->{$url} = $success ? "LIVE" : "DEAD"), ": $url\n";
=55=      }
=56=    
=57=      # return void
=58=    }
=59=    
=60=    sub rewrite_html {
=61=      my $html = shift;
=62=      my $urls = shift;             # hashref
=63=    
=64=      my $dead = 0;                 # mark the next text as "DEAD -"
=65=    
=66=      my $p = HTML::Parser->new
=67=        (
=68=         start_h =>
=69=         [sub {
=70=            my ($text, $tagname, $attr) = @_;
=71=            if ($tagname eq "a" and my $href = $attr->{href}) {
=72=              $dead = 1 if $urls->{$href} eq "DEAD";
=73=            }
=74=            print $text;
=75=          }, "text, tagname, attr"],
=76=         text_h =>
=77=         [sub {
=78=            my ($text) = @_;
=79=            if ($dead) {
=80=              $text = "DEAD - $text" unless $text =~ /DEAD -/;
=81=              $dead = 0;
=82=            }
=83=            print $text;
=84=          }, "text"],
=85=         default_h =>
=86=         [sub { print shift }, 'text'],
=87=        ) or die;
=88=    
=89=      $p->parse($html);
=90=      $p->eof;
=91=      # return void
=92=    }