Listing 4.
wwwkan.pl
Tuomas J. Lukka
Learning Japanese
The Perl Journal, Summer 1998
 
#!/usr/bin/perl
#
# wwwkan1.pl - translate kanji or compounds in Japanese HTML.
# Copyright (C) 1997,1998 Tuomas J. Lukka. All rights reserved.

# Directory to the kanji dictionary database
$libdir = "/my/home/dir/japanese_files/";

# The url of this CGI-script, for mangling the links on the page
$my_url = "http://komodo.media.mit.edu/~tjl/cgi-bin/wwwkan1.cgi";

# Link types to substitute.
# 0 = absolute, 1 = relative.
%links = (a => ['href', 1], img => ['src', 0], 
          form => ['action', 1], link => ['href', 1], 
          frame => ['src', 1]);
                
# ---- main program

use CGI;
use LWP::Simple;
use HTML::Parse;
use URI::URL;
use Fcntl;
use AnyDBM_File;

tie %kanji, AnyDBM_File, "$libdir/kanji.dbmx", O_RDONLY, 0;

$query = new CGI;

print $query->header, "CONVERTED By TJL's kanji explainer on ",
      'date', '. Mail comments to lukka@fas.harvard.edu.<P>',
      $query->startform(), "<b>Go To:</b> ",
      $query->textfield(-name => 'url',
            -default => 'http://www.yahoo.co.jp/', -size => 50),
      $query->submit('Action','Doit'), 
      $query->endform, "<HR>\n";

# Get the original document from the net.
$url = $query->param('url');
$doc = get $url;

# Substitute web addresses so that text documents are fetched with
# this script and pictures are fetched directly.
$h = parse_html($doc);
$h->traverse(
    sub {
        my($e, $start) = @_;
        return 1 unless $start;
        my $attr = $links{lc $e->tag} or return 1;
        my $url = $e->attr($attr->[0]) or return 1;
        $e->attr($attr->[0], ($attr->[1] ?
                       getlink($url) : abslink($url)));
},
1);
$doc = $h->as_HTML;

# Substitute kanji for English
for ( split "\n", $doc ) {
    s/((?:[\x80-\xFF][\x40-\xFF])+)/explainstr($1)/ge;
    print;
}
exit;

# SUBROUTINES

# Make an absolute URL from a relative URL in the original document
sub abslink {
    return (new URI::URL($_[0]))->abs($url)->as_string;
}

# Make a new URL which gets a document through our translation service.
sub getlink {
    my $url_to = (new URI::URL($_[0]))->abs($url);
    my $proxy_url = new URI::URL($my_url);
    $proxy_url->query_form(url => $url_to->as_string);
    return $proxy_url->as_string;
}

# Insert explanations into a string of kanji
sub explainstr {
    my $str = @_;
    my $res = "";
    my ($pos, $mlen, $s);
    for ( $pos = 0; $pos < length($str); $pos += $mlen ) {
        my $expl;
        $mlen = 20;
        while (!defined($expl = $kanji{$s=(substr(($str),$pos,$mlen))})
                 and $mlen > 2) {
            $mlen -= 2;
        }
        $res .= $s;
        if (defined $expl) {
            $res .= " <small><[[[".($expl)."]]]></small> ";
       }
    }
    return $res;
}