#!/usr/bin/perl -w # # unbbc: # Undo the BBC News web site's graphic `design'. # # This is full of horrid heuristics, because there are several sorts of pages # on the BBC News web site, and we need to deal properly with all of them. It # is not guaranteed that this program will emit correct HTML, but it's good # enough for real browsers. # # Copyright (c) 2002 Chris Lightfoot. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or, at your # option, any later version. # # You can read the license at # http://ex-parrot.com/~chris/GPL.txt # # Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/ # use strict; my $rcsid = ''; $rcsid .= '$Id: unbbc,v 1.1 2002/09/03 18:53:26 chris Exp $'; use HTML::Parser; use HTML::Tagset; use HTML::Entities; use LWP::Simple; use URI::URL; use CGI::Fast; # Address to use via mod_rewrite. my $myname = "http://www.cold.local/unbbc"; while (my $q = new CGI::Fast) { my $myurl = $q->url(-absolute); $myname ||= "$myurl?url="; my $url = $q->param("url"); $url ||= "/1/hi.html"; $url = "http://news.bbc.co.uk$url"; my $content = get($url); if (!$content) { print < 1 } qw(table font link div span iframe script applet style small large)); my %discard_attrs = map { $_ => 1 } qw(style color bgcolor background face class link alink vlink text topmargin leftmargin marginheight marginwidth); my $boolean_attribute_marker = "__boolean__attribute__"; my $do_start_tag = sub ($$) { my ($Tagname, $Attr) = @_; my $tagname = lc $Tagname; my $attr = { }; $attr->{lc $_} = $Attr->{$_} foreach (keys %$Attr); ++$in_style if ($tagname eq 'style'); ++$in_script if ($tagname eq 'script'); ++$in_applet if ($tagname eq 'applet'); ++$in_iframe if ($tagname eq 'iframe'); my @reconstruct_attrs; # For a non-front-page story, the content begins at the date. if (!$is_front_page) { $in_content = 1 if ($tagname eq 'span' and exists $attr->{class} and lc $attr->{class} eq 'date'); } foreach (keys %$attr) { unless ($discard_attrs{lc $_} or ($tagname ne 'img' and $_ =~ /^width|height$/)) { my $x = $attr->{$_}; if ($x eq $boolean_attribute_marker) { push(@reconstruct_attrs, $_); } else { if ($tagname eq 'a' and $_ eq 'href') { # Rewrite URLs via this script, if they are within the # BBC. if ($x =~ m#^javascript:popupas\(['"](.+)['"]\)$#i) { $x = $1; push(@reconstruct_attrs, 'target="_new"'); } unless ($x =~ /^javascript:/) { my $newurl = url($x, $url)->abs->as_string; $newurl =~ s#^http://news.bbc.co.uk#$myname#; $x = $newurl; } } encode_entities($x); push(@reconstruct_attrs, qq($_="$x")); } } } if ($discard_tags{$tagname}) { # Don't care about this tag at all. return; } elsif ($tagname eq 'img') { # Only like images which actually represent something. return if ((exists $attr->{width} and $attr->{width} < 16) or (exists $attr->{height} and $attr->{height} < 16)); } if (!$in_script and !$in_applet and !$in_style) { return if ($in_body and !$in_content); if ($tagname eq 'img' and $attr->{src} =~ /startquote\.gif$/) { $result .= "
"; } elsif ($tagname eq 'img' and $attr->{src} =~ /endquote\.gif$/) { $result .= "
"; } else { $result .= "<" . join(" ", ($tagname, @reconstruct_attrs)) . ">"; $result .= '' if ($tagname eq 'html'); $result .= qq(

Oooh, this isn't working, give me the real page.

) if ($tagname eq 'body'); } } ++$in_body if ($tagname eq 'body'); }; my $do_end_tag = sub ($$) { my ($Tagname) = @_; my $tagname = lc $Tagname; --$in_body if ($tagname eq 'body'); --$in_style if ($tagname eq 'style'); --$in_script if ($tagname eq 'script'); --$in_applet if ($tagname eq 'applet'); --$in_iframe if ($tagname eq 'iframe'); return if ($in_body and !$in_content); # Structure of tables. $result .= "
" if ($tagname eq 'tr'); $result .= " " if ($tagname =~ /^td|div$/); return if ($discard_tags{$tagname}); $result .= ""; }; my $do_text = sub ($) { my ($text) = @_; return if ($in_body and !$in_content); $text =~ s/ //g; if (!$in_script and !$in_applet and !$in_style) { $result .= $_[0]; } }; my $do_comment = sub ($) { my ($text) = @_; if ($is_front_page) { # For a front page story, the content is delimeted by comments. $in_content = 1 if (!$in_content and $text =~ /FrontPageStoryBody/); $in_content = 0 if ($in_content and $text =~ /UKINewsPuffMegaInclude/); # (!) } $in_content = 1 if ($text =~ /\[start\]--\(\( Content Area \)\)/); $in_content = 0 if ($in_content and $text =~ /black line row/); if (!$in_script and !$in_applet) { $result .= $text; } }; my $parser = new HTML::Parser(api_version => 3, start_h => [ $do_start_tag, 'tagname, attr' ], end_h => [ $do_end_tag, 'tagname' ], text_h => [ $do_text, 'text' ], comment_h => [ $do_comment, 'text' ]); $parser->xml_mode(1); # because of $parser->parse($content); $parser->eof(); my $l = length($result); print "Content-Length: $l\r\nContent-Type: text/html\r\n\r\n$result"; # that's all... }