From: Tom Christiansen <tchrist@mox.perl.com>
Subject: Seizing the Bull by the Horns
Date: 11 May 1999 11:01:46 -0700
I suppose that you thought the posting I'm following up to was bereft of
Perl content. Not so! It was just to whet your appetites. Here's a
program written by Abigail (who may in fact have a newer version) that
filters out much of the bull described in the parent of this article.
It's a personal proxy server. You'll have to go into proxy.pl and
edit all the places that say FIXME to reflect your local situation.
You might have a look into the config file, too.
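If it helps, the FIXME spots are the "use lib" line, the $Config path,
and the $ME/$PROXY settings near the top of proxy.pl; after editing they
might look something like this (the paths and hostnames below are
placeholders, not working values):

    use lib qw (/home/yourname/lib);           # where Proxy/*.pm live
    my $Config = "/home/yourname/lib/Proxy/Config.pm";
    my $ME     = "yourbox.example.com";        # the machine the proxy runs on
    my $PROXY  = "upstream.example.com";       # upstream proxy, if you have one

Then start it on a port above 1023 (it defaults to 8888) and point your
browser's HTTP proxy setting at that host and port.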
--tom
PS: Idiots who use <BR><BR> to mean <P> end up looking worse, but that's
their own dumb fault.
#!/bin/sh
# This is a shell archive (produced by GNU sharutils 4.2).
# To extract the files from this archive, save it to some FILE, remove
# everything before the `!/bin/sh' line above, then type `sh FILE'.
#
# Made on 1999-05-11 10:54 MDT by <tchrist@jhereg.perl.com>.
# Source directory was `/tmp'.
#
# Existing files will *not* be overwritten unless `-c' is specified.
# This format requires very little intelligence at unshar time.
# "if test", "echo", "mkdir", and "sed" may be needed.
#
# This shar contains:
# length mode name
# ------ ---------- ------------------------------------------
# 12073 -rw-r--r-- abiprox/Proxy/HTML.pm
# 21497 -rw-r--r-- abiprox/Proxy/Config.pm
# 7286 -rwx------ abiprox/proxy.pl
#
echo=echo
first_param="$1"
if mkdir _sh22768; then
$echo 'x -' 'creating lock directory'
else
$echo 'failed to create lock directory'
exit 1
fi
# ============= abiprox/Proxy/HTML.pm ==============
if test ! -d 'abiprox'; then
$echo 'x -' 'creating directory' 'abiprox'
mkdir 'abiprox'
fi
if test ! -d 'abiprox/Proxy'; then
$echo 'x -' 'creating directory' 'abiprox/Proxy'
mkdir 'abiprox/Proxy'
fi
if test -f 'abiprox/Proxy/HTML.pm' && test "$first_param" != -c; then
$echo 'x -' SKIPPING 'abiprox/Proxy/HTML.pm' '(file already exists)'
else
$echo 'x -' extracting 'abiprox/Proxy/HTML.pm' '(text)'
sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/HTML.pm' &&
Xpackage Proxy::HTML; # I guess this is a bad name....
X
X#
X# $Id: HTML.pm,v 1.7 1998/02/01 15:23:36 abigail Exp abigail $
X#
X# $Log: HTML.pm,v $
X# Revision 1.7 1998/02/01 15:23:36 abigail
X# Added </table> recovery strategy.
X#
X# Revision 1.6 1997/11/23 05:50:37 abigail
X# Change structure of no_attributes and no_tags datastructures.
X# They are now arrays of tags or tag/attribute pairs, so we can
X# have site specific *and* general configuration.
X# This gets rid of the map inside map inside map expression.
X#
X# Revision 1.5 1997/11/03 17:29:43 abigail
X# Many changes, including support for not showing images based on the
X# src attribute, showing an image's alt attribute, and deleting entire
X# elements.
X#
X# Revision 1.4 1997/10/20 04:45:02 abigail
X# All HTML stuff is now site configurable.
X# Matching for site is done in new(), not for all tags.
X# Building the page is delayed till the end, keeping all tags and
X# text fragments in a stack. This allows for going back and deleting
X# stuff. (Yet to be done).
X#
X# Revision 1.3 1997/10/15 00:24:11 abigail
X# On the fly configuration.
X# Host dependent image selection.
X#
X# Revision 1.2 1997/08/24 05:15:11 abigail
X# Configurable ignoring tags/attributes. Either unconditionally,
X# or by looking at the attribute value.
X# Rewriting <img> tags if images are not wanted.
X#
X# Revision 1.1 1997/08/23 09:32:05 abigail
X# Initial revision
X#
X#
X#
X# Strip nasty tags/attributes from HTML text.
X
Xuse strict;
Xuse Carp;
X
Xuse Data::Dumper;
X
Xuse vars qw /@ISA/;
Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $NO_TAG_BY_ATTR $TEXT $REPLACE
X $ALT $INLINE $NO_ELEMENTS $NETSCAPE_COMMENT/;
X
Xmy $COMM_ONLY = 1;
Xmy $TEXT_ONLY = 2;
X
Xuse HTML::Parser;
X
X@ISA = qw /HTML::Parser/;
X
Xsub deal_with_img ($$);
X
X
X# We mask new to be able to get the inline status.
Xsub new ($$) {
X my $class = shift;
X my $self = new HTML::Parser;
X bless $self, $class;
X}
X
X
X# To do interesting things, we keep track of the parsed and modified
X# parts of the HTML text on a stack of tag and text fragments. The
X# stack is (re)initialized by the masked parse ().
X
Xsub parse ($@) {
X my $self = shift;
X $self -> {response} = shift;
X my $host =
X $self -> {host} = $self -> {response} -> base -> host;
X my $url =
X $self -> {url} = $self -> {response} -> base -> as_string;
X $self -> {content} = $self -> {response} -> content;
X $self -> {last_tag} = 'NIL';
X
X # Configuration is site specific. Do all the host matching
X # stuff here.
X
X # Match the URL with all entries in $IMG, remember which
X # ones matched.
X $self -> {src_matches} = [map {@{$_ -> [1]}}
X grep {$url =~ /$_->[0]/;} @$IMG];
X
X # Match the URL with all entries in $ALT, remember which
X # ones matched.
X $self -> {alts} = [map {@{$_ -> [1]}}
X grep {$url =~ /$_->[0]/;} @$ALT];
X
X # The coderefs for the no attributes.
X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ATTR) {
X foreach my $elem (@{$site -> [1]}) {
X foreach my $attr (@{$elem -> [1]}) {
X push @{$self -> {no_attr} -> {$elem -> [0]} -> {$attr -> [0]}},
X $attr -> [1];
X }
X }
X }
X
X # The no elements structure.
X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ELEMENTS) {
X foreach my $tag (@{$site -> [1]}) {
X push @{$self -> {skips} -> {$tag -> [0]}}, $tag -> [1];
X }
X }
X
X # The no tags structures.
X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_TAGS) {
X foreach my $tag (@{$site -> [1]}) {
X push @{$self -> {no_tags} -> {$tag -> [0]}}, $tag -> [1];
X }
X }
X
X # The information for the text/comment filters.
X $self -> {text_info} = [map {@{$_ -> [1]};}
X grep {$url =~ /$_->[0]/;} @$TEXT];
X
X # The information for text replacement.
X $self -> {replace} = [map {@{$_ -> [1]};}
X grep {$url =~ /$_->[0]/;} @$REPLACE];
X
X $self -> netscape_buggy_comment ($NETSCAPE_COMMENT); # Silly netscape.
X
X # Flag to indicate whether white space collapses or not.
X $self -> {pre} = 0;
X
X # Flag to indicate whether we are in 'skip' mode or not.
X $self -> {skip} = 0;
X $self -> {end_skip} = undef;
X $self -> {skip_type} = undef;
X
X $self -> {skip_element} = '';
X $self -> {skip_element_count} = 0;
X
X # Initialize the stack.
X $self -> s_init;
X
X $self -> SUPER::parse ($self -> {content});
X}
X
X# Return the modified text.
Xsub parsed_text ($) {
X my $self = shift;
X $self -> build_page;
X}
X
X# Now we mask the various "call back" functions.
X
X# Declarations are just copied. (But what about comments?)
Xsub declaration ($$) {
X my $self = shift;
X my $decl = shift;
X return if $self -> {skip};
X $self -> s_push (["DECL", $decl]);
X}
X
X# This is the interesting function.
Xsub start ($$$$$) {
X my $self = shift;
X return if $self -> {skip};
X
X my ($tag, $attributes, $attrseq, $origtext) = @_;
X
X # If the tag is the same as the element we are currently
X # skipping, we need to increment the nesting counter.
X if ($tag eq $self -> {skip_element}) {
X $self -> {skip_element_count} ++;
X return;
X }
X
X # my $skip_sub = $self -> {skips} -> {$tag};
X foreach my $skip_sub (@{$self -> {skips} -> {$tag} || []}) {
X if ($skip_sub && $skip_sub -> ($tag, $attributes, $self)) {
X # Go into element skipping mode.
X $self -> {skip} = 1;
X $self -> {skip_element} = $tag;
X $self -> {skip_element_count} = 1;
X return;
X }
X }
X
X foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) {
X return if $sub && $sub -> ($tag, $attributes, $self, 0);
X }
X
X # Image is special.
X if ($tag eq 'img') {
X return if $self -> no_image ($attributes -> {src});
X my $alt = $self -> img_to_alt ($attributes);
X if ($alt) {
X $self -> {last_tag} = 'NIL';
X $self -> s_push (["TEXT", $alt]);
X return;
X }
X }
X
X my $no_attr = $self -> {no_attr} -> {$tag};
X if ($no_attr) {
X foreach my $attribute (keys %$attributes) {
X foreach my $sub (@{$no_attr -> {$attribute} || []}) {
X delete $attributes -> {$attribute} if
X $sub -> ($attributes -> {$attribute}, $self);
X }
X }
X }
X
X $self -> {pre} = 1 if $tag eq 'pre';
X $self -> {last_tag} = $tag;
X $self -> s_push (["START", $tag, $attributes]);
X}
X
X
X# Copy end tags, except for those being filtered out.
Xsub end ($$) {
X my $self = shift;
X my $tag = shift;
X
X # If we are skipping an entire element, and we encounter a
X # closing tag for that element, we decrement the nesting counter.
X # If it becomes 0, we stop skipping.
X if ($tag eq $self -> {skip_element}) {
X return if -- $self -> {skip_element_count} > 0;
X $self -> {skip} = 0;
X $self -> {skip_element} = '';
X return;
X }
X
X return if $self -> {skip};
X
X foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) {
X return if $sub && $sub -> ($tag, {}, $self, 1);
X }
X
X $self -> {pre} = 0 if $tag eq 'pre';
X $self -> {last_tag} = 'NIL';
X $self -> s_push (["END", $tag]);
X}
X
X# Strip out any &nbsp; hacks.
Xsub text ($$$) {
X my $self = shift;
X my $text = shift;
X my $comment = shift || 0;
X
X $text =~ s/(?:&nbsp; ?|&#160; ?)(?:&nbsp; ?|&#160; ?|\s)+/ /g
X unless $self -> {pre} || $comment;
X
X if ($self -> {skip}) {
X # We are in skipping mode.
X # Hence check if it matches the end of skipping mode.
X if (defined $self -> {end_skip} &&
X $text =~ /$self->{end_skip}/ &&
X (!defined $self -> {skip_type} ||
X ($self -> {skip_type} == $COMM_ONLY && $comment) ||
X ($self -> {skip_type} == $TEXT_ONLY && !$comment))) {
X $self -> {skip} = 0;
X $self -> {end_skip} = undef;
X }
X return;
X }
X
X foreach my $skip_text (@{$self -> {text_info}}) {
X if (ref $skip_text) {
X if (defined $skip_text -> [2]) {
X next if $skip_text -> [2] == $COMM_ONLY && !$comment;
X next if $skip_text -> [2] == $TEXT_ONLY && $comment;
X }
X # It is an array with a begin and end match.
X if ($text =~ /$skip_text->[0]/) {
X # Match the first element.
X # Enter skip mode, remember second element.
X $self -> {skip} = 1;
X $self -> {end_skip} = $skip_text -> [1];
X $self -> {skip_type} = $skip_text -> [2];
X return;
X }
X }
X # Else it's a string. Skip this text if it matches.
X # No point in skipping a comment.
X else {return if !$comment && $text =~ $skip_text;}
X }
X
X # Do textual replacements.
X foreach my $replacement (@{$self -> {replace}}) {
X my ($pattern, $repl) = @$replacement;
X $text =~ s/$pattern/$repl/g;
X }
X
X $self -> {last_tag} = 'NIL' if $text =~ /\S/ && !$comment;
X $self -> s_push ([$comment ? "COMMENT" : "TEXT", $text]);
X}
X
X# Comments are just copied.
X# Hmm, 'man HTML::Parser' is unclear. Comments *are* part of a document
X# declaration. And you can have multiple comments in a document declaration.
X# Hope this will work....
Xsub comment ($$) {
X my $self = shift;
X my $comment = shift;
X $self -> text ($comment, 1);
X}
X
X
X# Return 1 if we shouldn't include the image.
Xsub no_image ($$) {
X my ($self, $src) = @_;
X
X foreach my $s (@{$self -> {src_matches}}) {return 1 if $src =~ /$s/;}
X return;
X}
X
X# Check if we prefer the alt text.
Xsub img_to_alt ($$) {
X my ($self, $attributes) = @_;
X
X my $src = $attributes -> {src};
X foreach my $s (@{$self -> {alts}}) {
X if ($src =~ /$s/) {return $attributes -> {alt} || $INLINE;}
X }
X return;
X}
X
X
X
X# Stack functions.
Xsub s_init () {my $self = shift; $self -> {stack} = [];}
Xsub s_push () {my $self = shift; push @{$self -> {stack}}, shift;}
Xsub s_pop () {my $self = shift; pop @{$self -> {stack}};}
Xsub s_top () {my $self = shift; $self -> {stack} -> [-1];}
X
X
Xsub build_tag ($$$) {
X my $self = shift;
X my ($tag, $attributes) = @_;
X my $line = "<$tag";
X foreach my $attr (keys %$attributes) {
X # We lost the info whether it was single or double quote delimited.
X # So we take the "safe" approach and use double quote delimited,
X # but first we need to eliminate the existing double quotes.
X $attributes -> {$attr} =~ s/"/&quot;/g;
X $line .= qq ( $attr = "$attributes->{$attr}");
X }
X $line .= ">";
X}
X
Xsub build_page () {
X my $self = shift;
X my $text = "";
X
X # Some pages miss </table> tags. This is usually due to different
X # error recovery strategies of browsers and HTML::Parser.
X # Count them, and insert any missing </table> tags before the end
X # of the document.
X my $tables = 0;
X my $body = 0;
X
X foreach my $part (@{$self -> {stack}}) {
X my ($type, $tag, $attr) = @$part;
X foreach ($type) {
X /TEXT/ && do {$text .= $tag; last;};
X /COMMENT/ && do {$text .= "<!--$tag-->"; last;};
X /DECL/ && do {$text .= "<!$tag>"; last;};
X /END/ && do {$tables -- if $tag eq 'table';
X do {$text .= "</table>" x $tables;
X $body = 1;} if !$body && ($tag eq 'body' ||
X $tag eq 'html');
X $text .= "</$tag>";
X last;};
X /START/ && do {$text .= $self -> build_tag ($tag, $attr);
X $tables ++ if $tag eq 'table';
X last;};
X die "Unknown type: $type\n";
X }
X }
X
X $text .= "</table>" x $tables unless $body;
X
X $text;
X}
X
X1;
SHAR_EOF
: || $echo 'restore of' 'abiprox/Proxy/HTML.pm' 'failed'
fi
# ============= abiprox/Proxy/Config.pm ==============
if test -f 'abiprox/Proxy/Config.pm' && test "$first_param" != -c; then
$echo 'x -' SKIPPING 'abiprox/Proxy/Config.pm' '(file already exists)'
else
$echo 'x -' extracting 'abiprox/Proxy/Config.pm' '(text)'
sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/Config.pm' &&
Xpackage main;
X
X#
X# $Id: Config.pm,v 1.5 1998/02/01 15:24:34 abigail Exp abigail $
X#
X# $Log: Config.pm,v $
X# Revision 1.5 1998/02/01 15:24:34 abigail
X# More data.
X#
X# Revision 1.4 1997/11/23 05:50:37 abigail
X# Change structure of no_attributes and no_tags datastructures.
X# They are now arrays of tags or tag/attribute pairs, so we can
X# have site specific *and* general configuration.
X# This gets rid of the map inside map inside map expression.
X#
X# Revision 1.3 1997/11/03 17:28:48 abigail
X# Many changes, including support for not showing images based on the
X# src attribute, showing an image's alt attribute, and deleting entire
X# elements.
X#
X# Revision 1.2 1997/10/20 04:44:34 abigail
X# All HTML stuff is now site configurable.
X#
X# Revision 1.1 1997/10/15 00:25:58 abigail
X# Initial revision
X#
X#
X#
X# Configuration for proxy.
X
Xuse strict;
X
Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/;
X # Headers we do not want in
X # outgoing requests.
X@NO_REQ_HEADERS = qw /User-Agent From Referer Cookie/;
X@NO_RES_HEADERS = qw /Cache-control Set-Cookie Refresh Etag/;
X # Headers we do not want in
X # responses.
X
Xpackage Proxy::HTML;
X
X
Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $TEXT $REPLACE $NO_ELEMENTS
X $ALT $INLINE $NETSCAPE_COMMENT/;
X
X
X# This is done in a slightly tricky way, to cope with the fact that
X# this file gets re-required over time.
X# This function takes an attribute name and a list of regexes as
X# arguments, and returns a closure that returns true if the value
X# of the attribute matches one of the regexes.
Xmy $generic_filter = sub {
X my ($attribute, @filters) = @_;
X local $" = ", ";
X sub {
X my ($tag, $attributes, $self, $end) = @_;
X my $attr = $attributes -> {$attribute} or return;
X foreach (@filters) {return 1 if $attr =~ /$_/;}
X return;
X }
X};
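X# A hypothetical illustration of the closure above (not part of the
X# original file):
X#
X#     my $f = $generic_filter -> ('href', '/ads/', 'doubleclick');
X#     $f -> ('a', {href => 'http://example.com/ads/banner.html'});  # true
X#     $f -> ('a', {href => 'http://example.com/index.html'});       # false
X#
X# The returned closures are what the $NO_TAGS and $NO_ELEMENTS tables
X# below hand to the parser; each gets the tag name, the attribute
X# hash and the parser object (plus an end flag for the tag filters).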
X
X# Easy function to quickly make long regexes.
Xmy $image = sub {
X my ($type, $slash, @images) = @_;
X local $" = "|";
X my $regex = $slash ? '/' : "";
X $regex = "$regex(?:@images)";
X $regex .= "\\.$type" if $type;
X $regex;
X};
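X# For example (illustration only, not part of the original file):
X#
X#     $image -> ('gif', 1, qw {but hddict logot})
X#
X# yields the string '/(?:but|hddict|logot)\.gif', i.e. a slash, an
X# alternation of the image names, and the extension.  With an empty
X# $type the trailing "\.ext" part is simply left off.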
X
X
X# Regexes that match certain sites.
Xmy $ALL = '.?'; # Regex that always matches.
Xmy $CNN = '//[-\w.]*\b(?:cnn(?:si|fn)?|allpolitics)\.com/';
Xmy $HELEN = '//[-\w.]*\bpeterzale\.com/'; # Helen strip.
Xmy $COMICS = '//[-\w.]*\bunitedmedia\.com/'; # Dilbert/Snoopy
Xmy $NEWS = '//[-\w.]*\bnews\.com/';
Xmy $ALTAVISTA = '//[-\w.]*\bdigital\.com/';
Xmy $INFOSEEK = '//[-\w.]*\binfoseek\.com/';
Xmy $YAHOO = '//[-\w.]*\byahoo\.com/';
Xmy $LYCOS = '//[-\w.]*\blycos\.com/';
Xmy $PERL = '//[-\w.]*\bperl\.com/';
Xmy $PLANET = '//[-\w.]*\bpi\.net/'; # Planet Internet.
Xmy $DEJANEWS = '//[-\w.]*\bdejanews\.com/';
Xmy $KRUD = '//[-\w.]*\bkrud\.com/'; # Krud Radio cartoon.
Xmy $FORBES = '//[-\w.]*\bforbes\.com/';
Xmy $VOLKSKRANT = '//[-\w.]*\bvolkskrant\.nl/';
Xmy $NRC = '//[-\w.]*\bnrc\.nl/';
Xmy $NYTIMES = '//[-\w.]*\bnytimes\.com/';
Xmy $MOVIE = '//[-\w.]*\bimdb\.com/'; # Internet Movie Database.
Xmy $NORTHERN = '//[-\w.]*\bnorthernlight\.com/'; # Northern Light.
Xmy $HOTBOT = '//[-\w.]*\bhotbot\.com/'; # Hotbot
Xmy $MERRIAM = '//[-\w.]*\bm-w\.com/'; # Online webster.
Xmy $REDMEAT = '//[-\w.]*\bredmeat\.com/'; # Strips.
Xmy $UKTIMES = '//[-\w.]*\bthe-times\.co\.uk/'; # UK Times.
Xmy $MERC = '//[-\w.]*\bsjmercury\.com/'; # SJ Mercury News
Xmy $LATIMES = '//[-\w.]*\blatimes\.com/'; # Los Angeles Times
Xmy $LFTIMES = '//[-\w.]*\bft\.com/'; # London Financial Times
Xmy $USATODAY = '//[-\w.]*\busatoday\.com/';
Xmy $ZDNET = '//[-\w.]*\bzdnet\.com/';
X
X# Remove logos, nav. bars and ads from commonly visited sites.
X
X# Regexes for well-known ad/counter sites.
X# Used to block both the A element and the IMG element.
Xmy @AD_SITES_COM = qw /focalink digiweb ad.preferences preferences nrsite linkexchange
X aaddzz/;
Xmy @AD_SITES_NET = qw /cerf doubleclick/;
Xmy @AD_SITES_MISC = qw /www\.image\.dk/;
Xmy @COUNTER_COM = qw /hitbox pagecount icount www\.fxweb\.holowww
X aaddzz siteflow net-user/;
Xmy @COUNTER_NET = qw /net-trak\.stats/;
Xmy @COUNTER_MISC = qw /www\.image\.dk www\.nedstat\.nl/;
Xmy ($AD_SITES_REGEX, $COUNTER_SITES_REGEX);
X{local $" = '|';
X $AD_SITES_REGEX = "//[\\w.-]*\\b(?:(?:(?:@AD_SITES_COM)\\.com)|" .
X "(?:(?:@AD_SITES_NET)\\.net)|" .
X "(?:@AD_SITES_MISC))";
X $COUNTER_SITES_REGEX = "//[\\w.-]*\\b(?:(?:(?:@COUNTER_COM)\\.com)|" .
X "(?:(?:@COUNTER_NET)\\.net)|" .
X "(?:@COUNTER_MISC))";
X}
X
X# Regex for well known counter programs.
Xmy $COUNTERS = join '|', qw /usercounter viewstat/;
X
X$IMG = [
X [$ALL => [$AD_SITES_REGEX,
X $COUNTER_SITES_REGEX,
X "/cgi-bin/(?:$COUNTERS)",
X qw { \bads?\b
X \badv(?:erts?)?\b
X /p(?:oint)?cast/
X /Count\d*\.cgi
X /netscape\.gif
X }]],
X
X [$HELEN => [$image -> ('gif', 0, qw {awrd hel he ar gwe sp dou dg}),
X ]],
X
X [$MERRIAM => [$image -> ('gif', 1, qw {but hddict logot})]],
X
X [$COMICS => [$image -> ('gif', 1, qw {
X invis hr4 header headline \w*interbar flowers
X um_bug bnn yoyo\d* dil_(?:partner|store)_icon
X ms_(?:black|top(?:l)?|list_right|icon_store)
X ms_(?:snoopy|editoons|store|copy|about(?:um|cz))
X ms_(?:read|dilbert|bottom|comiclist|mustsee)
X link inter (?:dnrc|lotd|ddmw|duh)_icon spacer
X email_endnavi 256c-100 dz_logo_sm ad slogan
X (?:(?:ms|d[zd]|exodus)_|IE4|com)logo
X dz_(?:anigiftease|archive|agenda|strip_[br])
X dz_(?:lart|link)_(?:lotd|ddmw|dnrc|duh|store)
X ab_(?:logo[TBC]|topline|todays) bottomnav_map
X ab_links_(?:dilbert|snoopy|editoons|store)
X c_zone official collect artafs wood wood2
X new_cool strip store window middle linus kids
X charlie lucy snoopy sally gall schroder today
X epn_map_(?:t|b_books) grown new_feet
X spacer_black backto a_month bottom
X linksmal sh_books reading black 3dtheater
X (?:sunday|arch|comic|email|more)navi(?:RED)?
X youngdil}),
X $image -> ('jpg', 1, qw {Backpack}),
X $image -> ('', 1, qw {storeImages/}),
X ]],
X
X [$VOLKSKRANT
X => [$image -> ('gif', 1, qw {
X postbank rabo demon i_voorpag_kop msie40 bruna
X abnamro i_indruk_kop indruk_tab
X v_\w+ media2 dossiers2 tab_\w+ empty_\d+x\d+
X i_\w+_1_136w sport\d
X bovenrnd onderrand_1997 lijn})]],
X
X [$NRC => [$image -> ('gif', 0, qw {
X luxklein(?:sup|n) hekk splash mast 1x1})
X ]],
X
X [$CNN => [$image -> ('gif', 1, qw {
X video\.bug cnn_logo cnn_interactive nav[\w\.-]+
X spacer explore\.anim(?:\.\d+\.\d+)?
X pathnet\.warner airmedia insight white
X circle\.(?:white|red) arrow\.blue
X custom_feature pointcast ie hr fringe_us logo qa
X bullet\d* y\.matters today_cnn dot to_top2 blue
X [\w-.]+banner search\.feed plus_message cities
X back_to_top(?:\.space)? sound\.icon sci_tech
X what_you_think\.space vx_bar\+txt weather bbs
X b_OPTIONS b_SUBMIT story_promo nb_CUSTOM
X nb_CNNIN nb_CUSTOM_PROFILE b_TOP plus_logo
X search\.bn\.recommends book\.search bnoble
X bn\.recommend\.box\.(?:left|top|right) main_us
X nb_ALL_POLITICS nb_CNN(?:fn|si) news search
X related(?:stories|sites) stormcenter_link
X nb_CNN_INTERACTIVE mn_Lifestyle mn_Sci-Tech
X sep_BANNERDOTS dot_GREY nb_LOWER_STEP dline
X pumpkin\.bullet\.icon jol\.top buttons
X explore\.[\w.]+
X message\.boards qt_icon logo_[lr] mn_Showbiz}),
X $image -> ('jpg', 1, qw {
X [\w-.]+banner news bug related(?:stories|sites)
X watch\.gutter yir\.special today_cnn
X }),
X $image -> ('gif', 0, qw {nav_no\.cursor logo}),
X $image -> ('jpg', 0, qw {lotd vidpik}),
X $image -> ('', 0, qw {[Ii]nfoseek pathfinder}),
X ]],
X
X [$YAHOO => [$image -> ('', 0, qw {/adv/}),
X $image -> ('gif', 1, qw {new2})
X ]],
X
X [$NEWS => [$image -> ('gif', 1, qw {
X newscom item_\w+ cnet home}),
X $image -> ('', 1, qw {Banners/ bump masthead Frontdoor}),
X ]],
X
X [$ALTAVISTA => [$image -> ('gif', 1, qw {
X av_(?:logo|map) pixel searchpx digital-logo
X preview}),
X ]],
X
X [$INFOSEEK => [$image -> ('gif', 1, qw {
X home_bar_ups big_yellow_red ups_track select
X smart_search_results_ups greenbal}),
X $image -> ('gif', 0, qw {circlei}),
X ]],
X
X [$LYCOS => [$image -> ('gif', 1, qw {
X new-logo place-holder navbar servicebar homerule
X lycos-logo games-icon Lilarrow-(?:vspace|red)
X poweredbyww sweepicn lilarrow scificon
X threearrows logobar}),
X ]],
X
X [$PERL => [$image -> ('gif', 1, qw {
X ora_logo perl_id_313c header-nav camel}),
X $image -> ('jpg', 1, qw {perl_lang_color}),
X ]],
X
X [$PLANET => [$image -> ('gif', 1, qw {pilogo klgnetbn 1 now}),
X $image -> ('', 0, qw {/adv/ /pics/div/}),
X ]],
X
X [$DEJANEWS => [$image -> ('gif', 1, qw {
X browsers nav_\w* head home2 fingers [rl]tri 1x1
X qm tgb text thr prev(?:_x)? curr new redline
X auth
X iworld_f97 next(?:_x)? [rl]arr(?:_x)? post_a}),
X $image -> ('', 1, qw {dnlogo_ frownie_}),
X ]],
X
X [$KRUD => [$image -> ('gif', 1, qw {rewind tip krudrad})]],
X
X [$FORBES => [$image -> ('gif', 1, qw {
X adot comments
X theme_ban netvlu14d logo-otl deliver netvlu})
X ]],
X
X [$NYTIMES => [$image -> ('gif', 1, qw {maintoolbar cyberex 1bancyber})]],
X
X [$MOVIE => [$image -> ('gif', 1, qw {click\d+ accentdesign center})]],
X
X [$NORTHERN => [$image -> ('gif', 1, qw {
X logo(?:_corner2|-tall) (?:home|support)gray
X side\d sson 00000 searchfor[12] line transparent
X searchresults templogo foldery special
X www dynamicfolders bullet_right feature})]],
X
X [$HOTBOT => [$image -> ('gif', 1, qw {misc\.transPixel})]],
X
X [$UKTIMES => [$image -> ('', 1, qw {_ecb_timnws_ valentine})]],
X
X
X];
X
X# As $IMG, but for the cases where we prefer the alt text.
X$ALT = [
X [$VOLKSKRANT
X => [$image -> ('', 1, qw {images/balk})]],
X
X [$DEJANEWS => [$image -> ('gif', 1, qw {gifs/..})]],
X
X [$COMICS => [$image -> ('gif', 1, qw {ab_next})]],
X
X [$NORTHERN => [$image -> ('gif', 1, qw {
X return2resultssq newsearch purchasedoc moneyback
X spec-info return2results newsearchround search
X home
X aboutsq accounts support disclaim help_better-z
X nextten prev10 searchhelp startover})]],
X];
X
X# Need this later.
Xmy $delete = sub {1;};
Xmy $absolute_width = sub {shift !~ /%$/;};
Xmy $no_multiple = sub {my ($tag, $attr, $self, $end) = @_;
X $tag eq $self -> {last_tag}};
X
X
X
Xmy @CNN_ADS = qw { \bredirect\b \bclick\b \bevent\.ng\b \bclick\.ng\b SHOWBIZAdSpace /ads/ /CNNProm /adinfo/
X /www\.reservationdesk\.com};
Xmy $CNN_ADS = join '|', @CNN_ADS;
X
X
X# $NO_TAGS will remove tags. Configurable by site.
X$NO_TAGS = [
X [$ALL => [[br => $no_multiple],
X [hr => $no_multiple],
X [meta => $generic_filter -> ('http-equiv', '.')],
X [base => $generic_filter -> ('target', '_blank')],
X [img => $generic_filter -> ('src' => '\b(?:ads?|adv(?:erts?)?)\b',
X '/p(?:oint)?cast/',
X '\bclick(thr)?',
X '\bredirects?\b',
X '\bad(?:vert(?:isement)?)?s?\b',
X '(?:link|banner|counter)\.cgi',
X '/promo(?:tions?)?/',
X '(?:event|click?)\.ng',
X $COUNTER_SITES_REGEX,
X $AD_SITES_REGEX)],
X map {[$_, $delete]} qw /font basefont blink applet
X param embed/]
X ],
X
X [$CNN => [[hr => sub {my ($tag, $attr, $self, $end) = @_;
X $self -> {last_tag} eq 'a';}]]],
X
X # [$LYCOS => [[frame => $generic_filter -> ('src', '/wguide/tools/')]]],
X];
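X# Illustration (not part of the original file): under the $ALL rules
X# above, every <font>, <basefont>, <blink>, <applet>, <param> and
X# <embed> tag is dropped unconditionally, and a <br> or <hr> whose
X# immediately preceding tag was the same tag is dropped as well.
X# That is what reduces the <BR><BR>-instead-of-<P> habit to a single
X# <br>.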
X
X
X# $NO_ELEMENTS will remove entire *elements*. Configurable by site.
X# Very handy to remove ad banners.
X$NO_ELEMENTS = [
X
X [$ALL => [
X [a => $generic_filter -> ('href', '\b(?:ads?|adv(?:erts?)?)\b',
X '/p(?:oint)?cast/',
X '\bclick(thr)?',
X '\bredirects?\b',
X '\bredir\b',
X '\bad(?:vert(?:isement)?)?s?\b',
X '(?:link|banner|counter)\.cgi',
X '/promo(?:tions?)?/',
X '(?:event|click?)\.ng',
X $COUNTER_SITES_REGEX,
X $AD_SITES_REGEX)],
X
X ]],
X
X [$LFTIMES => [[ a => $generic_filter -> (href => 'pft/jump')]]],
X
X [$LATIMES => [[ a => $generic_filter -> (href => 'event.ng')]]],
X [$USATODAY => [[ a => $generic_filter -> (href => 'sponsors')]]],
X
X [$ALL => [[a => $generic_filter -> ('href', 'http://\S*infi.net')]]],
X
X [$CNN => [[a => $generic_filter -> ('href', $CNN_ADS,
X 'video\.html')],
X [form => $generic_filter -> ('action', 'servercast\.net')]]],
X
X [$PERL => [[a => $generic_filter -> ('href', 'http://opensource\.oreilly\.com/news/\w+_\d+\.html')]]],
X
X [$NYTIMES => [[form => $generic_filter -> ('action', '/ads/')]]],
X
X [$NRC => [[a => $generic_filter -> ('href', '/Advertenties/')]]],
X
X [$YAHOO => [[a => $generic_filter -> (href => '/[*?]http:/')]]],
X
X [$LYCOS => [[a => $generic_filter -> ('href', 'adclick')],
X [form => $generic_filter -> ('action', 'nph-adclick.exe')]]],
X
X [$MOVIE => [[a => $generic_filter -> ('href', '/Ads/')]]],
X
X [$DEJANEWS => [[a => $generic_filter -> ('href', 'gtplacer')]]],
X
X [$HOTBOT => [[a => $generic_filter -> ('href', 'clickthru\.html')]]],
X
X [$COMICS => [[a => $generic_filter -> ('href', '1800flowers',
X 'netscape\.com',
X 'comiczone\.cdf',
X 'microsoft\.com')],
X ['select' => $generic_filter -> ('onchange', 'window\.open')],
X [form => $generic_filter -> ('onsubmit', 'window\.open')]]],
X
X [$REDMEAT => [[a => $generic_filter -> ('href', 'riddler.com')]]],
X
X [$UKTIMES => [[a => $generic_filter -> ('href', 'valentines',
X 'infotimes', '\bstanding\b', '\.map\+\d')]]],
X
X [$ZDNET => [[form => $generic_filter -> ('action', '/adverts/')]]],
X
X];
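X# Illustration (not part of the original file): when one of the
X# closures above returns true for a start tag, the parser enters skip
X# mode for that element, so the tag, everything nested inside it, and
X# the matching end tag all disappear.  A hypothetical
X#
X#     <a href="http://ad.doubleclick.net/jump/foo">Buy now!</a>
X#
X# is caught by the $ALL rule (via $AD_SITES_REGEX), so neither the
X# link nor its text survives.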
X
X
X
X# Initialize the NO_ATTR datastructure.
X# The form is: tag -> attribute -> coderef, per site.
X# If the attribute for that tag exists, the coderef is called with
X# the attribute value as parameter. If it returns true, it's deleted.
X$NO_ATTR = [
X [$ALL => [[img => [[border => $delete]]],
X [input => [[border => $delete]]],
X [a => [[target => sub {shift =~ /new$/i;}]]],
X [frameset => [[frameborder => $delete],
X [border => $delete],
X [framespacing => $delete]]],
X [frame => [[noresize => $delete],
X [marginwidth => $delete],
X [marginheight => $delete],
X [frameborder => $delete],
X [scrolling => $delete]]],
X [hr => [[noshade => $delete],
X [width => $delete]]],
X [table => [[width => $absolute_width],
X [height => $absolute_width],
X [align => $delete],
X [cellspacing => $delete],
X [cellpadding => $delete],
X [nowrap => $delete],
X [bgcolor => $delete]]],
X [td => [[width => $absolute_width],
X [height => $delete],
X [nowrap => $delete],
X [bgcolor => $delete]]],
X [th => [[width => $absolute_width],
X [height => $delete],
X [nowrap => $delete],
X [bgcolor => $delete]]],
X ['tr' => [[width => $absolute_width],
X [nowrap => $delete],
X [bgcolor => $delete]]],
X [body => [[background => $delete],
X [bgcolor => $delete],
X ['text' => $delete],
X ['link' => $delete],
X [vlink => $delete],
X [alink => $delete]]]],
X ],
X
X [$NORTHERN => [[td => [[nowrap => $delete]]]]],
X];
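X# Illustration (not part of the original file): with the $ALL rules
X# above, a hypothetical <td width="50%" bgcolor="#ffffff" nowrap>
X# keeps its width (relative widths survive $absolute_width) but loses
X# bgcolor and nowrap, while <td width="600"> loses the width as well,
X# because $absolute_width returns true (delete) for anything not
X# ending in '%'.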
X
X
X
X# This strips out text, or parts of the document.
X# Each site has a list of strings or 2 string elements.
X# If a text or comment matches the string, it's ignored.
X# For string pairs, whenever a text or comment matches the first string,
X# the parser goes into 'ignore' mode, until the second string matches
X# a comment or text. *After* which it goes back to normal parsing.
Xmy $COMM_ONLY = 1;
Xmy $TEXT_ONLY = 2;
X$TEXT = [
X [$CNN => ['Advertising information for CNN Web',
X 'Pages will open in a new browser',
X ['Search and Promos', 'Search and Promos', $COMM_ONLY],
X ['CNN/SI Bullets', 'CNN/SI Bullets', $COMM_ONLY],
X ['Community Bullets', 'Community Bullets', $COMM_ONLY],
X ['Search Table', 'Search Table', $COMM_ONLY],
X ['Nagano Bullets', 'Nagano Bullets', $COMM_ONLY],
X ['Jobs', 'Jobs', $COMM_ONLY],
X ]],
X [$MERC => [['NetG AD', 'ENDAD', $COMM_ONLY]]],
X
X [$LYCOS => [['^\s*AD\b', '(?:\bEND |/)AD\b', $COMM_ONLY]]],
X];
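X# Illustration (not part of the original file): the $MERC pair means
X# that once a comment matching /NetG AD/ is seen, everything (tags,
X# text and comments) is discarded until a comment matching /ENDAD/
X# switches normal parsing back on.  The plain strings in the $CNN
X# entry just drop the single text fragment that matches them.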
X
X
X# Replace a text with something else.
X# We have 2 element arrays of strings. First is regex, second a replacement.
X# No $1 and friends available. For now that is.
X$REPLACE = [
X # There is some real bad software out there.
X [$ALL => [['(?i)<font' => '<bogus'],
X ['(?i)</font' => '</bogus']
X ]],
X
X [$NORTHERN => [['(?i)<meta' => '<bogus']]],
X
X [$MERRIAM => [['(?i)<img' => '<bogus']]],
X
X];
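X# Illustration (not part of the original file): these substitutions
X# run on text fragments, not on parsed tags, so they only bite when
X# the markup is broken enough that HTML::Parser hands '<font ...'
X# through as text.  Rewriting it to '<bogus ...' makes the browser
X# ignore it as an unknown tag instead of honouring a stray font
X# change.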
X
X# How to deal with netscape comments.
X$NETSCAPE_COMMENT = 1;
X
X# Text we like to have for inlines without alt.
X$INLINE = '[INLINE]';
X
X1;
SHAR_EOF
: || $echo 'restore of' 'abiprox/Proxy/Config.pm' 'failed'
fi
# ============= abiprox/proxy.pl ==============
if test -f 'abiprox/proxy.pl' && test "$first_param" != -c; then
$echo 'x -' SKIPPING 'abiprox/proxy.pl' '(file already exists)'
else
$echo 'x -' extracting 'abiprox/proxy.pl' '(text)'
sed 's/^X//' << 'SHAR_EOF' > 'abiprox/proxy.pl' &&
X#!/usr/bin/perl -w
X
X#
X# $Id: proxy.pl,v 1.5 1998/02/01 15:25:55 abigail Exp abigail $
X#
X# $Log: proxy.pl,v $
X# Revision 1.5 1998/02/01 15:25:55 abigail
X# Error handling and REAPER changes as suggested by Tom C.
X#
X# Revision 1.4 1997/11/03 17:28:03 abigail
X# Minor changes.
X#
X# Revision 1.3 1997/10/15 00:20:26 abigail
X# Added on the fly configuration.
X#
X# Revision 1.2 1997/08/24 05:14:58 abigail
X# Added $IGNORE feature.
X#
X# Revision 1.1 1997/08/23 09:35:25 abigail
X# Initial revision
X#
X#
X#
X# A proxy server that strips out nasty HTTP headers, and nasty HTML tags.
X# The proxy part is inspired by Randal's WebTechniques column 11.
X# [http://www.stonehenge.com/merlyn/WebTechniques/col11.html].
X
Xuse strict;
Xuse Carp;
X
X#use lib qw (/nfs1/home/abigail/Perl);
X### FIXME
Xuse lib qw (/home/tchrist/nap);
X
Xuse LWP::UserAgent;
Xuse HTTP::Daemon;
Xuse Proxy::HTML;
Xuse POSIX qw /:sys_wait_h/;
X
X
X# Subclass LWP::UserAgent to have the browser know about redirects.
X
X@UserAgent::ISA = qw /LWP::UserAgent/;
Xsub UserAgent::redirect_ok {0;} # No, it is *not* ok to redirect.
X
X# And back to the main stuff.
X
Xsub handle_connection ($);
Xsub fetch ($);
Xsub illegal (%);
X
Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/;
X
X# This is just to satisfy -T, we don't call other programs.
X$ENV {PATH} = join ':', qw (/usr/bin); # /bin -> /usr/bin on Solaris.
X
X# On the fly configuration file.
X# Hardcoded file name - not very nice.
X### FIXME
Xmy $Config = "/home/tchrist/nap/Proxy/Config.pm";
X
X# Configuration.
X
X### FIXME
Xmy $ME = "wherever.youare.com"; # The machine I run on.
Xmy $LISTEN_PORT = shift || 8888; # That's the port I'll listen to.
Xdie "Invalid port" if $LISTEN_PORT =~ /\D/ || $LISTEN_PORT < 1024;
X($LISTEN_PORT) = $LISTEN_PORT =~ m/(.*)/; # Untaint.
X### FIXME
Xmy $PROXY = "notherplace.someproxy.com"; # The machine the proxy runs on.
Xmy $PROXY_PORT = 8181; # The proxy's port.
Xmy @PROXY_SCHEMES = qw /http ftp/; # Proxy accepts this.
Xundef $PROXY; # We don't need the other proxy anymore.
X
Xmy $FORBIDDEN = 403; # HTTP Response code.
Xmy $FORBIDDEN_TEXT = 'Forbidden'; # Text of the above.
X
X # How I want to be named.
X # RCS will fill in the version id.
Xmy $NAME = q <HTML stripper $Revision: 1.5 $>;
X
Xmy $last_required = 0;
X
X# Initialize the daemon.
Xmy $daemon = new HTTP::Daemon LocalAddr => $ME, LocalPort => $LISTEN_PORT,
X Reuse => 1
X or croak "HTTP Proxy failed to initialize: $!";
X
X# Initialize agent.
Xmy $agent = new UserAgent;
X $agent -> agent ($NAME);
Xif (defined $PROXY) {
X $agent -> proxy (\@PROXY_SCHEMES, "http://$PROXY:$PROXY_PORT/");
X}
X $agent -> env_proxy (); # Set up whatever else you want.
X
X $agent -> parse_head (0); # Man page says: Do not turn this off,
X # unless you know what you
X # are doing.
X # I know damn well what I'm doing, and one
X # of the reasons to start this whole project
X # are those annoying elements in the head.
X # I'd rather have software without a mind of
X # its own. Furthermore, read the frigging
X # spec, 'http-equiv' is there for *servers*,
X # not user agents. Damnit.
X # At least server authors always had the
X # sense to ignore those elements.
X
X# Zombies are bad.
Xsub REAPER {
X my $child;
X while (($child = waitpid (-1, WNOHANG)) && $child != -1) { }
X $SIG{CHLD} = \&REAPER; # still loathe sysV
X}
X$SIG {CHLD} = \&REAPER;
X
Xwarn "Proxy on $ME ($LISTEN_PORT) initialized\n";
X
X# And loop forever.
Xwhile (my $slave = $daemon -> accept) {
X # Reconfigure if the config file changed.
X my $mod = (stat $Config) [9] || 0;
X if ($mod > $last_required) {
X delete $INC {$Config};
X require $Config;
X $last_required = $mod;
X }
X handle_connection $slave;
X}
X
X# That's all folks.
Xexit 0;
X
X
X# This handles the connection. Fork, and let the child deal with
X# the stuff. Child exits after dealing with the stuff.
Xsub handle_connection ($) {
X my $connection = shift;
X
X my $pid = fork;
X if ($pid) {
X close $connection; # Can I do $connection -> close; ? If no, why not?
X return;
X }
X
X
X my $request = $connection -> get_request;
X if (defined $request) {
X $0 = "tbtw " . $request->url->as_string();
X ## print "$0\n";
X my $response = fetch $request;
X # Error handling added by Tom Christiansen, 19971118.
X unless ($response -> is_error) {
X $connection -> send_response ($response);
X }
X else {
X my $trailer = "<br>---------------</br><em>$NAME</em>";
X my $content = $response -> error_as_HTML;
X $content =~ s!(?=</body>)!$trailer!i;
X $connection -> send_error ($response -> code, $content);
X }
X close $connection;
X }
X exit 0 if defined $pid;
X}
X
X# Check if a request is valid. If it is, strip info from the client,
X# fetch the stuff, strip info from the server, and return the result.
X# For illegal stuff, return an error message.
Xsub fetch ($) {
X my $request = shift;
X
X # Check if valid.
X if ($request -> url -> scheme ne 'http') {
X return illegal url => $request -> url, reason => "bad scheme";
X }
X
X my $response = eval {$agent -> request ($request -> strip) -> strip;};
X print scalar localtime, "\n$@\n\n" if $@;
X
X # print scalar localtime, "\n", $response -> as_string, "\n\n";
X
X $response; # Will generate a 'document contains no data' if there's
X # an error. Needs a fix.
X}
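X# Illustration (not part of the original file): the chained call above
X# unrolls to roughly
X#
X#     $request -> strip;                    # drop User-Agent, Cookie, ...
X#     my $response = $agent -> request ($request);
X#     $response -> strip;                   # drop Set-Cookie, ..., filter HTML
X#
X# using the two strip methods defined at the bottom of this file.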
X
X
X# Create a forbidden response.
Xsub illegal (%) {
X my %args = @_;
X my $response = new HTTP::Response $FORBIDDEN, $FORBIDDEN_TEXT;
X
X # There's only one illegal action for now....
X foreach ($args {reason}) {
X /^bad scheme/ && do {
X $response -> content ("Illegal scheme: $args{url}");
X last;
X };
X # Any...
X $response -> content ("Illegal action.");
X }
X
X $response;
X}
X
X
X# This subroutine strips out HTTP headers we do not want to send.
X# It's just an extra function added to HTTP::Request.
Xsub HTTP::Request::strip ($) {
X my $self = shift;
X $self -> remove_header (@NO_REQ_HEADERS);
X $self; # Return self for extra coolness.
X}
X
X
X# Strip out HTTP response headers we do not want to send to the client.
X# And if the returned type is of text/html, filter it.
X# We put this function into HTTP::Response.
Xsub HTTP::Response::strip ($) {
X my $self = shift;
X
X $self -> remove_header (@NO_RES_HEADERS);
X
X if ($self -> content_type eq 'text/html') {
X my $html = new Proxy::HTML;
X # Replace content with the parsed text after parsing the content.
X $self -> content ($html -> parse ($self) -> parsed_text);
X }
X
X $self; # Return self for extra coolness.
X}
X
SHAR_EOF
: || $echo 'restore of' 'abiprox/proxy.pl' 'failed'
fi
rm -fr _sh22768
exit 0
--
I don't know. I don't care. And it doesn't make any difference.