[LWN Logo]

From: Tom Christiansen <tchrist@mox.perl.com>
Subject: Seizing the Bull by the Horns
Date: 11 May 1999 11:01:46 -0700

I suppose that you thought the posting I'm following up to was bereft of
Perl content.  Not so!  It was just to whet your appetites.  Here's a
program written by Abigail (who may in fact have a newer version) that
filters out much of the bull described in the parent of this article.
It's a personal proxy server.  You'll have to go into proxy.pl and
edit all the places that say FIXME to reflect your local situation.
You might have a look into the config file, too.

--tom

PS: Idiots who use <BR><BR> to mean <P> end up looking worse, but that's
    their own dumb fault.

#!/bin/sh
# This is a shell archive (produced by GNU sharutils 4.2).
# To extract the files from this archive, save it to some FILE, remove
# everything before the `!/bin/sh' line above, then type `sh FILE'.
#
# Made on 1999-05-11 10:54 MDT by <tchrist@jhereg.perl.com>.
# Source directory was `/tmp'.
#
# Existing files will *not* be overwritten unless `-c' is specified.
# This format requires very little intelligence at unshar time.
# "if test", "echo", "mkdir", and "sed" may be needed.
#
# This shar contains:
# length mode       name
# ------ ---------- ------------------------------------------
#  12073 -rw-r--r-- abiprox/Proxy/HTML.pm
#  21497 -rw-r--r-- abiprox/Proxy/Config.pm
#   7286 -rwx------ abiprox/proxy.pl
#
echo=echo
if mkdir _sh22768; then
  $echo 'x -' 'creating lock directory'
else
  $echo 'failed to create lock directory'
  exit 1
fi
# ============= abiprox/Proxy/HTML.pm ==============
if test ! -d 'abiprox'; then
  $echo 'x -' 'creating directory' 'abiprox'
  mkdir 'abiprox'
fi
if test ! -d 'abiprox/Proxy'; then
  $echo 'x -' 'creating directory' 'abiprox/Proxy'
  mkdir 'abiprox/Proxy'
fi
if test -f 'abiprox/Proxy/HTML.pm' && test "$first_param" != -c; then
  $echo 'x -' SKIPPING 'abiprox/Proxy/HTML.pm' '(file already exists)'
else
  $echo 'x -' extracting 'abiprox/Proxy/HTML.pm' '(text)'
  sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/HTML.pm' &&
Xpackage Proxy::HTML;   # I guess this is a bad name....
X
X#
X# $Id: HTML.pm,v 1.7 1998/02/01 15:23:36 abigail Exp abigail $
X#
X# $Log: HTML.pm,v $
X# Revision 1.7  1998/02/01 15:23:36  abigail
X# Added </table> recovery strategy.
X#
X# Revision 1.6  1997/11/23 05:50:37  abigail
X# Change structure of no_attributes and no_tags datastructures.
X# They are now arrays of tags or tag/attribute pairs, so we can
X# have side specific *and* general configuration.
X# This gets rid of the map inside map inside map expression.
X#
X# Revision 1.5  1997/11/03 17:29:43  abigail
X# Many changes, including support for not showing images based on the
X# scr attribute, showing an images alt attribute, and deleting entire
X# elements.
X#
X# Revision 1.4  1997/10/20 04:45:02  abigail
X# All HTML stuff is now site configurable.
X# Matching for site is done in new(), not for all tags.
X# Building the page is delayed till the end, keeping all tags and
X# text fragments in a stack. This allows for going back and deleting
X# stuff. (Yet to be done).
X#
X# Revision 1.3  1997/10/15 00:24:11  abigail
X# On the fly configuration.
X# Host dependent image selection.
X#
X# Revision 1.2  1997/08/24 05:15:11  abigail
X# Configurable ignoring tags/attributes. Either unconditionally,
X# or by looking at the attribute value.
X# Rewriting <img> tags if images are not wanted.
X#
X# Revision 1.1  1997/08/23 09:32:05  abigail
X# Initial revision
X#
X#
X#
X# Strip nasty tags/attributes from HTML text.
X
Xuse strict;
Xuse Carp;
X
Xuse Data::Dumper;
X
Xuse vars qw /@ISA/;
Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $NO_TAG_BY_ATTR $TEXT $REPLACE
X             $ALT $INLINE $NO_ELEMENTS $NETSCAPE_COMMENT/;
X
Xmy $COMM_ONLY = 1;
Xmy $TEXT_ONLY = 2;
X
Xuse HTML::Parser;
X
X@ISA = qw /HTML::Parser/;
X
Xsub deal_with_img ($$);
X
X
X# We mask new to be able to get the inline status.
Xsub new ($$) {
X    my $class = shift;
X    my $self  = new HTML::Parser;
X    bless $self, $class;
X}
X
X
X# To do interesting things, we keep track of the parsed and modified
X# part of html text in an instance variable "mod_text". It is set to ''
X# by masking parse ().
X
Xsub parse ($@) {
X    my $self                =   shift;
X    $self -> {response}     =   shift;
X    my $host =
X        $self -> {host}     =  $self -> {response} -> base -> host;
X    my $url  =
X        $self -> {url}      =  $self -> {response} -> base -> as_string;
X    $self -> {content}      =  $self -> {response} -> content;
X    $self -> {last_tag}     =  'NIL';
X
X    # Configuration is site specific. Do all the host matching
X    # stuff here.
X
X    # Match the URL with all entries in $IMG, remember which
X    # ones matched.
X    $self -> {src_matches}  =  [map {@{$_ -> [1]}}
X                                     grep {$url =~ /$_->[0]/;} @$IMG];
X
X    # Match the URL with all entries in $ALT, remember which
X    # ones matched.
X    $self -> {alts}         =  [map {@{$_ -> [1]}}
X                                     grep {$url =~ /$_->[0]/;} @$ALT];
X
X    # The coderefs for the no attributes.
X    foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ATTR) {
X        foreach my $elem (@{$site -> [1]}) {
X            foreach my $attr (@{$elem -> [1]}) {
X                push @{$self -> {no_attr} -> {$elem -> [0]} -> {$attr -> [0]}},
X                       $attr -> [1];
X            }
X        }
X    }
X
X    # The no elements structure.
X    foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ELEMENTS) {
X        foreach my $tag (@{$site -> [1]}) {
X            push @{$self -> {skips} -> {$tag -> [0]}}, $tag -> [1];
X        }
X    }
X
X    # The no tags structures.
X    foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_TAGS) {
X        foreach my $tag (@{$site -> [1]}) {
X            push @{$self -> {no_tags} -> {$tag -> [0]}}, $tag -> [1];
X        }
X    }
X
X    # The information for the text/comment filters.
X    $self -> {text_info} = [map {@{$_ -> [1]};}
X                            grep {$url =~ /$_->[0]/;} @$TEXT];
X
X    # The information for text replacement.
X    $self -> {replace}   = [map {@{$_ -> [1]};}
X                            grep {$url =~ /$_->[0]/;} @$REPLACE];
X
X    $self -> netscape_buggy_comment ($NETSCAPE_COMMENT);  # Silly netscape.
X
X    # Flag to indicate whether white space collapses or not.
X    $self -> {pre}  = 0;
X
X    # Flag to indicate whether we are in 'skip' mode or not.
X    $self -> {skip}      = 0;
X    $self -> {end_skip}  = undef;
X    $self -> {skip_type} = undef;
X
X    $self -> {skip_element}       = '';
X    $self -> {skip_element_count} = 0;
X
X    # Initialize the stack.
X    $self -> s_init;
X
X    $self -> SUPER::parse ($self -> {content});
X}
X
X# Return the modified text.
Xsub parsed_text ($) {
X    my $self = shift;
X    $self -> build_page;
X}
X
X# Now we mask the various "call back" functions.
X
X# Declarations are just copied. (But what about comments?)
Xsub declaration ($$) {
X    my $self = shift;
X    my $decl = shift;
X    return if $self -> {skip};
X    $self -> s_push (["DECL", $decl]);
X}
X
X# This is the interesting function.
Xsub start ($$$$$) {
X    my $self = shift;
X    return if $self -> {skip};
X
X    my ($tag, $attributes, $attrseq, $origtext) = @_;
X
X    # If the tag is the same as the element we are currently
X    # skipping, we need to increment to nesting counter.
X    if ($tag eq $self -> {skip_element}) {
X        $self -> {skip_element_count} ++;
X        return;
X    }
X
X    # my $skip_sub = $self -> {skips} -> {$tag};
X    foreach my $skip_sub (@{$self -> {skips} -> {$tag} || []}) {
X        if ($skip_sub && $skip_sub -> ($tag, $attributes, $self)) {
X            # Go into element skipping mode.
X            $self -> {skip} = 1;
X            $self -> {skip_element} = $tag;
X            $self -> {skip_element_count} = 1;
X            return;
X        }
X    }
X
X    foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) {
X        return if $sub && $sub -> ($tag, $attributes, $self, 0);
X    }
X
X    # Image is special.
X    if ($tag eq 'img') {
X        return if $self -> no_image ($attributes -> {src});
X        my $alt = $self -> img_to_alt ($attributes);
X        if ($alt) {
X            $self -> {last_tag} = 'NIL';
X            $self -> s_push (["TEXT", $alt]);
X            return;
X        }
X    }
X
X    my $no_attr = $self -> {no_attr} -> {$tag};
X    if ($no_attr) {
X        foreach my $attribute (keys %$attributes) {
X            foreach my $sub (@{$no_attr -> {$attribute} || []}) {
X                delete $attributes -> {$attribute} if
X                   $sub -> ($attributes -> {$attribute}, $self);
X            }
X        }
X    }
X
X    $self -> {pre}       =  1 if $tag eq 'pre';
X    $self -> {last_tag}  = $tag;
X    $self -> s_push (["START", $tag, $attributes]);
X}
X
X
X# Copy end tags, except for those being filtered out.
Xsub end ($$) {
X    my $self = shift;
X    my $tag  = shift;
X
X    # If we are skipping an entire element, and we encounter a
X    # closing tag for that element, we decrement the nesting counter.
X    # If it becomes 0, we stop skipping.
X    if ($tag eq $self -> {skip_element}) {
X        return if -- $self -> {skip_element_count} > 0;
X        $self -> {skip} = 0;
X        $self -> {skip_element} = '';
X        return;
X    }
X
X    return if $self -> {skip};
X
X    foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) {
X        return if $sub && $sub -> ($tag, {}, $self, 1);
X    }
X
X    $self -> {pre}       =  0 if $tag eq 'pre';
X    $self -> {last_tag}  = 'NIL';
X    $self -> s_push (["END", $tag]);
X}
X
X# Strip out any &nbsp; hacks.
Xsub text ($$$) {
X    my $self    = shift;
X    my $text    = shift;
X    my $comment = shift || 0;
X
X    $text =~ s/(?:&nbsp;?|&#160;?)(?:&nbsp;?|&#160;?|\s)+/ /g
X                  unless $self -> {pre} || $comment;
X
X    if ($self -> {skip}) {
X        # We are in skipping mode.
X        # Hence check if it matches the end of skipping mode.
X        if (defined $self -> {end_skip}  &&
X            $text =~ /$self->{end_skip}/ &&
X            (!defined $self -> {skip_type} ||
X                     ($self -> {skip_type} == $COMM_ONLY &&  $comment) ||
X                     ($self -> {skip_type} == $TEXT_ONLY && !$comment))) {
X            $self -> {skip}     = 0;
X            $self -> {end_skip} = undef;
X        }
X        return;
X    }
X
X    foreach my $skip_text (@{$self -> {text_info}}) {
X        if (ref $skip_text) {
X            if (defined $skip_text -> [2]) {
X                next if $skip_text -> [2] == $COMM_ONLY && !$comment;
X                next if $skip_text -> [2] == $TEXT_ONLY &&  $comment;
X            }
X            # It is an array with a begin and end match.
X            if ($text =~ /$skip_text->[0]/) {
X                # Match the first element.
X                # Enter skip mode, remember second element.
X                $self -> {skip}      = 1;
X                $self -> {end_skip}  = $skip_text -> [1];
X                $self -> {skip_type} = $skip_text -> [2];
X                return;
X            }
X        }
X        # Else it's a string. Skip this text if it matches.
X        # No point in skipping a comment.
X        else {return if !$comment && $text =~ $skip_text;}
X    }
X
X    # Do textual replacements.
X    foreach my $replacement (@{$self -> {replace}}) {
X        my ($pattern, $repl) = @$replacement;
X        $text =~ s/$pattern/$repl/g;
X    }
X
X    $self -> {last_tag}  = 'NIL' if $text =~ /\S/ && !$comment;
X    $self -> s_push ([$comment ? "COMMENT" : "TEXT", $text]);
X}
X
X# Comments are just copied.
X# Hmm, 'man HTML::Parser' is unclear. Comments *are* part of a document
X# declaration. And you can have multiple comments in a document declarion.
X# Hope this will work....
Xsub comment ($$) {
X    my $self    = shift;
X    my $comment = shift;
X    $self -> text ($comment, 1);
X}
X
X
X# Return 1 if we shouldn't include the image.
Xsub no_image ($$) {
X    my ($self, $src) = @_;
X
X    foreach my $s (@{$self -> {src_matches}}) {return 1 if $src =~ /$s/;}
X    return;
X}
X
X# Check if we prefer the alt text.
Xsub img_to_alt ($$) {
X    my ($self, $attributes) = @_;
X
X    my $src = $attributes -> {src};
X    foreach my $s (@{$self -> {alts}}) {
X        if ($src =~ /$s/) {return $attributes -> {alt} || $INLINE;}
X    }
X    return;
X}
X
X
X
X# Stack functions.
Xsub s_init () {my $self = shift; $self -> {stack} = [];}
Xsub s_push () {my $self = shift; push @{$self -> {stack}}, shift;}
Xsub s_pop  () {my $self = shift; pop  @{$self -> {stack}};}
Xsub s_top  () {my $self = shift; $self -> {stack} -> [-1];}
X
X
Xsub build_tag ($$$) {
X    my $self = shift;
X    my ($tag, $attributes) = @_;
X    my $line = "<$tag";
X    foreach my $attr (keys %$attributes) {
X        # We lost the info whether it was single or double quote delimited.
X        # So we take the "safe" approach and use double quote delimited,
X        # but first we need to eliminate the existing double quotes.
X        $attributes -> {$attr} =~ s/"/&quot;/g;
X        $line .= qq ( $attr = "$attributes->{$attr}");
X    }
X    $line .= ">";
X}
X
Xsub build_page () {
X    my $self   = shift;
X    my $text   = "";
X
X    # Some pages miss </table> tags. This is usually due to different
X    # error recovery strategies of browsers and HTML::Parser.
X    # Count them, and insert any missing </tables> before the end
X    # of the document.
X    my $tables = 0;
X    my $body   = 0;
X
X    foreach my $part (@{$self -> {stack}}) {
X        my ($type, $tag, $attr) = @$part;
X        foreach ($type) {
X            /TEXT/    &&  do {$text .=      $tag;     last;};
X            /COMMENT/ &&  do {$text .= "<!--$tag-->"; last;};
X            /DECL/    &&  do {$text .=   "<!$tag>";   last;};
X            /END/     &&  do {$tables -- if $tag eq 'table';
X                              do {$text .= "</table>" x $tables;
X                                  $body  = 1;} if !$body && ($tag eq 'body' ||
X                                                             $tag eq 'html');
X                              $text .=   "</$tag>";
X                              last;};
X            /START/   &&  do {$text .= $self -> build_tag ($tag, $attr);
X                              $tables ++ if $tag eq 'table';
X                              last;};
X            die "Unknown type: $type\n";
X        }
X    }
X
X    $text .= "</table>" x $tables unless $body;
X
X    $text;
X}
X
X1;
SHAR_EOF
  : || $echo 'restore of' 'abiprox/Proxy/HTML.pm' 'failed'
fi
# ============= abiprox/Proxy/Config.pm ==============
if test -f 'abiprox/Proxy/Config.pm' && test "$first_param" != -c; then
  $echo 'x -' SKIPPING 'abiprox/Proxy/Config.pm' '(file already exists)'
else
  $echo 'x -' extracting 'abiprox/Proxy/Config.pm' '(text)'
  sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/Config.pm' &&
Xpackage main;
X
X#
X# $Id: Config.pm,v 1.5 1998/02/01 15:24:34 abigail Exp abigail $
X#
X# $Log: Config.pm,v $
X# Revision 1.5  1998/02/01 15:24:34  abigail
X# More data.
X#
X# Revision 1.4  1997/11/23 05:50:37  abigail
X# Change structure of no_attributes and no_tags datastructures.
X# They are now arrays of tags or tag/attribute pairs, so we can
X# have side specific *and* general configuration.
X# This gets rid of the map inside map inside map expression.
X#
X# Revision 1.3  1997/11/03 17:28:48  abigail
X# Many changes, including support for not showing images based on the
X# scr attribute, showing an images alt attribute, and deleting entire
X# elements.
X#
X# Revision 1.2  1997/10/20 04:44:34  abigail
X# All HTML stuff is now site configurable.
X#
X# Revision 1.1  1997/10/15 00:25:58  abigail
X# Initial revision
X#
X#
X#
X# Configuration for proxy.
X
Xuse strict;
X
Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/;
X                                             # Headers we do not want in
X                                             # outgoing requests.
X@NO_REQ_HEADERS =  qw /User-Agent From Referer Cookie/;
X@NO_RES_HEADERS =  qw /Cache-control Set-Cookie Refresh Etag/;
X                                             # Headers we do not want in
X                                             # responses.
X
Xpackage Proxy::HTML;
X
X
Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $TEXT $REPLACE $NO_ELEMENTS
X             $ALT $INLINE $NETSCAPE_COMMENT/;
X
X
X# This is done a bit tricky, but that's to be able to cope with the
X# fact this file gets re-required over time.
X# This function takes an attribute name and a list of regexes as
X# argument, and returns a closure that returns true if the value
X# of the attribute matches one of the regexs.
Xmy $generic_filter = sub {
X    my ($attribute, @filters) = @_;
X    local $" = ", ";
X    sub {
X        my ($tag, $attributes, $self, $end) = @_;
X        my $attr = $attributes -> {$attribute} or return;
X        foreach (@filters) {return 1 if $attr =~ /$_/;}
X        return;
X    }
X};
X
X# Easy function to quickly make long regexs.
Xmy $image = sub {
X    my ($type, $slash, @images) = @_;
X    local $"  = "|";
X    my $regex = $slash ? '/' : "";
X    $regex    = "$regex(?:@images)";
X    $regex   .= "\\.$type" if $type;
X    $regex;
X};
X
X
X# Regexes that match certain sites.
Xmy $ALL          = '.?';                            # Regex that always matches.
Xmy $CNN          = '//[-\w.]*\b(?:cnn(?:si|fn)?|allpolitics)\.com/';
Xmy $HELEN        = '//[-\w.]*\bpeterzale\.com/';    # Helen strip.
Xmy $COMICS       = '//[-\w.]*\bunitedmedia\.com/';  # Dilbert/Snoopy
Xmy $NEWS         = '//[-\w.]*\bnews\.com/';
Xmy $ALTAVISTA    = '//[-\w.]*\bdigital\.com/';
Xmy $INFOSEEK     = '//[-\w.]*\binfoseek\.com/';
Xmy $YAHOO        = '//[-\w.]*\byahoo\.com/';
Xmy $LYCOS        = '//[-\w.]*\blycos\.com/';
Xmy $PERL         = '//[-\w.]*\bperl\.com/';
Xmy $PLANET       = '//[-\w.]*\bpi\.net/';           # Planet Internet.
Xmy $DEJANEWS     = '//[-\w.]*\bdejanews\.com/';
Xmy $KRUD         = '//[-\w.]*\bkrud\.com/';         # Krud Radio cartoon.
Xmy $FORBES       = '//[-\w.]*\bforbes\.com/';
Xmy $VOLKSKRANT   = '//[-\w.]*\bvolkskrant\.nl/';
Xmy $NRC          = '//[-\w.]*\bnrc\.nl/';
Xmy $NYTIMES      = '//[-\w.]*\bnytimes\.com/';
Xmy $MOVIE        = '//[-\w.]*\bimdb\.com/';         # Internet Movie Database.
Xmy $NORTHERN     = '//[-\w.]*\bnorthernlight\.com/'; # Northern Light.
Xmy $HOTBOT       = '//[-\w.]*\bhotbot\.com/';       # Hotbot
Xmy $MERRIAM      = '//[-\w.]*\bm-w\.com/';          # Online webster.
Xmy $REDMEAT      = '//[-\w.]*\bredmeat\.com/';      # Strips.
Xmy $UKTIMES      = '//[-\w.]*\bthe-times\.co\.uk/'; # UK Times.
Xmy $MERC         = '//[-\w.]*\bsjmercury\.com/';     # SJ Mercury News
Xmy $LATIMES      = '//[-\w.]*\blatimes\.com/';     # Los Angeles Times
Xmy $LFTIMES      = '//[-\w.]*\bft\.com/';     # London Financial Times
Xmy $USATODAY     = '//[-\w.]*\busatoday\.com/';     
Xmy $ZDNET        = '//[-\w.]*\bzdnet\.com/';     
X
X# Remove logos, nav. bars and ads from commonly visited sites.
X
X# Regexes for well know add/counter sites.
X# Used to block both A element, and the IMG element.
Xmy @AD_SITES_COM      = qw /focalink digiweb ad.preferences preferences nrsite linkexchange
X                            aaddzz/;
Xmy @AD_SITES_NET      = qw /cerf doubleclick/;
Xmy @AD_SITES_MISC     = qw /www\.image\.dk/;
Xmy @COUNTER_COM       = qw /hitbox pagecount icount www\.fxweb\.holowww
X                            aaddzz siteflow net-user/;
Xmy @COUNTER_NET       = qw /net-trak\.stats/;
Xmy @COUNTER_MISC      = qw /www\.image\.dk www\.nedstat\.nl/;
Xmy ($AD_SITES_REGEX, $COUNTER_SITES_REGEX);
X{local $" = '|';
X $AD_SITES_REGEX      = "//[\\w.-]*\\b(?:(?:(?:@AD_SITES_COM)\\.com)|" .
X                                        "(?:(?:@AD_SITES_NET)\\.net)|" .
X                                        "(?:@AD_SITES_MISC))";
X $COUNTER_SITES_REGEX = "//[\\w.-]*\\b(?:(?:(?:@COUNTER_COM)\\.com)|" .
X                                        "(?:(?:@COUNTER_NET)\\.net)|" .
X                                        "(?:@COUNTER_MISC))";
X}
X
X# Regex for well known counter programs.
Xmy $COUNTERS          = join '|', qw /usercounter viewstat/;
X
X$IMG = [
X    [$ALL      =>  [$AD_SITES_REGEX,
X                    $COUNTER_SITES_REGEX,
X                    "/cgi-bin/(?:$COUNTERS)",
X                    qw { \bads?\b
X			\badv(?:erts?)?/b
X			/p(?:oint)?cast/
X                        /Count\d*\.cgi
X                        /netscape\.gif
X    }]],
X
X    [$HELEN    =>  [$image -> ('gif', 0, qw {awrd hel he ar gwe sp dou dg}),
X    ]],
X
X    [$MERRIAM  =>  [$image -> ('gif', 1, qw {but hddict logot})]],
X
X    [$COMICS   =>  [$image -> ('gif', 1, qw {
X                                invis hr4 header headline \w*interbar flowers
X                                um_bug bnn yoyo\d* dil_(?:partner|store)_icon
X                                ms_(?:black|top(?:l)?|list_right|icon_store)
X                                ms_(?:snoopy|editoons|store|copy|about(?:um|cz))
X                                ms_(?:read|dilbert|bottom|comiclist|mustsee)
X                                link inter (?:dnrc|lotd|ddmw|duh)_icon spacer
X                                email_endnavi 256c-100 dz_logo_sm ad slogan
X                                (?:(?:ms|d[zd]|exodus)_|IE4|com)logo
X                                dz_(?:anigiftease|archive|agenda|strip_[br])
X                                dz_(?:lart|link)_(?:lotd|ddmw|dnrc|duh|store)
X                                ab_(?:logo[TBC]|topline|todays) bottomnav_map
X                                ab_links_(?:dilbert|snoopy|editoons|store)
X                                c_zone official collect artafs wood wood2
X                                new_cool strip store window middle linus kids
X                                charlie lucy snoopy sally gall schroder today
X                                epn_map_(?:t|b_books) grown new_feet
X                                spacer_black backto a_month bottom
X                                linksmal sh_books reading black 3dtheater
X                                (?:sunday|arch|comic|email|more)navi(?:RED)?
X                                youngdil}),
X                    $image -> ('jpg', 1, qw {Backpack}),
X                    $image -> ('',    1, qw {storeImages/}),
X    ]],
X
X    [$VOLKSKRANT
X               =>  [$image -> ('gif', 1, qw {
X                                postbank rabo demon i_voorpag_kop msie40 bruna
X                                abnamro i_indruk_kop indruk_tab
X                                v_\w+ media2 dossiers2 tab_\w+ empty_\d+x\d+
X                                i_\w+_1_136w sport\d
X                                bovenrnd onderrand_1997 lijn})]],
X
X    [$NRC      =>  [$image -> ('gif', 0, qw {
X                                luxklein(?:sup|n) hekk splash mast 1x1})
X    ]],
X
X    [$CNN      =>  [$image -> ('gif', 1, qw {
X                                video\.bug cnn_logo cnn_interactive nav[\w\.-]+
X                                spacer explore\.anim(?:\.\d+\.\d+)?
X                                pathnet\.warner airmedia insight white 
X                                circle\.(?:white|red) arrow\.blue
X                                custom_feature pointcast ie hr fringe_us logo qa
X                                bullet\d* y\.matters today_cnn dot to_top2 blue
X                                [\w-.]+banner search\.feed plus_message cities
X                                back_to_top(?:\.space)? sound\.icon sci_tech
X                                what_you_think\.space vx_bar\+txt weather bbs
X                                b_OPTIONS b_SUBMIT story_promo nb_CUSTOM
X                                nb_CNNIN nb_CUSTOM_PROFILE b_TOP plus_logo
X                                search\.bn\.recommends book\.search bnoble
X                                bn\.recommend\.box\.(?:left|top|right) main_us
X                                nb_ALL_POLITICS nb_CNN(?:fn|si) news search
X                                related(?:stories|sites) stormcenter_link
X                                nb_CNN_INTERACTIVE mn_Lifestyle mn_Sci-Tech
X                                sep_BANNERDOTS dot_GREY nb_LOWER_STEP dline
X                                pumpkin\.bullet\.icon jol\.top buttons
X                                explore\.[\w.]+
X                                message\.boards qt_icon logo_[lr] mn_Showbiz}),
X                    $image -> ('jpg', 1, qw {
X                                [\w-.]+banner news bug related(?:stories|sites)
X                                watch\.gutter yir\.special today_cnn
X                                }),
X                    $image -> ('gif', 0, qw {nav_no\.cursor logo}),
X                    $image -> ('jpg', 0, qw {lotd vidpik}),
X                    $image -> ('',    0, qw {[Ii]nfoseek pathfinder}),
X    ]],
X
X    [$YAHOO    =>  [$image -> ('',    0, qw {/adv/}),
X                    $image -> ('gif', 1, qw {new2})
X    ]],
X
X    [$NEWS     =>  [$image -> ('gif', 1, qw {
X                                newscom item_\w+ cnet home}),
X                    $image -> ('',    1, qw {Banners/ bump masthead Frontdoor}),
X    ]],
X
X    [$ALTAVISTA => [$image -> ('gif', 1, qw {
X                                av_(?:logo|map) pixel searchpx digital-logo
X                                preview}),
X    ]],
X
X    [$INFOSEEK =>  [$image -> ('gif', 1, qw {
X                                home_bar_ups big_yellow_red ups_track select
X                                smart_search_results_ups greenbal}),
X                    $image -> ('gif', 0, qw {circlei}),
X    ]],
X
X    [$LYCOS    =>  [$image -> ('gif', 1, qw {
X                                new-logo place-holder navbar servicebar homerule
X                                lycos-logo games-icon Lilarrow-(?:vspace|red)
X                                poweredbyww sweepicn lilarrow scificon
X                                threearrows logobar}),
X    ]],
X
X    [$PERL     =>  [$image -> ('gif', 1, qw {
X                                ora_logo perl_id_313c header-nav camel}),
X                    $image -> ('jpg', 1, qw {perl_lang_color}),
X    ]],
X
X    [$PLANET   =>  [$image -> ('gif', 1, qw {pilogo klgnetbn 1 now}),
X                    $image -> ('',    0, qw {/adv/ /pics/div/}),
X    ]],
X
X    [$DEJANEWS =>  [$image -> ('gif', 1, qw {
X                                browsers nav_\w* head home2 fingers [rl]tri 1x1
X                                qm tgb text thr prev(?:_x)? curr new redline
X                                auth
X                                iworld_f97 next(?:_x)? [rl]arr(?:_x)? post_a}),
X                    $image -> ('',    1, qw {dnlogo_ frownie_}),
X    ]],
X
X    [$KRUD     =>  [$image -> ('gif', 1, qw {rewind tip krudrad})]],
X
X    [$FORBES   =>  [$image -> ('gif', 1, qw {
X                                adot comments
X                                theme_ban netvlu14d logo-otl deliver netvlu})
X    ]],
X
X    [$NYTIMES  =>  [$image -> ('gif', 1, qw {maintoolbar cyberex 1bancyber})]],
X
X    [$MOVIE    =>  [$image -> ('gif', 1, qw {click\d+ accentdesign center})]],
X
X    [$NORTHERN =>  [$image -> ('gif', 1, qw {
X                                logo(?:_corner2|-tall) (?:home|support)gray
X                                side\d sson 00000 searchfor[12] line transparent
X                                searchresults templogo foldery special
X                                www dynamicfolders bullet_right feature})]],
X
X    [$HOTBOT   =>  [$image -> ('gif', 1, qw {misc\.transPixel})]],
X
X    [$UKTIMES  =>  [$image -> ('', 1, qw {_ecb_timnws_ valentine})]],
X
X
X];
X
X# As $IMG, but then for the cases we prefer the alt text.
X$ALT = [
X    [$VOLKSKRANT
X               =>  [$image -> ('',    1, qw {images/balk})]],
X    
X    [$DEJANEWS =>  [$image -> ('gif', 1, qw {gifs/..})]],
X
X    [$COMICS   =>  [$image -> ('gif', 1, qw {ab_next})]],
X
X    [$NORTHERN =>  [$image -> ('gif', 1, qw {
X                                return2resultssq newsearch purchasedoc moneyback
X                                spec-info return2results newsearchround search
X                                home
X                                aboutsq accounts support disclaim help_better-z
X                                nextten prev10 searchhelp startover})]],
X];
X
X# Need this later.
Xmy $delete         = sub {1;};
Xmy $absolute_width = sub {shift !~ /%$/;};
Xmy $no_multiple    = sub {my ($tag, $attr, $self, $end) = @_;
X                          $tag eq $self -> {last_tag}};
X
X
X
Xmy @CNN_ADS = qw { \bredirect\b \bclick\b \bevent\.ng\b \bclick\.ng\b SHOWBIZAdSpace /ads/ /CNNProm /adinfo/
X                  /www\.reservationdesk\.com};
Xmy $CNN_ADS = join '|', @CNN_ADS;
X
X
X# $NO_TAGS will remove tags. Configurable by site.
X$NO_TAGS = [
X    [$ALL      => [[br      =>  $no_multiple],
X                   [hr      =>  $no_multiple],
X                   [meta    =>  $generic_filter -> ('http-equiv', '.')],
X                   [base    =>  $generic_filter -> ('target',     '_blank')],
X		   [img     => $generic_filter -> ('src' => '\b(?:ads?|adv(?:erts?)?)\b',
X							'/p(?:oint)?cast/',
X							'\bclick(thr)?',
X							'\bredirects?\b',
X							'\bad(?:vert(?:isement)?)?s?\b',
X							'(?:link|banner|counter)\.cgi',
X							'/promo(?:tions?)?/',
X							'(?:event|click?)\.ng',
X							$COUNTER_SITES_REGEX,
X                                                        $AD_SITES_REGEX)],
X                    map {[$_, $delete]} qw /font basefont blink applet
X                                            param embed/]
X	],
X
X    [$CNN      => [[hr      =>  sub {my ($tag, $attr, $self, $end) = @_;
X                                     $self -> {last_tag} eq 'a';}]]],
X
X  # [$LYCOS    => [[frame   =>  $generic_filter -> ('src', '/wguide/tools/')]]],
X];
X
X
X# $NO_ELEMENTS will remove entire *elements*. Configurable by site.
X# Very handy to remove ad banners.
X$NO_ELEMENTS = [
X
X    [$ALL      => [
X		    [a    => $generic_filter -> ('href', '\b(?:ads?|adv(?:erts?)?)\b',
X							'/p(?:oint)?cast/',
X							'\bclick(thr)?',
X							'\bredirects?\b',
X							'\bredir\b',
X							'\bad(?:vert(?:isement)?)?s?\b',
X							'(?:link|banner|counter)\.cgi',
X							'/promo(?:tions?)?/',
X							'(?:event|click?)\.ng',
X							$COUNTER_SITES_REGEX,
X                                                        $AD_SITES_REGEX)],
X
X		    ]],
X
X    [$LFTIMES      => [[ a => $generic_filter -> (href => 'pft/jump')]]],
X
X    [$LATIMES      => [[ a => $generic_filter -> (href => 'event.ng')]]],
X    [$USATODAY      =>  [[ a => $generic_filter -> (href => 'sponsors')]]],
X
X    [$ALL => [[a => $generic_filter -> ('href', 'http://\S*infi.net')]]],
X
X    [$CNN      => [[a    => $generic_filter -> ('href', $CNN_ADS, 
X                                                        'video\.html')],
X                   [form => $generic_filter -> ('action', 'servercast\.net')]]],
X
X    [$PERL     => [[a    => $generic_filter -> ('href', 'http://opensource\.oreilly\.com/news/\w+_\d+\.html')]]],
X
X    [$NYTIMES  =>  [[form => $generic_filter -> ('action', '/ads/')]]],
X
X    [$NRC      => [[a    => $generic_filter -> ('href', '/Advertenties/')]]],
X
X    [$YAHOO    => [[a    => $generic_filter  -> (href  => '/[*?]http:/')]]],
X
X    [$LYCOS    => [[a    => $generic_filter -> ('href', 'adclick')],
X                   [form => $generic_filter -> ('action', 'nph-adclick.exe')]]],
X
X    [$MOVIE    => [[a    => $generic_filter -> ('href', '/Ads/')]]],
X
X    [$DEJANEWS => [[a    => $generic_filter -> ('href', 'gtplacer')]]],
X
X    [$HOTBOT   => [[a    => $generic_filter -> ('href', 'clickthru\.html')]]],
X
X    [$COMICS   => [[a    => $generic_filter -> ('href', '1800flowers',
X                                                        'netscape\.com',
X                                                        'comiczone\.cdf',
X                                                        'microsoft\.com')],
X               ['select' => $generic_filter -> ('onchange', 'window\.open')],
X                   [form => $generic_filter -> ('onsubmit', 'window\.open')]]],
X
X    [$REDMEAT  => [[a    => $generic_filter -> ('href', 'riddler.com')]]],
X
X    [$UKTIMES  => [[a    => $generic_filter -> ('href', 'valentines',
X    'infotimes', '\bstanding\b', '\.map\+\d')]]],
X
X    [$ZDNET  => [[form    => $generic_filter -> ('action', '/adverts/')]]],
X
X];
X
X
X
X# Initialize the NO_ATTR datastructure.
X# The form is: tag -> attribute -> coderef, per site.
X# If the attribute for that tag exists, the coderef is called with
X# the attribute value as parameter. If it returns true, it's deleted.
X$NO_ATTR = [
X    [$ALL      => [[img      =>  [[border       =>  $delete]]],
X                   [input    =>  [[border       =>  $delete]]],
X                   [a        =>  [[target       =>  sub {shift =~ /new$/i;}]]],
X                   [frameset =>  [[frameborder  =>  $delete],
X                                  [border       =>  $delete],
X                                  [framespacing =>  $delete]]],
X                   [frame    =>  [[noresize     =>  $delete],
X                                  [marginwidth  =>  $delete],
X                                  [marginheight =>  $delete],
X                                  [frameborder  =>  $delete],
X                                  [scrolling    =>  $delete]]],
X                   [hr       =>  [[noshade      =>  $delete],
X                                  [width        =>  $delete]]],
X                   [table    =>  [[width        =>  $absolute_width],
X                                  [height       =>  $absolute_width],
X                                  [align        =>  $delete],
X                                  [cellspacing  =>  $delete],
X                                  [cellpadding  =>  $delete],
X                                  [nowrap       =>  $delete],
X                                  [bgcolor      =>  $delete]]],
X                   [td       =>  [[width        =>  $absolute_width],
X                                  [height       =>  $delete],
X                                  [nowrap       =>  $delete],
X                                  [bgcolor      =>  $delete]]],
X                   [th       =>  [[width        =>  $absolute_width],
X                                  [height       =>  $delete],
X                                  [nowrap       =>  $delete],
X                                  [bgcolor      =>  $delete]]],
X                  ['tr'      =>  [[width        =>  $absolute_width],
X                                  [nowrap       =>  $delete],
X                                  [bgcolor      =>  $delete]]],
X                   [body     =>  [[background   =>  $delete],
X                                  [bgcolor      =>  $delete],
X                                 ['text'        =>  $delete],
X                                 ['link'        =>  $delete],
X                                  [vlink        =>  $delete],
X                                  [alink        =>  $delete]]]],
X    ],
X
X    [$NORTHERN => [[td       =>  [[nowrap       =>  $delete]]]]],
X];
X
X
X
X# This strips out text, or parts of the document.
X# Each site has a list of strings or 2 string elements.
X# If a text or comment matches the string, it's ignored.
X# For string pairs, whenever a text or comment matches the first string,
X# the parsers goes into 'ignore' mode, until the second string matches
X# a comment or text. *After* which it goes back to normal parsing.
Xmy $COMM_ONLY = 1;
Xmy $TEXT_ONLY = 2;
X$TEXT = [
X    [$CNN      =>  ['Advertising information for CNN Web', 
X		    'Pages will open in a new browser',
X                    ['Search and Promos', 'Search and Promos', $COMM_ONLY],
X                    ['CNN/SI Bullets', 'CNN/SI Bullets', $COMM_ONLY],
X                    ['Community Bullets', 'Community Bullets', $COMM_ONLY],
X                    ['Search Table', 'Search Table', $COMM_ONLY],
X                    ['Nagano Bullets', 'Nagano Bullets', $COMM_ONLY],
X                    ['Jobs', 'Jobs', $COMM_ONLY],
X    ]],
X    [$MERC     =>   [['NetG AD', 'ENDAD', $COMM_ONLY]]],
X
X    [$LYCOS    => [['^\s*AD\b', '(?:\bEND |/)AD\b', $COMM_ONLY]]],
X];
X
X
X# Replace a text with something else.
X# We have 2 element arrays of strings. First is regex, second a replacement.
X# No $1 and friends available. For now that is.
X$REPLACE = [
X    # There is some real bad software out there.
X    [$ALL      => [['(?i)<font'    =>  '<bogus'],
X                   ['(?i)</font'   =>  '</bogus']
X    ]],
X
X    [$NORTHERN => [['(?i)<meta'    =>  '<bogus']]],
X
X    [$MERRIAM  => [['(?i)<img'     =>  '<bogus']]],
X
X];
X
X# How to deal with netscape comments.
X$NETSCAPE_COMMENT = 1;
X
X# Text we like to have for inlines without alt.
X$INLINE = '[INLINE]';
X
X1;
SHAR_EOF
  : || $echo 'restore of' 'abiprox/Proxy/Config.pm' 'failed'
fi
# ============= abiprox/proxy.pl ==============
if test -f 'abiprox/proxy.pl' && test "$first_param" != -c; then
  $echo 'x -' SKIPPING 'abiprox/proxy.pl' '(file already exists)'
else
  $echo 'x -' extracting 'abiprox/proxy.pl' '(text)'
  sed 's/^X//' << 'SHAR_EOF' > 'abiprox/proxy.pl' &&
X#!/usr/bin/perl -w
X
X#
X# $Id: proxy.pl,v 1.5 1998/02/01 15:25:55 abigail Exp abigail $
X#
X# $Log: proxy.pl,v $
X# Revision 1.5  1998/02/01 15:25:55  abigail
X# Error handling and REAPER changes as suggested by Tom C.
X#
X# Revision 1.4  1997/11/03 17:28:03  abigail
X# Minor changes.
X#
X# Revision 1.3  1997/10/15 00:20:26  abigail
X# Added on the fly configuration.
X#
X# Revision 1.2  1997/08/24 05:14:58  abigail
X# Added $IGNORE feature.
X#
X# Revision 1.1  1997/08/23 09:35:25  abigail
X# Initial revision
X#
X#
X#
X# A proxy server that strips out nasty HTTP headers, and nasty HTML tags.
X# The proxy part is inspired on Randal's Webtechniques column 11.
X# [http://www.stonehenge.com/merlyn/WebTechniques/col11.html].
X
Xuse strict;
Xuse Carp;
X
X#use lib qw (/nfs1/home/abigail/Perl);
X### FIXME
Xuse lib qw (/home/tchrist/nap);
X
Xuse LWP::UserAgent;
Xuse HTTP::Daemon;
Xuse Proxy::HTML;
Xuse POSIX  qw /:sys_wait_h/;
X
X
X# Subclass LWP::UserAgent to have the browser know about redirects.
X
X@UserAgent::ISA = qw /LWP::UserAgent/;
Xsub UserAgent::redirect_ok {0;}   # No, it is *not* ok to redirect.
X
X# And back to the main stuff.
X
Xsub handle_connection ($);
Xsub fetch             ($);
Xsub illegal           (%);
X
Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/;
X
X# This is just to satisfy -T, we don't call other programs.
X$ENV {PATH} = join ':', qw (/usr/bin);   # /bin -> /usr/bin on Solaris.
X
X# On the fly configuration file.
X# Hardcoded file name - not very nice.
X### FIXME
Xmy $Config = "/home/tchrist/nap/Proxy/Config.pm";
X
X# Configuration.
X
X### FIXME
Xmy $ME             =  "wherever.youare.com";           # The machine I run on.
Xmy $LISTEN_PORT    =  shift || 8888;         # That's the port I'll listen to.
Xdie "Invalid port" if $LISTEN_PORT =~ /\D/ || $LISTEN_PORT < 1024;
X($LISTEN_PORT)     =  $LISTEN_PORT =~ m/(.*)/; # Untaint.
X### FIXME
Xmy $PROXY          =  "notherplace.someproxy.com";  # The machine the proxy runs on.
Xmy $PROXY_PORT     =  8181;                  # The proxy's port.
Xmy @PROXY_SCHEMES  =  qw /http ftp/;         # Proxy accepts this.
Xundef $PROXY;      # We don't need the other proxy anymore.
X
Xmy $FORBIDDEN      =  403;                   # HTTP Response code.
Xmy $FORBIDDEN_TEXT =  'Forbidden';           # Text of the above.
X
X                                             # How I want to be named.
X                                             # RCS will fill in the version id.
Xmy $NAME           =  q <HTML stripper $Revision: 1.5 $>;
X
Xmy $last_required  = 0;
X
X# Initialize the daemon.
Xmy $daemon = new HTTP::Daemon LocalAddr => $ME, LocalPort => $LISTEN_PORT,
X                              Reuse     =>  1
X   or croak "HTTP Proxy failed to initialize: $!";
X
X# Initialize agent.
Xmy  $agent  =   new UserAgent;
X    $agent  ->  agent ($NAME);
Xif (defined $PROXY) {
X    $agent  ->  proxy (\@PROXY_SCHEMES, "http://$PROXY:$PROXY_PORT/");
X}
X    $agent  ->  env_proxy ();   # Set up whatever else you want.
X
X    $agent  ->  parse_head (0); # Man page says: Do not turn this off,
X                                #                unless you know what you
X                                #                are doing.
X                                # I know damn well what I'm doing, and one
X                                # of the reasons to start this whole project
X                                # are those annoying elements in the head.
X                                # I rather have software without a mind of
X                                # its own. Furthermore, read the frigging
X                                # spec, 'http-equiv' is there for *servers*,
X                                # not user agents. Damnit.
X                                # At least server authors always had the
X                                # sense to ignore those elements.
X
X# Zombies are bad.
Xsub REAPER {
X    my $child;
X    while (($child = waitpid (-1, WNOHANG)) && $child != -1) { }
X    $SIG{CHLD} = \&REAPER;  # still loathe sysV
X}
X$SIG {CHLD} = \&REAPER;
X
Xwarn "Proxy on $ME ($LISTEN_PORT) initialized\n";
X
X# And loop forever.
Xwhile (my $slave = $daemon -> accept) {
X    # Reconfigure if the config file changed.
X    my $mod = (stat $Config) [9] || 0;
X    if ($mod > $last_required) {
X        delete $INC {$Config};
X        require $Config;
X        $last_required = $mod;
X    }
X    handle_connection $slave;
X}
X
X# That's all folks.
Xexit 0;
X
X
X# This handles the connection. Fork, and let the child deal with
X# the stuff. Child exits after dealing with the stuff.
Xsub handle_connection ($) {
X    my $connection = shift;
X
X    my $pid = fork;
X    if ($pid) {
X        close $connection;  # Can I do $connection -> close; ? If no, why not?
X        return;
X    }
X
X
X    my $request    = $connection -> get_request;
X    if (defined $request) {
X	$0 = "tbtw " . $request->url->as_string();
X	## print "$0\n";
X        my $response = fetch $request;
X        # Error handling added by Tom Christiansen, 19971118.
X        unless ($response -> is_error) {
X            $connection -> send_response ($response);
X        }
X        else {
X            my $trailer =  "<br>---------------</br><em>$NAME</em>";
X            my $content =  $response -> error_as_HTML;
X            $content    =~ s!(?=</body>)!$trailer!i;
X            $connection -> send_error ($response -> code, $content);
X        }
X        close $connection;
X    }
X    exit 0 if defined $pid;
X}
X
X# Check if a request is valid. If it is, strip info from the client,
X# fetch the stuff, strip info from the server, and return the result.
X# For illegal stuff, return an error message.
Xsub fetch ($) {
X    my $request  = shift;
X    
X    # Check if valid.
X    if ($request -> url -> scheme ne 'http') {
X        return illegal url => $request -> url, reason => "bad scheme";
X    }
X
X    my $response = eval {$agent -> request ($request -> strip) -> strip;};
X    print scalar localtime, "\n$@\n\n" if $@;
X
X  # print scalar localtime, "\n", $response -> as_string, "\n\n";
X
X    $response;  # Will generate a 'document contains no data' if there's
X                # an error. Needs a fix.
X}
X
X
X# Create a forbidden response.
Xsub illegal (%) {
X    my %args     = @_;
X    my $response = new HTTP::Response $FORBIDDEN, $FORBIDDEN_TEXT;
X
X    # There's only one illegal action for now....
X    foreach ($args {reason}) {
X        /^bad scheme/   &&   do {
X            $response -> content ("Illegal scheme: $args{url}");
X            last;
X        };
X        # Any...
X        $response -> content ("Illegal action.");
X    }
X
X    $response;
X}
X
X
X# This sub routine strips out HTTP headers we do not want to send.
X# It's just an extra function added to HTTP::Request.
Xsub HTTP::Request::strip ($) {
X    my $self = shift;
X    $self -> remove_header (@NO_REQ_HEADERS);
X    $self;  # Return self for extra coolness.
X}
X
X
X# Strip out HTTP response headers we do not want to send to the client.
X# And if the returned type is of text/html, filter it.
X# We put this function into HTTP::Response.
Xsub HTTP::Response::strip ($) {
X    my $self = shift;
X
X    $self -> remove_header (@NO_RES_HEADERS);
X
X    if ($self -> content_type eq 'text/html') {
X        my $html = new Proxy::HTML;
X        # Replace content with the parsed text after parsing the content.
X        $self -> content ($html -> parse ($self) -> parsed_text);
X    }
X
X    $self;  # Return self for extra coolness.
X}
X
SHAR_EOF
  : || $echo 'restore of' 'abiprox/proxy.pl' 'failed'
fi
rm -fr _sh22768
exit 0
-- 
 I don't know. I don't care. And it doesn't make any difference.