From: Tom Christiansen <tchrist@mox.perl.com> Subject: Seizing the Bull by the Horns Date: 11 May 1999 11:01:46 -0700 I suppose that you thought the posting I'm following up to was bereft of Perl content. Not so! It was just to whet your appetites. Here's a program written by Abigail (who may in fact have a newer version) that filters out much of the bull described in the parent of this article. It's a personal proxy server. You'll have to go into proxy.pl and edit all the places that say FIXME to reflect your local situation. You might have a look into the config file, too. --tom PS: Idiots who use <BR><BR> to mean <P> end up looking worse, but that's their own dumb fault. #!/bin/sh # This is a shell archive (produced by GNU sharutils 4.2). # To extract the files from this archive, save it to some FILE, remove # everything before the `!/bin/sh' line above, then type `sh FILE'. # # Made on 1999-05-11 10:54 MDT by <tchrist@jhereg.perl.com>. # Source directory was `/tmp'. # # Existing files will *not* be overwritten unless `-c' is specified. # This format requires very little intelligence at unshar time. # "if test", "echo", "mkdir", and "sed" may be needed. # # This shar contains: # length mode name # ------ ---------- ------------------------------------------ # 12073 -rw-r--r-- abiprox/Proxy/HTML.pm # 21497 -rw-r--r-- abiprox/Proxy/Config.pm # 7286 -rwx------ abiprox/proxy.pl # echo=echo if mkdir _sh22768; then $echo 'x -' 'creating lock directory' else $echo 'failed to create lock directory' exit 1 fi # ============= abiprox/Proxy/HTML.pm ============== if test ! -d 'abiprox'; then $echo 'x -' 'creating directory' 'abiprox' mkdir 'abiprox' fi if test ! -d 'abiprox/Proxy'; then $echo 'x -' 'creating directory' 'abiprox/Proxy' mkdir 'abiprox/Proxy' fi if test -f 'abiprox/Proxy/HTML.pm' && test "$first_param" != -c; then $echo 'x -' SKIPPING 'abiprox/Proxy/HTML.pm' '(file already exists)' else $echo 'x -' extracting 'abiprox/Proxy/HTML.pm' '(text)' sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/HTML.pm' && Xpackage Proxy::HTML; # I guess this is a bad name.... X X# X# $Id: HTML.pm,v 1.7 1998/02/01 15:23:36 abigail Exp abigail $ X# X# $Log: HTML.pm,v $ X# Revision 1.7 1998/02/01 15:23:36 abigail X# Added </table> recovery strategy. X# X# Revision 1.6 1997/11/23 05:50:37 abigail X# Change structure of no_attributes and no_tags datastructures. X# They are now arrays of tags or tag/attribute pairs, so we can X# have side specific *and* general configuration. X# This gets rid of the map inside map inside map expression. X# X# Revision 1.5 1997/11/03 17:29:43 abigail X# Many changes, including support for not showing images based on the X# scr attribute, showing an images alt attribute, and deleting entire X# elements. X# X# Revision 1.4 1997/10/20 04:45:02 abigail X# All HTML stuff is now site configurable. X# Matching for site is done in new(), not for all tags. X# Building the page is delayed till the end, keeping all tags and X# text fragments in a stack. This allows for going back and deleting X# stuff. (Yet to be done). X# X# Revision 1.3 1997/10/15 00:24:11 abigail X# On the fly configuration. X# Host dependent image selection. X# X# Revision 1.2 1997/08/24 05:15:11 abigail X# Configurable ignoring tags/attributes. Either unconditionally, X# or by looking at the attribute value. X# Rewriting <img> tags if images are not wanted. X# X# Revision 1.1 1997/08/23 09:32:05 abigail X# Initial revision X# X# X# X# Strip nasty tags/attributes from HTML text. X Xuse strict; Xuse Carp; X Xuse Data::Dumper; X Xuse vars qw /@ISA/; Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $NO_TAG_BY_ATTR $TEXT $REPLACE X $ALT $INLINE $NO_ELEMENTS $NETSCAPE_COMMENT/; X Xmy $COMM_ONLY = 1; Xmy $TEXT_ONLY = 2; X Xuse HTML::Parser; X X@ISA = qw /HTML::Parser/; X Xsub deal_with_img ($$); X X X# We mask new to be able to get the inline status. Xsub new ($$) { X my $class = shift; X my $self = new HTML::Parser; X bless $self, $class; X} X X X# To do interesting things, we keep track of the parsed and modified X# part of html text in an instance variable "mod_text". It is set to '' X# by masking parse (). X Xsub parse ($@) { X my $self = shift; X $self -> {response} = shift; X my $host = X $self -> {host} = $self -> {response} -> base -> host; X my $url = X $self -> {url} = $self -> {response} -> base -> as_string; X $self -> {content} = $self -> {response} -> content; X $self -> {last_tag} = 'NIL'; X X # Configuration is site specific. Do all the host matching X # stuff here. X X # Match the URL with all entries in $IMG, remember which X # ones matched. X $self -> {src_matches} = [map {@{$_ -> [1]}} X grep {$url =~ /$_->[0]/;} @$IMG]; X X # Match the URL with all entries in $ALT, remember which X # ones matched. X $self -> {alts} = [map {@{$_ -> [1]}} X grep {$url =~ /$_->[0]/;} @$ALT]; X X # The coderefs for the no attributes. X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ATTR) { X foreach my $elem (@{$site -> [1]}) { X foreach my $attr (@{$elem -> [1]}) { X push @{$self -> {no_attr} -> {$elem -> [0]} -> {$attr -> [0]}}, X $attr -> [1]; X } X } X } X X # The no elements structure. X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_ELEMENTS) { X foreach my $tag (@{$site -> [1]}) { X push @{$self -> {skips} -> {$tag -> [0]}}, $tag -> [1]; X } X } X X # The no tags structures. X foreach my $site (grep {$url =~ /$_->[0]/;} @$NO_TAGS) { X foreach my $tag (@{$site -> [1]}) { X push @{$self -> {no_tags} -> {$tag -> [0]}}, $tag -> [1]; X } X } X X # The information for the text/comment filters. X $self -> {text_info} = [map {@{$_ -> [1]};} X grep {$url =~ /$_->[0]/;} @$TEXT]; X X # The information for text replacement. X $self -> {replace} = [map {@{$_ -> [1]};} X grep {$url =~ /$_->[0]/;} @$REPLACE]; X X $self -> netscape_buggy_comment ($NETSCAPE_COMMENT); # Silly netscape. X X # Flag to indicate whether white space collapses or not. X $self -> {pre} = 0; X X # Flag to indicate whether we are in 'skip' mode or not. X $self -> {skip} = 0; X $self -> {end_skip} = undef; X $self -> {skip_type} = undef; X X $self -> {skip_element} = ''; X $self -> {skip_element_count} = 0; X X # Initialize the stack. X $self -> s_init; X X $self -> SUPER::parse ($self -> {content}); X} X X# Return the modified text. Xsub parsed_text ($) { X my $self = shift; X $self -> build_page; X} X X# Now we mask the various "call back" functions. X X# Declarations are just copied. (But what about comments?) Xsub declaration ($$) { X my $self = shift; X my $decl = shift; X return if $self -> {skip}; X $self -> s_push (["DECL", $decl]); X} X X# This is the interesting function. Xsub start ($$$$$) { X my $self = shift; X return if $self -> {skip}; X X my ($tag, $attributes, $attrseq, $origtext) = @_; X X # If the tag is the same as the element we are currently X # skipping, we need to increment to nesting counter. X if ($tag eq $self -> {skip_element}) { X $self -> {skip_element_count} ++; X return; X } X X # my $skip_sub = $self -> {skips} -> {$tag}; X foreach my $skip_sub (@{$self -> {skips} -> {$tag} || []}) { X if ($skip_sub && $skip_sub -> ($tag, $attributes, $self)) { X # Go into element skipping mode. X $self -> {skip} = 1; X $self -> {skip_element} = $tag; X $self -> {skip_element_count} = 1; X return; X } X } X X foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) { X return if $sub && $sub -> ($tag, $attributes, $self, 0); X } X X # Image is special. X if ($tag eq 'img') { X return if $self -> no_image ($attributes -> {src}); X my $alt = $self -> img_to_alt ($attributes); X if ($alt) { X $self -> {last_tag} = 'NIL'; X $self -> s_push (["TEXT", $alt]); X return; X } X } X X my $no_attr = $self -> {no_attr} -> {$tag}; X if ($no_attr) { X foreach my $attribute (keys %$attributes) { X foreach my $sub (@{$no_attr -> {$attribute} || []}) { X delete $attributes -> {$attribute} if X $sub -> ($attributes -> {$attribute}, $self); X } X } X } X X $self -> {pre} = 1 if $tag eq 'pre'; X $self -> {last_tag} = $tag; X $self -> s_push (["START", $tag, $attributes]); X} X X X# Copy end tags, except for those being filtered out. Xsub end ($$) { X my $self = shift; X my $tag = shift; X X # If we are skipping an entire element, and we encounter a X # closing tag for that element, we decrement the nesting counter. X # If it becomes 0, we stop skipping. X if ($tag eq $self -> {skip_element}) { X return if -- $self -> {skip_element_count} > 0; X $self -> {skip} = 0; X $self -> {skip_element} = ''; X return; X } X X return if $self -> {skip}; X X foreach my $sub (@{$self -> {no_tags} -> {$tag} || []}) { X return if $sub && $sub -> ($tag, {}, $self, 1); X } X X $self -> {pre} = 0 if $tag eq 'pre'; X $self -> {last_tag} = 'NIL'; X $self -> s_push (["END", $tag]); X} X X# Strip out any hacks. Xsub text ($$$) { X my $self = shift; X my $text = shift; X my $comment = shift || 0; X X $text =~ s/(?: ?| ?)(?: ?| ?|\s)+/ /g X unless $self -> {pre} || $comment; X X if ($self -> {skip}) { X # We are in skipping mode. X # Hence check if it matches the end of skipping mode. X if (defined $self -> {end_skip} && X $text =~ /$self->{end_skip}/ && X (!defined $self -> {skip_type} || X ($self -> {skip_type} == $COMM_ONLY && $comment) || X ($self -> {skip_type} == $TEXT_ONLY && !$comment))) { X $self -> {skip} = 0; X $self -> {end_skip} = undef; X } X return; X } X X foreach my $skip_text (@{$self -> {text_info}}) { X if (ref $skip_text) { X if (defined $skip_text -> [2]) { X next if $skip_text -> [2] == $COMM_ONLY && !$comment; X next if $skip_text -> [2] == $TEXT_ONLY && $comment; X } X # It is an array with a begin and end match. X if ($text =~ /$skip_text->[0]/) { X # Match the first element. X # Enter skip mode, remember second element. X $self -> {skip} = 1; X $self -> {end_skip} = $skip_text -> [1]; X $self -> {skip_type} = $skip_text -> [2]; X return; X } X } X # Else it's a string. Skip this text if it matches. X # No point in skipping a comment. X else {return if !$comment && $text =~ $skip_text;} X } X X # Do textual replacements. X foreach my $replacement (@{$self -> {replace}}) { X my ($pattern, $repl) = @$replacement; X $text =~ s/$pattern/$repl/g; X } X X $self -> {last_tag} = 'NIL' if $text =~ /\S/ && !$comment; X $self -> s_push ([$comment ? "COMMENT" : "TEXT", $text]); X} X X# Comments are just copied. X# Hmm, 'man HTML::Parser' is unclear. Comments *are* part of a document X# declaration. And you can have multiple comments in a document declarion. X# Hope this will work.... Xsub comment ($$) { X my $self = shift; X my $comment = shift; X $self -> text ($comment, 1); X} X X X# Return 1 if we shouldn't include the image. Xsub no_image ($$) { X my ($self, $src) = @_; X X foreach my $s (@{$self -> {src_matches}}) {return 1 if $src =~ /$s/;} X return; X} X X# Check if we prefer the alt text. Xsub img_to_alt ($$) { X my ($self, $attributes) = @_; X X my $src = $attributes -> {src}; X foreach my $s (@{$self -> {alts}}) { X if ($src =~ /$s/) {return $attributes -> {alt} || $INLINE;} X } X return; X} X X X X# Stack functions. Xsub s_init () {my $self = shift; $self -> {stack} = [];} Xsub s_push () {my $self = shift; push @{$self -> {stack}}, shift;} Xsub s_pop () {my $self = shift; pop @{$self -> {stack}};} Xsub s_top () {my $self = shift; $self -> {stack} -> [-1];} X X Xsub build_tag ($$$) { X my $self = shift; X my ($tag, $attributes) = @_; X my $line = "<$tag"; X foreach my $attr (keys %$attributes) { X # We lost the info whether it was single or double quote delimited. X # So we take the "safe" approach and use double quote delimited, X # but first we need to eliminate the existing double quotes. X $attributes -> {$attr} =~ s/"/"/g; X $line .= qq ( $attr = "$attributes->{$attr}"); X } X $line .= ">"; X} X Xsub build_page () { X my $self = shift; X my $text = ""; X X # Some pages miss </table> tags. This is usually due to different X # error recovery strategies of browsers and HTML::Parser. X # Count them, and insert any missing </tables> before the end X # of the document. X my $tables = 0; X my $body = 0; X X foreach my $part (@{$self -> {stack}}) { X my ($type, $tag, $attr) = @$part; X foreach ($type) { X /TEXT/ && do {$text .= $tag; last;}; X /COMMENT/ && do {$text .= "<!--$tag-->"; last;}; X /DECL/ && do {$text .= "<!$tag>"; last;}; X /END/ && do {$tables -- if $tag eq 'table'; X do {$text .= "</table>" x $tables; X $body = 1;} if !$body && ($tag eq 'body' || X $tag eq 'html'); X $text .= "</$tag>"; X last;}; X /START/ && do {$text .= $self -> build_tag ($tag, $attr); X $tables ++ if $tag eq 'table'; X last;}; X die "Unknown type: $type\n"; X } X } X X $text .= "</table>" x $tables unless $body; X X $text; X} X X1; SHAR_EOF : || $echo 'restore of' 'abiprox/Proxy/HTML.pm' 'failed' fi # ============= abiprox/Proxy/Config.pm ============== if test -f 'abiprox/Proxy/Config.pm' && test "$first_param" != -c; then $echo 'x -' SKIPPING 'abiprox/Proxy/Config.pm' '(file already exists)' else $echo 'x -' extracting 'abiprox/Proxy/Config.pm' '(text)' sed 's/^X//' << 'SHAR_EOF' > 'abiprox/Proxy/Config.pm' && Xpackage main; X X# X# $Id: Config.pm,v 1.5 1998/02/01 15:24:34 abigail Exp abigail $ X# X# $Log: Config.pm,v $ X# Revision 1.5 1998/02/01 15:24:34 abigail X# More data. X# X# Revision 1.4 1997/11/23 05:50:37 abigail X# Change structure of no_attributes and no_tags datastructures. X# They are now arrays of tags or tag/attribute pairs, so we can X# have side specific *and* general configuration. X# This gets rid of the map inside map inside map expression. X# X# Revision 1.3 1997/11/03 17:28:48 abigail X# Many changes, including support for not showing images based on the X# scr attribute, showing an images alt attribute, and deleting entire X# elements. X# X# Revision 1.2 1997/10/20 04:44:34 abigail X# All HTML stuff is now site configurable. X# X# Revision 1.1 1997/10/15 00:25:58 abigail X# Initial revision X# X# X# X# Configuration for proxy. X Xuse strict; X Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/; X # Headers we do not want in X # outgoing requests. X@NO_REQ_HEADERS = qw /User-Agent From Referer Cookie/; X@NO_RES_HEADERS = qw /Cache-control Set-Cookie Refresh Etag/; X # Headers we do not want in X # responses. X Xpackage Proxy::HTML; X X Xuse vars qw /$IMG $NO_TAGS $NO_ATTR $TEXT $REPLACE $NO_ELEMENTS X $ALT $INLINE $NETSCAPE_COMMENT/; X X X# This is done a bit tricky, but that's to be able to cope with the X# fact this file gets re-required over time. X# This function takes an attribute name and a list of regexes as X# argument, and returns a closure that returns true if the value X# of the attribute matches one of the regexs. Xmy $generic_filter = sub { X my ($attribute, @filters) = @_; X local $" = ", "; X sub { X my ($tag, $attributes, $self, $end) = @_; X my $attr = $attributes -> {$attribute} or return; X foreach (@filters) {return 1 if $attr =~ /$_/;} X return; X } X}; X X# Easy function to quickly make long regexs. Xmy $image = sub { X my ($type, $slash, @images) = @_; X local $" = "|"; X my $regex = $slash ? '/' : ""; X $regex = "$regex(?:@images)"; X $regex .= "\\.$type" if $type; X $regex; X}; X X X# Regexes that match certain sites. Xmy $ALL = '.?'; # Regex that always matches. Xmy $CNN = '//[-\w.]*\b(?:cnn(?:si|fn)?|allpolitics)\.com/'; Xmy $HELEN = '//[-\w.]*\bpeterzale\.com/'; # Helen strip. Xmy $COMICS = '//[-\w.]*\bunitedmedia\.com/'; # Dilbert/Snoopy Xmy $NEWS = '//[-\w.]*\bnews\.com/'; Xmy $ALTAVISTA = '//[-\w.]*\bdigital\.com/'; Xmy $INFOSEEK = '//[-\w.]*\binfoseek\.com/'; Xmy $YAHOO = '//[-\w.]*\byahoo\.com/'; Xmy $LYCOS = '//[-\w.]*\blycos\.com/'; Xmy $PERL = '//[-\w.]*\bperl\.com/'; Xmy $PLANET = '//[-\w.]*\bpi\.net/'; # Planet Internet. Xmy $DEJANEWS = '//[-\w.]*\bdejanews\.com/'; Xmy $KRUD = '//[-\w.]*\bkrud\.com/'; # Krud Radio cartoon. Xmy $FORBES = '//[-\w.]*\bforbes\.com/'; Xmy $VOLKSKRANT = '//[-\w.]*\bvolkskrant\.nl/'; Xmy $NRC = '//[-\w.]*\bnrc\.nl/'; Xmy $NYTIMES = '//[-\w.]*\bnytimes\.com/'; Xmy $MOVIE = '//[-\w.]*\bimdb\.com/'; # Internet Movie Database. Xmy $NORTHERN = '//[-\w.]*\bnorthernlight\.com/'; # Northern Light. Xmy $HOTBOT = '//[-\w.]*\bhotbot\.com/'; # Hotbot Xmy $MERRIAM = '//[-\w.]*\bm-w\.com/'; # Online webster. Xmy $REDMEAT = '//[-\w.]*\bredmeat\.com/'; # Strips. Xmy $UKTIMES = '//[-\w.]*\bthe-times\.co\.uk/'; # UK Times. Xmy $MERC = '//[-\w.]*\bsjmercury\.com/'; # SJ Mercury News Xmy $LATIMES = '//[-\w.]*\blatimes\.com/'; # Los Angeles Times Xmy $LFTIMES = '//[-\w.]*\bft\.com/'; # London Financial Times Xmy $USATODAY = '//[-\w.]*\busatoday\.com/'; Xmy $ZDNET = '//[-\w.]*\bzdnet\.com/'; X X# Remove logos, nav. bars and ads from commonly visited sites. X X# Regexes for well know add/counter sites. X# Used to block both A element, and the IMG element. Xmy @AD_SITES_COM = qw /focalink digiweb ad.preferences preferences nrsite linkexchange X aaddzz/; Xmy @AD_SITES_NET = qw /cerf doubleclick/; Xmy @AD_SITES_MISC = qw /www\.image\.dk/; Xmy @COUNTER_COM = qw /hitbox pagecount icount www\.fxweb\.holowww X aaddzz siteflow net-user/; Xmy @COUNTER_NET = qw /net-trak\.stats/; Xmy @COUNTER_MISC = qw /www\.image\.dk www\.nedstat\.nl/; Xmy ($AD_SITES_REGEX, $COUNTER_SITES_REGEX); X{local $" = '|'; X $AD_SITES_REGEX = "//[\\w.-]*\\b(?:(?:(?:@AD_SITES_COM)\\.com)|" . X "(?:(?:@AD_SITES_NET)\\.net)|" . X "(?:@AD_SITES_MISC))"; X $COUNTER_SITES_REGEX = "//[\\w.-]*\\b(?:(?:(?:@COUNTER_COM)\\.com)|" . X "(?:(?:@COUNTER_NET)\\.net)|" . X "(?:@COUNTER_MISC))"; X} X X# Regex for well known counter programs. Xmy $COUNTERS = join '|', qw /usercounter viewstat/; X X$IMG = [ X [$ALL => [$AD_SITES_REGEX, X $COUNTER_SITES_REGEX, X "/cgi-bin/(?:$COUNTERS)", X qw { \bads?\b X \badv(?:erts?)?/b X /p(?:oint)?cast/ X /Count\d*\.cgi X /netscape\.gif X }]], X X [$HELEN => [$image -> ('gif', 0, qw {awrd hel he ar gwe sp dou dg}), X ]], X X [$MERRIAM => [$image -> ('gif', 1, qw {but hddict logot})]], X X [$COMICS => [$image -> ('gif', 1, qw { X invis hr4 header headline \w*interbar flowers X um_bug bnn yoyo\d* dil_(?:partner|store)_icon X ms_(?:black|top(?:l)?|list_right|icon_store) X ms_(?:snoopy|editoons|store|copy|about(?:um|cz)) X ms_(?:read|dilbert|bottom|comiclist|mustsee) X link inter (?:dnrc|lotd|ddmw|duh)_icon spacer X email_endnavi 256c-100 dz_logo_sm ad slogan X (?:(?:ms|d[zd]|exodus)_|IE4|com)logo X dz_(?:anigiftease|archive|agenda|strip_[br]) X dz_(?:lart|link)_(?:lotd|ddmw|dnrc|duh|store) X ab_(?:logo[TBC]|topline|todays) bottomnav_map X ab_links_(?:dilbert|snoopy|editoons|store) X c_zone official collect artafs wood wood2 X new_cool strip store window middle linus kids X charlie lucy snoopy sally gall schroder today X epn_map_(?:t|b_books) grown new_feet X spacer_black backto a_month bottom X linksmal sh_books reading black 3dtheater X (?:sunday|arch|comic|email|more)navi(?:RED)? X youngdil}), X $image -> ('jpg', 1, qw {Backpack}), X $image -> ('', 1, qw {storeImages/}), X ]], X X [$VOLKSKRANT X => [$image -> ('gif', 1, qw { X postbank rabo demon i_voorpag_kop msie40 bruna X abnamro i_indruk_kop indruk_tab X v_\w+ media2 dossiers2 tab_\w+ empty_\d+x\d+ X i_\w+_1_136w sport\d X bovenrnd onderrand_1997 lijn})]], X X [$NRC => [$image -> ('gif', 0, qw { X luxklein(?:sup|n) hekk splash mast 1x1}) X ]], X X [$CNN => [$image -> ('gif', 1, qw { X video\.bug cnn_logo cnn_interactive nav[\w\.-]+ X spacer explore\.anim(?:\.\d+\.\d+)? X pathnet\.warner airmedia insight white X circle\.(?:white|red) arrow\.blue X custom_feature pointcast ie hr fringe_us logo qa X bullet\d* y\.matters today_cnn dot to_top2 blue X [\w-.]+banner search\.feed plus_message cities X back_to_top(?:\.space)? sound\.icon sci_tech X what_you_think\.space vx_bar\+txt weather bbs X b_OPTIONS b_SUBMIT story_promo nb_CUSTOM X nb_CNNIN nb_CUSTOM_PROFILE b_TOP plus_logo X search\.bn\.recommends book\.search bnoble X bn\.recommend\.box\.(?:left|top|right) main_us X nb_ALL_POLITICS nb_CNN(?:fn|si) news search X related(?:stories|sites) stormcenter_link X nb_CNN_INTERACTIVE mn_Lifestyle mn_Sci-Tech X sep_BANNERDOTS dot_GREY nb_LOWER_STEP dline X pumpkin\.bullet\.icon jol\.top buttons X explore\.[\w.]+ X message\.boards qt_icon logo_[lr] mn_Showbiz}), X $image -> ('jpg', 1, qw { X [\w-.]+banner news bug related(?:stories|sites) X watch\.gutter yir\.special today_cnn X }), X $image -> ('gif', 0, qw {nav_no\.cursor logo}), X $image -> ('jpg', 0, qw {lotd vidpik}), X $image -> ('', 0, qw {[Ii]nfoseek pathfinder}), X ]], X X [$YAHOO => [$image -> ('', 0, qw {/adv/}), X $image -> ('gif', 1, qw {new2}) X ]], X X [$NEWS => [$image -> ('gif', 1, qw { X newscom item_\w+ cnet home}), X $image -> ('', 1, qw {Banners/ bump masthead Frontdoor}), X ]], X X [$ALTAVISTA => [$image -> ('gif', 1, qw { X av_(?:logo|map) pixel searchpx digital-logo X preview}), X ]], X X [$INFOSEEK => [$image -> ('gif', 1, qw { X home_bar_ups big_yellow_red ups_track select X smart_search_results_ups greenbal}), X $image -> ('gif', 0, qw {circlei}), X ]], X X [$LYCOS => [$image -> ('gif', 1, qw { X new-logo place-holder navbar servicebar homerule X lycos-logo games-icon Lilarrow-(?:vspace|red) X poweredbyww sweepicn lilarrow scificon X threearrows logobar}), X ]], X X [$PERL => [$image -> ('gif', 1, qw { X ora_logo perl_id_313c header-nav camel}), X $image -> ('jpg', 1, qw {perl_lang_color}), X ]], X X [$PLANET => [$image -> ('gif', 1, qw {pilogo klgnetbn 1 now}), X $image -> ('', 0, qw {/adv/ /pics/div/}), X ]], X X [$DEJANEWS => [$image -> ('gif', 1, qw { X browsers nav_\w* head home2 fingers [rl]tri 1x1 X qm tgb text thr prev(?:_x)? curr new redline X auth X iworld_f97 next(?:_x)? [rl]arr(?:_x)? post_a}), X $image -> ('', 1, qw {dnlogo_ frownie_}), X ]], X X [$KRUD => [$image -> ('gif', 1, qw {rewind tip krudrad})]], X X [$FORBES => [$image -> ('gif', 1, qw { X adot comments X theme_ban netvlu14d logo-otl deliver netvlu}) X ]], X X [$NYTIMES => [$image -> ('gif', 1, qw {maintoolbar cyberex 1bancyber})]], X X [$MOVIE => [$image -> ('gif', 1, qw {click\d+ accentdesign center})]], X X [$NORTHERN => [$image -> ('gif', 1, qw { X logo(?:_corner2|-tall) (?:home|support)gray X side\d sson 00000 searchfor[12] line transparent X searchresults templogo foldery special X www dynamicfolders bullet_right feature})]], X X [$HOTBOT => [$image -> ('gif', 1, qw {misc\.transPixel})]], X X [$UKTIMES => [$image -> ('', 1, qw {_ecb_timnws_ valentine})]], X X X]; X X# As $IMG, but then for the cases we prefer the alt text. X$ALT = [ X [$VOLKSKRANT X => [$image -> ('', 1, qw {images/balk})]], X X [$DEJANEWS => [$image -> ('gif', 1, qw {gifs/..})]], X X [$COMICS => [$image -> ('gif', 1, qw {ab_next})]], X X [$NORTHERN => [$image -> ('gif', 1, qw { X return2resultssq newsearch purchasedoc moneyback X spec-info return2results newsearchround search X home X aboutsq accounts support disclaim help_better-z X nextten prev10 searchhelp startover})]], X]; X X# Need this later. Xmy $delete = sub {1;}; Xmy $absolute_width = sub {shift !~ /%$/;}; Xmy $no_multiple = sub {my ($tag, $attr, $self, $end) = @_; X $tag eq $self -> {last_tag}}; X X X Xmy @CNN_ADS = qw { \bredirect\b \bclick\b \bevent\.ng\b \bclick\.ng\b SHOWBIZAdSpace /ads/ /CNNProm /adinfo/ X /www\.reservationdesk\.com}; Xmy $CNN_ADS = join '|', @CNN_ADS; X X X# $NO_TAGS will remove tags. Configurable by site. X$NO_TAGS = [ X [$ALL => [[br => $no_multiple], X [hr => $no_multiple], X [meta => $generic_filter -> ('http-equiv', '.')], X [base => $generic_filter -> ('target', '_blank')], X [img => $generic_filter -> ('src' => '\b(?:ads?|adv(?:erts?)?)\b', X '/p(?:oint)?cast/', X '\bclick(thr)?', X '\bredirects?\b', X '\bad(?:vert(?:isement)?)?s?\b', X '(?:link|banner|counter)\.cgi', X '/promo(?:tions?)?/', X '(?:event|click?)\.ng', X $COUNTER_SITES_REGEX, X $AD_SITES_REGEX)], X map {[$_, $delete]} qw /font basefont blink applet X param embed/] X ], X X [$CNN => [[hr => sub {my ($tag, $attr, $self, $end) = @_; X $self -> {last_tag} eq 'a';}]]], X X # [$LYCOS => [[frame => $generic_filter -> ('src', '/wguide/tools/')]]], X]; X X X# $NO_ELEMENTS will remove entire *elements*. Configurable by site. X# Very handy to remove ad banners. X$NO_ELEMENTS = [ X X [$ALL => [ X [a => $generic_filter -> ('href', '\b(?:ads?|adv(?:erts?)?)\b', X '/p(?:oint)?cast/', X '\bclick(thr)?', X '\bredirects?\b', X '\bredir\b', X '\bad(?:vert(?:isement)?)?s?\b', X '(?:link|banner|counter)\.cgi', X '/promo(?:tions?)?/', X '(?:event|click?)\.ng', X $COUNTER_SITES_REGEX, X $AD_SITES_REGEX)], X X ]], X X [$LFTIMES => [[ a => $generic_filter -> (href => 'pft/jump')]]], X X [$LATIMES => [[ a => $generic_filter -> (href => 'event.ng')]]], X [$USATODAY => [[ a => $generic_filter -> (href => 'sponsors')]]], X X [$ALL => [[a => $generic_filter -> ('href', 'http://\S*infi.net')]]], X X [$CNN => [[a => $generic_filter -> ('href', $CNN_ADS, X 'video\.html')], X [form => $generic_filter -> ('action', 'servercast\.net')]]], X X [$PERL => [[a => $generic_filter -> ('href', 'http://opensource\.oreilly\.com/news/\w+_\d+\.html')]]], X X [$NYTIMES => [[form => $generic_filter -> ('action', '/ads/')]]], X X [$NRC => [[a => $generic_filter -> ('href', '/Advertenties/')]]], X X [$YAHOO => [[a => $generic_filter -> (href => '/[*?]http:/')]]], X X [$LYCOS => [[a => $generic_filter -> ('href', 'adclick')], X [form => $generic_filter -> ('action', 'nph-adclick.exe')]]], X X [$MOVIE => [[a => $generic_filter -> ('href', '/Ads/')]]], X X [$DEJANEWS => [[a => $generic_filter -> ('href', 'gtplacer')]]], X X [$HOTBOT => [[a => $generic_filter -> ('href', 'clickthru\.html')]]], X X [$COMICS => [[a => $generic_filter -> ('href', '1800flowers', X 'netscape\.com', X 'comiczone\.cdf', X 'microsoft\.com')], X ['select' => $generic_filter -> ('onchange', 'window\.open')], X [form => $generic_filter -> ('onsubmit', 'window\.open')]]], X X [$REDMEAT => [[a => $generic_filter -> ('href', 'riddler.com')]]], X X [$UKTIMES => [[a => $generic_filter -> ('href', 'valentines', X 'infotimes', '\bstanding\b', '\.map\+\d')]]], X X [$ZDNET => [[form => $generic_filter -> ('action', '/adverts/')]]], X X]; X X X X# Initialize the NO_ATTR datastructure. X# The form is: tag -> attribute -> coderef, per site. X# If the attribute for that tag exists, the coderef is called with X# the attribute value as parameter. If it returns true, it's deleted. X$NO_ATTR = [ X [$ALL => [[img => [[border => $delete]]], X [input => [[border => $delete]]], X [a => [[target => sub {shift =~ /new$/i;}]]], X [frameset => [[frameborder => $delete], X [border => $delete], X [framespacing => $delete]]], X [frame => [[noresize => $delete], X [marginwidth => $delete], X [marginheight => $delete], X [frameborder => $delete], X [scrolling => $delete]]], X [hr => [[noshade => $delete], X [width => $delete]]], X [table => [[width => $absolute_width], X [height => $absolute_width], X [align => $delete], X [cellspacing => $delete], X [cellpadding => $delete], X [nowrap => $delete], X [bgcolor => $delete]]], X [td => [[width => $absolute_width], X [height => $delete], X [nowrap => $delete], X [bgcolor => $delete]]], X [th => [[width => $absolute_width], X [height => $delete], X [nowrap => $delete], X [bgcolor => $delete]]], X ['tr' => [[width => $absolute_width], X [nowrap => $delete], X [bgcolor => $delete]]], X [body => [[background => $delete], X [bgcolor => $delete], X ['text' => $delete], X ['link' => $delete], X [vlink => $delete], X [alink => $delete]]]], X ], X X [$NORTHERN => [[td => [[nowrap => $delete]]]]], X]; X X X X# This strips out text, or parts of the document. X# Each site has a list of strings or 2 string elements. X# If a text or comment matches the string, it's ignored. X# For string pairs, whenever a text or comment matches the first string, X# the parsers goes into 'ignore' mode, until the second string matches X# a comment or text. *After* which it goes back to normal parsing. Xmy $COMM_ONLY = 1; Xmy $TEXT_ONLY = 2; X$TEXT = [ X [$CNN => ['Advertising information for CNN Web', X 'Pages will open in a new browser', X ['Search and Promos', 'Search and Promos', $COMM_ONLY], X ['CNN/SI Bullets', 'CNN/SI Bullets', $COMM_ONLY], X ['Community Bullets', 'Community Bullets', $COMM_ONLY], X ['Search Table', 'Search Table', $COMM_ONLY], X ['Nagano Bullets', 'Nagano Bullets', $COMM_ONLY], X ['Jobs', 'Jobs', $COMM_ONLY], X ]], X [$MERC => [['NetG AD', 'ENDAD', $COMM_ONLY]]], X X [$LYCOS => [['^\s*AD\b', '(?:\bEND |/)AD\b', $COMM_ONLY]]], X]; X X X# Replace a text with something else. X# We have 2 element arrays of strings. First is regex, second a replacement. X# No $1 and friends available. For now that is. X$REPLACE = [ X # There is some real bad software out there. X [$ALL => [['(?i)<font' => '<bogus'], X ['(?i)</font' => '</bogus'] X ]], X X [$NORTHERN => [['(?i)<meta' => '<bogus']]], X X [$MERRIAM => [['(?i)<img' => '<bogus']]], X X]; X X# How to deal with netscape comments. X$NETSCAPE_COMMENT = 1; X X# Text we like to have for inlines without alt. X$INLINE = '[INLINE]'; X X1; SHAR_EOF : || $echo 'restore of' 'abiprox/Proxy/Config.pm' 'failed' fi # ============= abiprox/proxy.pl ============== if test -f 'abiprox/proxy.pl' && test "$first_param" != -c; then $echo 'x -' SKIPPING 'abiprox/proxy.pl' '(file already exists)' else $echo 'x -' extracting 'abiprox/proxy.pl' '(text)' sed 's/^X//' << 'SHAR_EOF' > 'abiprox/proxy.pl' && X#!/usr/bin/perl -w X X# X# $Id: proxy.pl,v 1.5 1998/02/01 15:25:55 abigail Exp abigail $ X# X# $Log: proxy.pl,v $ X# Revision 1.5 1998/02/01 15:25:55 abigail X# Error handling and REAPER changes as suggested by Tom C. X# X# Revision 1.4 1997/11/03 17:28:03 abigail X# Minor changes. X# X# Revision 1.3 1997/10/15 00:20:26 abigail X# Added on the fly configuration. X# X# Revision 1.2 1997/08/24 05:14:58 abigail X# Added $IGNORE feature. X# X# Revision 1.1 1997/08/23 09:35:25 abigail X# Initial revision X# X# X# X# A proxy server that strips out nasty HTTP headers, and nasty HTML tags. X# The proxy part is inspired on Randal's Webtechniques column 11. X# [http://www.stonehenge.com/merlyn/WebTechniques/col11.html]. X Xuse strict; Xuse Carp; X X#use lib qw (/nfs1/home/abigail/Perl); X### FIXME Xuse lib qw (/home/tchrist/nap); X Xuse LWP::UserAgent; Xuse HTTP::Daemon; Xuse Proxy::HTML; Xuse POSIX qw /:sys_wait_h/; X X X# Subclass LWP::UserAgent to have the browser know about redirects. X X@UserAgent::ISA = qw /LWP::UserAgent/; Xsub UserAgent::redirect_ok {0;} # No, it is *not* ok to redirect. X X# And back to the main stuff. X Xsub handle_connection ($); Xsub fetch ($); Xsub illegal (%); X Xuse vars qw /@NO_REQ_HEADERS @NO_RES_HEADERS/; X X# This is just to satisfy -T, we don't call other programs. X$ENV {PATH} = join ':', qw (/usr/bin); # /bin -> /usr/bin on Solaris. X X# On the fly configuration file. X# Hardcoded file name - not very nice. X### FIXME Xmy $Config = "/home/tchrist/nap/Proxy/Config.pm"; X X# Configuration. X X### FIXME Xmy $ME = "wherever.youare.com"; # The machine I run on. Xmy $LISTEN_PORT = shift || 8888; # That's the port I'll listen to. Xdie "Invalid port" if $LISTEN_PORT =~ /\D/ || $LISTEN_PORT < 1024; X($LISTEN_PORT) = $LISTEN_PORT =~ m/(.*)/; # Untaint. X### FIXME Xmy $PROXY = "notherplace.someproxy.com"; # The machine the proxy runs on. Xmy $PROXY_PORT = 8181; # The proxy's port. Xmy @PROXY_SCHEMES = qw /http ftp/; # Proxy accepts this. Xundef $PROXY; # We don't need the other proxy anymore. X Xmy $FORBIDDEN = 403; # HTTP Response code. Xmy $FORBIDDEN_TEXT = 'Forbidden'; # Text of the above. X X # How I want to be named. X # RCS will fill in the version id. Xmy $NAME = q <HTML stripper $Revision: 1.5 $>; X Xmy $last_required = 0; X X# Initialize the daemon. Xmy $daemon = new HTTP::Daemon LocalAddr => $ME, LocalPort => $LISTEN_PORT, X Reuse => 1 X or croak "HTTP Proxy failed to initialize: $!"; X X# Initialize agent. Xmy $agent = new UserAgent; X $agent -> agent ($NAME); Xif (defined $PROXY) { X $agent -> proxy (\@PROXY_SCHEMES, "http://$PROXY:$PROXY_PORT/"); X} X $agent -> env_proxy (); # Set up whatever else you want. X X $agent -> parse_head (0); # Man page says: Do not turn this off, X # unless you know what you X # are doing. X # I know damn well what I'm doing, and one X # of the reasons to start this whole project X # are those annoying elements in the head. X # I rather have software without a mind of X # its own. Furthermore, read the frigging X # spec, 'http-equiv' is there for *servers*, X # not user agents. Damnit. X # At least server authors always had the X # sense to ignore those elements. X X# Zombies are bad. Xsub REAPER { X my $child; X while (($child = waitpid (-1, WNOHANG)) && $child != -1) { } X $SIG{CHLD} = \&REAPER; # still loathe sysV X} X$SIG {CHLD} = \&REAPER; X Xwarn "Proxy on $ME ($LISTEN_PORT) initialized\n"; X X# And loop forever. Xwhile (my $slave = $daemon -> accept) { X # Reconfigure if the config file changed. X my $mod = (stat $Config) [9] || 0; X if ($mod > $last_required) { X delete $INC {$Config}; X require $Config; X $last_required = $mod; X } X handle_connection $slave; X} X X# That's all folks. Xexit 0; X X X# This handles the connection. Fork, and let the child deal with X# the stuff. Child exits after dealing with the stuff. Xsub handle_connection ($) { X my $connection = shift; X X my $pid = fork; X if ($pid) { X close $connection; # Can I do $connection -> close; ? If no, why not? X return; X } X X X my $request = $connection -> get_request; X if (defined $request) { X $0 = "tbtw " . $request->url->as_string(); X ## print "$0\n"; X my $response = fetch $request; X # Error handling added by Tom Christiansen, 19971118. X unless ($response -> is_error) { X $connection -> send_response ($response); X } X else { X my $trailer = "<br>---------------</br><em>$NAME</em>"; X my $content = $response -> error_as_HTML; X $content =~ s!(?=</body>)!$trailer!i; X $connection -> send_error ($response -> code, $content); X } X close $connection; X } X exit 0 if defined $pid; X} X X# Check if a request is valid. If it is, strip info from the client, X# fetch the stuff, strip info from the server, and return the result. X# For illegal stuff, return an error message. Xsub fetch ($) { X my $request = shift; X X # Check if valid. X if ($request -> url -> scheme ne 'http') { X return illegal url => $request -> url, reason => "bad scheme"; X } X X my $response = eval {$agent -> request ($request -> strip) -> strip;}; X print scalar localtime, "\n$@\n\n" if $@; X X # print scalar localtime, "\n", $response -> as_string, "\n\n"; X X $response; # Will generate a 'document contains no data' if there's X # an error. Needs a fix. X} X X X# Create a forbidden response. Xsub illegal (%) { X my %args = @_; X my $response = new HTTP::Response $FORBIDDEN, $FORBIDDEN_TEXT; X X # There's only one illegal action for now.... X foreach ($args {reason}) { X /^bad scheme/ && do { X $response -> content ("Illegal scheme: $args{url}"); X last; X }; X # Any... X $response -> content ("Illegal action."); X } X X $response; X} X X X# This sub routine strips out HTTP headers we do not want to send. X# It's just an extra function added to HTTP::Request. Xsub HTTP::Request::strip ($) { X my $self = shift; X $self -> remove_header (@NO_REQ_HEADERS); X $self; # Return self for extra coolness. X} X X X# Strip out HTTP response headers we do not want to send to the client. X# And if the returned type is of text/html, filter it. X# We put this function into HTTP::Response. Xsub HTTP::Response::strip ($) { X my $self = shift; X X $self -> remove_header (@NO_RES_HEADERS); X X if ($self -> content_type eq 'text/html') { X my $html = new Proxy::HTML; X # Replace content with the parsed text after parsing the content. X $self -> content ($html -> parse ($self) -> parsed_text); X } X X $self; # Return self for extra coolness. X} X SHAR_EOF : || $echo 'restore of' 'abiprox/proxy.pl' 'failed' fi rm -fr _sh22768 exit 0 -- I don't know. I don't care. And it doesn't make any difference.