#!/usr/local/bin/perl -w ########################################################################### # mime_strip.html_bodies.pl # ########################################################################### # Used to strip the "HTML-alternative" MIME attachments that some Mozilla # # and MS Outlook users like to add to every email. Used in conjunction # # with procmail. # # # # Originally coded 12/17/2002 by Lester Hightower # # Based on info from this URL: http://perlmonks.thepen.com/53404.html # ########################################################################### use strict; use MIME::Parser; use MIME::Entity; use IPC::Open2; use FileHandle; use Getopt::Long; $Getopt::Long::ignorecase = 0; $Getopt::Long::debug = 0; my $DoSpamCheck=0; my $MsgDupThreshold=-1; my $help=0; my $DEBUG=0; if ( !GetOptions( 'spamcheck=i' => \$DoSpamCheck, 'msg-dup-threshold=i' => \$MsgDupThreshold, 'help' => \$help, 'debug' => \$DEBUG, ) or $help ) { app_usage(2); } my $VERSION = 1.6; $|++; my $envelope = ; my $parser = MIME::Parser->new; $parser->output_to_core(1); $parser->tmp_to_core(1); my $ent = eval { $parser->parse(\*STDIN) }; die "$@" if $@; if ($ent->effective_type eq "multipart/alternative" and $ent->parts == 2 and $ent->parts(0)->effective_type eq "text/plain" and $ent->parts(1)->effective_type eq "text/html") { my $body=$ent->parts(0)->bodyhandle->as_string(); # If the body is short (so a regex makes sense) and it's all white-space, # then dropping the HTML body makes the message a little useless, and # likely it is spam... However, if we have lynx available, we'll # try to convert it to plain text. if ( (length($body) < 500 && $body =~ m/^\s+$/) || $DoSpamCheck) { my $html_body=$ent->parts(1)->bodyhandle->as_string(); my $new_body = &HTMLtoTXTviaLynx(\$html_body); if (length($new_body)) { if ($DoSpamCheck) { $body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold); } else { $body = $new_body; } } } else { $body.="\n\n[HTML alternate version deleted]\n"; } my $newent = MIME::Entity->build(Data => $body); $ent->parts([$newent]); $ent->make_singlepart; $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); } elsif ($ent->effective_type eq "multipart/mixed" and $ent->parts(0)->effective_type eq "multipart/alternative" and $ent->parts(0)->parts == 2 and $ent->parts(0)->parts(0)->effective_type eq "text/plain" and $ent->parts(0)->parts(1)->effective_type eq "text/html") { my $body=$ent->parts(0)->parts(0)->bodyhandle->as_string(); # If the body is short (so a regex makes sense) and it's all white-space, # then dropping the HTML body makes the message a little useless, and # likely it is spam... However, if we have lynx available, we'll # try to convert it to plain text. if (length($body) < 500 && $body =~ m/^\s+$/) { my $html_body=$ent->parts(0)->parts(1)->bodyhandle->as_string(); my $new_body = &HTMLtoTXTviaLynx(\$html_body); if (length($new_body)) { if ($DoSpamCheck) { $body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold); } else { $body = $new_body; } } } else { $body.="\n\n[HTML alternate version deleted]\n"; } my $newent = MIME::Entity->build(Data => $body); $ent->parts(0)->parts([$newent]); $ent->parts(0)->make_singlepart; $ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); # # Mozilla Mail (added 12/17/2002 by LHH) # # Emails from mozilla with "multipart/related" attachments. } elsif ($ent->effective_type eq "multipart/alternative" and $ent->parts == 2 and $ent->parts(0)->effective_type eq "text/plain" and $ent->parts(1)->effective_type eq "multipart/related") { my $newent = MIME::Entity->build(Data => $ent->parts(0)->bodyhandle->as_string() . "\n\n[HTML alternate version deleted]\n"); $ent->parts([$newent]); $ent->make_singlepart; $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); # # Mozilla Mail (added 03/27/2003 by LHH) # # Emails from mozilla with "multipart/related" attachments inside of # an "multipart/alternative" in a "multipart/mixed" message. # These come when a document is attached to an email and the # client is also configured to send HTML mail. } elsif ($ent->effective_type eq "multipart/mixed" and $ent->parts(0)->effective_type eq "multipart/alternative" and $ent->parts(0)->parts == 2 and $ent->parts(0)->parts(0)->effective_type eq "text/plain" and $ent->parts(0)->parts(1)->effective_type eq "multipart/related") { my $newent = MIME::Entity->build(Data => $ent->parts(0)->parts(0)->bodyhandle->as_string() . "\n\n[HTML alternate version deleted]\n"); $ent->parts(0)->parts([$newent]); $ent->parts(0)->make_singlepart; $ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE'); } print $envelope; $ent->print; exit; ############### # Subroutines # ############### sub HTMLtoTXTviaLynx { my $rhtml=shift @_; if (ref($rhtml) ne 'SCALAR') { return ''; } my $LYNX_EXE=`which lynx 2>/dev/null`; chomp($LYNX_EXE); if (! (-x $LYNX_EXE)) { return ''; } my $body=''; my $tmp_file = "/tmp/mime_strip.html_bodies.$$.lynx.tmp"; my $h_out=new FileHandle; my $h_in=new FileHandle; my $new_enough_lynx=0; my $lynx_ver="UNKNOWN"; if (open($h_in, "'$LYNX_EXE' --version 2>&1 |")) { my @ver_data=<$h_in>; my ($ver_info,@crap)=grep(/version/i, @ver_data); chomp($ver_info); close($h_in); if ($ver_info =~ m/Version (([0-9]+)\.([0-9]+)\.([0-9]+)[^\s]+)/i) { $lynx_ver=$1; if ($2 >= 2 && $3 >= 8 && $4 >= 4) { $new_enough_lynx=1; } } } if ($new_enough_lynx && open($h_out, "| '$LYNX_EXE' -stdin -dump -nolog -noredir -localhost -nolist -width=76 1>'$tmp_file' 2>/dev/null")) { print $h_out $$rhtml; close $h_out; if (-r $tmp_file && open($h_in, "< $tmp_file")) { my @lynx_text_arr=<$h_in>; close($h_in); unlink $tmp_file; my $lynx_text=join('', @lynx_text_arr); if (length($lynx_text)) { $body = $lynx_text . "\n[HTML converted to text by lynx $lynx_ver]\n"; } } } elsif (! $new_enough_lynx) { $body="NOTICE from mime_strip.html_bodies.pl:\n" . "\n" . "This message had an empty text/plain attachment, so I wanted\n" . "to use lynx to convert the HTML attachment to plain text,\n" . "and I found a lynx binary at $LYNX_EXE, but the lynx\n" . "version is too old for me to use (ver. $lynx_ver).\n" . "\n" . "You might consider upgrading lynx to v2.8.4 or higher.\n"; } return $body; } sub DoSpamCheck { my $ent=shift @_; my $rTxtBody=shift @_; my $rLnxBody=shift @_; my $MsgDupThreshold=shift @_; # Get the "[HTML converted to text by lynx...]" line out of $rLnxBody my @tmp=grep(!/^\[HTML converted to text by /,split(/[\r\n]+/,$$rLnxBody)); my $htm_spam_body=join("\n", @tmp); # Grab all the words, and word count, into %words_txt and %words_htm my %words_txt; my %words_htm; %words_txt = map { lc($_) => (iif($words_txt{lc($_)}) + 1) } uniq([split(/\W+/, $$rTxtBody)] ); %words_htm = map { lc($_) => (iif($words_htm{lc($_)}) + 1) } uniq([split(/\W+/, $htm_spam_body)]); # If we matched on empty string, drop those if (defined($words_txt{''})) { delete $words_txt{''}; } if (defined($words_htm{''})) { delete $words_htm{''}; } # Go through each word and count the difference in the number of # occurrances in that word in the text vs. the HTML parts, and then # Set "X-MultiPart-SpamRating" and "X-MultiPart-IsSpam" headers for # this message, as appropriate. #my %diff_words; my $total_diff=0; foreach my $word ( uniq([keys %words_txt, keys %words_htm]) ) { my $diff = abs(iif($words_txt{$word}) - iif($words_htm{$word})); #$diff_words{$word} = $diff; $total_diff += $diff; #print "LHHD: \"$word\"\n"; } #print "LHHD:\n" . $htm_spam_body . "\n---\n" . $$rTxtBody . "\nEND LHHD\n"; my $head=$ent->head(); $head->replace('X-MultiPart-SpamRating', $total_diff); if ($DoSpamCheck > 0 && $total_diff > $DoSpamCheck) { $head->replace('X-MultiPart-IsSpam', 'YES'); } my $new_body=''; if ($total_diff > $MsgDupThreshold) { my $lsp = 15; # Leading space on section headers $new_body= (" " x $lsp) . "+" . ("-" x 39) . "+\n" . (" " x $lsp) . "| Plain-text part of multi-part message |\n" . (" " x $lsp) . "+" . ("-" x 39) . "+\n" . "\n" . $$rTxtBody . "\n" . (" " x $lsp) . "+" . ("-" x 33) . "+\n" . (" " x $lsp) . "| HTML part of multi-part message |\n" . (" " x $lsp) . "+" . ("-" x 33) . "+\n" . $$rLnxBody; } else { $new_body = $$rTxtBody; } return $new_body; } sub app_usage { my ($exit) = shift @_; print STDERR < -help This message. -spamcheck=n Do spam check between html and text message parts. "n" is one of three types of values: 0 - do no spam checking <0 - do spam checking but only apply the X-MultiPart-SpamRating header. >0 - do spam checking and apply the X-MultiPart-SpamRating header and also the X-MultiPart-IsSpam: YES header if the SpamRating is > than "n" -msg-dup-threshold=n Threshold above which to show both the plain-text and the lynx-converted text in the msg body when using the -spamcheck option. -debug Not yet implemented. EndOfUsage exit $exit if $exit != 0; return $exit; } sub uniq { my($rarray) = shift(@_); my(%tmphash, $item); foreach $item (@$rarray) { $tmphash{$item}++; } return(sort keys %tmphash); } sub iif { my $val=shift @_; if (! defined($val)) { return 0; } else { return int($val); } } ############### ## Begin POD ## ############### =head1 NAME mime_strip.html_bodies.pl =head1 README Used to strip the alternative "HTML body" attachments that some Mozilla and MS Outlook users like to add to every email. Version 1.5 also introduced a feature to catch most multipart/alternative SPAM, which has become very popular recently and that gets past most SPAM filters. This script is most often used in conjunction with procmail. =head1 DESCRIPTION Below is a snippet from my .procmailrc to illustrate the use of this script. Note that perldoc wraps some of the lines when it should not, so if you intend to copy/paste please open the script itself and copy/paste from there, not from a "perldoc" or "man" view. ############################################################################# :0 * ^Content-Type: (multipart/alternative|multipart/mixed) { # OK, before we just blindly file this in filtered.multipart_alternative # or deliver it, let's give spamassassin a chance at it. :0fw * < 100000 | /usr/bin/spamc # Store a copy, just in case. (paranoia, may kill this later) :0cHBE * ^Content-Type: multipart/alternative { :0 /home/hightowe/mail/junk_mail/filtered.multipart_alternative.$THIS_MONTH } ############################################# # Run the message through the strip filters # ############################################# # Note: on 08/12/2003 I added the "a" flags here to only run this # recipe if the "multipart/alternative" matched right above. # Strip HTML-alternative bodies :0afw | /home/hightowe/bin/mime_strip.html_bodies.pl -spamcheck=50 -msg-dup-threshold=5 # Rename possibly dangerous attachments (.exe/.vbs/.pif/etc.) :0fw | /home/hightowe/bin/mime_rename.dangerous_windows_exts.pl } # OK, we're going to file all "X-MultiPart-IsSpam: YES" messages # into /home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH. # Note that this header is applied by ~/bin/mime_strip.html_bodies.pl. :0H: * ^X-MultiPart-IsSpam: YES /home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH ############################################################################# =head1 AUTHORSHIP Lester Hightower =head1 CHANGE LOG Dec-17-2002: Originally created by Lester Hightower Mar-25-2003: Ver. 1.0, first release to CPAN Mar-26-2003: Ver. 1.1, added "README" to POD for better CPAN behavior Mar-27-2003: Ver. 1.2, added another Mozilla "multipart/related" catch Apr-22-2003: Ver. 1.3, bugfix: changed to $ent->bodyhandle->as_string() Jan-06-2004: Ver. 1.4: added lynx-based HTML->plain text conversions for multi-part alternatives that do not have a text/plain component. Jan-11-2004: Ver. 1.5: added -spamcheck=n option to try to catch the new multipart/alternative SPAM that has become very popular lately -- where the plain text part is something that SpamAssassin, et. al., allows through (random words, paragraphs from books), but the HTML part is an advertisement. Jan-16-2004: Ver. 1.6: added -msg-dup-threshold=n option to allow one to eliminate that msg body duplication caused by -spamcheck. This is particularly useful for low rated messages (which are normally not spam). =head1 PREREQUISITES This script requires the C module. It also requires C. If you want to use the HTML->text feature (not required, and not very important), or the -spamcheck feature, you will need version 2.4.8 or higher of lynx, the text-based web-browser, installed. =pod OSNAMES any =pod SCRIPT CATEGORIES Mail Mail/Converters Mail/Filters =cut