#!/usr/bin/perl # Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB # This file is public domain and comes with NO WARRANTY of any kind # # This program is brought to you by Janne-Petteri Koilo with the # administration of Michael Widenius. # This program takes your mails and puts them into your database. It ignores # messages with the same from, date and message text. # You can use mail-files that are compressed or gzipped and ends with # -.gz or -.Z. use DBI; use Getopt::Long; $VER = "1.6"; $opt_db = "mail"; $opt_table = "mails"; $opt_max_mail_size = 65536; $opt_db_engine = "mysql"; $opt_host = "localhost"; $opt_user = $opt_password = ""; $opt_help = $opt_version = $opt_test=0; GetOptions("help","version","user=s","password=s", "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage(); usage($VER) if ($opt_help || $opt_version || !$ARGV[0]); %months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Des' => 12); $count_no_from = $count_no_txt = $count_too_big = 0; $count_forwarded_msgs = $count_duplicates = $no_subject = 0; $inserted_mails = 0; $dbh=0; $dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user, $opt_password,{ PrintError => 0}) || die $DBI::errstr; if (!$opt_test) { create_table_if_needed($dbh); } foreach (@ARGV) { if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped { open(FILE, "zcat $_ |"); process_mail_file($dbh,$1); } else { open(FILE,$_); process_mail_file($dbh,$_); } } $dbh->disconnect if (!$opt_test); $ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject; print "Mails inserted:\t\t\t$inserted_mails\n"; print "Mails ignored:\t\t\t$ignored\n"; print "Mails without \"From:\" -field:\t$count_no_from\n"; print "Mails without message:\t\t$count_no_txt\n"; print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n"; print "Duplicate mails:\t\t$count_duplicates\n"; print "Forwarded mails:\t\t$count_forwarded_msgs\n"; print "No subject:\t\t\t$no_subject\n"; print "Mails altogether:\t\t"; print $inserted_mails+$ignored; print "\n"; exit(0); sub usage { my($VER)=@_; $0 =~ s/.\/(.+)/$1/; if ($opt_version) { print "$0 version $VER\n"; } else { print <prepare("select count(*) from $opt_table") or die $dbh->errstr; if (!$sth->execute) { $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null "; $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) "; $create .= "NOT NULL, mail_from char(120) not null, reply char(120), "; $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT "; $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key "; $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))"; $sth = $dbh->prepare($create) or die $dbh->errstr; $sth->execute() or die $dbh->errstr; } } sub process_mail_file { my ($dbh,$file_name)= @_; my (%values,$type,$check); %values=(); $type=""; $check=0; while () { chop; if ($type ne "message") { if (/^Reply-To: (.*)/i) # finding different fields from file { $type="reply"; $values{$type}= $1; } elsif (/^From: (.*)/i) { $type="from"; $values{$type}= $1; } elsif (/^To: (.*)/i) { $type="to"; $values{$type}= $1; } elsif (/^Cc: (.*)/i) { $type="cc"; $values{$type}= $1; } elsif (/^Subject: (.*)/i) { $type="subject"; $values{$type}= $1; } elsif (/^Date: (.*)/i) { date_parser($1,\%values); $type="rubbish"; } elsif (/^[\w\W-]+:\s/) { $type="rubbish"; } elsif ($_ eq "") { $type="message"; $values{$type}=""; } else { s/^\s*/ /; $values{$type}.= $_; } } elsif ($check!=0 && $_ ne "") # in case of forwarded messages { $values{$type}.= "\n" . $_; $check--; } elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d$/) { $values{'hash'}= checksum("$values{'message'}"); update_table($dbh,$file_name,\%values); %values=(); $type=""; $check=0; } elsif (/-* forwarded message .*-*/i) # in case of forwarded messages { $values{$type}.= "\n" . $_; $check++; $count_forwarded_msgs++; } else { $values{$type}.= "\n" . $_; } } $values{'hash'}= checksum("$values{'message'}"); update_table($dbh,$file_name,\%values); } ######## # converts date to the right form sub date_parser { my ($date_raw,$values)=@_; $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/; $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4"; $values->{'time_zone'}=$5; } ######### # this is runned when the whole mail is gathered. # this actually puts the mail to the database. sub update_table { my($dbh,$file_name,$values)=@_; my($query); if (! defined($values->{'subject'}) || !defined($values->{'to'})) { $no_subject++; return; # Ignore these } $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end $query = "insert into $opt_table values (NULL,'" . $values->{'date'}; $query .= "','" . $values->{'time_zone'} . "',"; $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ","; $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ","; $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; $query .= $dbh->quote($values->{'subject'}) . ","; $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name); $query .= ",'" . $values->{'hash'} . "')"; if (length($values->{'message'}) > $opt_max_mail_size) #disables big message { $count_too_big++; } elsif ($values->{'from'} eq "") #disables mails with no from field { $count_no_from++; } elsif ($opt_test) { print "$query\n"; $inserted_mails++; } elsif ($values->{'message'} eq "") #disables mails with no message text { $count_no_msg_text++; } elsif ($dbh->do($query)) { $inserted_mails++; } elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates { die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n"; } else { $count_duplicates++; } $query=""; } ########## # In case you have two identical messages we wanted to identify them # and remove additionals; We do this by calculating a hash number of the # message and ignoring messages with the same from, date and hash. # This function calculates a simple 32 bit hash value for the message. sub checksum { my ($txt)= @_; my ($crc,$i,$count); $count = length($txt); for ($crc = $i = 0; $i < $count ; $i++) { $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) + (($crc & (1 << 30)) ? 1 : 0); $crc &= ((1 << 31) -1); } return $crc; }