diff options
Diffstat (limited to 'tests/mail_to_db.pl')
-rwxr-xr-x | tests/mail_to_db.pl | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/tests/mail_to_db.pl b/tests/mail_to_db.pl new file mode 100755 index 00000000000..7886ffb7e0f --- /dev/null +++ b/tests/mail_to_db.pl @@ -0,0 +1,310 @@ +#!/usr/bin/perl +# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB +# This file is public domain and comes with NO WARRANTY of any kind +# +# This program is brought to you by Janne-Petteri Koilo with the +# administration of Michael Widenius. + +# This program takes your mails and puts them into your database. It ignores +# messages with the same from, date and message text. +# You can use mail-files that are compressed or gzipped and ends with +# -.gz or -.Z. + +use DBI; +use Getopt::Long; + +$VER = "1.6"; + +$opt_db = "mail"; +$opt_table = "mails"; +$opt_max_mail_size = 65536; +$opt_db_engine = "mysql"; +$opt_host = "localhost"; +$opt_user = $opt_password = ""; +$opt_help = $opt_version = $opt_test=0; + +GetOptions("help","version","user=s","password=s", + "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage(); + +usage($VER) if ($opt_help || $opt_version || !$ARGV[0]); + +%months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, + 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, + 'Nov' => 11, 'Des' => 12); + +$count_no_from = $count_no_txt = $count_too_big = 0; +$count_forwarded_msgs = $count_duplicates = $no_subject = 0; +$inserted_mails = 0; +$dbh=0; + +$dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user, + $opt_password,{ PrintError => 0}) || die $DBI::errstr; +if (!$opt_test) +{ + create_table_if_needed($dbh); +} + +foreach (@ARGV) +{ + if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped + { + open(FILE, "zcat $_ |"); + process_mail_file($dbh,$1); + } + else + { + open(FILE,$_); + process_mail_file($dbh,$_); + } +} +$dbh->disconnect if (!$opt_test); + +$ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject; +print "Mails inserted:\t\t\t$inserted_mails\n"; +print "Mails ignored:\t\t\t$ignored\n"; +print "Mails without \"From:\" -field:\t$count_no_from\n"; +print "Mails without message:\t\t$count_no_txt\n"; +print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n"; +print "Duplicate mails:\t\t$count_duplicates\n"; +print "Forwarded mails:\t\t$count_forwarded_msgs\n"; +print "No subject:\t\t\t$no_subject\n"; +print "Mails altogether:\t\t"; +print $inserted_mails+$ignored; +print "\n"; +exit(0); + +sub usage +{ + my($VER)=@_; + + $0 =~ s/.\/(.+)/$1/; + if ($opt_version) + { + print "$0 version $VER\n"; + } + else + { + print <<EOF; +$0 version $VER + +Usage: $0 [options] file1 [file2 file3 ...] + +Description: Inserts mails from file(s) into a database + +Options: +--help show this help and exit +--version shows the version of the program +--db_engine=... database server (default: $opt_db_engine) +--db=... database to be used (default: $opt_db) +--host=... hostname to be used (default: $opt_host) +--password=... user password for the db server +--user=... username for the db server +--max_mail_size=# max size of a mail to be inserted into the db. + mail will be ignored if it exceeds this size + (default $opt_max_mail_size) +--test Don\'t connect to the database, just write the + queries to stdout +EOF + } + exit(0); +} + +sub create_table_if_needed +{ + my ($dbh)=@_; + my ($sth,$create); + + $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr; + if (!$sth->execute) + { + $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null "; + $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) "; + $create .= "NOT NULL, mail_from char(120) not null, reply char(120), "; + $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT "; + $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key "; + $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))"; + $sth = $dbh->prepare($create) or die $dbh->errstr; + $sth->execute() or die $dbh->errstr; + } +} + +sub process_mail_file +{ + my ($dbh,$file_name)= @_; + my (%values,$type,$check); + + %values=(); $type=""; + $check=0; + + while (<FILE>) + { + chop; + if ($type ne "message") + { + if (/^Reply-To: (.*)/i) # finding different fields from file + { + $type="reply"; + $values{$type}= $1; + } + elsif (/^From: (.*)/i) + { + $type="from"; + $values{$type}= $1; + } + elsif (/^To: (.*)/i) + { + $type="to"; + $values{$type}= $1; + } + elsif (/^Cc: (.*)/i) + { + $type="cc"; + $values{$type}= $1; + } + elsif (/^Subject: (.*)/i) + { + $type="subject"; + $values{$type}= $1; + } + elsif (/^Date: (.*)/i) + { + date_parser($1,\%values); + $type="rubbish"; + } + elsif (/^[\w\W-]+:\s/) + { + $type="rubbish"; + } + elsif ($_ eq "") + { + $type="message"; + $values{$type}=""; + } + else + { + s/^\s*/ /; + $values{$type}.= $_; + } + } + elsif ($check!=0 && $_ ne "") # in case of forwarded messages + { + $values{$type}.= "\n" . $_; + $check--; + } + elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d$/) + { + $values{'hash'}= checksum("$values{'message'}"); + update_table($dbh,$file_name,\%values); + %values=(); $type=""; + $check=0; + } + elsif (/-* forwarded message .*-*/i) # in case of forwarded messages + { + $values{$type}.= "\n" . $_; + $check++; + $count_forwarded_msgs++; + } + else + { + $values{$type}.= "\n" . $_; + } + } + $values{'hash'}= checksum("$values{'message'}"); + update_table($dbh,$file_name,\%values); +} + +######## + +# converts date to the right form + +sub date_parser +{ + my ($date_raw,$values)=@_; + + $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/; + + $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4"; + $values->{'time_zone'}=$5; +} + +######### + +# this is runned when the whole mail is gathered. +# this actually puts the mail to the database. + +sub update_table +{ + my($dbh,$file_name,$values)=@_; + my($query); + + if (! defined($values->{'subject'}) || !defined($values->{'to'})) + { + $no_subject++; + return; # Ignore these + } + $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning + $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end + $query = "insert into $opt_table values (NULL,'" . $values->{'date'}; + $query .= "','" . $values->{'time_zone'} . "',"; + $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ","; + $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ","; + + $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; + $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; + $query .= $dbh->quote($values->{'subject'}) . ","; + $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name); + $query .= ",'" . $values->{'hash'} . "')"; + + if (length($values->{'message'}) > $opt_max_mail_size) #disables big message + { + $count_too_big++; + } + elsif ($values->{'from'} eq "") #disables mails with no from field + { + $count_no_from++; + } + elsif ($opt_test) + { + print "$query\n"; + $inserted_mails++; + } + elsif ($values->{'message'} eq "") #disables mails with no message text + { + $count_no_msg_text++; + } + elsif ($dbh->do($query)) + { + $inserted_mails++; + } + elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates + { + die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n"; + } + else + { + $count_duplicates++; + } + $query=""; +} + + +########## + +# In case you have two identical messages we wanted to identify them +# and remove additionals; We do this by calculating a hash number of the +# message and ignoring messages with the same from, date and hash. +# This function calculates a simple 32 bit hash value for the message. + +sub checksum +{ + my ($txt)= @_; + my ($crc,$i,$count); + $count = length($txt); + for ($crc = $i = 0; $i < $count ; $i++) + { + $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) + + (($crc & (1 << 30)) ? 1 : 0); + $crc &= ((1 << 31) -1); + } + return $crc; +} |