summaryrefslogtreecommitdiff
path: root/tests/mail_to_db.pl
diff options
context:
space:
mode:
Diffstat (limited to 'tests/mail_to_db.pl')
-rwxr-xr-xtests/mail_to_db.pl310
1 files changed, 310 insertions, 0 deletions
diff --git a/tests/mail_to_db.pl b/tests/mail_to_db.pl
new file mode 100755
index 00000000000..7886ffb7e0f
--- /dev/null
+++ b/tests/mail_to_db.pl
@@ -0,0 +1,310 @@
+#!/usr/bin/perl
+# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
+# This file is public domain and comes with NO WARRANTY of any kind
+#
+# This program is brought to you by Janne-Petteri Koilo with the
+# administration of Michael Widenius.
+
+# This program takes your mails and puts them into your database. It ignores
+# messages with the same from, date and message text.
+# You can use mail-files that are compressed or gzipped and ends with
+# -.gz or -.Z.
+
+use DBI;
+use Getopt::Long;
+
+$VER = "1.6";
+
+$opt_db = "mail";
+$opt_table = "mails";
+$opt_max_mail_size = 65536;
+$opt_db_engine = "mysql";
+$opt_host = "localhost";
+$opt_user = $opt_password = "";
+$opt_help = $opt_version = $opt_test=0;
+
+GetOptions("help","version","user=s","password=s",
+ "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage();
+
+usage($VER) if ($opt_help || $opt_version || !$ARGV[0]);
+
+%months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
+ 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
+ 'Nov' => 11, 'Des' => 12);
+
+$count_no_from = $count_no_txt = $count_too_big = 0;
+$count_forwarded_msgs = $count_duplicates = $no_subject = 0;
+$inserted_mails = 0;
+$dbh=0;
+
+$dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user,
+ $opt_password,{ PrintError => 0}) || die $DBI::errstr;
+if (!$opt_test)
+{
+ create_table_if_needed($dbh);
+}
+
+foreach (@ARGV)
+{
+ if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped
+ {
+ open(FILE, "zcat $_ |");
+ process_mail_file($dbh,$1);
+ }
+ else
+ {
+ open(FILE,$_);
+ process_mail_file($dbh,$_);
+ }
+}
+$dbh->disconnect if (!$opt_test);
+
+$ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject;
+print "Mails inserted:\t\t\t$inserted_mails\n";
+print "Mails ignored:\t\t\t$ignored\n";
+print "Mails without \"From:\" -field:\t$count_no_from\n";
+print "Mails without message:\t\t$count_no_txt\n";
+print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n";
+print "Duplicate mails:\t\t$count_duplicates\n";
+print "Forwarded mails:\t\t$count_forwarded_msgs\n";
+print "No subject:\t\t\t$no_subject\n";
+print "Mails altogether:\t\t";
+print $inserted_mails+$ignored;
+print "\n";
+exit(0);
+
+sub usage
+{
+ my($VER)=@_;
+
+ $0 =~ s/.\/(.+)/$1/;
+ if ($opt_version)
+ {
+ print "$0 version $VER\n";
+ }
+ else
+ {
+ print <<EOF;
+$0 version $VER
+
+Usage: $0 [options] file1 [file2 file3 ...]
+
+Description: Inserts mails from file(s) into a database
+
+Options:
+--help show this help and exit
+--version shows the version of the program
+--db_engine=... database server (default: $opt_db_engine)
+--db=... database to be used (default: $opt_db)
+--host=... hostname to be used (default: $opt_host)
+--password=... user password for the db server
+--user=... username for the db server
+--max_mail_size=# max size of a mail to be inserted into the db.
+ mail will be ignored if it exceeds this size
+ (default $opt_max_mail_size)
+--test Don\'t connect to the database, just write the
+ queries to stdout
+EOF
+ }
+ exit(0);
+}
+
+sub create_table_if_needed
+{
+ my ($dbh)=@_;
+ my ($sth,$create);
+
+ $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr;
+ if (!$sth->execute)
+ {
+ $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null ";
+ $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) ";
+ $create .= "NOT NULL, mail_from char(120) not null, reply char(120), ";
+ $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT ";
+ $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key ";
+ $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))";
+ $sth = $dbh->prepare($create) or die $dbh->errstr;
+ $sth->execute() or die $dbh->errstr;
+ }
+}
+
+sub process_mail_file
+{
+ my ($dbh,$file_name)= @_;
+ my (%values,$type,$check);
+
+ %values=(); $type="";
+ $check=0;
+
+ while (<FILE>)
+ {
+ chop;
+ if ($type ne "message")
+ {
+ if (/^Reply-To: (.*)/i) # finding different fields from file
+ {
+ $type="reply";
+ $values{$type}= $1;
+ }
+ elsif (/^From: (.*)/i)
+ {
+ $type="from";
+ $values{$type}= $1;
+ }
+ elsif (/^To: (.*)/i)
+ {
+ $type="to";
+ $values{$type}= $1;
+ }
+ elsif (/^Cc: (.*)/i)
+ {
+ $type="cc";
+ $values{$type}= $1;
+ }
+ elsif (/^Subject: (.*)/i)
+ {
+ $type="subject";
+ $values{$type}= $1;
+ }
+ elsif (/^Date: (.*)/i)
+ {
+ date_parser($1,\%values);
+ $type="rubbish";
+ }
+ elsif (/^[\w\W-]+:\s/)
+ {
+ $type="rubbish";
+ }
+ elsif ($_ eq "")
+ {
+ $type="message";
+ $values{$type}="";
+ }
+ else
+ {
+ s/^\s*/ /;
+ $values{$type}.= $_;
+ }
+ }
+ elsif ($check!=0 && $_ ne "") # in case of forwarded messages
+ {
+ $values{$type}.= "\n" . $_;
+ $check--;
+ }
+ elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d$/)
+ {
+ $values{'hash'}= checksum("$values{'message'}");
+ update_table($dbh,$file_name,\%values);
+ %values=(); $type="";
+ $check=0;
+ }
+ elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
+ {
+ $values{$type}.= "\n" . $_;
+ $check++;
+ $count_forwarded_msgs++;
+ }
+ else
+ {
+ $values{$type}.= "\n" . $_;
+ }
+ }
+ $values{'hash'}= checksum("$values{'message'}");
+ update_table($dbh,$file_name,\%values);
+}
+
+########
+
+# converts date to the right form
+
+sub date_parser
+{
+ my ($date_raw,$values)=@_;
+
+ $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/;
+
+ $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4";
+ $values->{'time_zone'}=$5;
+}
+
+#########
+
+# this is runned when the whole mail is gathered.
+# this actually puts the mail to the database.
+
+sub update_table
+{
+ my($dbh,$file_name,$values)=@_;
+ my($query);
+
+ if (! defined($values->{'subject'}) || !defined($values->{'to'}))
+ {
+ $no_subject++;
+ return; # Ignore these
+ }
+ $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning
+ $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end
+ $query = "insert into $opt_table values (NULL,'" . $values->{'date'};
+ $query .= "','" . $values->{'time_zone'} . "',";
+ $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ",";
+ $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ",";
+
+ $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ",";
+ $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ",";
+ $query .= $dbh->quote($values->{'subject'}) . ",";
+ $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name);
+ $query .= ",'" . $values->{'hash'} . "')";
+
+ if (length($values->{'message'}) > $opt_max_mail_size) #disables big message
+ {
+ $count_too_big++;
+ }
+ elsif ($values->{'from'} eq "") #disables mails with no from field
+ {
+ $count_no_from++;
+ }
+ elsif ($opt_test)
+ {
+ print "$query\n";
+ $inserted_mails++;
+ }
+ elsif ($values->{'message'} eq "") #disables mails with no message text
+ {
+ $count_no_msg_text++;
+ }
+ elsif ($dbh->do($query))
+ {
+ $inserted_mails++;
+ }
+ elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates
+ {
+ die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n";
+ }
+ else
+ {
+ $count_duplicates++;
+ }
+ $query="";
+}
+
+
+##########
+
+# In case you have two identical messages we wanted to identify them
+# and remove additionals; We do this by calculating a hash number of the
+# message and ignoring messages with the same from, date and hash.
+# This function calculates a simple 32 bit hash value for the message.
+
+sub checksum
+{
+ my ($txt)= @_;
+ my ($crc,$i,$count);
+ $count = length($txt);
+ for ($crc = $i = 0; $i < $count ; $i++)
+ {
+ $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
+ (($crc & (1 << 30)) ? 1 : 0);
+ $crc &= ((1 << 31) -1);
+ }
+ return $crc;
+}