diff options
Diffstat (limited to 'bin/ninka-sqlite')
-rwxr-xr-x | bin/ninka-sqlite | 195 |
1 files changed, 195 insertions, 0 deletions
#!/usr/bin/perl
#
#    Copyright (C) 2014,2015  Anthony Kohan and Daniel M. German
#
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation; either version 2 of
#    the License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ninka-sqlite: unpack a package archive into a temporary directory, run
# the `ninka` script found next to this one on every extracted file, and
# load the files ninka leaves behind (.comments, .sentences, .goodsent,
# .badsent, .senttok, .license) into an SQLite database.
#
# Usage: ninka-sqlite <path to package file> <database name>

use strict;
use warnings;
use DBI;
use File::Temp;
use File::Find;
use File::Basename;
use Ninka;

if (scalar(@ARGV) != 2) {
    print STDERR "Ninka v${Ninka::VERSION}. sqlite wrapper\n";
    print STDERR "Processes package file (.tar.gz, zip, jar. etc) and outputs to sqlite file\n";
    print STDERR "Incorrect number of arguments\n";
    print STDERR "Usage: $0 <path to package file> <database name>\n";
    exit 1;
}

# Directory holding this script; the ninka executable is run from there.
# dirname() also copes with a bare script name (".") and a script living
# directly under the filesystem root ("/").
my $path = dirname($0);

my ($pack, $db) = @ARGV;

# Reject unsupported archives before the database is created or modified.
my $packext    = getExtension($pack);
my $is_tarball = ($packext eq ".bz2" || $packext eq ".gz");
my $is_zip     = ($packext eq ".jar" || $packext eq ".zip");
unless ($is_tarball || $is_zip) {
    print STDERR "ninka-wrapper does not support packages with extension [$packext]\n";
    exit 1;
}

# Maps the extension of a ninka output file to the table that stores it.
my %table_for = (
    ".comments"  => "comments",
    ".sentences" => "sentences",
    ".goodsent"  => "goodsents",
    ".badsent"   => "badsents",
    ".senttok"   => "senttoks",
    ".license"   => "licenses",
);

# AutoCommit is off: everything below is one transaction that is committed
# at the very end, so a failure part-way through leaves the database as it was.
my $dbh = DBI->connect("DBI:SQLite:dbname=$db", "", "", {RaiseError => 1, AutoCommit => 0})
    or die $DBI::errstr;

# The five "content" tables share one schema: the raw text of the file.
foreach my $table (qw(comments sentences goodsents badsents senttoks)) {
    $dbh->do("CREATE TABLE IF NOT EXISTS
              $table (filename TEXT, path TEXT, container TEXT, content TEXT,
              PRIMARY KEY(filename, path, container))");
}
$dbh->do("CREATE TABLE IF NOT EXISTS
          licenses (filename TEXT, path TEXT, container TEXT, licenses TEXT,
          num_found INT, lines INT, toks_ignored INT, toks_unmatched INT,
          toks_unknown INT, tokens TEXT,
          PRIMARY KEY(filename, path, container))");

# The directory is removed automatically when $tempdir goes out of scope.
my $tempdir = File::Temp->newdir();
my $dirname = $tempdir->dirname;

print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n";
if ($is_tarball) {
    execute("tar", "-xvf", $pack, "--directory", $dirname);
} else {
    execute("unzip", "-d", $dirname, $pack);
}

# Snapshot the extracted files first, so that the files ninka creates
# while running are not themselves fed back into ninka.
my @files;
find(
    sub { push @files, $File::Find::name unless -d; },
    $dirname
);

print "***** Beginning Execution of Ninka *****\n";
foreach my $file (@files) {
    print "Running ninka on file [$file]\n";
    execute("perl", "${path}/ninka", "-i", $file);
}

# Collect every plain file whose extension maps to a table.
my @ninkafiles;
find(
    sub {
        return unless -f $_;
        push @ninkafiles, $File::Find::name
            if exists $table_for{ getExtension($File::Find::name) };
    },
    $dirname
);

print "***** Entering Ninka Data into Database [$db] *****\n";
my $packname = basename($pack);
foreach my $file (@ninkafiles) {

    # Path of the file relative to the extraction root. The temp directory
    # name is matched literally and only as a prefix.
    my $filepath = dirname($file);
    $filepath =~ s/\A\Q$dirname\E//;
    my $basefile = basename($file);
    my $rootfile = removeExtension($basefile);
    my $table    = $table_for{ getExtension($basefile) };

    # Read entire file into a string
    open(my $fh, '<', $file) or die "Can't open file [$file]: $!";
    my $filedata = do { local $/; <$fh> };
    close($fh);

    # Content tables store the text verbatim; the licenses table stores
    # the seven ';'-separated fields of the record instead.
    my @values = ($rootfile, $filepath, $packname);
    if ($table eq "licenses") {
        push @values, parseLicenseData($filedata);
    } else {
        push @values, $filedata;
    }

    print "Inserting [$basefile] into table $table\n";

    # Every value goes through a placeholder, so quotes in file names or
    # in the data cannot break (or inject into) the statement.
    my $placeholders = join(", ", ("?") x @values);
    my $sth = $dbh->prepare_cached("INSERT INTO $table VALUES ($placeholders)");
    $sth->execute(@values);
}

$dbh->commit();
$dbh->disconnect();

# parseLicenseData($data)
#
# Splits the text of a .license file on ';' and returns exactly seven
# fields, in the column order of the licenses table after the three key
# columns (licenses, num_found, lines, toks_ignored, toks_unmatched,
# toks_unknown, tokens). Missing fields -- e.g. for a bare "NONE"
# record -- come back as empty strings; fields beyond the seventh are
# dropped. One trailing newline is removed so it does not end up in the
# last column.
sub parseLicenseData {
    my ($data) = @_;

    $data = '' unless defined $data;
    $data =~ s/\r?\n\z//;

    my @columns = split(/;/, $data);
    $#columns = 6;    # pad with undef or truncate to exactly 7 entries
    return map { defined $_ ? $_ : '' } @columns;
}

# getExtension($file)
#
# Returns the last extension of the file's base name including the
# leading dot (".gz" for "a/b.tar.gz"), or the empty string when the
# base name contains no extension.
sub getExtension {
    my ($file) = @_;
    my $filename = basename($file);
    my ($ext) = $filename =~ /(\.[^.]+)$/;
    return defined $ext ? $ext : '';
}

# removeExtension($file)
#
# Returns $file with its last ".ext" component removed; a name without
# an extension is returned unchanged.
sub removeExtension {
    my ($file) = @_;
    (my $filename = $file) =~ s/\.[^.]+$//;
    return $filename;
}

# execute(@command)
#
# Runs an external command and returns everything it wrote to standard
# output. When called with a program name followed by its arguments the
# command is started without a shell, so arguments containing spaces or
# quotes are passed through untouched. A single string is still accepted
# and is handed to the shell if it contains shell metacharacters.
#
# Dies if the command cannot be started, exits with a non-zero status,
# or is terminated by a signal.
sub execute {
    my (@command) = @_;
    my $printable = join(' ', @command);

    open(my $pipe, '-|', @command)
        or die "execution of [$printable] failed: cannot run command: $!\n";
    my $output = do { local $/; <$pipe> };
    $output = '' unless defined $output;

    # close() on a pipe waits for the child and stores its wait status in $?.
    close($pipe);
    if ($? != 0) {
        my $status = ($? >> 8);
        my $signal = ($? & 127);
        die "execution of [$printable] failed: killed by signal [$signal]\n" if ($signal);
        die "execution of [$printable] failed: status [$status]\n";
    }
    return $output;
}