summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordmg <dmg@uvic.ca>2014-07-29 22:22:58 -0700
committerDaniel M German <dmg@uvic.ca>2015-01-10 02:24:27 -0800
commitfa81ddcfa925acc52632fd72de419abe29796e2d (patch)
tree4fedf298041a65d68352882e299f536ea6aa43dc
parent5023e3b856b07dd66bbbd7cf8b7327170ed3d81f (diff)
downloadninka-fa81ddcfa925acc52632fd72de419abe29796e2d.tar.gz
added experimental code
-rw-r--r--ninka-sqlite.pl174
-rw-r--r--unify.pl161
2 files changed, 335 insertions, 0 deletions
diff --git a/ninka-sqlite.pl b/ninka-sqlite.pl
new file mode 100644
index 0000000..eee0c0f
--- /dev/null
+++ b/ninka-sqlite.pl
@@ -0,0 +1,174 @@
+#!/usr/bin/perl
+
+use strict;
+use Switch;
+use DBI;
+use File::Temp;
+use File::Find;
+use File::Basename;
+use Scalar::Util qw(looks_like_number);
+
+if(scalar(@ARGV) != 2){
+ print STDERR "Incorrect number of arguments\n";
+ print STDERR "Correct usage is: perl ninka-wrapper <path to package file> <database name>\n";
+ exit 1;
+}
+
+my $path = $0;
+
+$path =~ s/\/+[^\/]+$//;
+if ($path eq "") {
+ $path = "./";
+}
+
+my ($pack, $db) = @ARGV;
+
+my $dbh = DBI->connect("DBI:SQLite:dbname=$db", "", "", {RaiseError => 1})
+ or die $DBI::errstr;
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ comments (filename TEXT, path TEXT, container TEXT, content TEXT,
+ PRIMARY KEY(filename, path, container))");
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ sentences (filename TEXT, path TEXT, container TEXT, content TEXT,
+ PRIMARY KEY(filename, path, container))");
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ goodsents (filename TEXT, path TEXT, container TEXT, content TEXT,
+ PRIMARY KEY(filename, path, container))");
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ badsents (filename TEXT, path TEXT, container TEXT, content TEXT,
+ PRIMARY KEY(filename, path, container))");
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ senttoks (filename TEXT, path TEXT, container TEXT, content TEXT,
+ PRIMARY KEY(filename, path, container))");
+$dbh->do("CREATE TABLE IF NOT EXISTS
+ licenses (filename TEXT, path TEXT, container TEXT, licenses TEXT,
+ num_found INT, lines INT, toks_ignored INT, toks_unmatched INT,
+ toks_unknown INT, tokens TEXT,
+ PRIMARY KEY(filename, path, container))");
+
+my $tempdir = File::Temp->newdir();
+my $dirname = $tempdir->dirname;
+
+print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n";
+my $packext = getExtension($pack);
+if ($packext eq ".bz2" || $packext eq ".gz") {
+ execute("tar -xvf '$pack' --directory '$dirname'");
+} elsif ($packext eq ".jar" || $packext eq ".zip") {
+ execute("unzip -d $dirname $pack");
+} else {
+ print "ninka-wrapper does not support packages with extension [$packext]\n";
+}
+
+my @files;
+find(
+ sub { push @files, $File::Find::name unless -d; },
+ $dirname
+);
+
+print "***** Beginning Execution of Ninka *****\n";
+foreach my $file (@files) {
+ print "Running ninka on file [$file]\n";
+ execute("perl ${path}/ninka.pl '$file'");
+}
+
+my @ninkafiles;
+find(
+ sub {
+ my $ext = getExtension($File::Find::name);
+ if($ext =~ m/(comments|sentences|goodsent|badsent|senttok|license)$/){
+ push @ninkafiles, $File::Find::name;
+ }
+ },
+ $dirname
+);
+
+print "***** Entering Ninka Data into Database [$db] *****\n";
+foreach my $file (@ninkafiles) {
+
+ my $filepath = dirname($file);
+ $filepath =~ s/$dirname//;
+ my $basefile = basename($file);
+ my $rootfile = removeExtension($basefile);
+ my $packname = basename($pack);
+
+ #Read entire file into a string
+ open (my $fh, '<', $file) or die "Can't open file $!";
+ my $filedata = do { local $/; <$fh> };
+
+ my $sth;
+ switch (getExtension($basefile)){
+ case ".comments" {
+ print "Inserting [$basefile] into table comments\n";
+ $sth = $dbh->prepare("INSERT INTO comments VALUES
+ ('$rootfile', '$filepath', '$packname', ?)");
+ }
+ case ".sentences" {
+ print "Inserting [$basefile] into table sentences\n";
+ $sth = $dbh->prepare("INSERT INTO sentences VALUES
+ ('$rootfile', '$filepath', '$packname', ?)");
+ }
+ case ".goodsent" {
+ print "Inserting [$basefile] into table goodsents\n";
+ $sth = $dbh->prepare("INSERT INTO goodsents VALUES
+ ('$rootfile', '$filepath', '$packname', ?)");
+ }
+ case ".badsent" {
+ print "Inserting [$basefile] into table goodsents\n";
+ $sth = $dbh->prepare("INSERT INTO badsents VALUES
+ ('$rootfile', '$filepath', '$packname', ?)");
+ }
+ case ".senttok" {
+ print "Inserting [$basefile] into table senttoks\n";
+ $sth = $dbh->prepare("INSERT INTO senttoks VALUES
+ ('$rootfile', '$filepath', '$packname', ?)");
+ }
+ case ".license" {
+ print "Inserting [$basefile] into table licenses\n";
+ my @columns = parseLicenseData($filedata);
+ $sth = $dbh->prepare("INSERT INTO licenses VALUES
+ ('$rootfile', '$filepath', '$packname', '$columns[0]', '$columns[1]',
+ '$columns[2]', '$columns[3]', '$columns[4]', '$columns[5]', '$columns[6]')");
+ }
+ }
+
+ $sth->bind_param(1, $filedata);
+ $sth->execute;
+ close($fh);
+}
+
+$dbh->disconnect;
+
+sub parseLicenseData {
+ my ($data) = @_;
+
+ my @columns;
+ my @fields = split(';', $data);
+ if($fields[0] eq "NONE\n"){
+ @columns = '' x 7;
+ @columns[0] = 'NONE';
+ } else {
+ @columns = @fields;
+ }
+ return @columns;
+}
+
+sub getExtension {
+ my ($file) = @_;
+ my $filename = basename($file);
+ my ($ext) = $filename =~ /(\.[^.]+)$/;
+ return $ext;
+}
+
+sub removeExtension {
+ my ($file) = @_;
+ (my $filename = $file) =~ s/\.[^.]+$//;
+ return $filename;
+}
+
+sub execute {
+ my ($command) = @_;
+ my $output = `$command`;
+ my $status = ($? >> 8);
+ die "execution of [$command] failed: status [$status]\n" if ($status != 0);
+ return $output;
+}
diff --git a/unify.pl b/unify.pl
new file mode 100644
index 0000000..f518fbb
--- /dev/null
+++ b/unify.pl
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+# first pass, unify names of licenses and remove duplicates.
+
+# we trick regarding gpl related licenses so they are "clustered" together..
+#
+# replace GPL with __GPL
+# replace exception in the text with ___exception
+
+use strict;
+
+my %equiv = (
+ "boostV1Ref" => "boostV1",
+ "X11" => "X11mit",
+ "X11Festival" => "X11mit",
+ "X11mitNoSellNoDocDocBSDvar" => "X11mit",
+ "X11mitwithoutSell" => 'X11mit',
+ "X11mitBSDvar" => "X11mit",
+ "X11mitwithoutSellCMUVariant" => "X11mit",
+ "X11mitwithoutSellCMUVariant" => "X11mit",
+ "X11mitwithoutSellandNoDocumentationRequi" => "X11mit",
+ "MITvar3" => "X11mit",
+ "MITvar2" => "X11mit",
+ "MIT" => "X11mit",
+ "ZLIBref" => "ZLIB",
+ "BSD3NoWarranty" => "BSD3",
+ "BSD2EndorseInsteadOfBinary" => "BSD2",
+ "BSD2var2" => "BSD2",
+ "LesserGPLv2" => "LibraryGPLv2",
+ "LesserGPLv2+" => "LibraryGPLv2+",
+ "orLGPLVer2.1" => "LesserGPLVer2.1",
+ "postgresqlRef" => "postgresql",
+ );
+
+while (<>) {
+ chomp;
+ my @f = split(/;/);
+ # first remove duplicates
+
+ my $l = $f[1];
+
+ # do a simple rewriting of this exception which is an incomplete license
+
+ $l =~ s/^Exception$/UNKNOWN/;
+
+ my @l = split(/,/,$l);
+ my %lics = %{{ map { $_ => 1 } @l }};
+
+ %lics = Do_Equivalent(%lics);
+ %lics = Remove_Redundant(%lics);
+ %lics = Do_Exceptions(%lics);
+
+ my @out = sort keys %lics;
+
+ my $t = join(',', @out);
+ if ($t eq "") {
+ $t = "UNKNOWN";
+ }
+ print $f[0], ";$t\n";
+}
+
+sub Do_Exceptions
+{
+ my (%lics) = @_;
+
+ if ($lics{'digiaQTExceptionNoticeVer1.1'} ne '' and $lics{'Qt'}) {
+ delete $lics{'digiaQTExceptionNoticeVer1.1'};
+ delete $lics{'Qt'};
+ $lics{'Qt-qtExcep'} = 'Qt-qtExcep';
+ }
+ if ($lics{'BisonException'} ne "" and $lics{"GPLv3+"} ne "") {
+ delete $lics{'BisonException'};
+ delete $lics{"GPLv3+"};
+ $lics{'GPLv3+-bisonExcep'} = 'GPLv3+-bisonExcep';
+ }
+ if ($lics{'BisonException'} ne "" and $lics{"GPLv2+"} ne "") {
+ delete $lics{'BisonException'};
+ delete $lics{"GPLv2+"};
+ $lics{'GPLv2+-bisonExcep'} = 'GPLv2+-bisonExcep';
+ }
+ if ($lics{'BisonException'} ne "" and $lics{"GPLv2"} ne "") {
+ delete $lics{'BisonException'};
+ delete $lics{"GPLv2"};
+ $lics{'GPLv2-bisonExcep'} = 'GPLv2-bisonExcep';
+ }
+ if ($lics{'ClassPathException'} ne "" and $lics{"GPLv2"} ne "") {
+ delete $lics{'ClassPathException'};
+ delete $lics{"GPLv2"};
+ $lics{"GPLv2-classPathExcep"} = "GPLv2-classPathExcep";
+ }
+ if ($lics{'CDDLorGPLv2'} ne "" and $lics{"ClassPathExceptionGPLv2"} ne "") {
+ delete $lics{'CDDLorGPLv2'};
+ delete $lics{"ClassPathExceptionGPLv2"};
+ $lics{'CDDLorGPLv2-classPathExcep'} = 'CDDLorGPLv2-classPathExcep';
+ }
+ if ($lics{'LinkException'} ne "" and $lics{"GPLv3+"} ne "") {
+ delete $lics{'LinkException'};
+ delete $lics{"GPLv3+"};
+ $lics{'GPLv3+-linkExcep'} = 'GPLv3+-linkExcep';
+ }
+ if ($lics{'LinkException'} ne "" and $lics{"GPLv2+"} ne "") {
+ delete $lics{'LinkException'};
+ delete $lics{"GPLv2+"};
+ $lics{'GPLv2+-linkExcep'} = 'GPLv2+-linkExcep';
+ }
+ if ($lics{'LinkException'} ne "" and $lics{"GPLv3"} ne "") {
+ delete $lics{'LinkException'};
+ delete $lics{"GPLv3"};
+ $lics{'GPLv3-linkExcep'} = 'GPLv3-linkExcep';
+ }
+ if ($lics{'LinkException'} ne "" and $lics{"GPLv2"} ne "") {
+ delete $lics{'LinkException'};
+ delete $lics{"GPLv2"};
+ $lics{'GPLv2-linkExcep'} = 'GPLv2-linkExcep';
+ }
+
+ return %lics;
+
+}
+
+sub Remove_Redundant
+{
+ my (%lics) = @_;
+
+ if ($lics{"GPLnoVersion"} ne "" and $lics{"GPLv2"} . $lics{"GPLv2+"} .$lics{"GPLv3"} . $lics{"GPLv3+"} ne "") {
+ delete $lics{"GPLnoVersion"};
+ }
+ if ($lics{"GPLv2+"} ne "" and $lics{"GPLv3+"} ne "") {
+ delete $lics{"GPLv2+"};
+ }
+ if ($lics{'MPL1_1andLGPLv2_1'} ne "" and $lics{"MPLv1_1"} ne "") {
+ delete $lics{"MPLv1_1"};
+ }
+
+
+ return %lics;
+
+}
+
+sub Do_Equivalent
+{
+ my (%lics) = @_;
+ my %outA;
+
+ # then normalize licenses
+ foreach my $a (keys %lics) {
+ next if $a eq "SeeFile";
+ if ($equiv{$a} ne "") {
+ $outA{$equiv{$a}} = $equiv{$a};
+ } else {
+ $outA{$a} = $a;
+ }
+ }
+ return %outA;
+
+}
+
+
+sub uniq {
+ return keys %{{ map { $_ => 1 } @_ }};
+}