From fa81ddcfa925acc52632fd72de419abe29796e2d Mon Sep 17 00:00:00 2001 From: dmg Date: Tue, 29 Jul 2014 22:22:58 -0700 Subject: added experimental code --- ninka-sqlite.pl | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ unify.pl | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+) create mode 100644 ninka-sqlite.pl create mode 100644 unify.pl diff --git a/ninka-sqlite.pl b/ninka-sqlite.pl new file mode 100644 index 0000000..eee0c0f --- /dev/null +++ b/ninka-sqlite.pl @@ -0,0 +1,174 @@ +#!/usr/bin/perl + +use strict; +use Switch; +use DBI; +use File::Temp; +use File::Find; +use File::Basename; +use Scalar::Util qw(looks_like_number); + +if(scalar(@ARGV) != 2){ + print STDERR "Incorrect number of arguments\n"; + print STDERR "Correct usage is: perl ninka-wrapper \n"; + exit 1; +} + +my $path = $0; + +$path =~ s/\/+[^\/]+$//; +if ($path eq "") { + $path = "./"; +} + +my ($pack, $db) = @ARGV; + +my $dbh = DBI->connect("DBI:SQLite:dbname=$db", "", "", {RaiseError => 1}) + or die $DBI::errstr; +$dbh->do("CREATE TABLE IF NOT EXISTS + comments (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + sentences (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + goodsents (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + badsents (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + senttoks (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + licenses (filename TEXT, path TEXT, container TEXT, licenses TEXT, + num_found INT, lines INT, toks_ignored INT, toks_unmatched INT, + toks_unknown INT, tokens TEXT, + PRIMARY KEY(filename, path, container))"); + +my $tempdir = File::Temp->newdir(); +my $dirname = $tempdir->dirname; + +print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n"; +my $packext = getExtension($pack); +if ($packext eq ".bz2" || $packext eq ".gz") { + execute("tar -xvf '$pack' --directory '$dirname'"); +} elsif ($packext eq ".jar" || $packext eq ".zip") { + execute("unzip -d $dirname $pack"); +} else { + print "ninka-wrapper does not support packages with extension [$packext]\n"; +} + +my @files; +find( + sub { push @files, $File::Find::name unless -d; }, + $dirname +); + +print "***** Beginning Execution of Ninka *****\n"; +foreach my $file (@files) { + print "Running ninka on file [$file]\n"; + execute("perl ${path}/ninka.pl '$file'"); +} + +my @ninkafiles; +find( + sub { + my $ext = getExtension($File::Find::name); + if($ext =~ m/(comments|sentences|goodsent|badsent|senttok|license)$/){ + push @ninkafiles, $File::Find::name; + } + }, + $dirname +); + +print "***** Entering Ninka Data into Database [$db] *****\n"; +foreach my $file (@ninkafiles) { + + my $filepath = dirname($file); + $filepath =~ s/$dirname//; + my $basefile = basename($file); + my $rootfile = removeExtension($basefile); + my $packname = basename($pack); + + #Read entire file into a string + open (my $fh, '<', $file) or die "Can't open file $!"; + my $filedata = do { local $/; <$fh> }; + + my $sth; + switch (getExtension($basefile)){ + case ".comments" { + print "Inserting [$basefile] into table comments\n"; + $sth = $dbh->prepare("INSERT INTO comments VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + case ".sentences" { + print "Inserting [$basefile] into table sentences\n"; + $sth = $dbh->prepare("INSERT INTO sentences VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + case ".goodsent" { + print "Inserting [$basefile] into table goodsents\n"; + $sth = $dbh->prepare("INSERT INTO goodsents VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + case ".badsent" { + print "Inserting [$basefile] into table goodsents\n"; + $sth = $dbh->prepare("INSERT INTO badsents VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + case ".senttok" { + print "Inserting [$basefile] into table senttoks\n"; + $sth = $dbh->prepare("INSERT INTO senttoks VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + case ".license" { + print "Inserting [$basefile] into table licenses\n"; + my @columns = parseLicenseData($filedata); + $sth = $dbh->prepare("INSERT INTO licenses VALUES + ('$rootfile', '$filepath', '$packname', '$columns[0]', '$columns[1]', + '$columns[2]', '$columns[3]', '$columns[4]', '$columns[5]', '$columns[6]')"); + } + } + + $sth->bind_param(1, $filedata); + $sth->execute; + close($fh); +} + +$dbh->disconnect; + +sub parseLicenseData { + my ($data) = @_; + + my @columns; + my @fields = split(';', $data); + if($fields[0] eq "NONE\n"){ + @columns = '' x 7; + @columns[0] = 'NONE'; + } else { + @columns = @fields; + } + return @columns; +} + +sub getExtension { + my ($file) = @_; + my $filename = basename($file); + my ($ext) = $filename =~ /(\.[^.]+)$/; + return $ext; +} + +sub removeExtension { + my ($file) = @_; + (my $filename = $file) =~ s/\.[^.]+$//; + return $filename; +} + +sub execute { + my ($command) = @_; + my $output = `$command`; + my $status = ($? >> 8); + die "execution of [$command] failed: status [$status]\n" if ($status != 0); + return $output; +} diff --git a/unify.pl b/unify.pl new file mode 100644 index 0000000..f518fbb --- /dev/null +++ b/unify.pl @@ -0,0 +1,161 @@ +#!/usr/bin/perl + +# first pass, unify names of licenses and remove duplicates. + +# we trick regarding gpl related licenses so they are "clustered" together.. +# +# replace GPL with __GPL +# replace exception in the text with ___exception + +use strict; + +my %equiv = ( + "boostV1Ref" => "boostV1", + "X11" => "X11mit", + "X11Festival" => "X11mit", + "X11mitNoSellNoDocDocBSDvar" => "X11mit", + "X11mitwithoutSell" => 'X11mit', + "X11mitBSDvar" => "X11mit", + "X11mitwithoutSellCMUVariant" => "X11mit", + "X11mitwithoutSellCMUVariant" => "X11mit", + "X11mitwithoutSellandNoDocumentationRequi" => "X11mit", + "MITvar3" => "X11mit", + "MITvar2" => "X11mit", + "MIT" => "X11mit", + "ZLIBref" => "ZLIB", + "BSD3NoWarranty" => "BSD3", + "BSD2EndorseInsteadOfBinary" => "BSD2", + "BSD2var2" => "BSD2", + "LesserGPLv2" => "LibraryGPLv2", + "LesserGPLv2+" => "LibraryGPLv2+", + "orLGPLVer2.1" => "LesserGPLVer2.1", + "postgresqlRef" => "postgresql", + ); + +while (<>) { + chomp; + my @f = split(/;/); + # first remove duplicates + + my $l = $f[1]; + + # do a simple rewriting of this exception which is an incomplete license + + $l =~ s/^Exception$/UNKNOWN/; + + my @l = split(/,/,$l); + my %lics = %{{ map { $_ => 1 } @l }}; + + %lics = Do_Equivalent(%lics); + %lics = Remove_Redundant(%lics); + %lics = Do_Exceptions(%lics); + + my @out = sort keys %lics; + + my $t = join(',', @out); + if ($t eq "") { + $t = "UNKNOWN"; + } + print $f[0], ";$t\n"; +} + +sub Do_Exceptions +{ + my (%lics) = @_; + + if ($lics{'digiaQTExceptionNoticeVer1.1'} ne '' and $lics{'Qt'}) { + delete $lics{'digiaQTExceptionNoticeVer1.1'}; + delete $lics{'Qt'}; + $lics{'Qt-qtExcep'} = 'Qt-qtExcep'; + } + if ($lics{'BisonException'} ne "" and $lics{"GPLv3+"} ne "") { + delete $lics{'BisonException'}; + delete $lics{"GPLv3+"}; + $lics{'GPLv3+-bisonExcep'} = 'GPLv3+-bisonExcep'; + } + if ($lics{'BisonException'} ne "" and $lics{"GPLv2+"} ne "") { + delete $lics{'BisonException'}; + delete $lics{"GPLv2+"}; + $lics{'GPLv2+-bisonExcep'} = 'GPLv2+-bisonExcep'; + } + if ($lics{'BisonException'} ne "" and $lics{"GPLv2"} ne "") { + delete $lics{'BisonException'}; + delete $lics{"GPLv2"}; + $lics{'GPLv2-bisonExcep'} = 'GPLv2-bisonExcep'; + } + if ($lics{'ClassPathException'} ne "" and $lics{"GPLv2"} ne "") { + delete $lics{'ClassPathException'}; + delete $lics{"GPLv2"}; + $lics{"GPLv2-classPathExcep"} = "GPLv2-classPathExcep"; + } + if ($lics{'CDDLorGPLv2'} ne "" and $lics{"ClassPathExceptionGPLv2"} ne "") { + delete $lics{'CDDLorGPLv2'}; + delete $lics{"ClassPathExceptionGPLv2"}; + $lics{'CDDLorGPLv2-classPathExcep'} = 'CDDLorGPLv2-classPathExcep'; + } + if ($lics{'LinkException'} ne "" and $lics{"GPLv3+"} ne "") { + delete $lics{'LinkException'}; + delete $lics{"GPLv3+"}; + $lics{'GPLv3+-linkExcep'} = 'GPLv3+-linkExcep'; + } + if ($lics{'LinkException'} ne "" and $lics{"GPLv2+"} ne "") { + delete $lics{'LinkException'}; + delete $lics{"GPLv2+"}; + $lics{'GPLv2+-linkExcep'} = 'GPLv2+-linkExcep'; + } + if ($lics{'LinkException'} ne "" and $lics{"GPLv3"} ne "") { + delete $lics{'LinkException'}; + delete $lics{"GPLv3"}; + $lics{'GPLv3-linkExcep'} = 'GPLv3-linkExcep'; + } + if ($lics{'LinkException'} ne "" and $lics{"GPLv2"} ne "") { + delete $lics{'LinkException'}; + delete $lics{"GPLv2"}; + $lics{'GPLv2-linkExcep'} = 'GPLv2-linkExcep'; + } + + return %lics; + +} + +sub Remove_Redundant +{ + my (%lics) = @_; + + if ($lics{"GPLnoVersion"} ne "" and $lics{"GPLv2"} . $lics{"GPLv2+"} .$lics{"GPLv3"} . $lics{"GPLv3+"} ne "") { + delete $lics{"GPLnoVersion"}; + } + if ($lics{"GPLv2+"} ne "" and $lics{"GPLv3+"} ne "") { + delete $lics{"GPLv2+"}; + } + if ($lics{'MPL1_1andLGPLv2_1'} ne "" and $lics{"MPLv1_1"} ne "") { + delete $lics{"MPLv1_1"}; + } + + + return %lics; + +} + +sub Do_Equivalent +{ + my (%lics) = @_; + my %outA; + + # then normalize licenses + foreach my $a (keys %lics) { + next if $a eq "SeeFile"; + if ($equiv{$a} ne "") { + $outA{$equiv{$a}} = $equiv{$a}; + } else { + $outA{$a} = $a; + } + } + return %outA; + +} + + +sub uniq { + return keys %{{ map { $_ => 1 } @_ }}; +} -- cgit v1.2.1