From 457d82dee255867837e0722f2993db632170354a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 10:50:59 +0200 Subject: cleanup - remove duplicate unify.pl file * it's already in the scripts folder --- unify.pl | 161 --------------------------------------------------------------- 1 file changed, 161 deletions(-) delete mode 100644 unify.pl diff --git a/unify.pl b/unify.pl deleted file mode 100644 index f518fbb..0000000 --- a/unify.pl +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/perl - -# first pass, unify names of licenses and remove duplicates. - -# we trick regarding gpl related licenses so they are "clustered" together.. -# -# replace GPL with __GPL -# replace exception in the text with ___exception - -use strict; - -my %equiv = ( - "boostV1Ref" => "boostV1", - "X11" => "X11mit", - "X11Festival" => "X11mit", - "X11mitNoSellNoDocDocBSDvar" => "X11mit", - "X11mitwithoutSell" => 'X11mit', - "X11mitBSDvar" => "X11mit", - "X11mitwithoutSellCMUVariant" => "X11mit", - "X11mitwithoutSellCMUVariant" => "X11mit", - "X11mitwithoutSellandNoDocumentationRequi" => "X11mit", - "MITvar3" => "X11mit", - "MITvar2" => "X11mit", - "MIT" => "X11mit", - "ZLIBref" => "ZLIB", - "BSD3NoWarranty" => "BSD3", - "BSD2EndorseInsteadOfBinary" => "BSD2", - "BSD2var2" => "BSD2", - "LesserGPLv2" => "LibraryGPLv2", - "LesserGPLv2+" => "LibraryGPLv2+", - "orLGPLVer2.1" => "LesserGPLVer2.1", - "postgresqlRef" => "postgresql", - ); - -while (<>) { - chomp; - my @f = split(/;/); - # first remove duplicates - - my $l = $f[1]; - - # do a simple rewriting of this exception which is an incomplete license - - $l =~ s/^Exception$/UNKNOWN/; - - my @l = split(/,/,$l); - my %lics = %{{ map { $_ => 1 } @l }}; - - %lics = Do_Equivalent(%lics); - %lics = Remove_Redundant(%lics); - %lics = Do_Exceptions(%lics); - - my @out = sort keys %lics; - - my $t = join(',', @out); - if ($t eq "") { - $t = "UNKNOWN"; - } - print $f[0], ";$t\n"; -} - -sub Do_Exceptions -{ - my (%lics) = @_; - 
- if ($lics{'digiaQTExceptionNoticeVer1.1'} ne '' and $lics{'Qt'}) { - delete $lics{'digiaQTExceptionNoticeVer1.1'}; - delete $lics{'Qt'}; - $lics{'Qt-qtExcep'} = 'Qt-qtExcep'; - } - if ($lics{'BisonException'} ne "" and $lics{"GPLv3+"} ne "") { - delete $lics{'BisonException'}; - delete $lics{"GPLv3+"}; - $lics{'GPLv3+-bisonExcep'} = 'GPLv3+-bisonExcep'; - } - if ($lics{'BisonException'} ne "" and $lics{"GPLv2+"} ne "") { - delete $lics{'BisonException'}; - delete $lics{"GPLv2+"}; - $lics{'GPLv2+-bisonExcep'} = 'GPLv2+-bisonExcep'; - } - if ($lics{'BisonException'} ne "" and $lics{"GPLv2"} ne "") { - delete $lics{'BisonException'}; - delete $lics{"GPLv2"}; - $lics{'GPLv2-bisonExcep'} = 'GPLv2-bisonExcep'; - } - if ($lics{'ClassPathException'} ne "" and $lics{"GPLv2"} ne "") { - delete $lics{'ClassPathException'}; - delete $lics{"GPLv2"}; - $lics{"GPLv2-classPathExcep"} = "GPLv2-classPathExcep"; - } - if ($lics{'CDDLorGPLv2'} ne "" and $lics{"ClassPathExceptionGPLv2"} ne "") { - delete $lics{'CDDLorGPLv2'}; - delete $lics{"ClassPathExceptionGPLv2"}; - $lics{'CDDLorGPLv2-classPathExcep'} = 'CDDLorGPLv2-classPathExcep'; - } - if ($lics{'LinkException'} ne "" and $lics{"GPLv3+"} ne "") { - delete $lics{'LinkException'}; - delete $lics{"GPLv3+"}; - $lics{'GPLv3+-linkExcep'} = 'GPLv3+-linkExcep'; - } - if ($lics{'LinkException'} ne "" and $lics{"GPLv2+"} ne "") { - delete $lics{'LinkException'}; - delete $lics{"GPLv2+"}; - $lics{'GPLv2+-linkExcep'} = 'GPLv2+-linkExcep'; - } - if ($lics{'LinkException'} ne "" and $lics{"GPLv3"} ne "") { - delete $lics{'LinkException'}; - delete $lics{"GPLv3"}; - $lics{'GPLv3-linkExcep'} = 'GPLv3-linkExcep'; - } - if ($lics{'LinkException'} ne "" and $lics{"GPLv2"} ne "") { - delete $lics{'LinkException'}; - delete $lics{"GPLv2"}; - $lics{'GPLv2-linkExcep'} = 'GPLv2-linkExcep'; - } - - return %lics; - -} - -sub Remove_Redundant -{ - my (%lics) = @_; - - if ($lics{"GPLnoVersion"} ne "" and $lics{"GPLv2"} . $lics{"GPLv2+"} .$lics{"GPLv3"} . 
$lics{"GPLv3+"} ne "") { - delete $lics{"GPLnoVersion"}; - } - if ($lics{"GPLv2+"} ne "" and $lics{"GPLv3+"} ne "") { - delete $lics{"GPLv2+"}; - } - if ($lics{'MPL1_1andLGPLv2_1'} ne "" and $lics{"MPLv1_1"} ne "") { - delete $lics{"MPLv1_1"}; - } - - - return %lics; - -} - -sub Do_Equivalent -{ - my (%lics) = @_; - my %outA; - - # then normalize licenses - foreach my $a (keys %lics) { - next if $a eq "SeeFile"; - if ($equiv{$a} ne "") { - $outA{$equiv{$a}} = $equiv{$a}; - } else { - $outA{$a} = $a; - } - } - return %outA; - -} - - -sub uniq { - return keys %{{ map { $_ => 1 } @_ }}; -} -- cgit v1.2.1 From 9059d97db02001b60ed70260b2c33c4b9875f54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 11:36:23 +0200 Subject: add missing FileCleaner to MANIFEST * use "make disttest" to check if the distribution passes the tests * for distribution support also see http://search.cpan.org/dist/ExtUtils-MakeMaker/lib/ExtUtils/MakeMaker.pm#Distribution_Support --- MANIFEST | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST b/MANIFEST index 3379706..4982e2e 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4,6 +4,7 @@ lib/Ninka.pm lib/Ninka/abbreviations.dict lib/Ninka/CommentExtractor.pm lib/Ninka/criticalwords.dict +lib/Ninka/FileCleaner.pm lib/Ninka/interrules.dict lib/Ninka/LicenseMatcher.pm lib/Ninka/LicenseRules.pm -- cgit v1.2.1 From 3b40e6d7979b517b9f3cb3ae09bf7354fef950a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 15:33:01 +0200 Subject: include ninka-excel & ninka-sqlite in the distribution * now calling ninka with option "-i" to generate the required intermediary files * possible improvement: use the Ninka::* modules to get the infos without creating files * not using the evil "Switch" module anymore * referencing Ninka version to reduce duplication and maintenance hell --- MANIFEST | 2 + Makefile.PL | 5 ++ README | 6 +- bin/ninka-excel | 164 ++++++++++++++++++++++++++++++++++++++++++++++ 
bin/ninka-sqlite | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ ninka-excel.pl | 169 ----------------------------------------------- ninka-sqlite.pl | 196 ------------------------------------------------------- 7 files changed, 369 insertions(+), 368 deletions(-) create mode 100755 bin/ninka-excel create mode 100755 bin/ninka-sqlite delete mode 100755 ninka-excel.pl delete mode 100755 ninka-sqlite.pl diff --git a/MANIFEST b/MANIFEST index 4982e2e..328ab11 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,4 +1,6 @@ bin/ninka +bin/ninka-excel +bin/ninka-sqlite Changes lib/Ninka.pm lib/Ninka/abbreviations.dict diff --git a/Makefile.PL b/Makefile.PL index af4730c..20ea9fd 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -17,10 +17,15 @@ WriteMakefile( 'ExtUtils::MakeMaker' => '6.52', }, PREREQ_PM => { + 'DBI' => '0', + 'DBD::SQLite' => '0', 'File::Basename' => '0', + 'File::Find' => '0', 'File::Spec::Functions' => '0', + 'File::Temp' => '0', 'Getopt::Std' => '0', 'IPC::Open3' => '0', + 'Spreadsheet::WriteExcel' => '0', }, TEST_REQUIRES => { 'File::Temp' => '0', diff --git a/README b/README index b80a187..bd67b2c 100644 --- a/README +++ b/README @@ -62,9 +62,9 @@ the above paper. * Requirements - Perl version 5 or above -- for ninka-excel.pl: Perl module Spreadsheet::WriteExcel - https://metacpan.org/release/Spreadsheet-WriteExcel/ -- for ninka-sqlite.pl: Perl module DBD::SQLite +- for ninka-excel: Perl module Spreadsheet::WriteExcel + https://metacpan.org/release/Spreadsheet-WriteExcel +- for ninka-sqlite: Perl module DBD::SQLite https://metacpan.org/release/DBD-SQLite * How to install diff --git a/bin/ninka-excel b/bin/ninka-excel new file mode 100755 index 0000000..2e2d35d --- /dev/null +++ b/bin/ninka-excel @@ -0,0 +1,164 @@ +#!/usr/bin/perl +# +# Copyright (C) 2014,2015 Anthony Kohan and Daniel M. 
German +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +use strict; +use File::Temp; +use File::Find; +use File::Basename; +use Ninka; +use Spreadsheet::WriteExcel; + +if (scalar(@ARGV) != 2) { + print STDERR "Ninka v${Ninka::VERSION}. sqlite wrapper\n"; + print STDERR "Processes package file (.tar.gz, zip, jar. etc) and outputs to excel file\n"; + print STDERR "Incorrect number of arguments\n"; + print STDERR "Usage: $0 \n"; + exit 1; +} + +my $path = $0; + +$path =~ s/\/+[^\/]+$//; +if ($path eq "") { + $path = "./"; +} + +my ($pack, $excelFile) = @ARGV; + +my $workbook = Spreadsheet::WriteExcel->new($excelFile); +my $worksheet = $workbook->add_worksheet(); +my $format = $workbook->add_format(); # Add a format +$format->set_bold(); +$format->set_color('blue'); +$format->set_align('center'); + +$worksheet->set_column(0, 9, 30); +$worksheet->write(0, 0, 'Container File', $format); +$worksheet->write(0, 1, 'Path', $format); +$worksheet->write(0, 2, 'Filename', $format); +$worksheet->write(0, 3, 'Licenses', $format); +$worksheet->write(0, 4, 'Num found', $format); +$worksheet->write(0, 5, 'Lines', $format); +$worksheet->write(0, 6, 'TokensIgnored', $format); +$worksheet->write(0, 7, 'TokensUnmatched', $format); +$worksheet->write(0, 8, 'TokensUnknown', $format); +$worksheet->write(0, 9, 'Tokens', $format); + +my $tempdir = File::Temp->newdir(); +my $dirname = $tempdir->dirname; + +print "***** Extracting 
file [$pack] to temporary directory [$dirname] *****\n"; +my $packext = getExtension($pack); +if ($packext eq ".bz2" || $packext eq ".gz") { + execute("tar -xvf '$pack' --directory '$dirname'"); +} elsif ($packext eq ".jar" || $packext eq ".zip") { + execute("unzip -d $dirname $pack"); +} else { + print "ninka-wrapper does not support packages with extension [$packext]\n"; +} + +my @files; +find( + sub { push @files, $File::Find::name unless -d; }, + $dirname +); + +print "***** Beginning Execution of Ninka *****\n"; +foreach my $file (@files) { + if (-T $file) { + print "Running ninka on file [$file]\n"; + execute("perl ${path}/ninka -i '$file'"); + } +} + +print "***** Entering Ninka Data into excell file [$excelFile] *****\n"; +my $row = 1; + +foreach my $file (@files) { + + my $filepath = dirname($file); + $filepath =~ s/$dirname//; + my $basefile = fileparse($file, ()); + my $packname = basename($pack); + + #Read entire file into a string + my $filename = "${file}.license"; + + $worksheet->write($row, 0, $packname); + $worksheet->write($row, 1, $filepath); + $worksheet->write($row, 2, $basefile); + + print "Inserting [$basefile] into table spreedsheet\n"; + + if (-T $filename) { + + open (my $fh, '<', $filename) or die "Can't open file $!"; + my $filedata = do { local $/; <$fh> }; + + my @columns = parseLicenseData($filedata); + + my $originalFile = $file; + $originalFile =~ s/\.license$//; + + foreach my $i (0..7) { + $worksheet->write($row, $i+3, $columns[$i]); + } + close($fh); + + } else { + $worksheet->write($row, 3, "Binary File"); + } + $row++; +} + +$workbook->close(); + +sub parseLicenseData { + my ($data) = @_; + chomp($data); + my @columns; + my @fields = split(';', $data); + if($fields[0] eq "NONE\n"){ + @columns = '' x 7; + @columns[0] = 'NONE'; + } else { + @columns = @fields; + } + return @columns; +} + +sub getExtension { + my ($file) = @_; + my $filename = basename($file); + my ($ext) = $filename =~ /(\.[^.]+)$/; + return $ext; +} + +sub 
removeExtension { + my ($file) = @_; + (my $filename = $file) =~ s/\.[^.]+$//; + return $filename; +} + +sub execute { + my ($command) = @_; + my $output = `$command`; + my $status = ($? >> 8); + die "execution of [$command] failed: status [$status]\n" if ($status != 0); + return $output; +} diff --git a/bin/ninka-sqlite b/bin/ninka-sqlite new file mode 100755 index 0000000..6b27ea9 --- /dev/null +++ b/bin/ninka-sqlite @@ -0,0 +1,195 @@ +#!/usr/bin/perl +# +# Copyright (C) 2014,2015 Anthony Kohan and Daniel M. German +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +use strict; +use DBI; +use File::Temp; +use File::Find; +use File::Basename; +use Ninka; + +if (scalar(@ARGV) != 2) { + print STDERR "Ninka v${Ninka::VERSION}. sqlite wrapper\n"; + print STDERR "Processes package file (.tar.gz, zip, jar. 
etc) and outputs to sqlite file\n"; + print STDERR "Incorrect number of arguments\n"; + print STDERR "Usage: $0 \n"; + exit 1; +} + +my $path = $0; + +$path =~ s/\/+[^\/]+$//; +if ($path eq "") { + $path = "./"; +} + +my ($pack, $db) = @ARGV; + +my $dbh = DBI->connect("DBI:SQLite:dbname=$db", "", "", {RaiseError => 1, AutoCommit => 0}) + or die $DBI::errstr; +$dbh->do("CREATE TABLE IF NOT EXISTS + comments (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + sentences (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + goodsents (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + badsents (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + senttoks (filename TEXT, path TEXT, container TEXT, content TEXT, + PRIMARY KEY(filename, path, container))"); +$dbh->do("CREATE TABLE IF NOT EXISTS + licenses (filename TEXT, path TEXT, container TEXT, licenses TEXT, + num_found INT, lines INT, toks_ignored INT, toks_unmatched INT, + toks_unknown INT, tokens TEXT, + PRIMARY KEY(filename, path, container))"); + +my $tempdir = File::Temp->newdir(); +my $dirname = $tempdir->dirname; + +print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n"; +my $packext = getExtension($pack); +if ($packext eq ".bz2" || $packext eq ".gz") { + execute("tar -xvf '$pack' --directory '$dirname'"); +} elsif ($packext eq ".jar" || $packext eq ".zip") { + execute("unzip -d $dirname $pack"); +} else { + print "ninka-wrapper does not support packages with extension [$packext]\n"; +} + +my @files; +find( + sub { push @files, $File::Find::name unless -d; }, + $dirname +); + +print "***** Beginning Execution of Ninka *****\n"; 
+foreach my $file (@files) { + print "Running ninka on file [$file]\n"; + execute("perl ${path}/ninka -i '$file'"); +} + +my @ninkafiles; +find( + sub { + my $ext = getExtension($File::Find::name); + if($ext =~ m/(comments|sentences|goodsent|badsent|senttok|license)$/){ + push @ninkafiles, $File::Find::name; + } + }, + $dirname +); + +print "***** Entering Ninka Data into Database [$db] *****\n"; +foreach my $file (@ninkafiles) { + + my $filepath = dirname($file); + $filepath =~ s/$dirname//; + my $basefile = basename($file); + my $rootfile = removeExtension($basefile); + my $packname = basename($pack); + + #Read entire file into a string + open (my $fh, '<', $file) or die "Can't open file $!"; + my $filedata = do { local $/; <$fh> }; + + my $sth; + my $ext = getExtension($basefile); + + if ($ext eq ".comments") { + print "Inserting [$basefile] into table comments\n"; + $sth = $dbh->prepare("INSERT INTO comments VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + if ($ext eq ".sentences") { + print "Inserting [$basefile] into table sentences\n"; + $sth = $dbh->prepare("INSERT INTO sentences VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + if ($ext eq ".goodsent") { + print "Inserting [$basefile] into table goodsents\n"; + $sth = $dbh->prepare("INSERT INTO goodsents VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + if ($ext eq ".badsent") { + print "Inserting [$basefile] into table badsents\n"; + $sth = $dbh->prepare("INSERT INTO badsents VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + if ($ext eq ".senttok") { + print "Inserting [$basefile] into table senttoks\n"; + $sth = $dbh->prepare("INSERT INTO senttoks VALUES + ('$rootfile', '$filepath', '$packname', ?)"); + } + if ($ext eq ".license") { + print "Inserting [$basefile] into table licenses\n"; + my @columns = parseLicenseData($filedata); + $sth = $dbh->prepare("INSERT INTO licenses VALUES + ('$rootfile', '$filepath', '$packname', '$columns[0]', '$columns[1]', + 
'$columns[2]', '$columns[3]', '$columns[4]', '$columns[5]', '$columns[6]')"); + } + + if (defined $sth) { + $sth->bind_param(1, $filedata); + $sth->execute; + } + + close($fh); +} + +$dbh->commit(); +$dbh->disconnect(); + +sub parseLicenseData { + my ($data) = @_; + + my @columns; + my @fields = split(';', $data); + if($fields[0] eq "NONE\n"){ + @columns = '' x 7; + @columns[0] = 'NONE'; + } else { + @columns = @fields; + } + return @columns; +} + +sub getExtension { + my ($file) = @_; + my $filename = basename($file); + my ($ext) = $filename =~ /(\.[^.]+)$/; + return $ext; +} + +sub removeExtension { + my ($file) = @_; + (my $filename = $file) =~ s/\.[^.]+$//; + return $filename; +} + +sub execute { + my ($command) = @_; + my $output = `$command`; + my $status = ($? >> 8); + die "execution of [$command] failed: status [$status]\n" if ($status != 0); + return $output; +} diff --git a/ninka-excel.pl b/ninka-excel.pl deleted file mode 100755 index 71adddf..0000000 --- a/ninka-excel.pl +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/perl -# -# Copyright (C) 2014,2015 Anthony Kohan and Daniel M. German -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of -# the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -use strict; -use Switch; -use File::Temp; -use File::Find; -use File::Basename; -use Scalar::Util qw(looks_like_number); -use Spreadsheet::WriteExcel; - - - -if(scalar(@ARGV) != 2){ - print STDERR "Ninka 1.3. 
sqlite wrapper\n"; - print STDERR "Processes package file (.tar.gz, zip, jar. etc) and outputs to excel file\n"; - print STDERR "Incorrect number of arguments\n"; - print STDERR "Correct usage is: $0 \n"; - exit 1; -} - -my $path = $0; - -$path =~ s/\/+[^\/]+$//; -if ($path eq "") { - $path = "./"; -} - -my ($pack, $excelFile) = @ARGV; - -my $workbook = Spreadsheet::WriteExcel->new($excelFile); -my $worksheet = $workbook->add_worksheet(); -my $format = $workbook->add_format(); # Add a format -$format->set_bold(); -$format->set_color('blue'); -$format->set_align('center'); - -$worksheet->set_column(0, 9, 30); -$worksheet->write(0, 0, 'Container File', $format); -$worksheet->write(0, 1, 'Path', $format); -$worksheet->write(0, 2, 'Filename', $format); -$worksheet->write(0, 3, 'Licenses', $format); -$worksheet->write(0, 4, 'Num found', $format); -$worksheet->write(0, 5, 'Lines', $format); -$worksheet->write(0, 6, 'TokensIgnored', $format); -$worksheet->write(0, 7, 'TokensUnmatched', $format); -$worksheet->write(0, 8, 'TokensUnknown', $format); -$worksheet->write(0, 9, 'Tokens', $format); - -my $tempdir = File::Temp->newdir(); -my $dirname = $tempdir->dirname; - -print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n"; -my $packext = getExtension($pack); -if ($packext eq ".bz2" || $packext eq ".gz") { - execute("tar -xvf '$pack' --directory '$dirname'"); -} elsif ($packext eq ".jar" || $packext eq ".zip") { - execute("unzip -d $dirname $pack"); -} else { - print "ninka-wrapper does not support packages with extension [$packext]\n"; -} - -my @files; -find( - sub { push @files, $File::Find::name unless -d; }, - $dirname -); - -print "***** Beginning Execution of Ninka *****\n"; -foreach my $file (@files) { - if (-T $file) { - print "Running ninka on file [$file]\n"; - execute("perl ${path}/ninka.pl '$file'"); - } -} - - -print "***** Entering Ninka Data into excell file [$excelFile] *****\n"; -my $row = 1; - -foreach my $file (@files) { - - my 
$filepath = dirname($file); - $filepath =~ s/$dirname//; - my $basefile = fileparse($file, ()); - my $packname = basename($pack); - - #Read entire file into a string - my $filename = "${file}.license"; - - $worksheet->write($row, 0, $packname); - $worksheet->write($row, 1, $filepath); - $worksheet->write($row, 2, $basefile); - - print "Inserting [$basefile] into table spreedsheet\n"; - - if (-T $filename) { - - open (my $fh, '<', $filename) or die "Can't open file $!"; - my $filedata = do { local $/; <$fh> }; - - my @columns = parseLicenseData($filedata); - - - my $originalFile = $file; - $originalFile =~ s/\.license$//; - - foreach my $i (0..7) { - $worksheet->write($row, $i+3, $columns[$i]); - } - close($fh); - - } else { - $worksheet->write($row, 3, "Binary File"); - } - $row++; -} - -$workbook->close(); - -sub parseLicenseData { - my ($data) = @_; - chomp($data); - my @columns; - my @fields = split(';', $data); - if($fields[0] eq "NONE\n"){ - @columns = '' x 7; - @columns[0] = 'NONE'; - } else { - @columns = @fields; - } - return @columns; -} - -sub getExtension { - my ($file) = @_; - my $filename = basename($file); - my ($ext) = $filename =~ /(\.[^.]+)$/; - return $ext; -} - -sub removeExtension { - my ($file) = @_; - (my $filename = $file) =~ s/\.[^.]+$//; - return $filename; -} - -sub execute { - my ($command) = @_; - my $output = `$command`; - my $status = ($? >> 8); - die "execution of [$command] failed: status [$status]\n" if ($status != 0); - return $output; -} diff --git a/ninka-sqlite.pl b/ninka-sqlite.pl deleted file mode 100755 index d53f60f..0000000 --- a/ninka-sqlite.pl +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/perl -# -# Copyright (C) 2014,2015 Anthony Kohan and Daniel M. German -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of -# the License, or (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# - -use strict; -use Switch; -use DBI; -use File::Temp; -use File::Find; -use File::Basename; -use Scalar::Util qw(looks_like_number); - - - -if(scalar(@ARGV) != 2){ - print STDERR "Ninka 1.3. sqlite wrapper\n"; - print STDERR "Processes package file (.tar.gz, zip, jar. etc) and outputs to sqlite file\n"; - print STDERR "Incorrect number of arguments\n"; - print STDERR "Correct usage is: $0 \n"; - exit 1; -} - -my $path = $0; - -$path =~ s/\/+[^\/]+$//; -if ($path eq "") { - $path = "./"; -} - -my ($pack, $db) = @ARGV; - -my $dbh = DBI->connect("DBI:SQLite:dbname=$db", "", "", {RaiseError => 1, AutoCommit => 0}) - or die $DBI::errstr; -$dbh->do("CREATE TABLE IF NOT EXISTS - comments (filename TEXT, path TEXT, container TEXT, content TEXT, - PRIMARY KEY(filename, path, container))"); -$dbh->do("CREATE TABLE IF NOT EXISTS - sentences (filename TEXT, path TEXT, container TEXT, content TEXT, - PRIMARY KEY(filename, path, container))"); -$dbh->do("CREATE TABLE IF NOT EXISTS - goodsents (filename TEXT, path TEXT, container TEXT, content TEXT, - PRIMARY KEY(filename, path, container))"); -$dbh->do("CREATE TABLE IF NOT EXISTS - badsents (filename TEXT, path TEXT, container TEXT, content TEXT, - PRIMARY KEY(filename, path, container))"); -$dbh->do("CREATE TABLE IF NOT EXISTS - senttoks (filename TEXT, path TEXT, container TEXT, content TEXT, - PRIMARY KEY(filename, path, container))"); -$dbh->do("CREATE TABLE IF NOT EXISTS - licenses (filename TEXT, path TEXT, container TEXT, licenses TEXT, - num_found INT, lines INT, toks_ignored INT, toks_unmatched INT, - toks_unknown INT, tokens TEXT, - PRIMARY KEY(filename, path, 
container))"); - -my $tempdir = File::Temp->newdir(); -my $dirname = $tempdir->dirname; - -print "***** Extracting file [$pack] to temporary directory [$dirname] *****\n"; -my $packext = getExtension($pack); -if ($packext eq ".bz2" || $packext eq ".gz") { - execute("tar -xvf '$pack' --directory '$dirname'"); -} elsif ($packext eq ".jar" || $packext eq ".zip") { - execute("unzip -d $dirname $pack"); -} else { - print "ninka-wrapper does not support packages with extension [$packext]\n"; -} - -my @files; -find( - sub { push @files, $File::Find::name unless -d; }, - $dirname -); - -print "***** Beginning Execution of Ninka *****\n"; -foreach my $file (@files) { - print "Running ninka on file [$file]\n"; - execute("perl ${path}/ninka.pl '$file'"); -} - -my @ninkafiles; -find( - sub { - my $ext = getExtension($File::Find::name); - if($ext =~ m/(comments|sentences|goodsent|badsent|senttok|license)$/){ - push @ninkafiles, $File::Find::name; - } - }, - $dirname -); - -print "***** Entering Ninka Data into Database [$db] *****\n"; -foreach my $file (@ninkafiles) { - - my $filepath = dirname($file); - $filepath =~ s/$dirname//; - my $basefile = basename($file); - my $rootfile = removeExtension($basefile); - my $packname = basename($pack); - - #Read entire file into a string - open (my $fh, '<', $file) or die "Can't open file $!"; - my $filedata = do { local $/; <$fh> }; - - my $sth; - switch (getExtension($basefile)){ - -# case ".comments" { -# print "Inserting [$basefile] into table comments\n"; -# $sth = $dbh->prepare("INSERT INTO comments VALUES -# ('$rootfile', '$filepath', '$packname', ?)"); -# } - case ".sentences" { - print "Inserting [$basefile] into table sentences\n"; - $sth = $dbh->prepare("INSERT INTO sentences VALUES - ('$rootfile', '$filepath', '$packname', ?)"); - } - case ".goodsent" { - print "Inserting [$basefile] into table goodsents\n"; - $sth = $dbh->prepare("INSERT INTO goodsents VALUES - ('$rootfile', '$filepath', '$packname', ?)"); - } - case 
".badsent" { - print "Inserting [$basefile] into table goodsents\n"; - $sth = $dbh->prepare("INSERT INTO badsents VALUES - ('$rootfile', '$filepath', '$packname', ?)"); - } - case ".senttok" { - print "Inserting [$basefile] into table senttoks\n"; - $sth = $dbh->prepare("INSERT INTO senttoks VALUES - ('$rootfile', '$filepath', '$packname', ?)"); - } - case ".license" { - print "Inserting [$basefile] into table licenses\n"; - my @columns = parseLicenseData($filedata); - $sth = $dbh->prepare("INSERT INTO licenses VALUES - ('$rootfile', '$filepath', '$packname', '$columns[0]', '$columns[1]', - '$columns[2]', '$columns[3]', '$columns[4]', '$columns[5]', '$columns[6]')"); - } - } - - $sth->bind_param(1, $filedata); - $sth->execute; - close($fh); -} - -$dbh->commit(); -$dbh->disconnect(); - -sub parseLicenseData { - my ($data) = @_; - - my @columns; - my @fields = split(';', $data); - if($fields[0] eq "NONE\n"){ - @columns = '' x 7; - @columns[0] = 'NONE'; - } else { - @columns = @fields; - } - return @columns; -} - -sub getExtension { - my ($file) = @_; - my $filename = basename($file); - my ($ext) = $filename =~ /(\.[^.]+)$/; - return $ext; -} - -sub removeExtension { - my ($file) = @_; - (my $filename = $file) =~ s/\.[^.]+$//; - return $filename; -} - -sub execute { - my ($command) = @_; - my $output = `$command`; - my $status = ($? 
>> 8); - die "execution of [$command] failed: status [$status]\n" if ($status != 0); - return $output; -} -- cgit v1.2.1 From 703f23e8d91316bf22cd7ec6ab877438e4fc9007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 16:00:00 +0200 Subject: set execution flag on scripts --- scripts/license_matcher_modified.pl | 0 scripts/parseLicense.pl | 0 scripts/sort_package_license_list.pl | 0 scripts/unify.pl | 0 4 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/license_matcher_modified.pl mode change 100644 => 100755 scripts/parseLicense.pl mode change 100644 => 100755 scripts/sort_package_license_list.pl mode change 100644 => 100755 scripts/unify.pl diff --git a/scripts/license_matcher_modified.pl b/scripts/license_matcher_modified.pl old mode 100644 new mode 100755 diff --git a/scripts/parseLicense.pl b/scripts/parseLicense.pl old mode 100644 new mode 100755 diff --git a/scripts/sort_package_license_list.pl b/scripts/sort_package_license_list.pl old mode 100644 new mode 100755 diff --git a/scripts/unify.pl b/scripts/unify.pl old mode 100644 new mode 100755 -- cgit v1.2.1 From 926271a6bcf456c4d7b00acbc65bc7260a49eff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 17:42:18 +0200 Subject: include "scripts" folder in the distribution --- MANIFEST | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MANIFEST b/MANIFEST index 328ab11..6f51dd9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -20,6 +20,10 @@ Makefile.PL MANIFEST MANIFEST.SKIP README +scripts/license_matcher_modified.pl +scripts/parseLicense.pl +scripts/sort_package_license_list.pl +scripts/unify.pl t/data/expected_output/AAL t/data/expected_output/AFL-1.1 t/data/expected_output/AFL-1.2 -- cgit v1.2.1 From 580fb4848cbcce37dffe64f2566cdbfbbf00f2d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 12:31:33 +0200 Subject: test that documentation uses valid syntax * POD - Plain Old 
Documentation (the documentation format used by Perl) --- MANIFEST | 1 + Makefile.PL | 1 + t/pod_ok.t | 5 +++++ 3 files changed, 7 insertions(+) create mode 100644 t/pod_ok.t diff --git a/MANIFEST b/MANIFEST index 6f51dd9..938e912 100644 --- a/MANIFEST +++ b/MANIFEST @@ -216,5 +216,6 @@ t/data/licenses/OSL-2.1 t/data/licenses/OSL-3.0 t/data/licenses/PRESERVE_COPYRIGHT_NOTICE t/data/licenses/Public-domain +t/pod_ok.t t/reference_licenses.t t/syntax_ok_and_use_strict.t diff --git a/Makefile.PL b/Makefile.PL index 20ea9fd..790ea55 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -30,6 +30,7 @@ WriteMakefile( TEST_REQUIRES => { 'File::Temp' => '0', 'Test::More' => '0.98', + 'Test::Pod' => '1.00', 'Test::Strict' => '0', }, META_MERGE => { diff --git a/t/pod_ok.t b/t/pod_ok.t new file mode 100644 index 0000000..57b423a --- /dev/null +++ b/t/pod_ok.t @@ -0,0 +1,5 @@ +use strict; +use warnings; +use Test::Pod; + +all_pod_files_ok(); -- cgit v1.2.1 From 425293b6ffc84164a3397ec95fad0f69f924d3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Mon, 1 Jun 2015 23:53:47 +0200 Subject: fix indentation for pod format --- lib/Ninka.pm | 22 +++++++++++----------- lib/Ninka/CommentExtractor.pm | 24 ++++++++++++------------ lib/Ninka/FileCleaner.pm | 24 ++++++++++++------------ lib/Ninka/LicenseMatcher.pm | 22 +++++++++++----------- lib/Ninka/LicenseRules.pm | 22 +++++++++++----------- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/lib/Ninka.pm b/lib/Ninka.pm index dc9fbff..526aeab 100644 --- a/lib/Ninka.pm +++ b/lib/Ninka.pm @@ -86,19 +86,19 @@ Scans a file and returns the found licenses. =head1 COPYRIGHT AND LICENSE - Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. 
German - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . +You should have received a copy of the GNU General Public License +along with this program. If not, see . =cut diff --git a/lib/Ninka/CommentExtractor.pm b/lib/Ninka/CommentExtractor.pm index fd62c02..8a2e66f 100644 --- a/lib/Ninka/CommentExtractor.pm +++ b/lib/Ninka/CommentExtractor.pm @@ -93,19 +93,19 @@ If no comment extractor is known for a language, then extracts top lines from so =head1 COPYRIGHT AND LICENSE - Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. 
+This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . +You should have received a copy of the GNU General Public License +along with this program. If not, see . -=cut \ No newline at end of file +=cut diff --git a/lib/Ninka/FileCleaner.pm b/lib/Ninka/FileCleaner.pm index c3dd912..825b1fb 100644 --- a/lib/Ninka/FileCleaner.pm +++ b/lib/Ninka/FileCleaner.pm @@ -49,19 +49,19 @@ Escapes apostrophes and other potentially disturbing characters =head1 COPYRIGHT AND LICENSE - Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German, 2015 Daniele Fognini and Johannes Najjar +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German, 2015 Daniele Fognini and Johannes Najjar - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. 
- This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . +You should have received a copy of the GNU General Public License +along with this program. If not, see . -=cut \ No newline at end of file +=cut diff --git a/lib/Ninka/LicenseMatcher.pm b/lib/Ninka/LicenseMatcher.pm index 1cb402a..ba73b26 100644 --- a/lib/Ninka/LicenseMatcher.pm +++ b/lib/Ninka/LicenseMatcher.pm @@ -261,19 +261,19 @@ Uses a set of license sentence names as input and outputs license names correspo =head1 COPYRIGHT AND LICENSE - Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
+This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . +You should have received a copy of the GNU General Public License +along with this program. If not, see . =cut diff --git a/lib/Ninka/LicenseRules.pm b/lib/Ninka/LicenseRules.pm index c7810c8..ee5af4b 100644 --- a/lib/Ninka/LicenseRules.pm +++ b/lib/Ninka/LicenseRules.pm @@ -96,19 +96,19 @@ Contains rules used by Ninka::LicenseMatcher. =head1 COPYRIGHT AND LICENSE - Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
+You should have received a copy of the GNU General Public License +along with this program. If not, see . =cut -- cgit v1.2.1 From 5d8906ae2b965d8497350e0e6a16ddb2e5e67a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 11:33:38 +0200 Subject: fix encoding of Makefile.PL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use utf8, otherwise encoding is broken for example in MYMETA.* files ("René" encoded as "RenÃ©") --- Makefile.PL | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.PL b/Makefile.PL index 790ea55..95f568b 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -1,5 +1,6 @@ use strict; use warnings; +use utf8; use ExtUtils::MakeMaker; WriteMakefile( -- cgit v1.2.1 From 4419dba89e92b471d1b869de8ece10ddb567be4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 17:35:27 +0200 Subject: correctly placing the SCOWL abbreviations information * all license files should live in the base directory * moved info how abbreviations have been extracted from a special README into a notes section of Ninka::SentenceExtractor --- Copyright.SCOWL | 230 +++++++++++++++++++++++++++++++++++++++++ MANIFEST | 1 + lib/Ninka/Copyright.SCOWL | 230 ----------------------------------------- lib/Ninka/README.txt | 9 -- lib/Ninka/SentenceExtractor.pm | 13 +++ 5 files changed, 244 insertions(+), 239 deletions(-) create mode 100644 Copyright.SCOWL delete mode 100644 lib/Ninka/Copyright.SCOWL delete mode 100644 lib/Ninka/README.txt diff --git a/Copyright.SCOWL b/Copyright.SCOWL new file mode 100644 index 0000000..a4654a3 --- /dev/null +++ b/Copyright.SCOWL @@ -0,0 +1,230 @@ +The collective work is Copyright 2000-2015 by Kevin Atkinson as well +as any of the copyrights mentioned below: + + Copyright 2000-2015 by Kevin Atkinson + + Permission to use, copy, modify, distribute and sell these word + lists, the associated scripts, the output created from the scripts, + and its
documentation for any purpose is hereby granted without fee, + provided that the above copyright notice appears in all copies and + that both that copyright notice and this permission notice appear in + supporting documentation. Kevin Atkinson makes no representations + about the suitability of this array for any purpose. It is provided + "as is" without express or implied warranty. + +Alan Beale also deserves special credit as he has, +in addition to providing the 12Dicts package and being a major +contributor to the ENABLE word list, given me an incredible amount of +feedback and created a number of special lists (those found in the +Supplement) in order to help improve the overall quality of SCOWL. + +The 10 level includes the 1000 most common English words (according to +the Moby (TM) Words II [MWords] package), a subset of the 1000 most +common words on the Internet (again, according to Moby Words II), and +frequently class 16 from Brian Kelk's "UK English Wordlist +with Frequency Classification". + +The MWords package was explicitly placed in the public domain: + + The Moby lexicon project is complete and has + been place into the public domain. Use, sell, + rework, excerpt and use in any way on any platform. + + Placing this material on internal or public servers is + also encouraged. The compiler is not aware of any + export restrictions so freely distribute world-wide. + + You can verify the public domain status by contacting + + Grady Ward + 3449 Martha Ct. + Arcata, CA 95521-4884 + + grady@netcom.com + grady@northcoast.com + +The "UK English Wordlist With Frequency Classification" is also in the +Public Domain: + + Date: Sat, 08 Jul 2000 20:27:21 +0100 + From: Brian Kelk + + > I was wondering what the copyright status of your "UK English + > Wordlist With Frequency Classification" word list as it seems to + > be lacking any copyright notice. + + There were many many sources in total, but any text marked + "copyright" was avoided. 
Locally-written documentation was one + source. An earlier version of the list resided in a filespace called + PUBLIC on the University mainframe, because it was considered public + domain. + + Date: Tue, 11 Jul 2000 19:31:34 +0100 + + > So are you saying your word list is also in the public domain? + + That is the intention. + +The 20 level includes frequency classes 7-15 from Brian's word list. + +The 35 level includes frequency classes 2-6 and words appearing in at +least 11 of 12 dictionaries as indicated in the 12Dicts package. All +words from the 12Dicts package have had likely inflections added via +my inflection database. + +The 12Dicts package and Supplement is in the Public Domain. + +The WordNet database, which was used in the creation of the +Inflections database, is under the following copyright: + + This software and database is being provided to you, the LICENSEE, + by Princeton University under the following license. By obtaining, + using and/or copying this software and database, you agree that you + have read, understood, and will comply with these terms and + conditions.: + + Permission to use, copy, modify and distribute this software and + database and its documentation for any purpose and without fee or + royalty is hereby granted, provided that you agree to comply with + the following copyright notice and statements, including the + disclaimer, and that the same appear on ALL copies of the software, + database and documentation, including modifications that you make + for internal use or for distribution. + + WordNet 1.6 Copyright 1997 by Princeton University. All rights + reserved. + + THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON + UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR + IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON + UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- + ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE + LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY + THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. + + The name of Princeton University or Princeton may not be used in + advertising or publicity pertaining to distribution of the software + and/or database. Title to copyright in this software, database and + any associated documentation shall at all times remain with + Princeton University and LICENSEE agrees to preserve same. + +The 40 level includes words from Alan's 3esl list found in version 4.0 +of his 12dicts package. Like his other stuff the 3esl list is also in the +public domain. + +The 50 level includes Brian's frequency class 1, words appearing +in at least 5 of 12 of the dictionaries as indicated in the 12Dicts +package, and uppercase words in at least 4 of the previous 12 +dictionaries. A decent number of proper names is also included: The +top 1000 male, female, and Last names from the 1990 Census report; a +list of names sent to me by Alan Beale; and a few names that I added +myself. Finally a small list of abbreviations not commonly found in +other word lists is included. + +The name files form the Census report is a government document which I +don't think can be copyrighted. + +The file special-jargon.50 uses common.lst and word.lst from the +"Unofficial Jargon File Word Lists" which is derived from "The Jargon +File". All of which is in the Public Domain. This file also contain +a few extra UNIX terms which are found in the file "unix-terms" in the +special/ directory. + +The 55 level includes words from Alan's 2of4brif list found in version +4.0 of his 12dicts package. Like his other stuff the 2of4brif is also +in the public domain. 
+ +The 60 level includes all words appearing in at least 2 of the 12 +dictionaries as indicated by the 12Dicts package. + +The 70 level includes Brian's frequency class 0 and the 74,550 common +dictionary words from the MWords package. The common dictionary words, +like those from the 12Dicts package, have had all likely inflections +added. The 70 level also included the 5desk list from version 4.0 of +the 12Dics package which is in the public domain. + +The 80 level includes the ENABLE word list, all the lists in the +ENABLE supplement package (except for ABLE), the "UK Advanced Cryptics +Dictionary" (UKACD), the list of signature words from the YAWL package, +and the 10,196 places list from the MWords package. + +The ENABLE package, mainted by M\Cooper , +is in the Public Domain: + + The ENABLE master word list, WORD.LST, is herewith formally released + into the Public Domain. Anyone is free to use it or distribute it in + any manner they see fit. No fee or registration is required for its + use nor are "contributions" solicited (if you feel you absolutely + must contribute something for your own peace of mind, the authors of + the ENABLE list ask that you make a donation on their behalf to your + favorite charity). This word list is our gift to the Scrabble + community, as an alternate to "official" word lists. Game designers + may feel free to incorporate the WORD.LST into their games. Please + mention the source and credit us as originators of the list. Note + that if you, as a game designer, use the WORD.LST in your product, + you may still copyright and protect your product, but you may *not* + legally copyright or in any way restrict redistribution of the + WORD.LST portion of your product. This *may* under law restrict your + rights to restrict your users' rights, but that is only fair. + +UKACD, by J Ross Beresford , is under the +following copyright: + + Copyright (c) J Ross Beresford 1993-1999. All Rights Reserved. 
+ + The following restriction is placed on the use of this publication: + if The UK Advanced Cryptics Dictionary is used in a software package + or redistributed in any form, the copyright notice must be + prominently displayed and the text of this document must be included + verbatim. + + There are no other restrictions: I would like to see the list + distributed as widely as possible. + +The 95 level includes the 354,984 single words, 256,772 compound +words, 4,946 female names and the 3,897 male names, and 21,986 names +from the MWords package, ABLE.LST from the ENABLE Supplement, and some +additional words found in my part-of-speech database that were not +found anywhere else. + +Accent information was taken from UKACD. + +My VARCON package was used to create the American, British, and +Canadian word list. + +Since the original word lists used in the VARCON package came +from the Ispell distribution they are under the Ispell copyright: + + Copyright 1993, Geoff Kuenning, Granada Hills, CA + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. All modifications to the source code must be clearly marked as + such. Binary redistributions based on modified source code + must be clearly marked as modified versions in the documentation + and/or other materials provided with the distribution. + (clause 4 removed with permission from Geoff Kuenning) + 5. The name of Geoff Kuenning may not be used to endorse or promote + products derived from this software without specific prior + written permission. 
+ + THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS + IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEOFF + KUENNING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST b/MANIFEST index 938e912..1508f1e 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2,6 +2,7 @@ bin/ninka bin/ninka-excel bin/ninka-sqlite Changes +Copyright.SCOWL lib/Ninka.pm lib/Ninka/abbreviations.dict lib/Ninka/CommentExtractor.pm diff --git a/lib/Ninka/Copyright.SCOWL b/lib/Ninka/Copyright.SCOWL deleted file mode 100644 index a4654a3..0000000 --- a/lib/Ninka/Copyright.SCOWL +++ /dev/null @@ -1,230 +0,0 @@ -The collective work is Copyright 2000-2015 by Kevin Atkinson as well -as any of the copyrights mentioned below: - - Copyright 2000-2015 by Kevin Atkinson - - Permission to use, copy, modify, distribute and sell these word - lists, the associated scripts, the output created from the scripts, - and its documentation for any purpose is hereby granted without fee, - provided that the above copyright notice appears in all copies and - that both that copyright notice and this permission notice appear in - supporting documentation. Kevin Atkinson makes no representations - about the suitability of this array for any purpose. It is provided - "as is" without express or implied warranty. 
- -Alan Beale also deserves special credit as he has, -in addition to providing the 12Dicts package and being a major -contributor to the ENABLE word list, given me an incredible amount of -feedback and created a number of special lists (those found in the -Supplement) in order to help improve the overall quality of SCOWL. - -The 10 level includes the 1000 most common English words (according to -the Moby (TM) Words II [MWords] package), a subset of the 1000 most -common words on the Internet (again, according to Moby Words II), and -frequently class 16 from Brian Kelk's "UK English Wordlist -with Frequency Classification". - -The MWords package was explicitly placed in the public domain: - - The Moby lexicon project is complete and has - been place into the public domain. Use, sell, - rework, excerpt and use in any way on any platform. - - Placing this material on internal or public servers is - also encouraged. The compiler is not aware of any - export restrictions so freely distribute world-wide. - - You can verify the public domain status by contacting - - Grady Ward - 3449 Martha Ct. - Arcata, CA 95521-4884 - - grady@netcom.com - grady@northcoast.com - -The "UK English Wordlist With Frequency Classification" is also in the -Public Domain: - - Date: Sat, 08 Jul 2000 20:27:21 +0100 - From: Brian Kelk - - > I was wondering what the copyright status of your "UK English - > Wordlist With Frequency Classification" word list as it seems to - > be lacking any copyright notice. - - There were many many sources in total, but any text marked - "copyright" was avoided. Locally-written documentation was one - source. An earlier version of the list resided in a filespace called - PUBLIC on the University mainframe, because it was considered public - domain. - - Date: Tue, 11 Jul 2000 19:31:34 +0100 - - > So are you saying your word list is also in the public domain? - - That is the intention. - -The 20 level includes frequency classes 7-15 from Brian's word list. 
- -The 35 level includes frequency classes 2-6 and words appearing in at -least 11 of 12 dictionaries as indicated in the 12Dicts package. All -words from the 12Dicts package have had likely inflections added via -my inflection database. - -The 12Dicts package and Supplement is in the Public Domain. - -The WordNet database, which was used in the creation of the -Inflections database, is under the following copyright: - - This software and database is being provided to you, the LICENSEE, - by Princeton University under the following license. By obtaining, - using and/or copying this software and database, you agree that you - have read, understood, and will comply with these terms and - conditions.: - - Permission to use, copy, modify and distribute this software and - database and its documentation for any purpose and without fee or - royalty is hereby granted, provided that you agree to comply with - the following copyright notice and statements, including the - disclaimer, and that the same appear on ALL copies of the software, - database and documentation, including modifications that you make - for internal use or for distribution. - - WordNet 1.6 Copyright 1997 by Princeton University. All rights - reserved. - - THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON - UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR - IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON - UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- - ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE - LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY - THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. - - The name of Princeton University or Princeton may not be used in - advertising or publicity pertaining to distribution of the software - and/or database. 
Title to copyright in this software, database and - any associated documentation shall at all times remain with - Princeton University and LICENSEE agrees to preserve same. - -The 40 level includes words from Alan's 3esl list found in version 4.0 -of his 12dicts package. Like his other stuff the 3esl list is also in the -public domain. - -The 50 level includes Brian's frequency class 1, words appearing -in at least 5 of 12 of the dictionaries as indicated in the 12Dicts -package, and uppercase words in at least 4 of the previous 12 -dictionaries. A decent number of proper names is also included: The -top 1000 male, female, and Last names from the 1990 Census report; a -list of names sent to me by Alan Beale; and a few names that I added -myself. Finally a small list of abbreviations not commonly found in -other word lists is included. - -The name files form the Census report is a government document which I -don't think can be copyrighted. - -The file special-jargon.50 uses common.lst and word.lst from the -"Unofficial Jargon File Word Lists" which is derived from "The Jargon -File". All of which is in the Public Domain. This file also contain -a few extra UNIX terms which are found in the file "unix-terms" in the -special/ directory. - -The 55 level includes words from Alan's 2of4brif list found in version -4.0 of his 12dicts package. Like his other stuff the 2of4brif is also -in the public domain. - -The 60 level includes all words appearing in at least 2 of the 12 -dictionaries as indicated by the 12Dicts package. - -The 70 level includes Brian's frequency class 0 and the 74,550 common -dictionary words from the MWords package. The common dictionary words, -like those from the 12Dicts package, have had all likely inflections -added. The 70 level also included the 5desk list from version 4.0 of -the 12Dics package which is in the public domain. 
- -The 80 level includes the ENABLE word list, all the lists in the -ENABLE supplement package (except for ABLE), the "UK Advanced Cryptics -Dictionary" (UKACD), the list of signature words from the YAWL package, -and the 10,196 places list from the MWords package. - -The ENABLE package, mainted by M\Cooper , -is in the Public Domain: - - The ENABLE master word list, WORD.LST, is herewith formally released - into the Public Domain. Anyone is free to use it or distribute it in - any manner they see fit. No fee or registration is required for its - use nor are "contributions" solicited (if you feel you absolutely - must contribute something for your own peace of mind, the authors of - the ENABLE list ask that you make a donation on their behalf to your - favorite charity). This word list is our gift to the Scrabble - community, as an alternate to "official" word lists. Game designers - may feel free to incorporate the WORD.LST into their games. Please - mention the source and credit us as originators of the list. Note - that if you, as a game designer, use the WORD.LST in your product, - you may still copyright and protect your product, but you may *not* - legally copyright or in any way restrict redistribution of the - WORD.LST portion of your product. This *may* under law restrict your - rights to restrict your users' rights, but that is only fair. - -UKACD, by J Ross Beresford , is under the -following copyright: - - Copyright (c) J Ross Beresford 1993-1999. All Rights Reserved. - - The following restriction is placed on the use of this publication: - if The UK Advanced Cryptics Dictionary is used in a software package - or redistributed in any form, the copyright notice must be - prominently displayed and the text of this document must be included - verbatim. - - There are no other restrictions: I would like to see the list - distributed as widely as possible. 
- -The 95 level includes the 354,984 single words, 256,772 compound -words, 4,946 female names and the 3,897 male names, and 21,986 names -from the MWords package, ABLE.LST from the ENABLE Supplement, and some -additional words found in my part-of-speech database that were not -found anywhere else. - -Accent information was taken from UKACD. - -My VARCON package was used to create the American, British, and -Canadian word list. - -Since the original word lists used in the VARCON package came -from the Ispell distribution they are under the Ispell copyright: - - Copyright 1993, Geoff Kuenning, Granada Hills, CA - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - 3. All modifications to the source code must be clearly marked as - such. Binary redistributions based on modified source code - must be clearly marked as modified versions in the documentation - and/or other materials provided with the distribution. - (clause 4 removed with permission from Geoff Kuenning) - 5. The name of Geoff Kuenning may not be used to endorse or promote - products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS - IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL GEOFF - KUENNING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. diff --git a/lib/Ninka/README.txt b/lib/Ninka/README.txt deleted file mode 100644 index 2ca7f46..0000000 --- a/lib/Ninka/README.txt +++ /dev/null @@ -1,9 +0,0 @@ -This list of abbreviations was extracted from SCOWL (Spell Checker Oriented Word Lists) by by Kevin Atkinson (kevina@gnu.org) version 2015.04.24. - -Specifically it was created from scowl-2015.04.24.tar.gz. by running: - -cat *abbrev* | sort -u > abbreviations.dict - -It also contains some additions by D.M German. - -See Copyright.scowl for license. diff --git a/lib/Ninka/SentenceExtractor.pm b/lib/Ninka/SentenceExtractor.pm index c27d199..e476463 100644 --- a/lib/Ninka/SentenceExtractor.pm +++ b/lib/Ninka/SentenceExtractor.pm @@ -251,6 +251,19 @@ Ninka::SentenceExtractor Breaks comments into sentences. +=head1 NOTES + +This list of abbreviations was extracted from SCOWL (Spell Checker Oriented Word Lists) +by Kevin Atkinson (kevina@gnu.org) version 2015.04.24. + +Specifically it was created from scowl-2015.04.24.tar.gz. by running: + + cat *abbrev* | sort -u > abbreviations.dict + +It also contains some additions by D.M German. + +See Copyright.SCOWL for license. 
+ =head1 COPYRIGHT AND LICENSE Author: Paul Clough -- cgit v1.2.1 From 9f3023e62659702d85a1fccbc5d49a4bb8392ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Tue, 2 Jun 2015 00:03:09 +0200 Subject: update documenation to changed license & modularized implemenation * also replace separate maintained man page via auto-generated one from pod * documentation of the "ninka" script is maintained as pod directly inside the script itself (bin/ninka) * at build time via "make" a man page is generated under blib/man1/ninka.1p --- Changes | 2 +- Makefile.PL | 4 +- README | 118 ++++++++++++++++++++++------------------------------------- bin/ninka | 70 ++++++++++++++++++++++++++++++----- lib/Ninka.pm | 4 +- man/ninka.1 | 83 ----------------------------------------- 6 files changed, 109 insertions(+), 172 deletions(-) delete mode 100644 man/ninka.1 diff --git a/Changes b/Changes index ad02822..04ec33e 100644 --- a/Changes +++ b/Changes @@ -22,7 +22,7 @@ * ninka.pl: fixed bug in finding the path of where ninka was being executed from (reported by Ryan Biesemeyer) - * Fixed quotes in perl (René bScheibe) + * Fixed quotes in perl (René Scheibe) 2015-01-05 dmg diff --git a/Makefile.PL b/Makefile.PL index 95f568b..b29cf02 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -7,7 +7,7 @@ WriteMakefile( NAME => 'Ninka', VERSION_FROM => 'lib/Ninka.pm', ABSTRACT_FROM => 'lib/Ninka.pm', - LICENSE => 'agpl_3', + LICENSE => 'gpl_2', AUTHOR => [ 'Daniel M. German ', 'Yuki Manabe ', @@ -38,7 +38,7 @@ WriteMakefile( resources => { homepage => 'http://ninka.turingmachine.org/', repository => 'https://github.com/dmgerman/ninka', - license => 'http://www.gnu.org/licenses/agpl-3.0.html', + license => 'http://www.gnu.org/licenses/gpl-2.0.html', }, }, ); diff --git a/README b/README index bd67b2c..dbbe6f1 100644 --- a/README +++ b/README @@ -11,16 +11,13 @@ under which a source file is made available. 
This tool uses a source file as input and outputs the licenses identified within that file. -If you need to know the detail of Ninka, please see the following -paper: +If you need to know the detail of Ninka, please see the following paper: Daniel M. German, Yuki Manabe and Katsuro Inoue. A sentence-matching method for automatic license identification of source code files. In 25nd IEEE/ACM International Conference on Automated Software Engineering (ASE 2010). You can email me (dmg@uvic.ca) for a copy or -download it from - -http://turingmachine.org/~dmg/papers/dmg2010ninka.pdf +download it from http://turingmachine.org/~dmg/papers/dmg2010ninka.pdf. If you use Ninka for research purposes, we would appreciate you cite the above paper. @@ -28,13 +25,13 @@ the above paper. * Contributors - Paul Clough for his code to split sentences -- Anthony Kohan for writing the excel and sqlite backends. -- Armijn Hemel from Tjaldur Software Governance Solutions for multiple bug reports and suggestions +- Anthony Kohan for writing the excel and sqlite backends +- Armijn Hemel from Tjaldur Software Governance Solutions for multiple bug reports and suggestions +- René Scheibe for modularizing the code * License - Except for the directories comments and splitter, Ninka is licensed - under the GPLv2+ + Ninka is licensed under the GPLv2+: Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German @@ -51,13 +48,10 @@ the above paper. You should have received a copy of the GNU General Public License along with this program. If not, see . - - splitter.pl is a derivative work of the Rule-based sentence - splitter script by Paul Paul Clough. Please see splitter/README - for details. + Ninka::SentenceExtractor is a derivative work of the rule-based sentence + splitter script by Paul Clough. - - comments is based on a program to remove comments by Jon Newman, - it is released under the GNU General Public License Version 2 or - (at your option) any later version. 
+ comments is based on a program to remove comments by Jon Newman. * Requirements @@ -70,40 +64,25 @@ the above paper. * How to install 1. Unpack the distribution in a directory. - 2. Optional: Build and install comments (make sure it is somwehere in the - path) (see directory comments) - + 2. Optional: Build and install comments (make sure it is somewhere in the path) (see directory comments) -* Usage: +* Usage -Ninka uses a pipe model (see below). Each step of the "pipe" creates a -file, but +ninka [options] filename -ninka.pl [options] [filename] +Available options: -Available options + -i create intermediary files -v verbose - -d delete intermediate files - -C force creation of comments file - -c stop after creation of comments - -S force creation of sentences file - -s stop after creation of sentences - -G force creation of goodsent file - -g stop after creation of goodsent - -T force creation of senttok file - -t stop after creation of senttok - -L force creation of license file - -f force all processing - Example: - ninka.pl foo.c + ninka -i foo.c It will create five files: - 1. foo.c.comments: extracted the first two comments blocks, where - the license is usually + 1. foo.c.comments: extracted the first comments blocks, where + the license is usually included 2. foo.c.sentences: creates the list of sentences in the license statement 3. foo.c.goodsent: contains sentences that are likely to be part of @@ -117,69 +96,60 @@ It will create five files: - Licenses - Unmatched sentences in *.senttok that were not matched - - +The files are not required for Ninka's functionality. But they can help +to debug license detection issues. * Ninka model Ninka uses a pipe-model. Each stage of the pipe does something very specific: - 1. Comment extractor. +1. Comment extractor - - directory: extComments + - Module: Ninka::CommentExtractor - - command: extComments.pl, might use comments (included in distribution) + - Purpose: Extracts top comments of source code. 
+ If no comment extractor is known for the language, + then extracts top lines from source (currently 700) - - Purpose: Extracts top comments of source code. If no - comment extractor is known for the language, then extracts top lines from source (currently 700) - - - Creates .comments file + - Output: .comments 2. Split sentences in comments - - directory: splitter - - - command: splitter.pl - - - Purpose: Ninka works by matching sentences of licenses, hence - it needs to properly break text into sentences. - - - Outputs .sentences - -3. Filter "good" sentences. + - Module: Ninka::SentenceExtractor - - directory filter + - Purpose: Ninka works by matching sentences of licenses, + hence it needs to properly break text into sentences. - - command: filter.pl + - Output: .sentences - - Purpose: some sentences are related to a license, some are - not. It is valuable to know if a file contains lines that look - like a license or not (e.g. to know that a file has no license) +3. Filter "good" sentences - - Outputs: .goodsent, and .badsent (not used) + - Module: Ninka::SentenceFilter -4. Tokenizes sentences + - Purpose: Some sentences are related to a license, some are not. + It is valuable to know if a file contains lines that look like + a license or not (e.g. to know that a file has no license). - - Directory senttok + - Output: .goodsent and .badsent - - command: senttok.pl +4. Tokenize sentences - - Purpose: It creates a file that corresponds to the recognized - sentence tokens. For each sentence, it outputs its sentence token, or unknown otherwise. + - Module: Ninka::SentenceTokenizer - - Outputs: .senttok + - Purpose: It creates a file that corresponds to the recognized sentence tokens. + For each sentence, it outputs its sentence token, or unknown otherwise. -5. Matches sentences to licenses + - Output: .senttok - - Directory matcher +5. 
Match sentences to licenses - - Command: matcher.pl + - Module: Ninka::LicenseMatcher - - Purpose: looks at the sequence of sentence tokens and outputs the licenses found + - Purpose: It looks at the sentence tokens and outputs the licenses found. - Output: .license -The script ninka.pl takes care of all these steps, and optionally removes +The script ninka takes care of all these steps, and optionally creates intermediary files, and writes to the stdout the licenses found. ------ diff --git a/bin/ninka b/bin/ninka index 4732cbe..9cfd6aa 100755 --- a/bin/ninka +++ b/bin/ninka @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl use strict; use warnings; @@ -19,7 +19,7 @@ sub parse_cmdline_parameters { if (!getopts('iv', \%opts) || scalar(@ARGV) == 0) { print STDERR "Ninka v${Ninka::VERSION} -Usage: $0 [options] +Usage: ninka [options] Options: -i create intermediary files @@ -32,29 +32,79 @@ Options: __END__ +=encoding utf8 + =head1 NAME -ninka +ninka - source file license identification tool + +=head1 SYNOPSIS + +B<ninka> [options] F<filename> =head1 DESCRIPTION -Scans a file and returns the found licenses. +Scans a source file and returns the found licenses. + +=head1 OPTIONS + +=over + +=item B<-i> + +create intermediary files (for debugging) + +=item B<-v> + +verbose + +=back + +=head1 EXAMPLES + +=over + +=item B<ninka> F<foo.c> + +Determine the licenses in file F<foo.c>. + +=item B<ninka -i> F<foo.c> + +Determine the licenses in file F<foo.c> and create intermediary files (for debugging). + +=item find * | xargs -n1 -I@ B<ninka> '@' + +Determine the licenses of files in a directory. + +=back + +=head1 AUTHOR + +B<ninka> was written by Daniel M. German and Yuki Manabe. + +=head1 SEE ALSO + +Daniel M. German, Yuki Manabe and Katsuro Inoue. A sentence-matching method +for automatic license identification of source code files. In 25th IEEE/ACM +International Conference on Automated Software Engineering (ASE 2010). + +You can download it from http://turingmachine.org/~dmg/papers/dmg2010ninka.pdf. 
=head1 COPYRIGHT AND LICENSE -Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German +Copyright (C) 2009-2014 Yuki Manabe and Daniel M. German, 2015 René Scheibe -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as -published by the Free Software Foundation, either version 3 of the +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. +GNU General Public License for more details. -You should have received a copy of the GNU Affero General Public License +You should have received a copy of the GNU General Public License along with this program. If not, see . =cut diff --git a/lib/Ninka.pm b/lib/Ninka.pm index 526aeab..8f454cd 100644 --- a/lib/Ninka.pm +++ b/lib/Ninka.pm @@ -68,7 +68,7 @@ __END__ =head1 NAME -Ninka - Find licenses in source files. +Ninka - source file license identification tool =head1 SYNOPSIS @@ -82,7 +82,7 @@ Ninka - Find licenses in source files. =head1 DESCRIPTION -Scans a file and returns the found licenses. +Scans a source file and returns the found licenses. =head1 COPYRIGHT AND LICENSE diff --git a/man/ninka.1 b/man/ninka.1 deleted file mode 100644 index 9cd2d57..0000000 --- a/man/ninka.1 +++ /dev/null @@ -1,83 +0,0 @@ -.TH NINKA 1.3 "May 2015" ninka -.SH NAME -ninka \- source file license identification tool -.SH SYNOPSYS -.SY ninka -.OP \-vfCcSsGgTtLd -.OP \-\- -.RI [ file ] -.YS - -.SH DESCRIPTION - -Analyses source files to determine the license they fall under. Takes a source -file as input and outputs the file's license. 
- -.SH OPTIONS - -.IP \-v -verbose - -.IP \-f -force all processing - -.IP \-C -force creation of comments -.IP \-c -stop after creation of comments - -.IP \-S -force creation of sentences -.IP \-s -stop after creation of sentences - -.IP \-G -force creation of goodsent -.IP \-g -stop after creation of goodsent - -.IP \-T -force creation of senttok -.IP \-t -stop after creation of senttok - -.IP \-L -force creation of matching - -.IP \-d -delete intermediate files - -.IP \-\- -Stop processing options - -.SH EXAMPLES - -.TP -\fBninka\fR \fIfoo.c\fR -Determine the licenses in file foo.c - -.TP -.BI ninka\ \-d \ foo.c -Determine the license in file foo.c and delete intermediary files - -.TP -find * | xargs \-n1 \-I@ \fBninka\fR '@' -Determine the licenses of files in a directory. - - -.SH AUTHOR - -\fBninka\fR was written by Daniel M. German and Yuki Manabe -. ninka itself is licensed under the AGPLv3+. This -manpage was written by Ryan Kavanagh for the Debian -project and is also licensed under the AGPLv3+. - -.SH SEE ALSO - -Daniel M. German, Yuki Manabe and Katsuro Inoue. A sentence-matching method -for automatic license identification of source code files. In 25nd IEEE/ACM -International Conference on Automated Software Engineering (ASE 2010). - -You can email Daniel M. 
German for a copy or download it from -.UR http://turingmachine.org/~dmg/papers/dmg2010ninka.pdf -.UE -- cgit v1.2.1 From 794490b0bb279cd2f6d673aecdf68c653a7a9dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scheibe?= Date: Thu, 4 Jun 2015 18:27:27 +0200 Subject: fix typo --- lib/Ninka/SentenceExtractor.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Ninka/SentenceExtractor.pm b/lib/Ninka/SentenceExtractor.pm index e476463..6aeee4e 100644 --- a/lib/Ninka/SentenceExtractor.pm +++ b/lib/Ninka/SentenceExtractor.pm @@ -93,7 +93,7 @@ sub execute { $count2++ if ($c ge 'A' && $c le 'z'); } my $clean_sentence = clean_sentence($sentence); - push @clean_sentences, $clean_sentence if $clean_sentence, + push @clean_sentences, $clean_sentence if $clean_sentence; } if ($count1 != $count2) { -- cgit v1.2.1