summary refs log tree commit diff
diff options
context:
space:
mode:
author dmg <dmg@uvic.ca> 2011-07-12 03:36:57 -0700
committer dmg <dmg@uvic.ca> 2011-07-12 03:36:57 -0700
commit 594d5e42a040359f3bd7e75c448aa16fbc7b5bcd (patch)
tree 600947d076084ae602bd329975b5ec859ccb01ad
parent 5028ead16791c10c8b63dc9f9db1ab9112cc7d8c (diff)
download ninka-594d5e42a040359f3bd7e75c448aa16fbc7b5bcd.tar.gz
Implemented spdx variants and changed the format of the output of matcher
-rwxr-xr-x matcher/matcher.pl 355
-rwxr-xr-x matcher/rules.dict 11
-rw-r--r-- senttok/licensesentence.dict 13
-rwxr-xr-x senttok/senttok.pl 16
4 files changed, 260 insertions, 135 deletions
diff --git a/matcher/matcher.pl b/matcher/matcher.pl
index e85dff3..29f62ec 100755
--- a/matcher/matcher.pl
+++ b/matcher/matcher.pl
@@ -36,6 +36,8 @@ my %NonCriticalRules ;
# once we have matched a rule, these are not that important
+my @generalNonCritical = ('AllRights');
+
my @gplNonCritical = ('GPLnoVersion',
'FSFwarranty',
'LibraryGPLcopyVer0',
@@ -101,89 +103,94 @@ my $interrules= $path . "interrules.dict";
die "Usage $0 <filename>.sentences" unless $ARGV[0] =~ /\.senttok$/;
-open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found.");
-open (RULES, "<$rules") or die ("Error: rules.dict is not found.");
-open (IRULES, "<$interrules") or die ("Error: interrules.dict is not found.");
# read rules
-my @rulelist=();
-my @interrulelist=();
-my @licSentNames=();
my $countUnknowns = 0;
-my $sentence;
-while ($sentence=<RULES>){
- chomp $sentence;
- #check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){
- next;
- }
- $sentence =~ /(.*?):(.*)/;
- push (@rulelist,[$1,$2]);
-}
-#print $rulelist;
-#for my $ref( @rulelist ){
-# no strict "refs";
-# print "@$ref\n";
-# }
+# read the rules
-close RULES;
+my @rulelist = Read_Rules($rules);
-while ($sentence=<IRULES>){
- chomp $sentence;
- #check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){
- next;
- }
- foreach my $item (split(/\|/,$2)){
- push (@interrulelist,[$item,$1]);
- }
-}
+my @interRuleList = Read_Inter_Rules($interrules);
-close IRULES;
+
+my @licSentNames=();
+my @original;
+
+Read_Original($ARGV[0], \@licSentNames, \@original);
+
+
+#foreach my $x (@licSentNames) {
+# print "$x\n";
+#}
+#exit;
+
+#foreach my $x (@original) {
+# print "$x\n";
+#}
+#exit;
##########################################
-#for my $ref( @interrulelist ){
-# print "@$ref\n";
+#for my $ref( @interRuleList ){
+# print "@$ref\n";
#}
-# matching
-# 1. read senttok file
-my @original;
-while ($sentence = <INPUTFILE>){
- #check format
- #chomp $sentence;
- if ($sentence =~ /^(.*?)[\n,]/){
- if ($1 ne "UNKNOWN"){
- } else {
- $countUnknowns++;
+# matching spdx requires matching strict licenses, with no alternatives...
+
+my $senttok= "," . join(",",@licSentNames) . ",";
+my @result=();
+my $countMatches = 0;
+
+Match_License();
+
+# do we have to check again?
+## todo, verify that we have unmatched sentences...
+
+@licSentNames = split(',', $senttok);
+
+# first remove the "Extrict" suffix from the remaining sentence names
+
+#Print_Result();
+
+my $match = 0;
+for (my $i=0;$i<=$#licSentNames ;$i++) {
+ if ($licSentNames[$i] == 0 and
+ ($licSentNames[$i] ne "UNKNOWN" and
+ $licSentNames[$i] ne "")) {
+# print "[$licSentNames[$i]]\n";
+ $licSentNames[$i] =~ s/Extrict$//;
+ $match ++;
}
- push (@licSentNames,$1);
- }
- chomp $sentence;
- push (@original, $sentence);
}
-if (scalar(@original) == 0) {
- print "NONE\n";
- exit 0;
-}
-
-#print join(";",@licSentNames)."\n";
-close INPUTFILE;
+#Print_Result();
-# 2. replace
-for (my $i=0;$i<=$#interrulelist ;$i++){
- #for my $ref( @interrulelist[$i]){
- # print "@$ref\n";
- #}
- #print $interrulelist[$i][0];
- @licSentNames = map { $_ eq $interrulelist[$i][0] ? $interrulelist[$i][1] : $_ } @licSentNames;
+
+if ($match > 0) {
+# print "REDO\n";
+ for (my $i=0;$i<=$#interRuleList ;$i++){
+ #for my $ref( @interRuleList[$i]){
+ # print "@$ref\n";
+ #}
+ #print $interRuleList[$i][0];
+ @licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames;
+ }
+
+ $senttok= join(",",@licSentNames) . ',';
+
+ Match_License();
}
+Print_Result();
+
+
+exit 0;
+
+
+
#print @licSentNames;
#print join(";",@licSentNames)."\n";
@@ -193,99 +200,193 @@ for (my $i=0;$i<=$#interrulelist ;$i++){
# we will iterate over rules, matching as many as we can...
-my @result=();
-# create a string with the sentences
-my $senttok= "," . join(",",@licSentNames) . ",";
-#print STDERR "\nStarting>>>>$senttok\n";
-for (my $j=0;$j<=$#rulelist;$j++){
+sub Is_Unknown
+{
+ my ($s) = @_;
+ my @f = split (/,/, $s);
+ return $f[0] eq "UNKNOWN";
+}
+
+
+sub Read_Rules
+{
+ my ($rulesF) = @_;
+ open (RULES, "<$rulesF") or die ("Error: rules.dict is not found.");
+ my $sentence;
+ my @rules = ();
+ while ($sentence=<RULES>){
+ chomp $sentence;
+ #check format
+ if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){
+ next;
+ }
+ $sentence =~ /(.*?):(.*)/;
+ push (@rules,[$1,$2]);
+ }
+ close RULES;
+ return @rules;
+}
+
+
+sub Read_Inter_Rules
+{
+ my ($interrules) = @_;
+
+ my @list;
+ open (IRULES, "<$interrules") or die ("Error: interrules.dict is not found.");
+ my $sentence;
+ while ($sentence=<IRULES>){
+ chomp $sentence;
+ #check format
+ if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){
+ next;
+ }
+ foreach my $item (split(/\|/,$2)){
+ push (@list,[$item,$1]);
+ }
+ }
+ close IRULES;
+ return @list;
+}
+
+sub Read_Original
+{
+ my ($inputF, $tokens, $originals) = @_;
+
+ open (INPUTFILE, $inputF) or die ("Error: $inputF is not found.");
+
+ my $sentence;
+ my @original;
+ while ($sentence = <INPUTFILE>){
+ chomp $sentence;
+ my @fields = split(':',$sentence);
+ push(@$originals,$fields[1]);
+ my @token = split(';', $fields[0]);
+ push(@$tokens,$token[0]);
+ }
+ if (scalar(@$originals) == 0) {
+ print "NONE\n";
+ exit 0;
+ }
- my $rule=$rulelist[$j][1];
- my $rulename=$rulelist[$j][0];
+#print join(";",@licSentNames)."\n";
+
+ close INPUTFILE;
+}
+
+sub Match_License
+{
+
+# create a string with the sentences
- while ($senttok =~ s/,${rule},/,/){
- push (@result,$rulename);
+ for (my $j=0;$j<=$#rulelist;$j++){
+
+ my $rule=$rulelist[$j][1];
+ my $rulename=$rulelist[$j][0];
+ my $lenRule = scalar(split(',', $rule));
+ # replace rule with the length of the rule
+ while ($senttok =~ s/,${rule},/,$lenRule,/){
+ $countMatches ++;
+ push (@result,$rulename);
# print ">>>>$senttok|$rulelist[$j][1]\n";
# print "Result: ", join(',', @result);
# print "\n";
+ }
}
-}
+
+# print ">>>>[$senttok]\n";
+
+ my $onlyAllRight = 0;
# ok, at this point we have removed all the matched sentences...
#print STDERR "Ending>>>>>>>$senttok\n";
#print STDERR "Size>>" , scalar(@result), "\n";
#print STDERR "Result>>", join(',', @result), "\n";
-
+
# let us remove allrights
-my $onlyAllRight = 1;
-for my $i (0.. scalar(@licSentNames)-1){
- if (($licSentNames[$i] eq "AllRights")) {
- $licSentNames[$i] = '';
- } else {
- $onlyAllRight = 0;
- }
-}
+# my $onlyAllRight = 1;
+# for my $i (0.. scalar(@licSentNames)-1){
+# if (($licSentNames[$i] eq "AllRights")) {
+# $licSentNames[$i] = '';
+# } else {
+# $onlyAllRight = 0;
+# }
+# }
# output result
-if (scalar(@result) > 0){
- # at this point we have matched
-
-
- # let us clean up the rules... let us print the matched rules, and the
+ if (scalar(@result) > 0){
+ # at this point we have matched
+
+
+ # let us clean up the rules... let us print the matched rules, and the
# if (grep(/GPL/, @result)) {
# print "GPL...\n";
# foreach my $r ($NonCriticalRules{GPL}) {
# $senttok =~ s/(,|^)$r(,|$)/$1$2/g;
# }
# }
- foreach my $res (@result) {
- my $temp = $NonCriticalRules{$res};
- foreach my $r (@$temp) {
-# print ">>Senttok [$r][$senttok]\n";
- while ($senttok =~ s/(,|^)$r(,|$)/$1$2/g) {
+ # general removal of rules
+
+
+ foreach my $r (@generalNonCritical) {
+ while ($senttok =~ s/,$r,/,-1,/) {
;
}
}
- }
-
- # we also want to remove any rule contains allrights
- $senttok =~ s/AllRights(,?)/$1/g;
- $senttok =~ s/UNKNOWN,/,/g;
- $senttok =~ s/,+/,/g;
-
- print join(',',@result), ";$senttok;$countUnknowns\n";
-
-
-}else{
-
- # if it contains only AllRights there it is o'right
- # at this point there is at least one rule
-
- # let us remove the non important sentences... by making them empty
- # on this array...
- if ($onlyAllRight) {
- print "NONE;\n";
- } elsif ($countUnknowns != 0) {
- print "UNMATCHED \[", join (',',@original), "\]\n";
- } else {
- my $t = join (',',@original);
- $t =~ s/;/<SEMI>/g;
- print "UNKNOWN [$t];";
- my $t = join (',',@licSentNames);
- $t =~ s/;/<SEMI>/g;
- print "UKNSIMP [$t]";
- print "\n";
+# print "[$senttok]\n";
+
+ foreach my $res (@result) {
+ my $temp = $NonCriticalRules{$res};
+ foreach my $r (@$temp) {
+# print ">>Senttok [$r][$senttok]\n";
+ while ($senttok =~ s/,$r,/,-1,/g) {
+ ;
+ }
+ }
+ }
+# print "[$senttok]\n";
}
}
-sub Is_Unknown
+
+sub Print_Result
{
- my ($s) = @_;
- my @f = split (/,/, $s);
- return $f[0] eq "UNKNOWN";
+# $senttok =~ s/AllRights(,?)/$1/g;
+# $senttok =~ s/UNKNOWN,/,/g;
+# $senttok =~ s/,+/,/g;
+
+ my $save = $senttok;
+ # ok, so now, what I want to output is:
+ # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown;remaining sentence tokens
+ my @sections = split(',', $senttok);
+ die "assertion 1" if $sections[0] ne "";
+ die "assertion 2" if $sections[scalar(@sections)] ne "";
+
+ my $ignoredLines = 0;
+ my $licenseLines = 0;
+ my $unknownLines = 0;
+ my $unmatchedLines = 0;
+ foreach my $i (1..scalar(@sections)-1) {
+# print "$i;$sections[$i]\n";
+ if ($sections[$i] < 0) {
+ $ignoredLines += - $sections[$i];
+ } elsif ($sections[$i] != 0) {
+ $licenseLines += $sections[$i];
+ } elsif ($sections[$i] eq "UNKNOWN") {
+ $unknownLines ++;
+ } else {
+ $unmatchedLines++;
+ }
+ }
+ $senttok =~ s/^,(.*),$/$1/;
+
+# print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n";
+
+ print join(',',@result), ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n";
+ $senttok = $save;
+
}
-
-
diff --git a/matcher/rules.dict b/matcher/rules.dict
index 37d9f85..cec925b 100755
--- a/matcher/rules.dict
+++ b/matcher/rules.dict
@@ -69,6 +69,15 @@ CDDLv1orGPLv2:CDDLorGPLv2,CDDLorGPLv2compliance,CDDLorGPLv2where,ApachesPermLim,
# this one should go before the BSD ones
intelBSDLicense:BSDpre,BSDcondSource,BSDcondBinary,BSDcondEndorseRULE,BSDasIs,BSDWarr,intelBSDexport1,intelBSDexport2,intelBSDexport3
+# this rule should be before BSDs
+spdxSleepyCat:SleepyCatNameExtrict,AllRights,BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,SleepyCatObtain,SleepyCatSourceIncluded,SleepyCatSourceComplete,SleepyCatDoesNotInclude,SleepyCatAsIs,BSDWarrExtrict,AllRights,BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDcondEndorseExtrict,BSDasIsExtrict,BSDWarrExtrict,AllRights,BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDcondEndorseExtrict,BSDasIsExtrict,BSDWarrExtrict
+
+
+spdxBSD4:BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDcondAdvPart1Extrict,BSDcondAdvPart2Extrict,BSDcondEndorseExtrict,BSDasIsExtrict,BSDWarrExtrict
+spdxBSD3:BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDcondEndorseExtrict,BSDasIsExtrict,BSDWarrExtrict
+spdxBSD2:BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDasIsExtrict,BSDWarrExtrict
+#,SleepyCatObtain,SleepyCatObtain,SleepyCatSourceIncluded,SleepyCatSourceComplete
+#,BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,BSDcondEndorseExtrict,BSDasIsExtrict,BSDWarrExtrict,BSDpre,BSDcondSourceExtrict,BSDcondBinaryExtrict,SleepyCatAsIs,BSDWarrExtrict
BSD4:BSDpre,BSDcondSource,BSDcondBinary,BSDcondAdvPart1,BSDcondAdvPart2,BSDcondEndorseRULE,BSDasIs,BSDWarr
@@ -115,6 +124,8 @@ Apachev1.1:BSDpre,BSDcondSource,BSDcondBinary,BSDcondAdvPart1,BSDcondAdvPart2,Op
Apachev1.1:BSDpre,BSDcondSource,BSDcondBinary,BSDcondAdvPart2,OpenSSLendorse,OpenSSLwritCond,OpenSSLName,BSDasIs,BSDWarr
SleepyCat:BSDpre,BSDcondSource,BSDcondBinary,SleepyCatObtain,SleepyCatSourceIncluded,SleepyCatSourceComplete,SleepyCatDoesNotInclude,SleepyCatAsIs,BSDWarr
+
+
boost:boostPermission,boostPreserve,boostAsIs,boostWarr
boostV1:boostRefv1
SSLeay:SSLCopy,SSLeayAttrib,SSLeayAdType,BSDpre,BSDcondSource,BSDcondBinary,BSDcondAdvRULE,SSLeayCrypto,SSLeayWindows,BSDasIs,BSDWarr,SSLeayCantChangeLic
diff --git a/senttok/licensesentence.dict b/senttok/licensesentence.dict
index 265cc79..ceddb98 100644
--- a/senttok/licensesentence.dict
+++ b/senttok/licensesentence.dict
@@ -30,6 +30,7 @@ GPLGen:14:1:^([^,;]+) is free software; you can distribute it and/or modify it u
GPLGen:20:1:^([^,;]+) is free software; you can redistribute it and/or modify it under the terms of the GPL(,|;)? <version>$:
GPLGen:22:1:^([^,;]+) is free software; you can redistribute it and/or modify it under the terms of <VERSION> of the GPL$:
GPLGen:23:1:^([^,;]+) is free software; you can redistribute it and/or modify it under the terms of the GPL; <VERSION>:
+GPLGen:24:1:^([^,;]+) is <LICENSED> under the terms of the GPL <VERSION>, and can be copied, distributed, and modified under those terms:
LesserGPLGen:24:0:^you can redistribute it and/or modify it under the terms of the Lesser GPL <VERSION>:
LesserGPLGen:23:1:^([^,;]+) is free software; you can redistribute it and/or modify it under the terms of the Lesser GPL \(as published by the Free Software Foundation\) <VERSION>$:
LesserGPLGenv3or:10:1:This program is free software; you can redistribute it and/or modify it under the terms of the Lesser GPL; either <VERSION>, or (at your option) version 3
@@ -199,6 +200,17 @@ XXXLeGPLseeDetails:24:0:See the GNU Lesser General Public License for more detai
XXXLiGPLseeDetails:24:0:See the GNU Library General Public License for more details:
###########################
# BSDs
+# first extrict rules
+BSDcondSourceExtrict:10:0:Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer:
+BSDcondBinaryExtrict:10:0:Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution:
+BSDcondAdvPart1Extrict:10:0:All advertising materials mentioning features or use of this software must display the following acknowledgement:
+BSDcondAdvPart2Extrict:10:1:This product includes software developed by (.+):
+BSDcondEndorseExtrict:10:1:Neither the name of the ([^;]+) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission:
+BSDasIsExtrict:10:4:THIS SOFTWARE IS PROVIDED BY (.+)<quotes>AS IS<quotes> AND ANY EXPRESS(ED)? OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED:
+BSDWarrExtrict:10:2:IN NO EVENT SHALL (.+) BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES \(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION\) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT \(INCLUDING NEGLIGENCE OR OTHERWISE\) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE:
+
+
+###
BSDlike:10:0:^This file and program are licensed under a BSD style license$
BSDpreLike:70:0:Redistribution and use in source and binary forms are permitted provided that the above copyright notice and this paragraph are duplicated in all such forms and that any documentation, advertising materials, and other materials related to such distribution and use acknowledge that the software was developed by:
BSDpre:70:0:Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -234,6 +246,7 @@ BSDZlibAltered:70:0:Altered source versions must be plainly marked as such, and
OpenSSLName:70:3:Products derived from this software may not be called (.+) nor may (.+) appear in their name(s)?,? without prior written permission of the (.+)$:
OpenSSLAckPart1:70:1:Redistributions of any form whatsoever must retain the following acknowledgment:
OpenSSLAckPart2:70:1:<quotes>This product includes software developed by ([^,;]+)<quotes>:
+SleepyCatNameExtrict:10:0:<quotes>The Sleepycat License Copyright \(c\) 1990\-1999 Sleepycat Software:
SleepyCatObtain:70:1:Redistributions in any form must be accompanied by information on how to obtain complete source code for the ([^,;]+):
SleepyCatSourceIncluded:70:0:The source code must either be included in the distribution or be available for no more than the cost of distribution plus a nominal fee, and must be freely redistributable under reasonable conditions:
SleepyCatSourceComplete:70:0:For an executable file, complete source code means the source code for all modules it contains:
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
index a11adac..543f818 100755
--- a/senttok/senttok.pl
+++ b/senttok/senttok.pl
@@ -20,7 +20,6 @@ use strict;
my $TOO_LONG = 70;
-
# where are we running the splitter from?
my $path = $0;
$path =~ s/[^\/]+$//;
@@ -46,8 +45,10 @@ while ($line = <LICENSESENTENCEFILE>){
#}
close LICENSESENTENCEFILE;
while ($line = <>){
+ my $saveLine;
my $originalLine;
chomp $line;
+ $originalLine = $line;
if ($line =~ s/^Alternatively,? ?//) {
print "Altern\n";
@@ -67,7 +68,7 @@ while ($line = <>){
my $gpl = 0;
my ($gplLater, $gplVersion);
- $originalLine = $line;
+ $saveLine = $line;
# print "Original
# [$line]
@@ -95,7 +96,7 @@ while ($line = <>){
again:
# print "Testing
# lin[$line]
-# ori[$originalLine]
+# ori[$saveLine]
# re [$regexp]
# lpg[$LGPL]
#\n";
@@ -119,7 +120,7 @@ while ($line = <>){
}
if ($gpl) {
$gpl = 0;
- $line = $originalLine;
+ $line = $saveLine;
goto again;
}
next;## dmg
@@ -145,15 +146,14 @@ while ($line = <>){
length($after) >$TOO_LONG) {
$matchname .= "-TOOLONG";
}
-
- my $parmstrings=join(",",$matchname, $subRule, $before, $after, @parm);
- print $parmstrings,"\n";
+ my $parmstrings=join(";",$matchname, $subRule, $before, $after, @parm);
+ print $parmstrings,":$originalLine\n";
}else{
#UNKNOWN, sentence
chomp $line;
- print $matchname,",",0, ",", $mostsimilarname,",",$distance,",",$line,",<------------>[$originalLine][$lineAsGPL]\n";
+ print $matchname,";",0, ";", $mostsimilarname,";",$distance,";",$saveLine,":$originalLine\n";
}
}