diff options
Diffstat (limited to 'matcher/matcher.pl')
-rwxr-xr-x | matcher/matcher.pl | 355 |
1 files changed, 228 insertions, 127 deletions
diff --git a/matcher/matcher.pl b/matcher/matcher.pl index e85dff3..29f62ec 100755 --- a/matcher/matcher.pl +++ b/matcher/matcher.pl @@ -36,6 +36,8 @@ my %NonCriticalRules ; # once we have matched a rule, these are not that important +my @generalNonCritical = ('AllRights'); + my @gplNonCritical = ('GPLnoVersion', 'FSFwarranty', 'LibraryGPLcopyVer0', @@ -101,89 +103,94 @@ my $interrules= $path . "interrules.dict"; die "Usage $0 <filename>.sentences" unless $ARGV[0] =~ /\.senttok$/; -open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found."); -open (RULES, "<$rules") or die ("Error: rules.dict is not found."); -open (IRULES, "<$interrules") or die ("Error: interrules.dict is not found."); # read rules -my @rulelist=(); -my @interrulelist=(); -my @licSentNames=(); my $countUnknowns = 0; -my $sentence; -while ($sentence=<RULES>){ - chomp $sentence; - #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){ - next; - } - $sentence =~ /(.*?):(.*)/; - push (@rulelist,[$1,$2]); -} -#print $rulelist; -#for my $ref( @rulelist ){ -# no strict "refs"; -# print "@$ref\n"; -# } +# read the rules -close RULES; +my @rulelist = Read_Rules($rules); -while ($sentence=<IRULES>){ - chomp $sentence; - #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){ - next; - } - foreach my $item (split(/\|/,$2)){ - push (@interrulelist,[$item,$1]); - } -} +my @interRuleList = Read_Inter_Rules($interrules); -close IRULES; + +my @licSentNames=(); +my @original; + +Read_Original($ARGV[0], \@licSentNames, \@original); + + +#foreach my $x (@licSentNames) { +# print "$x\n"; +#} +#exit; + +#foreach my $x (@original) { +# print "$x\n"; +#} +#exit; ########################################## -#for my $ref( @interrulelist ){ -# print "@$ref\n"; +#for my $ref( @interRuleList ){ +# print "@$ref\n"; #} -# matching -# 1. read senttok file -my @original; -while ($sentence = <INPUTFILE>){ - #check format - #chomp $sentence; - if ($sentence =~ /^(.*?)[\n,]/){ - if ($1 ne "UNKNOWN"){ - } else { - $countUnknowns++; +# matching spdx requires to match strict licenses, with no alternatives... + +my $senttok= "," . join(",",@licSentNames) . ","; +my @result=(); +my $countMatches = 0; + +Match_License(); + +# do we have to check again? +## todo, verifythat we have unmatched sentences... + +@licSentNames = split(',', $senttok); + +# first remove the extrict part from it + +#Print_Result(); + +my $match = 0; +for (my $i=0;$i<=$#licSentNames ;$i++) { + if ($licSentNames[$i] == 0 and + ($licSentNames[$i] ne "UNKNOWN" and + $licSentNames[$i] ne "")) { +# print "[$licSentNames[$i]]\n"; + $licSentNames[$i] =~ s/Extrict$//; + $match ++; } - push (@licSentNames,$1); - } - chomp $sentence; - push (@original, $sentence); } -if (scalar(@original) == 0) { - print "NONE\n"; - exit 0; -} - -#print join(";",@licSentNames)."\n"; -close INPUTFILE; +#Print_Result(); -# 2. replace -for (my $i=0;$i<=$#interrulelist ;$i++){ - #for my $ref( @interrulelist[$i]){ - # print "@$ref\n"; - #} - #print $interrulelist[$i][0]; - @licSentNames = map { $_ eq $interrulelist[$i][0] ? $interrulelist[$i][1] : $_ } @licSentNames; + +if ($match > 0) { +# print "REDO\n"; + for (my $i=0;$i<=$#interRuleList ;$i++){ + #for my $ref( @interRuleList[$i]){ + # print "@$ref\n"; + #} + #print $interRuleList[$i][0]; + @licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames; + } + + $senttok= join(",",@licSentNames) . ','; + + Match_License(); } +Print_Result(); + + +exit 0; + + + #print @licSentNames; #print join(";",@licSentNames)."\n"; @@ -193,99 +200,193 @@ for (my $i=0;$i<=$#interrulelist ;$i++){ # we will iterate over rules, matching as many as we can... -my @result=(); -# create a string with the sentences -my $senttok= "," . join(",",@licSentNames) . ","; -#print STDERR "\nStarting>>>>$senttok\n"; -for (my $j=0;$j<=$#rulelist;$j++){ +sub Is_Unknown +{ + my ($s) = @_; + my @f = split (/,/, $s); + return $f[0] eq "UNKNOWN"; +} + + +sub Read_Rules +{ + my ($rulesF) = @_; + open (RULES, "<$rulesF") or die ("Error: rules.dict is not found."); + my $sentence; + my @rules = (); + while ($sentence=<RULES>){ + chomp $sentence; + #check format + if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){ + next; + } + $sentence =~ /(.*?):(.*)/; + push (@rules,[$1,$2]); + } + close RULES; + return @rules; +} + + +sub Read_Inter_Rules +{ + my ($interrules) = @_; + + my @list; + open (IRULES, "<$interrules") or die ("Error: interrules.dict is not found."); + my $sentence; + while ($sentence=<IRULES>){ + chomp $sentence; + #check format + if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){ + next; + } + foreach my $item (split(/\|/,$2)){ + push (@list,[$item,$1]); + } + } + close IRULES; + return @list; +} + +sub Read_Original +{ + my ($inputF, $tokens, $originals) = @_; + + open (INPUTFILE, $inputF) or die ("Error: $inputF is not found."); + + my $sentence; + my @original; + while ($sentence = <INPUTFILE>){ + chomp $sentence; + my @fields = split(':',$sentence); + push(@$originals,$fields[1]); + my @token = split(';', $fields[0]); + push(@$tokens,$token[0]); + } + if (scalar(@$originals) == 0) { + print "NONE\n"; + exit 0; + } - my $rule=$rulelist[$j][1]; - my $rulename=$rulelist[$j][0]; +#print join(";",@licSentNames)."\n"; + + close INPUTFILE; +} + +sub Match_License +{ + +# create a string with the sentences - while ($senttok =~ s/,${rule},/,/){ - push (@result,$rulename); + for (my $j=0;$j<=$#rulelist;$j++){ + + my $rule=$rulelist[$j][1]; + my $rulename=$rulelist[$j][0]; + my $lenRule = scalar(split(',', $rule)); + # replace rule with the length of the rule + while ($senttok =~ s/,${rule},/,$lenRule,/){ + $countMatches ++; + push (@result,$rulename); # print ">>>>$senttok|$rulelist[$j][1]\n"; # print "Result: ", join(',', @result); # print "\n"; + } } -} + +# print ">>>>[$senttok]\n"; + + my $onlyAllRight = 0; # ok, at this point we have removed all the matched sentences... #print STDERR "Ending>>>>>>>$senttok\n"; #print STDERR "Size>>" , scalar(@result), "\n"; #print STDERR "Result>>", join(',', @result), "\n"; - + # let us remove allrights -my $onlyAllRight = 1; -for my $i (0.. scalar(@licSentNames)-1){ - if (($licSentNames[$i] eq "AllRights")) { - $licSentNames[$i] = ''; - } else { - $onlyAllRight = 0; - } -} +# my $onlyAllRight = 1; +# for my $i (0.. scalar(@licSentNames)-1){ +# if (($licSentNames[$i] eq "AllRights")) { +# $licSentNames[$i] = ''; +# } else { +# $onlyAllRight = 0; +# } +# } # output result -if (scalar(@result) > 0){ - # at this point we have matched - - - # let us clean up the rules... let us print the matched rules, and the + if (scalar(@result) > 0){ + # at this point we have matched + + + # let us clean up the rules... let us print the matched rules, and the # if (grep(/GPL/, @result)) { # print "GPL...\n"; # foreach my $r ($NonCriticalRules{GPL}) { # $senttok =~ s/(,|^)$r(,|$)/$1$2/g; # } # } - foreach my $res (@result) { - my $temp = $NonCriticalRules{$res}; - foreach my $r (@$temp) { -# print ">>Senttok [$r][$senttok]\n"; - while ($senttok =~ s/(,|^)$r(,|$)/$1$2/g) { + # general removal of rules + + + foreach my $r (@generalNonCritical) { + while ($senttok =~ s/,$r,/,-1,/) { ; } } - } - - # we also want to remove any rule contains allrights - $senttok =~ s/AllRights(,?)/$1/g; - $senttok =~ s/UNKNOWN,/,/g; - $senttok =~ s/,+/,/g; - - print join(',',@result), ";$senttok;$countUnknowns\n"; - - -}else{ - - # if it contains only AllRights there it is o'right - # at this point there is at least one rule - - # let us remove the non important sentences... by making them empty - # on this array... - if ($onlyAllRight) { - print "NONE;\n"; - } elsif ($countUnknowns != 0) { - print "UNMATCHED \[", join (',',@original), "\]\n"; - } else { - my $t = join (',',@original); - $t =~ s/;/<SEMI>/g; - print "UNKNOWN [$t];"; - my $t = join (',',@licSentNames); - $t =~ s/;/<SEMI>/g; - print "UKNSIMP [$t]"; - print "\n"; +# print "[$senttok]\n"; + + foreach my $res (@result) { + my $temp = $NonCriticalRules{$res}; + foreach my $r (@$temp) { +# print ">>Senttok [$r][$senttok]\n"; + while ($senttok =~ s/,$r,/,-1,/g) { + ; + } + } + } +# print "[$senttok]\n"; } } -sub Is_Unknown + +sub Print_Result { - my ($s) = @_; - my @f = split (/,/, $s); - return $f[0] eq "UNKNOWN"; +# $senttok =~ s/AllRights(,?)/$1/g; +# $senttok =~ s/UNKNOWN,/,/g; +# $senttok =~ s/,+/,/g; + + my $save = $senttok; + # ok, so now, what I want to output it: + # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown + my @sections = split(',', $senttok); + die "assertion 1" if $sections[0] ne ""; + die "assertion 2" if $sections[scalar(@sections)] ne ""; + + my $ignoredLines = 0; + my $licenseLines = 0; + my $unknownLines = 0; + my $unmatchedLines = 0; + foreach my $i (1..scalar(@sections)-1) { +# print "$i;$sections[$i]\n"; + if ($sections[$i] < 0) { + $ignoredLines += - $sections[$i]; + } elsif ($sections[$i] != 0) { + $licenseLines += $sections[$i]; + } elsif ($sections[$i] eq "UNKNOWN") { + $unknownLines ++; + } else { + $unmatchedLines++; + } + } + $senttok =~ s/^,(.*),$/$1/; + +# print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n"; + + print join(',',@result), ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n"; + $senttok = $save; + } - - |