diff options
Diffstat (limited to 'senttok/senttok.pl')
-rwxr-xr-x | senttok/senttok.pl | 373 |
1 files changed, 373 insertions, 0 deletions
diff --git a/senttok/senttok.pl b/senttok/senttok.pl new file mode 100755 index 0000000..7a20f59 --- /dev/null +++ b/senttok/senttok.pl @@ -0,0 +1,373 @@ +#!/usr/bin/perl +# +# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +use strict; + + +my $TOO_LONG = 70; + + +# where are we running the splitter from? +my $path = $0; +$path =~ s/[^\/]+$//; +if ($path eq "") { + $path = "./"; +} +my $licSentences = $path . "licensesentence.dict"; + +open FH, "<$ARGV[0]"; +my @licensesentencelist=(); +open LICENSESENTENCEFILE, "<$licSentences"; +my $line; +while ($line = <LICENSESENTENCEFILE>){ + chomp $line; + next if $line =~ /^\#/; + next if $line =~ /^ *$/; + die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/; + push @licensesentencelist,$line; +} + +#foreach $line (@licensesentencelist) { +# print $line; +#} +close LICENSESENTENCEFILE; +while ($line = <>){ + my $originalLine; + chomp $line; + + if ($line =~ s/^Alternatively,? ?//) { + print "Altern\n"; + } + + $line = Normalize_Sentence($line); + + + my $check=0; + my $matchname="UNKNOWN"; + my @parm=(); + my $sentence; + my $distance=1; #maximum? number + my $mostsimilarname="UNKNOWN"; + my $before; + my $after; + my $gpl = 0; + my ($gplLater, $gplVersion); + + $originalLine = $line; + +# print "Original +# [$line] +#\n"; + + + my $lineAsGPL =''; + + if (Looks_Like_GPL($line)) { + my $old = $line; + $gpl = 1; + ($line, $gplLater, $gplVersion) = Normalize_GPL($line); + $lineAsGPL = $line; + } + my ($name, $subRule, $number, $regexp, $option); + my $saveLine = $line; + my $saveGPL = $gpl; + my $LGPL = ""; + foreach $sentence (@licensesentencelist) { + ($name, $subRule, $number, $regexp, $option) = split(/:/, $sentence); + # we need this due to the goto again + $line = $saveLine; + $gpl = $saveGPL; + $LGPL = ""; + again: +# print "Testing +# lin[$line] +# ori[$originalLine] +# re [$regexp] +# lpg[$LGPL] +#\n"; + if ( $line =~ /$regexp/im ){ + $before = $`; + $after = $'; #'; + $check=1; + $matchname=$name; + for (my $i = 1; $i <= $number; $i++){ + no strict 'refs'; + push @parm,$$i; + } + last; + } else{ +# print "NO MATCH\n"; + # let us try again in cas it is lesser/library + # do it only once + if ($gpl and $line =~ s/(Lesser|Library) GPL/GPL/i) { + $LGPL = $1; + goto again; + } + if ($gpl) { + $gpl = 0; + $line = $originalLine; + goto again; + } + next;## dmg + my $targetset=$regexp; + $targetset =~ s/^(.*)$/$1/; + my $tmpdist=&levenshtein($line,$targetset)/max(length($targetset),length($sentence)); + if ($tmpdist<$distance){ + $mostsimilarname=$name; + $distance=$tmpdist; + } + } + last; ### + } + if ($check == 1){ + #licensesentence name, parm1, parm2,.. + if ($gpl) { + $matchname .= "Ver" . $gplVersion; + $matchname .= "+" if $gplLater; + $matchname = $LGPL . $matchname; + } else { + } + if (length($before)>$TOO_LONG || + length($after) >$TOO_LONG) { + $matchname .= "-TOOLONG"; + } + + my $parmstrings=join(",",$matchname, $subRule, $before, $after, @parm); + print $parmstrings,"\n"; + + + }else{ + #UNKNOWN, sentence + chomp $line; + print $matchname,",",0, ",", $mostsimilarname,",",$distance,",",$line,",<------------>[$originalLine][$lineAsGPL]\n"; + } + +} + +close FH; +exit 0; + +sub Normalize_GPL +{ + my ($line) = @_; + my $later = 0; + my $version = 0; + + # do some very quick spelling corrections for english/british words + $line =~ s/Version 2,? \(June 1991\)/Version 2/gi; + $line =~ s/Version 2,? dated June 1991/Version 2/gi; + $line =~ s/Version 2\.1,? dated February 1999/Version 2.1/gi; + if ($line =~ s/,? or \(?at your option\)?,? any later version//i) { + $later = 1; + } + if ($line =~ s/, or any later version//i) { + $later = 1; + } + if ($line =~ s/ or (greater|later)//i) { + $later = 1; + } + if ($line =~ s/or (greater|later) //i) { + $later = 1; + } + if ($line =~ s/(version|v\.?) ([123\.0]+)/<VERSION>/i) { + $version = $2; +# print "Version [$version]\n"; + } + if ($line =~ s/GPL ?[v\-]([123\.0]+)/GPL <VERSION>/i) { + $version = $1; + } + if ($line =~ s/v\.?([123\.0]+)/<VERSION>/i) { + $version = $1; + } + + $line =~ s/(distributable|licensed|released|made available)/<LICENSED>/ig; + $line =~ s/Library General Public License/Library General Public License/ig; + $line =~ s/Lesser General Public License/Lesser General Public License/ig; + + $line =~ s/General Public License/GPL/gi; + $line =~ s/GPL \(GPL\)/GPL/gi; + $line =~ s/GPL \(<QUOTES>GPL<QUOTES>\)/GPL/gi; + + + $line =~ s/GNU //gi; + $line =~ s/under GPL/under the GPL/gi; + $line =~ s/under Lesser/under the Lesser/gi; + $line =~ s/under Library/under the Library/gi; + + $line =~ s/of GPL/of the GPL/gi; + $line =~ s/of Lesser/of the Lesser/gi; + $line =~ s/of Library/of the Library/gi; + + $line =~ s/(can|may)/can/gi; + $line =~ s/<VERSION> only/<VERSION>/gi; + $line =~ s/<VERSION> of the license/<VERSION>/gi; + $line =~ s/(<VERSION>|GPL),? as published by the Free Software Foundation/$1/gi; + $line =~ s/(<VERSION>|GPL) \(as published by the Free Software Foundation\)/$1/gi; + $line =~ s/(<VERSION>|GPL),? incorporated herein by reference/$1/gi; + $line =~ s/terms and conditions/terms/gi; + $line =~ s/GPL along with/GPL with/gi; + + $line =~ s/GPL \(<VERSION\)/GPL <VERSION>/gi; + + $line =~ s/ +/ /; + $line =~ s/ +$//; + +# print ">>>>>>>>>>$line:$later:$version\n"; + + return ($line,$later,$version); +} + +sub Looks_Like_GPL +{ + my ($line) = @_; + + return 1 if $line =~ /GNU/; + return 1 if $line =~ /General Public License/; + return 1 if $line =~ /GPL/; + + return 0; +} + + +sub Normalize_Sentence +{ + my ($line) = @_; + # do some very quick spelling corrections for english/british words + $line=~ s/icence/icense/ig; + $line=~ s/(\.|;)$//; + + return $line; +} + +# Return the Levenshtein distance (also called Edit distance) +# between two strings +# +# The Levenshtein distance (LD) is a measure of similarity between two +# strings, denoted here by s1 and s2. The distance is the number of +# deletions, insertions or substitutions required to transform s1 into +# s2. The greater the distance, the more different the strings are. +# +# The algorithm employs a proximity matrix, which denotes the distances +# between substrings of the two given strings. Read the embedded comments +# for more info. If you want a deep understanding of the algorithm, print +# the matrix for some test strings and study it +# +# The beauty of this system is that nothing is magical - the distance +# is intuitively understandable by humans +# +# The distance is named after the Russian scientist Vladimir +# Levenshtein, who devised the algorithm in 1965 +# +sub levenshtein + { + # $s1 and $s2 are the two strings + # $len1 and $len2 are their respective lengths + # + my ($s1, $s2) = @_; + my ($len1, $len2) = (length $s1, length $s2); + + # If one of the strings is empty, the distance is the length + # of the other string + # + return $len2 if ($len1 == 0); + return $len1 if ($len2 == 0); + + my %mat; + + # Init the distance matrix + # + # The first row to 0..$len1 + # The first column to 0..$len2 + # The rest to 0 + # + # The first row and column are initialized so to denote distance + # from the empty string + # + for (my $i = 0; $i <= $len1; ++$i) + { + for (my $j = 0; $j <= $len2; ++$j) + { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + + $mat{$i}{0} = $i; + } + + # Some char-by-char processing is ahead, so prepare + # array of chars from the strings + # + my @ar1 = split(//, $s1); + my @ar2 = split(//, $s2); + + for (my $i = 1; $i <= $len1; ++$i) + { + for (my $j = 1; $j <= $len2; ++$j) + { + # Set the cost to 1 iff the ith char of $s1 + # equals the jth of $s2 + # + # Denotes a substitution cost. When the char are equal + # there is no need to substitute, so the cost is 0 + # + my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; + + # Cell $mat{$i}{$j} equals the minimum of: + # + # - The cell immediately above plus 1 + # - The cell immediately to the left plus 1 + # - The cell diagonally above and to the left plus the cost + # + # We can either insert a new char, delete a char or + # substitute an existing char (with an associated cost) + # + $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + } + } + + # Finally, the Levenshtein distance equals the rightmost bottom cell + # of the matrix + # + # Note that $mat{$x}{$y} denotes the distance between the substrings + # 1..$x and 1..$y + # + return $mat{$len1}{$len2}; + } + + + # minimal element of a list + # + sub min + { + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) + { + $min = $i if ($i < $min); + } + + return $min; + } + + sub max{ + my @list = @_; + return $list[0]>$list[1]?$list[0]:$list[1]; + } + |