summaryrefslogtreecommitdiff
path: root/senttok/senttok.pl
diff options
context:
space:
mode:
Diffstat (limited to 'senttok/senttok.pl')
-rwxr-xr-xsenttok/senttok.pl373
1 files changed, 373 insertions, 0 deletions
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
new file mode 100755
index 0000000..7a20f59
--- /dev/null
+++ b/senttok/senttok.pl
@@ -0,0 +1,373 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+use strict;
+
+
+my $TOO_LONG = 70;
+
+
+# where are we running the splitter from?
+my $path = $0;
+$path =~ s/[^\/]+$//;
+if ($path eq "") {
+ $path = "./";
+}
+my $licSentences = $path . "licensesentence.dict";
+
+open FH, "<$ARGV[0]";
+my @licensesentencelist=();
+open LICENSESENTENCEFILE, "<$licSentences";
+my $line;
+while ($line = <LICENSESENTENCEFILE>){
+ chomp $line;
+ next if $line =~ /^\#/;
+ next if $line =~ /^ *$/;
+ die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/;
+ push @licensesentencelist,$line;
+}
+
+#foreach $line (@licensesentencelist) {
+# print $line;
+#}
+close LICENSESENTENCEFILE;
+while ($line = <>){
+ my $originalLine;
+ chomp $line;
+
+ if ($line =~ s/^Alternatively,? ?//) {
+ print "Altern\n";
+ }
+
+ $line = Normalize_Sentence($line);
+
+
+ my $check=0;
+ my $matchname="UNKNOWN";
+ my @parm=();
+ my $sentence;
+ my $distance=1; #maximum? number
+ my $mostsimilarname="UNKNOWN";
+ my $before;
+ my $after;
+ my $gpl = 0;
+ my ($gplLater, $gplVersion);
+
+ $originalLine = $line;
+
+# print "Original
+# [$line]
+#\n";
+
+
+ my $lineAsGPL ='';
+
+ if (Looks_Like_GPL($line)) {
+ my $old = $line;
+ $gpl = 1;
+ ($line, $gplLater, $gplVersion) = Normalize_GPL($line);
+ $lineAsGPL = $line;
+ }
+ my ($name, $subRule, $number, $regexp, $option);
+ my $saveLine = $line;
+ my $saveGPL = $gpl;
+ my $LGPL = "";
+ foreach $sentence (@licensesentencelist) {
+ ($name, $subRule, $number, $regexp, $option) = split(/:/, $sentence);
+ # we need this due to the goto again
+ $line = $saveLine;
+ $gpl = $saveGPL;
+ $LGPL = "";
+ again:
+# print "Testing
+# lin[$line]
+# ori[$originalLine]
+# re [$regexp]
+# lpg[$LGPL]
+#\n";
+ if ( $line =~ /$regexp/im ){
+ $before = $`;
+ $after = $'; #';
+ $check=1;
+ $matchname=$name;
+ for (my $i = 1; $i <= $number; $i++){
+ no strict 'refs';
+ push @parm,$$i;
+ }
+ last;
+ } else{
+# print "NO MATCH\n";
+ # let us try again in cas it is lesser/library
+ # do it only once
+ if ($gpl and $line =~ s/(Lesser|Library) GPL/GPL/i) {
+ $LGPL = $1;
+ goto again;
+ }
+ if ($gpl) {
+ $gpl = 0;
+ $line = $originalLine;
+ goto again;
+ }
+ next;## dmg
+ my $targetset=$regexp;
+ $targetset =~ s/^(.*)$/$1/;
+ my $tmpdist=&levenshtein($line,$targetset)/max(length($targetset),length($sentence));
+ if ($tmpdist<$distance){
+ $mostsimilarname=$name;
+ $distance=$tmpdist;
+ }
+ }
+ last; ###
+ }
+ if ($check == 1){
+ #licensesentence name, parm1, parm2,..
+ if ($gpl) {
+ $matchname .= "Ver" . $gplVersion;
+ $matchname .= "+" if $gplLater;
+ $matchname = $LGPL . $matchname;
+ } else {
+ }
+ if (length($before)>$TOO_LONG ||
+ length($after) >$TOO_LONG) {
+ $matchname .= "-TOOLONG";
+ }
+
+ my $parmstrings=join(",",$matchname, $subRule, $before, $after, @parm);
+ print $parmstrings,"\n";
+
+
+ }else{
+ #UNKNOWN, sentence
+ chomp $line;
+ print $matchname,",",0, ",", $mostsimilarname,",",$distance,",",$line,",<------------>[$originalLine][$lineAsGPL]\n";
+ }
+
+}
+
+close FH;
+exit 0;
+
+sub Normalize_GPL
+{
+ my ($line) = @_;
+ my $later = 0;
+ my $version = 0;
+
+ # do some very quick spelling corrections for english/british words
+ $line =~ s/Version 2,? \(June 1991\)/Version 2/gi;
+ $line =~ s/Version 2,? dated June 1991/Version 2/gi;
+ $line =~ s/Version 2\.1,? dated February 1999/Version 2.1/gi;
+ if ($line =~ s/,? or \(?at your option\)?,? any later version//i) {
+ $later = 1;
+ }
+ if ($line =~ s/, or any later version//i) {
+ $later = 1;
+ }
+ if ($line =~ s/ or (greater|later)//i) {
+ $later = 1;
+ }
+ if ($line =~ s/or (greater|later) //i) {
+ $later = 1;
+ }
+ if ($line =~ s/(version|v\.?) ([123\.0]+)/<VERSION>/i) {
+ $version = $2;
+# print "Version [$version]\n";
+ }
+ if ($line =~ s/GPL ?[v\-]([123\.0]+)/GPL <VERSION>/i) {
+ $version = $1;
+ }
+ if ($line =~ s/v\.?([123\.0]+)/<VERSION>/i) {
+ $version = $1;
+ }
+
+ $line =~ s/(distributable|licensed|released|made available)/<LICENSED>/ig;
+ $line =~ s/Library General Public License/Library General Public License/ig;
+ $line =~ s/Lesser General Public License/Lesser General Public License/ig;
+
+ $line =~ s/General Public License/GPL/gi;
+ $line =~ s/GPL \(GPL\)/GPL/gi;
+ $line =~ s/GPL \(<QUOTES>GPL<QUOTES>\)/GPL/gi;
+
+
+ $line =~ s/GNU //gi;
+ $line =~ s/under GPL/under the GPL/gi;
+ $line =~ s/under Lesser/under the Lesser/gi;
+ $line =~ s/under Library/under the Library/gi;
+
+ $line =~ s/of GPL/of the GPL/gi;
+ $line =~ s/of Lesser/of the Lesser/gi;
+ $line =~ s/of Library/of the Library/gi;
+
+ $line =~ s/(can|may)/can/gi;
+ $line =~ s/<VERSION> only/<VERSION>/gi;
+ $line =~ s/<VERSION> of the license/<VERSION>/gi;
+ $line =~ s/(<VERSION>|GPL),? as published by the Free Software Foundation/$1/gi;
+ $line =~ s/(<VERSION>|GPL) \(as published by the Free Software Foundation\)/$1/gi;
+ $line =~ s/(<VERSION>|GPL),? incorporated herein by reference/$1/gi;
+ $line =~ s/terms and conditions/terms/gi;
+ $line =~ s/GPL along with/GPL with/gi;
+
+ $line =~ s/GPL \(<VERSION\)/GPL <VERSION>/gi;
+
+ $line =~ s/ +/ /;
+ $line =~ s/ +$//;
+
+# print ">>>>>>>>>>$line:$later:$version\n";
+
+ return ($line,$later,$version);
+}
+
+sub Looks_Like_GPL
+{
+ my ($line) = @_;
+
+ return 1 if $line =~ /GNU/;
+ return 1 if $line =~ /General Public License/;
+ return 1 if $line =~ /GPL/;
+
+ return 0;
+}
+
+
+sub Normalize_Sentence
+{
+ my ($line) = @_;
+ # do some very quick spelling corrections for english/british words
+ $line=~ s/icence/icense/ig;
+ $line=~ s/(\.|;)$//;
+
+ return $line;
+}
+
+# Return the Levenshtein distance (also called Edit distance)
+# between two strings
+#
+# The Levenshtein distance (LD) is a measure of similarity between two
+# strings, denoted here by s1 and s2. The distance is the number of
+# deletions, insertions or substitutions required to transform s1 into
+# s2. The greater the distance, the more different the strings are.
+#
+# The algorithm employs a proximity matrix, which denotes the distances
+# between substrings of the two given strings. Read the embedded comments
+# for more info. If you want a deep understanding of the algorithm, print
+# the matrix for some test strings and study it
+#
+# The beauty of this system is that nothing is magical - the distance
+# is intuitively understandable by humans
+#
+# The distance is named after the Russian scientist Vladimir
+# Levenshtein, who devised the algorithm in 1965
+#
+sub levenshtein
+ {
+ # $s1 and $s2 are the two strings
+ # $len1 and $len2 are their respective lengths
+ #
+ my ($s1, $s2) = @_;
+ my ($len1, $len2) = (length $s1, length $s2);
+
+ # If one of the strings is empty, the distance is the length
+ # of the other string
+ #
+ return $len2 if ($len1 == 0);
+ return $len1 if ($len2 == 0);
+
+ my %mat;
+
+ # Init the distance matrix
+ #
+ # The first row to 0..$len1
+ # The first column to 0..$len2
+ # The rest to 0
+ #
+ # The first row and column are initialized so to denote distance
+ # from the empty string
+ #
+ for (my $i = 0; $i <= $len1; ++$i)
+ {
+ for (my $j = 0; $j <= $len2; ++$j)
+ {
+ $mat{$i}{$j} = 0;
+ $mat{0}{$j} = $j;
+ }
+
+ $mat{$i}{0} = $i;
+ }
+
+ # Some char-by-char processing is ahead, so prepare
+ # array of chars from the strings
+ #
+ my @ar1 = split(//, $s1);
+ my @ar2 = split(//, $s2);
+
+ for (my $i = 1; $i <= $len1; ++$i)
+ {
+ for (my $j = 1; $j <= $len2; ++$j)
+ {
+ # Set the cost to 1 iff the ith char of $s1
+ # equals the jth of $s2
+ #
+ # Denotes a substitution cost. When the char are equal
+ # there is no need to substitute, so the cost is 0
+ #
+ my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
+
+ # Cell $mat{$i}{$j} equals the minimum of:
+ #
+ # - The cell immediately above plus 1
+ # - The cell immediately to the left plus 1
+ # - The cell diagonally above and to the left plus the cost
+ #
+ # We can either insert a new char, delete a char or
+ # substitute an existing char (with an associated cost)
+ #
+ $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
+ $mat{$i}{$j-1} + 1,
+ $mat{$i-1}{$j-1} + $cost]);
+ }
+ }
+
+ # Finally, the Levenshtein distance equals the rightmost bottom cell
+ # of the matrix
+ #
+ # Note that $mat{$x}{$y} denotes the distance between the substrings
+ # 1..$x and 1..$y
+ #
+ return $mat{$len1}{$len2};
+ }
+
+
+ # minimal element of a list
+ #
+ sub min
+ {
+ my @list = @{$_[0]};
+ my $min = $list[0];
+
+ foreach my $i (@list)
+ {
+ $min = $i if ($i < $min);
+ }
+
+ return $min;
+ }
+
+ sub max{
+ my @list = @_;
+ return $list[0]>$list[1]?$list[0]:$list[1];
+ }
+