summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author René Scheibe <rene.scheibe@gmail.com> 2014-09-03 15:56:17 +0200
committer René Scheibe <rene.scheibe@gmail.com> 2014-09-03 15:56:17 +0200
commit 8b20ac6d9016be8d5e66d791a6d17cfd0429781f (patch)
tree 74424be03ff503d5c058cf08f8fd8b7c1694d11a
parent 525e3e973cf3bd388e4cffa3054cb2b753c545c7 (diff)
parent 334865e50c2af0d9ce1c5be8385ad984d5c62346 (diff)
download ninka-8b20ac6d9016be8d5e66d791a6d17cfd0429781f.tar.gz
Merge branch 'consistent-formatting' into integration
Conflicts: extComments/extComments.pl extComments/hashComments.pl matcher/matcher.pl splitter/splitter.pl
-rwxr-xr-x extComments/extComments.pl 32
-rwxr-xr-x extComments/hashComments.pl 20
-rwxr-xr-x filter/filter.pl 44
-rwxr-xr-x matcher/matcher.pl 149
-rwxr-xr-x ninka.pl 45
-rwxr-xr-x senttok/senttok.pl 167
-rwxr-xr-x splitter/splitter.pl 102
7 files changed, 216 insertions, 343 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl
index 4ba9f59..1a4caec 100755
--- a/extComments/extComments.pl
+++ b/extComments/extComments.pl
@@ -1,5 +1,4 @@
#!/usr/bin/env perl
-
#
# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German
#
@@ -30,7 +29,7 @@ if ($path eq '') {
# set parameters
my %opts = ();
if (!getopts ('vc:p:',\%opts)) {
-print STDERR "Usage $0 -v
+ print STDERR "Usage $0 -v
-v verbose
-c count of comment blocks
@@ -49,8 +48,6 @@ $numberComments = $opts{c} if exists $opts{c};
my $verbose = 1;
$verbose = exists $opts{v};
-
-
if (get_size($f) == 0) {
print STDERR "Empty file, just exit\n" if $verbose;
exit 0; # nothing to report, just end
@@ -60,30 +57,22 @@ my $commentsCmd = Determine_Comments_Extractor($f);
execute("$commentsCmd");
-if ($commentsCmd =~ /^comments/ and
- get_size("${f}.comments") == 0){
+if ($commentsCmd =~ /^comments/ and get_size("${f}.comments") == 0) {
`cat '$f' | head -700 > ${f}.comments`;
}
exit 0;
-
-sub Determine_Comments_Extractor
-{
+sub Determine_Comments_Extractor {
my ($f) = @_;
if ($f =~ /\.([^\.]+)$/) {
my $ext= $1;
- if ($ext =~ /^(pl|pm|py)$/
- ) {
-########################
-# for the time being, let us just extract the top 400 lines
-
+ if ($ext =~ /^(pl|pm|py)$/) {
+ # for the time being, let us just extract the top 400 lines
return "cat '$f' | head -400 > '${f}.comments'";
# return "$path/hashComments.pl -p '#' '$f'";
- } elsif ($ext eq 'jl' or
- $ext eq 'el'
- ) {
+ } elsif ($ext eq 'jl' or $ext eq 'el') {
return "cat '$f' | head -400 > '${f}.comments'";
# return "$path/hashComments.pl -p ';' '$f'";;
} elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) {
@@ -102,20 +91,17 @@ sub Determine_Comments_Extractor
}
}
-sub execute
-{
+sub execute {
my ($c) = @_;
-# print "\nTo execute [$c]\n";
my $r = `$c`;
my $status = ($? >> 8);
die "execution of program [$c] failed: status [$status]" if ($status != 0);
return $r;
}
-
-sub get_size
-{
+sub get_size {
my ($f) = @_;
my $size = (stat($f))[7];
return $size;
}
+
diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl
index c83846b..baa9937 100755
--- a/extComments/hashComments.pl
+++ b/extComments/hashComments.pl
@@ -1,5 +1,4 @@
#!/usr/bin/env perl
-
#
# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German
#
@@ -17,20 +16,18 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-
-# this is to extract the first <n> comments from any language that
+# this is to extract the first <n> comments from any language that
# uses the same prefix
use Getopt::Std;
-
# set parameters
my %opts = ();
if (!getopts ('vc:p:',\%opts)) {
-print STDERR "Usage $0 -v
+ print STDERR "Usage $0 -v
-v verbose
- -p comment char
+ -p comment char
-c count of comment blocks
\n";
@@ -64,7 +61,7 @@ while (<>) {
if (Is_Comment($_)) {
s/\t/ /g;
s/ +/ /g;
- $comCount ++ if (not $insideComment);
+ $comCount++ if (not $insideComment);
$insideComment = 1;
/$commentChar+/;
print OUT $' . "\n"; #'
@@ -72,18 +69,15 @@ while (<>) {
print OUT "\n";
} else {
exit 0;
- }
+ }
}
-
-sub Is_Comment
-{
+sub Is_Comment {
my ($st) = @_;
return ($st =~ /^\s*$commentChar/);
}
-sub Is_Blank
-{
+sub Is_Blank {
my ($st) = @_;
return ($st =~ /^\s*$/);
}
diff --git a/filter/filter.pl b/filter/filter.pl
index 615c850..84269c6 100755
--- a/filter/filter.pl
+++ b/filter/filter.pl
@@ -16,7 +16,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-
#
# filter.pl
# This script classifies input sentences into two categories,
@@ -29,7 +28,6 @@
#
use strict;
-
#print $ARGV[0];
# where are we running the program from
@@ -44,7 +42,6 @@ die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/;
my $goodfilename = $ARGV[0];
-
die "Filename should end in '.sentences' [$goodfilename]" unless $goodfilename =~ s/\.sentences$/\.goodsent/;
my $badfilename = $ARGV[0];
$badfilename =~ s/\.sentences$/\.badsent/;
@@ -58,36 +55,37 @@ open (DICTIONARY, "<$critWords") or die ('Error: criticalword.dict is not found.
open (GOODOUT, ">$goodfilename") || die ('Error');
open (BADOUT, ">$badfilename") || die ('Error');
-my @cwordlist=();
+my @cwordlist = ();
# read dictionary into list
my $cword;
-while ($cword=<DICTIONARY>){
- chomp $cword;
- next if $cword =~ /^\#/;
- $cword =~ s/\#.*$//; # remove everything to the end of file
- push(@cwordlist,"$cword");
+while ($cword = <DICTIONARY>) {
+ chomp $cword;
+ next if $cword =~ /^\#/;
+ $cword =~ s/\#.*$//; # remove everything to the end of file
+ push(@cwordlist, "$cword");
}
close(DICTIONARY);
#matching critical words in list against sentences.
my $sentence;
-while ($sentence=<INPUTFILE>){
- my $check=0;
- chomp $sentence;
- foreach $cword (@cwordlist){
- if($sentence =~ /\b$cword\b/i){
- $check=1;
- #print "$cword:$sentence";
- last;
+while ($sentence = <INPUTFILE>) {
+ my $check = 0;
+ chomp $sentence;
+ foreach $cword (@cwordlist) {
+ if ($sentence =~ /\b$cword\b/i) {
+ $check = 1;
+ #print "$cword:$sentence";
+ last;
+ }
+ }
+ if ($check == 1) {
+ print GOODOUT "$sentence\n";
+ } else {
+ print BADOUT "$sentence\n";
}
- }
- if ($check==1){
- print GOODOUT "$sentence\n";
- }else{
- print BADOUT "$sentence\n";
- }
}
close(INPUTFILE);
close(GOODOUT);
close(BADOUT);
+
diff --git a/matcher/matcher.pl b/matcher/matcher.pl
index 665cc77..e805b6c 100755
--- a/matcher/matcher.pl
+++ b/matcher/matcher.pl
@@ -1,5 +1,4 @@
#!/usr/bin/env perl
-
#
# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German
#
@@ -20,7 +19,7 @@
#
# matcher.pl
#
-# This script use a set of license sentence name as input
+# This script use a set of license sentence name as input
# and output license name corresponds to a rule which match the set.
#
# author: Yuki Manabe
@@ -116,32 +115,24 @@ my $path = $0;
$path =~ s/[^\/]+$//;
if ($path eq '') {
$path = './';
- }
+}
my $rules= $path . 'rules.dict';
my $interrules= $path . 'interrules.dict';
die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/;
-
-# read rules
-
my $countUnknowns = 0;
-
# read the rules
-
my @rulelist = Read_Rules($rules);
-
my @interRuleList = Read_Inter_Rules($interrules);
-
-my @licSentNames=();
+my @licSentNames = ();
my @original;
Read_Original($ARGV[0], \@licSentNames, \@original);
-
#foreach my $x (@licSentNames) {
# print "$x\n";
#}
@@ -160,8 +151,8 @@ Read_Original($ARGV[0], \@licSentNames, \@original);
# matching spdx requires to match strict licenses, with no alternatives...
-my $senttok= ',' . join(',',@licSentNames) . ',';
-my @result=();
+my $senttok = ',' . join(',', @licSentNames) . ',';
+my @result = ();
my $countMatches = 0;
print "[$senttok]\n" if $debug;
@@ -177,154 +168,136 @@ Match_License();
#Print_Result();
my $match = 0;
-for (my $i=0;$i<=$#licSentNames ;$i++) {
- if ($licSentNames[$i] == 0 and
- ($licSentNames[$i] ne 'UNKNOWN' and
+for (my $i = 0; $i <= $#licSentNames; $i++) {
+ if ($licSentNames[$i] == 0 and
+ ($licSentNames[$i] ne 'UNKNOWN' and
$licSentNames[$i] ne '')) {
# print "[$licSentNames[$i]]\n";
$licSentNames[$i] =~ s/Extrict$//;
- $match ++;
+ $match++;
}
}
#Print_Result();
-
if ($match > 0) {
# print "REDO\n";
- for (my $i=0;$i<=$#interRuleList ;$i++){
+ for (my $i = 0; $i <= $#interRuleList; $i++) {
#for my $ref( @interRuleList[$i]){
# print "@$ref\n";
#}
#print $interRuleList[$i][0];
@licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames;
}
-
- $senttok= join(',',@licSentNames) . ',';
-
+
+ $senttok = join(',', @licSentNames) . ',';
+
Match_License();
}
Print_Result();
-
exit 0;
-
-
#print @licSentNames;
#print join(';',@licSentNames)."\n";
-
# 3. matching
###############################
# we will iterate over rules, matching as many as we can...
-
-
-
-
-sub Is_Unknown
-{
+sub Is_Unknown {
my ($s) = @_;
my @f = split (/,/, $s);
return $f[0] eq 'UNKNOWN';
}
-
-sub Read_Rules
-{
+sub Read_Rules {
my ($rulesF) = @_;
open (RULES, "<$rulesF") or die ('Error: rules.dict is not found.');
my $sentence;
my @rules = ();
- while ($sentence=<RULES>){
+ while ($sentence = <RULES>) {
chomp $sentence;
- # clean up spaces
- $sentence=~ s/^\s+//;
- $sentence=~ s/\s+$//;
- $sentence=~ s/\s*,\s*/,/g;
- $sentence=~ s/\s*:\s*/:/g;
+ # clean up spaces
+ $sentence =~ s/^\s+//;
+ $sentence =~ s/\s+$//;
+ $sentence =~ s/\s*,\s*/,/g;
+ $sentence =~ s/\s*:\s*/:/g;
#check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){
+ if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) {
next;
}
$sentence =~ /(.*?):(.*)/;
- push (@rules,[$1,$2]);
+ push (@rules, [$1, $2]);
}
close RULES;
return @rules;
}
-
-sub Read_Inter_Rules
-{
+sub Read_Inter_Rules {
my ($interrules) = @_;
my @list;
open (IRULES, "<$interrules") or die ('Error: interrules.dict is not found.');
my $sentence;
- while ($sentence=<IRULES>){
+ while ($sentence = <IRULES>) {
chomp $sentence;
#check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){
+ if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) {
next;
}
- foreach my $item (split(/\|/,$2)){
- push (@list,[$item,$1]);
+ foreach my $item (split(/\|/, $2)) {
+ push (@list, [$item, $1]);
}
}
close IRULES;
return @list;
}
-sub Read_Original
-{
+sub Read_Original {
my ($inputF, $tokens, $originals) = @_;
open (INPUTFILE, $inputF) or die ("Error: $inputF is not found.");
-
+
my $sentence;
my @original;
- while ($sentence = <INPUTFILE>){
+ while ($sentence = <INPUTFILE>) {
chomp $sentence;
- my @fields = split(':',$sentence);
- push(@$originals,$fields[1]);
+ my @fields = split(':', $sentence);
+ push(@$originals, $fields[1]);
my @token = split(';', $fields[0]);
- push(@$tokens,$token[0]);
+ push(@$tokens, $token[0]);
}
if (scalar(@$originals) == 0) {
print "NONE\n";
exit 0;
}
-
+
#print join(';',@licSentNames)."\n";
-
+
close INPUTFILE;
}
-sub Match_License
-{
-
+sub Match_License {
# create a string with the sentences
-
- for (my $j=0;$j<=$#rulelist;$j++){
-
- my $rule=$rulelist[$j][1];
- my $rulename=$rulelist[$j][0];
+
+ for (my $j = 0; $j <= $#rulelist; $j++) {
+ my $rule = $rulelist[$j][1];
+ my $rulename = $rulelist[$j][0];
my $lenRule = scalar(split(',', $rule));
# replace rule with the length of the rule
- print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
- while ($senttok =~ s/,${rule},/,$lenRule,/){
- $countMatches ++;
- push (@result,$rulename);
+ print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
+ while ($senttok =~ s/,${rule},/,$lenRule,/) {
+ $countMatches++;
+ push (@result, $rulename);
# print ">>>>$senttok|$rulelist[$j][1]\n";
# print 'Result: ', join(',', @result);
# print "\n";
}
}
-
+
# print ">>>>[$senttok]\n";
my $onlyAllRight = 0;
@@ -333,7 +306,7 @@ sub Match_License
#print STDERR "Ending>>>>>>>$senttok\n";
#print STDERR 'Size>>' , scalar(@result), "\n";
#print STDERR 'Result>>', join(',', @result), "\n";
-
+
# let us remove allrights
# my $onlyAllRight = 1;
# for my $i (0.. scalar(@licSentNames)-1){
@@ -345,11 +318,10 @@ sub Match_License
# }
# output result
- if (scalar(@result) > 0){
+ if (scalar(@result) > 0) {
# at this point we have matched
-
-
- # let us clean up the rules... let us print the matched rules, and the
+
+ # let us clean up the rules... let us print the matched rules, and the
# if (grep(/GPL/, @result)) {
# print "GPL...\n";
# foreach my $r ($NonCriticalRules{GPL}) {
@@ -357,7 +329,6 @@ sub Match_License
# }
# }
# general removal of rules
-
foreach my $r (@generalNonCritical) {
while ($senttok =~ s/,$r,/,-1,/) {
@@ -365,7 +336,7 @@ sub Match_License
}
}
# print "[$senttok]\n";
-
+
foreach my $res (@result) {
my $temp = $NonCriticalRules{$res};
foreach my $r (@$temp) {
@@ -375,13 +346,11 @@ sub Match_License
}
}
}
-# print "[$senttok]\n";
+# print "[$senttok]\n";
}
}
-
-sub Print_Result
-{
+sub Print_Result {
# $senttok =~ s/AllRights(,?)/$1/g;
# $senttok =~ s/UNKNOWN,/,/g;
# $senttok =~ s/,+/,/g;
@@ -392,7 +361,7 @@ sub Print_Result
my @sections = split(',', $senttok);
die 'assertion 1' if $sections[0] ne '';
die 'assertion 2' if $sections[scalar(@sections)] ne '';
-
+
my $ignoredLines = 0;
my $licenseLines = 0;
my $unknownLines = 0;
@@ -404,20 +373,20 @@ sub Print_Result
} elsif ($sections[$i] != 0) {
$licenseLines += $sections[$i];
} elsif ($sections[$i] eq 'UNKNOWN') {
- $unknownLines ++;
+ $unknownLines++;
} else {
$unmatchedLines++;
}
}
$senttok =~ s/^,(.*),$/$1/;
-
+
# print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n";
if (scalar (@result) == 0) {
- print 'UNKNOWN';
+ print 'UNKNOWN';
} else {
- print join(',',@result);
+ print join(',',@result);
}
print ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n";
$senttok = $save;
-
}
+
diff --git a/ninka.pl b/ninka.pl
index 990a5f5..a05c5af 100755
--- a/ninka.pl
+++ b/ninka.pl
@@ -42,15 +42,13 @@ Usage $0 -fCtTvcgsGd <filename>
-L force creation of matching
- -d delete intermediate files
+ -d delete intermediate files
\n";
exit 1;
}
-
-
my $verbose = exists $opts{v};
my $delete = exists $opts{d};
#$delete = 1;
@@ -62,7 +60,6 @@ if ($path eq "") {
$path = "./";
}
-
my $force = exists $opts{f};
my $forceGood = exists $opts{G};
my $forceSentences = exists $opts{S};
@@ -76,8 +73,6 @@ my $f = $ARGV[0];
my $original = $f;
-
-
print "Starting: $original;\n" if ($verbose);
print "$original;";
@@ -92,28 +87,22 @@ if (not (-f "$f")) {
exit 0;
}
-
-Do_File_Process($original, $commentsFile, ($force or $forceComments),
+Do_File_Process($original, $commentsFile, ($force or $forceComments),
"$path/extComments/extComments.pl -c1 '${original}'",
"Creating comments file",
exists $opts{c});
-
-Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences),
+Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences),
"$path/splitter/splitter.pl '${commentsFile}'",
- "Splitting sentences", exists $opts{s}
- );
+ "Splitting sentences", exists $opts{s});
-Do_File_Process( $sentencesFile, $goodsentFile, ($force or $forceGood),
+Do_File_Process($sentencesFile, $goodsentFile, ($force or $forceGood),
"$path/filter/filter.pl '${sentencesFile}'",
- "Filtering good sentences", exists $opts{s}
- );
+ "Filtering good sentences", exists $opts{s});
-Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok),
+Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok),
"$path/senttok/senttok.pl '${goodsentFile}' > '${sentokFile}'",
- "Matching sentences against rules", exists $opts{t}
- );
-
+ "Matching sentences against rules", exists $opts{t});
print "Matching ${f}.senttok against rules" if ($verbose);
execute("$path/matcher/matcher.pl '${f}.senttok' > '${f}.license'");
@@ -132,18 +121,15 @@ if ($delete) {
exit 0;
-
-
-sub Do_File_Process
-{
+sub Do_File_Process {
my ($input, $output, $force, $cmd, $message, $end) = @_;
print "${message}:" if ($verbose);
- if ($force or newer($input, $output)) {
+ if ($force or newer($input, $output)) {
print "Running ${cmd}:" if ($verbose);
execute($cmd);
} else {
- print "File [$output] newer than input [$input], not creating:" if ($verbose);
+ print "File [$output] newer than input [$input], not creating:" if ($verbose);
}
if ($end) {
print "Existing after $message" if $verbose;
@@ -152,11 +138,7 @@ sub Do_File_Process
}
}
-
-
-
-sub execute
-{
+sub execute {
my ($c) = @_;
# print "\nTo execute [$c]\n";
my $r = `$c`;
@@ -165,8 +147,7 @@ sub execute
return $r;
}
-sub newer
-{
+sub newer {
my ($f1, $f2) = @_;
my ($f1write) = (stat($f1))[9];
my ($f2write) = (stat($f2))[9];
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
index 8580324..e960e2e 100755
--- a/senttok/senttok.pl
+++ b/senttok/senttok.pl
@@ -15,8 +15,8 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-use strict;
+use strict;
my $TOO_LONG = 70;
@@ -32,7 +32,7 @@ open FH, "<$ARGV[0]";
my @licensesentencelist=();
open LICENSESENTENCEFILE, "<$licSentences";
my $line;
-while ($line = <LICENSESENTENCEFILE>){
+while ($line = <LICENSESENTENCEFILE>) {
chomp $line;
next if $line =~ /^\#/;
next if $line =~ /^ *$/;
@@ -44,7 +44,7 @@ while ($line = <LICENSESENTENCEFILE>){
# print $line;
#}
close LICENSESENTENCEFILE;
-while ($line = <>){
+while ($line = <>) {
my $saveLine;
my $originalLine;
chomp $line;
@@ -56,14 +56,13 @@ while ($line = <>){
$line = Normalize_Sentence($line);
-
- my $check=0;
- my $matchname="UNKNOWN";
- my @parm=();
+ my $check = 0;
+ my $matchname = "UNKNOWN";
+ my @parm = ();
my $sentence;
- my $distance=1; #maximum? number
- my $mostsimilarname="UNKNOWN";
- my $before;
+ my $distance = 1; #maximum? number
+ my $mostsimilarname = "UNKNOWN";
+ my $before;
my $after;
my $gpl = 0;
my ($gplLater, $gplVersion);
@@ -74,8 +73,7 @@ while ($line = <>){
# [$line]
#\n";
-
- my $lineAsGPL ='';
+ my $lineAsGPL = '';
if (Looks_Like_GPL($line)) {
my $old = $line;
@@ -94,23 +92,23 @@ while ($line = <>){
$gpl = $saveGPL;
$LGPL = "";
again:
-# print "Testing
+# print "Testing
# lin[$line]
# ori[$saveLine]
# re [$regexp]
# lpg[$LGPL]
#\n";
- if ( $line =~ /$regexp/im ){
- $before = $`;
+ if ($line =~ /$regexp/im) {
+ $before = $`;
$after = $'; #';
- $check=1;
- $matchname=$name;
- for (my $i = 1; $i <= $number; $i++){
+ $check = 1;
+ $matchname = $name;
+ for (my $i = 1; $i <= $number; $i++) {
no strict 'refs';
- push @parm,$$i;
+ push @parm, $$i;
}
last;
- } else{
+ } else {
# print "NO MATCH\n";
# let us try again in case it is lesser/library
# do it only once
@@ -124,45 +122,39 @@ while ($line = <>){
goto again;
}
next;## dmg
- my $targetset=$regexp;
+ my $targetset = $regexp;
$targetset =~ s/^(.*)$/$1/;
- my $tmpdist=&levenshtein($line,$targetset)/max(length($targetset),length($sentence));
- if ($tmpdist<$distance){
- $mostsimilarname=$name;
- $distance=$tmpdist;
+ my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence));
+ if ($tmpdist < $distance) {
+ $mostsimilarname = $name;
+ $distance = $tmpdist;
}
}
last; ###
}
- if ($check == 1){
- #licensesentence name, parm1, parm2,..
+ if ($check == 1) {
+ # licensesentence name, parm1, parm2,..
if ($gpl) {
$matchname .= "Ver" . $gplVersion;
$matchname .= "+" if $gplLater;
$matchname = $LGPL . $matchname;
- } else {
}
- if (length($before)>$TOO_LONG ||
- length($after) >$TOO_LONG) {
+ if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) {
$matchname .= "-TOOLONG";
}
- my $parmstrings=join(";",$matchname, $subRule, $before, $after, @parm);
- print $parmstrings,":$originalLine\n";
-
-
- }else{
- #UNKNOWN, sentence
+ my $parmstrings = join(";",$matchname, $subRule, $before, $after, @parm);
+ print $parmstrings, ":$originalLine\n";
+ } else {
+ # UNKNOWN, sentence
chomp $line;
- print $matchname,";",0, ";", $mostsimilarname,";",$distance,";",$saveLine,":$originalLine\n";
- }
-
+ print $matchname, ";", 0, ";", $mostsimilarname, ";", $distance, ";", $saveLine, ":$originalLine\n";
+ }
}
close FH;
exit 0;
-sub Normalize_GPL
-{
+sub Normalize_GPL {
my ($line) = @_;
my $later = 0;
my $version = 0;
@@ -202,7 +194,6 @@ sub Normalize_GPL
$line =~ s/GPL \(GPL\)/GPL/gi;
$line =~ s/GPL \(<QUOTES>GPL<QUOTES>\)/GPL/gi;
-
$line =~ s/GNU //gi;
$line =~ s/under GPL/under the GPL/gi;
$line =~ s/under Lesser/under the Lesser/gi;
@@ -231,8 +222,7 @@ sub Normalize_GPL
return ($line,$later,$version);
}
-sub Looks_Like_GPL
-{
+sub Looks_Like_GPL {
my ($line) = @_;
return 1 if $line =~ /GNU/;
@@ -242,18 +232,16 @@ sub Looks_Like_GPL
return 0;
}
-
-sub Normalize_Sentence
-{
+sub Normalize_Sentence {
my ($line) = @_;
# do some very quick spelling corrections for english/british words
- $line=~ s/icence/icense/ig;
- $line=~ s/(\.|;)$//;
+ $line =~ s/icence/icense/ig;
+ $line =~ s/(\.|;)$//;
return $line;
}
-# Return the Levenshtein distance (also called Edit distance)
+# Return the Levenshtein distance (also called Edit distance)
# between two strings
#
# The Levenshtein distance (LD) is a measure of similarity between two
@@ -272,22 +260,21 @@ sub Normalize_Sentence
# The distance is named after the Russian scientist Vladimir
# Levenshtein, who devised the algorithm in 1965
#
-sub levenshtein
- {
+sub levenshtein {
# $s1 and $s2 are the two strings
# $len1 and $len2 are their respective lengths
#
my ($s1, $s2) = @_;
my ($len1, $len2) = (length $s1, length $s2);
-
+
# If one of the strings is empty, the distance is the length
# of the other string
#
return $len2 if ($len1 == 0);
return $len1 if ($len2 == 0);
-
+
my %mat;
-
+
# Init the distance matrix
#
# The first row to 0..$len1
@@ -297,35 +284,31 @@ sub levenshtein
# The first row and column are initialized so to denote distance
# from the empty string
#
- for (my $i = 0; $i <= $len1; ++$i)
- {
- for (my $j = 0; $j <= $len2; ++$j)
- {
+ for (my $i = 0; $i <= $len1; ++$i) {
+ for (my $j = 0; $j <= $len2; ++$j) {
$mat{$i}{$j} = 0;
$mat{0}{$j} = $j;
- }
-
+ }
+
$mat{$i}{0} = $i;
}
-
+
# Some char-by-char processing is ahead, so prepare
# array of chars from the strings
#
my @ar1 = split(//, $s1);
my @ar2 = split(//, $s2);
-
- for (my $i = 1; $i <= $len1; ++$i)
- {
- for (my $j = 1; $j <= $len2; ++$j)
- {
+
+ for (my $i = 1; $i <= $len1; ++$i) {
+ for (my $j = 1; $j <= $len2; ++$j) {
# Set the cost to 0 iff the ith char of $s1
# equals the jth of $s2 (and to 1 otherwise)
- #
+ #
# Denotes a substitution cost. When the char are equal
# there is no need to substitute, so the cost is 0
#
my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
+
# Cell $mat{$i}{$j} equals the minimum of:
#
# - The cell immediately above plus 1
@@ -338,36 +321,30 @@ sub levenshtein
$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
$mat{$i}{$j-1} + 1,
$mat{$i-1}{$j-1} + $cost]);
- }
- }
-
+ }
+ }
+
# Finally, the Levenshtein distance equals the rightmost bottom cell
# of the matrix
#
# Note that $mat{$x}{$y} denotes the distance between the substrings
# 1..$x and 1..$y
- #
return $mat{$len1}{$len2};
- }
-
-
- # minimal element of a list
- #
- sub min
- {
- my @list = @{$_[0]};
- my $min = $list[0];
-
- foreach my $i (@list)
- {
- $min = $i if ($i < $min);
- }
-
- return $min;
- }
-
- sub max{
- my @list = @_;
- return $list[0]>$list[1]?$list[0]:$list[1];
+}
+
+sub min {
+ my @list = @{$_[0]};
+ my $min = $list[0];
+
+ foreach my $i (@list) {
+ $min = $i if ($i < $min);
}
-
+
+ return $min;
+}
+
+sub max {
+ my @list = @_;
+ return $list[0] > $list[1] ? $list[0] : $list[1];
+}
+
diff --git a/splitter/splitter.pl b/splitter/splitter.pl
index 55a9d36..69039ad 100755
--- a/splitter/splitter.pl
+++ b/splitter/splitter.pl
@@ -1,8 +1,8 @@
#!/usr/bin/env perl
-#
+#
#***************************************************************************************************
-# SENTENCE SPLITTER
-# Author: Paul Clough {cloughie@dcs.shef.ac.uk}
+# SENTENCE SPLITTER
+# Author: Paul Clough {cloughie@dcs.shef.ac.uk}
#
# This program is originally based on the sentence splitter program
@@ -26,15 +26,12 @@
# You should have received a copy of the GNU Affero General Public License
# along with this patch. If not, see <http://www.gnu.org/licenses/>.
-
-
use strict;
# This program is originally based on the sentence splitter program
# published by Paul Clough. Version 1.0, but then it was mostly rewritten
# His ideas, however, linger in here (and his dictionary of abbreviations)
-
my $dictionary = 'splitter.dict';
my $abbrv_file = 'splitter.abv';
my $len = 0;
@@ -53,12 +50,10 @@ $abbrv_file = $path . $abbrv_file;
die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/;
-
-die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
+die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
open(OUT, ">$output_file") or die("Unable to create output file [$output_file]");
-
#print length($opt_o);
# Load in the dictionary and find the common words.
@@ -70,15 +65,15 @@ open(OUT, ">$output_file") or die("Unable to create output file [$output_file]")
&loadAbbreviations;
my $text;
-# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
+# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
my $line;
while (defined ($line = <>)) {
- $text.= $line;
+ $text .= $line;
}
# append a newline just in case
-$text.="\n";
+$text .= "\n";
# - is used to create lines
# = is used to create lines
@@ -96,11 +91,9 @@ $text =~ s@^[ \t]*/\*@@gmx;
$text =~ s/\*\/[ \t]*$//gmx;
$text =~ s@([^:])// @$1@gmx;
-
# Replace /\r\n/ with \n only
$text =~ s/\r\n/\n/g;
-
# now, try to replace the leading/ending character of each line #/-, at most 3 heading characters
# and each repeated as many times as necessary
$text =~ s/^[ \t]{0,3}[\*\#\/\;]+//gmx;
@@ -128,7 +121,7 @@ $text .= "\n";
# this gets us in big trouble... licenses that have numeric abbreviations
$text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g;
-while ($text =~ /^([^\n]*)\n/gsm ) {
+while ($text =~ /^([^\n]*)\n/gsm) {
my $curr = $1;
# print "<<$curr\n<<\n";
@@ -145,7 +138,7 @@ while ($text =~ /^([^\n]*)\n/gsm ) {
foreach my $s (@sentences) {
for my $i (0..length($s)-1) {
- my $c = substr($s,$i,1);
+ my $c = substr($s, $i, 1);
$count2++ if ($c ge 'A' && $c le 'z');
}
print OUT Clean_Sentence($s) , "\n";
@@ -164,17 +157,11 @@ close OUT;
exit;
-
-
-
-#***************************************************************************************************
-
#***************************************************************************************************
# procedures
#***************************************************************************************************
-sub Clean_Sentence
-{
+sub Clean_Sentence {
($_) = @_;
# check for trailing bullets of different types
@@ -198,12 +185,9 @@ sub Clean_Sentence
die if /\n/m;
return $_;
-
}
-
-sub Split_Text
-{
+sub Split_Text {
my ($text) = @_;
my $len = 0;
my $next_word;
@@ -213,7 +197,7 @@ sub Split_Text
my @result;
my $after;
my $currentSentence = '';
- # this breaks the sentence into
+ # this breaks the sentence into
# 1. Any text before a separator
# 2. The separator [.!?:\n]
# 3.
@@ -222,19 +206,19 @@ sub Split_Text
([\.\!\?\:\n])
(?=(.?))
/xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) {
- $text = $'; #';
+ $text = $'; #';
my $sentenceMatch = $1;
- my $sentence = $1 . $2;
- my $punctuation = $2;
+ my $sentence = $1 . $2;
+ my $punctuation = $2;
$after = $3;
-
+
# if the next character is not a space, then this separator does not end a sentence
if ($after ne ' ' && $after ne "\t") {
$currentSentence .= $sentence;
next;
}
#at this point we know that there is a space after
- if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
+ if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
# let us consider this right here a beginning of a sentence
push @result, $currentSentence . $sentence;
$currentSentence = '';
@@ -248,46 +232,41 @@ sub Split_Text
# simple heuristic... let us check that the next words are not the beginning of a sentence
# in our library
# ENDTODO
-
+
# is the last word an abbreviation? For this the period has to follow the word
# this expression might have to be updated to take care of special characters in names :(
if ($sentenceMatch =~ /(.?)([^[:punct:]\s]+)$/) {
my $before = $1;
my $lastWord = $2;
#is it an abbreviation
-
+
if (length($lastWord) == 1 ) {
# single character abbreviations are special...
- # we will assume they never split the sentence if they are capitalized.
- if (($lastWord ge 'A') and
- ($lastWord le 'Z')) {
+ # we will assume they never split the sentence if they are capitalized.
+ if (($lastWord ge 'A') and ($lastWord le 'Z')) {
$currentSentence .= $sentence;
next;
}
print "last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
# but some are lowercase!
- if (($lastWord eq 'e') or
- ($lastWord eq 'i')) {
+ if (($lastWord eq 'e') or ($lastWord eq 'i')) {
$currentSentence .= $sentence;
next;
}
print "2 last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
} else {
-
$lastWord = lc $lastWord;
-
+
# only accept abbreviations if the previous char to the abbrev is space or
# is empty (beginning of line). This avoids things like .c
if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$lastWord}) {
-
$currentSentence .= $sentence;
next;
} else {
# just keep going, we handle this case below
}
}
-
}
push @result, $currentSentence . $sentence;
@@ -297,51 +276,40 @@ sub Split_Text
die 'We have not dealt with this case';
}
push @result, $currentSentence . $text;
-
+
#Print_Non_Sentence($text,"\n",'');
return @result;
-
}
sub loadDictionary {
-
- # Initialise var
- my $common_term = '';
+ my $common_term = '';
if (open(DICT, $dictionary)) {
-
while (defined ($line = <DICT>)) {
chomp($line);
if ($line !~ /^[A-Z]/) {
$COMMON_TERMS{$line} = 1;
}
-
- }
-
+ }
+
close(DICT);
} else {
- die "cannot open dictionary file $dictionary: $!";
+ die "cannot open dictionary file $dictionary: $!";
}
}
-sub loadAbbreviations
-{
-
- # Initialise var
- my $abbrv_term = '';
-
+sub loadAbbreviations {
+ my $abbrv_term = '';
+
if (open(ABBRV, $abbrv_file)) {
-
while (defined ($line = <ABBRV>)) {
chomp($line);
- $ABBREVIATIONS{$line} = $line;
- }
-
+ $ABBREVIATIONS{$line} = $line;
+ }
+
close(ABBRV);
} else {
- die "cannot open dictionary file $abbrv_file: $!";
+ die "cannot open dictionary file $abbrv_file: $!";
}
}
-
-#***************************************************************************************************