Merge branch 'consistent-formatting' into integration

Conflicts: extComments/extComments.pl extComments/hashComments.pl matcher/matcher.pl splitter/splitter.pl
author: René Scheibe <rene.scheibe@gmail.com> 2014-09-03 15:56:17 +0200
committer: René Scheibe <rene.scheibe@gmail.com> 2014-09-03 15:56:17 +0200
commit: 8b20ac6d9016be8d5e66d791a6d17cfd0429781f (patch)
tree: 74424be03ff503d5c058cf08f8fd8b7c1694d11a
parent: 525e3e973cf3bd388e4cffa3054cb2b753c545c7 (diff)
parent: 334865e50c2af0d9ce1c5be8385ad984d5c62346 (diff)
download: ninka-8b20ac6d9016be8d5e66d791a6d17cfd0429781f.tar.gz
7 files changed, 216 insertions, 343 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl
index 4ba9f59..1a4caec 100755
--- a/extComments/extComments.pl
+++ b/extComments/extComments.pl
@@ -1,5 +1,4 @@
 #!/usr/bin/env perl
-
 #
 #    Copyright (C) 2009-2010  Yuki Manabe and Daniel M. German
 #
@@ -30,7 +29,7 @@ if ($path eq '') {
 # set parameters
 my %opts = ();
 if (!getopts ('vc:p:',\%opts)) {
-print STDERR "Usage $0 -v
+    print STDERR "Usage $0 -v
 
   -v verbose
   -c count of comment blocks
@@ -49,8 +48,6 @@ $numberComments = $opts{c} if exists $opts{c};
 my $verbose = 1;
 $verbose = exists $opts{v};
 
-
-
 if (get_size($f) == 0) {
     print STDERR "Empty file, just exit\n" if $verbose;
     exit 0; # nothing to report, just end
@@ -60,30 +57,22 @@ my $commentsCmd = Determine_Comments_Extractor($f);
 
 execute("$commentsCmd");
 
-if ($commentsCmd =~ /^comments/ and
-    get_size("${f}.comments") == 0){
+if ($commentsCmd =~ /^comments/ and get_size("${f}.comments") == 0) {
     `cat '$f' | head -700  > ${f}.comments`;
 }
 
 exit 0;
 
-
-sub Determine_Comments_Extractor 
-{
+sub Determine_Comments_Extractor {
     my ($f) = @_;
     if ($f =~ /\.([^\.]+)$/) {
         my $ext= $1;
 
-        if ($ext =~ /^(pl|pm|py)$/ 
-            ) {
-########################
-# for the time being, let us just extract the top 400 lines
-
+        if ($ext =~ /^(pl|pm|py)$/) {
+            # for the time being, let us just extract the top 400 lines
             return "cat '$f' | head -400  > '${f}.comments'";
 #            return "$path/hashComments.pl -p '#' '$f'";
-        } elsif ($ext eq 'jl' or
-                 $ext eq 'el'
-            ) {
+        } elsif ($ext eq 'jl' or $ext eq 'el') {
             return "cat '$f' | head -400  > '${f}.comments'";
 #            return "$path/hashComments.pl -p ';' '$f'";;
         } elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) {
@@ -102,20 +91,17 @@ sub Determine_Comments_Extractor
     }
 }
 
-sub execute
-{
+sub execute {
     my ($c) = @_;
-#    print "\nTo execute [$c]\n";
     my $r = `$c`;
     my $status = ($? >> 8);
     die "execution of program [$c] failed: status [$status]" if ($status != 0);
     return $r;
 }
 
-
-sub get_size
-{
+sub get_size {
     my ($f) = @_;
     my $size = (stat($f))[7];
     return $size;
 }
+
diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl
index c83846b..baa9937 100755
--- a/extComments/hashComments.pl
+++ b/extComments/hashComments.pl
@@ -1,5 +1,4 @@
 #!/usr/bin/env perl
-
 #
 #    Copyright (C) 2009-2010  Yuki Manabe and Daniel M. German
 #
@@ -17,20 +16,18 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-
-# this is to extract the first <n> comments from any language that 
+# this is to extract the first <n> comments from any language that
 # uses the same prefix
 
 use Getopt::Std;
 
-
 # set parameters
 my %opts = ();
 if (!getopts ('vc:p:',\%opts)) {
-print STDERR "Usage $0 -v
+    print STDERR "Usage $0 -v
 
   -v verbose
-  -p comment char  
+  -p comment char
   -c count of comment blocks
 
 \n";
@@ -64,7 +61,7 @@ while (<>) {
     if (Is_Comment($_)) {
         s/\t/ /g;
         s/ +/ /g;
-        $comCount ++ if (not $insideComment);
+        $comCount++ if (not $insideComment);
         $insideComment = 1;
         /$commentChar+/;
         print OUT $' . "\n"; #'
@@ -72,18 +69,15 @@ while (<>) {
         print OUT "\n";
     } else {
         exit 0;
-    } 
+    }
 }
 
-
-sub Is_Comment
-{
+sub Is_Comment {
     my ($st) = @_;
     return  ($st =~ /^\s*$commentChar/);
 }
 
-sub Is_Blank
-{
+sub Is_Blank {
     my ($st) = @_;
     return ($st =~ /^\s*$/);
 }
diff --git a/filter/filter.pl b/filter/filter.pl
index 615c850..84269c6 100755
--- a/filter/filter.pl
+++ b/filter/filter.pl
@@ -16,7 +16,6 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-
 #
 # filter.pl
 # This script classify input sentences into two categories,
@@ -29,7 +28,6 @@
 #
 use strict;
 
-
 #print $ARGV[0];
 
 # where are we running the program from
@@ -44,7 +42,6 @@ die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/;
 
 my $goodfilename = $ARGV[0];
 
-
 die "Filename should end in '.sentences' [$goodfilename]" unless $goodfilename =~ s/\.sentences$/\.goodsent/;
 my $badfilename = $ARGV[0];
 $badfilename =~ s/\.sentences$/\.badsent/;
@@ -58,36 +55,37 @@ open (DICTIONARY, "<$critWords") or die ('Error: criticalword.dict is not found.
 open (GOODOUT, ">$goodfilename") || die ('Error');
 open (BADOUT, ">$badfilename") || die ('Error');
 
-my @cwordlist=();
+my @cwordlist = ();
 # read dictionary into list
 my $cword;
-while ($cword=<DICTIONARY>){
-  chomp $cword;
-  next if $cword =~ /^\#/;
-  $cword =~ s/\#.*$//; # remove everything to the end of file
-  push(@cwordlist,"$cword");
+while ($cword = <DICTIONARY>) {
+    chomp $cword;
+    next if $cword =~ /^\#/;
+    $cword =~ s/\#.*$//; # remove everything to the end of file
+    push(@cwordlist, "$cword");
 }
 close(DICTIONARY);
 
 #matching cliticalwords in list against sentences.
 my $sentence;
-while ($sentence=<INPUTFILE>){
-  my $check=0;
-  chomp $sentence;
-  foreach $cword (@cwordlist){
-    if($sentence =~ /\b$cword\b/i){
-      $check=1;
-      #print "$cword:$sentence";
-      last;
+while ($sentence = <INPUTFILE>) {
+    my $check = 0;
+    chomp $sentence;
+    foreach $cword (@cwordlist) {
+        if ($sentence =~ /\b$cword\b/i) {
+            $check = 1;
+            #print "$cword:$sentence";
+            last;
+        }
+    }
+    if ($check == 1) {
+        print GOODOUT "$sentence\n";
+    } else {
+        print BADOUT "$sentence\n";
     }
-  }
-  if ($check==1){
-    print GOODOUT "$sentence\n";
-  }else{
-     print BADOUT "$sentence\n";
-  }
 }
 
 close(INPUTFILE);
 close(GOODOUT);
 close(BADOUT);
+
diff --git a/matcher/matcher.pl b/matcher/matcher.pl
index 665cc77..e805b6c 100755
--- a/matcher/matcher.pl
+++ b/matcher/matcher.pl
@@ -1,5 +1,4 @@
 #!/usr/bin/env perl
-
 #
 #    Copyright (C) 2009-2010  Yuki Manabe and Daniel M. German
 #
@@ -20,7 +19,7 @@
 #
 # matchter.pl
 #
-# This script use a set of license sentence name as input 
+# This script use a set of license sentence name as input
 # and output license name corresponds to a rule which match the set.
 #
 # author: Yuki Manabe
@@ -116,32 +115,24 @@ my $path = $0;
 $path =~ s/[^\/]+$//;
 if ($path eq '') {
     $path = './';
-  }
+}
 
 my $rules= $path . 'rules.dict';
 my $interrules= $path . 'interrules.dict';
 
 die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/;
 
-
-# read rules
-
 my $countUnknowns = 0;
 
-
 # read the rules
-
 my @rulelist = Read_Rules($rules);
-
 my @interRuleList = Read_Inter_Rules($interrules);
 
-
-my @licSentNames=();
+my @licSentNames = ();
 my @original;
 
 Read_Original($ARGV[0], \@licSentNames, \@original);
 
-
 #foreach my $x (@licSentNames) {
 #    print "$x\n";
 #}
@@ -160,8 +151,8 @@ Read_Original($ARGV[0], \@licSentNames, \@original);
 
 # matching spdx requires to match strict licenses, with no alternatives...
 
-my $senttok= ',' . join(',',@licSentNames) . ',';
-my @result=();
+my $senttok = ',' . join(',', @licSentNames) . ',';
+my @result = ();
 my $countMatches = 0;
 
 print "[$senttok]\n" if $debug;
@@ -177,154 +168,136 @@ Match_License();
 #Print_Result();
 
 my $match = 0;
-for (my $i=0;$i<=$#licSentNames ;$i++) {
-    if ($licSentNames[$i] == 0 and 
-        ($licSentNames[$i] ne 'UNKNOWN'  and
+for (my $i = 0; $i <= $#licSentNames; $i++) {
+    if ($licSentNames[$i] == 0 and
+        ($licSentNames[$i] ne 'UNKNOWN' and
          $licSentNames[$i] ne '')) {
 #        print "[$licSentNames[$i]]\n";
         $licSentNames[$i] =~ s/Extrict$//;
-        $match ++;
+        $match++;
     }
 }
 
 #Print_Result();
 
-
 if ($match > 0) {
 #    print "REDO\n";
-    for (my $i=0;$i<=$#interRuleList ;$i++){
+    for (my $i = 0; $i <= $#interRuleList; $i++) {
         #for my $ref( @interRuleList[$i]){
         #  print "@$ref\n";
         #}
         #print $interRuleList[$i][0];
         @licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames;
     }
-    
-    $senttok= join(',',@licSentNames) . ',';
-    
+
+    $senttok = join(',', @licSentNames) . ',';
+
     Match_License();
 }
 
 Print_Result();
 
-
 exit 0;
 
-
-
 #print @licSentNames;
 #print join(';',@licSentNames)."\n";
 
-
 # 3. matching
 ###############################
 
 # we will iterate over rules, matching as many as we can...
 
-
-
-
-
-sub Is_Unknown
-{
+sub Is_Unknown {
     my ($s) = @_;
     my @f = split (/,/, $s);
     return $f[0] eq 'UNKNOWN';
 }
 
-
-sub Read_Rules
-{
+sub Read_Rules {
     my ($rulesF) = @_;
     open (RULES, "<$rulesF") or die ('Error: rules.dict is not found.');
     my $sentence;
     my @rules = ();
-    while ($sentence=<RULES>){
+    while ($sentence = <RULES>) {
         chomp $sentence;
-	# clean up spaces
-	$sentence=~ s/^\s+//;
-	$sentence=~ s/\s+$//;
-	$sentence=~ s/\s*,\s*/,/g;
-	$sentence=~ s/\s*:\s*/:/g;
+        # clean up spaces
+        $sentence =~ s/^\s+//;
+        $sentence =~ s/\s+$//;
+        $sentence =~ s/\s*,\s*/,/g;
+        $sentence =~ s/\s*:\s*/:/g;
         #check format
-        if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){
+        if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) {
             next;
         }
         $sentence =~ /(.*?):(.*)/;
-        push (@rules,[$1,$2]);
+        push (@rules, [$1, $2]);
     }
     close RULES;
     return @rules;
 }
 
-
-sub Read_Inter_Rules
-{
+sub Read_Inter_Rules {
     my ($interrules) = @_;
 
     my @list;
     open (IRULES, "<$interrules") or die ('Error: interrules.dict is not found.');
     my $sentence;
-    while ($sentence=<IRULES>){
+    while ($sentence = <IRULES>) {
         chomp $sentence;
         #check format
-        if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){
+        if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) {
             next;
         }
-        foreach my $item (split(/\|/,$2)){
-            push (@list,[$item,$1]);
+        foreach my $item (split(/\|/, $2)) {
+            push (@list, [$item, $1]);
         }
     }
     close IRULES;
     return @list;
 }
 
-sub Read_Original
-{
+sub Read_Original {
     my ($inputF, $tokens, $originals) = @_;
 
     open (INPUTFILE, $inputF) or die ("Error: $inputF is not found.");
-    
+
     my $sentence;
     my @original;
-    while ($sentence = <INPUTFILE>){
+    while ($sentence = <INPUTFILE>) {
         chomp $sentence;
-        my @fields = split(':',$sentence);
-        push(@$originals,$fields[1]);
+        my @fields = split(':', $sentence);
+        push(@$originals, $fields[1]);
         my @token = split(';', $fields[0]);
-        push(@$tokens,$token[0]);
+        push(@$tokens, $token[0]);
     }
     if (scalar(@$originals) == 0) {
         print "NONE\n";
         exit 0;
     }
-    
+
 #print join(';',@licSentNames)."\n";
-    
+
     close INPUTFILE;
 }
 
-sub Match_License
-{
-    
+sub Match_License {
 # create a string with the sentences
-    
-    for (my $j=0;$j<=$#rulelist;$j++){
-        
-        my $rule=$rulelist[$j][1];
-        my $rulename=$rulelist[$j][0];
+
+    for (my $j = 0; $j <= $#rulelist; $j++) {
+        my $rule = $rulelist[$j][1];
+        my $rulename = $rulelist[$j][0];
         my $lenRule = scalar(split(',', $rule));
         # replace rule with the length of the rule
-	print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
-        while ($senttok =~ s/,${rule},/,$lenRule,/){
-            $countMatches ++;
-            push (@result,$rulename);
+        print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
+        while ($senttok =~ s/,${rule},/,$lenRule,/) {
+            $countMatches++;
+            push (@result, $rulename);
 #        print ">>>>$senttok|$rulelist[$j][1]\n";
 #        print 'Result: ', join(',', @result);
 #        print "\n";
         }
     }
-    
+
 #    print ">>>>[$senttok]\n";
 
     my $onlyAllRight = 0;
@@ -333,7 +306,7 @@ sub Match_License
 #print STDERR "Ending>>>>>>>$senttok\n";
 #print STDERR 'Size>>' , scalar(@result), "\n";
 #print STDERR 'Result>>', join(',', @result), "\n";
-    
+
 # let us remove allrights
 #    my $onlyAllRight = 1;
 #    for my $i (0.. scalar(@licSentNames)-1){
@@ -345,11 +318,10 @@ sub Match_License
 #    }
 
 # output result
-    if (scalar(@result) > 0){
+    if (scalar(@result) > 0) {
         # at this point we have matched
-        
-        
-        # let us clean up the rules... let us print the matched rules, and the 
+
+        # let us clean up the rules... let us print the matched rules, and the
 #    if (grep(/GPL/, @result)) {
 #        print "GPL...\n";
 #        foreach my $r ($NonCriticalRules{GPL}) {
@@ -357,7 +329,6 @@ sub Match_License
 #        }
 #    }
         # general removal of rules
-        
 
         foreach my $r (@generalNonCritical) {
             while ($senttok =~ s/,$r,/,-1,/) {
@@ -365,7 +336,7 @@ sub Match_License
             }
         }
 #        print "[$senttok]\n";
-                
+
         foreach my $res (@result) {
             my $temp = $NonCriticalRules{$res};
             foreach my $r (@$temp) {
@@ -375,13 +346,11 @@ sub Match_License
                 }
             }
         }
-#        print "[$senttok]\n";        
+#        print "[$senttok]\n";
     }
 }
 
-
-sub Print_Result
-{
+sub Print_Result {
 #        $senttok =~ s/AllRights(,?)/$1/g;
 #        $senttok =~ s/UNKNOWN,/,/g;
 #        $senttok =~ s/,+/,/g;
@@ -392,7 +361,7 @@ sub Print_Result
     my @sections = split(',', $senttok);
     die 'assertion 1' if $sections[0] ne '';
     die 'assertion 2' if $sections[scalar(@sections)] ne '';
-    
+
     my $ignoredLines = 0;
     my $licenseLines = 0;
     my $unknownLines = 0;
@@ -404,20 +373,20 @@ sub Print_Result
         } elsif ($sections[$i] != 0) {
             $licenseLines += $sections[$i];
         } elsif ($sections[$i] eq 'UNKNOWN') {
-            $unknownLines ++;
+            $unknownLines++;
         } else {
             $unmatchedLines++;
         }
     }
     $senttok =~ s/^,(.*),$/$1/;
-    
+
 #    print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n";
     if (scalar (@result) == 0) {
-	print 'UNKNOWN';
+        print 'UNKNOWN';
     } else {
-	print join(',',@result);
+        print join(',',@result);
     }
     print ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n";
     $senttok = $save;
-    
 }
+
diff --git a/ninka.pl b/ninka.pl
index 990a5f5..a05c5af 100755
--- a/ninka.pl
+++ b/ninka.pl
@@ -42,15 +42,13 @@ Usage $0 -fCtTvcgsGd <filename>
 
   -L force creation of matching
 
-  -d delete intermediate files  
+  -d delete intermediate files
 
 \n";
 
     exit 1;
 }
 
-
-
 my $verbose = exists $opts{v};
 my $delete = exists $opts{d};
 #$delete = 1;
@@ -62,7 +60,6 @@ if ($path eq "") {
     $path = "./";
 }
 
-
 my $force = exists $opts{f};
 my $forceGood = exists $opts{G};
 my $forceSentences = exists $opts{S};
@@ -76,8 +73,6 @@ my $f = $ARGV[0];
 
 my $original = $f;
 
-
-
 print "Starting: $original;\n" if ($verbose);
 
 print "$original;";
@@ -92,28 +87,22 @@ if (not (-f "$f")) {
     exit 0;
 }
 
-
-Do_File_Process($original, $commentsFile, ($force or $forceComments), 
+Do_File_Process($original, $commentsFile, ($force or $forceComments),
                 "$path/extComments/extComments.pl -c1 '${original}'",
                 "Creating comments file",
                 exists $opts{c});
 
-
-Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences), 
+Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences),
                 "$path/splitter/splitter.pl '${commentsFile}'",
-                "Splitting sentences", exists $opts{s}
-    );
+                "Splitting sentences", exists $opts{s});
 
-Do_File_Process( $sentencesFile, $goodsentFile, ($force or $forceGood), 
+Do_File_Process($sentencesFile, $goodsentFile, ($force or $forceGood),
                  "$path/filter/filter.pl '${sentencesFile}'",
-                 "Filtering good sentences", exists $opts{s}
-    );
+                 "Filtering good sentences", exists $opts{s});
 
-Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok), 
+Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok),
                 "$path/senttok/senttok.pl '${goodsentFile}' > '${sentokFile}'",
-                "Matching sentences against rules", exists $opts{t}
-    );
-
+                "Matching sentences against rules", exists $opts{t});
 
 print "Matching ${f}.senttok against rules" if ($verbose);
 execute("$path/matcher/matcher.pl '${f}.senttok' > '${f}.license'");
@@ -132,18 +121,15 @@ if ($delete) {
 
 exit 0;
 
-
-
-sub Do_File_Process
-{
+sub Do_File_Process {
     my ($input, $output, $force, $cmd, $message, $end) = @_;
 
     print "${message}:" if ($verbose);
-    if ($force or newer($input, $output))  {
+    if ($force or newer($input, $output)) {
         print "Running ${cmd}:" if ($verbose);
         execute($cmd);
     } else {
-        print "File [$output] newer than input [$input], not creating:" if ($verbose);    
+        print "File [$output] newer than input [$input], not creating:" if ($verbose);
     }
     if ($end) {
         print "Existing after $message" if $verbose;
@@ -152,11 +138,7 @@ sub Do_File_Process
     }
 }
 
-
-
-
-sub execute
-{
+sub execute {
     my ($c) = @_;
 #    print "\nTo execute [$c]\n";
     my $r = `$c`;
@@ -165,8 +147,7 @@ sub execute
     return $r;
 }
 
-sub newer
-{
+sub newer {
     my ($f1, $f2) = @_;
     my ($f1write) = (stat($f1))[9];
     my ($f2write) = (stat($f2))[9];
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
index 8580324..e960e2e 100755
--- a/senttok/senttok.pl
+++ b/senttok/senttok.pl
@@ -15,8 +15,8 @@
 #    You should have received a copy of the GNU Affero General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
-use strict;
 
+use strict;
 
 my $TOO_LONG = 70;
 
@@ -32,7 +32,7 @@ open FH, "<$ARGV[0]";
 my @licensesentencelist=();
 open LICENSESENTENCEFILE, "<$licSentences";
 my $line;
-while ($line = <LICENSESENTENCEFILE>){
+while ($line = <LICENSESENTENCEFILE>) {
     chomp $line;
     next if $line =~ /^\#/;
     next if $line =~ /^ *$/;
@@ -44,7 +44,7 @@ while ($line = <LICENSESENTENCEFILE>){
 #  print $line;
 #}
 close LICENSESENTENCEFILE;
-while ($line = <>){
+while ($line = <>) {
     my $saveLine;
     my $originalLine;
     chomp $line;
@@ -56,14 +56,13 @@ while ($line = <>){
 
     $line = Normalize_Sentence($line);
 
-
-    my $check=0;
-    my $matchname="UNKNOWN";
-    my @parm=();
+    my $check = 0;
+    my $matchname = "UNKNOWN";
+    my @parm = ();
     my $sentence;
-    my $distance=1; #maximum? number
-    my $mostsimilarname="UNKNOWN";
-    my $before; 
+    my $distance = 1; #maximum? number
+    my $mostsimilarname = "UNKNOWN";
+    my $before;
     my $after;
     my $gpl = 0;
     my ($gplLater, $gplVersion);
@@ -74,8 +73,7 @@ while ($line = <>){
 #   [$line]
 #\n";
 
-
-    my $lineAsGPL ='';
+    my $lineAsGPL = '';
 
     if (Looks_Like_GPL($line)) {
         my $old = $line;
@@ -94,23 +92,23 @@ while ($line = <>){
         $gpl = $saveGPL;
         $LGPL = "";
       again:
-#       print "Testing 
+#       print "Testing
 #   lin[$line]
 #   ori[$saveLine]
 #   re [$regexp]
 #   lpg[$LGPL]
 #\n";
-        if ( $line =~ /$regexp/im ){
-            $before = $`; 
+        if ($line =~ /$regexp/im) {
+            $before = $`;
             $after = $'; #';
-            $check=1;
-            $matchname=$name;
-            for (my $i = 1; $i <= $number; $i++){
+            $check = 1;
+            $matchname = $name;
+            for (my $i = 1; $i <= $number; $i++) {
                 no strict 'refs';
-                push @parm,$$i;
+                push @parm, $$i;
             }
             last;
-        } else{
+        } else {
 #            print "NO MATCH\n";
             # let us try again in cas it is lesser/library
             # do it only once
@@ -124,45 +122,39 @@ while ($line = <>){
                 goto again;
             }
             next;## dmg
-            my $targetset=$regexp;
+            my $targetset = $regexp;
             $targetset =~ s/^(.*)$/$1/;
-            my $tmpdist=&levenshtein($line,$targetset)/max(length($targetset),length($sentence));
-            if ($tmpdist<$distance){
-                $mostsimilarname=$name;
-                $distance=$tmpdist;
+            my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence));
+            if ($tmpdist < $distance) {
+                $mostsimilarname = $name;
+                $distance = $tmpdist;
             }
         }
         last; ###
     }
-    if ($check == 1){
-        #licensesentence name, parm1, parm2,..
+    if ($check == 1) {
+        # licensesentence name, parm1, parm2,..
         if ($gpl) {
             $matchname .= "Ver" . $gplVersion;
             $matchname .= "+" if $gplLater;
             $matchname = $LGPL . $matchname;
-        } else {
         }
-        if (length($before)>$TOO_LONG ||
-            length($after) >$TOO_LONG) {
+        if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) {
             $matchname .= "-TOOLONG";
         }
-        my $parmstrings=join(";",$matchname, $subRule, $before, $after, @parm);
-        print $parmstrings,":$originalLine\n";
-
-        
-    }else{
-        #UNKNOWN, sentence
+        my $parmstrings = join(";",$matchname, $subRule, $before, $after, @parm);
+        print $parmstrings, ":$originalLine\n";
+    } else {
+        # UNKNOWN, sentence
         chomp $line;
-        print $matchname,";",0, ";", $mostsimilarname,";",$distance,";",$saveLine,":$originalLine\n";
-    } 
-    
+        print $matchname, ";", 0, ";", $mostsimilarname, ";", $distance, ";", $saveLine, ":$originalLine\n";
+    }
 }
 
 close FH;
 exit 0;
 
-sub Normalize_GPL
-{
+sub Normalize_GPL {
     my ($line) = @_;
     my $later = 0;
     my $version = 0;
@@ -202,7 +194,6 @@ sub Normalize_GPL
     $line =~ s/GPL \(GPL\)/GPL/gi;
     $line =~ s/GPL \(<QUOTES>GPL<QUOTES>\)/GPL/gi;
 
-
     $line =~ s/GNU //gi;
     $line =~ s/under GPL/under the GPL/gi;
     $line =~ s/under Lesser/under the Lesser/gi;
@@ -231,8 +222,7 @@ sub Normalize_GPL
     return ($line,$later,$version);
 }
 
-sub Looks_Like_GPL
-{
+sub Looks_Like_GPL {
     my ($line) = @_;
 
     return 1 if $line =~ /GNU/;
@@ -242,18 +232,16 @@ sub Looks_Like_GPL
     return 0;
 }
 
-
-sub Normalize_Sentence
-{
+sub Normalize_Sentence {
     my ($line) = @_;
     # do some very quick spelling corrections for english/british words
-    $line=~ s/icence/icense/ig;
-    $line=~ s/(\.|;)$//;
+    $line =~ s/icence/icense/ig;
+    $line =~ s/(\.|;)$//;
 
     return $line;
 }
 
-# Return the Levenshtein distance (also called Edit distance) 
+# Return the Levenshtein distance (also called Edit distance)
 # between two strings
 #
 # The Levenshtein distance (LD) is a measure of similarity between two
@@ -272,22 +260,21 @@ sub Normalize_Sentence
 # The distance is named after the Russian scientist Vladimir
 # Levenshtein, who devised the algorithm in 1965
 #
-sub levenshtein
-  {
+sub levenshtein {
     # $s1 and $s2 are the two strings
     # $len1 and $len2 are their respective lengths
     #
     my ($s1, $s2) = @_;
     my ($len1, $len2) = (length $s1, length $s2);
-    
+
     # If one of the strings is empty, the distance is the length
     # of the other string
     #
     return $len2 if ($len1 == 0);
     return $len1 if ($len2 == 0);
-    
+
     my %mat;
-    
+
     # Init the distance matrix
     #
     # The first row to 0..$len1
@@ -297,35 +284,31 @@ sub levenshtein
     # The first row and column are initialized so to denote distance
     # from the empty string
     #
-    for (my $i = 0; $i <= $len1; ++$i)
-      {
-        for (my $j = 0; $j <= $len2; ++$j)
-	  {
+    for (my $i = 0; $i <= $len1; ++$i) {
+        for (my $j = 0; $j <= $len2; ++$j) {
             $mat{$i}{$j} = 0;
             $mat{0}{$j} = $j;
-	  }
-	
+      }
+
         $mat{$i}{0} = $i;
       }
-    
+
     # Some char-by-char processing is ahead, so prepare
     # array of chars from the strings
     #
     my @ar1 = split(//, $s1);
     my @ar2 = split(//, $s2);
-    
-    for (my $i = 1; $i <= $len1; ++$i)
-      {
-        for (my $j = 1; $j <= $len2; ++$j)
-	  {
+
+    for (my $i = 1; $i <= $len1; ++$i) {
+        for (my $j = 1; $j <= $len2; ++$j) {
             # Set the cost to 1 iff the ith char of $s1
             # equals the jth of $s2
-            # 
+            #
             # Denotes a substitution cost. When the char are equal
             # there is no need to substitute, so the cost is 0
             #
             my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-	    
+
             # Cell $mat{$i}{$j} equals the minimum of:
             #
             # - The cell immediately above plus 1
@@ -338,36 +321,30 @@ sub levenshtein
             $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
                                 $mat{$i}{$j-1} + 1,
                                 $mat{$i-1}{$j-1} + $cost]);
-	  }
-      }
-    
+        }
+    }
+
     # Finally, the Levenshtein distance equals the rightmost bottom cell
     # of the matrix
     #
     # Note that $mat{$x}{$y} denotes the distance between the substrings
     # 1..$x and 1..$y
-    #
     return $mat{$len1}{$len2};
-  }
-  
-  
-  # minimal element of a list
-  #
-  sub min
-    {
-      my @list = @{$_[0]};
-      my $min = $list[0];
-      
-      foreach my $i (@list)
-	{
-	  $min = $i if ($i < $min);
-	}
-      
-      return $min;
-    }
-    
-    sub max{
-      my @list = @_;
-      return $list[0]>$list[1]?$list[0]:$list[1];
+}
+
+sub min {
+    my @list = @{$_[0]};
+    my $min = $list[0];
+
+    foreach my $i (@list) {
+        $min = $i if ($i < $min);
     }
-    
+
+    return $min;
+}
+
+sub max {
+    my @list = @_;
+    return $list[0] > $list[1] ? $list[0] : $list[1];
+}
+
diff --git a/splitter/splitter.pl b/splitter/splitter.pl
index 55a9d36..69039ad 100755
--- a/splitter/splitter.pl
+++ b/splitter/splitter.pl
@@ -1,8 +1,8 @@
 #!/usr/bin/env perl
-# 
+#
 #***************************************************************************************************
-# 		SENTENCE SPLITTER
-# Author:	Paul Clough 	{cloughie@dcs.shef.ac.uk}
+#       SENTENCE SPLITTER
+# Author:   Paul Clough     {cloughie@dcs.shef.ac.uk}
 #
 
 # This program is originally based on the sentence splitter program
@@ -26,15 +26,12 @@
 #    You should have received a copy of the GNU Affero General Public License
 #    along with this patch.  If not, see <http://www.gnu.org/licenses/>.
 
-
-
 use strict;
 
 # This program is originally based on the sentence splitter program
 # published by Paul Clough. Version 1.0, but then it was mostly rewritten
 # His ideas, however, linger in here (and his dictionary of abbreviations)
 
-
 my $dictionary = 'splitter.dict';
 my $abbrv_file = 'splitter.abv';
 my $len = 0;
@@ -53,12 +50,10 @@ $abbrv_file = $path . $abbrv_file;
 
 die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/;
 
-
-die "Input file name should end in '.comments' [$output_file]" unless  $output_file =~ s/\.comments$/.sentences/;
+die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
 
 open(OUT, ">$output_file") or die("Unable to create output file [$output_file]");
 
-
 #print length($opt_o);
 
 # Load in the dictionary and find the common words.
@@ -70,15 +65,15 @@ open(OUT, ">$output_file") or die("Unable to create output file [$output_file]")
 &loadAbbreviations;
 
 my $text;
-#	open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
+#   open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
 
 my $line;
 while (defined ($line = <>)) {
-    $text.= $line;
+    $text .= $line;
 }
 
 # append a newline just in case
-$text.="\n";
+$text .= "\n";
 
 # - is used to create lines
 # = is used to create lines
@@ -96,11 +91,9 @@ $text =~ s@^[ \t]*/\*@@gmx;
 $text =~ s/\*\/[ \t]*$//gmx;
 $text =~ s@([^:])// @$1@gmx;
 
-
 # Replace /\r\n/ with \n only
 $text =~ s/\r\n/\n/g;
 
-
 # now, try to replace the leading/ending character of each line #/-, at most 3 heading characters
 # and each repeated as many times as necessaary
 $text =~ s/^[ \t]{0,3}[\*\#\/\;]+//gmx;
@@ -128,7 +121,7 @@ $text .= "\n";
 # this gets us in big trouble... licenses that have numeric abbreviations
 $text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g;
 
-while ($text =~ /^([^\n]*)\n/gsm ) {
+while ($text =~ /^([^\n]*)\n/gsm) {
     my $curr = $1;
 #    print "<<$curr\n<<\n";
 
@@ -145,7 +138,7 @@ while ($text =~ /^([^\n]*)\n/gsm ) {
 
     foreach my $s (@sentences) {
         for my $i (0..length($s)-1) {
-            my $c = substr($s,$i,1);
+            my $c = substr($s, $i, 1);
             $count2++ if ($c ge 'A' && $c le 'z');
         }
         print OUT Clean_Sentence($s) , "\n";
@@ -164,17 +157,11 @@ close OUT;
 
 exit;
 
-
-
-
-#***************************************************************************************************
-
 #***************************************************************************************************
 # procedures
 #***************************************************************************************************
 
-sub Clean_Sentence
-{
+sub Clean_Sentence {
     ($_) = @_;
     # check for trailing bullets of different types
 
@@ -198,12 +185,9 @@ sub Clean_Sentence
     die if /\n/m;
 
     return $_;
-
 }
 
-
-sub Split_Text
-{
+sub Split_Text {
     my ($text) = @_;
     my $len = 0;
     my $next_word;
@@ -213,7 +197,7 @@ sub Split_Text
     my @result;
     my $after;
     my $currentSentence = '';
-    # this breaks the sentence into 
+    # this breaks the sentence into
     # 1. Any text before a separator
     # 2. The separator [.!?:\n]
     # 3.
@@ -222,19 +206,19 @@ sub Split_Text
                      ([\.\!\?\:\n])
                      (?=(.?))
                    /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) {
-	$text = $';    #';
+        $text = $';    #';
         my $sentenceMatch = $1;
-        my $sentence = $1 . $2; 
-	my $punctuation = $2;
+        my $sentence = $1 . $2;
+        my $punctuation = $2;
         $after = $3;
-        
+
         # if next character is not a space, then we are not in a sentence"
         if ($after ne ' ' && $after ne "\t") {
             $currentSentence .= $sentence;
             next;
         }
         #at this point we know that there is a space after
-        if ($punctuation eq ':'  || $punctuation eq '?'  || $punctuation eq '!') {
+        if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
             # let us consider this right here a beginning of a sentence
             push @result, $currentSentence . $sentence;
             $currentSentence = '';
@@ -248,46 +232,41 @@ sub Split_Text
             # simple heuristic... let us check that the next words are not the beginning of a sentence
             # in our library
             # ENDTODO
-            
+
             # is the last word an abbreviation? For this the period has to follow the word
             # this expression might have to be updated to take care of special characters  in names :(
             if ($sentenceMatch =~ /(.?)([^[:punct:]\s]+)$/) {
                 my $before = $1;
                 my $lastWord = $2;
                 #is it an abbreviation
-                
+
                 if (length($lastWord) == 1 ) {
                     # single character abbreviations are special...
-                    # we will assume they never split the sentence if they are capitalized. 
-                    if (($lastWord ge 'A') and
-                        ($lastWord le 'Z'))  {
+                    # we will assume they never split the sentence if they are capitalized.
+                    if (($lastWord ge 'A') and ($lastWord le 'Z')) {
                         $currentSentence .= $sentence;
                         next;
                     }
                     print "last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
 
                     # but some are lowercase!
-                    if (($lastWord eq 'e') or
-                        ($lastWord eq 'i'))  {
+                    if (($lastWord eq 'e') or ($lastWord eq 'i')) {
                         $currentSentence .= $sentence;
                         next;
                     }
                     print "2 last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n";
                 } else {
-                    
                     $lastWord = lc $lastWord;
-                    
+
                     # only accept abbreviations if the previous char to the abbrev is space or
                     # is empty (beginning of line). This avoids things like .c
                     if (length($before) > 0 and $before eq ' ' and  $ABBREVIATIONS{$lastWord}) {
-                        
                         $currentSentence .= $sentence;
                         next;
                     } else {
                         # just keep going, we handle this case below
                     }
                 }
-                
             }
 
             push @result, $currentSentence . $sentence;
@@ -297,51 +276,40 @@ sub Split_Text
         die 'We have not dealt with this case';
     }
     push @result, $currentSentence . $text;
-    
+
     #Print_Non_Sentence($text,"\n",'');
     return @result;
-
 }
 
 sub loadDictionary {
-
-	# Initialise var
-    my $common_term = '';	
+    my $common_term = '';
 
     if (open(DICT, $dictionary)) {
-
         while (defined ($line = <DICT>)) {
             chomp($line);
             if ($line !~ /^[A-Z]/) {
                 $COMMON_TERMS{$line} = 1;
             }
-            
-        }		
-		
+        }
+
         close(DICT);
     } else {
-        die "cannot open dictionary file $dictionary: $!";		
+        die "cannot open dictionary file $dictionary: $!";
     }
 }
 
-sub loadAbbreviations 
-{
-    
-    # Initialise var
-    my $abbrv_term = '';	
-    
+sub loadAbbreviations {
+    my $abbrv_term = '';
+
     if (open(ABBRV, $abbrv_file)) {
-        
         while (defined ($line = <ABBRV>)) {
             chomp($line);
-            $ABBREVIATIONS{$line} = $line;	
-        }		
-        
+            $ABBREVIATIONS{$line} = $line;
+        }
+
         close(ABBRV);
     } else {
-        die "cannot open dictionary file $abbrv_file: $!";		
+        die "cannot open dictionary file $abbrv_file: $!";
     }
 }
 
-
-#***************************************************************************************************
author	René Scheibe <rene.scheibe@gmail.com>	2014-09-03 15:56:17 +0200
committer	René Scheibe <rene.scheibe@gmail.com>	2014-09-03 15:56:17 +0200
commit	8b20ac6d9016be8d5e66d791a6d17cfd0429781f (patch)
tree	74424be03ff503d5c058cf08f8fd8b7c1694d11a
parent	525e3e973cf3bd388e4cffa3054cb2b753c545c7 (diff)
parent	334865e50c2af0d9ce1c5be8385ad984d5c62346 (diff)
download	ninka-8b20ac6d9016be8d5e66d791a6d17cfd0429781f.tar.gz