massive cleanup

* consistent script descriptions * consistent cmdline parameter parsing with Getopt::Std * remove some commented out & unused code * fix some issues reported by perlcritic * better code formatting * and much more...
author: René Scheibe <rene.scheibe@gmail.com> 2014-09-10 15:12:04 +0200
committer: René Scheibe <rene.scheibe@gmail.com> 2014-09-12 19:55:15 +0200
commit: 43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc (patch)
tree: bdb3524e3d54e17255032afe2a47ec3e80fc7dba
parent: 949dc6984decd81e51e264af34ee7cbc117e5e4a (diff)
download: ninka-43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc.tar.gz
7 files changed, 481 insertions, 472 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl
index d7a65d5..1f840a8 100755
--- a/extComments/extComments.pl
+++ b/extComments/extComments.pl
@@ -16,87 +16,91 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-use Getopt::Std;
-use strict;
-
-my $path = $0;
+#
+# extComments.pl
+#
+# This script extracts comments from source code.
+# If no comment extractor is known for a language, then extracts top lines from source.
+#
 
-$path =~ s/\/+[^\/]+$//;
-if ($path eq '') {
-    $path = './';
-}
+use strict;
+use warnings;
+use Getopt::Std;
 
-# set parameters
+# parse cmdline parameters
 my %opts = ();
-if (!getopts ('vc:p:',\%opts)) {
-    print STDERR "Usage $0 -v
+if (!getopts('v', \%opts) or scalar(@ARGV) == 0) {
+    print STDERR "Usage: $0 [OPTIONS] <filename>
 
-  -v verbose
-  -c count of comment blocks
+Options:
+  -v verbose\n";
 
-\n";
-
-    die;
+    exit 1;
 }
 
-my $file = $ARGV[0];
+my $verbose = exists $opts{v};
 
-#die "illegal file [$file]"  if $file =~ m@/\.@;
+my $path = get_my_path($0);
 
-my $number_comments = 1;
-$number_comments = $opts{c} if exists $opts{c};
-my $verbose = 1;
-$verbose = exists $opts{v};
+my $input_file = $ARGV[0];
+my $comments_file = "$input_file.comments";
 
-if (get_size($file) == 0) {
-    print STDERR "Empty file, just exit\n" if $verbose;
-    exit 0; # nothing to report, just end
+my $comments_cmd = determine_comments_cmd($input_file, $comments_file);
+execute($comments_cmd);
+if ($comments_cmd =~ /^comments/ and get_size($comments_file) == 0) {
+    $comments_cmd = create_head_cmd($input_file, $comments_file, 700);
+    execute($comments_cmd);
 }
 
-my $comments_cmd = determine_comments_extractor($file);
-
-execute("$comments_cmd");
+exit 0;
 
-if ($comments_cmd =~ /^comments/ and get_size("${file}.comments") == 0) {
-    `cat '$file' | head -700  > ${file}.comments`;
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
+    }
+    return $path;
 }
 
-exit 0;
-
-sub determine_comments_extractor {
-    my ($file) = @_;
-    if ($file =~ /\.([^\.]+)$/) {
-        my $ext= $1;
+sub determine_comments_cmd {
+    my ($input_file, $comments_file) = @_;
 
+    if ($input_file =~ /\.([^\.]+)$/) {
+        my $ext = $1;
         if ($ext =~ /^(pl|pm|py)$/) {
-            # for the time being, let us just extract the top 400 lines
-            return "cat '$file' | head -400  > '${file}.comments'";
-#            return "$path/hashComments.pl -p '#' '$file'";
-        } elsif ($ext eq 'jl' or $ext eq 'el') {
-            return "cat '$file' | head -400  > '${file}.comments'";
-#            return "$path/hashComments.pl -p ';' '$file'";;
+            return create_head_cmd($input_file, $comments_file, 400);
+#            return "$path/hashComments.pl -p '#' '$input_file'";
+        } elsif ($ext =~ /^(jl|el)$/) {
+            return create_head_cmd($input_file, $comments_file, 400);
+#            return "$path/hashComments.pl -p ';' '$input_file'";;
         } elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) {
-            my $comments_cmd_location = `which comments`;
-            if ($comments_cmd_location ne '') {
-                return "comments -c1 '$file' 2> /dev/null";
+            my $comments_binary = 'comments';
+            if (`which $comments_binary` ne '') {
+                return "$comments_binary -c1 '$input_file' 2> /dev/null";
             } else {
-                return "cat '$file' | head -400  > '${file}.comments'";
+                return create_head_cmd($input_file, $comments_file, 400);
             }
         } else {
-            return "cat '$file' | head -700  > '${file}.comments'";
+            return create_head_cmd($input_file, $comments_file, 700);
         }
     } else {
-        print "\n>>>>>>>>>>>>>>>>>>>>>\n";
-        return "cat '$file' | head -700  > '${file}.comments'";
+        return create_head_cmd($input_file, $comments_file, 700);
     }
 }
 
+sub create_head_cmd {
+    my ($input_file, $output_file, $count_lines) = @_;
+    return "head -$count_lines '$input_file' > '$output_file'";
+}
+
 sub execute {
-    my ($c) = @_;
-    my $r = `$c`;
+    my ($cmd) = @_;
+    my $result = `$cmd`;
     my $status = ($? >> 8);
-    die "execution of program [$c] failed: status [$status]" if ($status != 0);
-    return $r;
+    die "execution of program [$cmd] failed: status [$status]" if ($status != 0);
+    return $result;
 }
 
 sub get_size {
diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl
index a32e283..77c3863 100755
--- a/extComments/hashComments.pl
+++ b/extComments/hashComments.pl
@@ -16,69 +16,75 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-# this is to extract the first <n> comments from any language that
-# uses the same prefix
+#
+# hashComments.pl
+#
+# This script extracts the first <n> comments from any language that uses the same prefix.
+#
 
+use strict;
+use warnings;
 use Getopt::Std;
 
-# set parameters
+# parse cmdline parameters
 my %opts = ();
-if (!getopts ('vc:p:',\%opts)) {
-    print STDERR "Usage $0 -v
+if (!getopts ('vc:p:', \%opts)) {
+    print STDERR "Usage: $0 [OPTIONS] <filename>
 
+Options:
   -v verbose
   -p comment char
-  -c count of comment blocks
-
-\n";
+  -c count of comment blocks\n";
 
-    die;
+    exit 1;
 }
-my $file = $ARGV[0];
-
-open (OUT, ">${file}.comments") or die "Unable to create [${file}.comments]";
-
- <>;
-print OUT unless /^\#\!/;
-
-my $comment_char = '#';
-
-$comment_char = $opts{p} if exists $opts{p};
-
-my $comments_count = 1;
-$comments_count = $opts{c} if exists $opts{c};
 
 my $verbose = exists $opts{v};
 
+my $comment_char   = exists $opts{p} ? $opts{p} : '#';
+my $comments_count = exists $opts{c} ? $opts{c} : 1;
+
 my $inside_comment = 0;
 my $inside_code = 0;
 
 my $comment_count = 0;
 my $code_count = 0;
 
-while (<>) {
-    chomp;
-    if (is_comment($_)) {
-        s/\t/ /g;
-        s/ +/ /g;
+my $input_file = $ARGV[0];
+my $comments_file = "$input_file.comments";
+
+open my $input_fh, '>', $input_file or die "can't open input file [$input_file]";
+open my $comments_fh, '>', $comments_file or die "can't create output file [$comments_file]";
+
+<$input_fh>;
+print $comments_fh unless /^\#\!/;
+
+while (my $line = <$input_fh>) {
+    chomp $line;
+    if (is_comment($line)) {
+        $line =~ s/\t/ /g;
+        $line =~ s/ +/ /g;
         $comment_count++ if (not $inside_comment);
         $inside_comment = 1;
-        /$comment_char+/;
-        print OUT $' . "\n"; #'
-    } elsif (is_blank($_)) {
-        print OUT "\n";
+        $line =~ /$comment_char+/;
+        print $comments_fh substr($line, $+[0]) . "\n";
+    } elsif (is_blank($line)) {
+        print $comments_fh "$line\n";
     } else {
-        exit 0;
+        last;
     }
 }
 
+close $input_fh;
+close $comments_fh;
+
 sub is_comment {
-    my ($st) = @_;
-    return  ($st =~ /^\s*$comment_char/);
+    my ($string) = @_;
+    return ($string =~ /^\s*$comment_char/);
 }
 
 sub is_blank {
-    my ($st) = @_;
-    return ($st =~ /^\s*$/);
+    my ($string) = @_;
+    return ($string =~ /^\s*$/);
 }
 
diff --git a/filter/filter.pl b/filter/filter.pl
index 7b0fafd..26edb0d 100755
--- a/filter/filter.pl
+++ b/filter/filter.pl
@@ -18,74 +18,85 @@
 
 #
 # filter.pl
-# This script classify input sentences into two categories,
-# good sentences and bad sentences.
-# This script regard a sentence include a critical word (ex. legal term) as good
 #
-# usage: filter.pl (inputfilename)
-#
-# Author: Yuki Manabe
+# This script classifies input sentences into two categories, good sentences and bad sentences.
+# A sentence including a critical word (ex. legal term) is regarded as good.
 #
+
 use strict;
+use warnings;
+use Getopt::Std;
 
-#print $ARGV[0];
+my $INPUT_FILE_EXTENSION = 'sentences';
 
-# where are we running the program from
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
-    $path = './';
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+    print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+    exit 1;
 }
-my $file_critical_words = $path . 'criticalword.dict';
 
-die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/;
+my $path = get_my_path($0);
 
-my $file_good = $ARGV[0];
+my $input_file = $ARGV[0];
+my $file_critical_words = "$path/criticalword.dict";
 
-die "Filename should end in '.sentences' [$file_good]" unless $file_good =~ s/\.sentences$/\.goodsent/;
-my $file_bad = $ARGV[0];
-$file_bad =~ s/\.sentences$/\.badsent/;
+my $file_good = $input_file; $file_good =~ s/\.$INPUT_FILE_EXTENSION$/\.goodsent/;
+my $file_bad  = $input_file; $file_bad  =~ s/\.$INPUT_FILE_EXTENSION$/\.badsent/;
 
-#print $file_good;
-#print $file_bad;
+open my $input_fh, '<', $input_file or die "can't open input file [$input_file]: $!";
 
-open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found.");
-open (DICTIONARY, "<$file_critical_words") or die ('Error: criticalword.dict is not found.');
+open my $good_fh, '>', $file_good or die "can't create good sentences file [$file_good]: $!";
+open my $bad_fh, '>', $file_bad or die "can't create bad sentences file [$file_bad]: $!";
 
-open (GOODOUT, ">$file_good") || die ('Error');
-open (BADOUT, ">$file_bad") || die ('Error');
-
-my @critical_words = ();
-# read dictionary into list
-my $critical_word;
-while ($critical_word = <DICTIONARY>) {
-    chomp $critical_word;
-    next if $critical_word =~ /^\#/;
-    $critical_word =~ s/\#.*$//; # remove everything to the end of file
-    push(@critical_words, "$critical_word");
-}
-close(DICTIONARY);
+my @critical_words = read_critical_words($file_critical_words);
 
-#matching cliticalwords in list against sentences.
-my $sentence;
-while ($sentence = <INPUTFILE>) {
-    my $check = 0;
+# matching critical words in list against sentences
+while (my $sentence = <$input_fh>) {
     chomp $sentence;
-    foreach $critical_word (@critical_words) {
+    next unless $sentence;
+    my $check = 0;
+    foreach my $critical_word (@critical_words) {
         if ($sentence =~ /\b$critical_word\b/i) {
             $check = 1;
-            #print "$critical_word:$sentence";
             last;
         }
     }
-    if ($check == 1) {
-        print GOODOUT "$sentence\n";
+    if ($check) {
+        print $good_fh "$sentence\n";
     } else {
-        print BADOUT "$sentence\n";
+        print $bad_fh "$sentence\n";
+    }
+}
+
+close $input_fh;
+close $good_fh;
+close $bad_fh;
+
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
     }
+    return $path;
 }
 
-close(INPUTFILE);
-close(GOODOUT);
-close(BADOUT);
+sub read_critical_words {
+    my ($file) = @_;
+    my @critical_words = ();
+
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+    while (my $line = <$fh>) {
+        chomp $line;
+        next if $line =~ /^\#/;
+        $line =~ s/\#.*$//; # remove everything to the end of line
+        push @critical_words, $line;
+    }
+
+    close $fh;
+
+    return @critical_words;
+}
 
diff --git a/matcher/matcher.pl b/matcher/matcher.pl
index 9d1e74b..8eae92f 100755
--- a/matcher/matcher.pl
+++ b/matcher/matcher.pl
@@ -19,21 +19,19 @@
 #
 # matchter.pl
 #
-# This script use a set of license sentence name as input
-# and output license name corresponds to a rule which match the set.
-#
-# author: Yuki Manabe
-#
-# usage: matchter.pl (inputfilename)
+# This script uses a set of license sentence names as input and
+# outputs license names corresponds to a rule which match the set.
 #
 
 use strict;
+#use warnings;
+use Getopt::Std;
+
 my $debug = 0;
 
 my %NON_CRITICAL_RULES = ();
 
 # these should go into a file, but for the time being, let us keep them here
-
 # once we have matched a rule, these are not that important
 
 my @GENERAL_NON_CRITICAL = ('AllRights');
@@ -57,10 +55,10 @@ $NON_CRITICAL_RULES{'LibraryGPLv3+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LibraryGPLv3'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LibraryGPLv2+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LibraryGPLv2'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3','LesserGPLcopyVer3','SeeFileVer3'];
+$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3', 'LesserGPLcopyVer3', 'SeeFileVer3'];
 $NON_CRITICAL_RULES{'LesserGPLv2.1+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LesserGPLv2.1'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'LGPLv2orv3'}= [@GPL_NON_CRITICAL];
+$NON_CRITICAL_RULES{'LGPLv2orv3'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LesserGPLv2'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LesserGPLv2+'} = [@GPL_NON_CRITICAL];
 
@@ -70,85 +68,68 @@ $NON_CRITICAL_RULES{'GPLv1+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'GPLv1'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'GPLv3+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'GPLv3'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0'];
-$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0'];
+$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0'];
+$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0'];
 $NON_CRITICAL_RULES{'GPLnoVersion'} = [@GPL_NON_CRITICAL];
 
 $NON_CRITICAL_RULES{'Apachev1.1'} = ['ApacheLic1_1'];
-$NON_CRITICAL_RULES{'Apachev2'}   = ['ApachePre','ApacheSee'];
+$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre', 'ApacheSee'];
 
-$NON_CRITICAL_RULES{'LibGCJLic'}  = ['LibGCJSee'];
-$NON_CRITICAL_RULES{'CDDLicV1'}  = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance'];
-$NON_CRITICAL_RULES{'CDDLic'}  = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance'];
+$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee'];
+$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance'];
+$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance'];
 
-$NON_CRITICAL_RULES{'CDDLorGPLv2'}= ['CDDLorGPLv2doNotAlter','AllRights','useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include','CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2'];
+$NON_CRITICAL_RULES{'CDDLorGPLv2'} = ['CDDLorGPLv2doNotAlter', 'AllRights', 'useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include', 'CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2'];
 
 $NON_CRITICAL_RULES{'CPLv1orGPLv2+orLGPLv2+'} = ['licenseBlockBegin', 'licenseBlockEnd'];
 
-$NON_CRITICAL_RULES{'Qt'} = ['Copyright','qtNokiaExtra','QTNokiaContact', 'qtDiaTems'];
-$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1','qtLGPLv2.1where'];
-$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0','qtReviewGPLVer3','qtGPLwhere'];
+$NON_CRITICAL_RULES{'Qt'} = ['Copyright', 'qtNokiaExtra', 'QTNokiaContact', 'qtDiaTems'];
+$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1', 'qtLGPLv2.1where'];
+$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0', 'qtReviewGPLVer3', 'qtGPLwhere'];
 $NON_CRITICAL_RULES{'digiaQTExceptionNoticeVer1.1'} = ['qtDigiaExtra'];
 
-$NON_CRITICAL_RULES{'MPLv1_0'}  = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'MPLv1_1'}  = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'NPLv1_1'}  = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'NPLv1_0'}  = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
+$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
 
-$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN','subversionHistory'];
-$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN','subversionHistory'];
+$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN', 'subversionHistory'];
+$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN', 'subversionHistory'];
 $NON_CRITICAL_RULES{'tmate+'} = ['SeeFileSVN'];
 
 $NON_CRITICAL_RULES{'openSSLvar2'} = ['BSDcondAdvPart2'];
 
-$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin','MPLsee','Copyright','licenseBlockEnd','ApacheLicWherePart1','MPLwarranty', 'MPLwarrantyVar'];
-$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0','MPL_LGPLseeVer0'];
+$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin', 'MPLsee', 'Copyright', 'licenseBlockEnd', 'ApacheLicWherePart1', 'MPLwarranty', 'MPLwarrantyVar'];
+$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0', 'MPL_LGPLseeVer0'];
 
 $NON_CRITICAL_RULES{'FreeType'} = ['FreeTypeNotice'];
 
 $NON_CRITICAL_RULES{'GPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL];
 $NON_CRITICAL_RULES{'LGPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL];
 
-# initialize
+my $INPUT_FILE_EXTENSION = 'senttok';
 
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
-    $path = './';
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+    print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+    exit 1;
 }
 
-my $rules_file = $path . 'rules.dict';
-my $interrules_file = $path . 'interrules.dict';
+my $path = get_my_path($0);
 
-die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/;
+my $input_file = $ARGV[0];
+my $rules_file = "$path/rules.dict";
+my $interrules_file = "$path/interrules.dict";
 
-my $count_unknowns = 0;
+my @license_sentence_names = ();
+my @originals = ();
+read_original($input_file, \@license_sentence_names, \@originals);
 
-# read the rules
 my @rules = read_rules($rules_file);
 my @inter_rules = read_inter_rules($interrules_file);
 
-my @license_sentence_names = ();
-my @original;
-
-read_original($ARGV[0], \@license_sentence_names, \@original);
-
-#foreach my $x (@license_sentence_names) {
-#    print "$x\n";
-#}
-#exit;
-
-#foreach my $x (@original) {
-#    print "$x\n";
-#}
-#exit;
-
 ##########################################
 
-#for my $ref( @inter_rules ){
-#   print "@$ref\n";
-#}
-
 # matching spdx requires to match strict licenses, with no alternatives...
 
 my $senttok = ',' . join(',', @license_sentence_names) . ',';
@@ -159,9 +140,9 @@ print "[$senttok]\n" if $debug;
 match_license();
 
 # do we have to check again?
-## todo, verifythat we have unmatched sentences...
+## todo, verify that we have unmatched sentences...
 
-@license_sentence_names = split(',', $senttok);
+@license_sentence_names = split ',', $senttok;
 
 # first remove the extrict part from it
 
@@ -172,15 +153,12 @@ for (my $i = 0; $i <= $#license_sentence_names; $i++) {
     if ($license_sentence_names[$i] == 0 and
         ($license_sentence_names[$i] ne 'UNKNOWN' and
          $license_sentence_names[$i] ne '')) {
-#        print "[$license_sentence_names[$i]]\n";
         $license_sentence_names[$i] =~ s/Extrict$//;
         $match++;
     }
 }
 
-#print_result();
-
-if ($match > 0) {
+if ($match) {
 #    print "REDO\n";
     for (my $i = 0; $i <= $#inter_rules; $i++) {
         #for my $ref( @inter_rules[$i]){
@@ -199,99 +177,104 @@ print_result();
 
 exit 0;
 
-#print @license_sentence_names;
-#print join(';',@license_sentence_names)."\n";
-
-# 3. matching
-###############################
-
-# we will iterate over rules, matching as many as we can...
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
+    }
+    return $path;
+}
 
 sub is_unknown {
     my ($s) = @_;
-    my @f = split (/,/, $s);
+    my @f = split /,/, $s;
     return $f[0] eq 'UNKNOWN';
 }
 
 sub read_rules {
     my ($file) = @_;
-    open (RULES, "<$file") or die ('Error: rules.dict is not found.');
-    my $sentence;
     my @rules = ();
-    while ($sentence = <RULES>) {
-        chomp $sentence;
+
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+    while (my $line = <$fh>) {
+        chomp $line;
         # clean up spaces
-        $sentence =~ s/^\s+//;
-        $sentence =~ s/\s+$//;
-        $sentence =~ s/\s*,\s*/,/g;
-        $sentence =~ s/\s*:\s*/:/g;
-        #check format
-        if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) {
+        $line =~ s/^\s+//;
+        $line =~ s/\s+$//;
+        $line =~ s/\s*,\s*/,/g;
+        $line =~ s/\s*:\s*/:/g;
+        # check format
+        if ($line =~ /^#/ || $line !~ /(.*):(.*,)*(.*)/) {
             next;
         }
-        $sentence =~ /(.*?):(.*)/;
-        push (@rules, [$1, $2]);
+        $line =~ /(.*?):(.*)/;
+        push @rules, [$1, $2];
     }
-    close RULES;
+
+    close $fh;
+
     return @rules;
 }
 
 sub read_inter_rules {
     my ($file) = @_;
+    my @inter_rules = ();
 
-    my @inter_rules;
-    open (IRULES, "<$file") or die ('Error: interrules.dict is not found.');
-    my $sentence;
-    while ($sentence = <IRULES>) {
-        chomp $sentence;
-        #check format
-        if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) {
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+    while (my $line = <$fh>) {
+        chomp $line;
+        # check format
+        if ($line =~ /^#/ || $line !~ /(.*?):(.*)/) {
             next;
         }
-        foreach my $item (split(/\|/, $2)) {
-            push (@inter_rules, [$item, $1]);
+        foreach my $item (split /\|/, $2) {
+            push @inter_rules, [$item, $1];
         }
     }
-    close IRULES;
+
+    close $fh;
+
     return @inter_rules;
 }
 
 sub read_original {
     my ($file, $tokens, $originals) = @_;
 
-    open (INPUTFILE, $file) or die ("Error: $file is not found.");
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
 
-    my $sentence;
-    my @original;
-    while ($sentence = <INPUTFILE>) {
+    while (my $sentence = <$fh>) {
         chomp $sentence;
-        my @fields = split(':', $sentence);
-        push(@$originals, $fields[1]);
-        my @token = split(';', $fields[0]);
-        push(@$tokens, $token[0]);
+        my @fields = split ':', $sentence;
+        push @$originals, $fields[1];
+        my @token = split ';', $fields[0];
+        push @$tokens, $token[0];
     }
+
+    close $fh;
+
     if (scalar(@$originals) == 0) {
         print "NONE\n";
         exit 0;
     }
-
-#print join(';',@license_sentence_names)."\n";
-
-    close INPUTFILE;
 }
 
+# we will iterate over rules, matching as many as we can...
 sub match_license {
 # create a string with the sentences
 
     for (my $j = 0; $j <= $#rules; $j++) {
         my $rule = $rules[$j][1];
         my $rulename = $rules[$j][0];
-        my $rule_length = scalar(split(',', $rule));
+        my $rule_length = scalar(split ',', $rule);
         # replace rule with the length of the rule
         print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
-        while ($senttok =~ s/,${rule},/,$rule_length,/) {
+        while ($senttok =~ s/,$rule,/,$rule_length,/) {
             $count_matches++;
-            push (@result, $rulename);
+            push @result, $rulename;
 #        print ">>>>$senttok|$rules[$j][1]\n";
 #        print 'Result: ', join(',', @result);
 #        print "\n";
@@ -351,14 +334,12 @@ sub match_license {
 }
 
 sub print_result {
-#        $senttok =~ s/AllRights(,?)/$1/g;
-#        $senttok =~ s/UNKNOWN,/,/g;
-#        $senttok =~ s/,+/,/g;
+#   $senttok =~ s/AllRights(,?)/$1/g;
+#   $senttok =~ s/UNKNOWN,/,/g;
+#   $senttok =~ s/,+/,/g;
 
     my $save = $senttok;
-    # ok, so now, what I want to output it:
-    # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown
-    my @sections = split(',', $senttok);
+    my @sections = split ',', $senttok;
     die 'assertion 1' if $sections[0] ne '';
     die 'assertion 2' if $sections[scalar(@sections)] ne '';
 
@@ -367,7 +348,6 @@ sub print_result {
     my $unknown_lines = 0;
     my $unmatched_lines = 0;
     foreach my $i (1..scalar(@sections)-1) {
-#        print "$i;$sections[$i]\n";
         if ($sections[$i] < 0) {
             $ignored_lines += - $sections[$i];
         } elsif ($sections[$i] != 0) {
@@ -380,12 +360,13 @@ sub print_result {
     }
     $senttok =~ s/^,(.*),$/$1/;
 
-#    print "$ignored_lines > $license_lines > $unknown_lines > $unmatched_lines\n";
     if (scalar (@result) == 0) {
         print 'UNKNOWN';
     } else {
-        print join(',',@result);
+        print join ',', @result;
     }
+    # ok, so now, what I want to output is:
+    # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown
     print ";$count_matches;$license_lines;$ignored_lines;$unmatched_lines;$unknown_lines;$senttok\n";
     $senttok = $save;
 }
diff --git a/ninka.pl b/ninka.pl
index 5f53fc6..99619e4 100755
--- a/ninka.pl
+++ b/ninka.pl
@@ -17,15 +17,19 @@
 #
 
 use strict;
+use warnings;
 use Getopt::Std;
 
+# parse cmdline parameters
 my %opts = ();
-if (!getopts ("vfCcSsGgTtLd",\%opts) or scalar(@ARGV) == 0) {
-print STDERR "Ninka version 1.1
+if (!getopts('vfCcSsGgTtLd', \%opts) or scalar(@ARGV) == 0) {
+    print STDERR "Ninka version 1.1
 
-Usage $0 -fCtTvcgsGd <filename>
+Usage: $0 [OPTIONS] <filename>
 
+Options:
   -v verbose
+
   -f force all processing
 
   -C force creation of comments
@@ -42,95 +46,95 @@ Usage $0 -fCtTvcgsGd <filename>
 
   -L force creation of matching
 
-  -d delete intermediate files
-
-\n";
+  -d delete intermediate files\n";
 
     exit 1;
 }
 
 my $verbose = exists $opts{v};
-my $delete = exists $opts{d};
-#$delete = 1;
+my $delete  = exists $opts{d};
 
-my $path = $0;
-
-$path =~ s/\/+[^\/]+$//;
-if ($path eq "") {
-    $path = "./";
-}
-
-my $force = exists $opts{f};
-my $force_good = exists $opts{G};
+my $force           = exists $opts{f};
+my $force_comments  = exists $opts{C};
 my $force_sentences = exists $opts{S};
-my $force_senttok = exists $opts{T};
-my $force_comments = exists $opts{C};
-my $force_license = exists $opts{L};
+my $force_good      = exists $opts{G};
+my $force_senttok   = exists $opts{T};
+my $force_license   = exists $opts{L};
 
-#die "Usage $0 <filename>" unless $ARGV[0] =~ /\.(c|cpp|java|cc|cxx|h|jl|py|pm|el|pl)$/;
+my $path = get_my_path($0);
 
 my $input_file = $ARGV[0];
 
-print "Starting: $input_file;\n" if ($verbose);
+my $comments_file  = "$input_file.comments";
+my $sentences_file = "$input_file.sentences";
+my $goodsent_file  = "$input_file.goodsent";
+my $badsent_file   = "$input_file.badsent";
+my $senttok_file   = "$input_file.senttok";
+my $license_file   = "$input_file.license";
 
+print "Starting: $input_file;\n" if $verbose;
 print "$input_file;";
 
-my $comments_file = "${input_file}.comments";
-my $sentences_file = "${input_file}.sentences";
-my $goodsent_file = "${input_file}.goodsent";
-my $senttok_file = "${input_file}.senttok";
-
-if (not (-f "$input_file")) {
-    print "ERROR;[${input_file}] is not a file\n" ;
-    exit 0;
+if (not (-f $input_file)) {
+    print "ERROR;[$input_file] is not a file\n";
+    exit 1;
 }
 
-do_file_process($input_file, $comments_file, ($force or $force_comments),
-                "$path/extComments/extComments.pl -c1 '${input_file}'",
-                "Creating comments file",
-                exists $opts{c});
+process_file($input_file, $comments_file, ($force or $force_comments),
+             "$path/extComments/extComments.pl '$input_file'",
+             'Creating comments file', exists $opts{c});
 
-do_file_process($comments_file, $sentences_file, ($force or $force_sentences),
-                "$path/splitter/splitter.pl '${comments_file}'",
-                "Splitting sentences", exists $opts{s});
+process_file($comments_file, $sentences_file, ($force or $force_sentences),
+             "$path/splitter/splitter.pl '$comments_file'",
+             'Splitting sentences', exists $opts{s});
 
-do_file_process($sentences_file, $goodsent_file, ($force or $force_good),
-                 "$path/filter/filter.pl '${sentences_file}'",
-                 "Filtering good sentences", exists $opts{s});
+process_file($sentences_file, $goodsent_file, ($force or $force_good),
+             "$path/filter/filter.pl '$sentences_file'",
+             'Filtering good sentences', exists $opts{s});
 
-do_file_process($goodsent_file, $senttok_file, ($force or $force_senttok),
-                "$path/senttok/senttok.pl '${goodsent_file}' > '${senttok_file}'",
-                "Matching sentences against rules", exists $opts{t});
+process_file($goodsent_file, $senttok_file, ($force or $force_senttok),
+             "$path/senttok/senttok.pl '$goodsent_file' > '$senttok_file'",
+             'Matching sentences against rules', exists $opts{t});
 
-print "Matching ${input_file}.senttok against rules" if ($verbose);
-execute("$path/matcher/matcher.pl '${input_file}.senttok' > '${input_file}.license'");
+process_file($senttok_file, $license_file, ($force or $force_license),
+             "$path/matcher/matcher.pl '$senttok_file' > '$license_file'",
+             'Matching sentence tokens against rules', 0);
 
-print `cat '${input_file}.license'`;
-
-unlink("${input_file}.code");
+print `cat '$license_file'`;
 
 if ($delete) {
-    unlink("${input_file}.badsent");
-    unlink("${input_file}.comments");
-    unlink("${input_file}.goodsent");
-#    unlink("${input_file}.sentences");
-    unlink("${input_file}.senttok");
+    unlink $comments_file;
+    unlink $sentences_file;
+    unlink $goodsent_file;
+    unlink $badsent_file;
+    unlink $senttok_file;
 }
 
 exit 0;
 
-sub do_file_process {
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
+    }
+    return $path;
+}
+
+sub process_file {
     my ($input, $output, $force, $cmd, $message, $end) = @_;
 
-    print "${message}:" if ($verbose);
+    print "$message:" if $verbose;
     if ($force or is_newer($input, $output)) {
-        print "Running ${cmd}:" if ($verbose);
+        print "Running $cmd:" if $verbose;
         execute($cmd);
     } else {
-        print "File [$output] newer than input [$input], not creating:" if ($verbose);
+        print "File [$output] newer than input [$input], not creating:" if $verbose;
     }
+
     if ($end) {
-        print "Existing after $message" if $verbose;
+        print "Exiting after $message" if $verbose;
         print "\n";
         exit 0;
     }
@@ -138,7 +142,6 @@ sub do_file_process {
 
 sub execute {
     my ($command) = @_;
-#    print "\nTo execute [$command]\n";
     my $result = `$command`;
     my $status = ($? >> 8);
     die "execution of program [$command] failed: status [$status]" if ($status != 0);
@@ -147,8 +150,9 @@ sub execute {
 
 sub is_newer {
     my ($f1, $f2) = @_;
-    my ($f1write) = (stat($f1))[9];
-    my ($f2write) = (stat($f2))[9];
+    my $f1write = (stat $f1)[9];
+    my $f2write = (stat $f2)[9];
+
     if (defined $f1write and defined $f2write) {
         return $f1write > $f2write;
     } else {
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
index 5bef3c5..1fa104d 100755
--- a/senttok/senttok.pl
+++ b/senttok/senttok.pl
@@ -16,35 +16,36 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
+#
+# senttok.pl
+#
+# This script creates a file that corresponds to the recognized sentence tokens.
+# For each sentence, it outputs its sentence token, or unknown otherwise.
+#
+
 use strict;
+#use warnings;
+use Getopt::Std;
 
-my $TOO_LONG = 70;
+my $INPUT_FILE_EXTENSION = 'goodsent';
 
-# where are we running the splitter from?
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq "") {
-    $path = "./";
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+    print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+    exit 1;
 }
-my $path_license_sentences = $path . "licensesentence.dict";
 
-open FH, "<$ARGV[0]";
-my @license_sentences = ();
-open LICENSESENTENCEFILE, "<$path_license_sentences";
-my $line;
-while ($line = <LICENSESENTENCEFILE>) {
-    chomp $line;
-    next if $line =~ /^\#/;
-    next if $line =~ /^ *$/;
-    die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/;
-    push @license_sentences, $line;
-}
+my $TOO_LONG = 70;
+
+my $path = get_my_path($0);
+
+my $input_file = $ARGV[0];
+my $license_sentences_file = "$path/licensesentence.dict";
 
-#foreach $line (@license_sentences) {
-#  print $line;
-#}
-close LICENSESENTENCEFILE;
-while ($line = <>) {
+my @license_sentences = read_license_sentences($license_sentences_file);
+
+open my $input_fh, '<', $input_file or die "can't open file [$input_file]: $!";
+while (my $line = <$input_fh>) {
     my $save_line;
     my $original_line;
     chomp $line;
@@ -57,11 +58,10 @@ while ($line = <>) {
     $line = normalize_sentence($line);
 
     my $check = 0;
-    my $match_name = "UNKNOWN";
-    my @parm = ();
-    my $sentence;
+    my $match_name = 'UNKNOWN';
+    my @parameters = ();
     my $distance = 1; #maximum? number
-    my $most_similar_name = "UNKNOWN";
+    my $most_similar_name = 'UNKNOWN';
     my $before;
     my $after;
     my $gpl = 0;
@@ -69,10 +69,6 @@ while ($line = <>) {
 
     $save_line = $line;
 
-#        print "Original
-#   [$line]
-#\n";
-
     my $line_as_gpl = '';
 
     if (looks_like_gpl($line)) {
@@ -82,35 +78,28 @@ while ($line = <>) {
         $line_as_gpl = $line;
     }
     my ($name, $sub_rule, $number, $regexp, $option);
-    my $save_line = $line;
+    $save_line = $line;
     my $save_gpl = $gpl;
-    my $LGPL = "";
-    foreach $sentence (@license_sentences) {
-        ($name, $sub_rule, $number, $regexp, $option) = split(/:/, $sentence);
+    my $LGPL = '';
+    foreach my $sentence (@license_sentences) {
+        ($name, $sub_rule, $number, $regexp, $option) = split /:/, $sentence;
         # we need this due to the goto again
         $line = $save_line;
         $gpl = $save_gpl;
-        $LGPL = "";
+        $LGPL = '';
       again:
-#       print "Testing
-#   lin[$line]
-#   ori[$save_line]
-#   re [$regexp]
-#   lpg[$LGPL]
-#\n";
         if ($line =~ /$regexp/im) {
             $before = $`;
-            $after = $'; #';
+            $after = $';
             $check = 1;
             $match_name = $name;
             for (my $i = 1; $i <= $number; $i++) {
                 no strict 'refs';
-                push @parm, $$i;
+                push @parameters, $$i;
             }
             last;
         } else {
-#            print "NO MATCH\n";
-            # let us try again in cas it is lesser/library
+            # let us try again in case it is lesser/library
             # do it only once
             if ($gpl and $line =~ s/(Lesser|Library) GPL/GPL/i) {
                 $LGPL = $1;
@@ -121,7 +110,7 @@ while ($line = <>) {
                 $line = $save_line;
                 goto again;
             }
-            next;## dmg
+            next;
             my $targetset = $regexp;
             $targetset =~ s/^(.*)$/$1/;
             my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence));
@@ -130,30 +119,42 @@ while ($line = <>) {
                 $distance = $tmpdist;
             }
         }
-        last; ###
+        last;
     }
-    if ($check == 1) {
-        # licensesentence name, parm1, parm2,..
+    if ($check) {
+        # licensesentence name, param1, param2, ...
         if ($gpl) {
-            $match_name .= "Ver" . $gpl_version;
-            $match_name .= "+" if $gpl_later;
+            $match_name .= 'Ver' . $gpl_version;
+            $match_name .= '+' if $gpl_later;
             $match_name = $LGPL . $match_name;
         }
         if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) {
-            $match_name .= "-TOOLONG";
+            $match_name .= '-TOOLONG';
         }
-        my $parmstrings = join(";", $match_name, $sub_rule, $before, $after, @parm);
-        print $parmstrings, ":$original_line\n";
+        # TODO: Use of uninitialized value in @parameters
+        my $parameter_string = join ';', $match_name, $sub_rule, $before, $after, @parameters;
+        print $parameter_string, ":$original_line\n";
     } else {
         # UNKNOWN, sentence
         chomp $line;
-        print $match_name, ";", 0, ";", $most_similar_name, ";", $distance, ";", $save_line, ":$original_line\n";
+        my $parameter_string = join ';', $match_name, 0, $most_similar_name, $distance, $save_line;
+        print $parameter_string, ":$original_line\n";
     }
 }
+close $input_fh;
 
-close FH;
 exit 0;
 
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
+    }
+    return $path;
+}
+
 sub normalize_gpl {
     my ($line) = @_;
     my $later = 0;
@@ -177,7 +178,6 @@ sub normalize_gpl {
     }
     if ($line =~ s/(version|v\.?) ([123\.0]+)/<VERSION>/i) {
         $version = $2;
-#        print "Version [$version]\n";
     }
     if ($line =~ s/GPL ?[v\-]([123\.0]+)/GPL <VERSION>/i) {
         $version = $1;
@@ -217,9 +217,7 @@ sub normalize_gpl {
     $line =~ s/ +/ /;
     $line =~ s/ +$//;
 
-#    print ">>>>>>>>>>$line:$later:$version\n";
-
-    return ($line,$later,$version);
+    return ($line, $later, $version);
 }
 
 sub looks_like_gpl {
@@ -296,8 +294,8 @@ sub levenshtein {
     # Some char-by-char processing is ahead, so prepare
     # array of chars from the strings
     #
-    my @ar1 = split(//, $s1);
-    my @ar2 = split(//, $s2);
+    my @ar1 = split //, $s1;
+    my @ar2 = split //, $s2;
 
     for (my $i = 1; $i <= $len1; ++$i) {
         for (my $j = 1; $j <= $len2; ++$j) {
@@ -348,3 +346,22 @@ sub max {
     return $list[0] > $list[1] ? $list[0] : $list[1];
 }
 
+sub read_license_sentences {
+    my ($file) = @_;
+    my @license_sentences = ();
+
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+    while (my $line = <$fh>) {
+        chomp $line;
+        next if $line =~ /^\#/;
+        next if $line =~ /^ *$/;
+        die "illegal format in license expression [$line]" unless $line =~ /(.*?):(.*?):(.*)/;
+        push @license_sentences, $line;
+    }
+
+    close $fh;
+
+    return @license_sentences;
+}
+
diff --git a/splitter/splitter.pl b/splitter/splitter.pl
index c806f96..dfffa83 100755
--- a/splitter/splitter.pl
+++ b/splitter/splitter.pl
@@ -5,11 +5,14 @@
 # Author:   Paul Clough     {cloughie@dcs.shef.ac.uk}
 #
 
-# This program is originally based on the sentence splitter program
-# published by Paul Clough. Version 1.0,  available form
-# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip)
-# The original program without a license.
-
+#    This program is originally based on the sentence splitter program
+#    published by Paul Clough. Version 1.0, available from
+#    http://ir.shef.ac.uk/cloughie/software.html (splitter.zip)
+#    The original program is without a license.
+#
+#    It was mostly rewritten.
+#    His ideas, however, linger in here (and his file of abbreviations)
+#
 #    Modifications to the original by Daniel M German and Y. Manabe,
 #    which are under the following license:
 #
@@ -25,52 +28,37 @@
 #
 #    You should have received a copy of the GNU Affero General Public License
 #    along with this patch.  If not, see <http://www.gnu.org/licenses/>.
+#
 
-use strict;
-
-# This program is originally based on the sentence splitter program
-# published by Paul Clough. Version 1.0, but then it was mostly rewritten
-# His ideas, however, linger in here (and his dictionary_file of abbreviations)
-
-my $dictionary_file = 'splitter.dict';
-my $abbreviations_file = 'splitter.abv';
-my $length = 0;
-my %COMMON_TERMS = ();
-my %ABBREVIATIONS = ();
-my $output_file = $ARGV[0];
-
-# where are we running the splitter from?
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
-    $path = './';
-}
-$dictionary_file = $path . $dictionary_file;
-$abbreviations_file = $path . $abbreviations_file;
+#
+# splitter.pl
+#
+# This script breaks comments into sentences.
+#
 
-die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/;
+use strict;
+use warnings;
+use Getopt::Std;
 
-die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
+my $INPUT_FILE_EXTENSION = 'comments';
 
-open(OUT, ">$output_file") or die("Unable to create output file [$output_file]");
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+    print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+    exit 1;
+}
 
-#print length($opt_o);
+my $path = get_my_path($0);
 
-# Load in the dictionary and find the common words.
-# Here, we assume the words in upper case are simply names and one
-# word per line - i.e. in same form as /usr/dict/words
-&load_dictionary;
+my $input_file = $ARGV[0];
+my $abbreviations_file = "$path/splitter.abv";
 
-# Same assumptions as for dictionary
-&load_abbreviations;
+my $output_file = $input_file; $output_file =~ s/\.$INPUT_FILE_EXTENSION$/\.sentences/;
 
-my $text;
-#   open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
+my $text = read_file_as_string($input_file);
+my %abbreviations = load_abbreviations($abbreviations_file);
 
-my $line;
-while (defined ($line = <>)) {
-    $text .= $line;
-}
+open my $output_fh, '>', $output_file or die "can't create output file [$output_file]: $!";
 
 # append a newline just in case
 $text .= "\n";
@@ -82,7 +70,7 @@ $text =~ s@={3,1000}@ @gmx;
 $text =~ s@:{3,1000}@ @gmx;
 $text =~ s@\*{3,1000}@ @gmx;
 
-# some characters are used for prettyprinting but never appear in sentences
+# some characters are used for pretty-printing but never appear in sentences
 $text =~ s@\|+@ @gmx;
 $text =~ s@\\+@ @gmx;
 
@@ -91,7 +79,7 @@ $text =~ s@^[ \t]*/\*@@gmx;
 $text =~ s/\*\/[ \t]*$//gmx;
 $text =~ s@([^:])// @$1@gmx;
 
-# Replace /\r\n/ with \n only
+# replace /\r\n/ with \n only
 $text =~ s/\r\n/\n/g;
 
 # now, try to replace the leading/ending character of each line #/-, at most 3 heading characters
@@ -109,9 +97,9 @@ $text =~ s/[\*\#]+//gmx;
 $text =~ s/^[ \t]+$/\n/gm;
 
 # let us try the following trick
-# We first get rid of \t and replace it with ' '
-# we then use \t as a "single line separator" and \n as multiple line.
-# so we can match each with a single character.
+# we first get rid of \t and replace it with ' '
+# we then use \t as a "single line separator" and \n as multiple line
+# so we can match each with a single character
 $text =~ tr/\t/ /;
 
 $text =~ s/\n(?!\n)/\t/g;
@@ -123,61 +111,66 @@ $text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g;
 
 while ($text =~ /^([^\n]*)\n/gsm) {
     my $curr = $1;
-#    print "<<$curr\n<<\n";
 
     # let us count the number of alphabetic chars to check if we are skipping anything we should not
-    my $count = 0;
+    my $count1 = 0;
     for my $i (0..length($curr)-1) {
-        my $c = substr($curr,$i,1);
-        $count++ if ($c ge 'A' && $c le 'z');
+        my $c = substr($curr, $i, 1);
+        $count1++ if ($c ge 'A' && $c le 'z');
     }
 
     my @sentences = split_text($curr);
 
     my $count2 = 0;
-
     foreach my $sentence (@sentences) {
         for my $i (0..length($sentence)-1) {
             my $c = substr($sentence, $i, 1);
             $count2++ if ($c ge 'A' && $c le 'z');
         }
-        print OUT clean_sentence($sentence) , "\n";
+        my $clean_sentence = clean_sentence($sentence);
+        next unless $clean_sentence;
+        print $output_fh $clean_sentence, "\n";
     }
-    if ($count != $count2) {
+
+    if ($count1 != $count2) {
         print STDERR "-------------------------------------\n";
         print STDERR "[$curr]\n";
         foreach my $sentence (@sentences) {
-            print STDERR clean_sentence($sentence) , "\n";
+            print STDERR clean_sentence($sentence), "\n";
         }
-        die "Number of printable chars does not match! [$count][$count2]";
+        die "Number of printable chars does not match! [$count1][$count2]";
     }
 }
-close OUT;
-#print "$text\n";
+close $output_fh;
 
-exit;
+exit 0;
 
-#***************************************************************************************************
-# procedures
-#***************************************************************************************************
+sub get_my_path {
+    my ($self) = @_;
+    my $path = $self;
+    $path =~ s/\/+[^\/]+$//;
+    if ($path eq '') {
+        $path = './';
+    }
+    return $path;
+}
 
 sub clean_sentence {
     ($_) = @_;
-    # check for trailing bullets of different types
 
+    # check for trailing bullets of different types
     s/^o //;
     s/^\s*[0-9]{1-2}+\s*[\-\)]//;
     s/^[ \t]+//;
     s/[ \t]+$//;
+
     # remove a trailing -
     s/^[ \t]*[\-\.\s*] +//;
 
-    # replace quotes
     s/\s+/ /g;
 
     s/['"`]+/<quotes>/g;
 
-
     s/:/<colon>/g;
 
     s/\.+$/./;
@@ -198,15 +191,15 @@ sub split_text {
     my $after;
     my $current_sentence = '';
     # this breaks the sentence into
-    # 1. Any text before a separator
-    # 2. The separator [.!?:\n]
+    # 1. any text before a separator
+    # 2. the separator [.!?:\n]
     # 3.
-    while ($text =~ /^ 
+    while ($text =~ /^
                      ([^\.\!\?\:\n]*) #
                      ([\.\!\?\:\n])
                      (?=(.?))
                    /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) {
-        $text = $';    #';
+        $text = $';
         my $sentence_match = $1;
         my $sentence = $1 . $2;
         my $punctuation = $2;
@@ -217,7 +210,7 @@ sub split_text {
             $current_sentence .= $sentence;
             next;
         }
-        #at this point we know that there is a space after
+        # at this point we know that there is a space after
         if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
             # let us consider this right here a beginning of a sentence
             push @result, $current_sentence . $sentence;
@@ -231,7 +224,7 @@ sub split_text {
             # TODO
             # simple heuristic... let us check that the next words are not the beginning of a sentence
             # in our library
-            # ENDTODO
+            # END TODO
 
             # is the last word an abbreviation? For this the period has to follow the word
             # this expression might have to be updated to take care of special characters  in names :(
@@ -260,7 +253,7 @@ sub split_text {
 
                     # only accept abbreviations if the previous char to the abbrev is space or
                     # is empty (beginning of line). This avoids things like .c
-                    if (length($before) > 0 and $before eq ' ' and  $ABBREVIATIONS{$last_word}) {
+                    if (length($before) > 0 and $before eq ' ' and $abbreviations{$last_word}) {
                         $current_sentence .= $sentence;
                         next;
                     } else {
@@ -277,39 +270,32 @@ sub split_text {
     }
     push @result, $current_sentence . $text;
 
-    #print_non_sentence($text,"\n",'');
     return @result;
 }
 
-sub load_dictionary {
-    my $common_term = '';
+sub read_file_as_string {
+    my $file = shift;
 
-    if (open(DICT, $dictionary_file)) {
-        while (defined ($line = <DICT>)) {
-            chomp($line);
-            if ($line !~ /^[A-Z]/) {
-                $COMMON_TERMS{$line} = 1;
-            }
-        }
+    open my $fh, '<', $file or die "can't open file '$file': $!";
+    my $content = do { local $/; <$fh> };
+    close $fh or die "can't close file '$file': $!";
 
-        close(DICT);
-    } else {
-        die "cannot open dictionary file $dictionary_file: $!";
-    }
+    return $content;
 }
 
 sub load_abbreviations {
-    my $abbrv_term = '';
+    my ($file) = @_;
+    my %abbreviations = ();
 
-    if (open(ABBRV, $abbreviations_file)) {
-        while (defined ($line = <ABBRV>)) {
-            chomp($line);
-            $ABBREVIATIONS{$line} = $line;
-        }
+    open my $fh, '<', $file or die "can't open file [$file]: $!";
 
-        close(ABBRV);
-    } else {
-        die "cannot open abbreviations file $abbreviations_file: $!";
+    while (my $line = <$fh>) {
+        chomp $line;
+        $abbreviations{$line} = $line;
     }
+
+    close $fh;
+
+    return %abbreviations;
 }
author	René Scheibe <rene.scheibe@gmail.com>	2014-09-10 15:12:04 +0200
committer	René Scheibe <rene.scheibe@gmail.com>	2014-09-12 19:55:15 +0200
commit	43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc (patch)
tree	bdb3524e3d54e17255032afe2a47ec3e80fc7dba
parent	949dc6984decd81e51e264af34ee7cbc117e5e4a (diff)
download	ninka-43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc.tar.gz