summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRené Scheibe <rene.scheibe@gmail.com>2014-09-10 15:12:04 +0200
committerRené Scheibe <rene.scheibe@gmail.com>2014-09-12 19:55:15 +0200
commit43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc (patch)
treebdb3524e3d54e17255032afe2a47ec3e80fc7dba
parent949dc6984decd81e51e264af34ee7cbc117e5e4a (diff)
downloadninka-43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc.tar.gz
massive cleanup
* consistent script descriptions * consistent cmdline parameter parsing with Getopt::Std * remove some commented out & unused code * fix some issues reported by perlcritic * better code formatting * and much more...
-rwxr-xr-xextComments/extComments.pl110
-rwxr-xr-xextComments/hashComments.pl78
-rwxr-xr-xfilter/filter.pl105
-rwxr-xr-xmatcher/matcher.pl215
-rwxr-xr-xninka.pl126
-rwxr-xr-xsenttok/senttok.pl143
-rwxr-xr-xsplitter/splitter.pl176
7 files changed, 481 insertions, 472 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl
index d7a65d5..1f840a8 100755
--- a/extComments/extComments.pl
+++ b/extComments/extComments.pl
@@ -16,87 +16,91 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-use Getopt::Std;
-use strict;
-
-my $path = $0;
+#
+# extComments.pl
+#
+# This script extracts comments from source code.
+# If no comment extractor is known for a language, then extracts top lines from source.
+#
-$path =~ s/\/+[^\/]+$//;
-if ($path eq '') {
- $path = './';
-}
+use strict;
+use warnings;
+use Getopt::Std;
-# set parameters
+# parse cmdline parameters
my %opts = ();
-if (!getopts ('vc:p:',\%opts)) {
- print STDERR "Usage $0 -v
+if (!getopts('v', \%opts) or scalar(@ARGV) == 0) {
+ print STDERR "Usage: $0 [OPTIONS] <filename>
- -v verbose
- -c count of comment blocks
+Options:
+ -v verbose\n";
-\n";
-
- die;
+ exit 1;
}
-my $file = $ARGV[0];
+my $verbose = exists $opts{v};
-#die "illegal file [$file]" if $file =~ m@/\.@;
+my $path = get_my_path($0);
-my $number_comments = 1;
-$number_comments = $opts{c} if exists $opts{c};
-my $verbose = 1;
-$verbose = exists $opts{v};
+my $input_file = $ARGV[0];
+my $comments_file = "$input_file.comments";
-if (get_size($file) == 0) {
- print STDERR "Empty file, just exit\n" if $verbose;
- exit 0; # nothing to report, just end
+my $comments_cmd = determine_comments_cmd($input_file, $comments_file);
+execute($comments_cmd);
+if ($comments_cmd =~ /^comments/ and get_size($comments_file) == 0) {
+ $comments_cmd = create_head_cmd($input_file, $comments_file, 700);
+ execute($comments_cmd);
}
-my $comments_cmd = determine_comments_extractor($file);
-
-execute("$comments_cmd");
+exit 0;
-if ($comments_cmd =~ /^comments/ and get_size("${file}.comments") == 0) {
- `cat '$file' | head -700 > ${file}.comments`;
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
+ }
+ return $path;
}
-exit 0;
-
-sub determine_comments_extractor {
- my ($file) = @_;
- if ($file =~ /\.([^\.]+)$/) {
- my $ext= $1;
+sub determine_comments_cmd {
+ my ($input_file, $comments_file) = @_;
+ if ($input_file =~ /\.([^\.]+)$/) {
+ my $ext = $1;
if ($ext =~ /^(pl|pm|py)$/) {
- # for the time being, let us just extract the top 400 lines
- return "cat '$file' | head -400 > '${file}.comments'";
-# return "$path/hashComments.pl -p '#' '$file'";
- } elsif ($ext eq 'jl' or $ext eq 'el') {
- return "cat '$file' | head -400 > '${file}.comments'";
-# return "$path/hashComments.pl -p ';' '$file'";;
+ return create_head_cmd($input_file, $comments_file, 400);
+# return "$path/hashComments.pl -p '#' '$input_file'";
+ } elsif ($ext =~ /^(jl|el)$/) {
+ return create_head_cmd($input_file, $comments_file, 400);
+# return "$path/hashComments.pl -p ';' '$input_file'";;
} elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) {
- my $comments_cmd_location = `which comments`;
- if ($comments_cmd_location ne '') {
- return "comments -c1 '$file' 2> /dev/null";
+ my $comments_binary = 'comments';
+ if (`which $comments_binary` ne '') {
+ return "$comments_binary -c1 '$input_file' 2> /dev/null";
} else {
- return "cat '$file' | head -400 > '${file}.comments'";
+ return create_head_cmd($input_file, $comments_file, 400);
}
} else {
- return "cat '$file' | head -700 > '${file}.comments'";
+ return create_head_cmd($input_file, $comments_file, 700);
}
} else {
- print "\n>>>>>>>>>>>>>>>>>>>>>\n";
- return "cat '$file' | head -700 > '${file}.comments'";
+ return create_head_cmd($input_file, $comments_file, 700);
}
}
+sub create_head_cmd {
+ my ($input_file, $output_file, $count_lines) = @_;
+ return "head -$count_lines '$input_file' > '$output_file'";
+}
+
sub execute {
- my ($c) = @_;
- my $r = `$c`;
+ my ($cmd) = @_;
+ my $result = `$cmd`;
my $status = ($? >> 8);
- die "execution of program [$c] failed: status [$status]" if ($status != 0);
- return $r;
+ die "execution of program [$cmd] failed: status [$status]" if ($status != 0);
+ return $result;
}
sub get_size {
diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl
index a32e283..77c3863 100755
--- a/extComments/hashComments.pl
+++ b/extComments/hashComments.pl
@@ -16,69 +16,75 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
-# this is to extract the first <n> comments from any language that
-# uses the same prefix
+#
+# hashComments.pl
+#
+# This script extracts the first <n> comments from any language that uses the same prefix.
+#
+use strict;
+use warnings;
use Getopt::Std;
-# set parameters
+# parse cmdline parameters
my %opts = ();
-if (!getopts ('vc:p:',\%opts)) {
- print STDERR "Usage $0 -v
+if (!getopts ('vc:p:', \%opts)) {
+ print STDERR "Usage: $0 [OPTIONS] <filename>
+Options:
-v verbose
-p comment char
- -c count of comment blocks
-
-\n";
+ -c count of comment blocks\n";
- die;
+ exit 1;
}
-my $file = $ARGV[0];
-
-open (OUT, ">${file}.comments") or die "Unable to create [${file}.comments]";
-
- <>;
-print OUT unless /^\#\!/;
-
-my $comment_char = '#';
-
-$comment_char = $opts{p} if exists $opts{p};
-
-my $comments_count = 1;
-$comments_count = $opts{c} if exists $opts{c};
my $verbose = exists $opts{v};
+my $comment_char = exists $opts{p} ? $opts{p} : '#';
+my $comments_count = exists $opts{c} ? $opts{c} : 1;
+
my $inside_comment = 0;
my $inside_code = 0;
my $comment_count = 0;
my $code_count = 0;
-while (<>) {
- chomp;
- if (is_comment($_)) {
- s/\t/ /g;
- s/ +/ /g;
+my $input_file = $ARGV[0];
+my $comments_file = "$input_file.comments";
+
+open my $input_fh, '>', $input_file or die "can't open input file [$input_file]";
+open my $comments_fh, '>', $comments_file or die "can't create output file [$comments_file]";
+
+<$input_fh>;
+print $comments_fh unless /^\#\!/;
+
+while (my $line = <$input_fh>) {
+ chomp $line;
+ if (is_comment($line)) {
+ $line =~ s/\t/ /g;
+ $line =~ s/ +/ /g;
$comment_count++ if (not $inside_comment);
$inside_comment = 1;
- /$comment_char+/;
- print OUT $' . "\n"; #'
- } elsif (is_blank($_)) {
- print OUT "\n";
+ $line =~ /$comment_char+/;
+ print $comments_fh substr($line, $+[0]) . "\n";
+ } elsif (is_blank($line)) {
+ print $comments_fh "$line\n";
} else {
- exit 0;
+ last;
}
}
+close $input_fh;
+close $comments_fh;
+
sub is_comment {
- my ($st) = @_;
- return ($st =~ /^\s*$comment_char/);
+ my ($string) = @_;
+ return ($string =~ /^\s*$comment_char/);
}
sub is_blank {
- my ($st) = @_;
- return ($st =~ /^\s*$/);
+ my ($string) = @_;
+ return ($string =~ /^\s*$/);
}
diff --git a/filter/filter.pl b/filter/filter.pl
index 7b0fafd..26edb0d 100755
--- a/filter/filter.pl
+++ b/filter/filter.pl
@@ -18,74 +18,85 @@
#
# filter.pl
-# This script classify input sentences into two categories,
-# good sentences and bad sentences.
-# This script regard a sentence include a critical word (ex. legal term) as good
#
-# usage: filter.pl (inputfilename)
-#
-# Author: Yuki Manabe
+# This script classifies input sentences into two categories, good sentences and bad sentences.
+# A sentence including a critical word (ex. legal term) is regarded as good.
#
+
use strict;
+use warnings;
+use Getopt::Std;
-#print $ARGV[0];
+my $INPUT_FILE_EXTENSION = 'sentences';
-# where are we running the program from
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
- $path = './';
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+ print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+ exit 1;
}
-my $file_critical_words = $path . 'criticalword.dict';
-die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/;
+my $path = get_my_path($0);
-my $file_good = $ARGV[0];
+my $input_file = $ARGV[0];
+my $file_critical_words = "$path/criticalword.dict";
-die "Filename should end in '.sentences' [$file_good]" unless $file_good =~ s/\.sentences$/\.goodsent/;
-my $file_bad = $ARGV[0];
-$file_bad =~ s/\.sentences$/\.badsent/;
+my $file_good = $input_file; $file_good =~ s/\.$INPUT_FILE_EXTENSION$/\.goodsent/;
+my $file_bad = $input_file; $file_bad =~ s/\.$INPUT_FILE_EXTENSION$/\.badsent/;
-#print $file_good;
-#print $file_bad;
+open my $input_fh, '<', $input_file or die "can't open input file [$input_file]: $!";
-open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found.");
-open (DICTIONARY, "<$file_critical_words") or die ('Error: criticalword.dict is not found.');
+open my $good_fh, '>', $file_good or die "can't create good sentences file [$file_good]: $!";
+open my $bad_fh, '>', $file_bad or die "can't create bad sentences file [$file_bad]: $!";
-open (GOODOUT, ">$file_good") || die ('Error');
-open (BADOUT, ">$file_bad") || die ('Error');
-
-my @critical_words = ();
-# read dictionary into list
-my $critical_word;
-while ($critical_word = <DICTIONARY>) {
- chomp $critical_word;
- next if $critical_word =~ /^\#/;
- $critical_word =~ s/\#.*$//; # remove everything to the end of file
- push(@critical_words, "$critical_word");
-}
-close(DICTIONARY);
+my @critical_words = read_critical_words($file_critical_words);
-#matching cliticalwords in list against sentences.
-my $sentence;
-while ($sentence = <INPUTFILE>) {
- my $check = 0;
+# matching critical words in list against sentences
+while (my $sentence = <$input_fh>) {
chomp $sentence;
- foreach $critical_word (@critical_words) {
+ next unless $sentence;
+ my $check = 0;
+ foreach my $critical_word (@critical_words) {
if ($sentence =~ /\b$critical_word\b/i) {
$check = 1;
- #print "$critical_word:$sentence";
last;
}
}
- if ($check == 1) {
- print GOODOUT "$sentence\n";
+ if ($check) {
+ print $good_fh "$sentence\n";
} else {
- print BADOUT "$sentence\n";
+ print $bad_fh "$sentence\n";
+ }
+}
+
+close $input_fh;
+close $good_fh;
+close $bad_fh;
+
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
}
+ return $path;
}
-close(INPUTFILE);
-close(GOODOUT);
-close(BADOUT);
+sub read_critical_words {
+ my ($file) = @_;
+ my @critical_words = ();
+
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+ while (my $line = <$fh>) {
+ chomp $line;
+ next if $line =~ /^\#/;
+ $line =~ s/\#.*$//; # remove everything to the end of line
+ push @critical_words, $line;
+ }
+
+ close $fh;
+
+ return @critical_words;
+}
diff --git a/matcher/matcher.pl b/matcher/matcher.pl
index 9d1e74b..8eae92f 100755
--- a/matcher/matcher.pl
+++ b/matcher/matcher.pl
@@ -19,21 +19,19 @@
#
# matchter.pl
#
-# This script use a set of license sentence name as input
-# and output license name corresponds to a rule which match the set.
-#
-# author: Yuki Manabe
-#
-# usage: matchter.pl (inputfilename)
+# This script uses a set of license sentence names as input and
+# outputs license names corresponds to a rule which match the set.
#
use strict;
+#use warnings;
+use Getopt::Std;
+
my $debug = 0;
my %NON_CRITICAL_RULES = ();
# these should go into a file, but for the time being, let us keep them here
-
# once we have matched a rule, these are not that important
my @GENERAL_NON_CRITICAL = ('AllRights');
@@ -57,10 +55,10 @@ $NON_CRITICAL_RULES{'LibraryGPLv3+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LibraryGPLv3'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LibraryGPLv2+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LibraryGPLv2'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3','LesserGPLcopyVer3','SeeFileVer3'];
+$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3', 'LesserGPLcopyVer3', 'SeeFileVer3'];
$NON_CRITICAL_RULES{'LesserGPLv2.1+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LesserGPLv2.1'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'LGPLv2orv3'}= [@GPL_NON_CRITICAL];
+$NON_CRITICAL_RULES{'LGPLv2orv3'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LesserGPLv2'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LesserGPLv2+'} = [@GPL_NON_CRITICAL];
@@ -70,85 +68,68 @@ $NON_CRITICAL_RULES{'GPLv1+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'GPLv1'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'GPLv3+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'GPLv3'} = [@GPL_NON_CRITICAL];
-$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0'];
-$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0'];
+$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0'];
+$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0'];
$NON_CRITICAL_RULES{'GPLnoVersion'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'Apachev1.1'} = ['ApacheLic1_1'];
-$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre','ApacheSee'];
+$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre', 'ApacheSee'];
-$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee'];
-$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance'];
-$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance'];
+$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee'];
+$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance'];
+$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance'];
-$NON_CRITICAL_RULES{'CDDLorGPLv2'}= ['CDDLorGPLv2doNotAlter','AllRights','useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include','CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2'];
+$NON_CRITICAL_RULES{'CDDLorGPLv2'} = ['CDDLorGPLv2doNotAlter', 'AllRights', 'useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include', 'CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2'];
$NON_CRITICAL_RULES{'CPLv1orGPLv2+orLGPLv2+'} = ['licenseBlockBegin', 'licenseBlockEnd'];
-$NON_CRITICAL_RULES{'Qt'} = ['Copyright','qtNokiaExtra','QTNokiaContact', 'qtDiaTems'];
-$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1','qtLGPLv2.1where'];
-$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0','qtReviewGPLVer3','qtGPLwhere'];
+$NON_CRITICAL_RULES{'Qt'} = ['Copyright', 'qtNokiaExtra', 'QTNokiaContact', 'qtDiaTems'];
+$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1', 'qtLGPLv2.1where'];
+$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0', 'qtReviewGPLVer3', 'qtGPLwhere'];
$NON_CRITICAL_RULES{'digiaQTExceptionNoticeVer1.1'} = ['qtDigiaExtra'];
-$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
-$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee'];
+$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
+$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee'];
-$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN','subversionHistory'];
-$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN','subversionHistory'];
+$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN', 'subversionHistory'];
+$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN', 'subversionHistory'];
$NON_CRITICAL_RULES{'tmate+'} = ['SeeFileSVN'];
$NON_CRITICAL_RULES{'openSSLvar2'} = ['BSDcondAdvPart2'];
-$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin','MPLsee','Copyright','licenseBlockEnd','ApacheLicWherePart1','MPLwarranty', 'MPLwarrantyVar'];
-$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0','MPL_LGPLseeVer0'];
+$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin', 'MPLsee', 'Copyright', 'licenseBlockEnd', 'ApacheLicWherePart1', 'MPLwarranty', 'MPLwarrantyVar'];
+$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0', 'MPL_LGPLseeVer0'];
$NON_CRITICAL_RULES{'FreeType'} = ['FreeTypeNotice'];
$NON_CRITICAL_RULES{'GPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL];
$NON_CRITICAL_RULES{'LGPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL];
-# initialize
+my $INPUT_FILE_EXTENSION = 'senttok';
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
- $path = './';
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+ print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+ exit 1;
}
-my $rules_file = $path . 'rules.dict';
-my $interrules_file = $path . 'interrules.dict';
+my $path = get_my_path($0);
-die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/;
+my $input_file = $ARGV[0];
+my $rules_file = "$path/rules.dict";
+my $interrules_file = "$path/interrules.dict";
-my $count_unknowns = 0;
+my @license_sentence_names = ();
+my @originals = ();
+read_original($input_file, \@license_sentence_names, \@originals);
-# read the rules
my @rules = read_rules($rules_file);
my @inter_rules = read_inter_rules($interrules_file);
-my @license_sentence_names = ();
-my @original;
-
-read_original($ARGV[0], \@license_sentence_names, \@original);
-
-#foreach my $x (@license_sentence_names) {
-# print "$x\n";
-#}
-#exit;
-
-#foreach my $x (@original) {
-# print "$x\n";
-#}
-#exit;
-
##########################################
-#for my $ref( @inter_rules ){
-# print "@$ref\n";
-#}
-
# matching spdx requires to match strict licenses, with no alternatives...
my $senttok = ',' . join(',', @license_sentence_names) . ',';
@@ -159,9 +140,9 @@ print "[$senttok]\n" if $debug;
match_license();
# do we have to check again?
-## todo, verifythat we have unmatched sentences...
+## todo, verify that we have unmatched sentences...
-@license_sentence_names = split(',', $senttok);
+@license_sentence_names = split ',', $senttok;
# first remove the extrict part from it
@@ -172,15 +153,12 @@ for (my $i = 0; $i <= $#license_sentence_names; $i++) {
if ($license_sentence_names[$i] == 0 and
($license_sentence_names[$i] ne 'UNKNOWN' and
$license_sentence_names[$i] ne '')) {
-# print "[$license_sentence_names[$i]]\n";
$license_sentence_names[$i] =~ s/Extrict$//;
$match++;
}
}
-#print_result();
-
-if ($match > 0) {
+if ($match) {
# print "REDO\n";
for (my $i = 0; $i <= $#inter_rules; $i++) {
#for my $ref( @inter_rules[$i]){
@@ -199,99 +177,104 @@ print_result();
exit 0;
-#print @license_sentence_names;
-#print join(';',@license_sentence_names)."\n";
-
-# 3. matching
-###############################
-
-# we will iterate over rules, matching as many as we can...
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
+ }
+ return $path;
+}
sub is_unknown {
my ($s) = @_;
- my @f = split (/,/, $s);
+ my @f = split /,/, $s;
return $f[0] eq 'UNKNOWN';
}
sub read_rules {
my ($file) = @_;
- open (RULES, "<$file") or die ('Error: rules.dict is not found.');
- my $sentence;
my @rules = ();
- while ($sentence = <RULES>) {
- chomp $sentence;
+
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+ while (my $line = <$fh>) {
+ chomp $line;
# clean up spaces
- $sentence =~ s/^\s+//;
- $sentence =~ s/\s+$//;
- $sentence =~ s/\s*,\s*/,/g;
- $sentence =~ s/\s*:\s*/:/g;
- #check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) {
+ $line =~ s/^\s+//;
+ $line =~ s/\s+$//;
+ $line =~ s/\s*,\s*/,/g;
+ $line =~ s/\s*:\s*/:/g;
+ # check format
+ if ($line =~ /^#/ || $line !~ /(.*):(.*,)*(.*)/) {
next;
}
- $sentence =~ /(.*?):(.*)/;
- push (@rules, [$1, $2]);
+ $line =~ /(.*?):(.*)/;
+ push @rules, [$1, $2];
}
- close RULES;
+
+ close $fh;
+
return @rules;
}
sub read_inter_rules {
my ($file) = @_;
+ my @inter_rules = ();
- my @inter_rules;
- open (IRULES, "<$file") or die ('Error: interrules.dict is not found.');
- my $sentence;
- while ($sentence = <IRULES>) {
- chomp $sentence;
- #check format
- if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) {
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+ while (my $line = <$fh>) {
+ chomp $line;
+ # check format
+ if ($line =~ /^#/ || $line !~ /(.*?):(.*)/) {
next;
}
- foreach my $item (split(/\|/, $2)) {
- push (@inter_rules, [$item, $1]);
+ foreach my $item (split /\|/, $2) {
+ push @inter_rules, [$item, $1];
}
}
- close IRULES;
+
+ close $fh;
+
return @inter_rules;
}
sub read_original {
my ($file, $tokens, $originals) = @_;
- open (INPUTFILE, $file) or die ("Error: $file is not found.");
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
- my $sentence;
- my @original;
- while ($sentence = <INPUTFILE>) {
+ while (my $sentence = <$fh>) {
chomp $sentence;
- my @fields = split(':', $sentence);
- push(@$originals, $fields[1]);
- my @token = split(';', $fields[0]);
- push(@$tokens, $token[0]);
+ my @fields = split ':', $sentence;
+ push @$originals, $fields[1];
+ my @token = split ';', $fields[0];
+ push @$tokens, $token[0];
}
+
+ close $fh;
+
if (scalar(@$originals) == 0) {
print "NONE\n";
exit 0;
}
-
-#print join(';',@license_sentence_names)."\n";
-
- close INPUTFILE;
}
+# we will iterate over rules, matching as many as we can...
sub match_license {
# create a string with the sentences
for (my $j = 0; $j <= $#rules; $j++) {
my $rule = $rules[$j][1];
my $rulename = $rules[$j][0];
- my $rule_length = scalar(split(',', $rule));
+ my $rule_length = scalar(split ',', $rule);
# replace rule with the length of the rule
print "To try [$rulename][$rule] on [$senttok]\n" if $debug;
- while ($senttok =~ s/,${rule},/,$rule_length,/) {
+ while ($senttok =~ s/,$rule,/,$rule_length,/) {
$count_matches++;
- push (@result, $rulename);
+ push @result, $rulename;
# print ">>>>$senttok|$rules[$j][1]\n";
# print 'Result: ', join(',', @result);
# print "\n";
@@ -351,14 +334,12 @@ sub match_license {
}
sub print_result {
-# $senttok =~ s/AllRights(,?)/$1/g;
-# $senttok =~ s/UNKNOWN,/,/g;
-# $senttok =~ s/,+/,/g;
+# $senttok =~ s/AllRights(,?)/$1/g;
+# $senttok =~ s/UNKNOWN,/,/g;
+# $senttok =~ s/,+/,/g;
my $save = $senttok;
- # ok, so now, what I want to output it:
- # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown
- my @sections = split(',', $senttok);
+ my @sections = split ',', $senttok;
die 'assertion 1' if $sections[0] ne '';
die 'assertion 2' if $sections[scalar(@sections)] ne '';
@@ -367,7 +348,6 @@ sub print_result {
my $unknown_lines = 0;
my $unmatched_lines = 0;
foreach my $i (1..scalar(@sections)-1) {
-# print "$i;$sections[$i]\n";
if ($sections[$i] < 0) {
$ignored_lines += - $sections[$i];
} elsif ($sections[$i] != 0) {
@@ -380,12 +360,13 @@ sub print_result {
}
$senttok =~ s/^,(.*),$/$1/;
-# print "$ignored_lines > $license_lines > $unknown_lines > $unmatched_lines\n";
if (scalar (@result) == 0) {
print 'UNKNOWN';
} else {
- print join(',',@result);
+ print join ',', @result;
}
+ # ok, so now, what I want to output is:
+ # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown
print ";$count_matches;$license_lines;$ignored_lines;$unmatched_lines;$unknown_lines;$senttok\n";
$senttok = $save;
}
diff --git a/ninka.pl b/ninka.pl
index 5f53fc6..99619e4 100755
--- a/ninka.pl
+++ b/ninka.pl
@@ -17,15 +17,19 @@
#
use strict;
+use warnings;
use Getopt::Std;
+# parse cmdline parameters
my %opts = ();
-if (!getopts ("vfCcSsGgTtLd",\%opts) or scalar(@ARGV) == 0) {
-print STDERR "Ninka version 1.1
+if (!getopts('vfCcSsGgTtLd', \%opts) or scalar(@ARGV) == 0) {
+ print STDERR "Ninka version 1.1
-Usage $0 -fCtTvcgsGd <filename>
+Usage: $0 [OPTIONS] <filename>
+Options:
-v verbose
+
-f force all processing
-C force creation of comments
@@ -42,95 +46,95 @@ Usage $0 -fCtTvcgsGd <filename>
-L force creation of matching
- -d delete intermediate files
-
-\n";
+ -d delete intermediate files\n";
exit 1;
}
my $verbose = exists $opts{v};
-my $delete = exists $opts{d};
-#$delete = 1;
+my $delete = exists $opts{d};
-my $path = $0;
-
-$path =~ s/\/+[^\/]+$//;
-if ($path eq "") {
- $path = "./";
-}
-
-my $force = exists $opts{f};
-my $force_good = exists $opts{G};
+my $force = exists $opts{f};
+my $force_comments = exists $opts{C};
my $force_sentences = exists $opts{S};
-my $force_senttok = exists $opts{T};
-my $force_comments = exists $opts{C};
-my $force_license = exists $opts{L};
+my $force_good = exists $opts{G};
+my $force_senttok = exists $opts{T};
+my $force_license = exists $opts{L};
-#die "Usage $0 <filename>" unless $ARGV[0] =~ /\.(c|cpp|java|cc|cxx|h|jl|py|pm|el|pl)$/;
+my $path = get_my_path($0);
my $input_file = $ARGV[0];
-print "Starting: $input_file;\n" if ($verbose);
+my $comments_file = "$input_file.comments";
+my $sentences_file = "$input_file.sentences";
+my $goodsent_file = "$input_file.goodsent";
+my $badsent_file = "$input_file.badsent";
+my $senttok_file = "$input_file.senttok";
+my $license_file = "$input_file.license";
+print "Starting: $input_file;\n" if $verbose;
print "$input_file;";
-my $comments_file = "${input_file}.comments";
-my $sentences_file = "${input_file}.sentences";
-my $goodsent_file = "${input_file}.goodsent";
-my $senttok_file = "${input_file}.senttok";
-
-if (not (-f "$input_file")) {
- print "ERROR;[${input_file}] is not a file\n" ;
- exit 0;
+if (not (-f $input_file)) {
+ print "ERROR;[$input_file] is not a file\n";
+ exit 1;
}
-do_file_process($input_file, $comments_file, ($force or $force_comments),
- "$path/extComments/extComments.pl -c1 '${input_file}'",
- "Creating comments file",
- exists $opts{c});
+process_file($input_file, $comments_file, ($force or $force_comments),
+ "$path/extComments/extComments.pl '$input_file'",
+ 'Creating comments file', exists $opts{c});
-do_file_process($comments_file, $sentences_file, ($force or $force_sentences),
- "$path/splitter/splitter.pl '${comments_file}'",
- "Splitting sentences", exists $opts{s});
+process_file($comments_file, $sentences_file, ($force or $force_sentences),
+ "$path/splitter/splitter.pl '$comments_file'",
+ 'Splitting sentences', exists $opts{s});
-do_file_process($sentences_file, $goodsent_file, ($force or $force_good),
- "$path/filter/filter.pl '${sentences_file}'",
- "Filtering good sentences", exists $opts{s});
+process_file($sentences_file, $goodsent_file, ($force or $force_good),
+ "$path/filter/filter.pl '$sentences_file'",
+ 'Filtering good sentences', exists $opts{s});
-do_file_process($goodsent_file, $senttok_file, ($force or $force_senttok),
- "$path/senttok/senttok.pl '${goodsent_file}' > '${senttok_file}'",
- "Matching sentences against rules", exists $opts{t});
+process_file($goodsent_file, $senttok_file, ($force or $force_senttok),
+ "$path/senttok/senttok.pl '$goodsent_file' > '$senttok_file'",
+ 'Matching sentences against rules', exists $opts{t});
-print "Matching ${input_file}.senttok against rules" if ($verbose);
-execute("$path/matcher/matcher.pl '${input_file}.senttok' > '${input_file}.license'");
+process_file($senttok_file, $license_file, ($force or $force_license),
+ "$path/matcher/matcher.pl '$senttok_file' > '$license_file'",
+ 'Matching sentence tokens against rules', 0);
-print `cat '${input_file}.license'`;
-
-unlink("${input_file}.code");
+print `cat '$license_file'`;
if ($delete) {
- unlink("${input_file}.badsent");
- unlink("${input_file}.comments");
- unlink("${input_file}.goodsent");
-# unlink("${input_file}.sentences");
- unlink("${input_file}.senttok");
+ unlink $comments_file;
+ unlink $sentences_file;
+ unlink $goodsent_file;
+ unlink $badsent_file;
+ unlink $senttok_file;
}
exit 0;
-sub do_file_process {
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
+ }
+ return $path;
+}
+
+sub process_file {
my ($input, $output, $force, $cmd, $message, $end) = @_;
- print "${message}:" if ($verbose);
+ print "$message:" if $verbose;
if ($force or is_newer($input, $output)) {
- print "Running ${cmd}:" if ($verbose);
+ print "Running $cmd:" if $verbose;
execute($cmd);
} else {
- print "File [$output] newer than input [$input], not creating:" if ($verbose);
+ print "File [$output] newer than input [$input], not creating:" if $verbose;
}
+
if ($end) {
- print "Existing after $message" if $verbose;
+ print "Exiting after $message" if $verbose;
print "\n";
exit 0;
}
@@ -138,7 +142,6 @@ sub do_file_process {
sub execute {
my ($command) = @_;
-# print "\nTo execute [$command]\n";
my $result = `$command`;
my $status = ($? >> 8);
die "execution of program [$command] failed: status [$status]" if ($status != 0);
@@ -147,8 +150,9 @@ sub execute {
sub is_newer {
my ($f1, $f2) = @_;
- my ($f1write) = (stat($f1))[9];
- my ($f2write) = (stat($f2))[9];
+ my $f1write = (stat $f1)[9];
+ my $f2write = (stat $f2)[9];
+
if (defined $f1write and defined $f2write) {
return $f1write > $f2write;
} else {
diff --git a/senttok/senttok.pl b/senttok/senttok.pl
index 5bef3c5..1fa104d 100755
--- a/senttok/senttok.pl
+++ b/senttok/senttok.pl
@@ -16,35 +16,36 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
+#
+# senttok.pl
+#
+# This script creates a file that corresponds to the recognized sentence tokens.
+# For each sentence, it outputs its sentence token, or unknown otherwise.
+#
+
use strict;
+#use warnings;
+use Getopt::Std;
-my $TOO_LONG = 70;
+my $INPUT_FILE_EXTENSION = 'goodsent';
-# where are we running the splitter from?
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq "") {
- $path = "./";
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+ print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+ exit 1;
}
-my $path_license_sentences = $path . "licensesentence.dict";
-open FH, "<$ARGV[0]";
-my @license_sentences = ();
-open LICENSESENTENCEFILE, "<$path_license_sentences";
-my $line;
-while ($line = <LICENSESENTENCEFILE>) {
- chomp $line;
- next if $line =~ /^\#/;
- next if $line =~ /^ *$/;
- die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/;
- push @license_sentences, $line;
-}
+my $TOO_LONG = 70;
+
+my $path = get_my_path($0);
+
+my $input_file = $ARGV[0];
+my $license_sentences_file = "$path/licensesentence.dict";
-#foreach $line (@license_sentences) {
-# print $line;
-#}
-close LICENSESENTENCEFILE;
-while ($line = <>) {
+my @license_sentences = read_license_sentences($license_sentences_file);
+
+open my $input_fh, '<', $input_file or die "can't open file [$input_file]: $!";
+while (my $line = <$input_fh>) {
my $save_line;
my $original_line;
chomp $line;
@@ -57,11 +58,10 @@ while ($line = <>) {
$line = normalize_sentence($line);
my $check = 0;
- my $match_name = "UNKNOWN";
- my @parm = ();
- my $sentence;
+ my $match_name = 'UNKNOWN';
+ my @parameters = ();
my $distance = 1; #maximum? number
- my $most_similar_name = "UNKNOWN";
+ my $most_similar_name = 'UNKNOWN';
my $before;
my $after;
my $gpl = 0;
@@ -69,10 +69,6 @@ while ($line = <>) {
$save_line = $line;
-# print "Original
-# [$line]
-#\n";
-
my $line_as_gpl = '';
if (looks_like_gpl($line)) {
@@ -82,35 +78,28 @@ while ($line = <>) {
$line_as_gpl = $line;
}
my ($name, $sub_rule, $number, $regexp, $option);
- my $save_line = $line;
+ $save_line = $line;
my $save_gpl = $gpl;
- my $LGPL = "";
- foreach $sentence (@license_sentences) {
- ($name, $sub_rule, $number, $regexp, $option) = split(/:/, $sentence);
+ my $LGPL = '';
+ foreach my $sentence (@license_sentences) {
+ ($name, $sub_rule, $number, $regexp, $option) = split /:/, $sentence;
# we need this due to the goto again
$line = $save_line;
$gpl = $save_gpl;
- $LGPL = "";
+ $LGPL = '';
again:
-# print "Testing
-# lin[$line]
-# ori[$save_line]
-# re [$regexp]
-# lpg[$LGPL]
-#\n";
if ($line =~ /$regexp/im) {
$before = $`;
- $after = $'; #';
+ $after = $';
$check = 1;
$match_name = $name;
for (my $i = 1; $i <= $number; $i++) {
no strict 'refs';
- push @parm, $$i;
+ push @parameters, $$i;
}
last;
} else {
-# print "NO MATCH\n";
- # let us try again in cas it is lesser/library
+ # let us try again in case it is lesser/library
# do it only once
if ($gpl and $line =~ s/(Lesser|Library) GPL/GPL/i) {
$LGPL = $1;
@@ -121,7 +110,7 @@ while ($line = <>) {
$line = $save_line;
goto again;
}
- next;## dmg
+ next;
my $targetset = $regexp;
$targetset =~ s/^(.*)$/$1/;
my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence));
@@ -130,30 +119,42 @@ while ($line = <>) {
$distance = $tmpdist;
}
}
- last; ###
+ last;
}
- if ($check == 1) {
- # licensesentence name, parm1, parm2,..
+ if ($check) {
+ # licensesentence name, param1, param2, ...
if ($gpl) {
- $match_name .= "Ver" . $gpl_version;
- $match_name .= "+" if $gpl_later;
+ $match_name .= 'Ver' . $gpl_version;
+ $match_name .= '+' if $gpl_later;
$match_name = $LGPL . $match_name;
}
if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) {
- $match_name .= "-TOOLONG";
+ $match_name .= '-TOOLONG';
}
- my $parmstrings = join(";", $match_name, $sub_rule, $before, $after, @parm);
- print $parmstrings, ":$original_line\n";
+ # TODO: Use of uninitialized value in @parameters
+ my $parameter_string = join ';', $match_name, $sub_rule, $before, $after, @parameters;
+ print $parameter_string, ":$original_line\n";
} else {
# UNKNOWN, sentence
chomp $line;
- print $match_name, ";", 0, ";", $most_similar_name, ";", $distance, ";", $save_line, ":$original_line\n";
+ my $parameter_string = join ';', $match_name, 0, $most_similar_name, $distance, $save_line;
+ print $parameter_string, ":$original_line\n";
}
}
+close $input_fh;
-close FH;
exit 0;
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
+ }
+ return $path;
+}
+
sub normalize_gpl {
my ($line) = @_;
my $later = 0;
@@ -177,7 +178,6 @@ sub normalize_gpl {
}
if ($line =~ s/(version|v\.?) ([123\.0]+)/<VERSION>/i) {
$version = $2;
-# print "Version [$version]\n";
}
if ($line =~ s/GPL ?[v\-]([123\.0]+)/GPL <VERSION>/i) {
$version = $1;
@@ -217,9 +217,7 @@ sub normalize_gpl {
$line =~ s/ +/ /;
$line =~ s/ +$//;
-# print ">>>>>>>>>>$line:$later:$version\n";
-
- return ($line,$later,$version);
+ return ($line, $later, $version);
}
sub looks_like_gpl {
@@ -296,8 +294,8 @@ sub levenshtein {
# Some char-by-char processing is ahead, so prepare
# array of chars from the strings
#
- my @ar1 = split(//, $s1);
- my @ar2 = split(//, $s2);
+ my @ar1 = split //, $s1;
+ my @ar2 = split //, $s2;
for (my $i = 1; $i <= $len1; ++$i) {
for (my $j = 1; $j <= $len2; ++$j) {
@@ -348,3 +346,22 @@ sub max {
return $list[0] > $list[1] ? $list[0] : $list[1];
}
+sub read_license_sentences {
+ my ($file) = @_;
+ my @license_sentences = ();
+
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
+
+ while (my $line = <$fh>) {
+ chomp $line;
+ next if $line =~ /^\#/;
+ next if $line =~ /^ *$/;
+ die "illegal format in license expression [$line]" unless $line =~ /(.*?):(.*?):(.*)/;
+ push @license_sentences, $line;
+ }
+
+ close $fh;
+
+ return @license_sentences;
+}
+
diff --git a/splitter/splitter.pl b/splitter/splitter.pl
index c806f96..dfffa83 100755
--- a/splitter/splitter.pl
+++ b/splitter/splitter.pl
@@ -5,11 +5,14 @@
# Author: Paul Clough {cloughie@dcs.shef.ac.uk}
#
-# This program is originally based on the sentence splitter program
-# published by Paul Clough. Version 1.0, available form
-# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip)
-# The original program without a license.
-
+# This program is originally based on the sentence splitter program
+# published by Paul Clough. Version 1.0, available from
+# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip)
+# The original program is without a license.
+#
+# It was mostly rewritten.
+# His ideas, however, linger in here (and his file of abbreviations)
+#
# Modifications to the original by Daniel M German and Y. Manabe,
# which are under the following license:
#
@@ -25,52 +28,37 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with this patch. If not, see <http://www.gnu.org/licenses/>.
+#
-use strict;
-
-# This program is originally based on the sentence splitter program
-# published by Paul Clough. Version 1.0, but then it was mostly rewritten
-# His ideas, however, linger in here (and his dictionary_file of abbreviations)
-
-my $dictionary_file = 'splitter.dict';
-my $abbreviations_file = 'splitter.abv';
-my $length = 0;
-my %COMMON_TERMS = ();
-my %ABBREVIATIONS = ();
-my $output_file = $ARGV[0];
-
-# where are we running the splitter from?
-my $path = $0;
-$path =~ s/[^\/]+$//;
-if ($path eq '') {
- $path = './';
-}
-$dictionary_file = $path . $dictionary_file;
-$abbreviations_file = $path . $abbreviations_file;
+#
+# splitter.pl
+#
+# This script breaks comments into sentences.
+#
-die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/;
+use strict;
+use warnings;
+use Getopt::Std;
-die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/;
+my $INPUT_FILE_EXTENSION = 'comments';
-open(OUT, ">$output_file") or die("Unable to create output file [$output_file]");
+# parse cmdline parameters
+if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) {
+ print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n";
+ exit 1;
+}
-#print length($opt_o);
+my $path = get_my_path($0);
-# Load in the dictionary and find the common words.
-# Here, we assume the words in upper case are simply names and one
-# word per line - i.e. in same form as /usr/dict/words
-&load_dictionary;
+my $input_file = $ARGV[0];
+my $abbreviations_file = "$path/splitter.abv";
-# Same assumptions as for dictionary
-&load_abbreviations;
+my $output_file = $input_file; $output_file =~ s/\.$INPUT_FILE_EXTENSION$/\.sentences/;
-my $text;
-# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n";
+my $text = read_file_as_string($input_file);
+my %abbreviations = load_abbreviations($abbreviations_file);
-my $line;
-while (defined ($line = <>)) {
- $text .= $line;
-}
+open my $output_fh, '>', $output_file or die "can't create output file [$output_file]: $!";
# append a newline just in case
$text .= "\n";
@@ -82,7 +70,7 @@ $text =~ s@={3,1000}@ @gmx;
$text =~ s@:{3,1000}@ @gmx;
$text =~ s@\*{3,1000}@ @gmx;
-# some characters are used for prettyprinting but never appear in sentences
+# some characters are used for pretty-printing but never appear in sentences
$text =~ s@\|+@ @gmx;
$text =~ s@\\+@ @gmx;
@@ -91,7 +79,7 @@ $text =~ s@^[ \t]*/\*@@gmx;
$text =~ s/\*\/[ \t]*$//gmx;
$text =~ s@([^:])// @$1@gmx;
-# Replace /\r\n/ with \n only
+# replace /\r\n/ with \n only
$text =~ s/\r\n/\n/g;
# now, try to replace the leading/ending character of each line #/-, at most 3 heading characters
@@ -109,9 +97,9 @@ $text =~ s/[\*\#]+//gmx;
$text =~ s/^[ \t]+$/\n/gm;
# let us try the following trick
-# We first get rid of \t and replace it with ' '
-# we then use \t as a "single line separator" and \n as multiple line.
-# so we can match each with a single character.
+# we first get rid of \t and replace it with ' '
+# we then use \t as a "single line separator" and \n as multiple line
+# so we can match each with a single character
$text =~ tr/\t/ /;
$text =~ s/\n(?!\n)/\t/g;
@@ -123,61 +111,66 @@ $text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g;
while ($text =~ /^([^\n]*)\n/gsm) {
my $curr = $1;
-# print "<<$curr\n<<\n";
# let us count the number of alphabetic chars to check if we are skipping anything we should not
- my $count = 0;
+ my $count1 = 0;
for my $i (0..length($curr)-1) {
- my $c = substr($curr,$i,1);
- $count++ if ($c ge 'A' && $c le 'z');
+ my $c = substr($curr, $i, 1);
+ $count1++ if ($c ge 'A' && $c le 'z');
}
my @sentences = split_text($curr);
my $count2 = 0;
-
foreach my $sentence (@sentences) {
for my $i (0..length($sentence)-1) {
my $c = substr($sentence, $i, 1);
$count2++ if ($c ge 'A' && $c le 'z');
}
- print OUT clean_sentence($sentence) , "\n";
+ my $clean_sentence = clean_sentence($sentence);
+ next unless $clean_sentence;
+ print $output_fh $clean_sentence, "\n";
}
- if ($count != $count2) {
+
+ if ($count1 != $count2) {
print STDERR "-------------------------------------\n";
print STDERR "[$curr]\n";
foreach my $sentence (@sentences) {
- print STDERR clean_sentence($sentence) , "\n";
+ print STDERR clean_sentence($sentence), "\n";
}
- die "Number of printable chars does not match! [$count][$count2]";
+ die "Number of printable chars does not match! [$count1][$count2]";
}
}
-close OUT;
-#print "$text\n";
+close $output_fh;
-exit;
+exit 0;
-#***************************************************************************************************
-# procedures
-#***************************************************************************************************
+sub get_my_path {
+ my ($self) = @_;
+ my $path = $self;
+ $path =~ s/\/+[^\/]+$//;
+ if ($path eq '') {
+ $path = './';
+ }
+ return $path;
+}
sub clean_sentence {
($_) = @_;
- # check for trailing bullets of different types
+ # check for trailing bullets of different types
s/^o //;
s/^\s*[0-9]{1-2}+\s*[\-\)]//;
s/^[ \t]+//;
s/[ \t]+$//;
+
# remove a trailing -
s/^[ \t]*[\-\.\s*] +//;
- # replace quotes
s/\s+/ /g;
s/['"`]+/<quotes>/g;
-
s/:/<colon>/g;
s/\.+$/./;
@@ -198,15 +191,15 @@ sub split_text {
my $after;
my $current_sentence = '';
# this breaks the sentence into
- # 1. Any text before a separator
- # 2. The separator [.!?:\n]
+ # 1. any text before a separator
+ # 2. the separator [.!?:\n]
# 3.
- while ($text =~ /^
+ while ($text =~ /^
([^\.\!\?\:\n]*) #
([\.\!\?\:\n])
(?=(.?))
/xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) {
- $text = $'; #';
+ $text = $';
my $sentence_match = $1;
my $sentence = $1 . $2;
my $punctuation = $2;
@@ -217,7 +210,7 @@ sub split_text {
$current_sentence .= $sentence;
next;
}
- #at this point we know that there is a space after
+ # at this point we know that there is a space after
if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') {
# let us consider this right here a beginning of a sentence
push @result, $current_sentence . $sentence;
@@ -231,7 +224,7 @@ sub split_text {
# TODO
# simple heuristic... let us check that the next words are not the beginning of a sentence
# in our library
- # ENDTODO
+ # END TODO
# is the last word an abbreviation? For this the period has to follow the word
# this expression might have to be updated to take care of special characters in names :(
@@ -260,7 +253,7 @@ sub split_text {
# only accept abbreviations if the previous char to the abbrev is space or
# is empty (beginning of line). This avoids things like .c
- if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$last_word}) {
+ if (length($before) > 0 and $before eq ' ' and $abbreviations{$last_word}) {
$current_sentence .= $sentence;
next;
} else {
@@ -277,39 +270,32 @@ sub split_text {
}
push @result, $current_sentence . $text;
- #print_non_sentence($text,"\n",'');
return @result;
}
-sub load_dictionary {
- my $common_term = '';
+sub read_file_as_string {
+ my $file = shift;
- if (open(DICT, $dictionary_file)) {
- while (defined ($line = <DICT>)) {
- chomp($line);
- if ($line !~ /^[A-Z]/) {
- $COMMON_TERMS{$line} = 1;
- }
- }
+ open my $fh, '<', $file or die "can't open file '$file': $!";
+ my $content = do { local $/; <$fh> };
+ close $fh or die "can't close file '$file': $!";
- close(DICT);
- } else {
- die "cannot open dictionary file $dictionary_file: $!";
- }
+ return $content;
}
sub load_abbreviations {
- my $abbrv_term = '';
+ my ($file) = @_;
+ my %abbreviations = ();
- if (open(ABBRV, $abbreviations_file)) {
- while (defined ($line = <ABBRV>)) {
- chomp($line);
- $ABBREVIATIONS{$line} = $line;
- }
+ open my $fh, '<', $file or die "can't open file [$file]: $!";
- close(ABBRV);
- } else {
- die "cannot open abbreviations file $abbreviations_file: $!";
+ while (my $line = <$fh>) {
+ chomp $line;
+ $abbreviations{$line} = $line;
}
+
+ close $fh;
+
+ return %abbreviations;
}