diff options
author | René Scheibe <rene.scheibe@gmail.com> | 2014-09-10 15:12:04 +0200 |
---|---|---|
committer | René Scheibe <rene.scheibe@gmail.com> | 2014-09-12 19:55:15 +0200 |
commit | 43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc (patch) | |
tree | bdb3524e3d54e17255032afe2a47ec3e80fc7dba | |
parent | 949dc6984decd81e51e264af34ee7cbc117e5e4a (diff) | |
download | ninka-43cd8a52398a3a6436eb2e24d2d30fb3ee5080fc.tar.gz |
massive cleanup
* consistent script descriptions
* consistent cmdline parameter parsing with Getopt::Std
* remove some commented out & unused code
* fix some issues reported by perlcritic
* better code formatting
* and much more...
-rwxr-xr-x | extComments/extComments.pl | 110 | ||||
-rwxr-xr-x | extComments/hashComments.pl | 78 | ||||
-rwxr-xr-x | filter/filter.pl | 105 | ||||
-rwxr-xr-x | matcher/matcher.pl | 215 | ||||
-rwxr-xr-x | ninka.pl | 126 | ||||
-rwxr-xr-x | senttok/senttok.pl | 143 | ||||
-rwxr-xr-x | splitter/splitter.pl | 176 |
7 files changed, 481 insertions, 472 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl index d7a65d5..1f840a8 100755 --- a/extComments/extComments.pl +++ b/extComments/extComments.pl @@ -16,87 +16,91 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # -use Getopt::Std; -use strict; - -my $path = $0; +# +# extComments.pl +# +# This script extracts comments from source code. +# If no comment extractor is known for a language, then extracts top lines from source. +# -$path =~ s/\/+[^\/]+$//; -if ($path eq '') { - $path = './'; -} +use strict; +use warnings; +use Getopt::Std; -# set parameters +# parse cmdline parameters my %opts = (); -if (!getopts ('vc:p:',\%opts)) { - print STDERR "Usage $0 -v +if (!getopts('v', \%opts) or scalar(@ARGV) == 0) { + print STDERR "Usage: $0 [OPTIONS] <filename> - -v verbose - -c count of comment blocks +Options: + -v verbose\n"; -\n"; - - die; + exit 1; } -my $file = $ARGV[0]; +my $verbose = exists $opts{v}; -#die "illegal file [$file]" if $file =~ m@/\.@; +my $path = get_my_path($0); -my $number_comments = 1; -$number_comments = $opts{c} if exists $opts{c}; -my $verbose = 1; -$verbose = exists $opts{v}; +my $input_file = $ARGV[0]; +my $comments_file = "$input_file.comments"; -if (get_size($file) == 0) { - print STDERR "Empty file, just exit\n" if $verbose; - exit 0; # nothing to report, just end +my $comments_cmd = determine_comments_cmd($input_file, $comments_file); +execute($comments_cmd); +if ($comments_cmd =~ /^comments/ and get_size($comments_file) == 0) { + $comments_cmd = create_head_cmd($input_file, $comments_file, 700); + execute($comments_cmd); } -my $comments_cmd = determine_comments_extractor($file); - -execute("$comments_cmd"); +exit 0; -if ($comments_cmd =~ /^comments/ and get_size("${file}.comments") == 0) { - `cat '$file' | head -700 > ${file}.comments`; +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; + } + return $path; } -exit 0; - -sub determine_comments_extractor { - my ($file) = @_; - if ($file =~ /\.([^\.]+)$/) { - my $ext= $1; +sub determine_comments_cmd { + my ($input_file, $comments_file) = @_; + if ($input_file =~ /\.([^\.]+)$/) { + my $ext = $1; if ($ext =~ /^(pl|pm|py)$/) { - # for the time being, let us just extract the top 400 lines - return "cat '$file' | head -400 > '${file}.comments'"; -# return "$path/hashComments.pl -p '#' '$file'"; - } elsif ($ext eq 'jl' or $ext eq 'el') { - return "cat '$file' | head -400 > '${file}.comments'"; -# return "$path/hashComments.pl -p ';' '$file'";; + return create_head_cmd($input_file, $comments_file, 400); +# return "$path/hashComments.pl -p '#' '$input_file'"; + } elsif ($ext =~ /^(jl|el)$/) { + return create_head_cmd($input_file, $comments_file, 400); +# return "$path/hashComments.pl -p ';' '$input_file'";; } elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) { - my $comments_cmd_location = `which comments`; - if ($comments_cmd_location ne '') { - return "comments -c1 '$file' 2> /dev/null"; + my $comments_binary = 'comments'; + if (`which $comments_binary` ne '') { + return "$comments_binary -c1 '$input_file' 2> /dev/null"; } else { - return "cat '$file' | head -400 > '${file}.comments'"; + return create_head_cmd($input_file, $comments_file, 400); } } else { - return "cat '$file' | head -700 > '${file}.comments'"; + return create_head_cmd($input_file, $comments_file, 700); } } else { - print "\n>>>>>>>>>>>>>>>>>>>>>\n"; - return "cat '$file' | head -700 > '${file}.comments'"; + return create_head_cmd($input_file, $comments_file, 700); } } +sub create_head_cmd { + my ($input_file, $output_file, $count_lines) = @_; + return "head -$count_lines '$input_file' > '$output_file'"; +} + sub execute { - my ($c) = @_; - my $r = `$c`; + my ($cmd) = @_; + my $result = `$cmd`; my $status = ($? >> 8); - die "execution of program [$c] failed: status [$status]" if ($status != 0); - return $r; + die "execution of program [$cmd] failed: status [$status]" if ($status != 0); + return $result; } sub get_size { diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl index a32e283..77c3863 100755 --- a/extComments/hashComments.pl +++ b/extComments/hashComments.pl @@ -16,69 +16,75 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # -# this is to extract the first <n> comments from any language that -# uses the same prefix +# +# hashComments.pl +# +# This script extracts the first <n> comments from any language that uses the same prefix. +# +use strict; +use warnings; use Getopt::Std; -# set parameters +# parse cmdline parameters my %opts = (); -if (!getopts ('vc:p:',\%opts)) { - print STDERR "Usage $0 -v +if (!getopts ('vc:p:', \%opts)) { + print STDERR "Usage: $0 [OPTIONS] <filename> +Options: -v verbose -p comment char - -c count of comment blocks - -\n"; + -c count of comment blocks\n"; - die; + exit 1; } -my $file = $ARGV[0]; - -open (OUT, ">${file}.comments") or die "Unable to create [${file}.comments]"; - - <>; -print OUT unless /^\#\!/; - -my $comment_char = '#'; - -$comment_char = $opts{p} if exists $opts{p}; - -my $comments_count = 1; -$comments_count = $opts{c} if exists $opts{c}; my $verbose = exists $opts{v}; +my $comment_char = exists $opts{p} ? $opts{p} : '#'; +my $comments_count = exists $opts{c} ? $opts{c} : 1; + my $inside_comment = 0; my $inside_code = 0; my $comment_count = 0; my $code_count = 0; -while (<>) { - chomp; - if (is_comment($_)) { - s/\t/ /g; - s/ +/ /g; +my $input_file = $ARGV[0]; +my $comments_file = "$input_file.comments"; + +open my $input_fh, '>', $input_file or die "can't open input file [$input_file]"; +open my $comments_fh, '>', $comments_file or die "can't create output file [$comments_file]"; + +<$input_fh>; +print $comments_fh unless /^\#\!/; + +while (my $line = <$input_fh>) { + chomp $line; + if (is_comment($line)) { + $line =~ s/\t/ /g; + $line =~ s/ +/ /g; $comment_count++ if (not $inside_comment); $inside_comment = 1; - /$comment_char+/; - print OUT $' . "\n"; #' - } elsif (is_blank($_)) { - print OUT "\n"; + $line =~ /$comment_char+/; + print $comments_fh substr($line, $+[0]) . "\n"; + } elsif (is_blank($line)) { + print $comments_fh "$line\n"; } else { - exit 0; + last; } } +close $input_fh; +close $comments_fh; + sub is_comment { - my ($st) = @_; - return ($st =~ /^\s*$comment_char/); + my ($string) = @_; + return ($string =~ /^\s*$comment_char/); } sub is_blank { - my ($st) = @_; - return ($st =~ /^\s*$/); + my ($string) = @_; + return ($string =~ /^\s*$/); } diff --git a/filter/filter.pl b/filter/filter.pl index 7b0fafd..26edb0d 100755 --- a/filter/filter.pl +++ b/filter/filter.pl @@ -18,74 +18,85 @@ # # filter.pl -# This script classify input sentences into two categories, -# good sentences and bad sentences. -# This script regard a sentence include a critical word (ex. legal term) as good # -# usage: filter.pl (inputfilename) -# -# Author: Yuki Manabe +# This script classifies input sentences into two categories, good sentences and bad sentences. +# A sentence including a critical word (ex. legal term) is regarded as good. # + use strict; +use warnings; +use Getopt::Std; -#print $ARGV[0]; +my $INPUT_FILE_EXTENSION = 'sentences'; -# where are we running the program from -my $path = $0; -$path =~ s/[^\/]+$//; -if ($path eq '') { - $path = './'; +# parse cmdline parameters +if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) { + print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n"; + exit 1; } -my $file_critical_words = $path . 'criticalword.dict'; -die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/; +my $path = get_my_path($0); -my $file_good = $ARGV[0]; +my $input_file = $ARGV[0]; +my $file_critical_words = "$path/criticalword.dict"; -die "Filename should end in '.sentences' [$file_good]" unless $file_good =~ s/\.sentences$/\.goodsent/; -my $file_bad = $ARGV[0]; -$file_bad =~ s/\.sentences$/\.badsent/; +my $file_good = $input_file; $file_good =~ s/\.$INPUT_FILE_EXTENSION$/\.goodsent/; +my $file_bad = $input_file; $file_bad =~ s/\.$INPUT_FILE_EXTENSION$/\.badsent/; -#print $file_good; -#print $file_bad; +open my $input_fh, '<', $input_file or die "can't open input file [$input_file]: $!"; -open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found."); -open (DICTIONARY, "<$file_critical_words") or die ('Error: criticalword.dict is not found.'); +open my $good_fh, '>', $file_good or die "can't create good sentences file [$file_good]: $!"; +open my $bad_fh, '>', $file_bad or die "can't create bad sentences file [$file_bad]: $!"; -open (GOODOUT, ">$file_good") || die ('Error'); -open (BADOUT, ">$file_bad") || die ('Error'); - -my @critical_words = (); -# read dictionary into list -my $critical_word; -while ($critical_word = <DICTIONARY>) { - chomp $critical_word; - next if $critical_word =~ /^\#/; - $critical_word =~ s/\#.*$//; # remove everything to the end of file - push(@critical_words, "$critical_word"); -} -close(DICTIONARY); +my @critical_words = read_critical_words($file_critical_words); -#matching cliticalwords in list against sentences. -my $sentence; -while ($sentence = <INPUTFILE>) { - my $check = 0; +# matching critical words in list against sentences +while (my $sentence = <$input_fh>) { chomp $sentence; - foreach $critical_word (@critical_words) { + next unless $sentence; + my $check = 0; + foreach my $critical_word (@critical_words) { if ($sentence =~ /\b$critical_word\b/i) { $check = 1; - #print "$critical_word:$sentence"; last; } } - if ($check == 1) { - print GOODOUT "$sentence\n"; + if ($check) { + print $good_fh "$sentence\n"; } else { - print BADOUT "$sentence\n"; + print $bad_fh "$sentence\n"; + } +} + +close $input_fh; +close $good_fh; +close $bad_fh; + +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; } + return $path; } -close(INPUTFILE); -close(GOODOUT); -close(BADOUT); +sub read_critical_words { + my ($file) = @_; + my @critical_words = (); + + open my $fh, '<', $file or die "can't open file [$file]: $!"; + + while (my $line = <$fh>) { + chomp $line; + next if $line =~ /^\#/; + $line =~ s/\#.*$//; # remove everything to the end of line + push @critical_words, $line; + } + + close $fh; + + return @critical_words; +} diff --git a/matcher/matcher.pl b/matcher/matcher.pl index 9d1e74b..8eae92f 100755 --- a/matcher/matcher.pl +++ b/matcher/matcher.pl @@ -19,21 +19,19 @@ # # matchter.pl # -# This script use a set of license sentence name as input -# and output license name corresponds to a rule which match the set. -# -# author: Yuki Manabe -# -# usage: matchter.pl (inputfilename) +# This script uses a set of license sentence names as input and +# outputs license names corresponds to a rule which match the set. # use strict; +#use warnings; +use Getopt::Std; + my $debug = 0; my %NON_CRITICAL_RULES = (); # these should go into a file, but for the time being, let us keep them here - # once we have matched a rule, these are not that important my @GENERAL_NON_CRITICAL = ('AllRights'); @@ -57,10 +55,10 @@ $NON_CRITICAL_RULES{'LibraryGPLv3+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LibraryGPLv3'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LibraryGPLv2+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LibraryGPLv2'} = [@GPL_NON_CRITICAL]; -$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3','LesserGPLcopyVer3','SeeFileVer3']; +$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3', 'LesserGPLcopyVer3', 'SeeFileVer3']; $NON_CRITICAL_RULES{'LesserGPLv2.1+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LesserGPLv2.1'} = [@GPL_NON_CRITICAL]; -$NON_CRITICAL_RULES{'LGPLv2orv3'}= [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LGPLv2orv3'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LesserGPLv2'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LesserGPLv2+'} = [@GPL_NON_CRITICAL]; @@ -70,85 +68,68 @@ $NON_CRITICAL_RULES{'GPLv1+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'GPLv1'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'GPLv3+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'GPLv3'} = [@GPL_NON_CRITICAL]; -$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0']; -$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0']; +$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0']; +$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0', 'AGPLseeVer0']; $NON_CRITICAL_RULES{'GPLnoVersion'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'Apachev1.1'} = ['ApacheLic1_1']; -$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre','ApacheSee']; +$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre', 'ApacheSee']; -$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee']; -$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; -$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; +$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee']; +$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance']; +$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance', 'CDDLicWhere', 'ApachesPermLim', 'CDDLicIncludeFile', 'UseSubjectToTerm', 'useOnlyInCompliance']; -$NON_CRITICAL_RULES{'CDDLorGPLv2'}= ['CDDLorGPLv2doNotAlter','AllRights','useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include','CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2']; +$NON_CRITICAL_RULES{'CDDLorGPLv2'} = ['CDDLorGPLv2doNotAlter', 'AllRights', 'useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include', 'CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2']; $NON_CRITICAL_RULES{'CPLv1orGPLv2+orLGPLv2+'} = ['licenseBlockBegin', 'licenseBlockEnd']; -$NON_CRITICAL_RULES{'Qt'} = ['Copyright','qtNokiaExtra','QTNokiaContact', 'qtDiaTems']; -$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1','qtLGPLv2.1where']; -$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0','qtReviewGPLVer3','qtGPLwhere']; +$NON_CRITICAL_RULES{'Qt'} = ['Copyright', 'qtNokiaExtra', 'QTNokiaContact', 'qtDiaTems']; +$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1', 'qtLGPLv2.1where']; +$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0', 'qtReviewGPLVer3', 'qtGPLwhere']; $NON_CRITICAL_RULES{'digiaQTExceptionNoticeVer1.1'} = ['qtDigiaExtra']; -$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; +$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee']; +$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee']; +$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee']; +$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1', 'MPLwarranty', 'MPLSee']; -$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN','subversionHistory']; -$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN','subversionHistory']; +$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN', 'subversionHistory']; +$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN', 'subversionHistory']; $NON_CRITICAL_RULES{'tmate+'} = ['SeeFileSVN']; $NON_CRITICAL_RULES{'openSSLvar2'} = ['BSDcondAdvPart2']; -$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin','MPLsee','Copyright','licenseBlockEnd','ApacheLicWherePart1','MPLwarranty', 'MPLwarrantyVar']; -$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0','MPL_LGPLseeVer0']; +$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin', 'MPLsee', 'Copyright', 'licenseBlockEnd', 'ApacheLicWherePart1', 'MPLwarranty', 'MPLwarrantyVar']; +$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0', 'MPL_LGPLseeVer0']; $NON_CRITICAL_RULES{'FreeType'} = ['FreeTypeNotice']; $NON_CRITICAL_RULES{'GPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL]; $NON_CRITICAL_RULES{'LGPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL]; -# initialize +my $INPUT_FILE_EXTENSION = 'senttok'; -my $path = $0; -$path =~ s/[^\/]+$//; -if ($path eq '') { - $path = './'; +# parse cmdline parameters +if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) { + print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n"; + exit 1; } -my $rules_file = $path . 'rules.dict'; -my $interrules_file = $path . 'interrules.dict'; +my $path = get_my_path($0); -die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/; +my $input_file = $ARGV[0]; +my $rules_file = "$path/rules.dict"; +my $interrules_file = "$path/interrules.dict"; -my $count_unknowns = 0; +my @license_sentence_names = (); +my @originals = (); +read_original($input_file, \@license_sentence_names, \@originals); -# read the rules my @rules = read_rules($rules_file); my @inter_rules = read_inter_rules($interrules_file); -my @license_sentence_names = (); -my @original; - -read_original($ARGV[0], \@license_sentence_names, \@original); - -#foreach my $x (@license_sentence_names) { -# print "$x\n"; -#} -#exit; - -#foreach my $x (@original) { -# print "$x\n"; -#} -#exit; - ########################################## -#for my $ref( @inter_rules ){ -# print "@$ref\n"; -#} - # matching spdx requires to match strict licenses, with no alternatives... my $senttok = ',' . join(',', @license_sentence_names) . ','; @@ -159,9 +140,9 @@ print "[$senttok]\n" if $debug; match_license(); # do we have to check again? -## todo, verifythat we have unmatched sentences... +## todo, verify that we have unmatched sentences... -@license_sentence_names = split(',', $senttok); +@license_sentence_names = split ',', $senttok; # first remove the extrict part from it @@ -172,15 +153,12 @@ for (my $i = 0; $i <= $#license_sentence_names; $i++) { if ($license_sentence_names[$i] == 0 and ($license_sentence_names[$i] ne 'UNKNOWN' and $license_sentence_names[$i] ne '')) { -# print "[$license_sentence_names[$i]]\n"; $license_sentence_names[$i] =~ s/Extrict$//; $match++; } } -#print_result(); - -if ($match > 0) { +if ($match) { # print "REDO\n"; for (my $i = 0; $i <= $#inter_rules; $i++) { #for my $ref( @inter_rules[$i]){ @@ -199,99 +177,104 @@ print_result(); exit 0; -#print @license_sentence_names; -#print join(';',@license_sentence_names)."\n"; - -# 3. matching -############################### - -# we will iterate over rules, matching as many as we can... +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; + } + return $path; +} sub is_unknown { my ($s) = @_; - my @f = split (/,/, $s); + my @f = split /,/, $s; return $f[0] eq 'UNKNOWN'; } sub read_rules { my ($file) = @_; - open (RULES, "<$file") or die ('Error: rules.dict is not found.'); - my $sentence; my @rules = (); - while ($sentence = <RULES>) { - chomp $sentence; + + open my $fh, '<', $file or die "can't open file [$file]: $!"; + + while (my $line = <$fh>) { + chomp $line; # clean up spaces - $sentence =~ s/^\s+//; - $sentence =~ s/\s+$//; - $sentence =~ s/\s*,\s*/,/g; - $sentence =~ s/\s*:\s*/:/g; - #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) { + $line =~ s/^\s+//; + $line =~ s/\s+$//; + $line =~ s/\s*,\s*/,/g; + $line =~ s/\s*:\s*/:/g; + # check format + if ($line =~ /^#/ || $line !~ /(.*):(.*,)*(.*)/) { next; } - $sentence =~ /(.*?):(.*)/; - push (@rules, [$1, $2]); + $line =~ /(.*?):(.*)/; + push @rules, [$1, $2]; } - close RULES; + + close $fh; + return @rules; } sub read_inter_rules { my ($file) = @_; + my @inter_rules = (); - my @inter_rules; - open (IRULES, "<$file") or die ('Error: interrules.dict is not found.'); - my $sentence; - while ($sentence = <IRULES>) { - chomp $sentence; - #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) { + open my $fh, '<', $file or die "can't open file [$file]: $!"; + + while (my $line = <$fh>) { + chomp $line; + # check format + if ($line =~ /^#/ || $line !~ /(.*?):(.*)/) { next; } - foreach my $item (split(/\|/, $2)) { - push (@inter_rules, [$item, $1]); + foreach my $item (split /\|/, $2) { + push @inter_rules, [$item, $1]; } } - close IRULES; + + close $fh; + return @inter_rules; } sub read_original { my ($file, $tokens, $originals) = @_; - open (INPUTFILE, $file) or die ("Error: $file is not found."); + open my $fh, '<', $file or die "can't open file [$file]: $!"; - my $sentence; - my @original; - while ($sentence = <INPUTFILE>) { + while (my $sentence = <$fh>) { chomp $sentence; - my @fields = split(':', $sentence); - push(@$originals, $fields[1]); - my @token = split(';', $fields[0]); - push(@$tokens, $token[0]); + my @fields = split ':', $sentence; + push @$originals, $fields[1]; + my @token = split ';', $fields[0]; + push @$tokens, $token[0]; } + + close $fh; + if (scalar(@$originals) == 0) { print "NONE\n"; exit 0; } - -#print join(';',@license_sentence_names)."\n"; - - close INPUTFILE; } +# we will iterate over rules, matching as many as we can... sub match_license { # create a string with the sentences for (my $j = 0; $j <= $#rules; $j++) { my $rule = $rules[$j][1]; my $rulename = $rules[$j][0]; - my $rule_length = scalar(split(',', $rule)); + my $rule_length = scalar(split ',', $rule); # replace rule with the length of the rule print "To try [$rulename][$rule] on [$senttok]\n" if $debug; - while ($senttok =~ s/,${rule},/,$rule_length,/) { + while ($senttok =~ s/,$rule,/,$rule_length,/) { $count_matches++; - push (@result, $rulename); + push @result, $rulename; # print ">>>>$senttok|$rules[$j][1]\n"; # print 'Result: ', join(',', @result); # print "\n"; @@ -351,14 +334,12 @@ sub match_license { } sub print_result { -# $senttok =~ s/AllRights(,?)/$1/g; -# $senttok =~ s/UNKNOWN,/,/g; -# $senttok =~ s/,+/,/g; +# $senttok =~ s/AllRights(,?)/$1/g; +# $senttok =~ s/UNKNOWN,/,/g; +# $senttok =~ s/,+/,/g; my $save = $senttok; - # ok, so now, what I want to output it: - # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown - my @sections = split(',', $senttok); + my @sections = split ',', $senttok; die 'assertion 1' if $sections[0] ne ''; die 'assertion 2' if $sections[scalar(@sections)] ne ''; @@ -367,7 +348,6 @@ sub print_result { my $unknown_lines = 0; my $unmatched_lines = 0; foreach my $i (1..scalar(@sections)-1) { -# print "$i;$sections[$i]\n"; if ($sections[$i] < 0) { $ignored_lines += - $sections[$i]; } elsif ($sections[$i] != 0) { @@ -380,12 +360,13 @@ sub print_result { } $senttok =~ s/^,(.*),$/$1/; -# print "$ignored_lines > $license_lines > $unknown_lines > $unmatched_lines\n"; if (scalar (@result) == 0) { print 'UNKNOWN'; } else { - print join(',',@result); + print join ',', @result; } + # ok, so now, what I want to output is: + # licenses; number of licenses matched;number of sentences matched; number of sentences ignored;number of sentences not matched;number of sentences unknown print ";$count_matches;$license_lines;$ignored_lines;$unmatched_lines;$unknown_lines;$senttok\n"; $senttok = $save; } @@ -17,15 +17,19 @@ # use strict; +use warnings; use Getopt::Std; +# parse cmdline parameters my %opts = (); -if (!getopts ("vfCcSsGgTtLd",\%opts) or scalar(@ARGV) == 0) { -print STDERR "Ninka version 1.1 +if (!getopts('vfCcSsGgTtLd', \%opts) or scalar(@ARGV) == 0) { + print STDERR "Ninka version 1.1 -Usage $0 -fCtTvcgsGd <filename> +Usage: $0 [OPTIONS] <filename> +Options: -v verbose + -f force all processing -C force creation of comments @@ -42,95 +46,95 @@ Usage $0 -fCtTvcgsGd <filename> -L force creation of matching - -d delete intermediate files - -\n"; + -d delete intermediate files\n"; exit 1; } my $verbose = exists $opts{v}; -my $delete = exists $opts{d}; -#$delete = 1; +my $delete = exists $opts{d}; -my $path = $0; - -$path =~ s/\/+[^\/]+$//; -if ($path eq "") { - $path = "./"; -} - -my $force = exists $opts{f}; -my $force_good = exists $opts{G}; +my $force = exists $opts{f}; +my $force_comments = exists $opts{C}; my $force_sentences = exists $opts{S}; -my $force_senttok = exists $opts{T}; -my $force_comments = exists $opts{C}; -my $force_license = exists $opts{L}; +my $force_good = exists $opts{G}; +my $force_senttok = exists $opts{T}; +my $force_license = exists $opts{L}; -#die "Usage $0 <filename>" unless $ARGV[0] =~ /\.(c|cpp|java|cc|cxx|h|jl|py|pm|el|pl)$/; +my $path = get_my_path($0); my $input_file = $ARGV[0]; -print "Starting: $input_file;\n" if ($verbose); +my $comments_file = "$input_file.comments"; +my $sentences_file = "$input_file.sentences"; +my $goodsent_file = "$input_file.goodsent"; +my $badsent_file = "$input_file.badsent"; +my $senttok_file = "$input_file.senttok"; +my $license_file = "$input_file.license"; +print "Starting: $input_file;\n" if $verbose; print "$input_file;"; -my $comments_file = "${input_file}.comments"; -my $sentences_file = "${input_file}.sentences"; -my $goodsent_file = "${input_file}.goodsent"; -my $senttok_file = "${input_file}.senttok"; - -if (not (-f "$input_file")) { - print "ERROR;[${input_file}] is not a file\n" ; - exit 0; +if (not (-f $input_file)) { + print "ERROR;[$input_file] is not a file\n"; + exit 1; } -do_file_process($input_file, $comments_file, ($force or $force_comments), - "$path/extComments/extComments.pl -c1 '${input_file}'", - "Creating comments file", - exists $opts{c}); +process_file($input_file, $comments_file, ($force or $force_comments), + "$path/extComments/extComments.pl '$input_file'", + 'Creating comments file', exists $opts{c}); -do_file_process($comments_file, $sentences_file, ($force or $force_sentences), - "$path/splitter/splitter.pl '${comments_file}'", - "Splitting sentences", exists $opts{s}); +process_file($comments_file, $sentences_file, ($force or $force_sentences), + "$path/splitter/splitter.pl '$comments_file'", + 'Splitting sentences', exists $opts{s}); -do_file_process($sentences_file, $goodsent_file, ($force or $force_good), - "$path/filter/filter.pl '${sentences_file}'", - "Filtering good sentences", exists $opts{s}); +process_file($sentences_file, $goodsent_file, ($force or $force_good), + "$path/filter/filter.pl '$sentences_file'", + 'Filtering good sentences', exists $opts{s}); -do_file_process($goodsent_file, $senttok_file, ($force or $force_senttok), - "$path/senttok/senttok.pl '${goodsent_file}' > '${senttok_file}'", - "Matching sentences against rules", exists $opts{t}); +process_file($goodsent_file, $senttok_file, ($force or $force_senttok), + "$path/senttok/senttok.pl '$goodsent_file' > '$senttok_file'", + 'Matching sentences against rules', exists $opts{t}); -print "Matching ${input_file}.senttok against rules" if ($verbose); -execute("$path/matcher/matcher.pl '${input_file}.senttok' > '${input_file}.license'"); +process_file($senttok_file, $license_file, ($force or $force_license), + "$path/matcher/matcher.pl '$senttok_file' > '$license_file'", + 'Matching sentence tokens against rules', 0); -print `cat '${input_file}.license'`; - -unlink("${input_file}.code"); +print `cat '$license_file'`; if ($delete) { - unlink("${input_file}.badsent"); - unlink("${input_file}.comments"); - unlink("${input_file}.goodsent"); -# unlink("${input_file}.sentences"); - unlink("${input_file}.senttok"); + unlink $comments_file; + unlink $sentences_file; + unlink $goodsent_file; + unlink $badsent_file; + unlink $senttok_file; } exit 0; -sub do_file_process { +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; + } + return $path; +} + +sub process_file { my ($input, $output, $force, $cmd, $message, $end) = @_; - print "${message}:" if ($verbose); + print "$message:" if $verbose; if ($force or is_newer($input, $output)) { - print "Running ${cmd}:" if ($verbose); + print "Running $cmd:" if $verbose; execute($cmd); } else { - print "File [$output] newer than input [$input], not creating:" if ($verbose); + print "File [$output] newer than input [$input], not creating:" if $verbose; } + if ($end) { - print "Existing after $message" if $verbose; + print "Exiting after $message" if $verbose; print "\n"; exit 0; } @@ -138,7 +142,6 @@ sub do_file_process { sub execute { my ($command) = @_; -# print "\nTo execute [$command]\n"; my $result = `$command`; my $status = ($? >> 8); die "execution of program [$command] failed: status [$status]" if ($status != 0); @@ -147,8 +150,9 @@ sub execute { sub is_newer { my ($f1, $f2) = @_; - my ($f1write) = (stat($f1))[9]; - my ($f2write) = (stat($f2))[9]; + my $f1write = (stat $f1)[9]; + my $f2write = (stat $f2)[9]; + if (defined $f1write and defined $f2write) { return $f1write > $f2write; } else { diff --git a/senttok/senttok.pl b/senttok/senttok.pl index 5bef3c5..1fa104d 100755 --- a/senttok/senttok.pl +++ b/senttok/senttok.pl @@ -16,35 +16,36 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # +# +# senttok.pl +# +# This script creates a file that corresponds to the recognized sentence tokens. +# For each sentence, it outputs its sentence token, or unknown otherwise. +# + use strict; +#use warnings; +use Getopt::Std; -my $TOO_LONG = 70; +my $INPUT_FILE_EXTENSION = 'goodsent'; -# where are we running the splitter from? -my $path = $0; -$path =~ s/[^\/]+$//; -if ($path eq "") { - $path = "./"; +# parse cmdline parameters +if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) { + print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n"; + exit 1; } -my $path_license_sentences = $path . "licensesentence.dict"; -open FH, "<$ARGV[0]"; -my @license_sentences = (); -open LICENSESENTENCEFILE, "<$path_license_sentences"; -my $line; -while ($line = <LICENSESENTENCEFILE>) { - chomp $line; - next if $line =~ /^\#/; - next if $line =~ /^ *$/; - die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/; - push @license_sentences, $line; -} +my $TOO_LONG = 70; + +my $path = get_my_path($0); + +my $input_file = $ARGV[0]; +my $license_sentences_file = "$path/licensesentence.dict"; -#foreach $line (@license_sentences) { -# print $line; -#} -close LICENSESENTENCEFILE; -while ($line = <>) { +my @license_sentences = read_license_sentences($license_sentences_file); + +open my $input_fh, '<', $input_file or die "can't open file [$input_file]: $!"; +while (my $line = <$input_fh>) { my $save_line; my $original_line; chomp $line; @@ -57,11 +58,10 @@ while ($line = <>) { $line = normalize_sentence($line); my $check = 0; - my $match_name = "UNKNOWN"; - my @parm = (); - my $sentence; + my $match_name = 'UNKNOWN'; + my @parameters = (); my $distance = 1; #maximum? number - my $most_similar_name = "UNKNOWN"; + my $most_similar_name = 'UNKNOWN'; my $before; my $after; my $gpl = 0; @@ -69,10 +69,6 @@ while ($line = <>) { $save_line = $line; -# print "Original -# [$line] -#\n"; - my $line_as_gpl = ''; if (looks_like_gpl($line)) { @@ -82,35 +78,28 @@ while ($line = <>) { $line_as_gpl = $line; } my ($name, $sub_rule, $number, $regexp, $option); - my $save_line = $line; + $save_line = $line; my $save_gpl = $gpl; - my $LGPL = ""; - foreach $sentence (@license_sentences) { - ($name, $sub_rule, $number, $regexp, $option) = split(/:/, $sentence); + my $LGPL = ''; + foreach my $sentence (@license_sentences) { + ($name, $sub_rule, $number, $regexp, $option) = split /:/, $sentence; # we need this due to the goto again $line = $save_line; $gpl = $save_gpl; - $LGPL = ""; + $LGPL = ''; again: -# print "Testing -# lin[$line] -# ori[$save_line] -# re [$regexp] -# lpg[$LGPL] -#\n"; if ($line =~ /$regexp/im) { $before = $`; - $after = $'; #'; + $after = $'; $check = 1; $match_name = $name; for (my $i = 1; $i <= $number; $i++) { no strict 'refs'; - push @parm, $$i; + push @parameters, $$i; } last; } else { -# print "NO MATCH\n"; - # let us try again in cas it is lesser/library + # let us try again in case it is lesser/library # do it only once if ($gpl and $line =~ s/(Lesser|Library) GPL/GPL/i) { $LGPL = $1; @@ -121,7 +110,7 @@ while ($line = <>) { $line = $save_line; goto again; } - next;## dmg + next; my $targetset = $regexp; $targetset =~ s/^(.*)$/$1/; my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence)); @@ -130,30 +119,42 @@ while ($line = <>) { $distance = $tmpdist; } } - last; ### + last; } - if ($check == 1) { - # licensesentence name, parm1, parm2,.. + if ($check) { + # licensesentence name, param1, param2, ... if ($gpl) { - $match_name .= "Ver" . $gpl_version; - $match_name .= "+" if $gpl_later; + $match_name .= 'Ver' . $gpl_version; + $match_name .= '+' if $gpl_later; $match_name = $LGPL . $match_name; } if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) { - $match_name .= "-TOOLONG"; + $match_name .= '-TOOLONG'; } - my $parmstrings = join(";", $match_name, $sub_rule, $before, $after, @parm); - print $parmstrings, ":$original_line\n"; + # TODO: Use of uninitialized value in @parameters + my $parameter_string = join ';', $match_name, $sub_rule, $before, $after, @parameters; + print $parameter_string, ":$original_line\n"; } else { # UNKNOWN, sentence chomp $line; - print $match_name, ";", 0, ";", $most_similar_name, ";", $distance, ";", $save_line, ":$original_line\n"; + my $parameter_string = join ';', $match_name, 0, $most_similar_name, $distance, $save_line; + print $parameter_string, ":$original_line\n"; } } +close $input_fh; -close FH; exit 0; +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; + } + return $path; +} + sub normalize_gpl { my ($line) = @_; my $later = 0; @@ -177,7 +178,6 @@ sub normalize_gpl { } if ($line =~ s/(version|v\.?) ([123\.0]+)/<VERSION>/i) { $version = $2; -# print "Version [$version]\n"; } if ($line =~ s/GPL ?[v\-]([123\.0]+)/GPL <VERSION>/i) { $version = $1; @@ -217,9 +217,7 @@ sub normalize_gpl { $line =~ s/ +/ /; $line =~ s/ +$//; -# print ">>>>>>>>>>$line:$later:$version\n"; - - return ($line,$later,$version); + return ($line, $later, $version); } sub looks_like_gpl { @@ -296,8 +294,8 @@ sub levenshtein { # Some char-by-char processing is ahead, so prepare # array of chars from the strings # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); + my @ar1 = split //, $s1; + my @ar2 = split //, $s2; for (my $i = 1; $i <= $len1; ++$i) { for (my $j = 1; $j <= $len2; ++$j) { @@ -348,3 +346,22 @@ sub max { return $list[0] > $list[1] ? $list[0] : $list[1]; } +sub read_license_sentences { + my ($file) = @_; + my @license_sentences = (); + + open my $fh, '<', $file or die "can't open file [$file]: $!"; + + while (my $line = <$fh>) { + chomp $line; + next if $line =~ /^\#/; + next if $line =~ /^ *$/; + die "illegal format in license expression [$line]" unless $line =~ /(.*?):(.*?):(.*)/; + push @license_sentences, $line; + } + + close $fh; + + return @license_sentences; +} + diff --git a/splitter/splitter.pl b/splitter/splitter.pl index c806f96..dfffa83 100755 --- a/splitter/splitter.pl +++ b/splitter/splitter.pl @@ -5,11 +5,14 @@ # Author: Paul Clough {cloughie@dcs.shef.ac.uk} # -# This program is originally based on the sentence splitter program -# published by Paul Clough. Version 1.0, available form -# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip) -# The original program without a license. - +# This program is originally based on the sentence splitter program +# published by Paul Clough. Version 1.0, available from +# http://ir.shef.ac.uk/cloughie/software.html (splitter.zip) +# The original program is without a license. +# +# It was mostly rewritten. +# His ideas, however, linger in here (and his file of abbreviations) +# # Modifications to the original by Daniel M German and Y. Manabe, # which are under the following license: # @@ -25,52 +28,37 @@ # # You should have received a copy of the GNU Affero General Public License # along with this patch. If not, see <http://www.gnu.org/licenses/>. +# -use strict; - -# This program is originally based on the sentence splitter program -# published by Paul Clough. Version 1.0, but then it was mostly rewritten -# His ideas, however, linger in here (and his dictionary_file of abbreviations) - -my $dictionary_file = 'splitter.dict'; -my $abbreviations_file = 'splitter.abv'; -my $length = 0; -my %COMMON_TERMS = (); -my %ABBREVIATIONS = (); -my $output_file = $ARGV[0]; - -# where are we running the splitter from? -my $path = $0; -$path =~ s/[^\/]+$//; -if ($path eq '') { - $path = './'; -} -$dictionary_file = $path . $dictionary_file; -$abbreviations_file = $path . $abbreviations_file; +# +# splitter.pl +# +# This script breaks comments into sentences. +# -die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/; +use strict; +use warnings; +use Getopt::Std; -die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/; +my $INPUT_FILE_EXTENSION = 'comments'; -open(OUT, ">$output_file") or die("Unable to create output file [$output_file]"); +# parse cmdline parameters +if (!getopts('') or scalar(@ARGV) == 0 or !($ARGV[0] =~ /\.$INPUT_FILE_EXTENSION$/)) { + print STDERR "Usage $0 <filename>.$INPUT_FILE_EXTENSION\n"; + exit 1; +} -#print length($opt_o); +my $path = get_my_path($0); -# Load in the dictionary and find the common words. -# Here, we assume the words in upper case are simply names and one -# word per line - i.e. in same form as /usr/dict/words -&load_dictionary; +my $input_file = $ARGV[0]; +my $abbreviations_file = "$path/splitter.abv"; -# Same assumptions as for dictionary -&load_abbreviations; +my $output_file = $input_file; $output_file =~ s/\.$INPUT_FILE_EXTENSION$/\.sentences/; -my $text; -# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n"; +my $text = read_file_as_string($input_file); +my %abbreviations = load_abbreviations($abbreviations_file); -my $line; -while (defined ($line = <>)) { - $text .= $line; -} +open my $output_fh, '>', $output_file or die "can't create output file [$output_file]: $!"; # append a newline just in case $text .= "\n"; @@ -82,7 +70,7 @@ $text =~ s@={3,1000}@ @gmx; $text =~ s@:{3,1000}@ @gmx; $text =~ s@\*{3,1000}@ @gmx; -# some characters are used for prettyprinting but never appear in sentences +# some characters are used for pretty-printing but never appear in sentences $text =~ s@\|+@ @gmx; $text =~ s@\\+@ @gmx; @@ -91,7 +79,7 @@ $text =~ s@^[ \t]*/\*@@gmx; $text =~ s/\*\/[ \t]*$//gmx; $text =~ s@([^:])// @$1@gmx; -# Replace /\r\n/ with \n only +# replace /\r\n/ with \n only $text =~ s/\r\n/\n/g; # now, try to replace the leading/ending character of each line #/-, at most 3 heading characters @@ -109,9 +97,9 @@ $text =~ s/[\*\#]+//gmx; $text =~ s/^[ \t]+$/\n/gm; # let us try the following trick -# We first get rid of \t and replace it with ' ' -# we then use \t as a "single line separator" and \n as multiple line. -# so we can match each with a single character. +# we first get rid of \t and replace it with ' ' +# we then use \t as a "single line separator" and \n as multiple line +# so we can match each with a single character $text =~ tr/\t/ /; $text =~ s/\n(?!\n)/\t/g; @@ -123,61 +111,66 @@ $text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g; while ($text =~ /^([^\n]*)\n/gsm) { my $curr = $1; -# print "<<$curr\n<<\n"; # let us count the number of alphabetic chars to check if we are skipping anything we should not - my $count = 0; + my $count1 = 0; for my $i (0..length($curr)-1) { - my $c = substr($curr,$i,1); - $count++ if ($c ge 'A' && $c le 'z'); + my $c = substr($curr, $i, 1); + $count1++ if ($c ge 'A' && $c le 'z'); } my @sentences = split_text($curr); my $count2 = 0; - foreach my $sentence (@sentences) { for my $i (0..length($sentence)-1) { my $c = substr($sentence, $i, 1); $count2++ if ($c ge 'A' && $c le 'z'); } - print OUT clean_sentence($sentence) , "\n"; + my $clean_sentence = clean_sentence($sentence); + next unless $clean_sentence; + print $output_fh $clean_sentence, "\n"; } - if ($count != $count2) { + + if ($count1 != $count2) { print STDERR "-------------------------------------\n"; print STDERR "[$curr]\n"; foreach my $sentence (@sentences) { - print STDERR clean_sentence($sentence) , "\n"; + print STDERR clean_sentence($sentence), "\n"; } - die "Number of printable chars does not match! [$count][$count2]"; + die "Number of printable chars does not match! [$count1][$count2]"; } } -close OUT; -#print "$text\n"; +close $output_fh; -exit; +exit 0; -#*************************************************************************************************** -# procedures -#*************************************************************************************************** +sub get_my_path { + my ($self) = @_; + my $path = $self; + $path =~ s/\/+[^\/]+$//; + if ($path eq '') { + $path = './'; + } + return $path; +} sub clean_sentence { ($_) = @_; - # check for trailing bullets of different types + # check for trailing bullets of different types s/^o //; s/^\s*[0-9]{1-2}+\s*[\-\)]//; s/^[ \t]+//; s/[ \t]+$//; + # remove a trailing - s/^[ \t]*[\-\.\s*] +//; - # replace quotes s/\s+/ /g; s/['"`]+/<quotes>/g; - s/:/<colon>/g; s/\.+$/./; @@ -198,15 +191,15 @@ sub split_text { my $after; my $current_sentence = ''; # this breaks the sentence into - # 1. Any text before a separator - # 2. The separator [.!?:\n] + # 1. any text before a separator + # 2. the separator [.!?:\n] # 3. - while ($text =~ /^ + while ($text =~ /^ ([^\.\!\?\:\n]*) # ([\.\!\?\:\n]) (?=(.?)) /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) { - $text = $'; #'; + $text = $'; my $sentence_match = $1; my $sentence = $1 . $2; my $punctuation = $2; @@ -217,7 +210,7 @@ sub split_text { $current_sentence .= $sentence; next; } - #at this point we know that there is a space after + # at this point we know that there is a space after if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') { # let us consider this right here a beginning of a sentence push @result, $current_sentence . $sentence; @@ -231,7 +224,7 @@ sub split_text { # TODO # simple heuristic... let us check that the next words are not the beginning of a sentence # in our library - # ENDTODO + # END TODO # is the last word an abbreviation? For this the period has to follow the word # this expression might have to be updated to take care of special characters in names :( @@ -260,7 +253,7 @@ sub split_text { # only accept abbreviations if the previous char to the abbrev is space or # is empty (beginning of line). This avoids things like .c - if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$last_word}) { + if (length($before) > 0 and $before eq ' ' and $abbreviations{$last_word}) { $current_sentence .= $sentence; next; } else { @@ -277,39 +270,32 @@ sub split_text { } push @result, $current_sentence . $text; - #print_non_sentence($text,"\n",''); return @result; } -sub load_dictionary { - my $common_term = ''; +sub read_file_as_string { + my $file = shift; - if (open(DICT, $dictionary_file)) { - while (defined ($line = <DICT>)) { - chomp($line); - if ($line !~ /^[A-Z]/) { - $COMMON_TERMS{$line} = 1; - } - } + open my $fh, '<', $file or die "can't open file '$file': $!"; + my $content = do { local $/; <$fh> }; + close $fh or die "can't close file '$file': $!"; - close(DICT); - } else { - die "cannot open dictionary file $dictionary_file: $!"; - } + return $content; } sub load_abbreviations { - my $abbrv_term = ''; + my ($file) = @_; + my %abbreviations = (); - if (open(ABBRV, $abbreviations_file)) { - while (defined ($line = <ABBRV>)) { - chomp($line); - $ABBREVIATIONS{$line} = $line; - } + open my $fh, '<', $file or die "can't open file [$file]: $!"; - close(ABBRV); - } else { - die "cannot open abbreviations file $abbreviations_file: $!"; + while (my $line = <$fh>) { + chomp $line; + $abbreviations{$line} = $line; } + + close $fh; + + return %abbreviations; } |