diff options
author | René Scheibe <rene.scheibe@gmail.com> | 2014-09-03 15:56:17 +0200 |
---|---|---|
committer | René Scheibe <rene.scheibe@gmail.com> | 2014-09-03 15:56:17 +0200 |
commit | 8b20ac6d9016be8d5e66d791a6d17cfd0429781f (patch) | |
tree | 74424be03ff503d5c058cf08f8fd8b7c1694d11a | |
parent | 525e3e973cf3bd388e4cffa3054cb2b753c545c7 (diff) | |
parent | 334865e50c2af0d9ce1c5be8385ad984d5c62346 (diff) | |
download | ninka-8b20ac6d9016be8d5e66d791a6d17cfd0429781f.tar.gz |
Merge branch 'consistent-formatting' into integration
Conflicts:
extComments/extComments.pl
extComments/hashComments.pl
matcher/matcher.pl
splitter/splitter.pl
-rwxr-xr-x | extComments/extComments.pl | 32 | ||||
-rwxr-xr-x | extComments/hashComments.pl | 20 | ||||
-rwxr-xr-x | filter/filter.pl | 44 | ||||
-rwxr-xr-x | matcher/matcher.pl | 149 | ||||
-rwxr-xr-x | ninka.pl | 45 | ||||
-rwxr-xr-x | senttok/senttok.pl | 167 | ||||
-rwxr-xr-x | splitter/splitter.pl | 102 |
7 files changed, 216 insertions, 343 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl index 4ba9f59..1a4caec 100755 --- a/extComments/extComments.pl +++ b/extComments/extComments.pl @@ -1,5 +1,4 @@ #!/usr/bin/env perl - # # Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German # @@ -30,7 +29,7 @@ if ($path eq '') { # set parameters my %opts = (); if (!getopts ('vc:p:',\%opts)) { -print STDERR "Usage $0 -v + print STDERR "Usage $0 -v -v verbose -c count of comment blocks @@ -49,8 +48,6 @@ $numberComments = $opts{c} if exists $opts{c}; my $verbose = 1; $verbose = exists $opts{v}; - - if (get_size($f) == 0) { print STDERR "Empty file, just exit\n" if $verbose; exit 0; # nothing to report, just end @@ -60,30 +57,22 @@ my $commentsCmd = Determine_Comments_Extractor($f); execute("$commentsCmd"); -if ($commentsCmd =~ /^comments/ and - get_size("${f}.comments") == 0){ +if ($commentsCmd =~ /^comments/ and get_size("${f}.comments") == 0) { `cat '$f' | head -700 > ${f}.comments`; } exit 0; - -sub Determine_Comments_Extractor -{ +sub Determine_Comments_Extractor { my ($f) = @_; if ($f =~ /\.([^\.]+)$/) { my $ext= $1; - if ($ext =~ /^(pl|pm|py)$/ - ) { -######################## -# for the time being, let us just extract the top 400 lines - + if ($ext =~ /^(pl|pm|py)$/) { + # for the time being, let us just extract the top 400 lines return "cat '$f' | head -400 > '${f}.comments'"; # return "$path/hashComments.pl -p '#' '$f'"; - } elsif ($ext eq 'jl' or - $ext eq 'el' - ) { + } elsif ($ext eq 'jl' or $ext eq 'el') { return "cat '$f' | head -400 > '${f}.comments'"; # return "$path/hashComments.pl -p ';' '$f'";; } elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) { @@ -102,20 +91,17 @@ sub Determine_Comments_Extractor } } -sub execute -{ +sub execute { my ($c) = @_; -# print "\nTo execute [$c]\n"; my $r = `$c`; my $status = ($? >> 8); die "execution of program [$c] failed: status [$status]" if ($status != 0); return $r; } - -sub get_size -{ +sub get_size { my ($f) = @_; my $size = (stat($f))[7]; return $size; } + diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl index c83846b..baa9937 100755 --- a/extComments/hashComments.pl +++ b/extComments/hashComments.pl @@ -1,5 +1,4 @@ #!/usr/bin/env perl - # # Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German # @@ -17,20 +16,18 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # - -# this is to extract the first <n> comments from any language that +# this is to extract the first <n> comments from any language that # uses the same prefix use Getopt::Std; - # set parameters my %opts = (); if (!getopts ('vc:p:',\%opts)) { -print STDERR "Usage $0 -v + print STDERR "Usage $0 -v -v verbose - -p comment char + -p comment char -c count of comment blocks \n"; @@ -64,7 +61,7 @@ while (<>) { if (Is_Comment($_)) { s/\t/ /g; s/ +/ /g; - $comCount ++ if (not $insideComment); + $comCount++ if (not $insideComment); $insideComment = 1; /$commentChar+/; print OUT $' . "\n"; #' @@ -72,18 +69,15 @@ while (<>) { print OUT "\n"; } else { exit 0; - } + } } - -sub Is_Comment -{ +sub Is_Comment { my ($st) = @_; return ($st =~ /^\s*$commentChar/); } -sub Is_Blank -{ +sub Is_Blank { my ($st) = @_; return ($st =~ /^\s*$/); } diff --git a/filter/filter.pl b/filter/filter.pl index 615c850..84269c6 100755 --- a/filter/filter.pl +++ b/filter/filter.pl @@ -16,7 +16,6 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # - # # filter.pl # This script classify input sentences into two categories, @@ -29,7 +28,6 @@ # use strict; - #print $ARGV[0]; # where are we running the program from @@ -44,7 +42,6 @@ die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/; my $goodfilename = $ARGV[0]; - die "Filename should end in '.sentences' [$goodfilename]" unless $goodfilename =~ s/\.sentences$/\.goodsent/; my $badfilename = $ARGV[0]; $badfilename =~ s/\.sentences$/\.badsent/; @@ -58,36 +55,37 @@ open (DICTIONARY, "<$critWords") or die ('Error: criticalword.dict is not found. open (GOODOUT, ">$goodfilename") || die ('Error'); open (BADOUT, ">$badfilename") || die ('Error'); -my @cwordlist=(); +my @cwordlist = (); # read dictionary into list my $cword; -while ($cword=<DICTIONARY>){ - chomp $cword; - next if $cword =~ /^\#/; - $cword =~ s/\#.*$//; # remove everything to the end of file - push(@cwordlist,"$cword"); +while ($cword = <DICTIONARY>) { + chomp $cword; + next if $cword =~ /^\#/; + $cword =~ s/\#.*$//; # remove everything to the end of file + push(@cwordlist, "$cword"); } close(DICTIONARY); #matching cliticalwords in list against sentences. my $sentence; -while ($sentence=<INPUTFILE>){ - my $check=0; - chomp $sentence; - foreach $cword (@cwordlist){ - if($sentence =~ /\b$cword\b/i){ - $check=1; - #print "$cword:$sentence"; - last; +while ($sentence = <INPUTFILE>) { + my $check = 0; + chomp $sentence; + foreach $cword (@cwordlist) { + if ($sentence =~ /\b$cword\b/i) { + $check = 1; + #print "$cword:$sentence"; + last; + } + } + if ($check == 1) { + print GOODOUT "$sentence\n"; + } else { + print BADOUT "$sentence\n"; } - } - if ($check==1){ - print GOODOUT "$sentence\n"; - }else{ - print BADOUT "$sentence\n"; - } } close(INPUTFILE); close(GOODOUT); close(BADOUT); + diff --git a/matcher/matcher.pl b/matcher/matcher.pl index 665cc77..e805b6c 100755 --- a/matcher/matcher.pl +++ b/matcher/matcher.pl @@ -1,5 +1,4 @@ #!/usr/bin/env perl - # # Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German # @@ -20,7 +19,7 @@ # # matchter.pl # -# This script use a set of license sentence name as input +# This script use a set of license sentence name as input # and output license name corresponds to a rule which match the set. # # author: Yuki Manabe @@ -116,32 +115,24 @@ my $path = $0; $path =~ s/[^\/]+$//; if ($path eq '') { $path = './'; - } +} my $rules= $path . 'rules.dict'; my $interrules= $path . 'interrules.dict'; die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/; - -# read rules - my $countUnknowns = 0; - # read the rules - my @rulelist = Read_Rules($rules); - my @interRuleList = Read_Inter_Rules($interrules); - -my @licSentNames=(); +my @licSentNames = (); my @original; Read_Original($ARGV[0], \@licSentNames, \@original); - #foreach my $x (@licSentNames) { # print "$x\n"; #} @@ -160,8 +151,8 @@ Read_Original($ARGV[0], \@licSentNames, \@original); # matching spdx requires to match strict licenses, with no alternatives... -my $senttok= ',' . join(',',@licSentNames) . ','; -my @result=(); +my $senttok = ',' . join(',', @licSentNames) . ','; +my @result = (); my $countMatches = 0; print "[$senttok]\n" if $debug; @@ -177,154 +168,136 @@ Match_License(); #Print_Result(); my $match = 0; -for (my $i=0;$i<=$#licSentNames ;$i++) { - if ($licSentNames[$i] == 0 and - ($licSentNames[$i] ne 'UNKNOWN' and +for (my $i = 0; $i <= $#licSentNames; $i++) { + if ($licSentNames[$i] == 0 and + ($licSentNames[$i] ne 'UNKNOWN' and $licSentNames[$i] ne '')) { # print "[$licSentNames[$i]]\n"; $licSentNames[$i] =~ s/Extrict$//; - $match ++; + $match++; } } #Print_Result(); - if ($match > 0) { # print "REDO\n"; - for (my $i=0;$i<=$#interRuleList ;$i++){ + for (my $i = 0; $i <= $#interRuleList; $i++) { #for my $ref( @interRuleList[$i]){ # print "@$ref\n"; #} #print $interRuleList[$i][0]; @licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames; } - - $senttok= join(',',@licSentNames) . ','; - + + $senttok = join(',', @licSentNames) . ','; + Match_License(); } Print_Result(); - exit 0; - - #print @licSentNames; #print join(';',@licSentNames)."\n"; - # 3. matching ############################### # we will iterate over rules, matching as many as we can... - - - - -sub Is_Unknown -{ +sub Is_Unknown { my ($s) = @_; my @f = split (/,/, $s); return $f[0] eq 'UNKNOWN'; } - -sub Read_Rules -{ +sub Read_Rules { my ($rulesF) = @_; open (RULES, "<$rulesF") or die ('Error: rules.dict is not found.'); my $sentence; my @rules = (); - while ($sentence=<RULES>){ + while ($sentence = <RULES>) { chomp $sentence; - # clean up spaces - $sentence=~ s/^\s+//; - $sentence=~ s/\s+$//; - $sentence=~ s/\s*,\s*/,/g; - $sentence=~ s/\s*:\s*/:/g; + # clean up spaces + $sentence =~ s/^\s+//; + $sentence =~ s/\s+$//; + $sentence =~ s/\s*,\s*/,/g; + $sentence =~ s/\s*:\s*/:/g; #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/){ + if ($sentence =~ /^#/ || $sentence !~ /(.*):(.*,)*(.*)/) { next; } $sentence =~ /(.*?):(.*)/; - push (@rules,[$1,$2]); + push (@rules, [$1, $2]); } close RULES; return @rules; } - -sub Read_Inter_Rules -{ +sub Read_Inter_Rules { my ($interrules) = @_; my @list; open (IRULES, "<$interrules") or die ('Error: interrules.dict is not found.'); my $sentence; - while ($sentence=<IRULES>){ + while ($sentence = <IRULES>) { chomp $sentence; #check format - if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/){ + if ($sentence =~ /^#/ || $sentence !~ /(.*?):(.*)/) { next; } - foreach my $item (split(/\|/,$2)){ - push (@list,[$item,$1]); + foreach my $item (split(/\|/, $2)) { + push (@list, [$item, $1]); } } close IRULES; return @list; } -sub Read_Original -{ +sub Read_Original { my ($inputF, $tokens, $originals) = @_; open (INPUTFILE, $inputF) or die ("Error: $inputF is not found."); - + my $sentence; my @original; - while ($sentence = <INPUTFILE>){ + while ($sentence = <INPUTFILE>) { chomp $sentence; - my @fields = split(':',$sentence); - push(@$originals,$fields[1]); + my @fields = split(':', $sentence); + push(@$originals, $fields[1]); my @token = split(';', $fields[0]); - push(@$tokens,$token[0]); + push(@$tokens, $token[0]); } if (scalar(@$originals) == 0) { print "NONE\n"; exit 0; } - + #print join(';',@licSentNames)."\n"; - + close INPUTFILE; } -sub Match_License -{ - +sub Match_License { # create a string with the sentences - - for (my $j=0;$j<=$#rulelist;$j++){ - - my $rule=$rulelist[$j][1]; - my $rulename=$rulelist[$j][0]; + + for (my $j = 0; $j <= $#rulelist; $j++) { + my $rule = $rulelist[$j][1]; + my $rulename = $rulelist[$j][0]; my $lenRule = scalar(split(',', $rule)); # replace rule with the length of the rule - print "To try [$rulename][$rule] on [$senttok]\n" if $debug; - while ($senttok =~ s/,${rule},/,$lenRule,/){ - $countMatches ++; - push (@result,$rulename); + print "To try [$rulename][$rule] on [$senttok]\n" if $debug; + while ($senttok =~ s/,${rule},/,$lenRule,/) { + $countMatches++; + push (@result, $rulename); # print ">>>>$senttok|$rulelist[$j][1]\n"; # print 'Result: ', join(',', @result); # print "\n"; } } - + # print ">>>>[$senttok]\n"; my $onlyAllRight = 0; @@ -333,7 +306,7 @@ sub Match_License #print STDERR "Ending>>>>>>>$senttok\n"; #print STDERR 'Size>>' , scalar(@result), "\n"; #print STDERR 'Result>>', join(',', @result), "\n"; - + # let us remove allrights # my $onlyAllRight = 1; # for my $i (0.. scalar(@licSentNames)-1){ @@ -345,11 +318,10 @@ sub Match_License # } # output result - if (scalar(@result) > 0){ + if (scalar(@result) > 0) { # at this point we have matched - - - # let us clean up the rules... let us print the matched rules, and the + + # let us clean up the rules... let us print the matched rules, and the # if (grep(/GPL/, @result)) { # print "GPL...\n"; # foreach my $r ($NonCriticalRules{GPL}) { @@ -357,7 +329,6 @@ sub Match_License # } # } # general removal of rules - foreach my $r (@generalNonCritical) { while ($senttok =~ s/,$r,/,-1,/) { @@ -365,7 +336,7 @@ sub Match_License } } # print "[$senttok]\n"; - + foreach my $res (@result) { my $temp = $NonCriticalRules{$res}; foreach my $r (@$temp) { @@ -375,13 +346,11 @@ sub Match_License } } } -# print "[$senttok]\n"; +# print "[$senttok]\n"; } } - -sub Print_Result -{ +sub Print_Result { # $senttok =~ s/AllRights(,?)/$1/g; # $senttok =~ s/UNKNOWN,/,/g; # $senttok =~ s/,+/,/g; @@ -392,7 +361,7 @@ sub Print_Result my @sections = split(',', $senttok); die 'assertion 1' if $sections[0] ne ''; die 'assertion 2' if $sections[scalar(@sections)] ne ''; - + my $ignoredLines = 0; my $licenseLines = 0; my $unknownLines = 0; @@ -404,20 +373,20 @@ sub Print_Result } elsif ($sections[$i] != 0) { $licenseLines += $sections[$i]; } elsif ($sections[$i] eq 'UNKNOWN') { - $unknownLines ++; + $unknownLines++; } else { $unmatchedLines++; } } $senttok =~ s/^,(.*),$/$1/; - + # print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n"; if (scalar (@result) == 0) { - print 'UNKNOWN'; + print 'UNKNOWN'; } else { - print join(',',@result); + print join(',',@result); } print ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n"; $senttok = $save; - } + @@ -42,15 +42,13 @@ Usage $0 -fCtTvcgsGd <filename> -L force creation of matching - -d delete intermediate files + -d delete intermediate files \n"; exit 1; } - - my $verbose = exists $opts{v}; my $delete = exists $opts{d}; #$delete = 1; @@ -62,7 +60,6 @@ if ($path eq "") { $path = "./"; } - my $force = exists $opts{f}; my $forceGood = exists $opts{G}; my $forceSentences = exists $opts{S}; @@ -76,8 +73,6 @@ my $f = $ARGV[0]; my $original = $f; - - print "Starting: $original;\n" if ($verbose); print "$original;"; @@ -92,28 +87,22 @@ if (not (-f "$f")) { exit 0; } - -Do_File_Process($original, $commentsFile, ($force or $forceComments), +Do_File_Process($original, $commentsFile, ($force or $forceComments), "$path/extComments/extComments.pl -c1 '${original}'", "Creating comments file", exists $opts{c}); - -Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences), +Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences), "$path/splitter/splitter.pl '${commentsFile}'", - "Splitting sentences", exists $opts{s} - ); + "Splitting sentences", exists $opts{s}); -Do_File_Process( $sentencesFile, $goodsentFile, ($force or $forceGood), +Do_File_Process($sentencesFile, $goodsentFile, ($force or $forceGood), "$path/filter/filter.pl '${sentencesFile}'", - "Filtering good sentences", exists $opts{s} - ); + "Filtering good sentences", exists $opts{s}); -Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok), +Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok), "$path/senttok/senttok.pl '${goodsentFile}' > '${sentokFile}'", - "Matching sentences against rules", exists $opts{t} - ); - + "Matching sentences against rules", exists $opts{t}); print "Matching ${f}.senttok against rules" if ($verbose); execute("$path/matcher/matcher.pl '${f}.senttok' > '${f}.license'"); @@ -132,18 +121,15 @@ if ($delete) { exit 0; - - -sub Do_File_Process -{ +sub Do_File_Process { my ($input, $output, $force, $cmd, $message, $end) = @_; print "${message}:" if ($verbose); - if ($force or newer($input, $output)) { + if ($force or newer($input, $output)) { print "Running ${cmd}:" if ($verbose); execute($cmd); } else { - print "File [$output] newer than input [$input], not creating:" if ($verbose); + print "File [$output] newer than input [$input], not creating:" if ($verbose); } if ($end) { print "Existing after $message" if $verbose; @@ -152,11 +138,7 @@ sub Do_File_Process } } - - - -sub execute -{ +sub execute { my ($c) = @_; # print "\nTo execute [$c]\n"; my $r = `$c`; @@ -165,8 +147,7 @@ sub execute return $r; } -sub newer -{ +sub newer { my ($f1, $f2) = @_; my ($f1write) = (stat($f1))[9]; my ($f2write) = (stat($f2))[9]; diff --git a/senttok/senttok.pl b/senttok/senttok.pl index 8580324..e960e2e 100755 --- a/senttok/senttok.pl +++ b/senttok/senttok.pl @@ -15,8 +15,8 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # -use strict; +use strict; my $TOO_LONG = 70; @@ -32,7 +32,7 @@ open FH, "<$ARGV[0]"; my @licensesentencelist=(); open LICENSESENTENCEFILE, "<$licSentences"; my $line; -while ($line = <LICENSESENTENCEFILE>){ +while ($line = <LICENSESENTENCEFILE>) { chomp $line; next if $line =~ /^\#/; next if $line =~ /^ *$/; @@ -44,7 +44,7 @@ while ($line = <LICENSESENTENCEFILE>){ # print $line; #} close LICENSESENTENCEFILE; -while ($line = <>){ +while ($line = <>) { my $saveLine; my $originalLine; chomp $line; @@ -56,14 +56,13 @@ while ($line = <>){ $line = Normalize_Sentence($line); - - my $check=0; - my $matchname="UNKNOWN"; - my @parm=(); + my $check = 0; + my $matchname = "UNKNOWN"; + my @parm = (); my $sentence; - my $distance=1; #maximum? number - my $mostsimilarname="UNKNOWN"; - my $before; + my $distance = 1; #maximum? number + my $mostsimilarname = "UNKNOWN"; + my $before; my $after; my $gpl = 0; my ($gplLater, $gplVersion); @@ -74,8 +73,7 @@ while ($line = <>){ # [$line] #\n"; - - my $lineAsGPL =''; + my $lineAsGPL = ''; if (Looks_Like_GPL($line)) { my $old = $line; @@ -94,23 +92,23 @@ while ($line = <>){ $gpl = $saveGPL; $LGPL = ""; again: -# print "Testing +# print "Testing # lin[$line] # ori[$saveLine] # re [$regexp] # lpg[$LGPL] #\n"; - if ( $line =~ /$regexp/im ){ - $before = $`; + if ($line =~ /$regexp/im) { + $before = $`; $after = $'; #'; - $check=1; - $matchname=$name; - for (my $i = 1; $i <= $number; $i++){ + $check = 1; + $matchname = $name; + for (my $i = 1; $i <= $number; $i++) { no strict 'refs'; - push @parm,$$i; + push @parm, $$i; } last; - } else{ + } else { # print "NO MATCH\n"; # let us try again in cas it is lesser/library # do it only once @@ -124,45 +122,39 @@ while ($line = <>){ goto again; } next;## dmg - my $targetset=$regexp; + my $targetset = $regexp; $targetset =~ s/^(.*)$/$1/; - my $tmpdist=&levenshtein($line,$targetset)/max(length($targetset),length($sentence)); - if ($tmpdist<$distance){ - $mostsimilarname=$name; - $distance=$tmpdist; + my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence)); + if ($tmpdist < $distance) { + $mostsimilarname = $name; + $distance = $tmpdist; } } last; ### } - if ($check == 1){ - #licensesentence name, parm1, parm2,.. + if ($check == 1) { + # licensesentence name, parm1, parm2,.. if ($gpl) { $matchname .= "Ver" . $gplVersion; $matchname .= "+" if $gplLater; $matchname = $LGPL . $matchname; - } else { } - if (length($before)>$TOO_LONG || - length($after) >$TOO_LONG) { + if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) { $matchname .= "-TOOLONG"; } - my $parmstrings=join(";",$matchname, $subRule, $before, $after, @parm); - print $parmstrings,":$originalLine\n"; - - - }else{ - #UNKNOWN, sentence + my $parmstrings = join(";",$matchname, $subRule, $before, $after, @parm); + print $parmstrings, ":$originalLine\n"; + } else { + # UNKNOWN, sentence chomp $line; - print $matchname,";",0, ";", $mostsimilarname,";",$distance,";",$saveLine,":$originalLine\n"; - } - + print $matchname, ";", 0, ";", $mostsimilarname, ";", $distance, ";", $saveLine, ":$originalLine\n"; + } } close FH; exit 0; -sub Normalize_GPL -{ +sub Normalize_GPL { my ($line) = @_; my $later = 0; my $version = 0; @@ -202,7 +194,6 @@ sub Normalize_GPL $line =~ s/GPL \(GPL\)/GPL/gi; $line =~ s/GPL \(<QUOTES>GPL<QUOTES>\)/GPL/gi; - $line =~ s/GNU //gi; $line =~ s/under GPL/under the GPL/gi; $line =~ s/under Lesser/under the Lesser/gi; @@ -231,8 +222,7 @@ sub Normalize_GPL return ($line,$later,$version); } -sub Looks_Like_GPL -{ +sub Looks_Like_GPL { my ($line) = @_; return 1 if $line =~ /GNU/; @@ -242,18 +232,16 @@ sub Looks_Like_GPL return 0; } - -sub Normalize_Sentence -{ +sub Normalize_Sentence { my ($line) = @_; # do some very quick spelling corrections for english/british words - $line=~ s/icence/icense/ig; - $line=~ s/(\.|;)$//; + $line =~ s/icence/icense/ig; + $line =~ s/(\.|;)$//; return $line; } -# Return the Levenshtein distance (also called Edit distance) +# Return the Levenshtein distance (also called Edit distance) # between two strings # # The Levenshtein distance (LD) is a measure of similarity between two @@ -272,22 +260,21 @@ sub Normalize_Sentence # The distance is named after the Russian scientist Vladimir # Levenshtein, who devised the algorithm in 1965 # -sub levenshtein - { +sub levenshtein { # $s1 and $s2 are the two strings # $len1 and $len2 are their respective lengths # my ($s1, $s2) = @_; my ($len1, $len2) = (length $s1, length $s2); - + # If one of the strings is empty, the distance is the length # of the other string # return $len2 if ($len1 == 0); return $len1 if ($len2 == 0); - + my %mat; - + # Init the distance matrix # # The first row to 0..$len1 @@ -297,35 +284,31 @@ sub levenshtein # The first row and column are initialized so to denote distance # from the empty string # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { + for (my $i = 0; $i <= $len1; ++$i) { + for (my $j = 0; $j <= $len2; ++$j) { $mat{$i}{$j} = 0; $mat{0}{$j} = $j; - } - + } + $mat{$i}{0} = $i; } - + # Some char-by-char processing is ahead, so prepare # array of chars from the strings # my @ar1 = split(//, $s1); my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { + + for (my $i = 1; $i <= $len1; ++$i) { + for (my $j = 1; $j <= $len2; ++$j) { # Set the cost to 1 iff the ith char of $s1 # equals the jth of $s2 - # + # # Denotes a substitution cost. When the char are equal # there is no need to substitute, so the cost is 0 # my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - + # Cell $mat{$i}{$j} equals the minimum of: # # - The cell immediately above plus 1 @@ -338,36 +321,30 @@ sub levenshtein $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, $mat{$i}{$j-1} + 1, $mat{$i-1}{$j-1} + $cost]); - } - } - + } + } + # Finally, the Levenshtein distance equals the rightmost bottom cell # of the matrix # # Note that $mat{$x}{$y} denotes the distance between the substrings # 1..$x and 1..$y - # return $mat{$len1}{$len2}; - } - - - # minimal element of a list - # - sub min - { - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; - } - - sub max{ - my @list = @_; - return $list[0]>$list[1]?$list[0]:$list[1]; +} + +sub min { + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) { + $min = $i if ($i < $min); } - + + return $min; +} + +sub max { + my @list = @_; + return $list[0] > $list[1] ? $list[0] : $list[1]; +} + diff --git a/splitter/splitter.pl b/splitter/splitter.pl index 55a9d36..69039ad 100755 --- a/splitter/splitter.pl +++ b/splitter/splitter.pl @@ -1,8 +1,8 @@ #!/usr/bin/env perl -# +# #*************************************************************************************************** -# SENTENCE SPLITTER -# Author: Paul Clough {cloughie@dcs.shef.ac.uk} +# SENTENCE SPLITTER +# Author: Paul Clough {cloughie@dcs.shef.ac.uk} # # This program is originally based on the sentence splitter program @@ -26,15 +26,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this patch. If not, see <http://www.gnu.org/licenses/>. - - use strict; # This program is originally based on the sentence splitter program # published by Paul Clough. Version 1.0, but then it was mostly rewritten # His ideas, however, linger in here (and his dictionary of abbreviations) - my $dictionary = 'splitter.dict'; my $abbrv_file = 'splitter.abv'; my $len = 0; @@ -53,12 +50,10 @@ $abbrv_file = $path . $abbrv_file; die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/; - -die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/; +die "Input file name should end in '.comments' [$output_file]" unless $output_file =~ s/\.comments$/.sentences/; open(OUT, ">$output_file") or die("Unable to create output file [$output_file]"); - #print length($opt_o); # Load in the dictionary and find the common words. @@ -70,15 +65,15 @@ open(OUT, ">$output_file") or die("Unable to create output file [$output_file]") &loadAbbreviations; my $text; -# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n"; +# open(FILE, $opt_f) or die "Can't open $opt_f for reading\n"; my $line; while (defined ($line = <>)) { - $text.= $line; + $text .= $line; } # append a newline just in case -$text.="\n"; +$text .= "\n"; # - is used to create lines # = is used to create lines @@ -96,11 +91,9 @@ $text =~ s@^[ \t]*/\*@@gmx; $text =~ s/\*\/[ \t]*$//gmx; $text =~ s@([^:])// @$1@gmx; - # Replace /\r\n/ with \n only $text =~ s/\r\n/\n/g; - # now, try to replace the leading/ending character of each line #/-, at most 3 heading characters # and each repeated as many times as necessaary $text =~ s/^[ \t]{0,3}[\*\#\/\;]+//gmx; @@ -128,7 +121,7 @@ $text .= "\n"; # this gets us in big trouble... licenses that have numeric abbreviations $text =~ s/v\.\s+2\.0/v<dot> 2<dot>0/g; -while ($text =~ /^([^\n]*)\n/gsm ) { +while ($text =~ /^([^\n]*)\n/gsm) { my $curr = $1; # print "<<$curr\n<<\n"; @@ -145,7 +138,7 @@ while ($text =~ /^([^\n]*)\n/gsm ) { foreach my $s (@sentences) { for my $i (0..length($s)-1) { - my $c = substr($s,$i,1); + my $c = substr($s, $i, 1); $count2++ if ($c ge 'A' && $c le 'z'); } print OUT Clean_Sentence($s) , "\n"; @@ -164,17 +157,11 @@ close OUT; exit; - - - -#*************************************************************************************************** - #*************************************************************************************************** # procedures #*************************************************************************************************** -sub Clean_Sentence -{ +sub Clean_Sentence { ($_) = @_; # check for trailing bullets of different types @@ -198,12 +185,9 @@ sub Clean_Sentence die if /\n/m; return $_; - } - -sub Split_Text -{ +sub Split_Text { my ($text) = @_; my $len = 0; my $next_word; @@ -213,7 +197,7 @@ sub Split_Text my @result; my $after; my $currentSentence = ''; - # this breaks the sentence into + # this breaks the sentence into # 1. Any text before a separator # 2. The separator [.!?:\n] # 3. @@ -222,19 +206,19 @@ sub Split_Text ([\.\!\?\:\n]) (?=(.?)) /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) { - $text = $'; #'; + $text = $'; #'; my $sentenceMatch = $1; - my $sentence = $1 . $2; - my $punctuation = $2; + my $sentence = $1 . $2; + my $punctuation = $2; $after = $3; - + # if next character is not a space, then we are not in a sentence" if ($after ne ' ' && $after ne "\t") { $currentSentence .= $sentence; next; } #at this point we know that there is a space after - if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') { + if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') { # let us consider this right here a beginning of a sentence push @result, $currentSentence . $sentence; $currentSentence = ''; @@ -248,46 +232,41 @@ sub Split_Text # simple heuristic... let us check that the next words are not the beginning of a sentence # in our library # ENDTODO - + # is the last word an abbreviation? For this the period has to follow the word # this expression might have to be updated to take care of special characters in names :( if ($sentenceMatch =~ /(.?)([^[:punct:]\s]+)$/) { my $before = $1; my $lastWord = $2; #is it an abbreviation - + if (length($lastWord) == 1 ) { # single character abbreviations are special... - # we will assume they never split the sentence if they are capitalized. - if (($lastWord ge 'A') and - ($lastWord le 'Z')) { + # we will assume they never split the sentence if they are capitalized. + if (($lastWord ge 'A') and ($lastWord le 'Z')) { $currentSentence .= $sentence; next; } print "last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n"; # but some are lowercase! - if (($lastWord eq 'e') or - ($lastWord eq 'i')) { + if (($lastWord eq 'e') or ($lastWord eq 'i')) { $currentSentence .= $sentence; next; } print "2 last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n"; } else { - $lastWord = lc $lastWord; - + # only accept abbreviations if the previous char to the abbrev is space or # is empty (beginning of line). This avoids things like .c if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$lastWord}) { - $currentSentence .= $sentence; next; } else { # just keep going, we handle this case below } } - } push @result, $currentSentence . $sentence; @@ -297,51 +276,40 @@ sub Split_Text die 'We have not dealt with this case'; } push @result, $currentSentence . $text; - + #Print_Non_Sentence($text,"\n",''); return @result; - } sub loadDictionary { - - # Initialise var - my $common_term = ''; + my $common_term = ''; if (open(DICT, $dictionary)) { - while (defined ($line = <DICT>)) { chomp($line); if ($line !~ /^[A-Z]/) { $COMMON_TERMS{$line} = 1; } - - } - + } + close(DICT); } else { - die "cannot open dictionary file $dictionary: $!"; + die "cannot open dictionary file $dictionary: $!"; } } -sub loadAbbreviations -{ - - # Initialise var - my $abbrv_term = ''; - +sub loadAbbreviations { + my $abbrv_term = ''; + if (open(ABBRV, $abbrv_file)) { - while (defined ($line = <ABBRV>)) { chomp($line); - $ABBREVIATIONS{$line} = $line; - } - + $ABBREVIATIONS{$line} = $line; + } + close(ABBRV); } else { - die "cannot open dictionary file $abbrv_file: $!"; + die "cannot open dictionary file $abbrv_file: $!"; } } - -#*************************************************************************************************** |