diff options
author | René Scheibe <rene.scheibe@gmail.com> | 2014-09-03 17:17:42 +0200 |
---|---|---|
committer | René Scheibe <rene.scheibe@gmail.com> | 2014-09-03 17:17:42 +0200 |
commit | 949dc6984decd81e51e264af34ee7cbc117e5e4a (patch) | |
tree | 4d9ce2e16048c37f48cca352746edbd6c295a4f4 | |
parent | 8b20ac6d9016be8d5e66d791a6d17cfd0429781f (diff) | |
download | ninka-949dc6984decd81e51e264af34ee7cbc117e5e4a.tar.gz |
consistent naming for constants, variables and methods
* using 'snake' case
-rwxr-xr-x | extComments/extComments.pl | 48 | ||||
-rwxr-xr-x | extComments/hashComments.pl | 36 | ||||
-rwxr-xr-x | filter/filter.pl | 40 | ||||
-rwxr-xr-x | matcher/matcher.pl | 290 | ||||
-rwxr-xr-x | ninka.pl | 80 | ||||
-rwxr-xr-x | senttok/senttok.pl | 76 | ||||
-rwxr-xr-x | splitter/splitter.pl | 90 |
7 files changed, 329 insertions, 331 deletions
diff --git a/extComments/extComments.pl b/extComments/extComments.pl index 1a4caec..d7a65d5 100755 --- a/extComments/extComments.pl +++ b/extComments/extComments.pl @@ -39,55 +39,55 @@ if (!getopts ('vc:p:',\%opts)) { die; } -my $f = $ARGV[0]; +my $file = $ARGV[0]; -#die "illegal file [$f]" if $f =~ m@/\.@; +#die "illegal file [$file]" if $file =~ m@/\.@; -my $numberComments = 1; -$numberComments = $opts{c} if exists $opts{c}; +my $number_comments = 1; +$number_comments = $opts{c} if exists $opts{c}; my $verbose = 1; $verbose = exists $opts{v}; -if (get_size($f) == 0) { +if (get_size($file) == 0) { print STDERR "Empty file, just exit\n" if $verbose; exit 0; # nothing to report, just end } -my $commentsCmd = Determine_Comments_Extractor($f); +my $comments_cmd = determine_comments_extractor($file); -execute("$commentsCmd"); +execute("$comments_cmd"); -if ($commentsCmd =~ /^comments/ and get_size("${f}.comments") == 0) { - `cat '$f' | head -700 > ${f}.comments`; +if ($comments_cmd =~ /^comments/ and get_size("${file}.comments") == 0) { + `cat '$file' | head -700 > ${file}.comments`; } exit 0; -sub Determine_Comments_Extractor { - my ($f) = @_; - if ($f =~ /\.([^\.]+)$/) { +sub determine_comments_extractor { + my ($file) = @_; + if ($file =~ /\.([^\.]+)$/) { my $ext= $1; if ($ext =~ /^(pl|pm|py)$/) { # for the time being, let us just extract the top 400 lines - return "cat '$f' | head -400 > '${f}.comments'"; -# return "$path/hashComments.pl -p '#' '$f'"; + return "cat '$file' | head -400 > '${file}.comments'"; +# return "$path/hashComments.pl -p '#' '$file'"; } elsif ($ext eq 'jl' or $ext eq 'el') { - return "cat '$f' | head -400 > '${f}.comments'"; -# return "$path/hashComments.pl -p ';' '$f'";; + return "cat '$file' | head -400 > '${file}.comments'"; +# return "$path/hashComments.pl -p ';' '$file'";; } elsif ($ext =~ /^(java|c|cpp|h|cxx|c\+\+|cc)$/ ) { - my $comm = `which comments`; - if ($comm ne '') { - return "comments -c1 '$f' 2> /dev/null"; + my $comments_cmd_location = `which comments`; + if ($comments_cmd_location ne '') { + return "comments -c1 '$file' 2> /dev/null"; } else { - return "cat '$f' | head -400 > '${f}.comments'"; + return "cat '$file' | head -400 > '${file}.comments'"; } } else { - return "cat '$f' | head -700 > '${f}.comments'"; + return "cat '$file' | head -700 > '${file}.comments'"; } } else { print "\n>>>>>>>>>>>>>>>>>>>>>\n"; - return "cat '$f' | head -700 > '${f}.comments'"; + return "cat '$file' | head -700 > '${file}.comments'"; } } @@ -100,8 +100,8 @@ sub execute { } sub get_size { - my ($f) = @_; - my $size = (stat($f))[7]; + my ($file) = @_; + my $size = (stat($file))[7]; return $size; } diff --git a/extComments/hashComments.pl b/extComments/hashComments.pl index baa9937..a32e283 100755 --- a/extComments/hashComments.pl +++ b/extComments/hashComments.pl @@ -34,50 +34,50 @@ if (!getopts ('vc:p:',\%opts)) { die; } -my $f = $ARGV[0]; +my $file = $ARGV[0]; -open (OUT, ">${f}.comments") or die "Unable to create [${f}.comments]"; +open (OUT, ">${file}.comments") or die "Unable to create [${file}.comments]"; <>; print OUT unless /^\#\!/; -my $commentChar = '#'; +my $comment_char = '#'; -$commentChar = $opts{p} if exists $opts{p}; +$comment_char = $opts{p} if exists $opts{p}; -my $numberComments = 1; -$numberComments = $opts{c} if exists $opts{c}; +my $comments_count = 1; +$comments_count = $opts{c} if exists $opts{c}; my $verbose = exists $opts{v}; -my $insideComment = 0; -my $insideCode = 0; +my $inside_comment = 0; +my $inside_code = 0; -my $comCount = 0; -my $countCode = 0; +my $comment_count = 0; +my $code_count = 0; while (<>) { chomp; - if (Is_Comment($_)) { + if (is_comment($_)) { s/\t/ /g; s/ +/ /g; - $comCount++ if (not $insideComment); - $insideComment = 1; - /$commentChar+/; + $comment_count++ if (not $inside_comment); + $inside_comment = 1; + /$comment_char+/; print OUT $' . "\n"; #' - } elsif (Is_Blank($_)) { + } elsif (is_blank($_)) { print OUT "\n"; } else { exit 0; } } -sub Is_Comment { +sub is_comment { my ($st) = @_; - return ($st =~ /^\s*$commentChar/); + return ($st =~ /^\s*$comment_char/); } -sub Is_Blank { +sub is_blank { my ($st) = @_; return ($st =~ /^\s*$/); } diff --git a/filter/filter.pl b/filter/filter.pl index 84269c6..7b0fafd 100755 --- a/filter/filter.pl +++ b/filter/filter.pl @@ -36,33 +36,33 @@ $path =~ s/[^\/]+$//; if ($path eq '') { $path = './'; } -my $critWords = $path . 'criticalword.dict'; +my $file_critical_words = $path . 'criticalword.dict'; die "Usagee $0 <filename>.sentences" unless $ARGV[0] =~ /\.sentences$/; -my $goodfilename = $ARGV[0]; +my $file_good = $ARGV[0]; -die "Filename should end in '.sentences' [$goodfilename]" unless $goodfilename =~ s/\.sentences$/\.goodsent/; -my $badfilename = $ARGV[0]; -$badfilename =~ s/\.sentences$/\.badsent/; +die "Filename should end in '.sentences' [$file_good]" unless $file_good =~ s/\.sentences$/\.goodsent/; +my $file_bad = $ARGV[0]; +$file_bad =~ s/\.sentences$/\.badsent/; -#print $goodfilename; -#print $badfilename; +#print $file_good; +#print $file_bad; open (INPUTFILE, "<$ARGV[0]") or die ("Error: $ARGV[0] is not found."); -open (DICTIONARY, "<$critWords") or die ('Error: criticalword.dict is not found.'); +open (DICTIONARY, "<$file_critical_words") or die ('Error: criticalword.dict is not found.'); -open (GOODOUT, ">$goodfilename") || die ('Error'); -open (BADOUT, ">$badfilename") || die ('Error'); +open (GOODOUT, ">$file_good") || die ('Error'); +open (BADOUT, ">$file_bad") || die ('Error'); -my @cwordlist = (); +my @critical_words = (); # read dictionary into list -my $cword; -while ($cword = <DICTIONARY>) { - chomp $cword; - next if $cword =~ /^\#/; - $cword =~ s/\#.*$//; # remove everything to the end of file - push(@cwordlist, "$cword"); +my $critical_word; +while ($critical_word = <DICTIONARY>) { + chomp $critical_word; + next if $critical_word =~ /^\#/; + $critical_word =~ s/\#.*$//; # remove everything to the end of file + push(@critical_words, "$critical_word"); } close(DICTIONARY); @@ -71,10 +71,10 @@ my $sentence; while ($sentence = <INPUTFILE>) { my $check = 0; chomp $sentence; - foreach $cword (@cwordlist) { - if ($sentence =~ /\b$cword\b/i) { + foreach $critical_word (@critical_words) { + if ($sentence =~ /\b$critical_word\b/i) { $check = 1; - #print "$cword:$sentence"; + #print "$critical_word:$sentence"; last; } } diff --git a/matcher/matcher.pl b/matcher/matcher.pl index e805b6c..9d1e74b 100755 --- a/matcher/matcher.pl +++ b/matcher/matcher.pl @@ -30,84 +30,84 @@ use strict; my $debug = 0; -my %NonCriticalRules ; +my %NON_CRITICAL_RULES = (); # these should go into a file, but for the time being, let us keep them here # once we have matched a rule, these are not that important -my @generalNonCritical = ('AllRights'); - -my @gplNonCritical = ('GPLnoVersion', - 'FSFwarranty', - 'LibraryGPLcopyVer0', - 'GPLseeVer0', - 'GPLwrite', - 'SeeFile', - 'FreeSoftware', - 'FSFwarrantyVer0', - 'LibraryGPLseeDetailsVer0', - 'FSFwarranty', - 'LesserGPLseeDetailsVer0', - 'GPLcopyVer0', - 'GNUurl', - 'GPLseeDetailsVer0'); - -$NonCriticalRules{'LibraryGPLv3+'} = [@gplNonCritical]; -$NonCriticalRules{'LibraryGPLv3'} = [@gplNonCritical]; -$NonCriticalRules{'LibraryGPLv2+'} = [@gplNonCritical]; -$NonCriticalRules{'LibraryGPLv2'} = [@gplNonCritical]; -$NonCriticalRules{'LesserGPLv3'} = [@gplNonCritical, 'LesserGPLseeVer3','LesserGPLcopyVer3','SeeFileVer3']; -$NonCriticalRules{'LesserGPLv2.1+'} = [@gplNonCritical]; -$NonCriticalRules{'LesserGPLv2.1'} = [@gplNonCritical]; -$NonCriticalRules{'LGPLv2orv3'}= [@gplNonCritical]; -$NonCriticalRules{'LesserGPLv2'} = [@gplNonCritical]; -$NonCriticalRules{'LesserGPLv2+'} = [@gplNonCritical]; - -$NonCriticalRules{'GPLv2+'} = [@gplNonCritical]; -$NonCriticalRules{'GPLv2'} = [@gplNonCritical]; -$NonCriticalRules{'GPLv1+'} = [@gplNonCritical]; -$NonCriticalRules{'GPLv1'} = [@gplNonCritical]; -$NonCriticalRules{'GPLv3+'} = [@gplNonCritical]; -$NonCriticalRules{'GPLv3'} = [@gplNonCritical]; -$NonCriticalRules{'AGPLv3'} = [@gplNonCritical, 'AGPLreceivedVer0','AGPLseeVer0']; -$NonCriticalRules{'AGPLv3+'} = [@gplNonCritical, 'AGPLreceivedVer0','AGPLseeVer0']; -$NonCriticalRules{'GPLnoVersion'} = [@gplNonCritical]; - -$NonCriticalRules{'Apachev1.1'} = ['ApacheLic1_1']; -$NonCriticalRules{'Apachev2'} = ['ApachePre','ApacheSee']; - -$NonCriticalRules{'LibGCJLic'} = ['LibGCJSee']; -$NonCriticalRules{'CDDLicV1'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; -$NonCriticalRules{'CDDLic'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; - -$NonCriticalRules{'CDDLorGPLv2'}= ['CDDLorGPLv2doNotAlter','AllRights','useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include','CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2']; - -$NonCriticalRules{'CPLv1orGPLv2+orLGPLv2+'} = ['licenseBlockBegin', 'licenseBlockEnd']; - -$NonCriticalRules{'Qt'} = ['Copyright','qtNokiaExtra','QTNokiaContact', 'qtDiaTems']; -$NonCriticalRules{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1','qtLGPLv2.1where']; -$NonCriticalRules{'orGPLv3'} = ['qtReviewGPLVer3.0','qtReviewGPLVer3','qtGPLwhere']; -$NonCriticalRules{'digiaQTExceptionNoticeVer1.1'} = ['qtDigiaExtra']; - -$NonCriticalRules{'MPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NonCriticalRules{'MPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NonCriticalRules{'NPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; -$NonCriticalRules{'NPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; - -$NonCriticalRules{'subversion'} = ['SeeFileSVN','subversionHistory']; -$NonCriticalRules{'subversion+'} = ['SeeFileSVN','subversionHistory']; -$NonCriticalRules{'tmate+'} = ['SeeFileSVN']; - -$NonCriticalRules{'openSSLvar2'} = ['BSDcondAdvPart2']; - -$NonCriticalRules{'MPLv1_1'} = ['licenseBlockBegin','MPLsee','Copyright','licenseBlockEnd','ApacheLicWherePart1','MPLwarranty', 'MPLwarrantyVar']; -$NonCriticalRules{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0','MPL_LGPLseeVer0']; - -$NonCriticalRules{'FreeType'} = ['FreeTypeNotice']; - -$NonCriticalRules{'GPLVer2.1or3KDE+'} = [@gplNonCritical]; -$NonCriticalRules{'LGPLVer2.1or3KDE+'} = [@gplNonCritical]; +my @GENERAL_NON_CRITICAL = ('AllRights'); + +my @GPL_NON_CRITICAL = ('GPLnoVersion', + 'FSFwarranty', + 'LibraryGPLcopyVer0', + 'GPLseeVer0', + 'GPLwrite', + 'SeeFile', + 'FreeSoftware', + 'FSFwarrantyVer0', + 'LibraryGPLseeDetailsVer0', + 'FSFwarranty', + 'LesserGPLseeDetailsVer0', + 'GPLcopyVer0', + 'GNUurl', + 'GPLseeDetailsVer0'); + +$NON_CRITICAL_RULES{'LibraryGPLv3+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LibraryGPLv3'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LibraryGPLv2+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LibraryGPLv2'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LesserGPLv3'} = [@GPL_NON_CRITICAL, 'LesserGPLseeVer3','LesserGPLcopyVer3','SeeFileVer3']; +$NON_CRITICAL_RULES{'LesserGPLv2.1+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LesserGPLv2.1'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LGPLv2orv3'}= [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LesserGPLv2'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LesserGPLv2+'} = [@GPL_NON_CRITICAL]; + +$NON_CRITICAL_RULES{'GPLv2+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'GPLv2'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'GPLv1+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'GPLv1'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'GPLv3+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'GPLv3'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'AGPLv3'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0']; +$NON_CRITICAL_RULES{'AGPLv3+'} = [@GPL_NON_CRITICAL, 'AGPLreceivedVer0','AGPLseeVer0']; +$NON_CRITICAL_RULES{'GPLnoVersion'} = [@GPL_NON_CRITICAL]; + +$NON_CRITICAL_RULES{'Apachev1.1'} = ['ApacheLic1_1']; +$NON_CRITICAL_RULES{'Apachev2'} = ['ApachePre','ApacheSee']; + +$NON_CRITICAL_RULES{'LibGCJLic'} = ['LibGCJSee']; +$NON_CRITICAL_RULES{'CDDLicV1'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; +$NON_CRITICAL_RULES{'CDDLic'} = ['Compliance','CDDLicWhere','ApachesPermLim','CDDLicIncludeFile','UseSubjectToTerm', 'useOnlyInCompliance']; + +$NON_CRITICAL_RULES{'CDDLorGPLv2'}= ['CDDLorGPLv2doNotAlter','AllRights','useOnlyInCompliance', 'CDDLorGPLv2whereVer0', 'ApachesPermLim', 'CDDLorGPLv2include','CDDLorGPLv2IfApplicable', 'CDDLorGPLv2Portions', 'CDDLorGPLv2ifYouWishVer2', 'CDDLorGPLv2IfYouAddVer2']; + +$NON_CRITICAL_RULES{'CPLv1orGPLv2+orLGPLv2+'} = ['licenseBlockBegin', 'licenseBlockEnd']; + +$NON_CRITICAL_RULES{'Qt'} = ['Copyright','qtNokiaExtra','QTNokiaContact', 'qtDiaTems']; +$NON_CRITICAL_RULES{'orLGPLVer2.1'} = ['LesserqtReviewGPLVer2.1','qtLGPLv2.1where']; +$NON_CRITICAL_RULES{'orGPLv3'} = ['qtReviewGPLVer3.0','qtReviewGPLVer3','qtGPLwhere']; +$NON_CRITICAL_RULES{'digiaQTExceptionNoticeVer1.1'} = ['qtDigiaExtra']; + +$NON_CRITICAL_RULES{'MPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; +$NON_CRITICAL_RULES{'MPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; +$NON_CRITICAL_RULES{'NPLv1_1'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; +$NON_CRITICAL_RULES{'NPLv1_0'} = ['ApacheLicWherePart1','MPLwarranty','MPLSee']; + +$NON_CRITICAL_RULES{'subversion'} = ['SeeFileSVN','subversionHistory']; +$NON_CRITICAL_RULES{'subversion+'} = ['SeeFileSVN','subversionHistory']; +$NON_CRITICAL_RULES{'tmate+'} = ['SeeFileSVN']; + +$NON_CRITICAL_RULES{'openSSLvar2'} = ['BSDcondAdvPart2']; + +$NON_CRITICAL_RULES{'MPLv1_1'} = ['licenseBlockBegin','MPLsee','Copyright','licenseBlockEnd','ApacheLicWherePart1','MPLwarranty', 'MPLwarrantyVar']; +$NON_CRITICAL_RULES{'MPL1_1andLGPLv2_1'} = ['MPLoptionIfNotDelete2licsVer0','MPL_LGPLseeVer0']; + +$NON_CRITICAL_RULES{'FreeType'} = ['FreeTypeNotice']; + +$NON_CRITICAL_RULES{'GPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL]; +$NON_CRITICAL_RULES{'LGPLVer2.1or3KDE+'} = [@GPL_NON_CRITICAL]; # initialize @@ -117,23 +117,23 @@ if ($path eq '') { $path = './'; } -my $rules= $path . 'rules.dict'; -my $interrules= $path . 'interrules.dict'; +my $rules_file = $path . 'rules.dict'; +my $interrules_file = $path . 'interrules.dict'; die "Usage $0 <filename>.senttok" unless $ARGV[0] =~ /\.senttok$/; -my $countUnknowns = 0; +my $count_unknowns = 0; # read the rules -my @rulelist = Read_Rules($rules); -my @interRuleList = Read_Inter_Rules($interrules); +my @rules = read_rules($rules_file); +my @inter_rules = read_inter_rules($interrules_file); -my @licSentNames = (); +my @license_sentence_names = (); my @original; -Read_Original($ARGV[0], \@licSentNames, \@original); +read_original($ARGV[0], \@license_sentence_names, \@original); -#foreach my $x (@licSentNames) { +#foreach my $x (@license_sentence_names) { # print "$x\n"; #} #exit; @@ -145,77 +145,77 @@ Read_Original($ARGV[0], \@licSentNames, \@original); ########################################## -#for my $ref( @interRuleList ){ +#for my $ref( @inter_rules ){ # print "@$ref\n"; #} # matching spdx requires to match strict licenses, with no alternatives... -my $senttok = ',' . join(',', @licSentNames) . ','; +my $senttok = ',' . join(',', @license_sentence_names) . ','; my @result = (); -my $countMatches = 0; +my $count_matches = 0; print "[$senttok]\n" if $debug; -Match_License(); +match_license(); # do we have to check again? ## todo, verifythat we have unmatched sentences... -@licSentNames = split(',', $senttok); +@license_sentence_names = split(',', $senttok); # first remove the extrict part from it -#Print_Result(); +#print_result(); my $match = 0; -for (my $i = 0; $i <= $#licSentNames; $i++) { - if ($licSentNames[$i] == 0 and - ($licSentNames[$i] ne 'UNKNOWN' and - $licSentNames[$i] ne '')) { -# print "[$licSentNames[$i]]\n"; - $licSentNames[$i] =~ s/Extrict$//; +for (my $i = 0; $i <= $#license_sentence_names; $i++) { + if ($license_sentence_names[$i] == 0 and + ($license_sentence_names[$i] ne 'UNKNOWN' and + $license_sentence_names[$i] ne '')) { +# print "[$license_sentence_names[$i]]\n"; + $license_sentence_names[$i] =~ s/Extrict$//; $match++; } } -#Print_Result(); +#print_result(); if ($match > 0) { # print "REDO\n"; - for (my $i = 0; $i <= $#interRuleList; $i++) { - #for my $ref( @interRuleList[$i]){ + for (my $i = 0; $i <= $#inter_rules; $i++) { + #for my $ref( @inter_rules[$i]){ # print "@$ref\n"; #} - #print $interRuleList[$i][0]; - @licSentNames = map { $_ eq $interRuleList[$i][0] ? $interRuleList[$i][1] : $_ } @licSentNames; + #print $inter_rules[$i][0]; + @license_sentence_names = map { $_ eq $inter_rules[$i][0] ? $inter_rules[$i][1] : $_ } @license_sentence_names; } - $senttok = join(',', @licSentNames) . ','; + $senttok = join(',', @license_sentence_names) . ','; - Match_License(); + match_license(); } -Print_Result(); +print_result(); exit 0; -#print @licSentNames; -#print join(';',@licSentNames)."\n"; +#print @license_sentence_names; +#print join(';',@license_sentence_names)."\n"; # 3. matching ############################### # we will iterate over rules, matching as many as we can... -sub Is_Unknown { +sub is_unknown { my ($s) = @_; my @f = split (/,/, $s); return $f[0] eq 'UNKNOWN'; } -sub Read_Rules { - my ($rulesF) = @_; - open (RULES, "<$rulesF") or die ('Error: rules.dict is not found.'); +sub read_rules { + my ($file) = @_; + open (RULES, "<$file") or die ('Error: rules.dict is not found.'); my $sentence; my @rules = (); while ($sentence = <RULES>) { @@ -236,11 +236,11 @@ sub Read_Rules { return @rules; } -sub Read_Inter_Rules { - my ($interrules) = @_; +sub read_inter_rules { + my ($file) = @_; - my @list; - open (IRULES, "<$interrules") or die ('Error: interrules.dict is not found.'); + my @inter_rules; + open (IRULES, "<$file") or die ('Error: interrules.dict is not found.'); my $sentence; while ($sentence = <IRULES>) { chomp $sentence; @@ -249,17 +249,17 @@ sub Read_Inter_Rules { next; } foreach my $item (split(/\|/, $2)) { - push (@list, [$item, $1]); + push (@inter_rules, [$item, $1]); } } close IRULES; - return @list; + return @inter_rules; } -sub Read_Original { - my ($inputF, $tokens, $originals) = @_; +sub read_original { + my ($file, $tokens, $originals) = @_; - open (INPUTFILE, $inputF) or die ("Error: $inputF is not found."); + open (INPUTFILE, $file) or die ("Error: $file is not found."); my $sentence; my @original; @@ -275,24 +275,24 @@ sub Read_Original { exit 0; } -#print join(';',@licSentNames)."\n"; +#print join(';',@license_sentence_names)."\n"; close INPUTFILE; } -sub Match_License { +sub match_license { # create a string with the sentences - for (my $j = 0; $j <= $#rulelist; $j++) { - my $rule = $rulelist[$j][1]; - my $rulename = $rulelist[$j][0]; - my $lenRule = scalar(split(',', $rule)); + for (my $j = 0; $j <= $#rules; $j++) { + my $rule = $rules[$j][1]; + my $rulename = $rules[$j][0]; + my $rule_length = scalar(split(',', $rule)); # replace rule with the length of the rule print "To try [$rulename][$rule] on [$senttok]\n" if $debug; - while ($senttok =~ s/,${rule},/,$lenRule,/) { - $countMatches++; + while ($senttok =~ s/,${rule},/,$rule_length,/) { + $count_matches++; push (@result, $rulename); -# print ">>>>$senttok|$rulelist[$j][1]\n"; +# print ">>>>$senttok|$rules[$j][1]\n"; # print 'Result: ', join(',', @result); # print "\n"; } @@ -300,7 +300,7 @@ sub Match_License { # print ">>>>[$senttok]\n"; - my $onlyAllRight = 0; + my $only_all_right = 0; # ok, at this point we have removed all the matched sentences... #print STDERR "Ending>>>>>>>$senttok\n"; @@ -308,12 +308,12 @@ sub Match_License { #print STDERR 'Result>>', join(',', @result), "\n"; # let us remove allrights -# my $onlyAllRight = 1; -# for my $i (0.. scalar(@licSentNames)-1){ -# if (($licSentNames[$i] eq 'AllRights')) { -# $licSentNames[$i] = ''; +# my $only_all_right = 1; +# for my $i (0.. scalar(@license_sentence_names)-1){ +# if (($license_sentence_names[$i] eq 'AllRights')) { +# $license_sentence_names[$i] = ''; # } else { -# $onlyAllRight = 0; +# $only_all_right = 0; # } # } @@ -324,13 +324,13 @@ sub Match_License { # let us clean up the rules... let us print the matched rules, and the # if (grep(/GPL/, @result)) { # print "GPL...\n"; -# foreach my $r ($NonCriticalRules{GPL}) { +# foreach my $r ($NON_CRITICAL_RULES{GPL}) { # $senttok =~ s/(,|^)$r(,|$)/$1$2/g; # } # } # general removal of rules - foreach my $r (@generalNonCritical) { + foreach my $r (@GENERAL_NON_CRITICAL) { while ($senttok =~ s/,$r,/,-1,/) { ; } @@ -338,7 +338,7 @@ sub Match_License { # print "[$senttok]\n"; foreach my $res (@result) { - my $temp = $NonCriticalRules{$res}; + my $temp = $NON_CRITICAL_RULES{$res}; foreach my $r (@$temp) { # print ">>Senttok [$r][$senttok]\n"; while ($senttok =~ s/,$r,/,-1,/g) { @@ -350,7 +350,7 @@ sub Match_License { } } -sub Print_Result { +sub print_result { # $senttok =~ s/AllRights(,?)/$1/g; # $senttok =~ s/UNKNOWN,/,/g; # $senttok =~ s/,+/,/g; @@ -362,31 +362,31 @@ sub Print_Result { die 'assertion 1' if $sections[0] ne ''; die 'assertion 2' if $sections[scalar(@sections)] ne ''; - my $ignoredLines = 0; - my $licenseLines = 0; - my $unknownLines = 0; - my $unmatchedLines = 0; + my $ignored_lines = 0; + my $license_lines = 0; + my $unknown_lines = 0; + my $unmatched_lines = 0; foreach my $i (1..scalar(@sections)-1) { # print "$i;$sections[$i]\n"; if ($sections[$i] < 0) { - $ignoredLines += - $sections[$i]; + $ignored_lines += - $sections[$i]; } elsif ($sections[$i] != 0) { - $licenseLines += $sections[$i]; + $license_lines += $sections[$i]; } elsif ($sections[$i] eq 'UNKNOWN') { - $unknownLines++; + $unknown_lines++; } else { - $unmatchedLines++; + $unmatched_lines++; } } $senttok =~ s/^,(.*),$/$1/; -# print "$ignoredLines > $licenseLines > $unknownLines > $unmatchedLines\n"; +# print "$ignored_lines > $license_lines > $unknown_lines > $unmatched_lines\n"; if (scalar (@result) == 0) { print 'UNKNOWN'; } else { print join(',',@result); } - print ";$countMatches;$licenseLines;$ignoredLines;$unmatchedLines;$unknownLines;$senttok\n"; + print ";$count_matches;$license_lines;$ignored_lines;$unmatched_lines;$unknown_lines;$senttok\n"; $senttok = $save; } @@ -61,71 +61,69 @@ if ($path eq "") { } my $force = exists $opts{f}; -my $forceGood = exists $opts{G}; -my $forceSentences = exists $opts{S}; -my $forceSentok = exists $opts{T}; -my $forceComments = exists $opts{C}; -my $forceLicense = exists $opts{L}; +my $force_good = exists $opts{G}; +my $force_sentences = exists $opts{S}; +my $force_senttok = exists $opts{T}; +my $force_comments = exists $opts{C}; +my $force_license = exists $opts{L}; #die "Usage $0 <filename>" unless $ARGV[0] =~ /\.(c|cpp|java|cc|cxx|h|jl|py|pm|el|pl)$/; -my $f = $ARGV[0]; +my $input_file = $ARGV[0]; -my $original = $f; +print "Starting: $input_file;\n" if ($verbose); -print "Starting: $original;\n" if ($verbose); +print "$input_file;"; -print "$original;"; +my $comments_file = "${input_file}.comments"; +my $sentences_file = "${input_file}.sentences"; +my $goodsent_file = "${input_file}.goodsent"; +my $senttok_file = "${input_file}.senttok"; -my $commentsFile = "${f}.comments"; -my $sentencesFile = "${f}.sentences"; -my $goodsentFile = "${f}.goodsent"; -my $sentokFile = "${f}.senttok"; - -if (not (-f "$f")) { - print "ERROR;[${f}] is not a file\n" ; +if (not (-f "$input_file")) { + print "ERROR;[${input_file}] is not a file\n" ; exit 0; } -Do_File_Process($original, $commentsFile, ($force or $forceComments), - "$path/extComments/extComments.pl -c1 '${original}'", +do_file_process($input_file, $comments_file, ($force or $force_comments), + "$path/extComments/extComments.pl -c1 '${input_file}'", "Creating comments file", exists $opts{c}); -Do_File_Process($commentsFile, $sentencesFile, ($force or $forceSentences), - "$path/splitter/splitter.pl '${commentsFile}'", +do_file_process($comments_file, $sentences_file, ($force or $force_sentences), + "$path/splitter/splitter.pl '${comments_file}'", "Splitting sentences", exists $opts{s}); -Do_File_Process($sentencesFile, $goodsentFile, ($force or $forceGood), - "$path/filter/filter.pl '${sentencesFile}'", +do_file_process($sentences_file, $goodsent_file, ($force or $force_good), + "$path/filter/filter.pl '${sentences_file}'", "Filtering good sentences", exists $opts{s}); -Do_File_Process($goodsentFile, $sentokFile, ($force or $forceSentok), - "$path/senttok/senttok.pl '${goodsentFile}' > '${sentokFile}'", +do_file_process($goodsent_file, $senttok_file, ($force or $force_senttok), + "$path/senttok/senttok.pl '${goodsent_file}' > '${senttok_file}'", "Matching sentences against rules", exists $opts{t}); -print "Matching ${f}.senttok against rules" if ($verbose); -execute("$path/matcher/matcher.pl '${f}.senttok' > '${f}.license'"); +print "Matching ${input_file}.senttok against rules" if ($verbose); +execute("$path/matcher/matcher.pl '${input_file}.senttok' > '${input_file}.license'"); -print `cat '${f}.license'`; +print `cat '${input_file}.license'`; -unlink("${f}.code"); +unlink("${input_file}.code"); if ($delete) { - unlink("${f}.badsent"); - unlink("${f}.comments"); - unlink("${f}.goodsent"); -# unlink("${f}.sentences"); - unlink("${f}.senttok"); + unlink("${input_file}.badsent"); + unlink("${input_file}.comments"); + unlink("${input_file}.goodsent"); +# unlink("${input_file}.sentences"); + unlink("${input_file}.senttok"); } exit 0; -sub Do_File_Process { +sub do_file_process { my ($input, $output, $force, $cmd, $message, $end) = @_; print "${message}:" if ($verbose); - if ($force or newer($input, $output)) { + if ($force or is_newer($input, $output)) { print "Running ${cmd}:" if ($verbose); execute($cmd); } else { @@ -139,15 +137,15 @@ sub Do_File_Process { } sub execute { - my ($c) = @_; -# print "\nTo execute [$c]\n"; - my $r = `$c`; + my ($command) = @_; +# print "\nTo execute [$command]\n"; + my $result = `$command`; my $status = ($? >> 8); - die "execution of program [$c] failed: status [$status]" if ($status != 0); - return $r; + die "execution of program [$command] failed: status [$status]" if ($status != 0); + return $result; } -sub newer { +sub is_newer { my ($f1, $f2) = @_; my ($f1write) = (stat($f1))[9]; my ($f2write) = (stat($f2))[9]; diff --git a/senttok/senttok.pl b/senttok/senttok.pl index e960e2e..5bef3c5 100755 --- a/senttok/senttok.pl +++ b/senttok/senttok.pl @@ -26,75 +26,75 @@ $path =~ s/[^\/]+$//; if ($path eq "") { $path = "./"; } -my $licSentences = $path . "licensesentence.dict"; +my $path_license_sentences = $path . "licensesentence.dict"; open FH, "<$ARGV[0]"; -my @licensesentencelist=(); -open LICENSESENTENCEFILE, "<$licSentences"; +my @license_sentences = (); +open LICENSESENTENCEFILE, "<$path_license_sentences"; my $line; while ($line = <LICENSESENTENCEFILE>) { chomp $line; next if $line =~ /^\#/; next if $line =~ /^ *$/; die "Illegal format in license expression [$line] " unless $line =~ /(.*?):(.*?):(.*)/; - push @licensesentencelist,$line; + push @license_sentences, $line; } -#foreach $line (@licensesentencelist) { +#foreach $line (@license_sentences) { # print $line; #} close LICENSESENTENCEFILE; while ($line = <>) { - my $saveLine; - my $originalLine; + my $save_line; + my $original_line; chomp $line; - $originalLine = $line; + $original_line = $line; if ($line =~ s/^Alternatively,? ?//) { print "Altern\n"; } - $line = Normalize_Sentence($line); + $line = normalize_sentence($line); my $check = 0; - my $matchname = "UNKNOWN"; + my $match_name = "UNKNOWN"; my @parm = (); my $sentence; my $distance = 1; #maximum? number - my $mostsimilarname = "UNKNOWN"; + my $most_similar_name = "UNKNOWN"; my $before; my $after; my $gpl = 0; - my ($gplLater, $gplVersion); + my ($gpl_later, $gpl_version); - $saveLine = $line; + $save_line = $line; # print "Original # [$line] #\n"; - my $lineAsGPL = ''; + my $line_as_gpl = ''; - if (Looks_Like_GPL($line)) { + if (looks_like_gpl($line)) { my $old = $line; $gpl = 1; - ($line, $gplLater, $gplVersion) = Normalize_GPL($line); - $lineAsGPL = $line; + ($line, $gpl_later, $gpl_version) = normalize_gpl($line); + $line_as_gpl = $line; } - my ($name, $subRule, $number, $regexp, $option); - my $saveLine = $line; - my $saveGPL = $gpl; + my ($name, $sub_rule, $number, $regexp, $option); + my $save_line = $line; + my $save_gpl = $gpl; my $LGPL = ""; - foreach $sentence (@licensesentencelist) { - ($name, $subRule, $number, $regexp, $option) = split(/:/, $sentence); + foreach $sentence (@license_sentences) { + ($name, $sub_rule, $number, $regexp, $option) = split(/:/, $sentence); # we need this due to the goto again - $line = $saveLine; - $gpl = $saveGPL; + $line = $save_line; + $gpl = $save_gpl; $LGPL = ""; again: # print "Testing # lin[$line] -# ori[$saveLine] +# ori[$save_line] # re [$regexp] # lpg[$LGPL] #\n"; @@ -102,7 +102,7 @@ while ($line = <>) { $before = $`; $after = $'; #'; $check = 1; - $matchname = $name; + $match_name = $name; for (my $i = 1; $i <= $number; $i++) { no strict 'refs'; push @parm, $$i; @@ -118,7 +118,7 @@ while ($line = <>) { } if ($gpl) { $gpl = 0; - $line = $saveLine; + $line = $save_line; goto again; } next;## dmg @@ -126,7 +126,7 @@ while ($line = <>) { $targetset =~ s/^(.*)$/$1/; my $tmpdist = levenshtein($line, $targetset) / max(length($targetset), length($sentence)); if ($tmpdist < $distance) { - $mostsimilarname = $name; + $most_similar_name = $name; $distance = $tmpdist; } } @@ -135,26 +135,26 @@ while ($line = <>) { if ($check == 1) { # licensesentence name, parm1, parm2,.. if ($gpl) { - $matchname .= "Ver" . $gplVersion; - $matchname .= "+" if $gplLater; - $matchname = $LGPL . $matchname; + $match_name .= "Ver" . $gpl_version; + $match_name .= "+" if $gpl_later; + $match_name = $LGPL . $match_name; } if (length($before) > $TOO_LONG || length($after) > $TOO_LONG) { - $matchname .= "-TOOLONG"; + $match_name .= "-TOOLONG"; } - my $parmstrings = join(";",$matchname, $subRule, $before, $after, @parm); - print $parmstrings, ":$originalLine\n"; + my $parmstrings = join(";", $match_name, $sub_rule, $before, $after, @parm); + print $parmstrings, ":$original_line\n"; } else { # UNKNOWN, sentence chomp $line; - print $matchname, ";", 0, ";", $mostsimilarname, ";", $distance, ";", $saveLine, ":$originalLine\n"; + print $match_name, ";", 0, ";", $most_similar_name, ";", $distance, ";", $save_line, ":$original_line\n"; } } close FH; exit 0; -sub Normalize_GPL { +sub normalize_gpl { my ($line) = @_; my $later = 0; my $version = 0; @@ -222,7 +222,7 @@ sub Normalize_GPL { return ($line,$later,$version); } -sub Looks_Like_GPL { +sub looks_like_gpl { my ($line) = @_; return 1 if $line =~ /GNU/; @@ -232,7 +232,7 @@ sub Looks_Like_GPL { return 0; } -sub Normalize_Sentence { +sub normalize_sentence { my ($line) = @_; # do some very quick spelling corrections for english/british words $line =~ s/icence/icense/ig; diff --git a/splitter/splitter.pl b/splitter/splitter.pl index 69039ad..c806f96 100755 --- a/splitter/splitter.pl +++ b/splitter/splitter.pl @@ -30,11 +30,11 @@ use strict; # This program is originally based on the sentence splitter program # published by Paul Clough. Version 1.0, but then it was mostly rewritten -# His ideas, however, linger in here (and his dictionary of abbreviations) +# His ideas, however, linger in here (and his dictionary_file of abbreviations) -my $dictionary = 'splitter.dict'; -my $abbrv_file = 'splitter.abv'; -my $len = 0; +my $dictionary_file = 'splitter.dict'; +my $abbreviations_file = 'splitter.abv'; +my $length = 0; my %COMMON_TERMS = (); my %ABBREVIATIONS = (); my $output_file = $ARGV[0]; @@ -45,8 +45,8 @@ $path =~ s/[^\/]+$//; if ($path eq '') { $path = './'; } -$dictionary = $path . $dictionary; -$abbrv_file = $path . $abbrv_file; +$dictionary_file = $path . $dictionary_file; +$abbreviations_file = $path . $abbreviations_file; die "Usage $0 <filename>.comments" unless $ARGV[0] =~ /\.comments$/; @@ -59,10 +59,10 @@ open(OUT, ">$output_file") or die("Unable to create output file [$output_file]") # Load in the dictionary and find the common words. # Here, we assume the words in upper case are simply names and one # word per line - i.e. in same form as /usr/dict/words -&loadDictionary; +&load_dictionary; # Same assumptions as for dictionary -&loadAbbreviations; +&load_abbreviations; my $text; # open(FILE, $opt_f) or die "Can't open $opt_f for reading\n"; @@ -132,22 +132,22 @@ while ($text =~ /^([^\n]*)\n/gsm) { $count++ if ($c ge 'A' && $c le 'z'); } - my @sentences = Split_Text($curr); + my @sentences = split_text($curr); my $count2 = 0; - foreach my $s (@sentences) { - for my $i (0..length($s)-1) { - my $c = substr($s, $i, 1); + foreach my $sentence (@sentences) { + for my $i (0..length($sentence)-1) { + my $c = substr($sentence, $i, 1); $count2++ if ($c ge 'A' && $c le 'z'); } - print OUT Clean_Sentence($s) , "\n"; + print OUT clean_sentence($sentence) , "\n"; } if ($count != $count2) { print STDERR "-------------------------------------\n"; print STDERR "[$curr]\n"; - foreach my $s (@sentences) { - print STDERR Clean_Sentence($s) , "\n"; + foreach my $sentence (@sentences) { + print STDERR clean_sentence($sentence) , "\n"; } die "Number of printable chars does not match! [$count][$count2]"; } @@ -161,7 +161,7 @@ exit; # procedures #*************************************************************************************************** -sub Clean_Sentence { +sub clean_sentence { ($_) = @_; # check for trailing bullets of different types @@ -187,16 +187,16 @@ sub Clean_Sentence { return $_; } -sub Split_Text { +sub split_text { my ($text) = @_; - my $len = 0; + my $length = 0; my $next_word; my $last_word; my $stuff_after_period; my $puctuation; my @result; my $after; - my $currentSentence = ''; + my $current_sentence = ''; # this breaks the sentence into # 1. Any text before a separator # 2. The separator [.!?:\n] @@ -207,21 +207,21 @@ sub Split_Text { (?=(.?)) /xsm) { #/(?:(?=([([{\"\'`)}\]<]*[ ]+)[([{\"\'`)}\] ]*([A-Z0-9][a-z]*))|(?=([()\"\'`)}\<\] ]+)\s))/sm ) { $text = $'; #'; - my $sentenceMatch = $1; + my $sentence_match = $1; my $sentence = $1 . $2; my $punctuation = $2; $after = $3; # if next character is not a space, then we are not in a sentence" if ($after ne ' ' && $after ne "\t") { - $currentSentence .= $sentence; + $current_sentence .= $sentence; next; } #at this point we know that there is a space after if ($punctuation eq ':' || $punctuation eq '?' || $punctuation eq '!') { # let us consider this right here a beginning of a sentence - push @result, $currentSentence . $sentence; - $currentSentence = ''; + push @result, $current_sentence . $sentence; + $current_sentence = ''; next; } if ($punctuation eq '.') { @@ -235,33 +235,33 @@ sub Split_Text { # is the last word an abbreviation? For this the period has to follow the word # this expression might have to be updated to take care of special characters in names :( - if ($sentenceMatch =~ /(.?)([^[:punct:]\s]+)$/) { + if ($sentence_match =~ /(.?)([^[:punct:]\s]+)$/) { my $before = $1; - my $lastWord = $2; + my $last_word = $2; #is it an abbreviation - if (length($lastWord) == 1 ) { + if (length($last_word) == 1 ) { # single character abbreviations are special... # we will assume they never split the sentence if they are capitalized. - if (($lastWord ge 'A') and ($lastWord le 'Z')) { - $currentSentence .= $sentence; + if (($last_word ge 'A') and ($last_word le 'Z')) { + $current_sentence .= $sentence; next; } - print "last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n"; + print "last word an abbrev $sentence_match lastword [$last_word] before [$before]\n"; # but some are lowercase! - if (($lastWord eq 'e') or ($lastWord eq 'i')) { - $currentSentence .= $sentence; + if (($last_word eq 'e') or ($last_word eq 'i')) { + $current_sentence .= $sentence; next; } - print "2 last word an abbrev $sentenceMatch lastword [$lastWord] before [$before]\n"; + print "2 last word an abbrev $sentence_match lastword [$last_word] before [$before]\n"; } else { - $lastWord = lc $lastWord; + $last_word = lc $last_word; # only accept abbreviations if the previous char to the abbrev is space or # is empty (beginning of line). This avoids things like .c - if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$lastWord}) { - $currentSentence .= $sentence; + if (length($before) > 0 and $before eq ' ' and $ABBREVIATIONS{$last_word}) { + $current_sentence .= $sentence; next; } else { # just keep going, we handle this case below @@ -269,22 +269,22 @@ sub Split_Text { } } - push @result, $currentSentence . $sentence; - $currentSentence = ''; + push @result, $current_sentence . $sentence; + $current_sentence = ''; next; } die 'We have not dealt with this case'; } - push @result, $currentSentence . $text; + push @result, $current_sentence . $text; - #Print_Non_Sentence($text,"\n",''); + #print_non_sentence($text,"\n",''); return @result; } -sub loadDictionary { +sub load_dictionary { my $common_term = ''; - if (open(DICT, $dictionary)) { + if (open(DICT, $dictionary_file)) { while (defined ($line = <DICT>)) { chomp($line); if ($line !~ /^[A-Z]/) { @@ -294,14 +294,14 @@ sub loadDictionary { close(DICT); } else { - die "cannot open dictionary file $dictionary: $!"; + die "cannot open dictionary file $dictionary_file: $!"; } } -sub loadAbbreviations { +sub load_abbreviations { my $abbrv_term = ''; - if (open(ABBRV, $abbrv_file)) { + if (open(ABBRV, $abbreviations_file)) { while (defined ($line = <ABBRV>)) { chomp($line); $ABBREVIATIONS{$line} = $line; @@ -309,7 +309,7 @@ sub loadAbbreviations { close(ABBRV); } else { - die "cannot open dictionary file $abbrv_file: $!"; + die "cannot open abbreviations file $abbreviations_file: $!"; } } |