diff options
Diffstat (limited to 'filter/filter.pl')
-rwxr-xr-x | filter/filter.pl | 93 |
1 files changed, 93 insertions, 0 deletions
#!/usr/bin/perl
#
#    Copyright (C) 2009-2010  Yuki Manabe and Daniel M. German
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#
# filter.pl
# This script classifies input sentences into two categories:
# good sentences and bad sentences.
# A sentence is regarded as good when it contains a critical word
# (e.g. a legal term) listed in criticalword.dict, which is looked up in
# the same directory as this script (derived from $0).
#
# usage: filter.pl <filename>.sentences
#
# Output: <filename>.goodsent receives the good sentences and
#         <filename>.badsent receives the bad ones, one sentence per line.
#
# Author: Yuki Manabe
#
use strict;
use warnings;

use File::Basename qw(dirname);
use File::Spec;

main(@ARGV);

# Entry point.
#   $input_name - path of the input file; must end in ".sentences".
# Dies with a usage message when the argument is missing or has the wrong
# suffix, and with a descriptive error when any file cannot be opened,
# written or closed.
sub main {
    my ($input_name) = @_;

    die "Usage: $0 <filename>.sentences\n"
        unless defined $input_name && $input_name =~ /\.sentences\z/;

    # Output names are the input name with its suffix swapped.
    (my $good_name = $input_name) =~ s/\.sentences\z/.goodsent/;
    (my $bad_name  = $input_name) =~ s/\.sentences\z/.badsent/;

    # The dictionary lives next to the script itself.
    my $dict_name = File::Spec->catfile(dirname($0), 'criticalword.dict');

    # Three-argument open with lexical handles: the file name can never be
    # interpreted as an open mode.
    open my $in, '<', $input_name
        or die "Error: cannot open $input_name: $!";

    # Load the dictionary before creating the output files, so that a
    # missing or broken dictionary does not leave empty outputs behind.
    my @patterns = load_critical_patterns($dict_name);

    open my $good, '>', $good_name
        or die "Error: cannot write $good_name: $!";
    open my $bad, '>', $bad_name
        or die "Error: cannot write $bad_name: $!";

    # Match the critical words against each sentence.
    while (my $sentence = <$in>) {
        chomp $sentence;
        my $out = contains_critical_word($sentence, \@patterns) ? $good : $bad;
        print {$out} "$sentence\n"
            or die "Error: write failed: $!";
    }

    close $in;

    # Buffered write errors only surface at close time, so check them.
    close $good or die "Error: cannot close $good_name: $!";
    close $bad  or die "Error: cannot close $bad_name: $!";

    return;
}

# Read the critical word dictionary and return a list of precompiled,
# case-insensitive, word-bounded patterns (one per dictionary entry).
#   $dict_name - path of the dictionary file.
# Everything from '#' to the end of a line is a comment.  Surrounding
# whitespace is trimmed and lines that end up empty are skipped: an empty
# entry would otherwise compile to /\b\b/ and match almost every sentence.
# Entries are deliberately interpolated as regular expressions, not as
# literal strings, exactly as "\b<entry>\b".
sub load_critical_patterns {
    my ($dict_name) = @_;

    open my $dict, '<', $dict_name
        or die "Error: cannot open $dict_name: $!";

    my @patterns;
    while (my $word = <$dict>) {
        chomp $word;
        $word =~ s/\#.*$//;         # remove everything to the end of the line
        $word =~ s/^\s+|\s+$//g;    # trim blanks (and a stray CR)
        next if $word eq '';

        # Compile once here instead of once per sentence; report a broken
        # entry together with its position in the dictionary.
        my $pattern = eval { qr/\b$word\b/i };
        die "Error: invalid entry '$word' at $dict_name line $.: $@"
            unless defined $pattern;

        push @patterns, $pattern;
    }
    close $dict;

    return @patterns;
}

# Return 1 when $sentence matches at least one of the patterns in the
# array referenced by $patterns, 0 otherwise.  Stops at the first match.
sub contains_critical_word {
    my ($sentence, $patterns) = @_;

    for my $pattern (@{$patterns}) {
        return 1 if $sentence =~ $pattern;
    }
    return 0;
}