blob: 626904f6e95bdda56bf237a6ed29369fac15d95d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/perl
#
# Copyright (C) 2009-2010 Yuki Manabe and Daniel M. German
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# filter.pl
# This script classifies input sentences into two categories:
# good sentences and bad sentences.
# A sentence that includes a critical word (e.g. a legal term) is regarded as good.
#
# usage: filter.pl <inputfilename>.sentences
#
# Author: Yuki Manabe
#
use strict;
use warnings;

# Splits the sentences of <name>.sentences into two files:
#   <name>.goodsent - sentences containing at least one critical word
#   <name>.badsent  - every other sentence
# Critical words are read from 'criticalword.dict', which is looked up in the
# directory the script itself was invoked from.

# where are we running the program from
my $path = $0;
$path =~ s{[^/]+$}{};
$path = './' if $path eq '';

my $critWords = $path . 'criticalword.dict';

# The input name must end in '.sentences'; both output names are derived
# from it by swapping that extension.
my $inputfilename = $ARGV[0];
die "Usage: $0 <filename>.sentences\n"
    unless defined $inputfilename && $inputfilename =~ /\.sentences$/;

(my $goodfilename = $inputfilename) =~ s/\.sentences$/.goodsent/;
(my $badfilename  = $inputfilename) =~ s/\.sentences$/.badsent/;

# Three-argument open with lexical handles: the file name is never parsed
# for mode characters, and the handles are scoped to this script.
open(my $input_fh, '<', $inputfilename)
    or die "Error: $inputfilename is not found. ($!)";
open(my $dict_fh, '<', $critWords)
    or die "Error: criticalword.dict is not found. ($!)";

# Read the dictionary into a list of precompiled, case-insensitive,
# word-bounded patterns. This is done before the output files are opened so
# that a broken dictionary entry does not leave truncated outputs behind.
# NOTE(review): entries are interpolated unescaped, so they act as regular
# expressions - confirm the dictionary relies on this before adding quotemeta.
my @cwordlist = ();
while (my $cword = <$dict_fh>) {
    chomp $cword;
    next if $cword =~ /^\#/;       # whole-line comment
    $cword =~ s/\#.*$//;           # remove everything to the end of the line
    $cword =~ s/^\s+|\s+$//g;      # drop padding left by a trailing comment (and any CR)
    # An empty entry would compile to /\b\b/, which matches any sentence
    # containing a word character and would mark everything as good.
    next if $cword eq '';
    push @cwordlist, qr/\b$cword\b/i;
}
close($dict_fh);

open(my $good_fh, '>', $goodfilename)
    or die "Error: cannot write $goodfilename: $!";
open(my $bad_fh, '>', $badfilename)
    or die "Error: cannot write $badfilename: $!";

# Match the critical words in the list against each sentence; the first hit
# is enough to classify the sentence as good.
while (my $sentence = <$input_fh>) {
    chomp $sentence;

    my $is_good = 0;
    foreach my $pattern (@cwordlist) {
        if ($sentence =~ $pattern) {
            $is_good = 1;
            last;
        }
    }

    my $out_fh = $is_good ? $good_fh : $bad_fh;
    print {$out_fh} "$sentence\n"
        or die "Error: write failed: $!";
}

close($input_fh);
# Buffered write errors only surface at close, so check both outputs.
close($good_fh) or die "Error: cannot close $goodfilename: $!";
close($bad_fh)  or die "Error: cannot close $badfilename: $!";
|