summaryrefslogtreecommitdiff
path: root/filter/filter.pl
blob: 35d8d3bec22988c5dd75a07cc87ab10de325c5fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/perl
#
#    Copyright (C) 2009-2010  Yuki Manabe and Daniel M. German
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#


#
# filter.pl
# This script classifies input sentences into two categories:
# good sentences and bad sentences.
# A sentence is regarded as good if it contains a critical word (e.g. a legal term).
#
# usage: filter.pl (inputfilename)
#
# Author: Yuki Manabe
#
use strict;
use warnings;


#print $ARGV[0];

# Split a "<name>.sentences" file into "<name>.goodsent" and "<name>.badsent".
#
# A sentence (one per input line) is "good" when it matches at least one
# entry of criticalword.dict, which is looked up in the directory this
# script was started from.  Dictionary entries are interpolated into a
# case-insensitive, word-bounded regular expression, so an entry may itself
# be a regular expression.  Every other sentence is "bad".

# where are we running the program from
my $path = $0;
$path =~ s/[^\/]+$//;
if ($path eq "") {
    $path = "./";
}
my $critWords = $path . "criticalword.dict";

# The single argument must be present and end in ".sentences"; both output
# names are derived from it by swapping that suffix.
my $inputfilename = $ARGV[0];
die "Usage: $0 <filename>.sentences\n"
    unless defined $inputfilename && $inputfilename =~ /\.sentences$/;

(my $goodfilename = $inputfilename) =~ s/\.sentences$/.goodsent/;
(my $badfilename  = $inputfilename) =~ s/\.sentences$/.badsent/;

# Three-argument open with lexical handles: the file name comes from the
# command line and must never be interpreted as an open mode or a pipe.
open(my $inputfile, '<', $inputfilename)
    or die "Error: cannot open '$inputfilename': $!\n";
open(my $dictionary, '<', $critWords)
    or die "Error: cannot open dictionary '$critWords': $!\n";

# Read the dictionary into a list of precompiled patterns, so that each
# pattern is compiled once instead of once per sentence.
my @cwordlist = ();
while (my $cword = <$dictionary>) {
    chomp $cword;
    next if $cword =~ /^\#/;    # whole-line comment
    $cword =~ s/\#.*$//;        # remove an inline comment up to the end of the line
    # Trim surrounding whitespace (including a stray "\r" from CRLF files
    # and the blanks left in front of an inline comment); otherwise the
    # trailing blanks become part of the pattern and it never matches.
    $cword =~ s/^\s+//;
    $cword =~ s/\s+$//;
    # An empty entry would compile to /\b\b/, which matches nearly every
    # sentence and would classify everything as good.
    next if $cword eq "";

    my $pattern = eval { qr/\b$cword\b/i };
    die "Error: invalid entry '$cword' in '$critWords': $@"
        unless defined $pattern;
    push(@cwordlist, $pattern);
}
close($dictionary);

open(my $goodout, '>', $goodfilename)
    or die "Error: cannot create '$goodfilename': $!\n";
open(my $badout, '>', $badfilename)
    or die "Error: cannot create '$badfilename': $!\n";

# Match the critical words in the list against each sentence; the first
# hit is enough to call the sentence good.
SENTENCE:
while (my $sentence = <$inputfile>) {
    chomp $sentence;
    foreach my $pattern (@cwordlist) {
        if ($sentence =~ $pattern) {
            print {$goodout} "$sentence\n";
            next SENTENCE;
        }
    }
    print {$badout} "$sentence\n";
}

close($inputfile);
# Buffered write errors (e.g. a full disk) only surface at close time.
close($goodout) or die "Error: cannot write '$goodfilename': $!\n";
close($badout)  or die "Error: cannot write '$badfilename': $!\n";