blob: 18aeb7f5d3898b9e8b19cfe57f51692a1de13396 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.
# Lowercase words that strip_stopwords() filters out of a sentence.
# Deliberately kept as a plain list rather than a set.
stopwords = [
    'i', 'a', 'an', 'are', 'as',
    'at', 'be', 'by', 'for', 'from',
    'how', 'in', 'is', 'it', 'of',
    'on', 'or', 'that', 'the', 'this',
    'to', 'was', 'what', 'when', 'where',
]
def strip_stopwords(sentence):
    """Return ``sentence`` with every stopword removed.

    The sentence is split on runs of whitespace, each token is compared
    case-insensitively (via ``lower()``) against the module-level
    ``stopwords`` list, and the surviving tokens are re-joined with a
    single space. As a side effect, leading/trailing whitespace is dropped
    and internal whitespace runs collapse to one space. Kept tokens retain
    their original casing; tokens with attached punctuation (e.g. ``"the,"``)
    are not treated as stopwords because the comparison is exact.
    """
    kept = [
        token
        for token in sentence.split()
        if token.lower() not in stopwords
    ]
    return u' '.join(kept)
|