2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
* Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
24
import java.io.BufferedReader;
25
import java.io.BufferedWriter;
27
import java.io.FileReader;
28
import java.io.FileWriter;
29
import java.util.Collections;
30
import java.util.Date;
31
import java.util.Enumeration;
32
import java.util.HashSet;
33
import java.util.Iterator;
34
import java.util.Vector;
37
* Class that can test whether a given string is a stop word.
38
* Lowercases all words before the test. <p/>
39
* The format for reading and writing is one word per line, lines starting
40
* with '#' are interpreted as comments and therefore skipped. <p/>
41
* The default stopwords are based on <a href="http://www.cs.cmu.edu/~mccallum/bow/rainbow/" target="_blank">Rainbow</a>. <p/>
43
* Accepts the following parameter: <p/>
46
* loads the stopwords from the given file <p/>
49
* saves the stopwords to the given file <p/>
52
* outputs the current stopwords on stdout <p/>
54
* Any additional parameters are interpreted as words to test as stopwords.
56
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
57
* @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
58
* @author FracPete (fracpete at waikato dot ac dot nz)
59
* @version $Revision: 1.5 $
61
public class Stopwords {
63
/** The hash set containing the list of stopwords */
64
protected HashSet m_Words = null;
66
/** The default stopwords object (stoplist based on Rainbow) */
67
protected static Stopwords m_Stopwords;
70
if (m_Stopwords == null) {
71
m_Stopwords = new Stopwords();
76
* initializes the stopwords (based on <a href="http://www.cs.cmu.edu/~mccallum/bow/rainbow/" target="_blank">Rainbow</a>).
79
m_Words = new HashSet();
81
//Stopwords list from Rainbow
179
add("corresponding");
322
add("ll"); //added to avoid words like you'll,I'll etc.
533
add("unfortunately");
551
add("ve"); //added to avoid words like I've,you've etc.
611
* removes all stopwords
613
public void clear() {
618
* adds the given word to the stopword list (is automatically converted to
619
* lower case and trimmed)
621
* @param word the word to add
623
public void add(String word) {
624
if (word.trim().length() > 0)
625
m_Words.add(word.trim().toLowerCase());
629
* removes the word from the stopword list
631
* @param word the word to remove
632
* @return true if the word was found in the list and then removed
634
public boolean remove(String word) {
635
return m_Words.remove(word);
639
* Returns true if the given string is a stop word.
641
* @param word the word to test
642
* @return true if the word is a stopword
644
public boolean is(String word) {
645
return m_Words.contains(word.toLowerCase());
649
* Returns a sorted enumeration over all stored stopwords
651
* @return the enumeration over all stopwords
653
public Enumeration elements() {
657
iter = m_Words.iterator();
660
while (iter.hasNext())
661
list.add(iter.next());
664
Collections.sort(list);
666
return list.elements();
670
* Generates a new Stopwords object from the given file
672
* @param filename the file to read the stopwords from
673
* @throws Exception if reading fails
675
public void read(String filename) throws Exception {
676
read(new File(filename));
680
* Generates a new Stopwords object from the given file
682
* @param file the file to read the stopwords from
683
* @throws Exception if reading fails
685
public void read(File file) throws Exception {
686
read(new BufferedReader(new FileReader(file)));
690
* Generates a new Stopwords object from the reader. The reader is
691
* closed automatically.
693
* @param reader the reader to get the stopwords from
694
* @throws Exception if reading fails
696
public void read(BufferedReader reader) throws Exception {
701
while ((line = reader.readLine()) != null) {
704
if (line.startsWith("#"))
713
* Writes the current stopwords to the given file
715
* @param filename the file to write the stopwords to
716
* @throws Exception if writing fails
718
public void write(String filename) throws Exception {
719
write(new File(filename));
723
* Writes the current stopwords to the given file
725
* @param file the file to write the stopwords to
726
* @throws Exception if writing fails
728
public void write(File file) throws Exception {
729
write(new BufferedWriter(new FileWriter(file)));
733
* Writes the current stopwords to the given writer. The writer is closed
736
* @param writer the writer to write the stopwords to
737
* @throws Exception if writing fails
739
public void write(BufferedWriter writer) throws Exception {
743
writer.write("# generated " + new Date());
748
while (enm.hasMoreElements()) {
749
writer.write(enm.nextElement().toString());
758
* returns the current stopwords in a string
760
* @return the current stopwords
762
public String toString() {
766
result = new StringBuffer();
768
while (enm.hasMoreElements()) {
769
result.append(enm.nextElement().toString());
770
if (enm.hasMoreElements())
774
return result.toString();
778
* Returns true if the given string is a stop word.
780
* @param str the word to test
781
* @return true if the word is a stopword
783
public static boolean isStopword(String str) {
784
return m_Stopwords.is(str.toLowerCase());
788
* Accepts the following parameter: <p/>
791
* loads the stopwords from the given file <p/>
794
* saves the stopwords to the given file <p/>
797
* outputs the current stopwords on stdout <p/>
799
* Any additional parameters are interpreted as words to test as stopwords.
801
* @param args commandline parameters
802
* @throws Exception if something goes wrong
804
public static void main(String[] args) throws Exception {
805
String input = Utils.getOption('i', args);
806
String output = Utils.getOption('o', args);
807
boolean print = Utils.getFlag('p', args);
810
Vector words = new Vector();
811
for (int i = 0; i < args.length; i++) {
812
if (args[i].trim().length() > 0)
813
words.add(args[i].trim());
816
Stopwords stopwords = new Stopwords();
819
if (input.length() != 0)
820
stopwords.read(input);
823
if (output.length() != 0)
824
stopwords.write(output);
828
System.out.println("\nStopwords:");
829
Enumeration enm = stopwords.elements();
831
while (enm.hasMoreElements()) {
832
System.out.println((i+1) + ". " + enm.nextElement());
837
// check words for being a stopword
838
if (words.size() > 0) {
839
System.out.println("\nChecking for stopwords:");
840
for (int i = 0; i < words.size(); i++) {
842
(i+1) + ". " + words.get(i) + ": "
843
+ stopwords.is(words.get(i).toString()));