2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* TextDirectoryLoader.java
19
* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
23
package weka.core.converters;
25
import weka.core.Attribute;
26
import weka.core.FastVector;
27
import weka.core.Instance;
28
import weka.core.Instances;
29
import weka.core.Option;
30
import weka.core.OptionHandler;
31
import weka.core.Utils;
33
import java.io.BufferedInputStream;
35
import java.io.FileInputStream;
36
import java.io.IOException;
37
import java.util.Enumeration;
38
import java.util.Vector;
41
<!-- globalinfo-start -->
42
* Loads all text files in a directory and uses the subdirectory names as class labels. The content of the text files will be stored in a String attribute, the filename can be stored as well.
44
<!-- globalinfo-end -->
46
<!-- options-start -->
47
* Valid options are: <p/>
50
* Enables debug output.
51
* (default: off)</pre>
54
* Stores the filename in an additional attribute.
55
* (default: off)</pre>
57
* <pre> -dir &lt;directory&gt;
58
* The directory to work on.
59
* (default: current directory)</pre>
63
* Based on code from the TextDirectoryToArff tool:
65
* <li><a href="http://list.scms.waikato.ac.nz/mailman/htdig/wekalist/2002-October/000681.html" target="_blank">Original tool</a></li>
66
* <li><a href="http://list.scms.waikato.ac.nz/mailman/htdig/wekalist/2004-January/002152.html" target="_blank">Current version</a></li>
67
* <li><a href="http://weka.sourceforge.net/wiki/index.php/ARFF_files_from_Text_Collections" target="_blank">Wiki article</a></li>
70
* @author Ashraf M. Kibriya (amk14 at cs.waikato.ac.nz)
71
* @author Richard Kirkby (rkirkby at cs.waikato.ac.nz)
72
* @author fracpete (fracpete at waikato dot ac dot nz)
73
* @version $Revision: 1.2 $
76
public class TextDirectoryLoader
77
extends AbstractLoader
78
implements BatchConverter, OptionHandler {
80
/** Version identifier used for serialization. */
81
private static final long serialVersionUID = 2592118773712247647L;
83
/** Holds the determined structure (header) of the data set; null until getStructure() has built it. */
84
protected Instances m_structure = null;
86
/** Holds the source (directory) of the data set; initialized to the directory named by the "user.dir" system property. */
87
protected File m_sourceFile = new File(System.getProperty("user.dir"));
89
/** Whether to print some debug information (off by default). */
90
protected boolean m_Debug = false;
92
/** Whether to include the filename as an extra attribute (off by default). */
93
protected boolean m_OutputFilename = false;
98
public TextDirectoryLoader() {
99
// No instances retrieved yet
104
* Returns a string describing this loader
106
* @return a description of the loader suitable for
107
* displaying in the explorer/experimenter gui
109
public String globalInfo() {
111
"Loads all text files in a directory and uses the subdirectory names "
112
+ "as class labels. The content of the text files will be stored in a "
113
+ "String attribute, the filename can be stored as well.";
117
* Lists the available options
119
* @return an enumeration of the available options
121
public Enumeration listOptions() {
// Collects one Option entry per supported command-line switch and hands
// them back as an Enumeration over the backing Vector.
123
Vector result = new Vector();
125
// Debug switch (its name/synopsis arguments are not visible in this excerpt).
result.add(new Option(
126
"\tEnables debug output.\n"
127
+ "\t(default: off)",
130
// Filename switch (its name/synopsis arguments are not visible in this excerpt).
result.add(new Option(
131
"\tStores the filename in an additional attribute.\n"
132
+ "\t(default: off)",
135
// NOTE(review): "dir" is registered with 0 as its third argument although
// -dir expects a value; presumably that argument is the option's argument
// count -- confirm whether it should be 1.
result.add(new Option(
136
"\tThe directory to work on.\n"
137
+ "\t(default: current directory)",
138
"dir", 0, "-dir <directory>"));
140
return result.elements();
144
* Parses a given list of options. <p/>
146
<!-- options-start -->
147
* Valid options are: <p/>
150
* Enables debug output.
151
* (default: off)</pre>
154
* Stores the filename in an additional attribute.
155
* (default: off)</pre>
157
* <pre> -dir &lt;directory&gt;
158
* The directory to work on.
159
* (default: current directory)</pre>
163
* @param options the options
164
* @throws Exception if options cannot be set
166
public void setOptions(String[] options) throws Exception {
167
setDebug(Utils.getFlag("D", options));
169
setOutputFilename(Utils.getFlag("F", options));
171
setDirectory(new File(Utils.getOption("dir", options)));
177
* @return the current setting
179
public String[] getOptions() {
// Collects the loader's current settings as command-line style strings.
180
Vector options = new Vector();
185
// NOTE(review): the statement guarded by this condition is not visible in
// this excerpt; presumably it appends the filename flag -- confirm.
if (getOutputFilename())
189
// The directory is reported as an absolute path.
options.add(getDirectory().getAbsolutePath());
191
// Copy the raw Vector into a String array of exactly matching size.
return (String[]) options.toArray(new String[options.size()]);
195
* Sets whether to print some debug information.
197
* @param value if true additional debug information will be printed.
199
public void setDebug(boolean value) {
204
* Gets whether additional debug information is printed.
206
* @return true if additional debug information is printed
208
public boolean getDebug() {
213
* the tip text for this property
215
* @return the tip text
217
public String debugTipText(){
218
return "Whether to print additional debug information to the console.";
222
* Sets whether the filename will be stored as an extra attribute.
224
* @param value if true the filename will be stored in an extra
227
public void setOutputFilename(boolean value) {
228
m_OutputFilename = value;
233
* Gets whether the filename will be stored as an extra attribute.
235
* @return true if the filename is stored in an extra attribute
237
public boolean getOutputFilename() {
238
return m_OutputFilename;
242
* the tip text for this property
244
* @return the tip text
246
public String outputFilenameTipText(){
247
return "Whether to store the filename in an additional attribute.";
251
* Returns a description of the file type, actually it's directories.
253
* @return a short file description
255
public String getFileDescription() {
256
return "Directories";
260
* Gets the directory specified as the source.
262
* @return the source directory
264
public File getDirectory() {
265
return new File(m_sourceFile.getAbsolutePath());
269
* sets the source directory
271
* @param dir the source directory
272
* @throws IOException if an error occurs
274
public void setDirectory(File dir) throws IOException {
279
* Resets the loader ready to read a new data set
281
public void reset() {
287
* Resets the Loader object and sets the source of the data set to be
288
* the supplied File object.
290
* @param dir the source directory.
291
* @throws IOException if an error occurs
293
public void setSource(File dir) throws IOException {
// NOTE(review): the condition guarding the following throw is not visible in
// this excerpt; judging by the message it presumably tests dir for null -- confirm.
297
throw new IOException("Source directory object is null!");
301
// Reject paths that do not exist or that exist but are not directories.
if (!dir.exists() || !dir.isDirectory())
302
throw new IOException("Directory '" + dir + "' not found");
306
* Determines and returns (if possible) the structure (internally the
307
* header) of the data set as an empty set of instances.
309
* @return the structure of the data set as an empty
311
* @throws IOException if an error occurs
313
public Instances getStructure() throws IOException {
314
if (getDirectory() == null) {
315
throw new IOException("No directory/source has been specified");
318
// determine class labels, i.e., sub-dirs
319
if (m_structure == null) {
320
String directoryPath = getDirectory().getAbsolutePath();
321
FastVector atts = new FastVector();
322
FastVector classes = new FastVector();
324
File dir = new File(directoryPath);
325
String[] subdirs = dir.list();
327
for (int i = 0; i < subdirs.length; i++) {
328
File subdir = new File(directoryPath + File.separator + subdirs[i]);
329
if (subdir.isDirectory())
330
classes.addElement(subdirs[i]);
333
atts.addElement(new Attribute("text", (FastVector) null));
334
if (m_OutputFilename)
335
atts.addElement(new Attribute("filename", (FastVector) null));
336
atts.addElement(new Attribute("class", classes));
338
String relName = directoryPath.replaceAll("/", "_");
339
relName = relName.replaceAll("\\\\", "_").replaceAll(":", "_");
340
m_structure = new Instances(relName, atts, 0);
341
m_structure.setClassIndex(m_structure.numAttributes() - 1);
348
* Return the full data set. If the structure hasn't yet been determined
349
* by a call to getStructure then method should do so before processing
350
* the rest of the data set.
352
* @return the full data set, i.e., the header plus one instance per text file
353
* @throws IOException if there is no source or parsing fails
355
public Instances getDataSet() throws IOException {
356
if (getDirectory() == null)
357
throw new IOException("No directory/source has been specified");
359
String directoryPath = getDirectory().getAbsolutePath();
360
FastVector classes = new FastVector();
361
Enumeration enm = getStructure().classAttribute().enumerateValues();
362
while (enm.hasMoreElements())
363
classes.addElement(enm.nextElement());
365
Instances data = getStructure();
367
for (int k = 0; k < classes.size(); k++) {
368
String subdirPath = (String) classes.elementAt(k);
369
File subdir = new File(directoryPath + File.separator + subdirPath);
370
String[] files = subdir.list();
371
for (int j = 0; j < files.length; j++) {
376
"processing " + fileCount + " : " + subdirPath + " : " + files[j]);
378
double[] newInst = null;
379
if (m_OutputFilename)
380
newInst = new double[3];
382
newInst = new double[2];
383
File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
384
BufferedInputStream is;
385
is = new BufferedInputStream(new FileInputStream(txt));
386
StringBuffer txtStr = new StringBuffer();
388
while ((c = is.read()) != -1) {
389
txtStr.append((char) c);
392
newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
393
if (m_OutputFilename)
394
newInst[1] = (double) data.attribute(1).addStringValue(subdirPath + File.separator + files[j]);
395
newInst[data.classIndex()] = (double) k;
396
data.add(new Instance(1.0, newInst));
398
catch (Exception e) {
399
System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath + File.separator + files[j]);
408
* TextDirectoryLoader is unable to process a data set incrementally.
410
* @param structure ignored
411
* @return never returns without throwing an exception
412
* @throws IOException always. TextDirectoryLoader is unable to process a data
415
public Instance getNextInstance(Instances structure) throws IOException {
416
throw new IOException("TextDirectoryLoader can't read data sets incrementally.");
422
* @param args the command-line options for the loader (e.g., the directory to load via -dir)
424
public static void main(String[] args) {
// With at least one argument: configure a loader from the arguments and
// print the loaded data set to stdout.
425
if (args.length > 0) {
427
TextDirectoryLoader loader = new TextDirectoryLoader();
428
loader.setOptions(args);
429
System.out.println(loader.getDataSet());
431
// NOTE(review): the handler body is not visible in this excerpt.
catch (Exception e) {
438
// Fragment of the usage message (the surrounding statement is not visible here).
+ "\tTextDirectoryLoader [options]\n"
442
// List every supported option (synopsis followed by description) on stderr.
Enumeration enm = ((OptionHandler) new TextDirectoryLoader()).listOptions();
443
while (enm.hasMoreElements()) {
444
Option option = (Option) enm.nextElement();
445
System.err.println(option.synopsis());
446
System.err.println(option.description());
449
System.err.println();