 * @author Stuart Inglis (stuart@reeltwo.com)
 * @author Gordon Paynter (gordon.paynter@ucr.edu)
 * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
 * @version $Revision: 1.25 $
 */
public class StringToWordVector
  extends Filter
  implements UnsupervisedFilter, OptionHandler {

  /** for serialization. */
  static final long serialVersionUID = 8249106275278565424L;
  /** Range of columns to convert to word vectors. */
  protected Range m_SelectedRange = new Range("first-last");

  /** Contains a mapping of valid words to attribute indexes. */
  private TreeMap m_Dictionary = new TreeMap();

  /** True if output instances should contain word frequency rather than boolean 0 or 1. */
  private boolean m_OutputCounts = false;

  /** A String prefix for the attribute names. */
  private String m_Prefix = "";
  /** Contains the number of documents (instances) a particular word appears in.
      The counts are stored with the same indexing as given by m_Dictionary. */
  private int [] m_DocsCounts;

  /** Contains the number of documents (instances) in the input format from
      which the dictionary is created. It is used in the IDF transform. */
  private int m_NumInstances = -1;

  /** Contains the average length of the input format
      documents which will be normalized to average document length. */
  private double m_AvgDocLength = -1;
  /**
   * The default number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   */
  private int m_WordsToKeep = 1000;

  /**
   * The percentage at which to periodically prune the dictionary.
   */
  private double m_PeriodicPruningRate = -1;
  /** True if word frequencies should be transformed into log(1+fi)
      where fi is the frequency of word i. */
  private boolean m_TFTransform;

  /** The normalization to apply. */
  protected int m_filterType = FILTER_NONE;

  /** normalization: No normalization. */
  public static final int FILTER_NONE = 0;
  /** normalization: Normalize all data. */
  public static final int FILTER_NORMALIZE_ALL = 1;
  /** normalization: Normalize test data only. */
  public static final int FILTER_NORMALIZE_TEST_ONLY = 2;
  /** Specifies whether document's (instance's) word frequencies are
      to be normalized. They are normalized to the average length of the
      documents specified in the input format. */
  public static final Tag[] TAGS_FILTER = {
    new Tag(FILTER_NONE, "No normalization"),
    new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"),
    new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only"),
  };
  /** True if word frequencies should be transformed into
      fij*log(numOfDocs/numOfDocsWithWordi). */
  private boolean m_IDFTransform;

  /** True if all tokens should be downcased. */
  private boolean m_lowerCaseTokens;

  /** True if tokens that are on a stoplist are to be ignored. */
  private boolean m_useStoplist;

  /** the stemming algorithm. */
  private Stemmer m_Stemmer = new NullStemmer();

  /** the minimum (per-class) word frequency. */
  private int m_minTermFreq = 1;

  /** if true, the maximum number of words and the minimum term frequency
      are not enforced on a per-class basis. */
  private boolean m_doNotOperateOnPerClassBasis = false;

  /** a file containing stopwords for using others than the default Rainbow
      ones. */
  private File m_Stopwords = new File(System.getProperty("user.dir"));

  /** the tokenizer algorithm to use. */
  private Tokenizer m_Tokenizer = new WordTokenizer();
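
  // --------------------------------------------------------------------
  // [Editor's usage sketch -- not part of the original source.  A minimal
  // illustration of how the fields above are configured through the
  // filter's bean properties and how the filter is applied to a dataset.
  // setTFTransform(), setIDFTransform(), setWordsToKeep() and
  // setNormalizeDocLength() appear later in this file; setOutputWordCounts()
  // and setLowerCaseTokens() are assumed to be the matching setters for
  // m_OutputCounts and m_lowerCaseTokens.]
  public static Instances exampleVectorize(Instances train) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setOutputWordCounts(true);  // word frequencies instead of 0/1
    filter.setLowerCaseTokens(true);   // downcase tokens before dictionary lookup
    filter.setTFTransform(true);       // store log(1+fij)
    filter.setIDFTransform(true);      // weight by log(numDocs/docFreq)
    filter.setWordsToKeep(1000);       // approximate dictionary size per class
    filter.setInputFormat(train);      // first batch determines the dictionary
    return Filter.useFilter(train, filter);
  }
  // --------------------------------------------------------------------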
  /**
   * Parses a given list of options. <p/>
   *
   * <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -C
   * Output word counts rather than boolean word presence.</pre>
   *
   * <pre> -R <index1,index2-index4,...>
   * Specify list of string attributes to convert to words (as weka Range).
   * (default: select all string attributes)</pre>
   *
   * <pre> -V
   * Invert matching sense of column indexes.</pre>
   *
   * <pre> -P <attribute name prefix>
   * Specify a prefix for the created attribute names.
   * (default: "")</pre>
   *
   * <pre> -W <number of words to keep>
   * Specify approximate number of word fields to create.
   * Surplus words will be discarded.
   * (default: 1000)</pre>
   *
   * <pre> -prune-rate <rate as a percentage of dataset>
   * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.
   * -W prunes after creating a full dictionary. You may not have enough memory for this approach.
   * (default: no periodic pruning)</pre>
   *
   * <pre> -T
   * Transform the word frequencies into log(1+fij)
   * where fij is the frequency of word i in jth document(instance).</pre>
   *
   * <pre> -I
   * Transform each word frequency into:
   * fij*log(num of Documents/num of documents containing word i)
   * where fij is the frequency of word i in jth document(instance)</pre>
   *
   * <pre> -N
   * Whether to 0=not normalize/1=normalize all data/2=normalize test data only
   * to average length of training documents (default 0=don't normalize).</pre>
   *
   * <pre> -L
   * Convert all tokens to lowercase before adding to the dictionary.</pre>
   *
   * <pre> -S
   * Ignore words that are in the stoplist.</pre>
   *
   * <pre> -stemmer <spec>
   * The stemming algorithm (classname plus parameters) to use.</pre>
   *
   * <pre> -M <int>
   * The minimum term frequency (default = 1).</pre>
   *
   * <pre> -O
   * If this is set, the maximum number of words and the
   * minimum term frequency is not enforced on a per-class
   * basis but based on the documents in all the classes
   * (even if a class attribute is set).</pre>
   *
   * <pre> -stopwords <file>
   * A file containing stopwords to override the default ones.
   * Using this option automatically sets the flag ('-S') to use the
   * stoplist if the file exists.
   * Format: one stopword per line, lines starting with '#'
   * are interpreted as comments and ignored.</pre>
   *
   * <pre> -tokenizer <spec>
   * The tokenizing algorithm (classname plus parameters) to use.
   * (default: weka.core.tokenizers.WordTokenizer)</pre>
   *
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String value;

    value = Utils.getOption('R', options);
    if (value.length() != 0)
      setSelectedRange(value);
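
    // [Editor's sketch, not original code: the listing elides the rest of
    // this method.  Based on the option table above, the remaining options
    // are plausibly parsed as follows; treat the exact details as assumed.]
    value = Utils.getOption('P', options);
    if (value.length() != 0)
      setAttributeNamePrefix(value);

    value = Utils.getOption('W', options);
    if (value.length() != 0)
      setWordsToKeep(Integer.parseInt(value));

    setOutputWordCounts(Utils.getFlag('C', options));
    setTFTransform(Utils.getFlag('T', options));
    setIDFTransform(Utils.getFlag('I', options));
    setLowerCaseTokens(Utils.getFlag('L', options));
    // ... parsing of -V, -N, -S, -stemmer, -M, -O, -stopwords and
    // -tokenizer elided ...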
  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
      "Converts String attributes into a set of attributes representing "
      + "word occurrence (depending on the tokenizer) information from the "
      + "text contained in the strings. The set of words (attributes) is "
      + "determined by the first batch filtered (typically training data).";
  }
  /**
   * Gets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   */
  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "Specify range of attributes to act on."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }
  /**
   * Gets the current range selection.
   *
   * @return a string containing a comma separated list of ranges
   */
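
  // [Editor's example -- assumption: setSelectedRange(), called in
  // setOptions() above, accepts the same range syntax documented in
  // attributeIndicesTipText():
  //
  //   filter.setSelectedRange("first-3,5,6-10,last");
  // ]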
  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "Set attribute selection mode. If false, only selected"
      + " attributes in the range will be worked on; if"
      + " true, only non-selected attributes will be processed.";
  }
  /**
   * Gets whether the supplied columns are to be processed or skipped.
   *
   * @return true if the supplied columns will be kept
   */
  public void setWordsToKeep(int newWordsToKeep) {
    m_WordsToKeep = newWordsToKeep;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String wordsToKeepTipText() {
    return "The number of words (per class if there is a class attribute "+
           "assigned) to attempt to keep.";
  }
  /**
   * Gets the rate at which the dictionary is periodically pruned, as a
   * percentage of the dataset size.
   *
   * @return the rate at which the dictionary is periodically pruned
   */
  public double getPeriodicPruning() {
    return m_PeriodicPruningRate;
  }

  /**
   * Sets the rate at which the dictionary is periodically pruned, as a
   * percentage of the dataset size.
   *
   * @param newPeriodicPruning the rate at which the dictionary is periodically pruned
   */
  public void setPeriodicPruning(double newPeriodicPruning) {
    m_PeriodicPruningRate = newPeriodicPruning;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String periodicPruningTipText() {
    return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. "
      + "wordsToKeep prunes after creating a full dictionary. You may not have enough "
      + "memory for this approach.";
  }
  /**
   * Gets whether the word frequencies should be transformed into
   * log(1+fij), where fij is the frequency of word i in document(instance) j.
   *
   * @return true if word frequencies are to be transformed.
   */
  public boolean getTFTransform() {
    return this.m_TFTransform;
  }

  /**
   * Sets whether the word frequencies should be transformed into
   * log(1+fij) where fij is the frequency of word i in document(instance) j.
   *
   * @param TFTransform true if word frequencies are to be transformed.
   */
  public void setTFTransform(boolean TFTransform) {
    this.m_TFTransform = TFTransform;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String TFTransformTipText() {
    return "Sets whether the word frequencies should be transformed into:\n "+
           "   log(1+fij) \n"+
           "   where fij is the frequency of word i in document (instance) j.";
  }
  /**
   * Sets whether the word frequencies in a document should be transformed
   * into: <br>
   * fij*log(num of Docs/num of Docs with word i) <br>
   * where fij is the frequency of word i in document(instance) j.
   *
   * @param IDFTransform true if the word frequencies are to be transformed
   */
  public void setIDFTransform(boolean IDFTransform) {
    this.m_IDFTransform = IDFTransform;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String IDFTransformTipText() {
    return "Sets whether the word frequencies in a document should be "+
           "transformed into: \n"+
           "   fij*log(num of Docs/num of Docs with word i) \n"+
           "   where fij is the frequency of word i in document (instance) j.";
  }
  /**
   * Gets whether the word frequencies for a document (instance) should
   * be normalized or not.
   *
   * @return the current document-length normalization mode, as a SelectedTag
   */
  public SelectedTag getNormalizeDocLength() {
    return new SelectedTag(m_filterType, TAGS_FILTER);
  }

  /**
   * Sets whether the word frequencies for a document (instance) should
   * be normalized or not.
   *
   * @param newType the new type.
   */
  public void setNormalizeDocLength(SelectedTag newType) {
    if (newType.getTags() == TAGS_FILTER) {
      m_filterType = newType.getSelectedTag().getID();
    }
  }
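
  // [Editor's usage sketch: the normalization mode is chosen with a
  // SelectedTag built from the TAGS_FILTER constants defined near the top
  // of this class, e.g.:
  //
  //   filter.setNormalizeDocLength(
  //       new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL,
  //                       StringToWordVector.TAGS_FILTER));
  // ]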
  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String normalizeDocLengthTipText() {
    return "Sets whether the word frequencies for a document (instance) "+
           "should be normalized or not.";
  }

  /**
   * Gets whether the tokens are to be downcased or not.
   *
   * @return true if the tokens are to be downcased.
   */
  public boolean getLowerCaseTokens() {
    return this.m_lowerCaseTokens;
  }

  /**
   * Sets whether the tokens are to be downcased or not. (Doesn't affect
   * non-alphabetic characters in tokens.)
   */
  /**
   * Sorts an array (in place) using Shell sort with the gap sequence
   * 1, 4, 13, 40, ...  (Note: the sort treats the array as 1-based and
   * leaves array[0] in place.)
   *
   * @param array the array to sort
   */
  private static void sortArray(int [] array) {

    int i, j, h, N = array.length - 1;

    // find the largest gap in the sequence h = 3h+1 not exceeding N/9
    for (h = 1; h <= N / 9; h = 3 * h + 1);

    // shrink the gap, doing a gapped insertion sort at each step
    for (; h > 0; h /= 3) {
      for (i = h + 1; i <= N; i++) {
        int v = array[i];
        j = i;
        while (j > h && array[j - h] > v) {
          array[j] = array[j - h];
          j -= h;
        }
        array[j] = v;
      }
    }
  }
  /**
   * determines the selected range.
   */
  private void determineSelectedRange() {

    Instances inputFormat = getInputFormat();

    // Calculate the default set of fields to convert
    if (m_SelectedRange == null) {
      StringBuffer fields = new StringBuffer();
      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && !instance.isMissing(j)) {

          // Get tokenizer
          m_Tokenizer.tokenize(instance.stringValue(j));

          // Iterate through tokens, perform stemming, and remove stopwords
          // (if required)
          while (m_Tokenizer.hasMoreElements()) {
            String word = ((String)m_Tokenizer.nextElement()).intern();

            if (m_lowerCaseTokens)
              word = word.toLowerCase();

            word = m_Stemmer.stem(word);

            if (m_useStoplist)
              if (stopwords.is(word))
                continue;

            // bug fix: use containsKey(), not contains(), to test whether the
            // word was already seen in this document -- Hashtable.contains()
            // searches the values, not the keys
            if (!h.containsKey(word))
              h.put(word, new Integer(0));

            Count count = (Count)dictionaryArr[vInd].get(word);
            if (count == null) {
              dictionaryArr[vInd].put(word, new Count(1));
            } else {
              count.count++;
            }
          }
        }
      }
      //updating the docCount for the words that have occurred in this
      //instance(document).
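
      // [Editor's sketch: the Count helper used above is defined elsewhere
      // in this file and is elided from this listing.  A minimal
      // reconstruction consistent with its uses here (new Count(1),
      // count.count, and the docCount update just mentioned) would be:
      //
      //   private static class Count implements java.io.Serializable {
      //     public int count;     // term frequency within the current batch
      //     public int docCount;  // number of documents containing the word
      //     public Count(int c) { count = c; }
      //   }
      // ]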
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        array[pos] = count.count;
        pos++;
      }

      // sort the array
      sortArray(array);
      if (array.length < m_WordsToKeep) {
        // if there aren't enough words, set the threshold to
        // minFreq
        prune[z] = m_minTermFreq;
      } else {
        // otherwise set it to be at least minFreq
        prune[z] = Math.max(m_minTermFreq,
                            array[array.length - m_WordsToKeep]);
      }
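
      // [Editor's worked example: with m_WordsToKeep = 1000, m_minTermFreq = 1
      // and 1500 distinct words, the threshold is array[1500 - 1000], i.e. the
      // count of the 1000th most frequent word after the ascending sort; any
      // word whose count falls below prune[z] is discarded further down.]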
    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize +
                                           getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) {
        if (getInputFormat().classIndex() == i) {
          classIndex = attributes.size();
        }
        attributes.addElement(getInputFormat().attribute(i).copy());
      }
    }
    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        if (count.count >= prune[z]) {
          if (newDictionary.get(word) == null) {
            newDictionary.put(word, new Integer(index++));
            attributes.addElement(new Attribute(m_Prefix + word));
          }
        }
      }
    }
    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
          // Note that the first string value in a
          // SparseInstance doesn't get printed.
          outputFormatPeek().attribute(firstCopy)
            .addStringValue("Hack to defeat SparseInstance bug");

          int newIndex = outputFormatPeek().attribute(firstCopy)
            .addStringValue(instance.stringValue(i));
          contained.put(new Integer(firstCopy),
                        new Double(newIndex));
    for (int j = 0; j < instance.numAttributes(); j++) {
      //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
      if (m_SelectedRange.isInRange(j)
          && !instance.isMissing(j)) {

        m_Tokenizer.tokenize(instance.stringValue(j));

        while (m_Tokenizer.hasMoreElements()) {
          String word = (String)m_Tokenizer.nextElement();
          if (m_lowerCaseTokens)
            word = word.toLowerCase();
          word = m_Stemmer.stem(word);
          Integer index = (Integer) m_Dictionary.get(word);
          if (index != null) {
            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup
              Double count = (Double)contained.get(index);
              if (count != null) {
                contained.put(index, new Double(count.doubleValue() + 1.0));
              } else {
                contained.put(index, new Double(1));
              }
            } else {
              contained.put(index, new Double(1));
            }
          }
        }
      }
    }
    // Doing TFTransform
    if (m_TFTransform) {
      Iterator it = contained.keySet().iterator();
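      // [Editor's sketch -- the listing breaks off here.  Based on the
      // log(1+fij) transform documented above, the loop plausibly continues
      // as follows; this completion is an assumption, not the original code.]
      while (it.hasNext()) {
        Integer index = (Integer)it.next();
        if (index.intValue() >= firstCopy) {  // only transform word attributes
          double val = ((Double)contained.get(index)).doubleValue();
          contained.put(index, new Double(Math.log(val + 1)));
        }
      }
    }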