2
* This program is free software; you can redistribute it and/or modify
3
* it under the terms of the GNU General Public License as published by
4
* the Free Software Foundation; either version 2 of the License, or
5
* (at your option) any later version.
7
* This program is distributed in the hope that it will be useful,
8
* but WITHOUT ANY WARRANTY; without even the implied warranty of
9
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
* GNU General Public License for more details.
12
* You should have received a copy of the GNU General Public License
13
* along with this program; if not, write to the Free Software
14
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
* PredictiveApriori.java
19
* Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
23
package weka.associations;
25
import weka.core.Capabilities;
26
import weka.core.FastVector;
27
import weka.core.Instances;
28
import weka.core.Option;
29
import weka.core.OptionHandler;
30
import weka.core.TechnicalInformation;
31
import weka.core.TechnicalInformationHandler;
32
import weka.core.Utils;
33
import weka.core.Capabilities.Capability;
34
import weka.core.TechnicalInformation.Field;
35
import weka.core.TechnicalInformation.Type;
37
import java.util.Enumeration;
38
import java.util.Hashtable;
39
import java.util.TreeSet;
40
import java.util.Vector;
43
<!-- globalinfo-start -->
44
* Class implementing the predictive apriori algorithm to mine association rules.<br/>
45
* It searches with an increasing support threshold for the best 'n' rules concerning a support-based corrected confidence value.<br/>
47
* For more information see:<br/>
49
* Tobias Scheffer: Finding Association Rules That Trade Support Optimally against Confidence. In: 5th European Conference on Principles of Data Mining and Knowledge Discovery, 424-435, 2001.<br/>
51
* The implementation follows the paper expect for adding a rule to the output of the 'n' best rules. A rule is added if:<br/>
52
* the expected predictive accuracy of this rule is among the 'n' best and it is not subsumed by a rule with at least the same expected predictive accuracy (out of an unpublished manuscript from T. Scheffer).
54
<!-- globalinfo-end -->
56
<!-- technical-bibtex-start -->
59
* @inproceedings{Scheffer2001,
60
* author = {Tobias Scheffer},
61
* booktitle = {5th European Conference on Principles of Data Mining and Knowledge Discovery},
63
* publisher = {Springer},
64
* title = {Finding Association Rules That Trade Support Optimally against Confidence},
69
<!-- technical-bibtex-end -->
71
<!-- options-start -->
72
* Valid options are: <p/>
74
* <pre> -N <required number of rules output>
75
* The required number of rules. (default = 100)</pre>
78
* If set class association rules are mined. (default = no)</pre>
80
* <pre> -c <the class index>
81
* The class index. (default = last)</pre>
85
* @author Stefan Mutter (mutter@cs.waikato.ac.nz)
86
* @version $Revision: 1.11 $ */
88
public class PredictiveApriori
90
implements OptionHandler, CARuleMiner, TechnicalInformationHandler {
92
/** for serialization */
93
static final long serialVersionUID = 8109088846865075341L;
95
/** The minimum support. */
96
protected int m_premiseCount;
98
/** The maximum number of rules that are output. */
99
protected int m_numRules;
101
/** The number of rules created for the prior estimation. */
102
protected static final int m_numRandRules = 1000;
104
/** The number of intervals used for the prior estimation. */
105
protected static final int m_numIntervals = 100;
107
/** The set of all sets of itemsets. */
108
protected FastVector m_Ls;
110
/** The same information stored in hash tables. */
111
protected FastVector m_hashtables;
113
/** The list of all generated rules. */
114
protected FastVector[] m_allTheRules;
116
/** The instances (transactions) to be used for generating
117
the association rules. */
118
protected Instances m_instances;
120
/** The hashtable containing the prior probabilities. */
121
protected Hashtable m_priors;
123
/** The mid points of the intervals used for the prior estimation. */
124
protected double[] m_midPoints;
126
/** The expected predictive accuracy a rule needs to be a candidate for the output. */
127
protected double m_expectation;
129
/** The n best rules. */
130
protected TreeSet m_best;
132
/** Flag keeping track if the list of the n best rules has changed. */
133
protected boolean m_bestChanged;
135
/** Counter for the time of generation for an association rule. */
136
protected int m_count;
138
/** The prior estimator. */
139
protected PriorEstimation m_priorEstimator;
141
/** The class index. */
142
protected int m_classIndex;
144
/** Flag indicating whether class association rules are mined. */
145
protected boolean m_car;
148
* Returns a string describing this associator
149
* @return a description of the evaluator suitable for
150
* displaying in the explorer/experimenter gui
152
public String globalInfo() {
154
"Class implementing the predictive apriori algorithm to mine "
155
+ "association rules.\n"
156
+ "It searches with an increasing support threshold for the best 'n' "
157
+ "rules concerning a support-based corrected confidence value.\n\n"
158
+ "For more information see:\n\n"
159
+ getTechnicalInformation().toString() + "\n\n"
160
+ "The implementation follows the paper expect for adding a rule to the "
161
+ "output of the 'n' best rules. A rule is added if:\n"
162
+ "the expected predictive accuracy of this rule is among the 'n' best "
163
+ "and it is not subsumed by a rule with at least the same expected "
164
+ "predictive accuracy (out of an unpublished manuscript from T. "
169
* Returns an instance of a TechnicalInformation object, containing
170
* detailed information about the technical background of this class,
171
* e.g., paper reference or book this class is based on.
173
* @return the technical information about this class
175
public TechnicalInformation getTechnicalInformation() {
176
TechnicalInformation result;
178
result = new TechnicalInformation(Type.INPROCEEDINGS);
179
result.setValue(Field.AUTHOR, "Tobias Scheffer");
180
result.setValue(Field.TITLE, "Finding Association Rules That Trade Support Optimally against Confidence");
181
result.setValue(Field.BOOKTITLE, "5th European Conference on Principles of Data Mining and Knowledge Discovery");
182
result.setValue(Field.YEAR, "2001");
183
result.setValue(Field.PAGES, "424-435");
184
result.setValue(Field.PUBLISHER, "Springer");
190
* Constructor that allows to sets default values for the
191
* minimum confidence and the maximum number of rules
192
* the minimum confidence.
194
public PredictiveApriori() {
200
* Resets the options to the default values.
202
public void resetOptions() {
206
m_best = new TreeSet();
207
m_bestChanged = false;
212
m_priors = new Hashtable();
219
* Returns default capabilities of the classifier.
221
* @return the capabilities of this classifier
223
public Capabilities getCapabilities() {
224
Capabilities result = super.getCapabilities();
227
result.enable(Capability.NOMINAL_ATTRIBUTES);
228
result.enable(Capability.MISSING_VALUES);
231
result.enable(Capability.NOMINAL_CLASS);
232
result.enable(Capability.MISSING_CLASS_VALUES);
238
* Method that generates all large itemsets with a minimum support, and from
239
* these all association rules.
241
* @param instances the instances to be used for generating the associations
242
* @throws Exception if rules can't be built successfully
244
public void buildAssociations(Instances instances) throws Exception {
246
int temp = m_premiseCount, exactNumber = m_numRules-5;
249
m_best = new TreeSet();
250
m_bestChanged = false;
253
m_instances = new Instances(instances);
255
if (m_classIndex == -1)
256
m_instances.setClassIndex(m_instances.numAttributes()-1);
257
else if (m_classIndex < m_instances.numAttributes() && m_classIndex >= 0)
258
m_instances.setClassIndex(m_classIndex);
260
throw new Exception("Invalid class index.");
262
// can associator handle the data?
263
getCapabilities().testWithFail(m_instances);
266
m_priorEstimator = new PriorEstimation(m_instances,m_numRandRules,m_numIntervals,m_car);
267
m_priors = m_priorEstimator.estimatePrior();
268
m_midPoints = m_priorEstimator.getMidPoints();
270
m_Ls = new FastVector();
271
m_hashtables = new FastVector();
273
for(int i =1; i < m_instances.numAttributes();i++){
274
m_bestChanged = false;
276
// find large item sets
277
findLargeItemSets(i);
279
//find association rules (rule generation procedure)
283
findLargeCarItemSets(i);
284
findCaRulesQuickly();
288
temp =m_premiseCount;
289
while(RuleGeneration.expectation(m_premiseCount, m_premiseCount,m_midPoints,m_priors) <= m_expectation){
291
if(m_premiseCount > m_instances.numInstances())
295
if(m_premiseCount > m_instances.numInstances()){
297
// Reserve space for variables
298
m_allTheRules = new FastVector[3];
299
m_allTheRules[0] = new FastVector();
300
m_allTheRules[1] = new FastVector();
301
m_allTheRules[2] = new FastVector();
304
while(m_best.size()>0 && exactNumber > 0){
305
m_allTheRules[0].insertElementAt((ItemSet)((RuleItem)m_best.last()).premise(),k);
306
m_allTheRules[1].insertElementAt((ItemSet)((RuleItem)m_best.last()).consequence(),k);
307
m_allTheRules[2].insertElementAt(new Double(((RuleItem)m_best.last()).accuracy()),k);
314
if(temp != m_premiseCount && m_Ls.size() > 0){
315
FastVector kSets = (FastVector)m_Ls.lastElement();
316
m_Ls.removeElementAt(m_Ls.size()-1);
317
kSets = ItemSet.deleteItemSets(kSets, m_premiseCount,Integer.MAX_VALUE);
318
m_Ls.addElement(kSets);
322
// Reserve space for variables
323
m_allTheRules = new FastVector[3];
324
m_allTheRules[0] = new FastVector();
325
m_allTheRules[1] = new FastVector();
326
m_allTheRules[2] = new FastVector();
329
while(m_best.size()>0 && exactNumber > 0){
330
m_allTheRules[0].insertElementAt((ItemSet)((RuleItem)m_best.last()).premise(),k);
331
m_allTheRules[1].insertElementAt((ItemSet)((RuleItem)m_best.last()).consequence(),k);
332
m_allTheRules[2].insertElementAt(new Double(((RuleItem)m_best.last()).accuracy()),k);
339
* Method that mines the n best class association rules.
340
* @return an sorted array of FastVector (depending on the expected predictive accuracy) containing the rules and metric information
341
* @param data the instances for which class association rules should be mined
342
* @throws Exception if rules can't be built successfully
344
public FastVector[] mineCARs(Instances data) throws Exception{
347
m_best = new TreeSet();
349
m_bestChanged = false;
352
buildAssociations(data);
353
FastVector[] allCARRules = new FastVector[3];
354
allCARRules[0] = new FastVector();
355
allCARRules[1] = new FastVector();
356
allCARRules[2] = new FastVector();
357
for(int k =0; k < m_allTheRules[0].size();k++){
358
int[] newPremiseArray = new int[m_instances.numAttributes()-1];
360
for(int j = 0;j < m_instances.numAttributes();j++){
361
if(j != m_instances.classIndex()){
362
newPremiseArray[help] = ((ItemSet)m_allTheRules[0].elementAt(k)).itemAt(j);
366
ItemSet newPremise = new ItemSet(m_instances.numInstances(), newPremiseArray);
367
newPremise.setCounter (((ItemSet)m_allTheRules[0].elementAt(k)).counter());
368
allCARRules[0].addElement(newPremise);
369
int[] newConsArray = new int[1];
370
newConsArray[0] =((ItemSet)m_allTheRules[1].elementAt(k)).itemAt(m_instances.classIndex());
371
ItemSet newCons = new ItemSet(m_instances.numInstances(), newConsArray);
372
newCons.setCounter(((ItemSet)m_allTheRules[1].elementAt(k)).counter());
373
allCARRules[1].addElement(newCons);
374
allCARRules[2].addElement(m_allTheRules[2].elementAt(k));
381
* Gets the instances without the class attribute
382
* @return instances without class attribute
384
public Instances getInstancesNoClass() {
386
Instances noClass = null;
388
noClass = LabeledItemSet.divide(m_instances,false);
392
System.out.println("\n"+e.getMessage());
394
//System.out.println(noClass);
399
* Gets the class attribute of all instances
400
* @return Instances containing only the class attribute
402
public Instances getInstancesOnlyClass() {
404
Instances onlyClass = null;
406
onlyClass = LabeledItemSet.divide(m_instances,true);
410
System.out.println("\n"+e.getMessage());
417
* Returns an enumeration describing the available options.
419
* @return an enumeration of all the available options.
421
public Enumeration listOptions() {
423
String string1 = "\tThe required number of rules. (default = " + (m_numRules-5) + ")",
424
string2 = "\tIf set class association rules are mined. (default = no)",
425
string3 = "\tThe class index. (default = last)";
426
FastVector newVector = new FastVector(3);
428
newVector.addElement(new Option(string1, "N", 1,
429
"-N <required number of rules output>"));
430
newVector.addElement(new Option(string2, "A", 0,
432
newVector.addElement(new Option(string3, "c", 1,
433
"-c <the class index>"));
434
return newVector.elements();
439
* Parses a given list of options. <p/>
441
<!-- options-start -->
442
* Valid options are: <p/>
444
* <pre> -N <required number of rules output>
445
* The required number of rules. (default = 100)</pre>
448
* If set class association rules are mined. (default = no)</pre>
450
* <pre> -c <the class index>
451
* The class index. (default = last)</pre>
455
* @param options the list of options as an array of strings
456
* @throws Exception if an option is not supported
458
public void setOptions(String[] options) throws Exception {
462
String numRulesString = Utils.getOption('N', options);
463
if (numRulesString.length() != 0)
464
m_numRules = Integer.parseInt(numRulesString)+5;
466
m_numRules = Integer.MAX_VALUE;
468
String classIndexString = Utils.getOption('c',options);
469
if (classIndexString.length() != 0)
470
m_classIndex = Integer.parseInt(classIndexString);
472
m_car = Utils.getFlag('A', options);
476
* Gets the current settings of the PredictiveApriori object.
478
* @return an array of strings suitable for passing to setOptions
480
public String [] getOptions() {
483
result = new Vector();
486
result.add("" + (m_numRules-5));
492
result.add("" + m_classIndex);
494
return (String[]) result.toArray(new String[result.size()]);
499
* Outputs the association rules.
501
* @return a string representation of the model
503
public String toString() {
505
StringBuffer text = new StringBuffer();
507
if (m_allTheRules[0].size() == 0)
508
return "\nNo large itemsets and rules found!\n";
509
text.append("\nPredictiveApriori\n===================\n\n");
510
text.append("\nBest rules found:\n\n");
512
for (int i = 0; i < m_allTheRules[0].size(); i++) {
513
text.append(Utils.doubleToString((double)i+1,
514
(int)(Math.log(m_numRules)/Math.log(10)+1),0)+
515
". " + ((ItemSet)m_allTheRules[0].elementAt(i)).
516
toString(m_instances)
517
+ " ==> " + ((ItemSet)m_allTheRules[1].elementAt(i)).
518
toString(m_instances) +" acc:("+
519
Utils.doubleToString(((Double)m_allTheRules[2].
520
elementAt(i)).doubleValue(),5)+")");
526
return text.toString();
531
* Returns the tip text for this property
532
* @return tip text for this property suitable for
533
* displaying in the explorer/experimenter gui
535
public String numRulesTipText() {
536
return "Number of rules to find.";
540
* Get the value of the number of required rules.
542
* @return Value of the number of required rules.
544
public int getNumRules() {
550
* Set the value of required rules.
552
* @param v Value to assign to number of required rules.
554
public void setNumRules(int v) {
560
* Sets the class index
561
* @param index the index of the class attribute
563
public void setClassIndex(int index){
565
m_classIndex = index;
569
* Gets the index of the class attribute
570
* @return the index of the class attribute
572
public int getClassIndex(){
578
* Returns the tip text for this property
579
* @return tip text for this property suitable for
580
* displaying in the explorer/experimenter gui
582
public String classIndexTipText() {
583
return "Index of the class attribute.\n If set to -1, the last attribute will be taken as the class attribute.";
587
* Sets class association rule mining
588
* @param flag if class association rules are mined, false otherwise
590
public void setCar(boolean flag){
596
* Gets whether class association ruels are mined
597
* @return true if class association rules are mined, false otherwise
599
public boolean getCar(){
605
* Returns the tip text for this property
606
* @return tip text for this property suitable for
607
* displaying in the explorer/experimenter gui
609
public String carTipText() {
610
return "If enabled class association rules are mined instead of (general) association rules.";
614
* Returns the metric string for the chosen metric type.
615
* Predictive apriori uses the estimated predictive accuracy.
616
* Therefore the metric string is "acc".
617
* @return string "acc"
619
public String metricString() {
626
* Method that finds all large itemsets for the given set of instances.
628
* @param index the instances to be used
629
* @throws Exception if an attribute is numeric
631
private void findLargeItemSets(int index) throws Exception {
633
FastVector kMinusOneSets, kSets = new FastVector();
636
// Find large itemsets
639
kSets = ItemSet.singletons(m_instances);
640
ItemSet.upDateCounters(kSets, m_instances);
641
kSets = ItemSet.deleteItemSets(kSets, m_premiseCount,Integer.MAX_VALUE);
642
if (kSets.size() == 0)
644
m_Ls.addElement(kSets);
649
kSets = (FastVector)m_Ls.lastElement();
650
m_Ls.removeAllElements();
652
kMinusOneSets = kSets;
653
kSets = ItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances());
654
hashtable = ItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size());
655
m_hashtables.addElement(hashtable);
656
kSets = ItemSet.pruneItemSets(kSets, hashtable);
657
ItemSet.upDateCounters(kSets, m_instances);
658
kSets = ItemSet.deleteItemSets(kSets, m_premiseCount,Integer.MAX_VALUE);
659
if(kSets.size() == 0)
661
m_Ls.addElement(kSets);
669
* Method that finds all association rules.
671
* @throws Exception if an attribute is numeric
673
private void findRulesQuickly() throws Exception {
675
RuleGeneration currentItemSet;
678
for (int j = 0; j < m_Ls.size(); j++) {
679
FastVector currentItemSets = (FastVector)m_Ls.elementAt(j);
680
Enumeration enumItemSets = currentItemSets.elements();
681
while (enumItemSets.hasMoreElements()) {
682
currentItemSet = new RuleGeneration((ItemSet)enumItemSets.nextElement());
683
m_best = currentItemSet.generateRules(m_numRules, m_midPoints,m_priors,m_expectation,
684
m_instances,m_best,m_count);
686
m_count = currentItemSet.m_count;
687
if(!m_bestChanged && currentItemSet.m_change)
688
m_bestChanged = true;
689
//update minimum expected predictive accuracy to get into the n best
691
m_expectation = ((RuleItem)m_best.first()).accuracy();
692
else m_expectation =0;
699
* Method that finds all large itemsets for class association rule mining for the given set of instances.
700
* @param index the size of the large item sets
701
* @throws Exception if an attribute is numeric
703
private void findLargeCarItemSets(int index) throws Exception {
705
FastVector kMinusOneSets, kSets = new FastVector();
708
// Find large itemsets
710
kSets = CaRuleGeneration.singletons(m_instances);
711
ItemSet.upDateCounters(kSets, m_instances);
712
kSets = ItemSet.deleteItemSets(kSets, m_premiseCount,Integer.MAX_VALUE);
713
if (kSets.size() == 0)
715
m_Ls.addElement(kSets);
720
kSets = (FastVector)m_Ls.lastElement();
721
m_Ls.removeAllElements();
723
kMinusOneSets = kSets;
724
kSets = ItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances());
725
hashtable = ItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size());
726
m_hashtables.addElement(hashtable);
727
kSets = ItemSet.pruneItemSets(kSets, hashtable);
728
ItemSet.upDateCounters(kSets, m_instances);
729
kSets = ItemSet.deleteItemSets(kSets, m_premiseCount,Integer.MAX_VALUE);
730
if(kSets.size() == 0)
732
m_Ls.addElement(kSets);
737
* Method that finds all class association rules.
739
* @throws Exception if an attribute is numeric
741
private void findCaRulesQuickly() throws Exception {
743
CaRuleGeneration currentLItemSet;
745
for (int j = 0; j < m_Ls.size(); j++) {
746
FastVector currentItemSets = (FastVector)m_Ls.elementAt(j);
747
Enumeration enumItemSets = currentItemSets.elements();
748
while (enumItemSets.hasMoreElements()) {
749
currentLItemSet = new CaRuleGeneration((ItemSet)enumItemSets.nextElement());
750
m_best = currentLItemSet.generateRules(m_numRules, m_midPoints,m_priors,m_expectation,
751
m_instances,m_best,m_count);
752
m_count = currentLItemSet.count();
753
if(!m_bestChanged && currentLItemSet.change())
754
m_bestChanged = true;
756
m_expectation = ((RuleItem)m_best.first()).accuracy();
764
* returns all the rules
766
* @return all the rules
767
* @see #m_allTheRules
769
public FastVector[] getAllTheRules() {
770
return m_allTheRules;
776
* @param args the commandline parameters
778
public static void main(String[] args) {
779
runAssociator(new PredictiveApriori(), args);