#!/usr/bin/python
# -*- coding: utf-8 -*-
#
"""
    Runs collection of machine learning algorithms for data mining tasks available in Weka.
    
7 8
    Hall, Mark, et al, The WEKA data mining software: an update, ACM SIGKDD explorations newsletter, 2009.
    
9 10 11 12
    Name: weka_classifiers.py
    Author: Alessandro dos Santos Ferreira ( santosferreira.alessandro@gmail.com )
"""

import copy
from collections import OrderedDict

import weka.core.jvm as jvm
from weka.core.converters import Loader as WLoader
from weka.classifiers import Classifier as WClassifier
from weka.classifiers import Evaluation as WEvaluation
from weka.core.classes import Random as WRandom

from util.config import Config
from util.file_utils import File
from util.utils import TimeUtils

from weka_alias import WekaAlias
from classifier import Classifier

class WekaClassifiers(Classifier):
    """Classifier implementation backed by Weka through python-weka-wrapper.

    The instance keeps two ``Config`` entries (``classname`` and ``options``)
    plus the loaded training data and the built Weka classifier. ``data`` and
    ``classifier`` are ``None`` until ``train`` has run.
    """

    # Value of the "Options" config meaning "pass no explicit options to Weka".
    _DEFAULT_OPTIONS = 'default'

    # Aliases that experimenter() never evaluates.
    # NOTE(review): presumably excluded because they are too slow -- confirm.
    _EXPERIMENTER_SKIPPED = ('KStar', 'LWL', 'MultilayerPerceptron')

    # Start of the error text that experimenter() silently ignores.
    _NOT_A_CLASSIFIER_PREFIX = 'Object does not implement or subclass weka.classifiers.Classifier: '

    def __init__(self, classname="weka.classifiers.functions.SMO", options='default'):
        """Start the JVM if needed and store the initial configuration.

        Parameters
        ----------
        classname : str
            Fully qualified Weka classifier class name.
        options : str
            Whitespace-separated Weka options, or 'default' to pass none.
        """
        if not jvm.started:
            jvm.start()

        self.classname = Config("ClassName", classname, str)
        self.options = Config("Options", options, str)

        self.reset()

    def get_config(self):
        """Return the configuration as an OrderedDict of ``Config`` objects.

        The "classname" entry carries only the last dotted component of the
        class name (e.g. "SMO"). It is a shallow copy, so shortening it does
        not overwrite the fully qualified name stored on the instance, which
        ``train`` still needs.

        Returns
        -------
        OrderedDict
            Keys "classname" and "options", in that order.
        """
        weka_config = OrderedDict()

        short_classname = copy.copy(self.classname)
        short_classname.value = short_classname.value.split('.')[-1]
        weka_config["classname"] = short_classname

        weka_config["options"] = self.options

        return weka_config

    def set_config(self, configs):
        """Update the configuration from a mapping of ``Config`` objects.

        The value of ``configs["classname"]`` is treated as an alias and is
        replaced, in place, by the result of ``WekaAlias.get_classifier``
        before being stored.

        Parameters
        ----------
        configs : dict
            Must contain the keys "classname" and "options".
        """
        configs["classname"].value = WekaAlias.get_classifier(configs["classname"].value)

        self.classname = Config.nvl_config(configs["classname"], self.classname)
        self.options = Config.nvl_config(configs["options"], self.options)

    def get_summary_config(self):
        """Return the configuration as text, one "label: value" line per entry.

        Returns
        -------
        str
            Lines for the class name and the options, each ending in a newline.
        """
        weka_config = OrderedDict()

        weka_config[self.classname.label] = self.classname.value
        weka_config[self.options.label] = self.options.value

        return ''.join(
            "%s: %s\n" % (label, str(value)) for label, value in weka_config.items()
        )

    def must_train(self):
        """Return True: this classifier always requires a training step."""
        return True

    def train(self, dataset, training_data, force = False):
        """Load the training ARFF file and build the Weka classifier.

        Does nothing when data is already loaded, unless ``force`` is set, in
        which case the current state is reset and training runs again.

        Parameters
        ----------
        dataset : str
            Location handed to ``File.make_path`` together with the file name.
        training_data : str
            Name of the training file without the ".arff" extension.
        force : bool
            Retrain even if training data is already loaded.
        """
        if self.data is not None and not force:
            return

        if self.data is not None:
            self.reset()

        loader = WLoader(classname="weka.core.converters.ArffLoader")

        training_file = File.make_path(dataset, training_data + ".arff")
        self.data = loader.load_file(training_file)
        self.data.class_is_last()

        if self.options.value == self._DEFAULT_OPTIONS:
            options = None
        else:
            options = self.options.value.split()

        self.classifier = WClassifier(classname=self.classname.value, options=options)
        self.classifier.build_classifier(self.data)

    def classify(self, dataset, test_dir, test_data):
        """Predict a class label for every instance of a test ARFF file.

        Parameters
        ----------
        dataset : str
            Location handed to ``File.make_path`` together with the file name.
        test_dir : str
            Not used by this implementation.
        test_data : str
            Name of the test file. Unlike ``train``, no ".arff" extension is
            appended here.

        Returns
        -------
        list of str
            For each instance, the class label with the highest value in the
            distribution returned by the classifier.
        """
        loader = WLoader(classname="weka.core.converters.ArffLoader")

        test_file = File.make_path(dataset, test_data)
        predict_data = loader.load_file(test_file)
        predict_data.class_is_last()

        class_attribute = predict_data.class_attribute
        values = [str(class_attribute.value(i)) for i in range(class_attribute.num_values)]

        classes = []
        for inst in predict_data:
            prediction = self.classifier.distribution_for_instance(inst)
            classes.append(values[prediction.argmax()])

        return classes

    def cross_validate(self, detail = True):
        """Run 10-fold cross-validation on the training data and report it.

        Parameters
        ----------
        detail : bool
            When true, the report includes the dataset description, the
            evaluation summary, per-class details and the confusion matrix.
            Otherwise it only includes the percentage of correctly classified
            instances.

        Returns
        -------
        str
            The textual report.
        """
        start_time = TimeUtils.get_time()

        scheme_options = " ".join([str(option) for option in self.classifier.options])
        info = "Scheme:\t%s %s\n" % (str(self.classifier.classname), scheme_options)

        if detail:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)

        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))

        if not detail:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        # The elapsed time covers the whole cross-validation, not a single build.
        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

        if detail:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"

            info += str(evl.class_details()) + "\n\n"

            class_attribute = self.data.class_attribute
            classes = [str(class_attribute.value(i)) for i in range(class_attribute.num_values)]
            cm = evl.confusion_matrix
            info += Classifier.confusion_matrix(classes, cm)

        return info

    def experimenter(self):
        """Evaluate every known classifier alias on the training data.

        Aliases are visited in sorted order, skipping the ones listed in
        ``_EXPERIMENTER_SKIPPED``. Each scheme is evaluated with a train/test
        split (66, seeded with ``WRandom(1)``).

        Returns
        -------
        str
            One section per scheme with its accuracy and elapsed time, or an
            "Exception in ..." line when evaluating the scheme failed.
        """
        info = ""

        aliases = WekaAlias.get_aliases()
        for alias in sorted(aliases):
            try:
                if alias in self._EXPERIMENTER_SKIPPED:
                    continue

                start_time = TimeUtils.get_time()

                classifier = WClassifier(classname=WekaAlias.get_classifier(alias))

                scheme_options = " ".join([str(option) for option in classifier.options])
                info += "Scheme:\t%s %s\n" % (str(classifier.classname), scheme_options)

                evl = WEvaluation(self.data)
                evl.evaluate_train_test_split(classifier, self.data, 66, WRandom(1))

                info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)
                info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

            except Exception as e:
                message = str(e)
                # This particular error is skipped silently. The type is
                # rendered as "__builtin__.NoneType" on Python 2 and
                # "builtins.NoneType" on Python 3, so match prefix and suffix
                # instead of the whole text.
                # NOTE(review): presumably raised when the scheme cannot be
                # instantiated in the local Weka install -- confirm.
                unavailable = (message.startswith(self._NOT_A_CLASSIFIER_PREFIX)
                               and message.endswith('.NoneType'))
                if not unavailable:
                    info += "Exception in %s: %s\n\n" % (aliases[alias], message)

        return info

    def reset(self):
        """Drop the loaded training data and the built classifier."""
        self.data = None
        self.classifier = None