#!/usr/bin/python
# -*- coding: utf-8 -*-
#
"""
    Runs collection of machine learning algorithms for data mining tasks available in Weka.
    
    Hall, Mark, et al, The WEKA data mining software: an update, ACM SIGKDD explorations newsletter, 2009.
    
    Name: weka_classifiers.py
    Author: Alessandro dos Santos Ferreira ( santosferreira.alessandro@gmail.com )
"""

import weka.core.jvm as jvm

from weka.core.converters import Loader as WLoader
from weka.classifiers import Classifier as WClassifier
from weka.classifiers import Evaluation as WEvaluation
from weka.core.classes import Random as WRandom

from collections import OrderedDict

from util.config import Config
from util.file_utils import File
from util.utils import TimeUtils

from weka_alias import WekaAlias
from classifier import Classifier


class WekaClassifiers(Classifier):
    """Front-end to the classifiers available in python-weka-wrapper.

    Wraps a single Weka classifier (SMO by default) behind the project's
    generic ``Classifier`` interface: ARFF-based training, classification,
    cross validation and a simple experimenter over all aliased classifiers.
    """

    def __init__(self, classname="weka.classifiers.functions.SMO", options='default'):
        """Constructor.

        Parameters
        ----------
        classname : string, optional, default = 'weka.classifiers.functions.SMO'
            Fully qualified Weka class name of the classifier.
        options : string, optional, default = 'default'
            Command-line style options passed to the classifier. The literal
            string 'default' means "use Weka's default options".
        """
        # python-weka-wrapper needs a running JVM; start it only once.
        if not jvm.started:
            jvm.start()

        self.classname = Config("ClassName", classname, str)
        self.options = Config("Options", options, str)

        self.reset()

    def get_config(self):
        """Return configuration of classifier.

        Returns
        -------
        config : OrderedDict
            Current configs of classifier.

        Notes
        -----
        The returned dict shares the ``Config`` objects with this instance,
        and the classname value is shortened IN PLACE to its last dotted
        component (the alias shown in the UI); ``set_config`` maps it back
        to the fully qualified name via ``WekaAlias``.
        """
        weka_config = OrderedDict()

        weka_config["classname"] = self.classname
        weka_config["classname"].value = weka_config["classname"].value.split('.')[-1]

        weka_config["options"] = self.options

        return weka_config

    def set_config(self, configs):
        """Update configuration of classifier.

        Parameters
        ----------
        configs : OrderedDict
            New configs of classifier. The classname may be a short alias;
            it is resolved to a fully qualified Weka class name first.
        """
        configs["classname"].value = WekaAlias.get_classifier(configs["classname"].value)

        self.classname = Config.nvl_config(configs["classname"], self.classname)
        self.options = Config.nvl_config(configs["options"], self.options)

    def get_summary_config(self):
        """Return formatted summary of configuration.

        Returns
        -------
        summary : string
            Formatted string with summary of configuration.
        """
        weka_config = OrderedDict()

        weka_config[self.classname.label] = self.classname.value
        weka_config[self.options.label] = self.options.value

        summary = ''
        for config in weka_config:
            summary += "%s: %s\n" % (config, str(weka_config[config]))

        return summary

    def must_train(self):
        """Return if classifier must be trained.

        Returns
        -------
        True
        """
        return True

    def train(self, dataset, training_data, force=False):
        """Perform the training of classifier.

        Parameters
        ----------
        dataset : string
            Path to image dataset.
        training_data : string
            Name of ARFF training file (without the '.arff' extension).
        force : boolean, optional, default = False
            If False don't perform new training if there is trained data.
        """
        # Already trained and not forced: keep the existing model.
        if self.data is not None and not force:
            return

        if self.data is not None:
            self.reset()

        loader = WLoader(classname="weka.core.converters.ArffLoader")

        training_file = File.make_path(dataset, training_data + ".arff")
        self.data = loader.load_file(training_file)
        # Weka convention: the class attribute is the last column.
        self.data.class_is_last()

        # 'default' means: let Weka pick the classifier's default options.
        options = None if self.options.value == 'default' else self.options.value.split()
        self.classifier = WClassifier(classname=self.classname.value, options=options)
        self.classifier.build_classifier(self.data)

    def classify(self, dataset, test_dir, test_data, image):
        """Perform the classification.

        Parameters
        ----------
        dataset : string
            Path to image dataset.
        test_dir : string
            Not used.
        test_data : string
            Name of test data file (ARFF).
        image : unused
            Kept for interface compatibility with other classifiers; not
            used here.

        Returns
        -------
        classes : list of string
            List of predicted classes for each instance in test data in
            ordered way.
        """
        loader = WLoader(classname="weka.core.converters.ArffLoader")

        test_file = File.make_path(dataset, test_data)
        predict_data = loader.load_file(test_file)
        predict_data.class_is_last()

        # All possible class labels, in attribute order.
        values = [str(predict_data.class_attribute.value(i))
                  for i in range(predict_data.class_attribute.num_values)]

        classes = []
        for inst in predict_data:
            # Pick the label with the highest predicted probability.
            prediction = self.classifier.distribution_for_instance(inst)
            classes.append(values[prediction.argmax()])

        return classes

    def cross_validate(self, detail=True):
        """Perform cross validation using trained data.

        Parameters
        ----------
        detail : boolean, optional, default = True
            If true return a detailed information of cross validation.

        Returns
        -------
        info : string
            Info with results of cross validation.
        """
        start_time = TimeUtils.get_time()

        info = "Scheme:\t%s %s\n" % (str(self.classifier.classname),
                                     " ".join(str(option) for option in self.classifier.options))

        if detail:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)

        # 10-fold cross validation with a fixed seed for reproducibility.
        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))

        if not detail:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

        if detail:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"

            info += str(evl.class_details()) + "\n\n"

            classes = [str(self.data.class_attribute.value(i))
                       for i in range(self.data.class_attribute.num_values)]
            info += Classifier.confusion_matrix(classes, evl.confusion_matrix)

        return info

    def experimenter(self):
        """Perform a test using all classifiers available.

        Returns
        -------
        info : string
            Info with results of experimenter.
        """
        info = ""

        aliases = sorted(WekaAlias.get_aliases())
        for alias in aliases:
            try:
                # Ignore very slow classifiers.
                if alias in ('KStar', 'LWL', 'MultilayerPerceptron'):
                    continue

                start_time = TimeUtils.get_time()

                classifier = WClassifier(classname=WekaAlias.get_classifier(alias))

                info += "Scheme:\t%s %s\n" % (str(classifier.classname),
                                              " ".join(str(option) for option in classifier.options))

                # Hold-out evaluation: 66% train / 34% test, fixed seed.
                evl = WEvaluation(self.data)
                evl.evaluate_train_test_split(classifier, self.data, 66, WRandom(1))

                info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)
                info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

            except Exception as e:
                # Aliases that resolve to classifiers missing from this Weka
                # build raise this known message; those are silently skipped.
                if str(e) != 'Object does not implement or subclass weka.classifiers.Classifier: __builtin__.NoneType':
                    info += "Exception in %s: %s\n\n" % (WekaAlias.get_aliases()[alias], str(e))

        return info

    def reset(self):
        """Clean all data of classification.
        """
        self.data = None
        self.classifier = None

    def single_classify(self, image_path, directory, extractors, dict_classes):
        """Classify a single image.

        Extracts features from *image_path* into a temporary ARFF file in
        *directory*, classifies it and removes the temporary file.

        Parameters
        ----------
        image_path : string
            Path to the image to classify.
        directory : string
            Directory where the temporary ARFF file is created.
        extractors : list
            Feature extractors forwarded to ``FeatureExtractor``.
        dict_classes : dict
            Not used here; kept for interface compatibility.

        Returns
        -------
        predicted : string
            Predicted class of the image.
        """
        # Imported lazily to avoid a circular import with the extraction module.
        from extraction import FeatureExtractor
        from os import remove

        test_file = 'temp'
        fextractor = FeatureExtractor(extractors)
        fextractor.extract_one_file(directory, image_path, output_file=test_file)

        predicted = self.classify(directory, test_dir='.tmp', test_data=test_file + '.arff', image=None)

        remove(directory + '/' + test_file + '.arff')
        return predicted[0]