#!/usr/bin/python # -*- coding: utf-8 -*- # """ Runs collection of machine learning algorithms for data mining tasks available in Weka. Name: weka_classifiers.py Author: Alessandro dos Santos Ferreira ( santosferreira.alessandro@gmail.com ) """ import weka.core.jvm as jvm from weka.core.converters import Loader as WLoader from weka.classifiers import Classifier as WClassifier from weka.classifiers import Evaluation as WEvaluation from weka.core.classes import Random as WRandom from collections import OrderedDict from util.config import Config from util.file_utils import File from util.utils import TimeUtils from weka_alias import WekaAlias from classifier import Classifier class WekaClassifiers(Classifier): def __init__(self, classname="weka.classifiers.functions.SMO", options='default'): if not jvm.started: jvm.start() self.classname = Config("ClassName", classname, str) self.options = Config("Options", options, str) self.reset() def get_config(self): weka_config = OrderedDict() weka_config["classname"] = self.classname weka_config["classname"].value = weka_config["classname"].value.split('.')[-1] weka_config["options"] = self.options return weka_config def set_config(self, configs): configs["classname"].value = WekaAlias.get_classifier(configs["classname"].value) self.classname = Config.nvl_config(configs["classname"], self.classname) self.options = Config.nvl_config(configs["options"], self.options) def get_summary_config(self): weka_config = OrderedDict() weka_config[self.classname.label] = self.classname.value weka_config[self.options.label] = self.options.value summary = '' for config in weka_config: summary += "%s: %s\n" % (config, str(weka_config[config])) return summary def must_train(self): return True def train(self, dataset, training_data, force = False): if self.data is not None and not force: return if self.data is not None: self.reset() loader = WLoader(classname="weka.core.converters.ArffLoader") training_file = File.make_path(dataset, training_data + ".arff") self.data = loader.load_file(training_file) self.data.class_is_last() options = None if self.options.value == 'default' else self.options.value.split() self.classifier = WClassifier(classname=self.classname.value, options=options) self.classifier.build_classifier(self.data) def classify(self, dataset, test_data): loader = WLoader(classname="weka.core.converters.ArffLoader") test_file = File.make_path(dataset, test_data + ".arff") predict_data = loader.load_file(test_file) predict_data.class_is_last() #values = str(predict_data.class_attribute)[19:-1].split(',') values = [str(predict_data.class_attribute.value(i)) for i in range(0, predict_data.class_attribute.num_values)] classes = [] for index, inst in enumerate(predict_data): #pred = self.classifier.classify_instance(inst) prediction = self.classifier.distribution_for_instance(inst) #cl = int(values[prediction.argmax()][7:]) cl = values[prediction.argmax()] #print 'Classe:', cl classes.append(cl) return classes def cross_validate(self, detail = True): start_time = TimeUtils.get_time() info = "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options])) if detail == True: info += "Relation:\t%s\n" % (self.data.relationname) info += "Instances:\t%d\n" % (self.data.num_instances) info += "Attributes:\t%d\n\n" % (self.data.num_attributes) evl = WEvaluation(self.data) evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1)) if detail == False: info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct) info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time) #info += str(evl.percent_correct) + "\n\n" if detail == True: info += "=== Stratified cross-validation ===\n" info += evl.summary() + "\n\n" info += str(evl.class_details()) + "\n\n" classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)] cm = evl.confusion_matrix info += Classifier.confusion_matrix(classes, cm) return info def experimenter(self): info = "" aliases = sorted(WekaAlias.get_aliases()) for alias in aliases: try: if alias == "MultilayerPerceptron": continue start_time = TimeUtils.get_time() classifier = WClassifier(classname=WekaAlias.get_classifier(alias)) info += "Scheme:\t%s %s\n" % (str(classifier.classname) , " ".join([str(option) for option in classifier.options])) evl = WEvaluation(self.data) evl.crossvalidate_model(classifier, self.data, 10, WRandom(1)) info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct) info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time) except Exception as e: if str(e) != 'Object does not implement or subclass weka.classifiers.Classifier: __builtin__.NoneType': info += "Exception in %s: %s\n\n" % (WekaAlias.get_aliases()[alias], str(e)) return info def reset(self): self.data = None self.classifier = None