feature_extraction.py 7.03 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
"""
    Runs feature extraction algorithms.
    
    Name: feature_extraction.py
    Author: Alessandro dos Santos Ferreira ( santosferreira.alessandro@gmail.com )
"""

import io
import itertools
import os

from interface.interface import InterfaceException as IException

from util.file_utils import File
from util.utils import ImageUtils
from util.utils import TimeUtils

from extractor import Extractor

class FeatureExtractor(object):
    """Handle the feature extraction.

    Runs every active extractor over one image or over a whole dataset
    and writes the resulting feature vectors to a file in ARFF format.
    """

    def __init__(self, extractors):
        """Constructor.

        Parameters
        ----------
        extractors : list of Extractor
            Initial set of active extractors. Each entry is called with no
            arguments to build an extractor whose ``run(image)`` method is
            then invoked.
        """
        self.extractors = extractors

    def extract_all(self, dataset, output_file = None, dirs = None, overwrite = True):
        """Runs the feature extraction algorithms on all images of dataset.

        Parameters
        ----------
        dataset : string
            Path to dataset.
        output_file : string, optional, default = None
            Name of output file containing the features. If not informed,
            the name of the dataset is used.
        dirs : list of string, optional, default = None
            List of directories to be searched. If not informed, every class
            directory listed inside dataset is searched.
        overwrite : boolean, optional, default = True
            If False and a file containing the features already exists,
            that file is returned and nothing is extracted.

        Returns
        -------
        out : tuple
            Returns a tuple containing the name of output file and the time
            spent, computed as the difference of two TimeUtils.get_time()
            readings (milliseconds according to the original author).
            The time is 0 when an existing file is reused.

        Raises
        ------
        IException 'Please select at least one extractor'
            Empty list of extractors.
        IException 'Image %s is possibly corrupt'
            Error opening some image inside dataset.
        IException 'There are no images in dataset: %s'
            Dataset does not contain any image.
        """
        if len(self.extractors) == 0:
            raise IException("Please select at least one extractor")

        output_file = self._resolve_output_path(dataset, output_file)

        # If an output file already exists and must not be overwritten,
        # return the current file. The equality test is kept on purpose so
        # that only an explicit False (or 0) skips the extraction.
        if overwrite == False and os.path.isfile(output_file):
            return output_file, 0

        start_time = TimeUtils.get_time()

        classes = sorted(File.list_dirs(dataset))
        dirs = classes if dirs is None else dirs

        data = []
        labels = types = None

        # Runs the feature extraction for all classes inside the dataset
        for cl in dirs:
            items = sorted(os.listdir(File.make_path(dataset, cl)))
            print("Processing class %s - %d itens" % (cl, len(items)))

            for item in items:
                # Skip hidden files
                if item.startswith('.'):
                    continue

                # Built outside the try block so the error message below
                # always refers to the image that actually failed.
                filepath = File.make_path(dataset, cl, item)
                try:
                    image = File.open_image(filepath, rgb = False)
                except Exception:
                    raise IException("Image %s is possibly corrupt" % filepath)

                item_labels, item_types, values = self._run_extractors(image)

                # Attribute names and types are taken from the first image only
                if len(data) == 0:
                    labels, types = item_labels, item_types

                # Directories that are not a known class are labeled with
                # the first class
                data.append(values + [cl if cl in classes else classes[0]])

        if len(data) == 0:
            raise IException("There are no images in dataset: %s" % dataset)

        # Save the output file in ARFF format
        self._save_output(File.get_filename(dataset), classes, labels, types, data, output_file)

        end_time = TimeUtils.get_time()

        return output_file, (end_time - start_time)

    def extract_one_file(self, dataset, image_path, output_file = None):
        """Runs the feature extraction algorithms on specific image.

        The single instance written to the output file is labeled with the
        first class of the dataset.

        Parameters
        ----------
        dataset : string
            Path to dataset.
        image_path : string
            Path to image.
        output_file : string, optional, default = None
            Name of output file containing the features. If not informed,
            the name of the dataset is used.

        Returns
        -------
        out : tuple
            Returns a tuple containing the name of output file and the time
            spent, computed as the difference of two TimeUtils.get_time()
            readings (milliseconds according to the original author).

        Raises
        ------
        IException 'Please select at least one extractor'
            Empty list of extractors.
        IException 'Image %s is possibly corrupt'
            Error opening image.
        """
        if len(self.extractors) == 0:
            raise IException("Please select at least one extractor")

        output_file = self._resolve_output_path(dataset, output_file)

        classes = sorted(File.list_dirs(dataset))

        start_time = TimeUtils.get_time()

        try:
            image = File.open_image(image_path, rgb = False)
        except Exception:
            raise IException("Image %s is possibly corrupt" % image_path)

        labels, types, values = self._run_extractors(image)

        self._save_output(File.get_filename(dataset), classes, labels, types, [values + [classes[0]]], output_file)

        end_time = TimeUtils.get_time()

        return output_file, (end_time - start_time)

    def _resolve_output_path(self, dataset, output_file):
        """Build the path of the .arff output file inside the dataset.

        Parameters
        ----------
        dataset : string
            Path to dataset.
        output_file : string or None
            Name of output file without extension. If None, the name of the
            dataset is used.

        Returns
        -------
        path : string
            Result of File.make_path for the dataset and the file name with
            the '.arff' extension appended.
        """
        if output_file is None:
            output_file = File.get_filename(dataset)
        return File.make_path(dataset, output_file + '.arff')

    def _run_extractors(self, image):
        """Run every active extractor on one image and merge the results.

        Parameters
        ----------
        image : object
            Image as returned by File.open_image.

        Returns
        -------
        out : tuple of list
            Attribute names, attribute types and values, each one being the
            concatenation, in extractor order, of what every extractor
            returned from run(image).
        """
        results = [extractor().run(image) for extractor in self.extractors]

        # zip(*results) regroups the per-extractor (labels, types, values)
        # triples into one group per kind; each group is then flattened.
        labels, types, values = [list(itertools.chain.from_iterable(part))
                                    for part in zip(*results)]
        return labels, types, values

    def _save_output(self, relation, classes, labels, types, data, output_file):
        """Save output file in ARFF format.

        Parameters
        ----------
        relation : string
            Name of relation.
        classes : list of string
            List of classes names.
        labels : list of string
            List of attributes names.
        types : list of string
            List of attributes types.
        data : list of list
            List of instances. Every value is converted with str().
        output_file : string
            Path to output file.
        """
        # Text mode, so that writing str works on Python 3 as well; the
        # with block closes the file even if a write fails.
        with open(output_file, 'w') as arff:
            arff.write("%s %s\n\n" % ('@relation', relation))

            for label, t in zip(labels, types):
                arff.write("%s %s %s\n" % ('@attribute', label, t))

            arff.write("%s %s {%s}\n\n" % ('@attribute', 'classe', ', '.join(classes)))

            arff.write('@data\n\n')

            for instance in data:
                arff.write(",".join(map(str, instance)) + "\n")