#!/usr/bin/python
# -*- coding: utf-8 -*-
#
"""
    Runs feature extraction algorithms.
    
    Name: feature_extraction.py
    Author: Alessandro dos Santos Ferreira ( santosferreira.alessandro@gmail.com )
"""

import io
import itertools
import os
import gc
import multiprocessing
from multiprocessing import Process, Manager
import threading
from interface.interface import InterfaceException as IException

from util.file_utils import File
from util.utils import ImageUtils
from util.utils import TimeUtils
import cv2
from .extractor import Extractor
from tqdm import tqdm
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

class FeatureExtractor(object):
    """Handle the feature extraction."""


    def __init__(self, extractors, tkParent=None):
        """Constructor.

        Parameters
        ----------
        extractors : list of Extractor
            Initial set of active extractors.
        tkParent : optional, default = None
            Tk parent used to display log messages. If None, messages are printed to the console only.
        """
        self.extractors = extractors
        self.tkParent = tkParent

    def extract_all(self, dataset, output_file=None, dirs=None, overwrite=True, processor_amd=False):
        """Runs the feature extraction algorithms on all images of the dataset.

        Parameters
        ----------
        dataset : string
            Path to dataset.
        output_file : string, optional, default = None
            Name of the output file containing the features. If not informed, the name of the dataset is used.
        dirs : list of string, optional, default = None
            List of directories to be searched. If not informed, all directories with images inside the dataset are searched.
        overwrite : boolean, optional, default = True
            If False and a file containing the features already exists, that file is returned without re-extracting.
        processor_amd : boolean, optional, default = False
            If True, run each image worker as a multiprocessing.Process instead of a threading.Thread.

        Returns
        -------
        out : tuple
            Returns a tuple containing the name of the output file and the time spent in milliseconds.

        Raises
        ------
        IException 'Please select at least one extractor'
            Empty list of extractors.
        IException 'Image %s is possibly corrupt'
            Error opening some image inside the dataset.
        IException 'There are no images in dataset: %s'
            Dataset does not contain any image.
        """
        self.processor_amd = processor_amd
        self.threads = []
        if self.processor_amd:
            # Manager lists are required because plain lists are not shared between Processes.
            self.data = Manager().list()
            self.labels = Manager().list()
            self.types = Manager().list()
        else:
            self.data = []
            self.labels = []
            self.types = []
        if len(self.extractors) == 0:
            raise IException("Please select at least one extractor")

        if output_file is None:
            output_file = File.get_filename(dataset)
        output_file = File.make_path(dataset, output_file + '.arff')

        # If an output file already exists and must not be overwritten, return the current file
        if overwrite == False and os.path.isfile(output_file):
            return output_file, 0

        start_time = TimeUtils.get_time()

        classes = sorted(File.list_dirs(dataset))
        dirs = classes if dirs is None else dirs

        # Runs the feature extraction for all classes inside the dataset
        for cl in dirs:
            # Create the workers for this class (one per image)
            self.job_extractor(dataset, cl, classes)

        self.print_console("Wait a moment, the threads are processing "+str(len(self.threads)) +" images, it may be delayed depending on the size or quantity of the images!")
        with tqdm(total=len(self.threads)) as pbar:
            for t in self.threads:
                t.start()
                pbar.update(1)
            pbar.close()

        self.print_console("Waiting for workers to finish extracting attributes from images!")
        with tqdm(total=len(self.threads)) as ppbar:
            for t in self.threads:
                t.join()
                ppbar.update(1)
            ppbar.close()
        self.print_console("The process was completed with "+str(len(self.threads))+" images!")
        if len(self.data) == 0:
            raise IException("There are no images in dataset: %s" % dataset)
        del self.threads
        gc.collect()
        # Save the output file in ARFF format
        # self._save_output(File.get_filename(dataset), classes, self.labels, self.types, self.data, output_file)
        # Only the first collected set of labels/types is used; the extractors produce the same attributes for every image
        self._save_output(File.get_filename(dataset), classes, self.labels[0], self.types[0], self.data, output_file)
        end_time = TimeUtils.get_time()

        return output_file, (end_time - start_time)

    # Creates one worker (thread or process) for each image of a class folder
    def job_extractor(self, dataset, cl, classes):

        items = sorted(os.listdir(File.make_path(dataset, cl)))
        self.print_console("Processing class %s - %d itens" % (cl, len(items)))

        for item in items:
            if item.startswith('.'):
                continue

            if self.processor_amd:
                th = multiprocessing.Process(target=self.sub_job_extractor, args=(item, dataset, cl, classes))
            else:
                th = threading.Thread(target=self.sub_job_extractor, args=(item, dataset, cl, classes))

            self.threads.append(th)


    # Runs every active extractor on a single image (executed by one worker thread or process)
    def sub_job_extractor(self, item, dataset, cl, classes):
        filepath = File.make_path(dataset, cl, item)
        try:
            image = cv2.imread(filepath)
            #image = self.equalize_size_image(image)
        except Exception:
            raise IException("Image %s is possibly corrupt" % filepath)
        # cv2.imread returns None instead of raising when the file cannot be read
        if image is None:
            raise IException("Image %s is possibly corrupt" % filepath)

        if len(self.data) > 0:
            # Labels and types were already collected from the first image,
            # so keep only the feature values (third element of each extractor result)
            if sys.version_info >= (3, 0):
                values = list(itertools.chain.from_iterable(list(zip(*[extractor().run(image) for extractor in self.extractors]))[2]))
            else:
                values = list(itertools.chain.from_iterable(zip(*([extractor().run(image) for extractor in self.extractors]))[2]))

            self.data.append(values + [cl if cl in classes else classes[0]])
        else:
            labs, tys, values = [list(itertools.chain.from_iterable(ret))
                                 for ret in
                                 zip(*(extractor().run(image) for extractor in self.extractors))]
            self.labels.append(labs)
            self.types.append(tys)
            self.data.append(values + [cl if cl in classes else classes[0]])
        image = None
        filepath = None
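
    # Each extractor's run(image) is expected to return a (labels, types, values)
    # triple; that is what the unpacking above relies on to recover attribute
    # names, ARFF types and feature values. A minimal sketch of that contract,
    # assumed from the usage in this module (the class name and feature below
    # are hypothetical, not part of the project):
    #
    #     class MeanIntensityExtractor(Extractor):
    #         def run(self, image):
    #             labels = ["mean_intensity"]
    #             types = ["numeric"]
    #             values = [float(image.mean())]
    #             return labels, types, values
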
    def extract_one_file(self, dataset, image_path, output_file=None):
        """Runs the feature extraction algorithms on specific image.

        Parameters
        ----------
        dataset : string
            Path to dataset.
        image_path : string
            Path to image.
        output_file : string, optional, default = None
            Name of the output file containing the features. If not informed, the name of the dataset is used.

        Returns
        -------
        out : tuple
            Returns a tuple containing the name of the output file and the time spent in milliseconds.

        Raises
        ------
        IException 'Please select at least one extractor'
            Empty list of extractors.
        IException 'Image %s is possibly corrupt'
            Error opening image.
        """
        if len(self.extractors) == 0:
            raise IException("Please select at least one extractor")

        if output_file is None:
            output_file = File.get_filename(dataset)
        output_file = File.make_path(dataset, output_file + '.arff')

        classes = sorted(File.list_dirs(dataset))

        start_time = TimeUtils.get_time()

        try:
            image = File.open_image(image_path, rgb=False)
        except Exception:
            raise IException("Image %s is possibly corrupt" % image_path)

        labels, types, values = [list(itertools.chain.from_iterable(ret))
                                 for ret in zip(*([extractor().run(image) for extractor in self.extractors]))]

        self._save_output(File.get_filename(dataset), classes, labels, types, [values + [classes[0]]], output_file)

        end_time = TimeUtils.get_time()

        return output_file, (end_time - start_time)

    def _save_output(self, relation, classes, labels, types, data, output_file):
        """Save output file in ARFF format.
        
        Parameters
        ----------
        relation : string
            Name of relation.
        classes : list of string
            List of class names.
        labels : list of string
            List of attribute names.
        types : list of string
            List of attribute types.
        data : list of list of string
            List of instances.
        output_file : string
            Path to output file.
        """

        # Open in text mode so the string writes below work under both Python 2 and 3
        arff = open(output_file, 'w')

        arff.write("%s %s\n\n" % ('@relation', relation))

        for label, t in zip(labels, types):
            arff.write("%s %s %s\n" % ('@attribute', label, t))

        arff.write("%s %s {%s}\n\n" % ('@attribute', 'classe', ', '.join(classes)))

        arff.write('@data\n\n')

        for instance in data:
            instance = map(str, instance)
            line = ",".join(instance)
            arff.write(line + "\n")

        arff.close()
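
    # For reference, _save_output writes a plain ARFF (Weka) file shaped roughly
    # like the sketch below. Attribute names, types and values depend on the
    # active extractors, so the ones shown here are only illustrative:
    #
    #     @relation my_dataset
    #
    #     @attribute some_feature numeric
    #     @attribute another_feature numeric
    #     @attribute classe {class_a, class_b}
    #
    #     @data
    #
    #     0.12,3.4,class_a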

    # Method to equalize the size of images (currently unused; the call in
    # sub_job_extractor is commented out). Note that image.shape[0] is the
    # image height and image.shape[1] is the image width.
    def equalize_size_image(self, image):
        if image.shape[0] > 1000:
            basewidth = 1000
            wpercent = basewidth / float(image.shape[0])
            hsize = int(float(image.shape[1]) * float(wpercent))
            image = cv2.resize(image, (basewidth, hsize))
        elif image.shape[1] > 1000:
            baseheight = 1000
            wpercent = baseheight / float(image.shape[1])
            wsize = int(float(image.shape[1]) * float(wpercent))
            image = cv2.resize(image, (wsize, baseheight))
        elif image.shape[1] < 1000:
            baseheight = 1000
            wpercent = baseheight / float(image.shape[1])
            wsize = int(float(image.shape[1]) * float(wpercent))
            image = cv2.resize(image, (wsize, baseheight))
        elif image.shape[0] < 1000:
            basewidth = 1000
            wpercent = basewidth / float(image.shape[0])
            hsize = int(float(image.shape[1]) * float(wpercent))
            image = cv2.resize(image, (basewidth, hsize))
        return image

    "Method for print message in console, Window or Both"
    def print_console(self,mensagem):
        if(self.tkParent==None):
            print(mensagem)
        else:
            print(mensagem)
            self.tkParent.append_log( mensagem)
            self.tkParent._root.update_idletasks()
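

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). The extractor
# class names below are hypothetical placeholders; in practice, pass the
# concrete Extractor subclasses shipped with the project. The module uses a
# relative import, so it is meant to be imported rather than run directly.
#
#     extractors = [SomeColorExtractor, SomeTextureExtractor]  # hypothetical
#     fe = FeatureExtractor(extractors)
#     arff_file, elapsed = fe.extract_all("/path/to/dataset", overwrite=True)
#     print("Features saved to %s in %s ms" % (arff_file, elapsed))
# ---------------------------------------------------------------------------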