Visual Servoing Platform version 3.5.0
vpDetectorDNN.cpp
1/****************************************************************************
2 *
3 * ViSP, open source Visual Servoing Platform software.
4 * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
5 *
6 * This software is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 * See the file LICENSE.txt at the root directory of this source
11 * distribution for additional information about the GNU GPL.
12 *
13 * For using ViSP with software that can not be combined with the GNU
14 * GPL, please contact Inria about acquiring a ViSP Professional
15 * Edition License.
16 *
17 * See http://visp.inria.fr for more information.
18 *
19 * This software was developed at:
20 * Inria Rennes - Bretagne Atlantique
21 * Campus Universitaire de Beaulieu
22 * 35042 Rennes Cedex
23 * France
24 *
25 * If you have questions regarding the use of this file, please contact
26 * Inria at visp@inria.fr
27 *
28 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
29 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
30 *
31 * Description:
32 * DNN object detection using OpenCV DNN module.
33 *
34 *****************************************************************************/
35#include <visp3/core/vpConfig.h>
36
37#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(VISP_HAVE_OPENCV_DNN)
38#include <visp3/detection/vpDetectorDNN.h>
39#include <visp3/core/vpImageConvert.h>
40
41vpDetectorDNN::vpDetectorDNN() : m_blob(), m_boxes(), m_classIds(), m_confidences(),
42 m_confidenceThreshold(0.5), m_I_color(), m_img(), m_inputSize(300,300), m_mean(127.5, 127.5, 127.5),
43 m_net(), m_nmsThreshold(0.4f), m_outNames(), m_outs(), m_scaleFactor(2.0/255.0), m_swapRB(true) {}
44
46
  // Convert the grayscale input into the color image member, since the
  // detection pipeline operates on a color image.
  vpImageConvert::convert(I, m_I_color);

  // Delegate to the color overload; the bounding boxes are discarded here and
  // only the detection status is returned.
  std::vector<vpRect> boundingBoxes;
  return detect(m_I_color, boundingBoxes);
}
61
69bool vpDetectorDNN::detect(const vpImage<vpRGBa> &I, std::vector<vpRect> &boundingBoxes) {
71
72 cv::Size inputSize(m_inputSize.width > 0 ? m_inputSize.width : m_img.cols,
73 m_inputSize.height > 0 ? m_inputSize.height : m_img.rows);
74 cv::dnn::blobFromImage(m_img, m_blob, m_scaleFactor, inputSize, m_mean, m_swapRB, false);
75
76 m_net.setInput(m_blob);
77 m_net.forward(m_outs, m_outNames);
78
79 postProcess();
80
81 boundingBoxes.resize(m_boxesNMS.size());
82 for (size_t i = 0; i < m_boxesNMS.size(); i++) {
83 cv::Rect box = m_boxesNMS[i];
84 boundingBoxes[i] = vpRect(box.x, box.y, box.width, box.height);
85 }
86
87 m_nb_objects = boundingBoxes.size();
88 m_polygon.resize(boundingBoxes.size());
89 m_message.resize(boundingBoxes.size());
90 for (size_t i = 0; i < boundingBoxes.size(); i++) {
91 std::vector<vpImagePoint> polygon;
92
93 double x = boundingBoxes[i].getLeft();
94 double y = boundingBoxes[i].getTop();
95 double w = boundingBoxes[i].getWidth();
96 double h = boundingBoxes[i].getHeight();
97
98 polygon.push_back(vpImagePoint(y, x));
99 polygon.push_back(vpImagePoint(y + h, x));
100 polygon.push_back(vpImagePoint(y + h, x + w));
101 polygon.push_back(vpImagePoint(y, x + w));
102
103 m_polygon[i] = polygon;
104
105 std::ostringstream oss;
106 oss << m_classIds[i] << " ; " << m_confidences[i] << " ; " << m_boxes[i];
107 m_message[i] = oss.str();
108 }
109
110 return !boundingBoxes.empty();
111}
112
118std::vector<vpRect> vpDetectorDNN::getDetectionBBs(bool afterNMS) const {
119 std::vector<vpRect> bbs;
120 if (afterNMS) {
121 bbs.reserve(m_boxesNMS.size());
122 for (size_t i = 0; i < m_boxesNMS.size(); i++) {
123 cv::Rect box = m_boxes[i];
124 bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
125 }
126 } else {
127 bbs.reserve(m_boxes.size());
128 for (size_t i = 0; i < m_boxes.size(); i++) {
129 cv::Rect box = m_boxes[i];
130 bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
131 }
132 }
133
134 return bbs;
135}
136
142std::vector<int> vpDetectorDNN::getDetectionClassIds(bool afterNMS) const {
143 if (afterNMS) {
144 std::vector<int> classIds;
145 for (size_t i = 0; i < m_indices.size(); i++) {
146 int idx = m_indices[i];
147 classIds.push_back(m_classIds[idx]);
148 }
149 return classIds;
150 }
151
152 return m_classIds;
153}
154
158std::vector<float> vpDetectorDNN::getDetectionConfidence(bool afterNMS) const {
159 if (afterNMS) {
160 std::vector<float> confidences;
161 for (size_t i = 0; i < m_indices.size(); i++) {
162 int idx = m_indices[i];
163 confidences.push_back(m_confidences[idx]);
164 }
165 return confidences;
166 }
167
168 return m_confidences;
169}
170
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
/*!
  Get the names of the unconnected (i.e. output) layers of the network.
  Only used with OpenCV 3.4.3, where
  cv::dnn::Net::getUnconnectedOutLayersNames() is not available.

  \return The output layer names.
*/
std::vector<cv::String> vpDetectorDNN::getOutputsNames() {
  // Bug fix: the names were cached in a function-local static, so after
  // loading a different model with readNet() the stale names of the first
  // network were returned. Recompute them on every call instead.
  std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
  std::vector<cv::String> layersNames = m_net.getLayerNames();
  std::vector<cv::String> names(outLayers.size());
  for (size_t i = 0; i < outLayers.size(); ++i)
    names[i] = layersNames[outLayers[i] - 1]; // getLayerNames() indexing is offset by one w.r.t. layer ids
  return names;
}
#endif
184
185void vpDetectorDNN::postProcess() {
186 //Direct copy from object_detection.cpp OpenCV sample
187 static std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
188 static std::string outLayerType = m_net.getLayer(outLayers[0])->type;
189
190 m_classIds.clear();
191 m_confidences.clear();
192 m_boxes.clear();
193 if (m_net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN
194 {
195 // Network produces output blob with a shape 1x1xNx7 where N is a number of
196 // detections and an every detection is a vector of values
197 // [batchId, classId, confidence, left, top, right, bottom]
198 CV_Assert(m_outs.size() == 1);
199 float* data = (float*)m_outs[0].data;
200 for (size_t i = 0; i < m_outs[0].total(); i += 7)
201 {
202 float confidence = data[i + 2];
203 if (confidence > m_confidenceThreshold)
204 {
205 int left = (int)data[i + 3];
206 int top = (int)data[i + 4];
207 int right = (int)data[i + 5];
208 int bottom = (int)data[i + 6];
209 int width = right - left + 1;
210 int height = bottom - top + 1;
211 m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
212 m_boxes.push_back(cv::Rect(left, top, width, height));
213 m_confidences.push_back(confidence);
214 }
215 }
216 }
217 else if (outLayerType == "DetectionOutput")
218 {
219 // Network produces output blob with a shape 1x1xNx7 where N is a number of
220 // detections and an every detection is a vector of values
221 // [batchId, classId, confidence, left, top, right, bottom]
222 CV_Assert(m_outs.size() == 1);
223 float* data = (float*)m_outs[0].data;
224 for (size_t i = 0; i < m_outs[0].total(); i += 7)
225 {
226 float confidence = data[i + 2];
227 if (confidence > m_confidenceThreshold)
228 {
229 int left = (int)(data[i + 3] * m_img.cols);
230 int top = (int)(data[i + 4] * m_img.rows);
231 int right = (int)(data[i + 5] * m_img.cols);
232 int bottom = (int)(data[i + 6] * m_img.rows);
233 int width = right - left + 1;
234 int height = bottom - top + 1;
235 m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
236 m_boxes.push_back(cv::Rect(left, top, width, height));
237 m_confidences.push_back(confidence);
238 }
239 }
240 }
241 else if (outLayerType == "Region")
242 {
243 for (size_t i = 0; i < m_outs.size(); ++i)
244 {
245 // Network produces output blob with a shape NxC where N is a number of
246 // detected objects and C is a number of classes + 4 where the first 4
247 // numbers are [center_x, center_y, width, height]
248 float* data = (float*)m_outs[i].data;
249 for (int j = 0; j < m_outs[i].rows; ++j, data += m_outs[i].cols)
250 {
251 cv::Mat scores = m_outs[i].row(j).colRange(5, m_outs[i].cols);
252 cv::Point classIdPoint;
253 double confidence;
254 cv::minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
255 if (confidence > m_confidenceThreshold)
256 {
257 int centerX = (int)(data[0] * m_img.cols);
258 int centerY = (int)(data[1] * m_img.rows);
259 int width = (int)(data[2] * m_img.cols);
260 int height = (int)(data[3] * m_img.rows);
261 int left = centerX - width / 2;
262 int top = centerY - height / 2;
263
264 m_classIds.push_back(classIdPoint.x);
265 m_confidences.push_back((float)confidence);
266 m_boxes.push_back(cv::Rect(left, top, width, height));
267 }
268 }
269 }
270 }
271 else if(outLayerType == "Identity" || outLayerType == "Softmax")
272 {
273 // In OpenCV 4.5.2, the output of ssd-mobilenet.onnx is parsed as `Softmax`, whereas
274 // in OpenCV 4.5.5, the output is of type `Identity`, and the output order is permuted.
275
276 // Network produces 2 outputs blob:
277 // - `scores` with dimensions 1xNxC
278 // - 'boxes' with dimensions 1xNx4
279 // where `N` is a number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).
280
281 int scores_index = m_outNames[0]=="scores" ? 0 : 1; // scores output index.
282 int boxes_index = m_outNames[0]=="boxes" ? 0 : 1; // boxes output index.
283
284 int N = m_outs[scores_index].size[1], C = m_outs[scores_index].size[2];
285
286 float* confidence = (float*)m_outs[scores_index].data;
287 float* bbox = (float*)m_outs[boxes_index].data;
288
289 // Loop over all guesses on the output of the network.
290 for(int i = 0; i < N; i++)
291 {
292 uint32_t maxClass = 0;
293 float maxScore = -1000.0f;
294
295 for(int j = 1; j < C; j++) // ignore background (classId = 0).
296 {
297 const float score = confidence[i * C + j];
298
299 if(score < m_confidenceThreshold)
300 continue;
301
302 if(score > maxScore)
303 {
304 maxScore = score;
305 maxClass = j;
306 }
307 }
308
309 if(maxScore > m_confidenceThreshold)
310 {
311 int left = (int)(bbox[4*i] * m_img.cols);
312 int top = (int)(bbox[4*i + 1] * m_img.rows);
313 int right = (int)(bbox[4*i + 2] * m_img.cols);
314 int bottom = (int)(bbox[4*i + 3] * m_img.rows);
315 int width = right - left + 1;
316 int height = bottom - top + 1;
317
318 m_boxes.push_back(cv::Rect(left, top, width, height));
319 m_classIds.push_back(maxClass);
320 m_confidences.push_back(maxScore);
321 }
322 }
323 }
324 else
325 CV_Error(cv::Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
326
327 cv::dnn::NMSBoxes(m_boxes, m_confidences, m_confidenceThreshold, m_nmsThreshold, m_indices);
328 m_boxesNMS.resize(m_indices.size());
329 for (size_t i = 0; i < m_indices.size(); ++i)
330 {
331 int idx = m_indices[i];
332 m_boxesNMS[i] = m_boxes[idx];
333 }
334}
335
/*!
  Load the network to be used for detection.

  \param model : Path to the file containing the trained weights.
  \param config : Path to the network configuration file (may be empty).
  \param framework : Framework name (may be empty; presumably deduced by
  OpenCV from the file extensions otherwise — see cv::dnn::readNet).
*/
void vpDetectorDNN::readNet(const std::string &model, const std::string &config, const std::string &framework) {
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  // cv::dnn::Net::getUnconnectedOutLayersNames() is not available in
  // OpenCV 3.4.3: use the local helper instead.
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}
363
/*!
  Set the minimum confidence a detection must have to be kept
  (used both when decoding the network outputs and during NMS).

  \param confThreshold : Confidence threshold.
*/
void vpDetectorDNN::setConfidenceThreshold(float confThreshold) {
  m_confidenceThreshold = confThreshold;
}
372
/*!
  Set the size of the blob fed to the network. A non-positive dimension means
  that the corresponding image dimension is used instead (see detect()).

  \param width : Input blob width.
  \param height : Input blob height.
*/
void vpDetectorDNN::setInputSize(int width, int height) {
  m_inputSize.width = width;
  m_inputSize.height = height;
}
383
/*!
  Set the per-channel mean values subtracted from the image by
  cv::dnn::blobFromImage() during pre-processing.

  \param meanR : Mean for the red channel.
  \param meanG : Mean for the green channel.
  \param meanB : Mean for the blue channel.
*/
void vpDetectorDNN::setMean(double meanR, double meanG, double meanB) {
  m_mean = cv::Scalar(meanR, meanG, meanB);
}
394
/*!
  Set the overlap threshold used by Non-Maximum Suppression
  (cv::dnn::NMSBoxes) to discard redundant overlapping boxes.

  \param nmsThreshold : NMS threshold.
*/
void vpDetectorDNN::setNMSThreshold(float nmsThreshold) {
  m_nmsThreshold = nmsThreshold;
}
404
  // Delegate to the underlying cv::dnn::Net.
  m_net.setPreferableBackend(backendId);
}
414
  // Delegate to the underlying cv::dnn::Net.
  m_net.setPreferableTarget(targetId);
}
424
/*!
  Set the multiplicative scale factor applied to the image by
  cv::dnn::blobFromImage() during pre-processing.

  \param scaleFactor : Scale factor.
*/
void vpDetectorDNN::setScaleFactor(double scaleFactor) {
  m_scaleFactor = scaleFactor;
}
431
/*!
  Enable or disable the swap of the red and blue channels performed by
  cv::dnn::blobFromImage() during pre-processing.

  \param swapRB : If true, swap the red and blue channels.
*/
void vpDetectorDNN::setSwapRB(bool swapRB) {
  m_swapRB = swapRB;
}
440
#elif !defined(VISP_BUILD_SHARED_LIBS)
// Workaround to avoid warning: libvisp_core.a(vpDetectorDNN.cpp.o) has no
// symbols
void dummy_vpDetectorDNN(){};
#endif
std::vector< std::string > m_message
Message attached to each object.
std::vector< std::vector< vpImagePoint > > m_polygon
For each object, defines the polygon that contains the object.
size_t m_nb_objects
Number of detected objects.
void setPreferableBackend(int backendId)
std::vector< vpRect > getDetectionBBs(bool afterNMS=true) const
std::vector< float > getDetectionConfidence(bool afterNMS=true) const
void setMean(double meanR, double meanG, double meanB)
void setPreferableTarget(int targetId)
void setInputSize(int width, int height)
void setScaleFactor(double scaleFactor)
void readNet(const std::string &model, const std::string &config="", const std::string &framework="")
virtual ~vpDetectorDNN()
void setNMSThreshold(float nmsThreshold)
void setConfidenceThreshold(float confThreshold)
void setSwapRB(bool swapRB)
std::vector< int > getDetectionClassIds(bool afterNMS=true) const
virtual bool detect(const vpImage< unsigned char > &I)
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Class that defines a 2D point in an image. This class is useful for image processing and stores only ...
Definition: vpImagePoint.h:88
Defines a rectangle in the plane.
Definition: vpRect.h:80