1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /**
20 * @file
21 * Object detection filter implemented using deep learning networks.
22 */
23
35
42
60
62
/* Byte offset of field x inside the dnnctx member of DnnDetectContext. */
63 #define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
/* Byte offset of field x located directly inside DnnDetectContext. */
64 #define OFFSET2(x) offsetof(DnnDetectContext, x)
/*
 * Flag combination passed in the option entries of this file.
 * NOTE(review): the expansion is not parenthesized, so it is only safe where
 * it is used as a complete expression (e.g. a standalone initializer field).
 */
65 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
68 #if (CONFIG_LIBTENSORFLOW == 1)
70 #endif
71 #if (CONFIG_LIBOPENVINO == 1)
73 #endif
77 {
"ssd",
"output shape [1, 1, N, 7]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_SSD }, 0, 0,
FLAGS, .unit =
"model_type" },
78 {
"yolo",
"output shape [1, N*Cx*Cy*DetectionBox]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV1V2 }, 0, 0,
FLAGS, .unit =
"model_type" },
79 {
"yolov3",
"outputs shape [1, N*D, Cx, Cy]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV3 }, 0, 0,
FLAGS, .unit =
"model_type" },
80 {
"yolov4",
"outputs shape [1, N*D, Cx, Cy]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV4 }, 0, 0,
FLAGS, .unit =
"model_type" },
86 };
87
89
91 return 1.f / (1.f +
exp(-x));
92 }
93
/**
 * Pass-through transform: returns its argument unchanged.
 *
 * It has the same float -> float shape as sigmoid() and is one of the two
 * functions assigned to post_process_raw_data in this file's model_type
 * switch.
 *
 * @param x input value
 * @return x, unmodified
 */
static inline float linear(float x)
{
    return x;
}
97
99 {
/* Running maximum; it starts at 0, so only strictly positive scores can replace it. */
100 float max_prob = 0;
/* Value returned at the end of the function. */
101 int label_id = 0;
/* Visit nb_classes scores; consecutive scores are cell_size elements apart in label_data. */
102 for (
int i = 0;
i < nb_classes;
i++) {
103 if (label_data[
i * cell_size] > max_prob) {
104 max_prob = label_data[
i * cell_size];
/*
 * NOTE(review): no assignment to label_id is visible in this branch, so as
 * shown the function always returns 0. Confirm against the complete file
 * that the winning index i is recorded here.
 */
106 }
107 }
108 return label_id;
109 }
110
111 /* Calculate Intersection Over Union: intersection area of bbox1 and bbox2 divided by their union area */
113 {
/* Overlap extent along x: min of the two (x + w) values minus max of the two x values; negative when there is no overlap on this axis. */
114 float overlapping_width =
FFMIN(bbox1->
x + bbox1->
w, bbox2->
x + bbox2->
w) -
FFMAX(bbox1->
x, bbox2->
x);
/* Same computation along y, using y and h. */
115 float overlapping_height =
FFMIN(bbox1->
y + bbox1->
h, bbox2->
y + bbox2->
h) -
FFMAX(bbox1->
y, bbox2->
y);
/* A negative extent on either axis yields an intersection area of 0; this also avoids a positive product of two negative extents. */
116 float intersection_area =
117 (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
/* Union = area(bbox1) + area(bbox2) - intersection, so the shared region is counted once. */
118 float union_area = bbox1->
w * bbox1->
h + bbox2->
w * bbox2->
h - intersection_area;
/* NOTE(review): no guard for union_area == 0 is visible (e.g. two zero-area boxes), in which case this division is 0/0. */
119 return intersection_area / union_area;
120 }
121
124 {
126 float conf_threshold =
ctx->confidence;
127 int detection_boxes, box_size;
128 int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;
129 int nb_classes =
ctx->nb_classes;
131 float *anchors =
ctx->anchors;
134 int is_NHWC = 0;
135
137 cell_w =
ctx->cell_w;
138 cell_h =
ctx->cell_h;
139 scale_w = cell_w;
140 scale_h = cell_h;
141 } else {
142 if (
output[output_index].dims[2] !=
output[output_index].dims[3] &&
143 output[output_index].dims[2] ==
output[output_index].dims[1]) {
144 is_NHWC = 1;
145 cell_w =
output[output_index].dims[2];
146 cell_h =
output[output_index].dims[1];
147 } else {
148 cell_w =
output[output_index].dims[3];
149 cell_h =
output[output_index].dims[2];
150 }
151 scale_w =
ctx->scale_width;
152 scale_h =
ctx->scale_height;
153 }
154 box_size = nb_classes + 5;
155
156 switch (
ctx->model_type) {
159 post_process_raw_data =
linear;
160 break;
162 post_process_raw_data =
sigmoid;
163 break;
164 }
165
166 if (!cell_h || !cell_w) {
169 }
170
171 if (!nb_classes) {
174 }
175
176 if (
output[output_index].dims[1] *
output[output_index].dims[2] *
177 output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
180 }
181 detection_boxes =
output[output_index].dims[1] *
182 output[output_index].dims[2] *
183 output[output_index].dims[3] / box_size / cell_w / cell_h;
184
185 anchors = anchors + (detection_boxes * output_index * 2);
186 /**
187 * find all candidate bounding boxes
188 * yolo output can be reshaped to [B, N*D, Cx, Cy]
189 * Detection box 'D' has format [`x`, `y`, `w`, `h`, `box_score`, `class_no_1`, ...,]
190 **/
191 for (int box_id = 0; box_id < detection_boxes; box_id++) {
192 for (int cx = 0; cx < cell_w; cx++)
193 for (int cy = 0; cy < cell_h; cy++) {
194 float x, y,
w,
h, conf;
195 float *detection_boxes_data;
196 int label_id;
197
198 if (is_NHWC) {
200 ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
201 conf = post_process_raw_data(detection_boxes_data[4]);
202 } else {
203 detection_boxes_data =
output_data + box_id * box_size * cell_w * cell_h;
204 conf = post_process_raw_data(
205 detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
206 }
207
208 if (is_NHWC) {
209 x = post_process_raw_data(detection_boxes_data[0]);
210 y = post_process_raw_data(detection_boxes_data[1]);
211 w = detection_boxes_data[2];
212 h = detection_boxes_data[3];
214 conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
215 } else {
216 x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
217 y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
218 w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
219 h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
221 detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
222 conf = conf * post_process_raw_data(
223 detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
224 }
225 if (conf < conf_threshold) {
226 continue;
227 }
228
230 if (!bbox)
232
233 bbox->
w =
exp(
w) * anchors[box_id * 2] *
frame->width / scale_w;
234 bbox->
h =
exp(
h) * anchors[box_id * 2 + 1] *
frame->height / scale_h;
235 bbox->
x = (cx + x) / cell_w *
frame->width - bbox->
w / 2;
236 bbox->
y = (cy + y) / cell_h *
frame->height - bbox->
h / 2;
238 if (
ctx->labels && label_id < ctx->label_count) {
240 } else {
242 }
243
247 }
249 }
250 }
251 return 0;
252 }
253
255 {
257 float conf_threshold =
ctx->confidence;
259 int nb_bboxes = 0;
263 return 0;
264 }
265
266 /* remove overlapping bboxes */
276 nb_bboxes++;
277 break;
278 }
279 }
280 }
285 return -1;
286 }
288
292
295 memcpy(bbox, candidate_bbox, sizeof(*bbox));
296 nb_bboxes--;
297 }
299 }
300 return 0;
301 }
302
304 {
312 return 0;
313 }
314
317 {
319 for (
int i = 0;
i < nb_outputs;
i++) {
323 }
327 return 0;
328 }
329
332 {
334 float conf_threshold =
ctx->confidence;
335 int proposal_count = 0;
336 int detect_size = 0;
337 float *detections =
NULL, *labels =
NULL;
338 int nb_bboxes = 0;
341 int scale_w =
ctx->scale_width;
342 int scale_h =
ctx->scale_height;
343
344 if (nb_outputs == 1 &&
output->dims[3] == 7) {
345 proposal_count =
output->dims[2];
346 detect_size =
output->dims[3];
347 detections =
output->data;
348 }
else if (nb_outputs == 2 &&
output[0].dims[3] == 5) {
349 proposal_count =
output[0].dims[2];
350 detect_size =
output[0].dims[3];
351 detections =
output[0].data;
353 }
else if (nb_outputs == 2 &&
output[1].dims[3] == 5) {
354 proposal_count =
output[1].dims[2];
355 detect_size =
output[1].dims[3];
356 detections =
output[1].data;
358 } else {
361 }
362
363 if (proposal_count == 0)
364 return 0;
365
366 for (
int i = 0;
i < proposal_count; ++
i) {
367 float conf;
368 if (nb_outputs == 1)
369 conf = detections[
i * detect_size + 2];
370 else
371 conf = detections[
i * detect_size + 4];
372 if (conf < conf_threshold) {
373 continue;
374 }
375 nb_bboxes++;
376 }
377
378 if (nb_bboxes == 0) {
380 return 0;
381 }
382
386 return -1;
387 }
388
390
391 for (
int i = 0;
i < proposal_count; ++
i) {
392 av_unused int image_id = (int)detections[
i * detect_size + 0];
393 int label_id;
394 float conf, x0, y0, x1, y1;
395
396 if (nb_outputs == 1) {
397 label_id = (int)detections[
i * detect_size + 1];
398 conf = detections[
i * detect_size + 2];
399 x0 = detections[
i * detect_size + 3];
400 y0 = detections[
i * detect_size + 4];
401 x1 = detections[
i * detect_size + 5];
402 y1 = detections[
i * detect_size + 6];
403 } else {
404 label_id = (int)labels[
i];
405 x0 = detections[
i * detect_size] / scale_w;
406 y0 = detections[
i * detect_size + 1] / scale_h;
407 x1 = detections[
i * detect_size + 2] / scale_w;
408 y1 = detections[
i * detect_size + 3] / scale_h;
409 conf = detections[
i * detect_size + 4];
410 }
411
412 if (conf < conf_threshold) {
413 continue;
414 }
415
417 bbox->
x = (int)(x0 *
frame->width);
418 bbox->
w = (int)(x1 *
frame->width) - bbox->
x;
419 bbox->
y = (int)(y0 *
frame->height);
420 bbox->
h = (int)(y1 *
frame->height) - bbox->
y;
421
424
425 if (
ctx->labels && label_id < ctx->label_count) {
427 } else {
429 }
430
431 nb_bboxes--;
432 if (nb_bboxes == 0) {
433 break;
434 }
435 }
436 return 0;
437 }
438
441 {
445
447 if (sd) {
449 return -1;
450 }
451
452 switch (
ctx->model_type) {
457 break;
462 break;
468 break;
469 }
470 return 0;
471 }
472
474 {
476 int proposal_count;
477 float conf_threshold =
ctx->confidence;
478 float *conf, *position, *label_id, x0, y0, x1, y1;
479 int nb_bboxes = 0;
483
486 position =
output[3].data;
487 label_id =
output[2].data;
488
490 if (sd) {
492 return -1;
493 }
494
495 for (
int i = 0;
i < proposal_count; ++
i) {
496 if (conf[
i] < conf_threshold)
497 continue;
498 nb_bboxes++;
499 }
500
501 if (nb_bboxes == 0) {
503 return 0;
504 }
505
509 return -1;
510 }
511
513
514 for (
int i = 0;
i < proposal_count; ++
i) {
515 y0 = position[
i * 4];
516 x0 = position[
i * 4 + 1];
517 y1 = position[
i * 4 + 2];
518 x1 = position[
i * 4 + 3];
519
521
522 if (conf[
i] < conf_threshold) {
523 continue;
524 }
525
526 bbox->
x = (int)(x0 *
frame->width);
527 bbox->
w = (int)(x1 *
frame->width) - bbox->
x;
528 bbox->
y = (int)(y0 *
frame->height);
529 bbox->
h = (int)(y1 *
frame->height) - bbox->
y;
530
533
534 if (
ctx->labels && label_id[
i] <
ctx->label_count) {
536 } else {
538 }
539
540 nb_bboxes--;
541 if (nb_bboxes == 0) {
542 break;
543 }
544 }
545 return 0;
546 }
547
549 {
557 default:
560 }
561 }
562
564 {
565 for (
int i = 0;
i <
ctx->label_count;
i++) {
567 }
568 ctx->label_count = 0;
570 }
571
573 {
574 int line_len;
575 FILE *file;
577
579 if (!file){
582 }
583
584 while (!feof(file)) {
585 char *label;
586 char buf[256];
587 if (!fgets(buf, 256, file)) {
588 break;
589 }
590
591 line_len = strlen(buf);
592 while (line_len) {
593 int i = line_len - 1;
594 if (buf[
i] ==
'\n' || buf[
i] ==
'\r' || buf[
i] ==
' ') {
596 line_len--;
597 } else {
598 break;
599 }
600 }
601
602 if (line_len == 0) // empty line
603 continue;
604
607 fclose(file);
609 }
610
612 if (!label) {
614 fclose(file);
616 }
617
620 fclose(file);
623 }
624 }
625
626 fclose(file);
627 return 0;
628 }
629
631 {
632 switch(backend_type) {
634 if (output_nb != 4) {
636 but get %d instead\n", output_nb);
638 }
639 return 0;
641 return 0;
642 default:
645 }
646 return 0;
647 }
648
650 {
657
658 if (using_yolo && !
ctx->anchors) {
661 }
662
670 if (!
ctx->bboxes_fifo)
673
674 if (
ctx->labels_filename) {
678 }
679 }
680
681 return 0;
682 }
683
691 };
692
694 {
698
701 return -1;
702 }
703
704 do {
712 if (out_pts)
713 *out_pts = in_frame->
pts +
pts;
714 }
717
718 return 0;
719 }
720
722 {
729 int got_frame = 0;
730 int async_state;
731
733
734 do {
735 // drain all input frames
742 }
743 }
745
746 // drain all processed frames
747 do {
755 got_frame = 1;
756 }
758
759 // if a frame was obtained, schedule the next filter
760 if (got_frame)
761 return 0;
762
769 }
770 }
771
773
774 return 0;
775 }
776
778 {
782 if (
ctx->bboxes_fifo) {
786 }
788 }
791 }
792
794 {
798 int ret, width_idx, height_idx;
799
804 }
807 ctx->scale_width = model_input.
dims[width_idx] == -1 ?
inlink->w :
808 model_input.
dims[width_idx];
809 ctx->scale_height = model_input.
dims[height_idx] == -1 ?
inlink->h :
810 model_input.
dims[height_idx];
811
812 return 0;
813 }
814
816 {
820 },
821 };
822
824 .
p.
name =
"dnn_detect",
826 .p.priv_class = &dnn_detect_class,
835 };