1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
/**
 * @file
 * Implements an object detection filter using deep learning networks.
 */
23
34
41
60
/* Byte offset of option field x inside the dnnctx member of DnnDetectContext. */
#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
/* Byte offset of option field x stored directly in DnnDetectContext. */
#define OFFSET2(x) offsetof(DnnDetectContext, x)
/*
 * Flag set used by the option table entries. The expansion is parenthesized
 * so it stays a single operand when combined with other operators
 * (e.g. FLAGS & mask would otherwise bind as A | (B & mask)).
 */
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
66 #if (CONFIG_LIBTENSORFLOW == 1)
68 #endif
69 #if (CONFIG_LIBOPENVINO == 1)
71 #endif
76 {
"ssd",
"output shape [1, 1, N, 7]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_SSD }, 0, 0,
FLAGS, .unit =
"model_type" },
77 {
"yolo",
"output shape [1, N*Cx*Cy*DetectionBox]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV1V2 }, 0, 0,
FLAGS, .unit =
"model_type" },
78 {
"yolov3",
"outputs shape [1, N*D, Cx, Cy]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV3 }, 0, 0,
FLAGS, .unit =
"model_type" },
79 {
"yolov4",
"outputs shape [1, N*D, Cx, Cy]", 0,
AV_OPT_TYPE_CONST, { .i64 =
DDMT_YOLOV4 }, 0, 0,
FLAGS, .unit =
"model_type" },
85 };
86
88
90 return 1.f / (1.f +
exp(-x));
91 }
92
/**
 * Pass-through transform: hands the input value back unchanged.
 *
 * Assigned to the raw-data post-processing callback for model types whose
 * outputs need no further transformation.
 *
 * @param x input value
 * @return x, unmodified
 */
static inline float linear(float x)
{
    const float passthrough = x;

    return passthrough;
}
96
98 {
99 float max_prob = 0;
100 int label_id = 0;
101 for (
int i = 0;
i < nb_classes;
i++) {
102 if (label_data[
i * cell_size] > max_prob) {
103 max_prob = label_data[
i * cell_size];
105 }
106 }
107 return label_id;
108 }
109
111 {
112 char *saveptr =
NULL, *token;
113 float *anchors_buf;
114 int nb_anchor = 0,
i = 0;
115 while(anchors_str[
i] !=
'0円') {
116 if(anchors_str[
i] ==
'&')
117 nb_anchor++;
119 }
120 nb_anchor++;
121 anchors_buf =
av_mallocz(nb_anchor *
sizeof(**anchors));
122 if (!anchors_buf) {
123 return 0;
124 }
125 for (
int i = 0;
i < nb_anchor;
i++) {
126 token =
av_strtok(anchors_str,
"&", &saveptr);
127 if (!token) {
129 return 0;
130 }
131 anchors_buf[
i] = strtof(token,
NULL);
133 }
134 *anchors = anchors_buf;
135 return nb_anchor;
136 }
137
138 /* Calculate Intersection Over Union */
140 {
141 float overlapping_width =
FFMIN(bbox1->
x + bbox1->
w, bbox2->
x + bbox2->
w) -
FFMAX(bbox1->
x, bbox2->
x);
142 float overlapping_height =
FFMIN(bbox1->
y + bbox1->
h, bbox2->
y + bbox2->
h) -
FFMAX(bbox1->
y, bbox2->
y);
143 float intersection_area =
144 (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
145 float union_area = bbox1->
w * bbox1->
h + bbox2->
w * bbox2->
h - intersection_area;
146 return intersection_area / union_area;
147 }
148
151 {
153 float conf_threshold =
ctx->confidence;
154 int detection_boxes, box_size;
155 int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;
156 int nb_classes =
ctx->nb_classes;
158 float *anchors =
ctx->anchors;
161 int is_NHWC = 0;
162
164 cell_w =
ctx->cell_w;
165 cell_h =
ctx->cell_h;
166 scale_w = cell_w;
167 scale_h = cell_h;
168 } else {
169 if (
output[output_index].dims[2] !=
output[output_index].dims[3] &&
170 output[output_index].dims[2] ==
output[output_index].dims[1]) {
171 is_NHWC = 1;
172 cell_w =
output[output_index].dims[2];
173 cell_h =
output[output_index].dims[1];
174 } else {
175 cell_w =
output[output_index].dims[3];
176 cell_h =
output[output_index].dims[2];
177 }
178 scale_w =
ctx->scale_width;
179 scale_h =
ctx->scale_height;
180 }
181 box_size = nb_classes + 5;
182
183 switch (
ctx->model_type) {
186 post_process_raw_data =
linear;
187 break;
189 post_process_raw_data =
sigmoid;
190 break;
191 }
192
193 if (!cell_h || !cell_w) {
196 }
197
198 if (!nb_classes) {
201 }
202
203 if (!anchors) {
206 }
207
208 if (
output[output_index].dims[1] *
output[output_index].dims[2] *
209 output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
212 }
213 detection_boxes =
output[output_index].dims[1] *
214 output[output_index].dims[2] *
215 output[output_index].dims[3] / box_size / cell_w / cell_h;
216
217 anchors = anchors + (detection_boxes * output_index * 2);
/**
 * Find all candidate bounding boxes.
 * yolo output can be reshaped to [B, N*D, Cx, Cy]
 * Detection box 'D' has format [`x`, `y`, `w`, `h`, `box_score`, `class_no_1`, ...]
 **/
223 for (int box_id = 0; box_id < detection_boxes; box_id++) {
224 for (int cx = 0; cx < cell_w; cx++)
225 for (int cy = 0; cy < cell_h; cy++) {
226 float x, y,
w,
h, conf;
227 float *detection_boxes_data;
228 int label_id;
229
230 if (is_NHWC) {
232 ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
233 conf = post_process_raw_data(detection_boxes_data[4]);
234 } else {
235 detection_boxes_data =
output_data + box_id * box_size * cell_w * cell_h;
236 conf = post_process_raw_data(
237 detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
238 }
239
240 if (is_NHWC) {
241 x = post_process_raw_data(detection_boxes_data[0]);
242 y = post_process_raw_data(detection_boxes_data[1]);
243 w = detection_boxes_data[2];
244 h = detection_boxes_data[3];
246 conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
247 } else {
248 x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
249 y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
250 w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
251 h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
253 detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
254 conf = conf * post_process_raw_data(
255 detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
256 }
257 if (conf < conf_threshold) {
258 continue;
259 }
260
262 if (!bbox)
264
267 bbox->
x = (cx + x) / cell_w *
frame->
width - bbox->
w / 2;
270 if (
ctx->labels && label_id < ctx->label_count) {
272 } else {
274 }
275
279 }
281 }
282 }
283 return 0;
284 }
285
287 {
289 float conf_threshold =
ctx->confidence;
291 int nb_bboxes = 0;
295 return 0;
296 }
297
298 /* remove overlap bboxes */
308 nb_bboxes++;
309 break;
310 }
311 }
312 }
317 return -1;
318 }
320
324
327 memcpy(bbox, candidate_bbox, sizeof(*bbox));
328 nb_bboxes--;
329 }
331 }
332 return 0;
333 }
334
336 {
344 return 0;
345 }
346
349 {
351 for (
int i = 0;
i < nb_outputs;
i++) {
355 }
359 return 0;
360 }
361
364 {
366 float conf_threshold =
ctx->confidence;
367 int proposal_count = 0;
368 int detect_size = 0;
369 float *detections =
NULL, *labels =
NULL;
370 int nb_bboxes = 0;
373 int scale_w =
ctx->scale_width;
374 int scale_h =
ctx->scale_height;
375
376 if (nb_outputs == 1 &&
output->dims[3] == 7) {
377 proposal_count =
output->dims[2];
378 detect_size =
output->dims[3];
379 detections =
output->data;
380 }
else if (nb_outputs == 2 &&
output[0].dims[3] == 5) {
381 proposal_count =
output[0].dims[2];
382 detect_size =
output[0].dims[3];
383 detections =
output[0].data;
385 }
else if (nb_outputs == 2 &&
output[1].dims[3] == 5) {
386 proposal_count =
output[1].dims[2];
387 detect_size =
output[1].dims[3];
388 detections =
output[1].data;
390 } else {
393 }
394
395 if (proposal_count == 0)
396 return 0;
397
398 for (
int i = 0;
i < proposal_count; ++
i) {
399 float conf;
400 if (nb_outputs == 1)
401 conf = detections[
i * detect_size + 2];
402 else
403 conf = detections[
i * detect_size + 4];
404 if (conf < conf_threshold) {
405 continue;
406 }
407 nb_bboxes++;
408 }
409
410 if (nb_bboxes == 0) {
412 return 0;
413 }
414
418 return -1;
419 }
420
422
423 for (
int i = 0;
i < proposal_count; ++
i) {
424 int av_unused image_id = (
int)detections[
i * detect_size + 0];
425 int label_id;
426 float conf, x0, y0, x1, y1;
427
428 if (nb_outputs == 1) {
429 label_id = (
int)detections[
i * detect_size + 1];
430 conf = detections[
i * detect_size + 2];
431 x0 = detections[
i * detect_size + 3];
432 y0 = detections[
i * detect_size + 4];
433 x1 = detections[
i * detect_size + 5];
434 y1 = detections[
i * detect_size + 6];
435 } else {
436 label_id = (
int)labels[
i];
437 x0 = detections[
i * detect_size] / scale_w;
438 y0 = detections[
i * detect_size + 1] / scale_h;
439 x1 = detections[
i * detect_size + 2] / scale_w;
440 y1 = detections[
i * detect_size + 3] / scale_h;
441 conf = detections[
i * detect_size + 4];
442 }
443
444 if (conf < conf_threshold) {
445 continue;
446 }
447
453
456
457 if (
ctx->labels && label_id < ctx->label_count) {
459 } else {
461 }
462
463 nb_bboxes--;
464 if (nb_bboxes == 0) {
465 break;
466 }
467 }
468 return 0;
469 }
470
473 {
477
479 if (sd) {
481 return -1;
482 }
483
484 switch (
ctx->model_type) {
489 break;
494 break;
500 break;
501 }
502 return 0;
503 }
504
506 {
508 int proposal_count;
509 float conf_threshold =
ctx->confidence;
510 float *conf, *position, *label_id, x0, y0, x1, y1;
511 int nb_bboxes = 0;
515
518 position =
output[3].data;
519 label_id =
output[2].data;
520
522 if (sd) {
524 return -1;
525 }
526
527 for (
int i = 0;
i < proposal_count; ++
i) {
528 if (conf[
i] < conf_threshold)
529 continue;
530 nb_bboxes++;
531 }
532
533 if (nb_bboxes == 0) {
535 return 0;
536 }
537
541 return -1;
542 }
543
545
546 for (
int i = 0;
i < proposal_count; ++
i) {
547 y0 = position[
i * 4];
548 x0 = position[
i * 4 + 1];
549 y1 = position[
i * 4 + 2];
550 x1 = position[
i * 4 + 3];
551
553
554 if (conf[
i] < conf_threshold) {
555 continue;
556 }
557
562
565
566 if (
ctx->labels && label_id[
i] <
ctx->label_count) {
568 } else {
570 }
571
572 nb_bboxes--;
573 if (nb_bboxes == 0) {
574 break;
575 }
576 }
577 return 0;
578 }
579
581 {
589 default:
592 }
593 }
594
596 {
597 for (
int i = 0;
i <
ctx->label_count;
i++) {
599 }
600 ctx->label_count = 0;
602 }
603
605 {
606 int line_len;
607 FILE *file;
609
611 if (!file){
614 }
615
616 while (!feof(file)) {
617 char *label;
618 char buf[256];
619 if (!fgets(buf, 256, file)) {
620 break;
621 }
622
623 line_len = strlen(buf);
624 while (line_len) {
625 int i = line_len - 1;
626 if (buf[
i] ==
'\n' || buf[
i] ==
'\r' || buf[
i] ==
' ') {
628 line_len--;
629 } else {
630 break;
631 }
632 }
633
634 if (line_len == 0) // empty line
635 continue;
636
639 fclose(file);
641 }
642
644 if (!label) {
646 fclose(file);
648 }
649
652 fclose(file);
655 }
656 }
657
658 fclose(file);
659 return 0;
660 }
661
663 {
664 switch(backend_type) {
666 if (output_nb != 4) {
668 but get %d instead\n", output_nb);
670 }
671 return 0;
673 return 0;
674 default:
677 }
678 return 0;
679 }
680
682 {
686
694 if (!
ctx->bboxes_fifo)
697
698 if (
ctx->labels_filename) {
700 }
701 if (
ctx->anchors_str) {
706 }
708 }
709 return 0;
710 }
711
719 };
720
722 {
726
729 return -1;
730 }
731
732 do {
740 if (out_pts)
741 *out_pts = in_frame->
pts +
pts;
742 }
745
746 return 0;
747 }
748
750 {
757 int got_frame = 0;
758 int async_state;
759
761
762 do {
763 // drain all input frames
770 }
771 }
773
774 // drain all processed frames
775 do {
783 got_frame = 1;
784 }
786
787 // if frame got, schedule to next filter
788 if (got_frame)
789 return 0;
790
797 }
798 }
799
801
802 return 0;
803 }
804
806 {
813 }
817 }
818
820 {
824 int ret, width_idx, height_idx;
825
830 }
833 ctx->scale_width = model_input.
dims[width_idx] == -1 ?
inlink->w :
834 model_input.
dims[width_idx];
835 ctx->scale_height = model_input.
dims[height_idx] == -1 ?
inlink->h :
836 model_input.
dims[height_idx];
837
838 return 0;
839 }
840
842 {
846 },
847 };
848
850 .
name =
"dnn_detect",
858 .priv_class = &dnn_detect_class,
860 };