FFmpeg: libavfilter/vf_dnn_detect.c Source File

76 { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, .unit = "model_type" },

77 { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, .unit = "model_type" },

78 { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, .unit = "model_type" },

79 { "yolov3", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV3 }, 0, 0, FLAGS, .unit = "model_type" },

80 { "yolov4", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV4 }, 0, 0, FLAGS, .unit = "model_type" },

81 { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },

82 { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },

83 { "nb_classes", "The number of class", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },

84 { "anchors", "anchors, split by '&'", OFFSET2(anchors), AV_OPT_TYPE_FLOAT | AV_OPT_TYPE_FLAG_ARRAY, { .arr = &anchor_array_def }, FLT_MIN, FLT_MAX, FLAGS },

85 { NULL }

86 };

88 AVFILTER_DNN_DEFINE_CLASS(dnn_detect, DNN_TF | DNN_OV);

90 static inline float sigmoid(float x) {

91 return 1.f / (1.f + exp(-x));

92 }

/**
 * Identity post-processing: hands the raw network output back unchanged.
 */
static inline float linear(float x)
{
    const float unchanged = x;

    return unchanged;
}

/**
 * Find the class with the highest score for one detection box.
 *
 * @param nb_classes number of class scores to scan
 * @param cell_size  distance, in floats, between two consecutive class scores
 *                   (1 when the scores are stored contiguously)
 * @param label_data pointer to the score of class 0
 * @return index of the highest score; ties keep the lowest index, and 0 is
 *         returned when nb_classes is not positive
 */
static int dnn_detect_get_label_id(int nb_classes, int cell_size, const float *label_data)
{
    float max_prob;
    int label_id = 0;

    if (nb_classes <= 0)
        return 0;

    /* Seed with the first score instead of 0 so that raw scores which are
     * all negative still yield the true maximum rather than class 0. */
    max_prob = label_data[0];
    for (int i = 1; i < nb_classes; i++) {
        const float prob = label_data[i * cell_size];

        if (prob > max_prob) {
            max_prob = prob;
            label_id = i;
        }
    }
    return label_id;
}

110

111 /* Calculate Intersection Over Union */

112 static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)

113 {

114 float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);

115 float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);

116 float intersection_area =

117 (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;

118 float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;

119 return intersection_area / union_area;

120 }

121

122 static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,

123 AVFilterContext *filter_ctx)

124 {

125 DnnDetectContext *ctx = filter_ctx->priv;

126 float conf_threshold = ctx->confidence;

127 int detection_boxes, box_size;

128 int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;

129 int nb_classes = ctx->nb_classes;

130 float *output_data = output[output_index].data;

131 float *anchors = ctx->anchors;

132 AVDetectionBBox *bbox;

133 float (*post_process_raw_data)(float x) = linear;

134 int is_NHWC = 0;

135

136 if (ctx->model_type == DDMT_YOLOV1V2) {

137 cell_w = ctx->cell_w;

138 cell_h = ctx->cell_h;

139 scale_w = cell_w;

140 scale_h = cell_h;

141 } else {

142 if (output[output_index].dims[2] != output[output_index].dims[3] &&

143 output[output_index].dims[2] == output[output_index].dims[1]) {

144 is_NHWC = 1;

145 cell_w = output[output_index].dims[2];

146 cell_h = output[output_index].dims[1];

147 } else {

148 cell_w = output[output_index].dims[3];

149 cell_h = output[output_index].dims[2];

150 }

151 scale_w = ctx->scale_width;

152 scale_h = ctx->scale_height;

153 }

154 box_size = nb_classes + 5;

155

156 switch (ctx->model_type) {

157 case DDMT_YOLOV1V2:

158 case DDMT_YOLOV3:

159 post_process_raw_data = linear;

160 break;

161 case DDMT_YOLOV4:

162 post_process_raw_data = sigmoid;

163 break;

164 }

165

166 if (!cell_h || !cell_w) {

167 av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n");

168 return AVERROR(EINVAL);

169 }

170

171 if (!nb_classes) {

172 av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");

173 return AVERROR(EINVAL);

174 }

175

176 if (output[output_index].dims[1] * output[output_index].dims[2] *

177 output[output_index].dims[3] % (box_size * cell_w * cell_h)) {

178 av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");

179 return AVERROR(EINVAL);

180 }

181 detection_boxes = output[output_index].dims[1] *

182 output[output_index].dims[2] *

183 output[output_index].dims[3] / box_size / cell_w / cell_h;

184

185 anchors = anchors + (detection_boxes * output_index * 2);

186 /**

187 * find all candidate bbox

188 * yolo output can be reshaped to [B, N*D, Cx, Cy]

189 * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,]

190 **/

191 for (int box_id = 0; box_id < detection_boxes; box_id++) {

192 for (int cx = 0; cx < cell_w; cx++)

193 for (int cy = 0; cy < cell_h; cy++) {

194 float x, y, w, h, conf;

195 float *detection_boxes_data;

196 int label_id;

197

198 if (is_NHWC) {

199 detection_boxes_data = output_data +

200 ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;

201 conf = post_process_raw_data(detection_boxes_data[4]);

202 } else {

203 detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;

204 conf = post_process_raw_data(

205 detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);

206 }

207

208 if (is_NHWC) {

209 x = post_process_raw_data(detection_boxes_data[0]);

210 y = post_process_raw_data(detection_boxes_data[1]);

211 w = detection_boxes_data[2];

212 h = detection_boxes_data[3];

213 label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5);

214 conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);

215 } else {

216 x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);

217 y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);

218 w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];

219 h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];

220 label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,

221 detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);

222 conf = conf * post_process_raw_data(

223 detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);

224 }

225 if (conf < conf_threshold) {

226 continue;

227 }

228

229 bbox = av_mallocz(sizeof(*bbox));

230 if (!bbox)

231 return AVERROR(ENOMEM);

232

233 bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;

234 bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;

235 bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;

236 bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;

237 bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);

238 if (ctx->labels && label_id < ctx->label_count) {

239 av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));

240 } else {

241 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);

242 }

243

244 if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {

245 av_freep(&bbox);

246 return AVERROR(ENOMEM);

247 }

248 bbox = NULL;

249 }

250 }

251 return 0;

252 }

253

254 static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)

255 {

256 DnnDetectContext *ctx = filter_ctx->priv;

257 float conf_threshold = ctx->confidence;

258 AVDetectionBBox *bbox;

259 int nb_bboxes = 0;

260 AVDetectionBBoxHeader *header;

261 if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {

262 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");

263 return 0;

264 }

265

266 /* remove overlap bboxes */

267 for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){

268 av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);

269 for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {

270 AVDetectionBBox *overlap_bbox;

271 av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);

272 if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&

273 av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&

274 dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {

275 bbox->classify_count = -1; // bad result

276 nb_bboxes++;

277 break;

278 }

279 }

280 }

281 nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;

282 header = av_detection_bbox_create_side_data(frame, nb_bboxes);

283 if (!header) {

284 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);

285 return -1;

286 }

287 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

288

289 while(av_fifo_can_read(ctx->bboxes_fifo)) {

290 AVDetectionBBox *candidate_bbox;

291 av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);

292

293 if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {

294 bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);

295 memcpy(bbox, candidate_bbox, sizeof(*bbox));

296 nb_bboxes--;

297 }

298 av_freep(&candidate_bbox);

299 }

300 return 0;

301 }

302

303 static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)

304 {

305 int ret = 0;

306 ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);

307 if (ret < 0)

308 return ret;

309 ret = dnn_detect_fill_side_data(frame, filter_ctx);

310 if (ret < 0)

311 return ret;

312 return 0;

313 }

314

315 static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,

316 AVFilterContext *filter_ctx, int nb_outputs)

317 {

318 int ret = 0;

319 for (int i = 0; i < nb_outputs; i++) {

320 ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);

321 if (ret < 0)

322 return ret;

323 }

324 ret = dnn_detect_fill_side_data(frame, filter_ctx);

325 if (ret < 0)

326 return ret;

327 return 0;

328 }

329

330 static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,

331 AVFilterContext *filter_ctx)

332 {

333 DnnDetectContext *ctx = filter_ctx->priv;

334 float conf_threshold = ctx->confidence;

335 int proposal_count = 0;

336 int detect_size = 0;

337 float *detections = NULL, *labels = NULL;

338 int nb_bboxes = 0;

339 AVDetectionBBoxHeader *header;

340 AVDetectionBBox *bbox;

341 int scale_w = ctx->scale_width;

342 int scale_h = ctx->scale_height;

343

344 if (nb_outputs == 1 && output->dims[3] == 7) {

345 proposal_count = output->dims[2];

346 detect_size = output->dims[3];

347 detections = output->data;

348 } else if (nb_outputs == 2 && output[0].dims[3] == 5) {

349 proposal_count = output[0].dims[2];

350 detect_size = output[0].dims[3];

351 detections = output[0].data;

352 labels = output[1].data;

353 } else if (nb_outputs == 2 && output[1].dims[3] == 5) {

354 proposal_count = output[1].dims[2];

355 detect_size = output[1].dims[3];

356 detections = output[1].data;

357 labels = output[0].data;

358 } else {

359 av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");

360 return AVERROR(EINVAL);

361 }

362

363 if (proposal_count == 0)

364 return 0;

365

366 for (int i = 0; i < proposal_count; ++i) {

367 float conf;

368 if (nb_outputs == 1)

369 conf = detections[i * detect_size + 2];

370 else

371 conf = detections[i * detect_size + 4];

372 if (conf < conf_threshold) {

373 continue;

374 }

375 nb_bboxes++;

376 }

377

378 if (nb_bboxes == 0) {

379 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");

380 return 0;

381 }

382

383 header = av_detection_bbox_create_side_data(frame, nb_bboxes);

384 if (!header) {

385 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);

386 return -1;

387 }

388

389 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

390

391 for (int i = 0; i < proposal_count; ++i) {

392 av_unused int image_id = (int)detections[i * detect_size + 0];

393 int label_id;

394 float conf, x0, y0, x1, y1;

395

396 if (nb_outputs == 1) {

397 label_id = (int)detections[i * detect_size + 1];

398 conf = detections[i * detect_size + 2];

399 x0 = detections[i * detect_size + 3];

400 y0 = detections[i * detect_size + 4];

401 x1 = detections[i * detect_size + 5];

402 y1 = detections[i * detect_size + 6];

403 } else {

404 label_id = (int)labels[i];

405 x0 = detections[i * detect_size] / scale_w;

406 y0 = detections[i * detect_size + 1] / scale_h;

407 x1 = detections[i * detect_size + 2] / scale_w;

408 y1 = detections[i * detect_size + 3] / scale_h;

409 conf = detections[i * detect_size + 4];

410 }

411

412 if (conf < conf_threshold) {

413 continue;

414 }

415

416 bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);

417 bbox->x = (int)(x0 * frame->width);

418 bbox->w = (int)(x1 * frame->width) - bbox->x;

419 bbox->y = (int)(y0 * frame->height);

420 bbox->h = (int)(y1 * frame->height) - bbox->y;

421

422 bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);

423 bbox->classify_count = 0;

424

425 if (ctx->labels && label_id < ctx->label_count) {

426 av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));

427 } else {

428 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);

429 }

430

431 nb_bboxes--;

432 if (nb_bboxes == 0) {

433 break;

434 }

435 }

436 return 0;

437 }

438

439 static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs,

440 AVFilterContext *filter_ctx)

441 {

442 AVFrameSideData *sd;

443 DnnDetectContext *ctx = filter_ctx->priv;

444 int ret = 0;

445

446 sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);

447 if (sd) {

448 av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n");

449 return -1;

450 }

451

452 switch (ctx->model_type) {

453 case DDMT_SSD:

454 ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);

455 if (ret < 0)

456 return ret;

457 break;

458 case DDMT_YOLOV1V2:

459 ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);

460 if (ret < 0)

461 return ret;

462 break;

463 case DDMT_YOLOV3:

464 case DDMT_YOLOV4:

465 ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);

466 if (ret < 0)

467 return ret;

468 break;

469 }

470 return 0;

471 }

472

473 static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)

474 {

475 DnnDetectContext *ctx = filter_ctx->priv;

476 int proposal_count;

477 float conf_threshold = ctx->confidence;

478 float *conf, *position, *label_id, x0, y0, x1, y1;

479 int nb_bboxes = 0;

480 AVFrameSideData *sd;

481 AVDetectionBBox *bbox;

482 AVDetectionBBoxHeader *header;

483

484 proposal_count = *(float *)(output[0].data);

485 conf = output[1].data;

486 position = output[3].data;

487 label_id = output[2].data;

488

489 sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);

490 if (sd) {

491 av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");

492 return -1;

493 }

494

495 for (int i = 0; i < proposal_count; ++i) {

496 if (conf[i] < conf_threshold)

497 continue;

498 nb_bboxes++;

499 }

500

501 if (nb_bboxes == 0) {

502 av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");

503 return 0;

504 }

505

506 header = av_detection_bbox_create_side_data(frame, nb_bboxes);

507 if (!header) {

508 av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);

509 return -1;

510 }

511

512 av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

513

514 for (int i = 0; i < proposal_count; ++i) {

515 y0 = position[i * 4];

516 x0 = position[i * 4 + 1];

517 y1 = position[i * 4 + 2];

518 x1 = position[i * 4 + 3];

519

520 bbox = av_get_detection_bbox(header, i);

521

522 if (conf[i] < conf_threshold) {

523 continue;

524 }

525

526 bbox->x = (int)(x0 * frame->width);

527 bbox->w = (int)(x1 * frame->width) - bbox->x;

528 bbox->y = (int)(y0 * frame->height);

529 bbox->h = (int)(y1 * frame->height) - bbox->y;

530

531 bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);

532 bbox->classify_count = 0;

533

534 if (ctx->labels && label_id[i] < ctx->label_count) {

535 av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));

536 } else {

537 snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);

538 }

539

540 nb_bboxes--;

541 if (nb_bboxes == 0) {

542 break;

543 }

544 }

545 return 0;

546 }

547

548 static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)

549 {

550 DnnDetectContext *ctx = filter_ctx->priv;

551 DnnContext *dnn_ctx = &ctx->dnnctx;

552 switch (dnn_ctx->backend_type) {

553 case DNN_OV:

554 return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);

555 case DNN_TF:

556 return dnn_detect_post_proc_tf(frame, output, filter_ctx);

557 default:

558 avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");

559 return AVERROR(EINVAL);

560 }

561 }

562

563 static void free_detect_labels(DnnDetectContext *ctx)

564 {

565 for (int i = 0; i < ctx->label_count; i++) {

566 av_freep(&ctx->labels[i]);

567 }

568 ctx->label_count = 0;

569 av_freep(&ctx->labels);

570 }

571

572 static int read_detect_label_file(AVFilterContext *context)

573 {

574 int line_len;

575 FILE *file;

576 DnnDetectContext *ctx = context->priv;

577

578 file = avpriv_fopen_utf8(ctx->labels_filename, "r");

579 if (!file){

580 av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);

581 return AVERROR(EINVAL);

582 }

583

584 while (!feof(file)) {

585 char *label;

586 char buf[256];

587 if (!fgets(buf, 256, file)) {

588 break;

589 }

590

591 line_len = strlen(buf);

592 while (line_len) {

593 int i = line_len - 1;

594 if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {

595 buf[i] = '0円';

596 line_len--;

597 } else {

598 break;

599 }

600 }

601

602 if (line_len == 0) // empty line

603 continue;

604

605 if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {

606 av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);

607 fclose(file);

608 return AVERROR(EINVAL);

609 }

610

611 label = av_strdup(buf);

612 if (!label) {

613 av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);

614 fclose(file);

615 return AVERROR(ENOMEM);

616 }

617

618 if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {

619 av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");

620 fclose(file);

621 av_freep(&label);

622 return AVERROR(ENOMEM);

623 }

624 }

625

626 fclose(file);

627 return 0;

628 }

629

630 static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)

631 {

632 switch(backend_type) {

633 case DNN_TF:

634 if (output_nb != 4) {

635 av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, \

636 but get %d instead\n", output_nb);

637 return AVERROR(EINVAL);

638 }

639 return 0;

640 case DNN_OV:

641 return 0;

642 default:

643 avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");

644 return AVERROR(EINVAL);

645 }

646 return 0;

647 }

648

649 static av_cold int dnn_detect_init(AVFilterContext *context)

650 {

651 DnnDetectContext *ctx = context->priv;

652 DnnContext *dnn_ctx = &ctx->dnnctx;

653 int ret;

654 int using_yolo = (ctx->model_type == DDMT_YOLOV3 ||

655 ctx->model_type == DDMT_YOLOV4 ||

656 ctx->model_type == DDMT_YOLOV1V2);

657

658 if (using_yolo && !ctx->anchors) {

659 av_log(ctx, AV_LOG_ERROR, "anchors is not set while being required for YOLO models\n");

660 return AVERROR(EINVAL);

661 }

662

663 ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);

664 if (ret < 0)

665 return ret;

666 ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);

667 if (ret < 0)

668 return ret;

669 ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);

670 if (!ctx->bboxes_fifo)

671 return AVERROR(ENOMEM);

672 ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);

673

674 if (ctx->labels_filename) {

675 ret = read_detect_label_file(context);

676 if (ret) {

677 return ret;

678 }

679 }

680

681 return 0;

682 }

683

684 static const enum AVPixelFormat pix_fmts[] = {

685 AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,

686 AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,

687 AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,

688 AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,

689 AV_PIX_FMT_NV12,

690 AV_PIX_FMT_NONE

691 };

692

693 static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)

694 {

695 DnnDetectContext *ctx = outlink->src->priv;

696 int ret;

697 DNNAsyncStatusType async_state;

698

699 ret = ff_dnn_flush(&ctx->dnnctx);

700 if (ret != 0) {

701 return -1;

702 }

703

704 do {

705 AVFrame *in_frame = NULL;

706 AVFrame *out_frame = NULL;

707 async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);

708 if (async_state == DAST_SUCCESS) {

709 ret = ff_filter_frame(outlink, in_frame);

710 if (ret < 0)

711 return ret;

712 if (out_pts)

713 *out_pts = in_frame->pts + pts;

714 }

715 av_usleep(5000);

716 } while (async_state >= DAST_NOT_READY);

717

718 return 0;

719 }

720

721 static int dnn_detect_activate(AVFilterContext *filter_ctx)

722 {

723 AVFilterLink *inlink = filter_ctx->inputs[0];

724 AVFilterLink *outlink = filter_ctx->outputs[0];

725 DnnDetectContext *ctx = filter_ctx->priv;

726 AVFrame *in = NULL;

727 int64_t pts;

728 int ret, status;

729 int got_frame = 0;

730 int async_state;

731

732 FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

733

734 do {

735 // drain all input frames

736 ret = ff_inlink_consume_frame(inlink, &in);

737 if (ret < 0)

738 return ret;

739 if (ret > 0) {

740 if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {

741 return AVERROR(EIO);

742 }

743 }

744 } while (ret > 0);

745

746 // drain all processed frames

747 do {

748 AVFrame *in_frame = NULL;

749 AVFrame *out_frame = NULL;

750 async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);

751 if (async_state == DAST_SUCCESS) {

752 ret = ff_filter_frame(outlink, in_frame);

753 if (ret < 0)

754 return ret;

755 got_frame = 1;

756 }

757 } while (async_state == DAST_SUCCESS);

758

759 // if frame got, schedule to next filter

760 if (got_frame)

761 return 0;

762

763 if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {

764 if (status == AVERROR_EOF) {

765 int64_t out_pts = pts;

766 ret = dnn_detect_flush_frame(outlink, pts, &out_pts);

767 ff_outlink_set_status(outlink, status, out_pts);

768 return ret;

769 }

770 }

771

772 FF_FILTER_FORWARD_WANTED(outlink, inlink);

773

774 return 0;

775 }

776

777 static av_cold void dnn_detect_uninit(AVFilterContext *context)

778 {

779 DnnDetectContext *ctx = context->priv;

780 AVDetectionBBox *bbox;

781 ff_dnn_uninit(&ctx->dnnctx);

782 if (ctx->bboxes_fifo) {

783 while (av_fifo_can_read(ctx->bboxes_fifo)) {

784 av_fifo_read(ctx->bboxes_fifo, &bbox, 1);

785 av_freep(&bbox);

786 }

787 av_fifo_freep2(&ctx->bboxes_fifo);

788 }

789 av_freep(&ctx->anchors);

790 free_detect_labels(ctx);

791 }

792

793 static int config_input(AVFilterLink *inlink)

794 {

795 AVFilterContext *context = inlink->dst;

796 DnnDetectContext *ctx = context->priv;

797 DNNData model_input;

798 int ret, width_idx, height_idx;

799

800 ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);

801 if (ret != 0) {

802 av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");

803 return ret;

804 }

805 width_idx = dnn_get_width_idx_by_layout(model_input.layout);

806 height_idx = dnn_get_height_idx_by_layout(model_input.layout);

807 ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w :

808 model_input.dims[width_idx];

809 ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h :

810 model_input.dims[height_idx];

811

812 return 0;

813 }

814

815 static const AVFilterPad dnn_detect_inputs[] = {

816 {

817 .name = "default",

818 .type = AVMEDIA_TYPE_VIDEO,

819 .config_props = config_input,

820 },

821 };

822

823 const FFFilter ff_vf_dnn_detect = {

824 .p.name = "dnn_detect",

825 .p.description = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),

826 .p.priv_class = &dnn_detect_class,

827 .priv_size = sizeof(DnnDetectContext),

828 .preinit = ff_dnn_filter_init_child_class,

829 .init = dnn_detect_init,

830 .uninit = dnn_detect_uninit,

831 FILTER_INPUTS(dnn_detect_inputs),

832 FILTER_OUTPUTS(ff_video_default_filterpad),

833 FILTER_PIXFMTS_ARRAY(pix_fmts),

834 .activate = dnn_detect_activate,

835 };

pix_fmts

static enum AVPixelFormat pix_fmts[]

Definition: vf_dnn_detect.c:684

DnnDetectContext::nb_classes

int nb_classes

Definition: vf_dnn_detect.c:53

AVPixelFormat

Pixel format.

Definition: pixfmt.h:71

dnn_detect_parse_yolo_output

static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:122

AVERROR

Filter the word "frame" indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions

opt.h

FILTER_PIXFMTS_ARRAY

#define FILTER_PIXFMTS_ARRAY(array)

Definition: filters.h:243

av_frame_get_side_data

AVFrameSideData * av_frame_get_side_data(const AVFrame *frame, enum AVFrameSideDataType type)

Definition: frame.c:659

AVOptionArrayDef::sep

char sep

Separator between array elements in string representations of this option, used by av_opt_set() and a...

Definition: opt.h:423

AVOptionArrayDef

May be set as default_val for AV_OPT_TYPE_FLAG_ARRAY options.

Definition: opt.h:395

ff_filter_frame

int ff_filter_frame(AVFilterLink *link, AVFrame *frame)

Send a frame of data to the next filter.

Definition: avfilter.c:1067

AVERROR_EOF

#define AVERROR_EOF

End of file.

Definition: error.h:57

int64_t

long long int64_t

Definition: coverity.c:34

output

filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output

Definition: filter_design.txt:226

inlink

The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink

Definition: filter_design.txt:212

av_unused

#define av_unused

Definition: attributes.h:151

av_fifo_peek

int av_fifo_peek(const AVFifo *f, void *buf, size_t nb_elems, size_t offset)

Read data from a FIFO without modifying FIFO state.

Definition: fifo.c:255

FILTER_INPUTS

#define FILTER_INPUTS(array)

Definition: filters.h:263

AVFrame

This structure describes decoded (raw) audio or video data.

Definition: frame.h:427

AVFrame::pts

int64_t pts

Presentation timestamp in time_base units (time when frame should be shown to user).

Definition: frame.h:529

uint8_t w

Definition: llviddspenc.c:38

read_detect_label_file

static int read_detect_label_file(AVFilterContext *context)

Definition: vf_dnn_detect.c:572

AVOption

AVOption.

Definition: opt.h:429

data

const char data[16]

Definition: mxf.c:149

dnn_detect_post_proc_ssd

static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:330

dnn_detect_init

static av_cold int dnn_detect_init(AVFilterContext *context)

Definition: vf_dnn_detect.c:649

output_data

static int output_data(MLPDecodeContext *m, unsigned int substr, AVFrame *frame, int *got_frame_ptr)

Write the audio data into the output buffer.

Definition: mlpdec.c:1107

AV_LOG_VERBOSE

#define AV_LOG_VERBOSE

Detailed information.

Definition: log.h:226

float.h

dnn_detect_inputs

static const AVFilterPad dnn_detect_inputs[]

Definition: vf_dnn_detect.c:815

AV_PIX_FMT_BGR24

@ AV_PIX_FMT_BGR24

packed RGB 8:8:8, 24bpp, BGRBGR...

Definition: pixfmt.h:76

preinit

static av_cold int preinit(AVFilterContext *ctx)

Definition: af_aresample.c:48

FFMAX

#define FFMAX(a, b)

Definition: macros.h:47

AVFilter::name

const char * name

Filter name.

Definition: avfilter.h:220

dnn_filter_common.h

AVDetectionBBox::y

int y

Definition: detection_bbox.h:32

video.h

AVFilterLink

A link between two filters.

Definition: avfilter.h:395

FF_FILTER_FORWARD_STATUS_BACK

#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)

Forward the status on an output link to an input link.

Definition: filters.h:638

AVFILTER_DNN_DEFINE_CLASS

AVFILTER_DNN_DEFINE_CLASS(dnn_detect, DNN_TF|DNN_OV)

ff_inlink_consume_frame

int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)

Take a frame from the link's FIFO and update the link's stats.

Definition: avfilter.c:1517

fifo.h

AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE

#define AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE

Definition: detection_bbox.h:36

dnn_get_width_idx_by_layout

static int dnn_get_width_idx_by_layout(DNNLayout layout)

Definition: dnn_interface.h:197

AVDetectionBBox::detect_label

char detect_label[AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE]

Detect result with confidence.

Definition: detection_bbox.h:41

AVFilterContext::priv

void * priv

private data for use by the filter

Definition: avfilter.h:289

av_fifo_write

int av_fifo_write(AVFifo *f, const void *buf, size_t nb_elems)

Write data into a FIFO.

Definition: fifo.c:188

DnnContext

Definition: dnn_interface.h:143

dnn_detect_IOU

static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)

Definition: vf_dnn_detect.c:112

filter_ctx

static FilteringContext * filter_ctx

Definition: transcode.c:52

ff_dnn_filter_init_child_class

int ff_dnn_filter_init_child_class(AVFilterContext *filter)

Definition: dnn_filter_common.c:61

dnn_detect_uninit

static av_cold void dnn_detect_uninit(AVFilterContext *context)

Definition: vf_dnn_detect.c:777

DnnDetectContext

Definition: vf_dnn_detect.c:43

pts

static int64_t pts

Definition: transcode_aac.c:644

DnnDetectContext::model_type

DNNDetectionModelType model_type

Definition: vf_dnn_detect.c:50

AVFilterPad

A filter pad used for either input or output.

Definition: filters.h:39

av_get_detection_bbox

static av_always_inline AVDetectionBBox * av_get_detection_bbox(const AVDetectionBBoxHeader *header, unsigned int idx)

Definition: detection_bbox.h:84

DnnDetectContext::scale_height

int scale_height

Definition: vf_dnn_detect.c:56

dnn_detect_post_proc_ov

static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:439

DNN_TF

@ DNN_TF

Definition: dnn_interface.h:36

AV_LOG_ERROR

#define AV_LOG_ERROR

Something went wrong and cannot losslessly be recovered.

Definition: log.h:210

av_cold

#define av_cold

Definition: attributes.h:106

av_fifo_read

int av_fifo_read(AVFifo *f, void *buf, size_t nb_elems)

Read data from a FIFO.

Definition: fifo.c:240

ff_video_default_filterpad

const AVFilterPad ff_video_default_filterpad[1]

An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_VIDEO.

Definition: video.c:37

DnnDetectContext::bboxes_fifo

AVFifo * bboxes_fifo

Definition: vf_dnn_detect.c:54

FFFilter

Definition: filters.h:266

float

Definition: af_crystalizer.c:122

ff_outlink_set_status

static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)

Set the status field of a link from the source filter.

Definition: filters.h:628

ff_dnn_set_detect_post_proc

int ff_dnn_set_detect_post_proc(DnnContext *ctx, DetectPostProc post_proc)

Definition: dnn_filter_common.c:146

free_detect_labels

static void free_detect_labels(DnnDetectContext *ctx)

Definition: vf_dnn_detect.c:563

DNNData

Definition: dnn_interface.h:69

dnn_detect_post_proc_yolov3

static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx, int nb_outputs)

Definition: vf_dnn_detect.c:315

filters.h

ff_dnn_get_result

DNNAsyncStatusType ff_dnn_get_result(DnnContext *ctx, AVFrame **in_frame, AVFrame **out_frame)

Definition: dnn_filter_common.c:198

ctx

AVFormatContext * ctx

Definition: movenc.c:49

config_input

static int config_input(AVFilterLink *inlink)

Definition: vf_dnn_detect.c:793

linear

static float linear(float x)

Definition: vf_dnn_detect.c:94

ff_vf_dnn_detect

const FFFilter ff_vf_dnn_detect

Definition: vf_dnn_detect.c:823

DnnDetectContext::scale_width

int scale_width

Definition: vf_dnn_detect.c:55

AV_PIX_FMT_YUV420P

@ AV_PIX_FMT_YUV420P

planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)

Definition: pixfmt.h:73

av_usleep

int av_usleep(unsigned usec)

Sleep for a period of time.

Definition: time.c:84

FILTER_OUTPUTS

#define FILTER_OUTPUTS(array)

Definition: filters.h:264

AV_PIX_FMT_GRAYF32

#define AV_PIX_FMT_GRAYF32

Definition: pixfmt.h:582

file_open.h

ff_dnn_get_input

int ff_dnn_get_input(DnnContext *ctx, DNNData *input)

Definition: dnn_filter_common.c:158

DNN_OV

@ DNN_OV

Definition: dnn_interface.h:37

context

it's the only field you need to keep assuming you have a context There is some magic you don't need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are in without and describe what they for example set the foo of the bar offset is the offset of the field in your context

Definition: writing_filters.txt:91

AVClass

Describe the class of an AVClass context structure.

Definition: log.h:76

NULL

#define NULL

Definition: coverity.c:32

AVDetectionBBoxHeader

Definition: detection_bbox.h:56

DnnDetectContext::dnnctx

DnnContext dnnctx

Definition: vf_dnn_detect.c:45

DnnDetectContext::cell_h

int cell_h

Definition: vf_dnn_detect.c:52

dnn_detect_activate

static int dnn_detect_activate(AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:721

DnnDetectContext::labels_filename

char * labels_filename

Definition: vf_dnn_detect.c:47

av_fifo_can_read

size_t av_fifo_can_read(const AVFifo *f)

Definition: fifo.c:87

DnnDetectContext::labels

char ** labels

Definition: vf_dnn_detect.c:48

DDMT_YOLOV3

@ DDMT_YOLOV3

Definition: vf_dnn_detect.c:39

time.h

AV_PIX_FMT_GRAY8

@ AV_PIX_FMT_GRAY8

Y, 8bpp.

Definition: pixfmt.h:81

exp

int8_t exp

Definition: eval.c:73

ff_dnn_flush

int ff_dnn_flush(DnnContext *ctx)

Definition: dnn_filter_common.c:203

ff_inlink_acknowledge_status

int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)

Test and acknowledge the change of status on the link.

Definition: avfilter.c:1464

FLAGS

#define FLAGS

Definition: vf_dnn_detect.c:65

DDMT_YOLOV4

@ DDMT_YOLOV4

Definition: vf_dnn_detect.c:40

av_detection_bbox_create_side_data

AVDetectionBBoxHeader * av_detection_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes)

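Allocates memory for AVDetectionBBoxHeader, plus an array of nb_bboxes AVDetectionBBox, in the given AVFrame as AVFrameSideData of type AV_FRAME_DATA_DETECTION_BBOXES.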

Definition: detection_bbox.c:52

DnnContext::backend_type

DNNBackendType backend_type

Definition: dnn_interface.h:149

init

int(* init)(AVBSFContext *ctx)

Definition: dts2pts.c:550

AV_PIX_FMT_RGB24

@ AV_PIX_FMT_RGB24

packed RGB 8:8:8, 24bpp, RGBRGB...

Definition: pixfmt.h:75

AVFifo

Definition: fifo.c:35

NULL_IF_CONFIG_SMALL

#define NULL_IF_CONFIG_SMALL(x)

Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.

Definition: internal.h:94

DnnDetectContext::label_count

int label_count

Definition: vf_dnn_detect.c:49

DAST_SUCCESS

@ DAST_SUCCESS

Definition: dnn_interface.h:53

AVDetectionBBox::w

int w

Definition: detection_bbox.h:33

AV_OPT_TYPE_FLAG_ARRAY

@ AV_OPT_TYPE_FLAG_ARRAY

May be combined with another regular option type to declare an array option.

Definition: opt.h:346

DNNBackendType

Definition: dnn_interface.h:35

DnnContext::nb_outputs

uint32_t nb_outputs

Definition: dnn_interface.h:156

av_make_q

static AVRational av_make_q(int num, int den)

Create an AVRational.

Definition: rational.h:71

avpriv_report_missing_feature

void avpriv_report_missing_feature(void *avc, const char *msg, ...) av_printf_format(2, 3)

Log a generic warning message about a missing feature.

AVFilterLink::src

AVFilterContext * src

source filter

Definition: avfilter.h:396

sigmoid

static float sigmoid(float x)

Definition: vf_dnn_detect.c:90

header

static const uint8_t header[24]

Definition: sdr2.c:68

DNNData::layout

DNNLayout layout

Definition: dnn_interface.h:75

AVDetectionBBox::classify_count

uint32_t classify_count

Definition: detection_bbox.h:51

DDMT_YOLOV1V2

@ DDMT_YOLOV1V2

Definition: vf_dnn_detect.c:38

FF_FILTER_FORWARD_WANTED

FF_FILTER_FORWARD_WANTED(outlink, inlink)

dnn_detect_options

static const AVOption dnn_detect_options[]

Definition: vf_dnn_detect.c:66

AV_OPT_TYPE_FLOAT

@ AV_OPT_TYPE_FLOAT

Underlying C type is float.

Definition: opt.h:271

dnn_detect_flush_frame

static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)

Definition: vf_dnn_detect.c:693

dnn_detect_post_proc

static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:548

uninit

static void uninit(AVBSFContext *ctx)

Definition: pcm_rechunk.c:68

#define i(width, name, range_min, range_max)

Definition: cbs_h2645.c:256

DFT_ANALYTICS_DETECT

@ DFT_ANALYTICS_DETECT

Definition: dnn_interface.h:59

check_output_nb

static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)

Definition: vf_dnn_detect.c:630

FFMIN

#define FFMIN(a, b)

Definition: macros.h:49

av_mallocz

void * av_mallocz(size_t size)

Allocate a memory block with alignment suitable for all memory accesses (including vectors if available on the CPU) and zero all the bytes of the block.

Definition: mem.c:256

AVFilterPad::name

const char * name

Pad name.

Definition: filters.h:45

avpriv_fopen_utf8

FILE * avpriv_fopen_utf8(const char *path, const char *mode)

Open a file using a UTF-8 filename.

Definition: file_open.c:161

dnn_detect_post_proc_yolo

static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:303

DnnDetectContext::confidence

float confidence

Definition: vf_dnn_detect.c:46

av_cmp_q

static int av_cmp_q(AVRational a, AVRational b)

Compare two rationals.

Definition: rational.h:89

ret

Definition: filter_design.txt:187

AV_PIX_FMT_NV12

@ AV_PIX_FMT_NV12

planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V)

Definition: pixfmt.h:96

frame

these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter's request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame

Definition: filter_design.txt:265

AVDetectionBBox::h

int h

Definition: detection_bbox.h:34

av_fifo_alloc2

AVFifo * av_fifo_alloc2(size_t nb_elems, size_t elem_size, unsigned int flags)

Allocate and initialize an AVFifo with a given element size.

Definition: fifo.c:47

AVDetectionBBox::detect_confidence

AVRational detect_confidence

Definition: detection_bbox.h:42

av_dynarray_add_nofree

int av_dynarray_add_nofree(void *tab_ptr, int *nb_ptr, void *elem)

Add an element to a dynamic array.

Definition: mem.c:315

DDMT_SSD

@ DDMT_SSD

Definition: vf_dnn_detect.c:37

status

ov_status_e status

Definition: dnn_backend_openvino.c:100

DNNDetectionModelType

Definition: vf_dnn_detect.c:36

AV_PIX_FMT_NONE

@ AV_PIX_FMT_NONE

Definition: pixfmt.h:72

AV_OPT_TYPE_INT

@ AV_OPT_TYPE_INT

Underlying C type is int.

Definition: opt.h:259

AVDetectionBBox::x

int x

Distance in pixels from the left/top edge of the frame, together with width and height, defining the bounding box.

Definition: detection_bbox.h:31

DnnDetectContext::nb_anchor

int nb_anchor

Definition: vf_dnn_detect.c:58

anchor_array_def

static const AVOptionArrayDef anchor_array_def

Definition: vf_dnn_detect.c:61

AV_PIX_FMT_YUV444P

@ AV_PIX_FMT_YUV444P

planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)

Definition: pixfmt.h:78

AVFilterContext

An instance of a filter.

Definition: avfilter.h:274

DNNData::dims

int dims[4]

Definition: dnn_interface.h:71

av_strdup

char * av_strdup(const char *s)

Duplicate a string.

Definition: mem.c:272

AVMEDIA_TYPE_VIDEO

@ AVMEDIA_TYPE_VIDEO

Definition: avutil.h:200

FFFilter::p

AVFilter p

The public AVFilter.

Definition: filters.h:270

AV_PIX_FMT_YUV422P

@ AV_PIX_FMT_YUV422P

planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)

Definition: pixfmt.h:77

mem.h

OFFSET

#define OFFSET(x)

Definition: vf_dnn_detect.c:63

dnn_detect_fill_side_data

static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:254

dnn_get_height_idx_by_layout

static int dnn_get_height_idx_by_layout(DNNLayout layout)

Definition: dnn_interface.h:202

AVFrameSideData

Structure to hold side data for an AVFrame.

Definition: frame.h:282

DnnDetectContext::cell_w

int cell_w

Definition: vf_dnn_detect.c:51

DnnDetectContext::anchors

float * anchors

Definition: vf_dnn_detect.c:57

ff_dnn_init

int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx)

Definition: dnn_filter_common.c:73

dnn_detect_get_label_id

static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)

Definition: vf_dnn_detect.c:98

av_freep

#define av_freep(p)

Definition: tableprint_vlc.h:35

AV_PIX_FMT_YUV411P

@ AV_PIX_FMT_YUV411P

planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)

Definition: pixfmt.h:80

AV_PIX_FMT_YUV410P

@ AV_PIX_FMT_YUV410P

planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)

Definition: pixfmt.h:79

av_strlcpy

size_t av_strlcpy(char *dst, const char *src, size_t size)

Copy the string src to dst, but no more than size - 1 bytes, and null-terminate dst.

Definition: avstring.c:85

av_log

#define av_log(a,...)

Definition: tableprint_vlc.h:27

av_fifo_freep2

void av_fifo_freep2(AVFifo **f)

Free an AVFifo and reset pointer to NULL.

Definition: fifo.c:286

ff_dnn_uninit

void ff_dnn_uninit(DnnContext *ctx)

Definition: dnn_filter_common.c:208

AVDetectionBBox

Definition: detection_bbox.h:26

Definition: vp9dsp_template.c:2070

dnn_detect_post_proc_tf

static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)

Definition: vf_dnn_detect.c:473

ff_dnn_execute_model

int ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame)

Definition: dnn_filter_common.c:171

avstring.h

AV_OPT_TYPE_STRING

@ AV_OPT_TYPE_STRING

Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_malloc() family of functions.

Definition: opt.h:276

DAST_NOT_READY

@ DAST_NOT_READY

Definition: dnn_interface.h:52

DNNAsyncStatusType

Definition: dnn_interface.h:49

AV_OPT_TYPE_CONST

@ AV_OPT_TYPE_CONST

Special option type for declaring named constants.

Definition: opt.h:299

snprintf

#define snprintf

Definition: snprintf.h:34

OFFSET2

#define OFFSET2(x)

Definition: vf_dnn_detect.c:64

detection_bbox.h

AV_FIFO_FLAG_AUTO_GROW

#define AV_FIFO_FLAG_AUTO_GROW

Automatically resize the FIFO on writes, so that the data fits.

Definition: fifo.h:63

AV_FRAME_DATA_DETECTION_BBOXES

@ AV_FRAME_DATA_DETECTION_BBOXES

Bounding boxes for object detection and classification, as described by AVDetectionBBoxHeader.

Definition: frame.h:194

Generated on Tue Nov 18 2025 19:23:10 for FFmpeg by doxygen 1.8.17