dnn/vf_dnn_detect.c: add tensorflow output parse support

The test model is an official TensorFlow model from the GitHub repo; please refer to
https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md
to download the detection model you need.
For example, local testing was carried on with 'ssd_mobilenet_v2_coco_2018_03_29.tar.gz', and
used one image of dog in
https://github.com/tensorflow/models/blob/master/research/object_detection/test_images/image1.jpg

Testing command is:
./ffmpeg -i image1.jpg -vf dnn_detect=dnn_backend=tensorflow:input=image_tensor:output=\
"num_detections&detection_scores&detection_classes&detection_boxes":model=ssd_mobilenet_v2_coco.pb,\
showinfo -f null -

We will see a result similar to the one below:
[Parsed_showinfo_1 @ 0x33e65f0]   side data - detection bounding boxes:
[Parsed_showinfo_1 @ 0x33e65f0] source: ssd_mobilenet_v2_coco.pb
[Parsed_showinfo_1 @ 0x33e65f0] index: 0,       region: (382, 60) -> (1005, 593), label: 18, confidence: 9834/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 1,       region: (12, 8) -> (328, 549), label: 18, confidence: 8555/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 2,       region: (293, 7) -> (682, 458), label: 1, confidence: 8033/10000.
[Parsed_showinfo_1 @ 0x33e65f0] index: 3,       region: (342, 0) -> (690, 325), label: 1, confidence: 5878/10000.

There are two boxes of dog with scores 98.34% & 85.55% and two boxes of person with scores 80.33% & 58.78%.

Signed-off-by: Ting Fu <ting.fu@intel.com>
Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
This commit is contained in:
Ting Fu 2021-05-06 16:46:10 +08:00 committed by Guo, Yejun
parent e42125edab
commit c38bc5634d

View File

@ -48,6 +48,9 @@ typedef struct DnnDetectContext {
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
static const AVOption dnn_detect_options[] = {
{ "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = 2 }, INT_MIN, INT_MAX, FLAGS, "backend" },
#if (CONFIG_LIBTENSORFLOW == 1)
{ "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "backend" },
#endif
#if (CONFIG_LIBOPENVINO == 1)
{ "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "backend" },
#endif
@ -59,7 +62,7 @@ static const AVOption dnn_detect_options[] = {
AVFILTER_DEFINE_CLASS(dnn_detect);
static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
DnnDetectContext *ctx = filter_ctx->priv;
float conf_threshold = ctx->confidence;
@ -136,6 +139,96 @@ static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AV
return 0;
}
/**
 * Convert TensorFlow detection outputs into AV_FRAME_DATA_DETECTION_BBOXES
 * side data attached to the frame.
 *
 * The function reads four output tensors by index:
 *   output[0] - proposal count, stored as a single float
 *   output[1] - one confidence per proposal
 *   output[2] - one label id per proposal (float)
 *   output[3] - four floats per proposal, read as y0, x0, y1, x1 and scaled
 *               by the frame height/width respectively
 * NOTE(review): presumably these map to num_detections, detection_scores,
 * detection_classes and detection_boxes of the TF object detection models;
 * confirm against the output names passed by the caller.
 *
 * Only proposals whose confidence is >= ctx->confidence are exported.
 *
 * @param frame      frame that receives the side data
 * @param output     array of model outputs, indexed as described above
 * @param filter_ctx filter context; its priv data is a DnnDetectContext
 * @return 0 on success (including "nothing detected"), -1 if the frame
 *         already carries detection side data or the side data cannot be
 *         created
 */
static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    int proposal_count;
    float conf_threshold = ctx->confidence;
    float *conf, *position, *label_id, x0, y0, x1, y1;
    int nb_bboxes = 0;
    AVFrameSideData *sd;
    AVDetectionBBox *bbox;
    AVDetectionBBoxHeader *header;

    proposal_count = *(float *)(output[0].data);
    conf           = output[1].data;
    position       = output[3].data;
    label_id       = output[2].data;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
        return -1;
    }

    /* First pass: count the proposals that pass the threshold so the side
     * data can be sized exactly. */
    for (int i = 0; i < proposal_count; ++i) {
        if (conf[i] < conf_threshold)
            continue;
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    /* Second pass: fill the boxes. The destination slot is tracked
     * separately from the proposal index, because the side data only has
     * nb_bboxes entries; indexing it with the proposal index would write to
     * the wrong slot (or out of bounds) whenever a rejected proposal
     * precedes an accepted one. */
    for (int i = 0, out_idx = 0; i < proposal_count && out_idx < nb_bboxes; ++i) {
        int label;

        if (conf[i] < conf_threshold)
            continue;

        y0 = position[i * 4];
        x0 = position[i * 4 + 1];
        y1 = position[i * 4 + 2];
        x1 = position[i * 4 + 3];

        bbox = av_get_detection_bbox(header, out_idx++);

        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
        bbox->classify_count = 0;

        /* Use the textual label when one is available for this id; a
         * negative id must never be used as an array index. */
        label = (int)label_id[i];
        if (ctx->labels && label >= 0 && label < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[label], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label);
        }
    }

    return 0;
}
/**
 * Dispatch detection post-processing to the handler of the configured
 * DNN backend.
 *
 * @param frame      frame passed through to the backend-specific handler
 * @param output     model outputs passed through to the handler
 * @param nb         not used by this function
 * @param filter_ctx filter context; its priv data is a DnnDetectContext
 * @return the handler's return value, or AVERROR(EINVAL) when the backend
 *         is neither DNN_OV nor DNN_TF
 */
static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
{
    DnnDetectContext *detect = filter_ctx->priv;

    if (detect->dnnctx.backend_type == DNN_OV)
        return dnn_detect_post_proc_ov(frame, output, filter_ctx);

    if (detect->dnnctx.backend_type == DNN_TF)
        return dnn_detect_post_proc_tf(frame, output, filter_ctx);

    /* Any other backend has no detection parser. */
    avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
    return AVERROR(EINVAL);
}
static void free_detect_labels(DnnDetectContext *ctx)
{
for (int i = 0; i < ctx->label_count; i++) {