lavfi/qsvvpp: support async depth

Async depth allows the QSV filter to cache a few frames, so it is no
longer forced to switch to and finish the filter task frame by frame.
This change improves performance in some multi-task cases, for example
1:N transcoding (decode + vpp + encode) with all QSV plugins.

Performance data measured on my Coffee Lake desktop (i7-8700K) with the
following 1:8 transcode test case shows these improvements:
1. Fps improved from 55 to 130.
2. Render/Video usage improved from ~61%/~38% to ~100%/~70%. (Data
obtained from intel_gpu_top.)

test CMD:
ffmpeg -v verbose -init_hw_device qsv=hw:/dev/dri/renderD128 -filter_hw_device                 \
 hw -hwaccel qsv -hwaccel_output_format qsv -c:v h264_qsv -i 1920x1080.264                     \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \
-vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null -

Signed-off-by: Fei Wang <fei.w.wang@intel.com>
Reviewed-by: Linjie Fu <linjie.justin.fu@gmail.com>
Signed-off-by: Zhong Li <zhongli_dev@126.com>
This commit is contained in:
Fei Wang 2021-03-31 10:07:44 +08:00 committed by Zhong Li
parent 309e3cc15c
commit 89ffcd1bbe
4 changed files with 192 additions and 91 deletions

View File

@ -37,37 +37,6 @@
#define IS_OPAQUE_MEMORY(mode) (mode & MFX_MEMTYPE_OPAQUE_FRAME)
#define IS_SYSTEM_MEMORY(mode) (mode & MFX_MEMTYPE_SYSTEM_MEMORY)
/*
 * One node of a singly linked list of frames whose surfaces are handed to
 * the VPP session. A node whose surface pointer is NULL is treated as free
 * and may be picked up again by get_free_frame().
 */
typedef struct QSVFrame {
AVFrame *frame; /* frame that backs the surface */
mfxFrameSurface1 *surface; /* surface taken from frame->data[3], or &surface_internal */
mfxFrameSurface1 surface_internal; /* for system memory */
struct QSVFrame *next; /* next node in the list, NULL at the end */
} QSVFrame;
/* abstract struct for all QSV filters */
struct QSVVPPContext {
mfxSession session; /* session the VPP calls are issued on */
int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame);/* callback invoked with each output frame */
enum AVPixelFormat out_sw_format; /* Real output format */
mfxVideoParam vpp_param; /* VPP parameters (AsyncDepth, IOPattern, ExtParam, ...) */
mfxFrameInfo *frame_infos; /* frame info for each input, indexed by FF_INLINK_IDX() */
/* members related to the input/output surface */
int in_mem_mode; /* input memory type flags, tested with IS_SYSTEM_MEMORY()/IS_OPAQUE_MEMORY() */
int out_mem_mode; /* output memory type flags */
QSVFrame *in_frame_list; /* head of the list of input frame nodes */
QSVFrame *out_frame_list; /* head of the list of output frame nodes */
int nb_surface_ptrs_in; /* presumably the element count of surface_ptrs_in -- TODO confirm */
int nb_surface_ptrs_out; /* presumably the element count of surface_ptrs_out -- TODO confirm */
mfxFrameSurface1 **surface_ptrs_in;
mfxFrameSurface1 **surface_ptrs_out;
/* MFXVPP extern parameters */
mfxExtOpaqueSurfaceAlloc opaque_alloc;
mfxExtBuffer **ext_buffers;
int nb_ext_buffers; /* presumably the element count of ext_buffers -- TODO confirm */
};
static const mfxHandleType handle_types[] = {
MFX_HANDLE_VA_DISPLAY,
MFX_HANDLE_D3D9_DEVICE_MANAGER,
@ -336,9 +305,11 @@ static int fill_frameinfo_by_link(mfxFrameInfo *frameinfo, AVFilterLink *link)
static void clear_unused_frames(QSVFrame *list)
{
while (list) {
if (list->surface && !list->surface->Data.Locked) {
list->surface = NULL;
/* list->queued==1 means the frame is not cached in VPP
* process any more, it can be released to pool. */
if ((list->queued == 1) && !list->surface.Data.Locked) {
av_frame_free(&list->frame);
list->queued = 0;
}
list = list->next;
}
@ -361,8 +332,10 @@ static QSVFrame *get_free_frame(QSVFrame **list)
QSVFrame *out = *list;
for (; out; out = out->next) {
if (!out->surface)
if (!out->queued) {
out->queued = 1;
break;
}
}
if (!out) {
@ -371,8 +344,9 @@ static QSVFrame *get_free_frame(QSVFrame **list)
av_log(NULL, AV_LOG_ERROR, "Can't alloc new output frame.\n");
return NULL;
}
out->next = *list;
*list = out;
out->queued = 1;
out->next = *list;
*list = out;
}
return out;
@ -402,7 +376,7 @@ static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
return NULL;
}
qsv_frame->frame = av_frame_clone(picref);
qsv_frame->surface = (mfxFrameSurface1 *)qsv_frame->frame->data[3];
qsv_frame->surface = *(mfxFrameSurface1 *)qsv_frame->frame->data[3];
} else {
/* make a copy if the input is not padded as libmfx requires */
if (picref->height & 31 || picref->linesize[0] & 31) {
@ -425,27 +399,26 @@ static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
qsv_frame->frame = av_frame_clone(picref);
if (map_frame_to_surface(qsv_frame->frame,
&qsv_frame->surface_internal) < 0) {
&qsv_frame->surface) < 0) {
av_log(ctx, AV_LOG_ERROR, "Unsupported frame.\n");
return NULL;
}
qsv_frame->surface = &qsv_frame->surface_internal;
}
qsv_frame->surface->Info = s->frame_infos[FF_INLINK_IDX(inlink)];
qsv_frame->surface->Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
qsv_frame->surface.Info = s->frame_infos[FF_INLINK_IDX(inlink)];
qsv_frame->surface.Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
inlink->time_base, default_tb);
qsv_frame->surface->Info.PicStruct =
qsv_frame->surface.Info.PicStruct =
!qsv_frame->frame->interlaced_frame ? MFX_PICSTRUCT_PROGRESSIVE :
(qsv_frame->frame->top_field_first ? MFX_PICSTRUCT_FIELD_TFF :
MFX_PICSTRUCT_FIELD_BFF);
if (qsv_frame->frame->repeat_pict == 1)
qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
else if (qsv_frame->frame->repeat_pict == 2)
qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
else if (qsv_frame->frame->repeat_pict == 4)
qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;
qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;
return qsv_frame;
}
@ -476,7 +449,7 @@ static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
return NULL;
}
out_frame->surface = (mfxFrameSurface1 *)out_frame->frame->data[3];
out_frame->surface = *(mfxFrameSurface1 *)out_frame->frame->data[3];
} else {
/* Get a frame with aligned dimensions.
* Libmfx need system memory being 128x64 aligned */
@ -490,14 +463,12 @@ static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
out_frame->frame->height = outlink->h;
ret = map_frame_to_surface(out_frame->frame,
&out_frame->surface_internal);
&out_frame->surface);
if (ret < 0)
return NULL;
out_frame->surface = &out_frame->surface_internal;
}
out_frame->surface->Info = s->vpp_param.vpp.Out;
out_frame->surface.Info = s->vpp_param.vpp.Out;
return out_frame;
}
@ -666,6 +637,16 @@ static int init_vpp_session(AVFilterContext *avctx, QSVVPPContext *s)
return 0;
}
/**
 * Size in bytes of one async FIFO entry.
 *
 * An entry is a QSVFrame pointer stored together with the mfxSyncPoint of
 * the operation that produces that frame.
 */
static unsigned int qsv_fifo_item_size(void)
{
    const unsigned int frame_ptr_bytes = sizeof(QSVFrame *);
    const unsigned int sync_bytes      = sizeof(mfxSyncPoint);

    return frame_ptr_bytes + sync_bytes;
}
/**
 * Number of complete entries currently stored in the async FIFO.
 *
 * @param fifo FIFO whose byte count is converted into an entry count
 * @return bytes held by the FIFO divided by the size of one entry
 */
static unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
{
    const unsigned int bytes_per_item = qsv_fifo_item_size();
    const unsigned int bytes_queued   = av_fifo_size(fifo);

    return bytes_queued / bytes_per_item;
}
int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *param)
{
int i;
@ -738,7 +719,17 @@ int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *p
s->vpp_param.ExtParam = param->ext_buf;
}
s->vpp_param.AsyncDepth = 1;
s->got_frame = 0;
/** keep fifo size at least 1. Even when async_depth is 0, fifo is used. */
s->async_fifo = av_fifo_alloc((param->async_depth + 1) * qsv_fifo_item_size());
s->async_depth = param->async_depth;
if (!s->async_fifo) {
ret = AVERROR(ENOMEM);
goto failed;
}
s->vpp_param.AsyncDepth = param->async_depth;
if (IS_SYSTEM_MEMORY(s->in_mem_mode))
s->vpp_param.IOPattern |= MFX_IOPATTERN_IN_SYSTEM_MEMORY;
@ -793,6 +784,7 @@ int ff_qsvvpp_free(QSVVPPContext **vpp)
av_freep(&s->surface_ptrs_out);
av_freep(&s->ext_buffers);
av_freep(&s->frame_infos);
av_fifo_free(s->async_fifo);
av_freep(vpp);
return 0;
@ -803,9 +795,29 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
AVFilterContext *ctx = inlink->dst;
AVFilterLink *outlink = ctx->outputs[0];
mfxSyncPoint sync;
QSVFrame *in_frame, *out_frame;
QSVFrame *in_frame, *out_frame, *tmp;
int ret, filter_ret;
while (s->eof && qsv_fifo_size(s->async_fifo)) {
av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
filter_ret = s->filter_frame(outlink, tmp->frame);
if (filter_ret < 0) {
av_frame_free(&tmp->frame);
ret = filter_ret;
break;
}
tmp->queued--;
s->got_frame = 1;
tmp->frame = NULL;
};
if (!picref)
return 0;
in_frame = submit_frame(s, inlink, picref);
if (!in_frame) {
av_log(ctx, AV_LOG_ERROR, "Failed to submit frame on input[%d]\n",
@ -821,8 +833,8 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
}
do {
ret = MFXVideoVPP_RunFrameVPPAsync(s->session, in_frame->surface,
out_frame->surface, NULL, &sync);
ret = MFXVideoVPP_RunFrameVPPAsync(s->session, &in_frame->surface,
&out_frame->surface, NULL, &sync);
if (ret == MFX_WRN_DEVICE_BUSY)
av_usleep(500);
} while (ret == MFX_WRN_DEVICE_BUSY);
@ -833,20 +845,33 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
ret = AVERROR(EAGAIN);
break;
}
if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
out_frame->frame->pts = av_rescale_q(out_frame->surface->Data.TimeStamp,
out_frame->frame->pts = av_rescale_q(out_frame->surface.Data.TimeStamp,
default_tb, outlink->time_base);
filter_ret = s->filter_frame(outlink, out_frame->frame);
if (filter_ret < 0) {
av_frame_free(&out_frame->frame);
ret = filter_ret;
break;
out_frame->queued++;
av_fifo_generic_write(s->async_fifo, &out_frame, sizeof(out_frame), NULL);
av_fifo_generic_write(s->async_fifo, &sync, sizeof(sync), NULL);
if (qsv_fifo_size(s->async_fifo) > s->async_depth) {
av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
do {
ret = MFXVideoCORE_SyncOperation(s->session, sync, 1000);
} while (ret == MFX_WRN_IN_EXECUTION);
filter_ret = s->filter_frame(outlink, tmp->frame);
if (filter_ret < 0) {
av_frame_free(&tmp->frame);
ret = filter_ret;
break;
}
tmp->queued--;
s->got_frame = 1;
tmp->frame = NULL;
}
out_frame->frame = NULL;
} while(ret == MFX_ERR_MORE_SURFACE);
return ret;

View File

@ -27,6 +27,7 @@
#include <mfx/mfxvideo.h>
#include "avfilter.h"
#include "libavutil/fifo.h"
#define FF_INLINK_IDX(link) ((int)((link)->dstpad - (link)->dst->input_pads))
#define FF_OUTLINK_IDX(link) ((int)((link)->srcpad - (link)->src->output_pads))
@ -39,7 +40,41 @@
((MFX_VERSION.Major > (MAJOR)) || \
(MFX_VERSION.Major == (MAJOR) && MFX_VERSION.Minor >= (MINOR)))
typedef struct QSVVPPContext QSVVPPContext;
/**
 * One node of a singly linked list of frames whose surfaces are handed to
 * a QSV session.
 */
typedef struct QSVFrame {
AVFrame *frame; /**< frame that backs the surface; freed when the node is recycled */
mfxFrameSurface1 surface; /**< surface description, stored by value in the node */
struct QSVFrame *next; /**< next node in the list, NULL at the end */
/**
 * Usage state of the node. 0 means the node is free for reuse. In the
 * qsvvpp code it is set to 1 when the node is handed out and is raised
 * by one more while the node waits in the async FIFO; a value of 1 with
 * an unlocked surface lets the node be released back to the pool.
 */
int queued;
} QSVFrame;
/** Shared state of a QSV VPP based filter. */
typedef struct QSVVPPContext {
mfxSession session; /**< session the VPP calls are issued on */
int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame); /**< callback invoked with each output frame */
enum AVPixelFormat out_sw_format; /**< Real output format */
mfxVideoParam vpp_param; /**< VPP parameters (AsyncDepth, IOPattern, ExtParam, ...) */
mfxFrameInfo *frame_infos; /**< frame info for each input, indexed by FF_INLINK_IDX() */
/** members related to the input/output surface */
int in_mem_mode; /**< input memory type flags, tested with IS_SYSTEM_MEMORY() */
int out_mem_mode; /**< output memory type flags */
QSVFrame *in_frame_list; /**< head of the list of input frame nodes */
QSVFrame *out_frame_list; /**< head of the list of output frame nodes */
int nb_surface_ptrs_in;
int nb_surface_ptrs_out;
mfxFrameSurface1 **surface_ptrs_in;
mfxFrameSurface1 **surface_ptrs_out;
/** MFXVPP extern parameters */
mfxExtOpaqueSurfaceAlloc opaque_alloc;
mfxExtBuffer **ext_buffers;
int nb_ext_buffers;
int got_frame; /**< set to 1 once a frame has been passed to filter_frame; the caller resets it to 0 */
int async_depth; /**< copy of QSVVPPParam.async_depth; FIFO entries beyond this count are synced and output */
int eof; /**< set by the caller at end of stream; makes ff_qsvvpp_filter_frame() drain async_fifo */
/**
 * Pending outputs in submission order. Each entry is a QSVFrame pointer
 * (the output frame) followed by the mfxSyncPoint of its operation.
 */
AVFifoBuffer *async_fifo;
} QSVVPPContext;
typedef struct QSVVPPCrop {
int in_idx; ///< Input index
@ -60,6 +95,8 @@ typedef struct QSVVPPParam {
/* Crop information for each input, if needed */
int num_crop;
QSVVPPCrop *crop;
int async_depth;
} QSVVPPParam;
/* create and initialize the QSV session */

View File

@ -47,14 +47,6 @@ enum {
QSVDEINT_MORE_INPUT,
};
/* Node of the work-frame list kept in QSVDeintContext (work_frames). */
typedef struct QSVFrame {
AVFrame *frame; /* frame attached to this node; freed when the node is recycled */
mfxFrameSurface1 surface; /* surface whose address is handed out by submit_frame() */
int used; /* set to 1 on submit; cleared once surface.Data.Locked is 0 */
struct QSVFrame *next; /* next node in the list */
} QSVFrame;
typedef struct QSVDeintContext {
const AVClass *class;
@ -376,7 +368,7 @@ static void clear_unused_frames(QSVDeintContext *s)
while (cur) {
if (!cur->surface.Data.Locked) {
av_frame_free(&cur->frame);
cur->used = 0;
cur->queued = 0;
}
cur = cur->next;
}
@ -391,7 +383,7 @@ static int get_free_frame(QSVDeintContext *s, QSVFrame **f)
frame = s->work_frames;
last = &s->work_frames;
while (frame) {
if (!frame->used) {
if (!frame->queued) {
*f = frame;
return 0;
}
@ -453,7 +445,7 @@ static int submit_frame(AVFilterContext *ctx, AVFrame *frame,
(AVRational){1, 90000});
*surface = &qf->surface;
qf->used = 1;
qf->queued = 1;
return 0;
}

View File

@ -32,6 +32,7 @@
#include "formats.h"
#include "internal.h"
#include "avfilter.h"
#include "filters.h"
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
@ -93,6 +94,9 @@ typedef struct VPPContext{
char *cx, *cy, *cw, *ch;
char *ow, *oh;
char *output_format_str;
int async_depth;
int eof;
} VPPContext;
static const AVOption options[] = {
@ -128,6 +132,7 @@ static const AVOption options[] = {
{ "h", "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
{ "height", "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
{ "format", "Output pixel format", OFFSET(output_format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
{ "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },
{ NULL }
};
@ -303,6 +308,7 @@ static int config_output(AVFilterLink *outlink)
param.filter_frame = NULL;
param.num_ext_buf = 0;
param.ext_buf = ext_buf;
param.async_depth = vpp->async_depth;
if (inlink->format == AV_PIX_FMT_QSV) {
if (!inlink->hw_frames_ctx || !inlink->hw_frames_ctx->data)
@ -467,23 +473,64 @@ static int config_output(AVFilterLink *outlink)
return 0;
}
static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
static int activate(AVFilterContext *ctx)
{
int ret = 0;
AVFilterContext *ctx = inlink->dst;
VPPContext *vpp = inlink->dst->priv;
AVFilterLink *outlink = ctx->outputs[0];
AVFilterLink *inlink = ctx->inputs[0];
AVFilterLink *outlink = ctx->outputs[0];
VPPContext *s =ctx->priv;
QSVVPPContext *qsv = s->qsv;
AVFrame *in = NULL;
int ret, status;
int64_t pts;
if (vpp->qsv) {
ret = ff_qsvvpp_filter_frame(vpp->qsv, inlink, picref);
av_frame_free(&picref);
} else {
if (picref->pts != AV_NOPTS_VALUE)
picref->pts = av_rescale_q(picref->pts, inlink->time_base, outlink->time_base);
ret = ff_filter_frame(outlink, picref);
FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
if (!s->eof) {
ret = ff_inlink_consume_frame(inlink, &in);
if (ret < 0)
return ret;
if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
if (status == AVERROR_EOF) {
s->eof = 1;
}
}
}
return ret;
if (qsv) {
if (in || s->eof) {
qsv->eof = s->eof;
ret = ff_qsvvpp_filter_frame(qsv, inlink, in);
av_frame_free(&in);
if (s->eof) {
ff_outlink_set_status(outlink, status, pts);
return 0;
}
if (qsv->got_frame) {
qsv->got_frame = 0;
return ret;
}
}
} else {
if (in) {
if (in->pts != AV_NOPTS_VALUE)
in->pts = av_rescale_q(in->pts, inlink->time_base, outlink->time_base);
ret = ff_filter_frame(outlink, in);
return ret;
}
}
if (s->eof) {
ff_outlink_set_status(outlink, status, pts);
return 0;
} else {
FF_FILTER_FORWARD_WANTED(outlink, inlink);
}
return FFERROR_NOT_READY;
}
static int query_formats(AVFilterContext *ctx)
@ -531,7 +578,6 @@ static const AVFilterPad vpp_inputs[] = {
.name = "default",
.type = AVMEDIA_TYPE_VIDEO,
.config_props = config_input,
.filter_frame = filter_frame,
},
{ NULL }
};
@ -554,6 +600,7 @@ AVFilter ff_vf_vpp_qsv = {
.uninit = vpp_uninit,
.inputs = vpp_inputs,
.outputs = vpp_outputs,
.activate = activate,
.priv_class = &vpp_class,
.flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};