lavfi/qsvvpp: support async depth

Async depth will allow qsv filter cache few frames, and avoid force switch and end filter task frame by frame. This change will improve performance for some multi-task case, for example 1:N transcode( decode + vpp + encode) with all QSV plugins. Performance data test on my Coffee Lake Desktop(i7-8700K) by using the following 1:8 transcode test case improvement: 1. Fps improved from 55 to 130. 2. Render/Video usage improved from ~61%/~38% to ~100%/~70%.(Data get from intel_gpu_top) test CMD: ffmpeg -v verbose -init_hw_device qsv=hw:/dev/dri/renderD128 -filter_hw_device \ hw -hwaccel qsv -hwaccel_output_format qsv -c:v h264_qsv -i 1920x1080.264 \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - \ -vf 'vpp_qsv=w=1280:h=720:async_depth=4' -c:v h264_qsv -r:v 30 -preset 7 -g 33 -refs 2 -bf 3 -q 24 -f null - Signed-off-by: Fei Wang <fei.w.wang@intel.com> Reviewed-by: Linjie Fu <linjie.justin.fu@gmail.com> Signed-off-by: Zhong Li <zhongli_dev@126.com>
2021-03-31 10:07:44 +08:00 · 2021-03-31 10:07:44 +08:00 · 89ffcd1bbe
parent 309e3cc15c
commit 89ffcd1bbe
4 changed files with 192 additions and 91 deletions
--- a/libavfilter/qsvvpp.c
+++ b/libavfilter/qsvvpp.c
@ -37,37 +37,6 @@
 #define IS_OPAQUE_MEMORY(mode) (mode & MFX_MEMTYPE_OPAQUE_FRAME)
 #define IS_SYSTEM_MEMORY(mode) (mode & MFX_MEMTYPE_SYSTEM_MEMORY)

-typedef struct QSVFrame {
-    AVFrame          *frame;
-    mfxFrameSurface1 *surface;
-    mfxFrameSurface1  surface_internal;  /* for system memory */
-    struct QSVFrame  *next;
-} QSVFrame;
-
-/* abstract struct for all QSV filters */
-struct QSVVPPContext {
-    mfxSession          session;
-    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame);/* callback */
-    enum AVPixelFormat  out_sw_format;   /* Real output format */
-    mfxVideoParam       vpp_param;
-    mfxFrameInfo       *frame_infos;     /* frame info for each input */
-
-    /* members related to the input/output surface */
-    int                 in_mem_mode;
-    int                 out_mem_mode;
-    QSVFrame           *in_frame_list;
-    QSVFrame           *out_frame_list;
-    int                 nb_surface_ptrs_in;
-    int                 nb_surface_ptrs_out;
-    mfxFrameSurface1  **surface_ptrs_in;
-    mfxFrameSurface1  **surface_ptrs_out;
-
-    /* MFXVPP extern parameters */
-    mfxExtOpaqueSurfaceAlloc opaque_alloc;
-    mfxExtBuffer      **ext_buffers;
-    int                 nb_ext_buffers;
-};
-
 static const mfxHandleType handle_types[] = {
    MFX_HANDLE_VA_DISPLAY,
    MFX_HANDLE_D3D9_DEVICE_MANAGER,
@ -336,9 +305,11 @@ static int fill_frameinfo_by_link(mfxFrameInfo *frameinfo, AVFilterLink *link)
 static void clear_unused_frames(QSVFrame *list)
 {
    while (list) {
-        if (list->surface && !list->surface->Data.Locked) {
-            list->surface = NULL;
+        /* list->queued==1 means the frame is not cached in VPP
+         * process any more, it can be released to pool. */
+        if ((list->queued == 1) && !list->surface.Data.Locked) {
            av_frame_free(&list->frame);
+            list->queued = 0;
        }
        list = list->next;
    }
@ -361,8 +332,10 @@ static QSVFrame *get_free_frame(QSVFrame **list)
    QSVFrame *out = *list;

    for (; out; out = out->next) {
-        if (!out->surface)
+        if (!out->queued) {
+            out->queued = 1;
            break;
+        }
    }

    if (!out) {
@ -371,8 +344,9 @@ static QSVFrame *get_free_frame(QSVFrame **list)
            av_log(NULL, AV_LOG_ERROR, "Can't alloc new output frame.\n");
            return NULL;
        }
-        out->next  = *list;
-        *list      = out;
+        out->queued = 1;
+        out->next   = *list;
+        *list       = out;
    }

    return out;
@ -402,7 +376,7 @@ static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
            return NULL;
        }
        qsv_frame->frame   = av_frame_clone(picref);
-        qsv_frame->surface = (mfxFrameSurface1 *)qsv_frame->frame->data[3];
+        qsv_frame->surface = *(mfxFrameSurface1 *)qsv_frame->frame->data[3];
    } else {
        /* make a copy if the input is not padded as libmfx requires */
        if (picref->height & 31 || picref->linesize[0] & 31) {
@ -425,27 +399,26 @@ static QSVFrame *submit_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *p
            qsv_frame->frame = av_frame_clone(picref);

        if (map_frame_to_surface(qsv_frame->frame,
-                                &qsv_frame->surface_internal) < 0) {
+                                 &qsv_frame->surface) < 0) {
            av_log(ctx, AV_LOG_ERROR, "Unsupported frame.\n");
            return NULL;
        }
-        qsv_frame->surface = &qsv_frame->surface_internal;
    }

-    qsv_frame->surface->Info           = s->frame_infos[FF_INLINK_IDX(inlink)];
-    qsv_frame->surface->Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
+    qsv_frame->surface.Info           = s->frame_infos[FF_INLINK_IDX(inlink)];
+    qsv_frame->surface.Data.TimeStamp = av_rescale_q(qsv_frame->frame->pts,
                                                      inlink->time_base, default_tb);

-    qsv_frame->surface->Info.PicStruct =
+    qsv_frame->surface.Info.PicStruct =
            !qsv_frame->frame->interlaced_frame ? MFX_PICSTRUCT_PROGRESSIVE :
            (qsv_frame->frame->top_field_first ? MFX_PICSTRUCT_FIELD_TFF :
                                                 MFX_PICSTRUCT_FIELD_BFF);
    if (qsv_frame->frame->repeat_pict == 1)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FIELD_REPEATED;
    else if (qsv_frame->frame->repeat_pict == 2)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_DOUBLING;
    else if (qsv_frame->frame->repeat_pict == 4)
-        qsv_frame->surface->Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;
+        qsv_frame->surface.Info.PicStruct |= MFX_PICSTRUCT_FRAME_TRIPLING;

    return qsv_frame;
 }
@ -476,7 +449,7 @@ static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
            return NULL;
        }

-        out_frame->surface = (mfxFrameSurface1 *)out_frame->frame->data[3];
+        out_frame->surface = *(mfxFrameSurface1 *)out_frame->frame->data[3];
    } else {
        /* Get a frame with aligned dimensions.
         * Libmfx need system memory being 128x64 aligned */
@ -490,14 +463,12 @@ static QSVFrame *query_frame(QSVVPPContext *s, AVFilterLink *outlink)
        out_frame->frame->height = outlink->h;

        ret = map_frame_to_surface(out_frame->frame,
-                                  &out_frame->surface_internal);
+                                   &out_frame->surface);
        if (ret < 0)
            return NULL;
-
-        out_frame->surface = &out_frame->surface_internal;
    }

-    out_frame->surface->Info = s->vpp_param.vpp.Out;
+    out_frame->surface.Info = s->vpp_param.vpp.Out;

    return out_frame;
 }
@ -666,6 +637,16 @@ static int init_vpp_session(AVFilterContext *avctx, QSVVPPContext *s)
    return 0;
 }

+static unsigned int qsv_fifo_item_size(void)
+{
+    return sizeof(mfxSyncPoint) + sizeof(QSVFrame*);
+}
+
+static unsigned int qsv_fifo_size(const AVFifoBuffer* fifo)
+{
+    return  av_fifo_size(fifo)/qsv_fifo_item_size();
+}
+
 int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *param)
 {
    int i;
@ -738,7 +719,17 @@ int ff_qsvvpp_create(AVFilterContext *avctx, QSVVPPContext **vpp, QSVVPPParam *p
        s->vpp_param.ExtParam    = param->ext_buf;
    }

-    s->vpp_param.AsyncDepth = 1;
+    s->got_frame = 0;
+
+    /** keep fifo size at least 1. Even when async_depth is 0, fifo is used. */
+    s->async_fifo  = av_fifo_alloc((param->async_depth + 1) * qsv_fifo_item_size());
+    s->async_depth = param->async_depth;
+    if (!s->async_fifo) {
+        ret = AVERROR(ENOMEM);
+        goto failed;
+    }
+
+    s->vpp_param.AsyncDepth = param->async_depth;

    if (IS_SYSTEM_MEMORY(s->in_mem_mode))
        s->vpp_param.IOPattern |= MFX_IOPATTERN_IN_SYSTEM_MEMORY;
@ -793,6 +784,7 @@ int ff_qsvvpp_free(QSVVPPContext **vpp)
    av_freep(&s->surface_ptrs_out);
    av_freep(&s->ext_buffers);
    av_freep(&s->frame_infos);
+    av_fifo_free(s->async_fifo);
    av_freep(vpp);

    return 0;
@ -803,9 +795,29 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
    AVFilterContext  *ctx     = inlink->dst;
    AVFilterLink     *outlink = ctx->outputs[0];
    mfxSyncPoint      sync;
-    QSVFrame         *in_frame, *out_frame;
+    QSVFrame         *in_frame, *out_frame, *tmp;
    int               ret, filter_ret;

+    while (s->eof && qsv_fifo_size(s->async_fifo)) {
+        av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
+        av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
+        if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
+            av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
+
+        filter_ret = s->filter_frame(outlink, tmp->frame);
+        if (filter_ret < 0) {
+            av_frame_free(&tmp->frame);
+            ret = filter_ret;
+            break;
+        }
+        tmp->queued--;
+        s->got_frame = 1;
+        tmp->frame = NULL;
+    };
+
+    if (!picref)
+        return 0;
+
    in_frame = submit_frame(s, inlink, picref);
    if (!in_frame) {
        av_log(ctx, AV_LOG_ERROR, "Failed to submit frame on input[%d]\n",
@ -821,8 +833,8 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
        }

        do {
-            ret = MFXVideoVPP_RunFrameVPPAsync(s->session, in_frame->surface,
-                                               out_frame->surface, NULL, &sync);
+            ret = MFXVideoVPP_RunFrameVPPAsync(s->session, &in_frame->surface,
+                                               &out_frame->surface, NULL, &sync);
            if (ret == MFX_WRN_DEVICE_BUSY)
                av_usleep(500);
        } while (ret == MFX_WRN_DEVICE_BUSY);
@ -833,20 +845,33 @@ int ff_qsvvpp_filter_frame(QSVVPPContext *s, AVFilterLink *inlink, AVFrame *picr
                ret = AVERROR(EAGAIN);
            break;
        }
-
-        if (MFXVideoCORE_SyncOperation(s->session, sync, 1000) < 0)
-            av_log(ctx, AV_LOG_WARNING, "Sync failed.\n");
-
-        out_frame->frame->pts = av_rescale_q(out_frame->surface->Data.TimeStamp,
+        out_frame->frame->pts = av_rescale_q(out_frame->surface.Data.TimeStamp,
                                             default_tb, outlink->time_base);

-        filter_ret = s->filter_frame(outlink, out_frame->frame);
-        if (filter_ret < 0) {
-            av_frame_free(&out_frame->frame);
-            ret = filter_ret;
-            break;
+        out_frame->queued++;
+        av_fifo_generic_write(s->async_fifo, &out_frame, sizeof(out_frame), NULL);
+        av_fifo_generic_write(s->async_fifo, &sync, sizeof(sync), NULL);
+
+
+        if (qsv_fifo_size(s->async_fifo) > s->async_depth) {
+            av_fifo_generic_read(s->async_fifo, &tmp, sizeof(tmp), NULL);
+            av_fifo_generic_read(s->async_fifo, &sync, sizeof(sync), NULL);
+
+            do {
+                ret = MFXVideoCORE_SyncOperation(s->session, sync, 1000);
+            } while (ret == MFX_WRN_IN_EXECUTION);
+
+            filter_ret = s->filter_frame(outlink, tmp->frame);
+            if (filter_ret < 0) {
+                av_frame_free(&tmp->frame);
+                ret = filter_ret;
+                break;
+            }
+
+            tmp->queued--;
+            s->got_frame = 1;
+            tmp->frame = NULL;
        }
-        out_frame->frame = NULL;
    } while(ret == MFX_ERR_MORE_SURFACE);

    return ret;
--- a/libavfilter/qsvvpp.h
+++ b/libavfilter/qsvvpp.h
@ -27,6 +27,7 @@
 #include <mfx/mfxvideo.h>

 #include "avfilter.h"
+#include "libavutil/fifo.h"

 #define FF_INLINK_IDX(link)  ((int)((link)->dstpad - (link)->dst->input_pads))
 #define FF_OUTLINK_IDX(link) ((int)((link)->srcpad - (link)->src->output_pads))
@ -39,7 +40,41 @@
    ((MFX_VERSION.Major > (MAJOR)) ||                           \
    (MFX_VERSION.Major == (MAJOR) && MFX_VERSION.Minor >= (MINOR)))

-typedef struct QSVVPPContext QSVVPPContext;
+typedef struct QSVFrame {
+    AVFrame          *frame;
+    mfxFrameSurface1 surface;
+    struct QSVFrame  *next;
+    int queued;
+} QSVFrame;
+
+typedef struct QSVVPPContext {
+    mfxSession          session;
+    int (*filter_frame) (AVFilterLink *outlink, AVFrame *frame); /**< callback */
+    enum AVPixelFormat  out_sw_format;   /**< Real output format */
+    mfxVideoParam       vpp_param;
+    mfxFrameInfo       *frame_infos;     /**< frame info for each input */
+
+    /** members related to the input/output surface */
+    int                 in_mem_mode;
+    int                 out_mem_mode;
+    QSVFrame           *in_frame_list;
+    QSVFrame           *out_frame_list;
+    int                 nb_surface_ptrs_in;
+    int                 nb_surface_ptrs_out;
+    mfxFrameSurface1  **surface_ptrs_in;
+    mfxFrameSurface1  **surface_ptrs_out;
+
+    /** MFXVPP extern parameters */
+    mfxExtOpaqueSurfaceAlloc opaque_alloc;
+    mfxExtBuffer      **ext_buffers;
+    int                 nb_ext_buffers;
+
+    int got_frame;
+    int async_depth;
+    int eof;
+    /** order with frame_out, sync */
+    AVFifoBuffer *async_fifo;
+} QSVVPPContext;

 typedef struct QSVVPPCrop {
    int in_idx;        ///< Input index
@ -60,6 +95,8 @@ typedef struct QSVVPPParam {
    /* Crop information for each input, if needed */
    int num_crop;
    QSVVPPCrop *crop;
+
+   int async_depth;
 } QSVVPPParam;

 /* create and initialize the QSV session */
--- a/libavfilter/vf_deinterlace_qsv.c
+++ b/libavfilter/vf_deinterlace_qsv.c
@ -47,14 +47,6 @@ enum {
    QSVDEINT_MORE_INPUT,
 };

-typedef struct QSVFrame {
-    AVFrame *frame;
-    mfxFrameSurface1 surface;
-    int used;
-
-    struct QSVFrame *next;
-} QSVFrame;
-
 typedef struct QSVDeintContext {
    const AVClass *class;

@ -376,7 +368,7 @@ static void clear_unused_frames(QSVDeintContext *s)
    while (cur) {
        if (!cur->surface.Data.Locked) {
            av_frame_free(&cur->frame);
-            cur->used = 0;
+            cur->queued = 0;
        }
        cur = cur->next;
    }
@ -391,7 +383,7 @@ static int get_free_frame(QSVDeintContext *s, QSVFrame **f)
    frame = s->work_frames;
    last  = &s->work_frames;
    while (frame) {
-        if (!frame->used) {
+        if (!frame->queued) {
            *f = frame;
            return 0;
        }
@ -453,7 +445,7 @@ static int submit_frame(AVFilterContext *ctx, AVFrame *frame,
                                              (AVRational){1, 90000});

    *surface = &qf->surface;
-    qf->used = 1;
+    qf->queued = 1;

    return 0;
 }
--- a/libavfilter/vf_vpp_qsv.c
+++ b/libavfilter/vf_vpp_qsv.c
@ -32,6 +32,7 @@
 #include "formats.h"
 #include "internal.h"
 #include "avfilter.h"
+#include "filters.h"
 #include "libavcodec/avcodec.h"
 #include "libavformat/avformat.h"

@ -93,6 +94,9 @@ typedef struct VPPContext{
    char *cx, *cy, *cw, *ch;
    char *ow, *oh;
    char *output_format_str;
+
+    int async_depth;
+    int eof;
 } VPPContext;

 static const AVOption options[] = {
@ -128,6 +132,7 @@ static const AVOption options[] = {
    { "h",      "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
    { "height", "Output video height", OFFSET(oh), AV_OPT_TYPE_STRING, { .str="w*ch/cw" }, 0, 255, .flags = FLAGS },
    { "format", "Output pixel format", OFFSET(output_format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
+    { "async_depth", "Internal parallelization depth, the higher the value the higher the latency.", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, .flags = FLAGS },

    { NULL }
 };
@ -303,6 +308,7 @@ static int config_output(AVFilterLink *outlink)
    param.filter_frame  = NULL;
    param.num_ext_buf   = 0;
    param.ext_buf       = ext_buf;
+    param.async_depth   = vpp->async_depth;

    if (inlink->format == AV_PIX_FMT_QSV) {
         if (!inlink->hw_frames_ctx || !inlink->hw_frames_ctx->data)
@ -467,23 +473,64 @@ static int config_output(AVFilterLink *outlink)
    return 0;
 }

-static int filter_frame(AVFilterLink *inlink, AVFrame *picref)
+static int activate(AVFilterContext *ctx)
 {
-    int              ret = 0;
-    AVFilterContext  *ctx = inlink->dst;
-    VPPContext       *vpp = inlink->dst->priv;
-    AVFilterLink     *outlink = ctx->outputs[0];
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    VPPContext *s =ctx->priv;
+    QSVVPPContext *qsv = s->qsv;
+    AVFrame *in = NULL;
+    int ret, status;
+    int64_t pts;

-    if (vpp->qsv) {
-        ret = ff_qsvvpp_filter_frame(vpp->qsv, inlink, picref);
-        av_frame_free(&picref);
-    } else {
-        if (picref->pts != AV_NOPTS_VALUE)
-            picref->pts = av_rescale_q(picref->pts, inlink->time_base, outlink->time_base);
-        ret = ff_filter_frame(outlink, picref);
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    if (!s->eof) {
+        ret = ff_inlink_consume_frame(inlink, &in);
+        if (ret < 0)
+            return ret;
+
+        if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
+            if (status == AVERROR_EOF) {
+                s->eof = 1;
+            }
+        }
    }

-    return ret;
+    if (qsv) {
+        if (in || s->eof) {
+            qsv->eof = s->eof;
+            ret = ff_qsvvpp_filter_frame(qsv, inlink, in);
+            av_frame_free(&in);
+
+            if (s->eof) {
+                ff_outlink_set_status(outlink, status, pts);
+                return 0;
+            }
+
+            if (qsv->got_frame) {
+                qsv->got_frame = 0;
+                return ret;
+            }
+        }
+    } else {
+        if (in) {
+            if (in->pts != AV_NOPTS_VALUE)
+                in->pts = av_rescale_q(in->pts, inlink->time_base, outlink->time_base);
+
+            ret = ff_filter_frame(outlink, in);
+            return ret;
+        }
+    }
+
+    if (s->eof) {
+        ff_outlink_set_status(outlink, status, pts);
+        return 0;
+    } else {
+        FF_FILTER_FORWARD_WANTED(outlink, inlink);
+    }
+
+    return FFERROR_NOT_READY;
 }

 static int query_formats(AVFilterContext *ctx)
@ -531,7 +578,6 @@ static const AVFilterPad vpp_inputs[] = {
        .name          = "default",
        .type          = AVMEDIA_TYPE_VIDEO,
        .config_props  = config_input,
-        .filter_frame  = filter_frame,
    },
    { NULL }
 };
@ -554,6 +600,7 @@ AVFilter ff_vf_vpp_qsv = {
    .uninit        = vpp_uninit,
    .inputs        = vpp_inputs,
    .outputs       = vpp_outputs,
+    .activate      = activate,
    .priv_class    = &vpp_class,
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
 };