From d6fdc78e9164c8e6bf7bfc7766932c467b874aa2 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 12 Jul 2021 12:31:14 +0200 Subject: [PATCH] sws: implement slice threading --- Changelog | 1 + libswscale/options.c | 3 ++ libswscale/swscale.c | 59 ++++++++++++++++++++++++ libswscale/swscale_internal.h | 14 ++++++ libswscale/utils.c | 85 +++++++++++++++++++++++++++++++++++ 5 files changed, 162 insertions(+) diff --git a/Changelog b/Changelog index 5b58401183..f66f47b6cb 100644 --- a/Changelog +++ b/Changelog @@ -16,6 +16,7 @@ version : - atilt audio filter - grayworld video filter - AV1 Low overhead bitstream format muxer +- swscale slice threading version 4.4: diff --git a/libswscale/options.c b/libswscale/options.c index 7eb2752543..4b71a23e37 100644 --- a/libswscale/options.c +++ b/libswscale/options.c @@ -81,6 +81,9 @@ static const AVOption swscale_options[] = { { "uniform_color", "blend onto a uniform color", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX, VE, "alphablend" }, { "checkerboard", "blend onto a checkerboard", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX, VE, "alphablend" }, + { "threads", "number of threads", OFFSET(nb_threads), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, VE, "threads" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, .flags = VE, "threads" }, + { NULL } }; diff --git a/libswscale/swscale.c b/libswscale/swscale.c index ca5c612b18..c233818dcf 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -1113,6 +1113,9 @@ int sws_send_slice(struct SwsContext *c, unsigned int slice_start, unsigned int sws_receive_slice_alignment(const struct SwsContext *c) { + if (c->slice_ctx) + return c->slice_ctx[0]->dst_slice_align; + return c->dst_slice_align; } @@ -1136,6 +1139,27 @@ int sws_receive_slice(struct SwsContext *c, unsigned int slice_start, return AVERROR(EINVAL); } + if (c->slicethread) { + int nb_jobs = c->slice_ctx[0]->dither == SWS_DITHER_ED ? 1 : c->nb_slice_ctx; + int ret = 0; + + c->dst_slice_start = slice_start; + c->dst_slice_height = slice_height; + + avpriv_slicethread_execute(c->slicethread, nb_jobs, 0); + + for (int i = 0; i < c->nb_slice_ctx; i++) { + if (c->slice_err[i] < 0) { + ret = c->slice_err[i]; + break; + } + } + + memset(c->slice_err, 0, c->nb_slice_ctx * sizeof(*c->slice_err)); + + return ret; + } + for (int i = 0; i < FF_ARRAY_ELEMS(dst) && c->frame_dst->data[i]; i++) { dst[i] = c->frame_dst->data[i] + c->frame_dst->linesize[i] * (slice_start >> c->chrDstVSubSample); @@ -1173,6 +1197,41 @@ int attribute_align_arg sws_scale(struct SwsContext *c, int srcSliceH, uint8_t *const dst[], const int dstStride[]) { + if (c->nb_slice_ctx) + c = c->slice_ctx[0]; + return scale_internal(c, srcSlice, srcStride, srcSliceY, srcSliceH, dst, dstStride, 0, c->dstH); } + +void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, + int nb_jobs, int nb_threads) +{ + SwsContext *parent = priv; + SwsContext *c = parent->slice_ctx[threadnr]; + + const int slice_height = FFALIGN(FFMAX((parent->dst_slice_height + nb_jobs - 1) / nb_jobs, 1), + c->dst_slice_align); + const int slice_start = jobnr * slice_height; + const int slice_end = FFMIN((jobnr + 1) * slice_height, parent->dst_slice_height); + int err = 0; + + if (slice_end > slice_start) { + uint8_t *dst[4] = { NULL }; + + for (int i = 0; i < FF_ARRAY_ELEMS(dst) && parent->frame_dst->data[i]; i++) { + const int vshift = (i == 1 || i == 2) ? c->chrDstVSubSample : 0; + const ptrdiff_t offset = parent->frame_dst->linesize[i] * + ((slice_start + parent->dst_slice_start) >> vshift); + + dst[i] = parent->frame_dst->data[i] + offset; + } + + err = scale_internal(c, (const uint8_t * const *)parent->frame_src->data, + parent->frame_src->linesize, 0, c->srcH, + dst, parent->frame_dst->linesize, + parent->dst_slice_start + slice_start, slice_end - slice_start); + } + + parent->slice_err[threadnr] = err; +} diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 55fa6cec07..fbfc08a89f 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -33,6 +33,7 @@ #include "libavutil/mem_internal.h" #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" +#include "libavutil/slicethread.h" #include "libavutil/ppc/util_altivec.h" #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long @@ -300,6 +301,15 @@ typedef struct SwsContext { */ const AVClass *av_class; + AVSliceThread *slicethread; + struct SwsContext **slice_ctx; + int *slice_err; + int nb_slice_ctx; + + // values passed to current sws_receive_slice() call + unsigned int dst_slice_start; + unsigned int dst_slice_height; + /** * Note that src, dst, srcStride, dstStride will be copied in the * sws_scale() wrapper so they can be freely modified here. @@ -325,6 +335,7 @@ typedef struct SwsContext { int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image. int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user. int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top). + int nb_threads; ///< Number of threads used for scaling double param[2]; ///< Input parameters for scaling algorithms that need them. AVFrame *frame_src; @@ -1082,6 +1093,9 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn yuv2interleavedX_fn yuv2nv12cX, yuv2packed1_fn yuv2packed1, yuv2packed2_fn yuv2packed2, yuv2packedX_fn yuv2packedX, yuv2anyX_fn yuv2anyX, int use_mmx); +void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, + int nb_jobs, int nb_threads); + //number of extra lines to process #define MAX_LINES_AHEAD 4 diff --git a/libswscale/utils.c b/libswscale/utils.c index 235a846809..25051ead72 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -49,6 +49,7 @@ #include "libavutil/mathematics.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#include "libavutil/slicethread.h" #include "libavutil/thread.h" #include "libavutil/aarch64/cpu.h" #include "libavutil/ppc/cpu.h" @@ -871,6 +872,18 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], const AVPixFmtDescriptor *desc_src; int need_reinit = 0; + if (c->nb_slice_ctx) { + for (int i = 0; i < c->nb_slice_ctx; i++) { + int ret = sws_setColorspaceDetails(c->slice_ctx[i], inv_table, + srcRange, table, dstRange, + brightness, contrast, saturation); + if (ret < 0) + return ret; + } + + return 0; + } + handle_formats(c); desc_dst = av_pix_fmt_desc_get(c->dstFormat); desc_src = av_pix_fmt_desc_get(c->srcFormat); @@ -1005,6 +1018,12 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, if (!c ) return -1; + if (c->nb_slice_ctx) { + return sws_getColorspaceDetails(c->slice_ctx[0], inv_table, srcRange, + table, dstRange, brightness, contrast, + saturation); + } + *inv_table = c->srcColorspaceTable; *table = c->dstColorspaceTable; *srcRange = range_override_needed(c->srcFormat) ? 1 : c->srcRange; @@ -1170,6 +1189,58 @@ static enum AVPixelFormat alphaless_fmt(enum AVPixelFormat fmt) } } +static int context_init_threaded(SwsContext *c, + SwsFilter *src_filter, SwsFilter *dst_filter) +{ + int ret; + + ret = avpriv_slicethread_create(&c->slicethread, (void*)c, + ff_sws_slice_worker, NULL, c->nb_threads); + if (ret == AVERROR(ENOSYS)) { + c->nb_threads = 1; + return 0; + } else if (ret < 0) + return ret; + + c->nb_threads = ret; + + c->slice_ctx = av_mallocz_array(c->nb_threads, sizeof(*c->slice_ctx)); + c->slice_err = av_mallocz_array(c->nb_threads, sizeof(*c->slice_err)); + if (!c->slice_ctx || !c->slice_err) + return AVERROR(ENOMEM); + + for (int i = 0; i < c->nb_threads; i++) { + c->slice_ctx[i] = sws_alloc_context(); + if (!c->slice_ctx[i]) + return AVERROR(ENOMEM); + + ret = av_opt_copy((void*)c->slice_ctx[i], (void*)c); + if (ret < 0) + return ret; + + c->slice_ctx[i]->nb_threads = 1; + + ret = sws_init_context(c->slice_ctx[i], src_filter, dst_filter); + if (ret < 0) + return ret; + + c->nb_slice_ctx++; + + if (c->slice_ctx[i]->dither == SWS_DITHER_ED) { + av_log(c, AV_LOG_VERBOSE, + "Error-diffusion dither is in use, scaling will be single-threaded."); + break; + } + } + + c->frame_src = av_frame_alloc(); + c->frame_dst = av_frame_alloc(); + if (!c->frame_src || !c->frame_dst) + return AVERROR(ENOMEM); + + return 0; +} + av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) { @@ -1192,6 +1263,13 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, static const float float_mult = 1.0f / 255.0f; static AVOnce rgb2rgb_once = AV_ONCE_INIT; + if (c->nb_threads != 1) { + ret = context_init_threaded(c, srcFilter, dstFilter); + if (ret < 0 || c->nb_threads > 1) + return ret; + // threading disabled in this build, init as single-threaded + } + cpu_flags = av_get_cpu_flags(); flags = c->flags; emms_c(); @@ -2254,6 +2332,13 @@ void sws_freeContext(SwsContext *c) if (!c) return; + for (i = 0; i < c->nb_slice_ctx; i++) + sws_freeContext(c->slice_ctx[i]); + av_freep(&c->slice_ctx); + av_freep(&c->slice_err); + + avpriv_slicethread_free(&c->slicethread); + for (i = 0; i < 4; i++) av_freep(&c->dither_error[i]);