diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c index dee1f68101..8a05965c9b 100644 --- a/libavfilter/vf_nlmeans.c +++ b/libavfilter/vf_nlmeans.c @@ -308,9 +308,9 @@ static int config_input(AVFilterLink *inlink) s->ii = s->ii_orig + s->ii_lz_32 + 1; // allocate weighted average for every pixel - s->linesize = inlink->w; - s->total_weight = av_malloc_array(inlink->w, inlink->h * sizeof(*s->total_weight)); - s->sum = av_malloc_array(inlink->w, inlink->h * sizeof(*s->sum)); + s->linesize = inlink->w + 100; + s->total_weight = av_malloc_array(s->linesize, inlink->h * sizeof(*s->total_weight)); + s->sum = av_malloc_array(s->linesize, inlink->h * sizeof(*s->sum)); if (!s->total_weight || !s->sum) return AVERROR(ENOMEM); @@ -519,6 +519,9 @@ void ff_nlmeans_init(NLMeansDSPContext *dsp) if (ARCH_AARCH64) ff_nlmeans_init_aarch64(dsp); + + if (ARCH_X86) + ff_nlmeans_init_x86(dsp); } static av_cold int init(AVFilterContext *ctx) diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h index cd1ee7c0bf..43611a03bd 100644 --- a/libavfilter/vf_nlmeans.h +++ b/libavfilter/vf_nlmeans.h @@ -41,5 +41,6 @@ typedef struct NLMeansDSPContext { void ff_nlmeans_init(NLMeansDSPContext *dsp); void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp); +void ff_nlmeans_init_x86(NLMeansDSPContext *dsp); #endif /* AVFILTER_NLMEANS_H */ diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index a29941eaeb..e87481bd7a 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -20,6 +20,7 @@ OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o +OBJS-$(CONFIG_NLMEANS_FILTER) += x86/vf_nlmeans_init.o OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o @@ -61,6 +62,7 @@ 
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o +X86ASM-OBJS-$(CONFIG_NLMEANS_FILTER) += x86/vf_nlmeans.o X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm new file mode 100644 index 0000000000..8f57801035 --- /dev/null +++ b/libavfilter/x86/vf_nlmeans.asm @@ -0,0 +1,97 @@ +;***************************************************************************** +;* x86-optimized functions for nlmeans filter +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+
+%include "libavutil/x86/x86util.asm"
+
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+
+SECTION_RODATA 32
+; Tail masks: row k (32 bytes, k = 0..8) = k zero dwords, then (8 - k) all-ones dwords.
+ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
+               0, -1, -1, -1, -1, -1, -1, -1,\
+               0, 0, -1, -1, -1, -1, -1, -1,\
+               0, 0, 0, -1, -1, -1, -1, -1,\
+               0, 0, 0, 0, -1, -1, -1, -1,\
+               0, 0, 0, 0, 0, -1, -1, -1,\
+               0, 0, 0, 0, 0, 0, -1, -1,\
+               0, 0, 0, 0, 0, 0, 0, -1,\
+               0, 0, 0, 0, 0, 0, 0, 0
+
+SECTION .text
+; Per x, 8 at a time: w = lut[min(iie[x] - iid[x] - iib[x] + iia[x], max)]; total[x] += w; sum[x] += w * src[x].
+; void ff_compute_weights_line(const uint32_t *const iia,
+;                              const uint32_t *const iib,
+;                              const uint32_t *const iid,
+;                              const uint32_t *const iie,
+;                              const uint8_t *const src,
+;                              float *total,
+;                              float *sum,
+;                              const float *const lut,
+;                              int max,
+;                              int startx, int endx);
+; cglobal below: 8 args loaded into GPRs, 13 GPRs and 5 vector regs used, no stack; max/startx/endx are read from their argument slots.
+INIT_YMM avx2 ; ymm registers, so mmsize / 4 below is the number of 32-bit lanes per iteration
+cglobal compute_weights_line, 8, 13, 5, 0, iia, iib, iid, iie, src, total, sum, lut, x, startx, endx, mod, elut
+    movsxd startxq, dword startxm       ; sign-extend the two int arguments that were not loaded by cglobal
+    movsxd endxq, dword endxm
+    VPBROADCASTD m2, r8m                ; m2 = argument index 8 ('max' in the prototype above) in every lane
+
+    mov xq, startxq                     ; x = running column index
+    mov modq, mmsize / 4                ; lanes per iteration, used to clamp the tail row index
+    lea elutq, [ending_lut]
+
+    vpcmpeqd m4, m4                     ; m4 = all ones, template for the gather mask
+; NOTE(review): the body runs at least once and the row index has no lower clamp, so this assumes startx <= endx - confirm against callers.
+    .loop:
+        mov startxq, endxq              ; startx is reused as scratch from here on
+        sub startxq, xq                 ; lanes left = endx - x
+        cmp startxq, modq
+        cmovge startxq, modq            ; clamp to mmsize / 4 (last row: nothing masked)
+        sal startxq, 5                  ; * 32 = byte offset of the ending_lut row
+; m0 = iie[x] - iid[x] - iib[x] + iia[x] in each 32-bit lane
+        movu m0, [iieq + xq * 4]
+
+        psubd m0, [iidq + xq * 4]
+        psubd m0, [iibq + xq * 4]
+        paddd m0, [iiaq + xq * 4]
+        por m0, [elutq + startxq]       ; lanes at or past endx become 0xFFFFFFFF
+        pminud m0, m2                   ; unsigned clamp to max, which also catches the masked lanes
+        pslld m0, 2                     ; float index -> byte offset (the gather below uses no scale)
+        mova m3, m4                     ; fresh all-ones mask: vgatherdps clears its mask operand
+        vgatherdps m1, [lutq + m0], m3  ; m1 = lut[clamped value] per lane
+
+        pmovzxbd m0, [srcq + xq]        ; 8 source bytes zero-extended to 8 dwords
+        cvtdq2ps m0, m0                 ; ... and converted to float
+
+        mulps m0, m1                    ; m0 = weight * pixel
+
+        addps m1, [totalq + xq * 4]     ; accumulate onto the existing totals
+        addps m0, [sumq + xq * 4]
+; NOTE(review): the tail iteration still loads/stores all 8 lanes (up to 7 elements past endx); presumably lut[max] is 0 and the buffers are padded - confirm.
+        movups [totalq + xq * 4], m1
+        movups [sumq + xq * 4], m0
+
+        add xq, mmsize / 4
+        cmp xq, endxq
+        jl .loop
+    RET
+
+%endif
diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
new file mode 100644
index
0000000000..37764d30ab --- /dev/null +++ b/libavfilter/x86/vf_nlmeans_init.c @@ -0,0 +1,40 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_nlmeans.h" + +void ff_compute_weights_line_avx2(const uint32_t *const iia, + const uint32_t *const iib, + const uint32_t *const iid, + const uint32_t *const iie, + const uint8_t *const src, + float *total_weight, + float *sum, + const float *const weight_lut, + int max_meaningful_diff, + int startx, int endx); + +av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags)) + dsp->compute_weights_line = ff_compute_weights_line_avx2; +}