; ffmpeg/libavfilter/x86/vf_nlmeans.asm (NASM syntax)

;*****************************************************************************
;* x86-optimized functions for nlmeans filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
SECTION_RODATA 32

; Tail masks for compute_weights_line: nine rows of eight dwords (32 bytes
; per row). Row r holds 0 in its first r dwords and -1 in the remaining
; 8 - r. The loop selects row r = min(endx - x, 8) by adding r * 32 to the
; table address and ORs the row into the per-lane values, so lanes that lie
; past the end of the range become 0xFFFFFFFF while the first r lanes are
; left unchanged.
ending_lut:
    dd -1, -1, -1, -1, -1, -1, -1, -1   ; row 0: no lane kept
    dd  0, -1, -1, -1, -1, -1, -1, -1   ; row 1: lane 0 kept
    dd  0,  0, -1, -1, -1, -1, -1, -1   ; row 2: lanes 0-1 kept
    dd  0,  0,  0, -1, -1, -1, -1, -1   ; row 3: lanes 0-2 kept
    dd  0,  0,  0,  0, -1, -1, -1, -1   ; row 4: lanes 0-3 kept
    dd  0,  0,  0,  0,  0, -1, -1, -1   ; row 5: lanes 0-4 kept
    dd  0,  0,  0,  0,  0,  0, -1, -1   ; row 6: lanes 0-5 kept
    dd  0,  0,  0,  0,  0,  0,  0, -1   ; row 7: lanes 0-6 kept
    dd  0,  0,  0,  0,  0,  0,  0,  0   ; row 8: full vector, nothing masked
SECTION .text
; void ff_compute_weights_line(const uint32_t *const iia,
;                              const uint32_t *const iib,
;                              const uint32_t *const iid,
;                              const uint32_t *const iie,
;                              const uint8_t *const src,
;                              float *total,
;                              float *sum,
;                              const float *const lut,
;                              int max,
;                              int startx, int endx);
;
; Processes elements x = startx .. endx - 1, eight per iteration:
;     idx       = umin(iie[x] - iid[x] - iib[x] + iia[x], max)  (32-bit wrap)
;     w         = lut[idx]
;     total[x] += w
;     sum[x]   += w * (float)src[x]
;
; Every iteration loads and stores a full vector of 8 consecutive elements.
; In the final iteration the lanes at or beyond endx are forced to idx = max
; through ending_lut; they still have lut[max] accumulated and are written
; back to total/sum.
; NOTE(review): presumably lut[max] is 0.0f and all buffers are padded to a
; multiple of 8 elements so that this is harmless -- confirm against caller.
;
; Range handling:
;   startx >  endx : returns immediately. Without this check the row offset
;                    into ending_lut would be negative and the mask would be
;                    read from memory located before the table.
;   startx == endx : one iteration with all 8 lanes masked (row 0).
;
; Register roles:
;   x       current element index
;   startx  startx argument on entry; inside the loop it is reused as the
;           byte offset of the selected ending_lut row
;   endx    one past the last element to process
;   mod     number of 32-bit lanes per vector (mmsize / 4)
;   elut    address of ending_lut
;   m2      max broadcast to every lane
;   m4      all-ones, used to re-arm the gather mask
INIT_YMM avx2
cglobal compute_weights_line, 8, 13, 5, 0, iia, iib, iid, iie, src, total, sum, lut, x, startx, endx, mod, elut
    movsxd          startxq, dword startxm
    movsxd          endxq, dword endxm

    ; An inverted range would index ending_lut with a negative row.
    cmp             startxq, endxq
    jg              .end

    VPBROADCASTD    m2, r8m                     ; max is the 9th argument (index 8)

    mov             xq, startxq
    mov             modq, mmsize / 4
    lea             elutq, [ending_lut]

    vpcmpeqd        m4, m4                      ; m4 = all-ones

.loop:
    ; startx = min(endx - x, lanes per vector) * 32 = byte offset of mask row
    mov             startxq, endxq
    sub             startxq, xq
    cmp             startxq, modq
    cmovge          startxq, modq
    sal             startxq, 5

    ; m0 = iie[x] - iid[x] - iib[x] + iia[x]
    movu            m0, [iieq + xq * 4]
    psubd           m0, [iidq + xq * 4]
    psubd           m0, [iibq + xq * 4]
    paddd           m0, [iiaq + xq * 4]

    por             m0, [elutq + startxq]       ; lanes past endx -> 0xFFFFFFFF
    pminud          m0, m2                      ; unsigned clamp to max
    pslld           m0, 2                       ; index -> byte offset (4-byte floats)

    ; The gather clears its mask operand, so hand it a fresh all-ones copy.
    mova            m3, m4
    vgatherdps      m1, [lutq + m0], m3         ; m1 = lut[idx]

    ; m0 = (float)src[x] for 8 consecutive bytes
    pmovzxbd        m0, [srcq + xq]
    cvtdq2ps        m0, m0

    mulps           m0, m1                      ; m0 = w * src

    addps           m1, [totalq + xq * 4]       ; total[x] + w
    addps           m0, [sumq + xq * 4]         ; sum[x] + w * src

    movups          [totalq + xq * 4], m1
    movups          [sumq + xq * 4], m0

    add             xq, mmsize / 4
    cmp             xq, endxq
    jl              .loop                       ; signed: x and endx come from int

.end:
    RET
%endif