x86/takdsp: add avx2 versions of all functions

Benchmark results on an Intel Core i7-12700K (lower is better):

decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8

Tested-by: Lynne <dev@lynne.ee>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2023-12-22 20:34:52 -03:00
parent 370ce305f4
commit 591dc3b4b8
2 changed files with 38 additions and 14 deletions

View File

@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
INIT_XMM sse2
%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
mova m2, m1
mova m5, m4
psrad m2, 1
psrad m5, 1
psrad m2, m1, 1
psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@ -88,29 +86,44 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
%endmacro
INIT_XMM sse4
INIT_XMM sse2
TAK_DECORRELATE
INIT_YMM avx2
TAK_DECORRELATE
; TAK_DECORRELATE_SF: expands to the tak_decorrelate_sf function for whichever
; instruction set was selected by the INIT_XMM/INIT_YMM that precedes the
; macro invocation.
;
; For every vector of 32-bit lanes the loop computes
;     p1 = ((((p2 >> dshift) * dfactor) + 128) >> 8 << dshift) - p1
; and stores the result back to p1.
;
; NOTE(review): this listing looks like a diff whose +/- markers were lost.
; Several instructions appear twice in a row in two forms (mN vs. xmN
; operands, pshufd vs. VPBROADCASTD, mova vs. VBROADCASTI128, register vs.
; memory operand on psubd). Presumably only one form of each pair belongs in
; the assembled file -- confirm against the real source before relying on
; the exact instruction sequence below.
%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
; Scale the element count by 4 (the C prototype uses int32_t samples).
shl lengthd, 2
; Advance both pointers by the scaled length and negate it, so that
; [pN + lengthq] walks forward while lengthq counts up towards zero.
add p1q, lengthq
add p2q, lengthq
neg lengthq
; Loop constants: m2/xm2 = shift count (dshift), m3 = dfactor in every lane,
; m4 = the pd_128 constant (dwords of 128) used as the rounding bias.
; First form of the constant setup (plain mN registers, pshufd broadcast):
movd m2, dshiftm
movd m3, dfactorm
pshufd m3, m3, 0
mova m4, [pd_128]
; Second form of the constant setup (xmN for scalar moves, macro broadcasts):
movd xm2, dshiftm
; On UNIX64 dfactor goes through xm3 first; otherwise it is broadcast straight
; from dfactorm. NOTE(review): presumably because dfactor arrives in a
; general-purpose register on UNIX64 and on the stack elsewhere -- confirm.
%if UNIX64
movd xm3, dfactorm
VPBROADCASTD m3, xm3
%else
VPBROADCASTD m3, dfactorm
%endif
VBROADCASTI128 m4, [pd_128]
.loop:
; m0 = current p1 vector (only consumed by the register form of psubd below).
mova m0, [p1q+lengthq]
; m1 = current p2 vector.
mova m1, [p2q+lengthq]
; Arithmetic shift right by dshift (two forms, see the note at the top).
psrad m1, m2
psrad m1, xm2
; Multiply by dfactor, add 128, then arithmetic shift right by 8.
pmulld m1, m3
paddd m1, m4
psrad m1, 8
; Shift left by dshift again, then subtract the p1 vector (two forms each).
pslld m1, m2
psubd m1, m0
pslld m1, xm2
psubd m1, [p1q+lengthq]
; Write the result back over p1.
mova [p1q+lengthq], m1
; Step one vector forward; keep looping while the offset is still negative.
add lengthq, mmsize
jl .loop
RET
%endmacro
INIT_XMM sse4
TAK_DECORRELATE_SF
INIT_YMM avx2
TAK_DECORRELATE_SF

View File

@ -24,9 +24,13 @@
#include "config.h"
/*
 * Prototypes for the instruction-set-specific variants of the four TAK
 * decorrelation routines (ls, sr, sm, sf). The suffix names the instruction
 * set each variant targets; ff_takdsp_init_x86() installs them into the
 * TAKDSPContext according to the detected CPU flags.
 * NOTE(review): presumably these are the symbols emitted by the cglobal
 * declarations in the accompanying assembly file -- confirm the name
 * mangling against the build.
 */
void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_ls_avx2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sr_avx2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
/* The sf variants additionally take the shift amount and scaling factor. */
void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
void ff_tak_decorrelate_sf_avx2(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
}
#endif
}