x86/tx_float: fully support 128bit regs in LOAD64_LUT

The gather path didn't support 128bit registers. It's not faster on Zen 3, but it's here for completeness.
2022-09-19 04:13:04 +02:00 · 2022-09-19 04:13:04 +02:00 · 892548e6a1
commit 892548e6a1
parent af42bb3d61
1 changed files with 5 additions and 5 deletions
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@ -107,19 +107,19 @@ SECTION .text
 ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
 %macro LOAD64_LUT 5-7
 %if %0 > 6 && cpuflag(avx2)
-    pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
-    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
-    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
+    pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+    movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+    vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
 %else
    mov      %5d, [%3 + %4 + 0]
    movsd  xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
    mov      %5d, [%3 + %4 + 8]
    movsd  xmm%6, [%2 + %5q*8]
 %endif
    mov      %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
    mov      %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1