x86/tx_float: add a standalone 15-point AVX2 transform

Enables its use everywhere else in the framework.
2022-09-28 06:46:57 +02:00 · 2022-09-28 06:46:57 +02:00 · cc1df4045e
commit cc1df4045e
parent 877e575b5d
2 changed files with 117 additions and 0 deletions
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@ -1515,6 +1515,69 @@ FFT_SPLIT_RADIX_FN avx2, 1
 %endif
 %endif

+; FFT15_FN <direct_load>, <suffix>
+; Emits the AVX2 function fft15_<suffix>. It loads fifteen 8-byte input
+; elements, expands the FFT15 macro (defined elsewhere in this file) and
+; stores fifteen 8-byte results to out + k*stride for k = 0..14.
+;   %1: non-zero -> the input is read from consecutive memory at inq
+;       zero     -> the input is fetched through the table at AVTXContext.map
+;   %2: suffix appended to "fft15_" to form the function name
+%macro FFT15_FN 2
+INIT_YMM avx2
+; x86inc cglobal: 4 arguments, 10 general purpose and 16 vector registers
+cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
+    mov lutq, [ctxq + AVTXContext.map]

+    ; precompute stride multiples that x86 address scaling cannot express
+    imul stride3q, strideq, 3
+    imul stride5q, strideq, 5

+    ; constants kept in m8-m11; presumably consumed by the FFT15 macro below
+    movaps m11, [mask_mmppmmmm]      ; mmppmmmm
+    movaps m10, [tab_53_float]       ; tab5
+    movaps xm9, [tab_53_float + 32]  ; tab3
+    vpermpd m9, m9, q1110            ; tab[23232323]
+    movaps m8, [s15_perm]

+%if %1
+    ; direct path: elements 0-1, element 2 (duplicated), then 3-6, 7-10, 11-14
+    movups  xm0, [inq]
+    movddup xm5, [inq + 16]
+    movups  m2, [inq + mmsize*0 + 24]
+    movups  m3, [inq + mmsize*1 + 24]
+    movups  m4, [inq + mmsize*2 + 24]
+%else
+    ; table path: LOAD64_LUT is defined elsewhere; it is handed the table byte
+    ; offsets 0, 12, 28 and 44 (presumably entries 0-1, 3-6, 7-10 and 11-14)
+    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15
+    LOAD64_LUT  m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
+    LOAD64_LUT  m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
+    LOAD64_LUT  m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
+    ; table entry 2 (32-bit index) selects the 8-byte element duplicated into xm5
+    mov tmpd, [lutq + 8]
+    movddup xm5, [inq + tmpq*8]
+%endif

+    FFT15

+    ; extra base pointers so the remaining offsets fit x86 scale factors
+    lea tgt5q, [outq + stride5q]     ; &out[5]
+    lea tmpq,  [outq + stride5q*2]   ; &out[10]

+    movhps [outq], xm14              ; out[0]
+    movhps [outq + stride5q*1], xm15 ; out[5]
+    movlps [outq + stride5q*2], xm15 ; out[10]

+    ; move the upper 128-bit lanes of m0-m2 into xm3-xm5 for the 64-bit stores
+    vextractf128 xm3, m0, 1
+    vextractf128 xm4, m1, 1
+    vextractf128 xm5, m2, 1

+    movlps [outq  + strideq*1],  xm1 ; out[1]
+    movhps [outq  + strideq*2],  xm2 ; out[2]
+    movlps [outq  + stride3q*1], xm3 ; out[3]
+    movhps [outq  + strideq*4],  xm4 ; out[4]
+    movlps [outq  + stride3q*2], xm0 ; out[6]
+    movlps [outq  + strideq*8],  xm5 ; out[8]
+    movhps [outq  + stride3q*4], xm0 ; out[12]
+    movhps [tgt5q + strideq*2],  xm1 ; out[7]
+    movhps [tgt5q + strideq*4],  xm3 ; out[9]
+    movlps [tmpq  + strideq*1],  xm2 ; out[11]
+    movlps [tmpq  + stride3q*1], xm4 ; out[13]
+    movhps [tmpq  + strideq*4],  xm5 ; out[14]

+    RET
+%endmacro
+
+; Instantiated only when both ARCH_X86_64 and HAVE_AVX2_EXTERNAL are set.
+; fft15_float fetches its input through the AVTXContext.map table (%1 = 0);
+; fft15_ns_float reads its input from consecutive memory (%1 = 1).
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+FFT15_FN 0, float
+FFT15_FN 1, ns_float
+%endif
+
 %macro IMDCT_FN 1
 INIT_YMM %1
 cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@ -30,6 +30,8 @@ TX_DECL_FN(fft8,      sse3)
 TX_DECL_FN(fft8_ns,   sse3)
 TX_DECL_FN(fft8,      avx)
 TX_DECL_FN(fft8_ns,   avx)
+/* 15-point AVX2 transforms; presumably the fft15_float and fft15_ns_float
+ * functions emitted by FFT15_FN in tx_float.asm - TODO confirm naming */
+TX_DECL_FN(fft15,     avx2)
+TX_DECL_FN(fft15_ns,  avx2)
 TX_DECL_FN(fft16,     avx)
 TX_DECL_FN(fft16_ns,  avx)
 TX_DECL_FN(fft16,     fma3)
@ -85,6 +87,53 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s,                 \
 DECL_INIT_FN(8, 0)
 DECL_INIT_FN(8, 2)

+/**
+ * Init callback that builds the input lookup table (s->map) for a
+ * standalone factor transform.
+ *
+ * The table starts as the identity (forward) or as the identity with the
+ * non-DC entries reversed (inverse). For len == 15 it is then reordered
+ * twice: once into the input order of a 5x3 prime-factor transform and once
+ * into the load order used by the 15-point assembly.
+ *
+ * @param s     context; s->map is allocated here with len entries
+ * @param cd    unused
+ * @param flags unused
+ * @param opts  unused
+ * @param len   transform length
+ * @param inv   non-zero to build the map for the inverse transform
+ * @param scale unused
+ * @return 0 on success, AVERROR(ENOMEM) if the map cannot be allocated
+ */
+static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
+                               uint64_t flags, FFTXCodeletOptions *opts,
+                               int len, int inv, const void *scale)
+{
+    TX_TAB(ff_tx_init_tabs)(len);
+
+    /* Size by the element type, not by the pointer itself */
+    s->map = av_malloc(len*sizeof(*s->map));
+    if (!s->map)
+        return AVERROR(ENOMEM);
+
+    s->map[0] = 0; /* DC is always at the start */
+    if (inv) /* Reversing the ACs flips the transform direction */
+        for (int i = 1; i < len; i++)
+            s->map[i] = len - i;
+    else
+        for (int i = 1; i < len; i++)
+            s->map[i] = i;
+
+    if (len == 15) {
+        int cnt = 0, tmp[15];
+
+        /* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
+        memcpy(tmp, s->map, 15*sizeof(*tmp));
+        for (int i = 0; i < 5; i++)
+            for (int j = 0; j < 3; j++)
+                s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];
+
+        /* Special 15-point assembly permutation: first regroup the entries
+         * by their index modulo 3, in the order 1, 2, 0 ... */
+        memcpy(tmp, s->map, 15*sizeof(*tmp));
+        for (int i = 1; i < 15; i += 3) {
+            s->map[cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 2; i < 15; i += 3) {
+            s->map[cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 0; i < 15; i += 3) {
+            s->map[cnt] = tmp[i];
+            cnt++;
+        }
+
+        /* ... then open up slots 1 and 2 for tmp[2] and tmp[0]. The final
+         * order, as indices into tmp, is:
+         *   1 2 0 | 4 7 10 13 | 5 8 11 14 | 3 6 9 12 */
+        memmove(&s->map[7], &s->map[6], 4*sizeof(*s->map));
+        memmove(&s->map[3], &s->map[1], 4*sizeof(*s->map));
+        s->map[1] = tmp[2];
+        s->map[2] = tmp[0];
+    }
+
+    return 0;
+}
+
 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
                              uint64_t flags, FFTXCodeletOptions *opts,
                              int len, int inv, const void *scale)
@ -229,6 +278,11 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
           AV_CPU_FLAG_AVXSLOW),

 #if HAVE_AVX2_EXTERNAL
+    TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
+           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW),
+
    TX_DEF(fft_sr,    FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2,