armv6: Accelerate butterflies_float

I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in butterflies_float_c() / ff_butterflies_float_vfp() for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 1542.8 43.7 1470.5 41.5 100.0% +4.9% butterflies_float 130.0 11.9 70.2 12.1 100.0% +85.2% Signed-off-by: Martin Storsjö <martin@martin.st>
2014-07-11 00:12:34 +01:00 · 2014-07-11 00:12:34 +01:00 · 5a272190a0
parent 5edad2c4a1
commit 5a272190a0
2 changed files with 120 additions and 0 deletions
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@ -32,6 +32,8 @@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
 void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
                                const float *src1, int len);

+void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len);
+
 av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
 {
    if (!have_vfpv3(cpu_flags)) {
@ -39,4 +41,6 @@ av_cold void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp, int cpu_flags)
        fdsp->vector_fmul_window = ff_vector_fmul_window_vfp;
    }
    fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+    if (!have_vfpv3(cpu_flags))
+        fdsp->butterflies_float = ff_butterflies_float_vfp;
 }
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@ -339,3 +339,119 @@ function ff_vector_fmul_reverse_vfp, export=1
        vpop            {d8-d15}
        bx              lr
 endfunc
+
+/**
+ * ARM VFP implementation of 'butterflies_float_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
+function ff_butterflies_float_vfp, export=1
+BASE1   .req    a1
+BASE2   .req    a2
+LEN     .req    a3
+OLDFPSCR .req   a4
+
+        vpush   {s16-s31}
+        fmrx    OLDFPSCR, FPSCR
+
+        tst     LEN, #7
+        beq     4f                          @ common case: len is a multiple of 8
+
+        ldr     ip, =0x03000000             @ RunFast mode, scalar mode
+        fmxr    FPSCR, ip
+
+        tst     LEN, #1
+        beq     1f
+        vldmia  BASE1!, {s0}
+        vldmia  BASE2!, {s8}
+        vadd.f  s16, s0, s8
+        vsub.f  s24, s0, s8
+        vstr    s16, [BASE1, #0-4*1]
+        vstr    s24, [BASE2, #0-4*1]
+1:
+        tst     LEN, #2
+        beq     2f
+        vldmia  BASE1!, {s0-s1}
+        vldmia  BASE2!, {s8-s9}
+        vadd.f  s16, s0, s8
+        vadd.f  s17, s1, s9
+        vsub.f  s24, s0, s8
+        vsub.f  s25, s1, s9
+        vstr    d8, [BASE1, #0-8*1]    @ s16,s17
+        vstr    d12, [BASE2, #0-8*1]   @ s24,s25
+2:
+        tst     LEN, #4
+        beq     3f
+        vldmia  BASE1!, {s0-s1}
+        vldmia  BASE2!, {s8-s9}
+        vldmia  BASE1!, {s2-s3}
+        vldmia  BASE2!, {s10-s11}
+        vadd.f  s16, s0, s8
+        vadd.f  s17, s1, s9
+        vsub.f  s24, s0, s8
+        vsub.f  s25, s1, s9
+        vadd.f  s18, s2, s10
+        vadd.f  s19, s3, s11
+        vsub.f  s26, s2, s10
+        vsub.f  s27, s3, s11
+        vstr    d8, [BASE1, #0-16*1]    @ s16,s17
+        vstr    d12, [BASE2, #0-16*1]   @ s24,s25
+        vstr    d9, [BASE1, #8-16*1]    @ s18,s19
+        vstr    d13, [BASE2, #8-16*1]   @ s26,s27
+3:
+        bics    LEN, LEN, #7
+        beq     7f
+4:
+        ldr     ip, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+
+        vldmia  BASE1!, {s0-s1}
+        vldmia  BASE2!, {s8-s9}
+        vldmia  BASE1!, {s2-s3}
+        vldmia  BASE2!, {s10-s11}
+        vadd.f  s16, s0, s8
+            vldmia  BASE1!, {s4-s5}
+            vldmia  BASE2!, {s12-s13}
+            vldmia  BASE1!, {s6-s7}
+            vldmia  BASE2!, {s14-s15}
+        vsub.f  s24, s0, s8
+            vadd.f  s20, s4, s12
+        subs    LEN, LEN, #8
+        beq     6f
+5:              vldmia  BASE1!, {s0-s3}
+                vldmia  BASE2!, {s8-s11}
+            vsub.f  s28, s4, s12
+        vstr    d8, [BASE1, #0-16*3]    @ s16,s17
+        vstr    d9, [BASE1, #8-16*3]    @ s18,s19
+        vstr    d12, [BASE2, #0-16*3]   @ s24,s25
+        vstr    d13, [BASE2, #8-16*3]   @ s26,s27
+                vadd.f  s16, s0, s8
+                    vldmia  BASE1!, {s4-s7}
+                    vldmia  BASE2!, {s12-s15}
+                vsub.f  s24, s0, s8
+            vstr    d10, [BASE1, #0-16*3]   @ s20,s21
+            vstr    d11, [BASE1, #8-16*3]   @ s22,s23
+            vstr    d14, [BASE2, #0-16*3]   @ s28,s29
+            vstr    d15, [BASE2, #8-16*3]   @ s30,s31
+                    vadd.f  s20, s4, s12
+        subs    LEN, LEN, #8
+        bne     5b
+6:                   vsub.f  s28, s4, s12
+                vstr    d8, [BASE1, #0-16*2]    @ s16,s17
+                vstr    d9, [BASE1, #8-16*2]    @ s18,s19
+                vstr    d12, [BASE2, #0-16*2]   @ s24,s25
+                vstr    d13, [BASE2, #8-16*2]   @ s26,s27
+                    vstr    d10, [BASE1, #0-16*1]   @ s20,s21
+                    vstr    d11, [BASE1, #8-16*1]   @ s22,s23
+                    vstr    d14, [BASE2, #0-16*1]   @ s28,s29
+                    vstr    d15, [BASE2, #8-16*1]   @ s30,s31
+7:
+        fmxr    FPSCR, OLDFPSCR
+        vpop    {s16-s31}
+        bx      lr
+
+        .unreq  BASE1
+        .unreq  BASE2
+        .unreq  LEN
+        .unreq  OLDFPSCR
+endfunc