arm/aarch64: vp9: Fix vertical alignment

Align the second/third operands as they usually are. Due to the wildly varying sizes of the written-out operands in aarch64 assembly, the column alignment is usually not as clear as in arm assembly. This is cherry-picked from libav commit 7995ebfad1. Signed-off-by: Martin Storsjö <martin@martin.st>
2017-01-09 00:04:19 +02:00 · 2017-01-09 00:04:19 +02:00 · 21c89f3a26
commit 21c89f3a26
parent 70317b25aa
3 changed files with 26 additions and 26 deletions
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  idct_coeffs
 .else
-        movrel          x4, iadst8_coeffs
+        movrel          x4,  iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
 .endif
        ld1             {v0.8h}, [x4]
@@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst


 function idct16x16_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

-        movi            v1.4h, #0
+        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

-        srshr           v2.8h, v2.8h, #6
+        srshr           v2.8h,  v2.8h,  #6

-        mov             x3, x0
-        mov             x4, #16
+        mov             x3,  x0
+        mov             x4,  #16
 1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4,  x4,  #2
@@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
 .endif
-        mov             x9, #32
+        mov             x9,  #32

 .ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #10
@@ -1046,10 +1046,10 @@ idct16_partial quarter
 idct16_partial half

 function idct32x32_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

-        movi            v1.4h, #0
+        movi            v1.4h,  #0

        ld1             {v2.h}[0], [x2]
        smull           v2.4s,  v2.4h,  v0.h[0]
@@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

-        srshr           v0.8h, v2.8h, #6
+        srshr           v0.8h,  v2.8h,  #6

-        mov             x3, x0
-        mov             x4, #32
+        mov             x3,  x0
+        mov             x4,  #32
 1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #2
@@ -1230,7 +1230,7 @@ endfunc
 // x9 = double input stride
 function idct32_1d_8x32_pass1\suffix\()_neon
        mov             x14, x30
-        movi            v2.8h, #0
+        movi            v2.8h,  #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
@@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .endif
        add             x2,  x2,  #64

-        movi            v2.8h, #0
+        movi            v2.8h,  #0
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
        movrel          r12, idct_coeffs
        vld1.16         {d0}, [r12,:64]

-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0

        vld1.16         {d16[]}, [r2,:16]
        vmull.s16       q8,  d16, d0[0]
@@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
        push            {lr}

        mov             r12, #32
-        vmov.s16        q2, #0
+        vmov.s16        q2,  #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        vld1.16         {d\i}, [r2,:64]
        vst1.16         {d4},  [r2,:64], r12
@@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
        movrel          r12, idct_coeffs
        vld1.16         {d0}, [r12,:64]

-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0

        vld1.16         {d16[]}, [r2,:16]
        vmull.s16       q8,  d16, d0[0]
@@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon

        @ Double stride of the input, since we only read every other line
        mov             r12, #128
-        vmov.s16        d4, #0
+        vmov.s16        d4,  #0

        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .ifb \suffix
@@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 .endif
        add             r2,  r2,  #64

-        vmov.s16        d8, #0
+        vmov.s16        d8,  #0
        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
 .endif
        vld1.32         {d12[]},  [r0,:32], r1
        vld1.32         {d12[1]}, [r0,:32], r1
-        vrshr.s16       q4, q4, #6
+        vrshr.s16       q4,  q4,  #6
        vld1.32         {d13[]},  [r0,:32], r1
-        vrshr.s16       q5, q5, #6
+        vrshr.s16       q5,  q5,  #6
        vld1.32         {d13[1]}, [r0,:32], r1
        sub             r0,  r0,  r1, lsl #2
        vaddw.u8        q4,  q4,  d12
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
 endfunc

 function vp9_loop_filter_h_16_neon
-        sub             r12,  r0,  #8
+        sub             r12, r0,  #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1