aarch64: Use ret x<n> instead of br x<n> where possible

Change AArch64 assembly code to use:
  ret     x<n>
instead of:
  br      x<n>

"ret x<n>" is already used in a lot of places so this patch makes it
consistent across the code base. This does not change behavior or
performance.
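
For reference, these files keep a return address in a scratch register
(x10, x13-x16 below) because nested calls clobber the link register x30;
the patch only changes how that saved address is branched to. A minimal
sketch of the pattern, with hypothetical function names (the register
choice mirrors the code below):

  function outer_16x16_neon, export=1
          mov     x15, x30          // save return address; the bl calls below clobber x30
          bl      pass1_helper_neon
          bl      pass2_helper_neon
          ret     x15               // was "br x15": same destination, but ret carries
                                    // a function-return hint for the branch predictor
  endfunc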

In addition, this change reduces the number of landing pads needed in
a subsequent patch to support the Armv8.5-A Branch Target
Identification (BTI) security feature.
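
Roughly why this matters for BTI (an illustrative sketch, not code from the
patch; the label, helper name and use of x14 are hypothetical): on a
BTI-guarded page, the target of an indirect branch such as "br x<n>" must
begin with a landing pad, whereas "ret" is not one of the BTI-checked
branch types, so its target needs none.

          adr     x14, 1f
          b       some_helper       // helper eventually jumps back through x14
  1:      bti     j                 // landing pad required if the helper ends in "br x14"

If the helper ends in "ret x14" instead, the code at label 1 needs no
"bti j" and one landing pad disappears.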

Signed-off-by: Jonathan Wright <jonathan.wright@arm.com>
Signed-off-by: Martin Storsjö <martin@martin.st>

Author:     Jonathan Wright <jonathan.wright@arm.com>
Date:       2020-09-28 13:35:51 +01:00
Committer:  Martin Storsjö
Commit:     6f04cf54f5 (parent 20c66fe2f9)
5 changed files with 73 additions and 73 deletions

==== File 1 of 5 ====

@@ -58,7 +58,7 @@ endconst
.endm
.macro idct_end
-        br              x10
+        ret             x10
.endm
.macro smull1 a, b, c

==== File 2 of 5 ====

@@ -1040,7 +1040,7 @@ function \txfm\()16_1d_4x16_pass1_neon
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
store \i, x0, #16
.endr
-        br              x14
+        ret             x14
1:
// Special case: For the last input column (x1 == 12),
// which would be stored as the last row in the temp buffer,
@@ -1068,7 +1068,7 @@ function \txfm\()16_1d_4x16_pass1_neon
mov v29.16b, v17.16b
mov v30.16b, v18.16b
mov v31.16b, v19.16b
-        br              x14
+        ret             x14
endfunc
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -1098,7 +1098,7 @@ function \txfm\()16_1d_4x16_pass2_neon
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
-        br              x14
+        ret             x14
endfunc
.endm
@@ -1208,7 +1208,7 @@ function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
-        br              x15
+        ret             x15
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
@@ -1264,7 +1264,7 @@ function idct16_1d_4x16_pass1_quarter_neon
st1 {v23.4s}, [x0], #16
st1 {v27.4s}, [x0], #16
st1 {v31.4s}, [x0], #16
-        br              x14
+        ret             x14
endfunc
function idct16_1d_4x16_pass2_quarter_neon
@@ -1286,7 +1286,7 @@ function idct16_1d_4x16_pass2_quarter_neon
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
-        br              x14
+        ret             x14
endfunc
function idct16_1d_4x16_pass1_half_neon
@@ -1313,7 +1313,7 @@ function idct16_1d_4x16_pass1_half_neon
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
store \i, x0, #16
.endr
-        br              x14
+        ret             x14
1:
// Special case: For the second input column (r1 == 4),
// which would be stored as the second row in the temp buffer,
@@ -1341,7 +1341,7 @@ function idct16_1d_4x16_pass1_half_neon
mov v21.16b, v17.16b
mov v22.16b, v18.16b
mov v23.16b, v19.16b
-        br              x14
+        ret             x14
endfunc
function idct16_1d_4x16_pass2_half_neon
@@ -1364,7 +1364,7 @@ function idct16_1d_4x16_pass2_half_neon
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
-        br              x14
+        ret             x14
endfunc
.macro idct16_partial size
@@ -1390,7 +1390,7 @@ function idct16x16_\size\()_add_16_neon
add sp, sp, #1024
ldp d8, d9, [sp], 0x10
-        br              x15
+        ret             x15
endfunc
.endm
@@ -1729,7 +1729,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
-        br              x14
+        ret             x14
endfunc
// This is mostly the same as 4x32_pass1, but without the transpose,
@@ -1849,7 +1849,7 @@ function idct32_1d_4x32_pass2\suffix\()_neon
load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
-        br              x14
+        ret             x14
endfunc
.endm
@@ -1943,7 +1943,7 @@ function vp9_idct_idct_32x32_add_16_neon
ldp d10, d11, [sp], 0x10
ldp d8, d9, [sp], 0x10
-        br              x15
+        ret             x15
endfunc
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
@@ -2009,7 +2009,7 @@ function idct32x32_\size\()_add_16_neon
ldp d10, d11, [sp], 0x10
ldp d8, d9, [sp], 0x10
-        br              x15
+        ret             x15
endfunc
.endm

==== File 3 of 5 ====

@@ -787,7 +787,7 @@ function \txfm\()16_1d_8x16_pass1_neon
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
store \i, x0, #16
.endr
-        br              x14
+        ret             x14
1:
// Special case: For the last input column (x1 == 8),
// which would be stored as the last row in the temp buffer,
@@ -806,7 +806,7 @@ function \txfm\()16_1d_8x16_pass1_neon
mov v29.16b, v21.16b
mov v30.16b, v22.16b
mov v31.16b, v23.16b
-        br              x14
+        ret             x14
endfunc
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
@@ -834,7 +834,7 @@ function \txfm\()16_1d_8x16_pass2_neon
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
-        br              x14
+        ret             x14
endfunc
.endm
@@ -925,7 +925,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
-        br              x15
+        ret             x15
endfunc
.endm
@@ -960,7 +960,7 @@ function idct16_1d_8x16_pass1_quarter_neon
.irp i, 24, 25, 26, 27
store \i, x0, x9
.endr
-        br              x14
+        ret             x14
endfunc
function idct16_1d_8x16_pass2_quarter_neon
@@ -978,7 +978,7 @@ function idct16_1d_8x16_pass2_quarter_neon
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
-        br              x14
+        ret             x14
endfunc
function idct16_1d_8x16_pass1_half_neon
@@ -1003,7 +1003,7 @@ function idct16_1d_8x16_pass1_half_neon
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
store \i, x0, x9
.endr
-        br              x14
+        ret             x14
endfunc
function idct16_1d_8x16_pass2_half_neon
@@ -1021,7 +1021,7 @@ function idct16_1d_8x16_pass2_half_neon
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
-        br              x14
+        ret             x14
endfunc
.macro idct16_partial size
@@ -1038,7 +1038,7 @@ function idct16x16_\size\()_add_neon
.endr
add sp, sp, #512
-        br              x15
+        ret             x15
endfunc
.endm
@@ -1349,7 +1349,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
store_rev v25.8h, v17.8h
store_rev v24.8h, v16.8h
.purgem store_rev
-        br              x14
+        ret             x14
endfunc
// This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1466,7 +1466,7 @@ function idct32_1d_8x32_pass2\suffix\()_neon
load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
-        br              x14
+        ret             x14
endfunc
.endm
@@ -1547,7 +1547,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
-        br              x15
+        ret             x15
endfunc
.macro idct32_partial size
@@ -1572,7 +1572,7 @@ function idct32x32_\size\()_add_neon
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
-        br              x15
+        ret             x15
endfunc
.endm

==== File 4 of 5 ====

@@ -57,7 +57,7 @@
mov x12, v4.d[1]
adds x11, x11, x12
b.ne 1f
-        br              x10
+        ret             x10
1:
.if \wd >= 8
@@ -193,7 +193,7 @@
b.eq 6f
.else
b.ne 1f
-        br              x13
+        ret             x13
1:
.endif
@@ -252,7 +252,7 @@
b.ne 1f
// If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels
-        br              x14
+        ret             x14
1:
mov x11, v7.d[0]
@@ -260,7 +260,7 @@
adds x11, x11, x12
b.ne 1f
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        br              x15
+        ret             x15
1:
// flat8out
@@ -434,7 +434,7 @@ function ff_\func\()_\bpp\()_neon, export=1
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x16
+        ret             x16
.else
b \func\()_16_neon
.endif
@@ -474,7 +474,7 @@ function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
-        br              x16
+        ret             x16
endfunc
.endm
@@ -508,7 +508,7 @@ function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
lsl w3, w14, #\bpp - 8
lsl w4, w15, #\bpp - 8
bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
-        br              x16
+        ret             x16
endfunc
.endm
@@ -541,7 +541,7 @@ function vp9_loop_filter_v_4_8_16_neon
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_v_4_8
@@ -589,7 +589,7 @@ function vp9_loop_filter_h_4_8_16_neon
sub x0, x0, x1, lsl #3
add x0, x0, #4
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_h_4_8
@@ -620,7 +620,7 @@ function vp9_loop_filter_v_8_8_16_neon
sub x0, x0, x1, lsl #1
sub x0, x0, x1
-        br              x10
+        ret             x10
6:
sub x9, x0, x1, lsl #1
st1 {v22.8h}, [x9], x1
@@ -628,7 +628,7 @@ function vp9_loop_filter_v_8_8_16_neon
st1 {v23.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_v_8_8
@@ -671,7 +671,7 @@ function vp9_loop_filter_h_8_8_16_neon
sub x0, x0, x1, lsl #3
add x0, x0, #8
-        br              x10
+        ret             x10
6:
// If we didn't need to do the flat8in part, we use the same writeback
// as in loop_filter_h_4_8.
@@ -688,7 +688,7 @@ function vp9_loop_filter_h_8_8_16_neon
st1 {v25.d}[1], [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #4
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_h_8_8
@@ -743,7 +743,7 @@ function vp9_loop_filter_v_16_8_16_neon
sub x0, x0, x1, lsl #3
add x0, x0, x1
-        br              x10
+        ret             x10
8:
add x9, x9, x1, lsl #2
// If we didn't do the flat8out part, the output is left in the
@@ -756,7 +756,7 @@ function vp9_loop_filter_v_16_8_16_neon
st1 {v26.8h}, [x0], x1
sub x0, x0, x1, lsl #1
sub x0, x0, x1
-        br              x10
+        ret             x10
7:
sub x9, x0, x1, lsl #1
st1 {v22.8h}, [x9], x1
@@ -764,7 +764,7 @@ function vp9_loop_filter_v_16_8_16_neon
st1 {v23.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_v_16_8, push=1
@@ -821,7 +821,7 @@ function vp9_loop_filter_h_16_8_16_neon
st1 {v31.8h}, [x0], x1
sub x0, x0, x1, lsl #3
-        br              x10
+        ret             x10
8:
// The same writeback as in loop_filter_h_8_8
sub x9, x0, #8
@@ -838,7 +838,7 @@ function vp9_loop_filter_h_16_8_16_neon
st1 {v27.8h}, [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #8
-        br              x10
+        ret             x10
7:
// The same writeback as in loop_filter_h_4_8
sub x9, x0, #4
@@ -854,7 +854,7 @@ function vp9_loop_filter_h_16_8_16_neon
st1 {v25.d}[1], [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #4
-        br              x10
+        ret             x10
endfunc
bpp_frontends vp9_loop_filter_h_16_8, push=1

==== File 5 of 5 ====

@@ -399,7 +399,7 @@
.endif
// If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels
-        br              x14
+        ret             x14
1:
mov x5, v7.d[0]
@@ -411,7 +411,7 @@
cbnz x5, 1f
.endif
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        br              x15
+        ret             x15
1:
// flat8out
@@ -532,32 +532,32 @@ function vp9_loop_filter_4
loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
ret
9:
-        br              x10
+        ret             x10
endfunc
function vp9_loop_filter_4_16b_mix_44
loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31
ret
9:
-        br              x10
+        ret             x10
endfunc
function vp9_loop_filter_8
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
ret
6:
-        br              x13
+        ret             x13
9:
-        br              x10
+        ret             x10
endfunc
function vp9_loop_filter_8_16b_mix
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
ret
6:
-        br              x13
+        ret             x13
9:
-        br              x10
+        ret             x10
endfunc
function vp9_loop_filter_16
@@ -568,7 +568,7 @@ function vp9_loop_filter_16
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
endfunc
function vp9_loop_filter_16_16b
@@ -579,7 +579,7 @@ function vp9_loop_filter_16_16b
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
endfunc
.macro loop_filter_4
@@ -648,7 +648,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1
st1 {v23.8b}, [x9], x1
st1 {v25.8b}, [x0], x1
-        br              x10
+        ret             x10
endfunc
function ff_vp9_loop_filter_v_44_16_neon, export=1
@@ -672,7 +672,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1
st1 {v23.16b}, [x9], x1
st1 {v25.16b}, [x0], x1
-        br              x10
+        ret             x10
endfunc
function ff_vp9_loop_filter_h_4_8_neon, export=1
@@ -714,7 +714,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1
st1 {v25.s}[0], [x9], x1
st1 {v25.s}[1], [x0], x1
-        br              x10
+        ret             x10
endfunc
function ff_vp9_loop_filter_h_44_16_neon, export=1
@@ -766,7 +766,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1
st1 {v25.s}[1], [x9], x1
st1 {v25.s}[3], [x0], x1
-        br              x10
+        ret             x10
endfunc
function ff_vp9_loop_filter_v_8_8_neon, export=1
@@ -793,14 +793,14 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1
st1 {v23.8b}, [x9], x1
st1 {v26.8b}, [x0], x1
-        br              x10
+        ret             x10
6:
sub x9, x0, x1, lsl #1
st1 {v22.8b}, [x9], x1
st1 {v24.8b}, [x0], x1
st1 {v23.8b}, [x9], x1
st1 {v25.8b}, [x0], x1
-        br              x10
+        ret             x10
endfunc
.macro mix_v_16 mix
@@ -828,14 +828,14 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
st1 {v23.16b}, [x9], x1
st1 {v26.16b}, [x0], x1
-        br              x10
+        ret             x10
6:
sub x9, x0, x1, lsl #1
st1 {v22.16b}, [x9], x1
st1 {v24.16b}, [x0], x1
st1 {v23.16b}, [x9], x1
st1 {v25.16b}, [x0], x1
-        br              x10
+        ret             x10
endfunc
.endm
@@ -876,7 +876,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
st1 {v23.8b}, [x9], x1
st1 {v27.8b}, [x0], x1
-        br              x10
+        ret             x10
6:
// If we didn't need to do the flat8in part, we use the same writeback
// as in loop_filter_h_4_8.
@@ -891,7 +891,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
st1 {v24.s}[1], [x0], x1
st1 {v25.s}[0], [x9], x1
st1 {v25.s}[1], [x0], x1
-        br              x10
+        ret             x10
endfunc
.macro mix_h_16 mix
@@ -942,7 +942,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
st1 {v27.8b}, [x9], x1
st1 {v27.d}[1], [x0], x1
-        br              x10
+        ret             x10
6:
add x9, x9, #2
add x0, x0, #2
@@ -963,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x9], x1
st1 {v25.s}[3], [x0], x1
-        br              x10
+        ret             x10
endfunc
.endm
@@ -1022,7 +1022,7 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
8:
add x9, x9, x1, lsl #2
// If we didn't do the flat8out part, the output is left in the
@@ -1091,7 +1091,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
8:
add x9, x9, x1, lsl #2
st1 {v21.16b}, [x9], x1
@@ -1168,7 +1168,7 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
8:
// The same writeback as in loop_filter_h_8_8
sub x9, x0, #4
@@ -1287,7 +1287,7 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
8:
sub x9, x0, #4
add x0, x9, x1, lsl #3