diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
index 5e4d021a97..210182ff21 100644
--- a/libavcodec/aarch64/simple_idct_neon.S
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -58,7 +58,7 @@ endconst
 .endm

 .macro idct_end
-        br              x10
+        ret             x10
 .endm

 .macro smull1 a, b, c
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
index 68296d9c40..c5f43d36a3 100644
--- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -1040,7 +1040,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         store           \i, x0, #16
 .endr
-        br              x14
+        ret             x14
 1:
         // Special case: For the last input column (x1 == 12),
         // which would be stored as the last row in the temp buffer,
@@ -1068,7 +1068,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         mov             v29.16b, v17.16b
         mov             v30.16b, v18.16b
         mov             v31.16b, v19.16b
-        br              x14
+        ret             x14
 endfunc

 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -1098,7 +1098,7 @@ function \txfm\()16_1d_4x16_pass2_neon
         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

-        br              x14
+        ret             x14
 endfunc
 .endm

@@ -1208,7 +1208,7 @@ function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
 .endif
-        br              x15
+        ret             x15
 endfunc

 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
@@ -1264,7 +1264,7 @@ function idct16_1d_4x16_pass1_quarter_neon
         st1             {v23.4s}, [x0], #16
         st1             {v27.4s}, [x0], #16
         st1             {v31.4s}, [x0], #16
-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_4x16_pass2_quarter_neon
@@ -1286,7 +1286,7 @@ function idct16_1d_4x16_pass2_quarter_neon
         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_4x16_pass1_half_neon
@@ -1313,7 +1313,7 @@ function idct16_1d_4x16_pass1_half_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         store           \i, x0, #16
 .endr
-        br              x14
+        ret             x14
 1:
         // Special case: For the second input column (r1 == 4),
         // which would be stored as the second row in the temp buffer,
@@ -1341,7 +1341,7 @@ function idct16_1d_4x16_pass1_half_neon
         mov             v21.16b, v17.16b
         mov             v22.16b, v18.16b
         mov             v23.16b, v19.16b
-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_4x16_pass2_half_neon
@@ -1364,7 +1364,7 @@ function idct16_1d_4x16_pass2_half_neon
         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

-        br              x14
+        ret             x14
 endfunc

 .macro idct16_partial size
@@ -1390,7 +1390,7 @@ function idct16x16_\size\()_add_16_neon
         add             sp, sp, #1024
         ldp             d8, d9, [sp], 0x10

-        br              x15
+        ret             x15
 endfunc
 .endm

@@ -1729,7 +1729,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
         store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
         store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
 .purgem store_rev
-        br              x14
+        ret             x14
 endfunc

 // This is mostly the same as 4x32_pass1, but without the transpose,
@@ -1849,7 +1849,7 @@ function idct32_1d_4x32_pass2\suffix\()_neon
         load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
         load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
 .purgem load_acc_store
-        br              x14
+        ret             x14
 endfunc
 .endm

@@ -1943,7 +1943,7 @@ function vp9_idct_idct_32x32_add_16_neon
         ldp             d10, d11, [sp], 0x10
         ldp             d8, d9, [sp], 0x10

-        br              x15
+        ret             x15
 endfunc

 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
@@ -2009,7 +2009,7 @@ function idct32x32_\size\()_add_16_neon
         ldp             d10, d11, [sp], 0x10
         ldp             d8, d9, [sp], 0x10

-        br              x15
+        ret             x15
 endfunc
 .endm

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 99413b0f70..03272eae82 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -787,7 +787,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
         store           \i, x0, #16
 .endr
-        br              x14
+        ret             x14
 1:
         // Special case: For the last input column (x1 == 8),
         // which would be stored as the last row in the temp buffer,
@@ -806,7 +806,7 @@ function \txfm\()16_1d_8x16_pass1_neon
         mov             v29.16b, v21.16b
         mov             v30.16b, v22.16b
         mov             v31.16b, v23.16b
-        br              x14
+        ret             x14
 endfunc

 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
@@ -834,7 +834,7 @@ function \txfm\()16_1d_8x16_pass2_neon
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

-        br              x14
+        ret             x14
 endfunc
 .endm

@@ -925,7 +925,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
 .endif
-        br              x15
+        ret             x15
 endfunc
 .endm

@@ -960,7 +960,7 @@ function idct16_1d_8x16_pass1_quarter_neon
 .irp i, 24, 25, 26, 27
         store           \i, x0, x9
 .endr
-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_8x16_pass2_quarter_neon
@@ -978,7 +978,7 @@ function idct16_1d_8x16_pass2_quarter_neon
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_8x16_pass1_half_neon
@@ -1003,7 +1003,7 @@ function idct16_1d_8x16_pass1_half_neon
 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
         store           \i, x0, x9
 .endr
-        br              x14
+        ret             x14
 endfunc

 function idct16_1d_8x16_pass2_half_neon
@@ -1021,7 +1021,7 @@ function idct16_1d_8x16_pass2_half_neon
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

-        br              x14
+        ret             x14
 endfunc

 .macro idct16_partial size
@@ -1038,7 +1038,7 @@ function idct16x16_\size\()_add_neon
 .endr
         add             sp, sp, #512

-        br              x15
+        ret             x15
 endfunc
 .endm

@@ -1349,7 +1349,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
         store_rev       v25.8h, v17.8h
         store_rev       v24.8h, v16.8h
 .purgem store_rev
-        br              x14
+        ret             x14
 endfunc

 // This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1466,7 +1466,7 @@ function idct32_1d_8x32_pass2\suffix\()_neon
         load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
         load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
 .purgem load_acc_store
-        br              x14
+        ret             x14
 endfunc
 .endm

@@ -1547,7 +1547,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
         ldp             d8, d9, [sp], 0x10
         ldp             d10, d11, [sp], 0x10

-        br              x15
+        ret             x15
 endfunc

 .macro idct32_partial size
@@ -1572,7 +1572,7 @@ function idct32x32_\size\()_add_neon
         ldp             d8, d9, [sp], 0x10
         ldp             d10, d11, [sp], 0x10

-        br              x15
+        ret             x15
 endfunc
 .endm

diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index 9869614a29..a092617b92 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -57,7 +57,7 @@
         mov             x12, v4.d[1]
         adds            x11, x11, x12
         b.ne            1f
-        br              x10
+        ret             x10
 1:

 .if \wd >= 8
@@ -193,7 +193,7 @@
         b.eq            6f
 .else
         b.ne            1f
-        br              x13
+        ret             x13
 1:
 .endif

@@ -252,7 +252,7 @@
         b.ne            1f
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        br              x14
+        ret             x14
 1:

         mov             x11, v7.d[0]
@@ -260,7 +260,7 @@
         adds            x11, x11, x12
         b.ne            1f
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        br              x15
+        ret             x15
 1:

         // flat8out
@@ -434,7 +434,7 @@ function ff_\func\()_\bpp\()_neon, export=1
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x16
+        ret             x16
 .else
         b               \func\()_16_neon
 .endif
@@ -474,7 +474,7 @@ function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
 .endif
-        br              x16
+        ret             x16
 endfunc
 .endm

@@ -508,7 +508,7 @@ function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
         lsl             w3, w14, #\bpp - 8
         lsl             w4, w15, #\bpp - 8
         bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
-        br              x16
+        ret             x16
 endfunc
 .endm

@@ -541,7 +541,7 @@ function vp9_loop_filter_v_4_8_16_neon
         st1             {v25.8h}, [x0], x1
         sub             x0, x0, x1, lsl #1

-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_v_4_8
@@ -589,7 +589,7 @@ function vp9_loop_filter_h_4_8_16_neon
         sub             x0, x0, x1, lsl #3
         add             x0, x0, #4

-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_h_4_8
@@ -620,7 +620,7 @@ function vp9_loop_filter_v_8_8_16_neon
         sub             x0, x0, x1, lsl #1
         sub             x0, x0, x1

-        br              x10
+        ret             x10
 6:
         sub             x9, x0, x1, lsl #1
         st1             {v22.8h}, [x9], x1
@@ -628,7 +628,7 @@ function vp9_loop_filter_v_8_8_16_neon
         st1             {v23.8h}, [x9], x1
         st1             {v25.8h}, [x0], x1
         sub             x0, x0, x1, lsl #1
-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_v_8_8
@@ -671,7 +671,7 @@ function vp9_loop_filter_h_8_8_16_neon
         sub             x0, x0, x1, lsl #3
         add             x0, x0, #8

-        br              x10
+        ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
         // as in loop_filter_h_4_8.
@@ -688,7 +688,7 @@ function vp9_loop_filter_h_8_8_16_neon
         st1             {v25.d}[1], [x0], x1
         sub             x0, x0, x1, lsl #3
         add             x0, x0, #4
-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_h_8_8
@@ -743,7 +743,7 @@ function vp9_loop_filter_v_16_8_16_neon
         sub             x0, x0, x1, lsl #3
         add             x0, x0, x1

-        br              x10
+        ret             x10
 8:
         add             x9, x9, x1, lsl #2
         // If we didn't do the flat8out part, the output is left in the
@@ -756,7 +756,7 @@ function vp9_loop_filter_v_16_8_16_neon
         st1             {v26.8h}, [x0], x1
         sub             x0, x0, x1, lsl #1
         sub             x0, x0, x1
-        br              x10
+        ret             x10
 7:
         sub             x9, x0, x1, lsl #1
         st1             {v22.8h}, [x9], x1
@@ -764,7 +764,7 @@ function vp9_loop_filter_v_16_8_16_neon
         st1             {v23.8h}, [x9], x1
         st1             {v25.8h}, [x0], x1
         sub             x0, x0, x1, lsl #1
-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_v_16_8, push=1
@@ -821,7 +821,7 @@ function vp9_loop_filter_h_16_8_16_neon
         st1             {v31.8h}, [x0], x1
         sub             x0, x0, x1, lsl #3

-        br              x10
+        ret             x10
 8:
         // The same writeback as in loop_filter_h_8_8
         sub             x9, x0, #8
@@ -838,7 +838,7 @@ function vp9_loop_filter_h_16_8_16_neon
         st1             {v27.8h}, [x0], x1
         sub             x0, x0, x1, lsl #3
         add             x0, x0, #8
-        br              x10
+        ret             x10
 7:
         // The same writeback as in loop_filter_h_4_8
         sub             x9, x0, #4
@@ -854,7 +854,7 @@ function vp9_loop_filter_h_16_8_16_neon
         st1             {v25.d}[1], [x0], x1
         sub             x0, x0, x1, lsl #3
         add             x0, x0, #4
-        br              x10
+        ret             x10
 endfunc

 bpp_frontends vp9_loop_filter_h_16_8, push=1
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 0878763020..694ff8956f 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -399,7 +399,7 @@
 .endif
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        br              x14
+        ret             x14
 1:

         mov             x5, v7.d[0]
@@ -411,7 +411,7 @@
         cbnz            x5, 1f
 .endif
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        br              x15
+        ret             x15
 1:

         // flat8out
@@ -532,32 +532,32 @@ function vp9_loop_filter_4
         loop_filter     4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        br              x10
+        ret             x10
 endfunc

 function vp9_loop_filter_4_16b_mix_44
         loop_filter     4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        br              x10
+        ret             x10
 endfunc

 function vp9_loop_filter_8
         loop_filter     8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        br              x13
+        ret             x13
 9:
-        br              x10
+        ret             x10
 endfunc

 function vp9_loop_filter_8_16b_mix
         loop_filter     8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        br              x13
+        ret             x13
 9:
-        br              x10
+        ret             x10
 endfunc

 function vp9_loop_filter_16
@@ -568,7 +568,7 @@ function vp9_loop_filter_16
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 endfunc

 function vp9_loop_filter_16_16b
@@ -579,7 +579,7 @@ function vp9_loop_filter_16_16b
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 endfunc

 .macro loop_filter_4
@@ -648,7 +648,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1
         st1             {v23.8b}, [x9], x1
         st1             {v25.8b}, [x0], x1

-        br              x10
+        ret             x10
 endfunc

 function ff_vp9_loop_filter_v_44_16_neon, export=1
@@ -672,7 +672,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1
         st1             {v23.16b}, [x9], x1
         st1             {v25.16b}, [x0], x1

-        br              x10
+        ret             x10
 endfunc

 function ff_vp9_loop_filter_h_4_8_neon, export=1
@@ -714,7 +714,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1
         st1             {v25.s}[0], [x9], x1
         st1             {v25.s}[1], [x0], x1

-        br              x10
+        ret             x10
 endfunc

 function ff_vp9_loop_filter_h_44_16_neon, export=1
@@ -766,7 +766,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1
         st1             {v25.s}[1], [x9], x1
         st1             {v25.s}[3], [x0], x1

-        br              x10
+        ret             x10
 endfunc

 function ff_vp9_loop_filter_v_8_8_neon, export=1
@@ -793,14 +793,14 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1
         st1             {v23.8b}, [x9], x1
         st1             {v26.8b}, [x0], x1

-        br              x10
+        ret             x10
 6:
         sub             x9, x0, x1, lsl #1
         st1             {v22.8b}, [x9], x1
         st1             {v24.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v25.8b}, [x0], x1
-        br              x10
+        ret             x10
 endfunc

 .macro mix_v_16 mix
@@ -828,14 +828,14 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
         st1             {v23.16b}, [x9], x1
         st1             {v26.16b}, [x0], x1

-        br              x10
+        ret             x10
 6:
         sub             x9, x0, x1, lsl #1
         st1             {v22.16b}, [x9], x1
         st1             {v24.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
         st1             {v25.16b}, [x0], x1
-        br              x10
+        ret             x10
 endfunc
 .endm

@@ -876,7 +876,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
         st1             {v23.8b}, [x9], x1
         st1             {v27.8b}, [x0], x1

-        br              x10
+        ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
         // as in loop_filter_h_4_8.
@@ -891,7 +891,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
         st1             {v24.s}[1], [x0], x1
         st1             {v25.s}[0], [x9], x1
         st1             {v25.s}[1], [x0], x1
-        br              x10
+        ret             x10
 endfunc

 .macro mix_h_16 mix
@@ -942,7 +942,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
         st1             {v27.8b}, [x9], x1
         st1             {v27.d}[1], [x0], x1

-        br              x10
+        ret             x10
 6:
         add             x9, x9, #2
         add             x0, x0, #2
@@ -963,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
         st1             {v24.s}[3], [x0], x1
         st1             {v25.s}[1], [x9], x1
         st1             {v25.s}[3], [x0], x1
-        br              x10
+        ret             x10
 endfunc
 .endm

@@ -1022,7 +1022,7 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 8:
         add             x9, x9, x1, lsl #2
         // If we didn't do the flat8out part, the output is left in the
@@ -1091,7 +1091,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 8:
         add             x9, x9, x1, lsl #2
         st1             {v21.16b}, [x9], x1
@@ -1168,7 +1168,7 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 8:
         // The same writeback as in loop_filter_h_8_8
         sub             x9, x0, #4
@@ -1287,7 +1287,7 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
-        br              x10
+        ret             x10
 8:
         sub             x9, x0, #4
         add             x0, x9, x1, lsl #3
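
Note (annotation, not part of the patch): every hunk above makes the same substitution. On AArch64, "ret xN" performs the same indirect branch as "br xN", but additionally hints to the CPU that the branch is a function return, keeping the return-address predictor in sync; a plain "br" used as a return tends to mispredict. These internal helpers return through x10/x13/x14/x15/x16 because their callers save x30 into a spare register before further bl calls clobber it. A minimal sketch of that pattern (the labels here are hypothetical, for illustration only):

function example_pass_neon
        mov             x14, x30        // save return address; the bl below
                                        // clobbers x30 (the link register)
        bl              example_helper_neon
        ret             x14             // same branch as "br x14", but
                                        // predicted as a function return
endfunc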