aarch64: h264qpel: Do vertical filtering without transposing

This gives rather big speedups on these functions:

Before:
put_h264_qpel_8_mc01_8_neon:     241.0   131.5   138.7
put_h264_qpel_8_mc02_8_neon:     214.7   121.2   127.5
put_h264_qpel_8_mc03_8_neon:     242.5   131.2   135.7
put_h264_qpel_8_mc11_8_neon:     421.2   218.7   251.0
put_h264_qpel_8_mc12_8_neon:     878.0   509.5   537.5
put_h264_qpel_8_mc13_8_neon:     423.7   217.0   252.0
put_h264_qpel_8_mc21_8_neon:     858.2   479.5   514.0
put_h264_qpel_8_mc22_8_neon:     649.7   385.2   403.0
put_h264_qpel_8_mc23_8_neon:     860.2   476.5   517.7
put_h264_qpel_8_mc31_8_neon:     437.2   219.5   252.5
put_h264_qpel_8_mc32_8_neon:     892.5   510.5   546.0
put_h264_qpel_8_mc33_8_neon:     438.2   218.5   257.0
put_h264_qpel_16_mc01_8_neon:    944.2   509.7   546.7
put_h264_qpel_16_mc02_8_neon:    878.7   469.5   509.7
put_h264_qpel_16_mc03_8_neon:    945.7   510.7   557.0
put_h264_qpel_16_mc11_8_neon:   1663.2   858.5   979.5
put_h264_qpel_16_mc12_8_neon:   3510.2  2027.7  2112.7
put_h264_qpel_16_mc13_8_neon:   1664.7   857.5   980.5
put_h264_qpel_16_mc21_8_neon:   3366.2  1928.5  2030.5
put_h264_qpel_16_mc22_8_neon:   2584.7  1514.7  1590.2
put_h264_qpel_16_mc23_8_neon:   3367.7  1927.7  2035.0
put_h264_qpel_16_mc31_8_neon:   1716.7   849.7   997.0
put_h264_qpel_16_mc32_8_neon:   3564.0  2044.2  3835.2
put_h264_qpel_16_mc33_8_neon:   1717.7   863.0   989.5

After:
put_h264_qpel_8_mc01_8_neon:     136.0    73.7    76.0
put_h264_qpel_8_mc02_8_neon:     108.7    65.0    64.0
put_h264_qpel_8_mc03_8_neon:     137.5    72.7    73.0
put_h264_qpel_8_mc11_8_neon:     316.2   159.0   188.5
put_h264_qpel_8_mc12_8_neon:     653.0   375.5   384.7
put_h264_qpel_8_mc13_8_neon:     318.7   165.5   189.5
put_h264_qpel_8_mc21_8_neon:     739.2   385.7   432.5
put_h264_qpel_8_mc22_8_neon:     530.7   295.5   309.5
put_h264_qpel_8_mc23_8_neon:     741.2   393.7   421.0
put_h264_qpel_8_mc31_8_neon:     332.2   162.5   190.0
put_h264_qpel_8_mc32_8_neon:     667.5   378.2   390.5
put_h264_qpel_8_mc33_8_neon:     332.7   166.5   195.5
put_h264_qpel_16_mc01_8_neon:    524.2   285.2   294.0
put_h264_qpel_16_mc02_8_neon:    454.7   252.2   250.2
put_h264_qpel_16_mc03_8_neon:    525.7   286.0   283.0
put_h264_qpel_16_mc11_8_neon:   1243.2   630.7   726.7
put_h264_qpel_16_mc12_8_neon:   2610.2  1479.7  1481.2
put_h264_qpel_16_mc13_8_neon:   1250.5   631.7   727.7
put_h264_qpel_16_mc21_8_neon:   2890.2  1571.2  1679.7
put_h264_qpel_16_mc22_8_neon:   2108.7  1177.5  1223.5
put_h264_qpel_16_mc23_8_neon:   2891.7  1578.7  1667.7
put_h264_qpel_16_mc31_8_neon:   1296.7   630.5   752.5
put_h264_qpel_16_mc32_8_neon:   2664.0  1483.2  1503.5
put_h264_qpel_16_mc33_8_neon:   1297.7   632.5   747.2

I.e. overall a 20%-60% reduction in runtime of these
functions.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2021-09-03 13:56:05 +03:00
parent 2d5a7f6d00
commit fd3bd5c492
1 changed files with 58 additions and 57 deletions

View File

@ -58,6 +58,24 @@
.endif
.endm
//trashes v0-v4
.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
uaddl v2.8H, \r2\().8B, \r3\().8B
uaddl v0.8H, \r3\().8B, \r4\().8B
uaddl v4.8H, \r1\().8B, \r4\().8B
uaddl v1.8H, \r2\().8B, \r5\().8B
uaddl \d0\().8H, \r0\().8B, \r5\().8B
uaddl \d1\().8H, \r1\().8B, \r6\().8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
.endif
.endm
//trashes v0-v5, v7, v30-v31
.macro lowpass_8H r0, r1
ext v0.16B, \r0\().16B, \r0\().16B, #2
@ -100,18 +118,13 @@
.endm
// trashed v0-v7
.macro lowpass_8.16 r0, r1, r2
ext v1.16B, \r0\().16B, \r1\().16B, #4
ext v0.16B, \r0\().16B, \r1\().16B, #6
saddl v5.4S, v1.4H, v0.4H
ext v2.16B, \r0\().16B, \r1\().16B, #2
saddl2 v1.4S, v1.8H, v0.8H
ext v3.16B, \r0\().16B, \r1\().16B, #8
saddl v6.4S, v2.4H, v3.4H
ext \r1\().16B, \r0\().16B, \r1\().16B, #10
saddl2 v2.4S, v2.8H, v3.8H
saddl v0.4S, \r0\().4H, \r1\().4H
saddl2 v4.4S, \r0\().8H, \r1\().8H
.macro lowpass_8.16 r0, r1, r2, r3, r4, r5
saddl v5.4S, \r2\().4H, \r3\().4H
saddl2 v1.4S, \r2\().8H, \r3\().8H
saddl v6.4S, \r1\().4H, \r4\().4H
saddl2 v2.4S, \r1\().8H, \r4\().8H
saddl v0.4S, \r0\().4H, \r5\().4H
saddl2 v4.4S, \r0\().8H, \r5\().8H
shl v3.4S, v5.4S, #4
shl v5.4S, v5.4S, #2
@ -134,7 +147,7 @@
rshrn v5.4H, v5.4S, #10
rshrn2 v5.8H, v1.4S, #10
sqxtun \r2\().8B, v5.8H
sqxtun \r0\().8B, v5.8H
.endm
function put_h264_qpel16_h_lowpass_neon_packed
@ -258,27 +271,23 @@ endfunc
function \type\()_h264_qpel8_v_lowpass_neon
ld1 {v16.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v28.8B}, [x1], x3
ld1 {v30.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v25.8B}, [x1]
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
lowpass_8 v16, v17, v18, v19, v16, v17
lowpass_8 v20, v21, v22, v23, v18, v19
lowpass_8 v24, v25, v26, v27, v20, v21
lowpass_8 v28, v29, v30, v31, v22, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg
ld1 {v24.8B}, [x0], x2
ld1 {v25.8B}, [x0], x2
@ -335,26 +344,23 @@ endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
ld1 {v16.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v28.8B}, [x1], x3
ld1 {v30.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v25.8B}, [x1]
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
lowpass_8 v16, v17, v18, v19, v16, v17
lowpass_8 v20, v21, v22, v23, v18, v19
lowpass_8 v24, v25, v26, v27, v20, v21
lowpass_8 v28, v29, v30, v31, v22, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
ld1 {v24.8B}, [x12], x2
ld1 {v25.8B}, [x12], x2
@ -432,22 +438,17 @@ function put_h264_qpel8_hv_lowpass_neon_top
lowpass_8H v26, v27
lowpass_8H v28, v29
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lowpass_8.16 v16, v17, v18, v19, v20, v21
lowpass_8.16 v17, v18, v19, v20, v21, v22
lowpass_8.16 v16, v24, v16
lowpass_8.16 v17, v25, v17
lowpass_8.16 v18, v19, v20, v21, v22, v23
lowpass_8.16 v19, v20, v21, v22, v23, v24
lowpass_8.16 v18, v26, v18
lowpass_8.16 v19, v27, v19
lowpass_8.16 v20, v21, v22, v23, v24, v25
lowpass_8.16 v21, v22, v23, v24, v25, v26
lowpass_8.16 v20, v28, v20
lowpass_8.16 v21, v29, v21
lowpass_8.16 v22, v30, v22
lowpass_8.16 v23, v31, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
lowpass_8.16 v22, v23, v24, v25, v26, v27
lowpass_8.16 v23, v24, v25, v26, v27, v28
ret
endfunc