swscale/rgb2rgb2: rework RISC-V V shuffle_bytes_{0321,2103}

This avoids strided loads.

Before:
shuffle_bytes_0321_rvv_i32: 307.7
shuffle_bytes_2103_rvv_i32: 308.7

After:
shuffle_bytes_0321_rvv_i32: 59.7
shuffle_bytes_2103_rvv_i32: 61.5
This commit is contained in:
Rémi Denis-Courmont 2023-07-18 21:39:59 +03:00
parent d3948e4db5
commit 15982554e6
1 changed file with 22 additions and 27 deletions

View File

@@ -21,36 +21,31 @@
#include "libavutil/riscv/asm.S"
// ff_shuffle_bytes_0321_rvv
//
// NOTE(review): this listing looks like a rendered diff hunk ("-21,36 +21,31",
// "22 additions and 27 deletions") whose +/- markers were lost, so removed and
// added lines are interleaved. Presumably everything from "addi t1, a0, 3"
// through the first "ret" is the OLD strided-load body, and the trailing
// "li" / "j 1f" pair is the NEW body -- confirm against the repository before
// assembling this text as-is.
//
// Registers as used below: a0 = load address, a1 = store address,
// a2 = count, shifted right by 2 before use (presumably a byte count being
// converted to a number of 4-byte groups -- TODO confirm with the C prototype).
//
// NOTE(review): sh2add belongs to the Zba extension, while the func line only
// names zve32x -- check that the func macro / build flags enable Zba.
func ff_shuffle_bytes_0321_rvv, zve32x
// --- presumably old body: four strided byte loads + one segment store ---
// t1/t2/t3 = a0 + 3 / + 2 / + 1: start addresses of the byte streams that are
// loaded into v9 / v10 / v11 below (v8 is loaded from a0 + 0).
addi t1, a0, 3
addi t2, a0, 2
addi t3, a0, 1
// Local label 1: is also the target of the "j 1b" in the next function of
// this listing, which arrives with different a0/t1/t2/t3 offsets.
1:
// a2 >>= 2 (arithmetic shift), i.e. divide the count by 4.
srai a2, a2, 2
// t4 = 4: byte stride passed to the vlse8.v loads.
li t4, 4
2:
// t0 = number of 8-bit elements handled this iteration (LMUL = 1,
// tail-agnostic, mask-agnostic); a2 = elements still to do.
vsetvli t0, a2, e8, m1, ta, ma
sub a2, a2, t0
// Each vlse8.v gathers every 4th byte starting at its base address; each
// sh2add then advances that base by 4 * t0 bytes.
vlse8.v v8, (a0), t4
sh2add a0, t0, a0
vlse8.v v9, (t1), t4
sh2add t1, t0, t1
vlse8.v v10, (t2), t4
sh2add t2, t0, t2
vlse8.v v11, (t3), t4
sh2add t3, t0, t3
// 4-field segment store: element i of v8, v9, v10, v11 is written to four
// consecutive bytes at a1 + 4*i. With the offsets set up above, each output
// group holds input bytes 0, 3, 2, 1.
vsseg4e8.v v8, (a1)
sh2add a1, t0, a1
// Loop until no elements remain.
bnez a2, 2b
ret
// --- presumably new body ---
// t1 = 0x00ff00ff: mask used after the jump, where it selects the bits of
// each 32-bit element that stay in place (bits 0-7 and 16-23).
li t1, 0x00ff00ff
// Forward jump to the next "1:" label, which in this listing sits inside
// ff_shuffle_bytes_2103_rvv (shared loop).
j 1f
endfunc
// ff_shuffle_bytes_2103_rvv
//
// NOTE(review): this listing looks like a rendered diff hunk ("-21,36 +21,31",
// "22 additions and 27 deletions") whose +/- markers were lost, so removed and
// added lines are interleaved. Presumably the four "addi" lines and "j 1b"
// are the OLD body, and everything from "li t1, ~0x00ff00ff" to "ret" is the
// NEW body -- confirm against the repository before assembling this text
// as-is.
//
// Registers as used below: a0 = load address, a1 = store address,
// a2 = count, shifted right by 2 before use (presumably a byte count being
// converted to a number of 32-bit elements -- TODO confirm with the C
// prototype).
//
// NOTE(review): sh2add belongs to the Zba extension, while the func line only
// names zve32x -- check that the func macro / build flags enable Zba.
func ff_shuffle_bytes_2103_rvv, zve32x
// --- presumably old body: set up per-byte stream addresses, then reuse the
// strided-load loop at the earlier "1:" label in the preceding function ---
// t1 = a0 + 1, t2 = a0 + 0, t3 = a0 + 3 are computed first; a0 itself is
// advanced to a0 + 2 last because the other three derive from its old value.
addi t1, a0, 1
addi t2, a0, 0
addi t3, a0, 3
addi a0, a0, 2
j 1b
// --- presumably new body: whole-word load, mask, shift, recombine ---
// t1 = ~0x00ff00ff: bits of each 32-bit element that stay in place
// (bits 8-15 and 24-31 once truncated to the 32-bit element width).
li t1, ~0x00ff00ff
// Shared entry point: the "j 1f" at the end of ff_shuffle_bytes_0321_rvv lands
// here with t1 = 0x00ff00ff instead.
1:
// t2 = ~t1: the complementary bits, which get exchanged across the two
// 16-bit halves of each element.
not t2, t1
// a2 >>= 2 (arithmetic shift), i.e. divide the count by 4.
srai a2, a2, 2
2:
// t0 = number of 32-bit elements handled this iteration; LMUL = 8, so v8,
// v16 and v24 each name a group of eight vector registers.
vsetvli t0, a2, e32, m8, ta, ma
vle32.v v8, (a0)
// a2 = elements still to do.
sub a2, a2, t0
// v16 = bits to be moved (selected by t2).
vand.vx v16, v8, t2
// Advance the load address by 4 * t0 bytes.
sh2add a0, t0, a0
// v8 = bits that keep their position (selected by t1).
vand.vx v8, v8, t1
// Exchange the selected bits between the halves: v24 gets the upper half's
// bits moved down by 16, v16 gets the lower half's bits moved up by 16.
vsrl.vi v24, v16, 16
vsll.vi v16, v16, 16
// Merge the kept bits with both shifted copies.
vor.vv v8, v8, v24
vor.vv v8, v16, v8
vse32.v v8, (a1)
// Advance the store address by 4 * t0 bytes.
sh2add a1, t0, a1
// Loop until no elements remain.
bnez a2, 2b
ret
endfunc
func ff_shuffle_bytes_1230_rvv, zve32x