swscale/ppc: VSX-optimize yuv2422_1

./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
            -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
            -cpuflags 0 -v error -

15.3x speedup (for each format, the first line is before this change, the second is after):

yuyv422
  14513 UNITS in yuv2packed1,   32768 runs,      0 skips
    949 UNITS in yuv2packed1,   32767 runs,      1 skips
yvyu422
  14516 UNITS in yuv2packed1,   32767 runs,      1 skips
    943 UNITS in yuv2packed1,   32767 runs,      1 skips
uyvy422
  14530 UNITS in yuv2packed1,   32767 runs,      1 skips
    941 UNITS in yuv2packed1,   32766 runs,      2 skips
This commit is contained in:
Lauri Kasanen 2019-03-24 13:45:55 +02:00
parent 4e8cbbf70e
commit a6a31ca3d9
1 changed file with 149 additions and 0 deletions

View File

@ -664,6 +664,143 @@ YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
/**
 * Pack 16 luma and 8+8 chroma samples to unsigned bytes and store them as
 * 32 bytes of interleaved 4:2:2 output.
 *
 * @param vy1    first 8 luma samples (16-bit, already scaled to 8-bit range)
 * @param vy2    next 8 luma samples
 * @param vu     8 U samples
 * @param vv     8 V samples
 * @param dest   destination; two vec_st stores are issued at offsets 0 and 16
 * @param target byte order to produce (YUYV, YVYU or UYVY); any other value
 *               results in no store at all
 */
static av_always_inline void
write422(const vector int16_t vy1, const vector int16_t vy2,
         const vector int16_t vu, const vector int16_t vv,
         uint8_t *dest, const enum AVPixelFormat target)
{
    /*
     * vec_perm selectors. Indices 0x00-0x0f pick bytes from the packed luma
     * vector, 0x10-0x17 pick U bytes and 0x18-0x1f pick V bytes from the
     * packed chroma vector. The *_lo tables build output bytes 0-15, the
     * *_hi tables build output bytes 16-31.
     */
    const vector uint8_t yuyv_lo = (vector uint8_t) {
        0x0, 0x10, 0x1, 0x18,
        0x2, 0x11, 0x3, 0x19,
        0x4, 0x12, 0x5, 0x1a,
        0x6, 0x13, 0x7, 0x1b };
    const vector uint8_t yuyv_hi = (vector uint8_t) {
        0x8, 0x14, 0x9, 0x1c,
        0xa, 0x15, 0xb, 0x1d,
        0xc, 0x16, 0xd, 0x1e,
        0xe, 0x17, 0xf, 0x1f };
    const vector uint8_t yvyu_lo = (vector uint8_t) {
        0x0, 0x18, 0x1, 0x10,
        0x2, 0x19, 0x3, 0x11,
        0x4, 0x1a, 0x5, 0x12,
        0x6, 0x1b, 0x7, 0x13 };
    const vector uint8_t yvyu_hi = (vector uint8_t) {
        0x8, 0x1c, 0x9, 0x14,
        0xa, 0x1d, 0xb, 0x15,
        0xc, 0x1e, 0xd, 0x16,
        0xe, 0x1f, 0xf, 0x17 };
    const vector uint8_t uyvy_lo = (vector uint8_t) {
        0x10, 0x0, 0x18, 0x1,
        0x11, 0x2, 0x19, 0x3,
        0x12, 0x4, 0x1a, 0x5,
        0x13, 0x6, 0x1b, 0x7 };
    const vector uint8_t uyvy_hi = (vector uint8_t) {
        0x14, 0x8, 0x1c, 0x9,
        0x15, 0xa, 0x1d, 0xb,
        0x16, 0xc, 0x1e, 0xd,
        0x17, 0xe, 0x1f, 0xf };

    /* Saturating narrow: 16 luma bytes, then 8 U bytes followed by 8 V bytes. */
    const vector uint8_t luma   = vec_packsu(vy1, vy2);
    const vector uint8_t chroma = vec_packsu(vu, vv);
    vector uint8_t sel_lo, sel_hi, out;

    switch (target) {
    case AV_PIX_FMT_YUYV422:
        sel_lo = yuyv_lo;
        sel_hi = yuyv_hi;
        break;
    case AV_PIX_FMT_YVYU422:
        sel_lo = yvyu_lo;
        sel_hi = yvyu_hi;
        break;
    case AV_PIX_FMT_UYVY422:
        sel_lo = uyvy_lo;
        sel_hi = uyvy_hi;
        break;
    default:
        /* Not a packed 4:2:2 layout handled here: store nothing. */
        return;
    }

    out = vec_perm(luma, chroma, sel_lo);
    vec_st(out, 0, dest);
    out = vec_perm(luma, chroma, sel_hi);
    vec_st(out, 16, dest);
}
/**
 * Vectorised single-luma-line packed 4:2:2 writer.
 *
 * Converts one line of 16-bit intermediate luma and one or two lines of
 * 16-bit intermediate chroma to 8-bit samples and passes them to write422().
 * Per sample the result is the 0..255 clamp of
 *   luma:                      (y + 64) >> 7
 *   chroma, uvalpha <  2048:   (u0 + 64) >> 7
 *   chroma, uvalpha >= 2048:   (u0 + u1 + 128) >> 8
 * evaluated without 16-bit wrap-around.
 *
 * @param c       not used by this function
 * @param buf0    luma line; two luma samples are read per chroma sample
 * @param ubuf    U lines; ubuf[0] is always read, ubuf[1] only when
 *                uvalpha >= 2048
 * @param vbuf    V lines, used like ubuf
 * @param abuf0   not used by this function
 * @param dest    output line; 4 bytes are written per chroma sample
 * @param dstW    output width in pixels
 * @param uvalpha selects single-line chroma (< 2048) or the average of both
 *                chroma lines (>= 2048)
 * @param y       not used by this function
 * @param target  byte order, forwarded unchanged to write422()
 *
 * NOTE(review): every iteration loads 16 luma + 8 U + 8 V samples and stores
 * 32 bytes, with no scalar tail, so when (dstW + 1) >> 1 is not a multiple of
 * 8 the last iteration touches memory past the nominal line end. This
 * presumably relies on padded line buffers - confirm against the callers.
 * NOTE(review): vec_ld / vec_st are the aligned AltiVec accessors; all
 * pointers are assumed to be 16-byte aligned - confirm.
 */
static av_always_inline void
yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,
                       int uvalpha, int y, enum AVPixelFormat target)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    vector int16_t vy1, vy2, vu, vv, tmp;
    const vector int16_t add64 = vec_splats((int16_t) 64);
    const vector uint16_t shift1 = vec_splat_u16(1);
    const vector uint16_t shift7 = vec_splat_u16(7);
    int i;

    if (uvalpha < 2048) {
        for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
            vy1 = vec_ld(0, &buf0[i * 2]);
            vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
            vu  = vec_ld(0, &ubuf0[i]);
            vv  = vec_ld(0, &vbuf0[i]);

            /*
             * Saturating add: a plain vec_add wraps inputs above 32703 to a
             * negative value, which vec_packsu would then turn into 0
             * instead of 255.
             */
            vy1 = vec_adds(vy1, add64);
            vy2 = vec_adds(vy2, add64);
            vu  = vec_adds(vu, add64);
            vv  = vec_adds(vv, add64);

            vy1 = vec_sra(vy1, shift7);
            vy2 = vec_sra(vy2, shift7);
            vu  = vec_sra(vu, shift7);
            vv  = vec_sra(vv, shift7);

            write422(vy1, vy2, vu, vv, &dest[i * 4], target);
        }
    } else {
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];

        for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
            vy1 = vec_ld(0, &buf0[i * 2]);
            vy2 = vec_ld(0, &buf0[(i + 4) * 2]);

            /*
             * The sum of two chroma samples does not fit in 16 bits, and a
             * saturating 16-bit add would pin every sum above 32767 to 127
             * after the shift. Take the floor average instead, which always
             * fits: floor((a + b) / 2) == (a & b) + ((a ^ b) >> 1).
             * Then (avg + 64) >> 7 == (a + b + 128) >> 8.
             */
            vu  = vec_ld(0, &ubuf0[i]);
            tmp = vec_ld(0, &ubuf1[i]);
            vu  = vec_add(vec_and(vu, tmp), vec_sra(vec_xor(vu, tmp), shift1));

            vv  = vec_ld(0, &vbuf0[i]);
            tmp = vec_ld(0, &vbuf1[i]);
            vv  = vec_add(vec_and(vv, tmp), vec_sra(vec_xor(vv, tmp), shift1));

            /* Saturating bias, see the single-line branch. */
            vy1 = vec_adds(vy1, add64);
            vy2 = vec_adds(vy2, add64);
            vu  = vec_adds(vu, add64);
            vv  = vec_adds(vv, add64);

            vy1 = vec_sra(vy1, shift7);
            vy2 = vec_sra(vy2, shift7);
            vu  = vec_sra(vu, shift7);
            vv  = vec_sra(vv, shift7);

            write422(vy1, vy2, vu, vv, &dest[i * 4], target);
        }
    }
}
/*
 * Defines the static function name##ext##_1_vsx() with the parameter list
 * shown below. The generated function forwards all of its arguments to
 * name##base##_1_vsx_template() and appends `fmt` as the final argument.
 */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
const int16_t *ubuf[2], const int16_t *vbuf[2], \
const int16_t *abuf0, uint8_t *dest, int dstW, \
int uvalpha, int y) \
{ \
name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
abuf0, dest, dstW, uvalpha, \
y, fmt); \
}
/* Generates yuv2yuyv422_1_vsx, yuv2yvyu422_1_vsx and yuv2uyvy422_1_vsx. */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
#endif /* !HAVE_BIGENDIAN */
#endif /* HAVE_VSX */
@ -768,6 +905,18 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
}
break;
}
} else { /* !SWS_FULL_CHR_H_INT */
switch (dstFormat) {
case AV_PIX_FMT_YUYV422:
c->yuv2packed1 = yuv2yuyv422_1_vsx;
break;
case AV_PIX_FMT_YVYU422:
c->yuv2packed1 = yuv2yvyu422_1_vsx;
break;
case AV_PIX_FMT_UYVY422:
c->yuv2packed1 = yuv2uyvy422_1_vsx;
break;
}
}
#endif /* !HAVE_BIGENDIAN */