libswscale/x86/yuv2rgb: Change inline assembly into nasm code

The original inline assembly and the NASM code deliver the same fps when invoked from the command line.
The NASM code has almost no impact on performance.

Signed-off-by: Ting Fu <ting.fu@intel.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Ting Fu 2020-01-19 11:51:03 +08:00 committed by Michael Niedermayer
parent 2d58fa6d9e
commit e934194b6a
5 changed files with 384 additions and 387 deletions

libswscale/x86/Makefile

@ -12,3 +12,4 @@ X86ASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \

libswscale/x86/swscale.c

@ -29,6 +29,14 @@
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
#if HAVE_INLINE_ASM
#define DITHER1XBPP
@ -38,14 +46,6 @@ DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;

libswscale/x86/yuv2rgb.c

@ -37,7 +37,7 @@
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#if HAVE_INLINE_ASM
#if HAVE_X86ASM
#define DITHER1XBPP // only for MMX
@ -50,32 +50,31 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
//MMX versions
#if HAVE_MMX_INLINE && HAVE_6REGS
#if HAVE_MMX
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
#endif /* HAVE_MMX */
// MMXEXT versions
#if HAVE_MMXEXT_INLINE && HAVE_6REGS
#if HAVE_MMXEXT
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
#endif /* HAVE_MMXEXT */
#endif /* HAVE_INLINE_ASM */
#endif /* HAVE_X86ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_MMX_INLINE && HAVE_6REGS
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmxext;
@ -83,13 +82,12 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
return yuv420_bgr24_mmxext;
}
}
#endif
if (INLINE_MMX(cpu_flags)) {
if (EXTERNAL_MMX(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
#if CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_mmx;
#endif
break;
@ -97,7 +95,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
return yuv420_rgb32_mmx;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
#if CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_mmx;
#endif
break;
@ -113,7 +111,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
return yuv420_rgb15_mmx;
}
}
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
#endif /* HAVE_X86ASM */
return NULL;
}

libswscale/x86/yuv2rgb_template.c

@ -26,23 +26,6 @@
#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
@ -50,7 +33,6 @@
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
@ -58,146 +40,33 @@
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
__asm__ volatile ( \
"movq (%5, %0, 2), %%mm6\n\t" \
"movd (%2, %0), %%mm0\n\t" \
"movd (%3, %0), %%mm1\n\t" \
"1: \n\t" \
/* YUV2RGB core
* Conversion is performed in usual way:
* R = Y' * Ycoef + Vred * V'
* G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
* B = Y' * Ycoef + Ublue * U'
*
* where X' = X * 8 - Xoffset (multiplication is performed to increase
* precision a bit).
* Since it operates in YUV420 colorspace, Y component is additionally
* split into Y1 and Y2 for even and odd pixels.
*
* Input:
* mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
* Output:
* mm1 - R, mm2 - G, mm0 - B
*/
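In scalar terms, the conversion described above boils down to the sketch below. This is a rough illustration only: clip_uint8 and yuv2rgb_pixel_sketch are placeholder names, the coefficient/offset arguments stand for the 16-bit values in the table passed via &c->redDither (Y_COEFF, U_OFFSET, ... in the operands), and the MMX code's saturating intermediate adds are reduced to a single final clamp.

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* One pixel: y8/u8/v8 are the 8-bit input samples, the remaining ints are
 * the per-context coefficients and offsets. */
static void yuv2rgb_pixel_sketch(int y8, int u8, int v8,
                                 int y_coeff, int vr_coeff, int ub_coeff,
                                 int ug_coeff, int vg_coeff,
                                 int y_offset, int u_offset, int v_offset,
                                 uint8_t *r, uint8_t *g, uint8_t *b)
{
    int yp   = y8 * 8 - y_offset;            /* X' = X * 8 - Xoffset          */
    int up   = u8 * 8 - u_offset;
    int vp   = v8 * 8 - v_offset;
    int luma = (yp * y_coeff) >> 16;         /* pmulhw keeps the high 16 bits */

    *r = clip_uint8(luma + ((vp * vr_coeff) >> 16));
    *g = clip_uint8(luma + ((up * ug_coeff) >> 16) + ((vp * vg_coeff) >> 16));
    *b = clip_uint8(luma + ((up * ub_coeff) >> 16));
}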
#define YUV2RGB \
/* convert Y, U, V into Y1', Y2', U', V' */ \
"movq %%mm6, %%mm7\n\t" \
"punpcklbw %%mm4, %%mm0\n\t" \
"punpcklbw %%mm4, %%mm1\n\t" \
"pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
"psrlw $8, %%mm7\n\t" \
"psllw $3, %%mm0\n\t" \
"psllw $3, %%mm1\n\t" \
"psllw $3, %%mm6\n\t" \
"psllw $3, %%mm7\n\t" \
"psubsw "U_OFFSET"(%4), %%mm0\n\t" \
"psubsw "V_OFFSET"(%4), %%mm1\n\t" \
"psubw "Y_OFFSET"(%4), %%mm6\n\t" \
"psubw "Y_OFFSET"(%4), %%mm7\n\t" \
\
/* multiply by coefficients */ \
"movq %%mm0, %%mm2\n\t" \
"movq %%mm1, %%mm3\n\t" \
"pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
"pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
"pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
"pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
"paddsw %%mm3, %%mm2\n\t" \
/* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
/* mm6 = Y1, mm7 = Y2 */ \
\
/* produce RGB */ \
"movq %%mm7, %%mm3\n\t" \
"movq %%mm7, %%mm5\n\t" \
"paddsw %%mm0, %%mm3\n\t" \
"paddsw %%mm1, %%mm5\n\t" \
"paddsw %%mm2, %%mm7\n\t" \
"paddsw %%mm6, %%mm0\n\t" \
"paddsw %%mm6, %%mm1\n\t" \
"paddsw %%mm6, %%mm2\n\t" \
#define RGB_PACK_INTERLEAVE \
/* pack and interleave even/odd pixels */ \
"packuswb %%mm1, %%mm0\n\t" \
"packuswb %%mm5, %%mm3\n\t" \
"packuswb %%mm2, %%mm2\n\t" \
"movq %%mm0, %%mm1\n\n" \
"packuswb %%mm7, %%mm7\n\t" \
"punpcklbw %%mm3, %%mm0\n\t" \
"punpckhbw %%mm3, %%mm1\n\t" \
"punpcklbw %%mm7, %%mm2\n\t" \
#define YUV2RGB_ENDLOOP(depth) \
"movq 8 (%5, %0, 2), %%mm6\n\t" \
"movd 4 (%3, %0), %%mm1\n\t" \
"movd 4 (%2, %0), %%mm0\n\t" \
"add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
"add $4, %0\n\t" \
"js 1b\n\t" \
#if COMPILE_TEMPLATE_MMXEXT
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
#else
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS
#endif
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \
RGB_PACK24_B_OPERANDS \
: "memory" \
); \
} \
#define YUV2RGB_OPERANDS_ALPHA \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw) \
: "memory" \
); \
} \
#define YUV2RGB_ENDFUNC \
__asm__ volatile (SFENCE"\n\t" \
"emms \n\t"); \
return srcSliceH; \
#define IF0(x)
#define IF1(x) x
#define RGB_PACK16(gmask, is15) \
"pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
"pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
"movq %%mm2, %%mm3\n\t" \
"psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
"psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
"psrlw $3, %%mm0\n\t" \
IF##is15("psrlw $1, %%mm1\n\t") \
"pand "MANGLE(pb_e0)", %%mm2\n\t" \
"pand "MANGLE(gmask)", %%mm3\n\t" \
"por %%mm2, %%mm0\n\t" \
"por %%mm3, %%mm1\n\t" \
"movq %%mm0, %%mm2\n\t" \
"punpcklbw %%mm1, %%mm0\n\t" \
"punpckhbw %%mm1, %%mm2\n\t" \
MOVNTQ " %%mm0, (%1)\n\t" \
MOVNTQ " %%mm2, 8(%1)\n\t" \
#define DITHER_RGB \
"paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
"paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
"paddusb "RED_DITHER"(%4), %%mm1\n\t" \
extern void RENAME(ff_yuv_420_rgb24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
#if !COMPILE_TEMPLATE_MMXEXT
extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb16)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuv_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void RENAME(ff_yuva_420_rgb32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
extern void RENAME(ff_yuva_420_bgr32)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
@ -213,17 +82,9 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_03, 1)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
RENAME(ff_yuv_420_rgb15)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
@ -241,79 +102,67 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_07, 0)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
RENAME(ff_yuv_420_rgb16)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
#define RGB_PACK24(blue, red)\
"packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
"packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
"packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
"movq %%mm"red", %%mm3 \n"\
"movq %%mm"blue", %%mm6 \n"\
"psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
"punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
"punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
"movq %%mm3, %%mm5 \n"\
"punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
"punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
"punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
RGB_PACK24_B
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
#if COMPILE_TEMPLATE_MMXEXT
DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"pshufw $0xc6, %%mm2, %%mm1 \n"\
"pshufw $0x84, %%mm3, %%mm6 \n"\
"pshufw $0x38, %%mm5, %%mm7 \n"\
"pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
"movq %%mm1, %%mm0 \n"\
"pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
"movq %%mm1, %%mm2 \n"\
"pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
"psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
"pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
"psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
"pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
"por %%mm3, %%mm1 \n"\
"por %%mm6, %%mm0 \n"\
"por %%mm5, %%mm1 \n"\
"por %%mm7, %%mm2 \n"\
MOVNTQ" %%mm0, (%1) \n"\
MOVNTQ" %%mm1, 8(%1) \n"\
MOVNTQ" %%mm2, 16(%1) \n"\
YUV2RGB_LOOP(4)
#else
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
"movd %%mm2, 4(%1) \n" /* G1 B1 */\
"psrlq $32, %%mm3 \n"\
"psrlq $16, %%mm2 \n"\
"movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
"movd %%mm2, 10(%1) \n" /* G3 B3 */\
"psrlq $16, %%mm2 \n"\
"movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
"movd %%mm2, 16(%1) \n" /* G5 B5 */\
"psrlq $32, %%mm5 \n"\
"movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
"movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
RENAME(ff_yuv_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
RENAME(ff_yuv_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
RENAME(ff_yuva_420_rgb32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
RENAME(ff_yuva_420_bgr32)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
#endif
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
@ -325,143 +174,22 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_BLUE, REG_RED)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
RENAME(ff_yuv_420_rgb24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_RED, REG_BLUE)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
RENAME(ff_yuv_420_bgr24)(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
#define SET_EMPTY_ALPHA \
"pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
#define LOAD_ALPHA \
"movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
#define RGB_PACK32(red, green, blue, alpha) \
"movq %%mm"blue", %%mm5\n\t" \
"movq %%mm"red", %%mm6\n\t" \
"punpckhbw %%mm"green", %%mm5\n\t" \
"punpcklbw %%mm"green", %%mm"blue"\n\t" \
"punpckhbw %%mm"alpha", %%mm6\n\t" \
"punpcklbw %%mm"alpha", %%mm"red"\n\t" \
"movq %%mm"blue", %%mm"green"\n\t" \
"movq %%mm5, %%mm"alpha"\n\t" \
"punpcklwd %%mm"red", %%mm"blue"\n\t" \
"punpckhwd %%mm"red", %%mm"green"\n\t" \
"punpcklwd %%mm6, %%mm5\n\t" \
"punpckhwd %%mm6, %%mm"alpha"\n\t" \
MOVNTQ " %%mm"blue", 0(%1)\n\t" \
MOVNTQ " %%mm"green", 8(%1)\n\t" \
MOVNTQ " %%mm5, 16(%1)\n\t" \
MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
#endif /* !COMPILE_TEMPLATE_MMXEXT */

libswscale/x86/yuv_2_rgb.asm

@ -0,0 +1,270 @@
;******************************************************************************
;* software YUV to RGB converter
;*
;* Copyright (C) 2001-2007 Michael Niedermayer
;* (c) 2010 Konstantin Shishkov
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_00ff: times 4 dw 255
pb_f8: times 8 db 248
pb_e0: times 8 db 224
pb_03: times 8 db 3
pb_07: times 8 db 7
mask_1101: dw -1, -1, 0, -1
mask_0010: dw 0, 0, -1, 0
mask_0110: dw 0, -1, -1, 0
mask_1001: dw -1, 0, 0, -1
mask_0100: dw 0, -1, 0, 0
SECTION .text
;-----------------------------------------------------------------------------
;
; YUV420/YUVA420 to RGB/BGR 15/16/24/32
; R = Y + ((vrCoff * (v - 128)) >> 8)
; G = Y - ((ugCoff * (u - 128) + vgCoff * (v - 128)) >> 8)
; B = Y + ((ubCoff * (u - 128)) >> 8)
;
;-----------------------------------------------------------------------------
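The formulas above cover the per-pixel math; the kernels additionally assume the indexing convention set up on the C side (YUV2RGB_LOOP plus the pre-biased pointers pu - index, pv - index, py - 2 * index). A plain-C sketch of that convention follows, with pack8_fn and convert_row_sketch as illustrative placeholders rather than FFmpeg APIs.

#include <stdint.h>

/* Hypothetical callback converting 8 luma + 4 chroma samples into 8 output
 * pixels at 'image'. */
typedef void (*pack8_fn)(uint8_t *image, const uint8_t *py,
                         const uint8_t *pu, const uint8_t *pv);

static void convert_row_sketch(pack8_fn pack8, uint8_t *image,
                               const uint8_t *py, const uint8_t *pu,
                               const uint8_t *pv, int h_size, int depth)
{
    long index = -h_size / 2;              /* h_size is padded to a multiple of 8 */
    const uint8_t *py2 = py - 2 * index;   /* biased bases, exactly as the C      */
    const uint8_t *pu1 = pu - index;       /* wrappers pass them to the kernels   */
    const uint8_t *pv1 = pv - index;

    do {
        pack8(image, py2 + 2 * index, pu1 + index, pv1 + index);
        image += 8 * depth;                /* depth bytes per output pixel        */
        index += 4;                        /* 4 chroma samples == 8 luma samples  */
    } while (index < 0);
}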
%macro MOV_H2L 1
psrlq %1, 32
%endmacro
%macro yuv2rgb_fn 3
%if %3 == 32
%ifidn %1, yuva
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index, pa_2index
%define GPR_num 7
%endif
%else
%define parameters index, image, pu_index, pv_index, pointer_c_dither, py_2index
%define GPR_num 6
%endif
%define m_green m2
%define m_alpha m3
%define m_y m6
%define m_u m0
%define m_v m1
%ifidn %2, rgb
%define m_red m1
%define m_blue m0
%else
%define m_red m0
%define m_blue m1
%endif
%define time_num 1
%define reg_num 8
%define y_offset [pointer_c_ditherq + 8 * 8]
%define u_offset [pointer_c_ditherq + 9 * 8]
%define v_offset [pointer_c_ditherq + 10 * 8]
%define ug_coff [pointer_c_ditherq + 7 * 8]
%define vg_coff [pointer_c_ditherq + 6 * 8]
%define y_coff [pointer_c_ditherq + 3 * 8]
%define ub_coff [pointer_c_ditherq + 5 * 8]
%define vr_coff [pointer_c_ditherq + 4 * 8]
cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
%if ARCH_X86_64
movsxd indexq, indexd
%endif
mova m_y, [py_2indexq + 2 * indexq]
movh m_u, [pu_indexq + indexq]
movh m_v, [pv_indexq + indexq]
.loop0:
pxor m4, m4
mova m7, m6
punpcklbw m0, m4
punpcklbw m1, m4
mova m2, [pw_00ff]
pand m6, m2
psrlw m7, 8
psllw m0, 3
psllw m1, 3
psllw m6, 3
psllw m7, 3
psubsw m0, u_offset ; U = U - 128
psubsw m1, v_offset ; V = V - 128
psubw m6, y_offset
psubw m7, y_offset
mova m2, m0
mova m3, m1
pmulhw m2, ug_coff
pmulhw m3, vg_coff
pmulhw m6, y_coff
pmulhw m7, y_coff
pmulhw m0, ub_coff
pmulhw m1, vr_coff
paddsw m2, m3
mova m3, m7
mova m5, m7
paddsw m3, m0 ; B1 B3 B5 B7 ...
paddsw m5, m1 ; R1 R3 R5 R7 ...
paddsw m7, m2 ; G1 G3 G5 G7 ...
paddsw m0, m6 ; B0 B2 B4 B6 ...
paddsw m1, m6 ; R0 R2 R4 R6 ...
paddsw m2, m6 ; G0 G2 G4 G6 ...
%if %3 == 24 ; PACK RGB24
%define depth 3
packuswb m0, m3 ; R0 R2 R4 R6 ... R1 R3 R5 R7 ...
packuswb m1, m5 ; B0 B2 B4 B6 ... B1 B3 B5 B7 ...
packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
mova m3, m_red
mova m6, m_blue
MOV_H2L m_red
punpcklbw m3, m2 ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
punpcklbw m6, m_red ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
mova m5, m3
punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
punpcklwd m3, m6 ; R0 G0 B0 R1 R2 G2 B2 R3
punpckhwd m5, m6 ; R4 G4 B4 R5 R6 G6 B6 R7
%if cpuflag(mmxext)
pshufw m1, m2, 0xc6
pshufw m6, m3, 0x84
pshufw m7, m5, 0x38
pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
movq m0, m1
pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
movq m2, m1
pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
psrlq m3, 48 ; B2 R3 -- -- -- -- -- --
pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
psllq m5, 32 ; -- -- -- -- R4 G4 B4 R5
pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
por m1, m3
por m0, m6
por m1, m5
por m2, m7
movntq [imageq], m0
movntq [imageq + 8], m1
movntq [imageq + 16], m2
%else ; cpuflag(mmx)
movd [imageq], m3 ; R0 G0 R2 G2
movd [imageq + 4], m2 ; G1 B1
psrlq m3, 32
psrlq m2, 16
movd [imageq + 6], m3 ; R2 G2 B2 R3
movd [imageq + 10], m2 ; G3 B3
psrlq m2, 16
movd [imageq + 12], m5 ; R4 G4 B4 R5
movd [imageq + 16], m2 ; G5 B5
psrlq m5, 32
movd [imageq + 20], m2 ; -- -- G7 B7
movd [imageq + 18], m5 ; R6 G6 B6 R7
%endif
%else ; PACK RGB15/16/32
packuswb m0, m1
packuswb m3, m5
packuswb m2, m2
mova m1, m0
packuswb m7, m7
punpcklbw m0, m3 ; B0 B1 B2 B3 ... B7
punpckhbw m1, m3 ; R0 R1 R2 R3 ... R7
punpcklbw m2, m7 ; G0 G1 G2 G3 ... G7
%if %3 == 32 ; PACK RGB32
%define depth 4
%ifidn %1, yuv
pcmpeqd m3, m3 ; Set alpha empty
%else
mova m3, [pa_2indexq + 2 * indexq] ; Load alpha
%endif
mova m5, m_blue
mova m6, m_red
punpckhbw m5, m_green
punpcklbw m_blue, m_green
punpckhbw m6, m_alpha
punpcklbw m_red, m_alpha
mova m_green, m_blue
mova m_alpha, m5
punpcklwd m_blue, m_red
punpckhwd m_green, m_red
punpcklwd m5, m6
punpckhwd m_alpha, m6
mova [imageq + 0], m_blue
mova [imageq + 8 * time_num], m_green
mova [imageq + 16 * time_num], m5
mova [imageq + 24 * time_num], m_alpha
%else ; PACK RGB15/16
%define depth 2
%define blue_dither [pointer_c_ditherq + 2 * 8]
%define green_dither [pointer_c_ditherq + 1 * 8]
%define red_dither [pointer_c_ditherq + 0 * 8]
%if %3 == 15
%define gmask pb_03
%define isRGB15 1
%else
%define gmask pb_07
%define isRGB15 0
%endif
paddusb m0, blue_dither
paddusb m2, green_dither
paddusb m1, red_dither
pand m0, [pb_f8]
pand m1, [pb_f8]
mova m3, m2
psllw m2, 3 - isRGB15
psrlw m3, 5 + isRGB15
psrlw m0, 3
psrlw m1, isRGB15
pand m2, [pb_e0]
pand m3, [gmask]
por m0, m2
por m1, m3
mova m2, m0
punpcklbw m0, m1
punpckhbw m2, m1
mova [imageq], m0
mova [imageq + 8 * time_num], m2
%endif ; PACK RGB15/16
%endif ; PACK RGB15/16/32
mova m_y, [py_2indexq + 2 * indexq + 8 * time_num]
movh m_v, [pv_indexq + indexq + 4 * time_num]
movh m_u, [pu_indexq + indexq + 4 * time_num]
add imageq, 8 * depth * time_num
add indexq, 4 * time_num
js .loop0
REP_RET
%endmacro
INIT_MMX mmx
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24
yuv2rgb_fn yuv, rgb, 32
yuv2rgb_fn yuv, bgr, 32
yuv2rgb_fn yuva, rgb, 32
yuv2rgb_fn yuva, bgr, 32
yuv2rgb_fn yuv, rgb, 15
yuv2rgb_fn yuv, rgb, 16
INIT_MMX mmxext
yuv2rgb_fn yuv, rgb, 24
yuv2rgb_fn yuv, bgr, 24