New implementation of rgb32tobgr32

The previous implementation segfaulted with MMX enabled when fed an image
smaller than the 8-byte unit (two 32-bit pixels) that the MMX loop processed
per iteration. The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is licensed under the LGPL

Originally committed as revision 23009 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
Ivo van Poorten 2007-04-16 21:41:03 +00:00
parent 4f99f93268
commit b38d487466

View File

@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size) static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{ {
uint8_t *d = dst, *s = (uint8_t *) src;
const uint8_t *end = s + src_size;
#ifdef HAVE_MMX #ifdef HAVE_MMX
/* TODO: unroll this loop */ __asm __volatile(
asm volatile ( " "PREFETCH" (%1) \n"
"xor %%"REG_a", %%"REG_a" \n\t" " movq %3, %%mm7 \n"
ASMALIGN(4) " pxor %4, %%mm7 \n"
"1: \n\t" " movq %%mm7, %%mm6 \n"
PREFETCH" 32(%0, %%"REG_a") \n\t" " pxor %5, %%mm7 \n"
"movq (%0, %%"REG_a"), %%mm0 \n\t" " jmp 2f \n"
"movq %%mm0, %%mm1 \n\t" ASMALIGN(4)
"movq %%mm0, %%mm2 \n\t" "1: \n"
"pslld $16, %%mm0 \n\t" " "PREFETCH" 32(%1) \n"
"psrld $16, %%mm1 \n\t" " movq (%1), %%mm0 \n"
"pand "MANGLE(mask32r)", %%mm0 \n\t" " movq 8(%1), %%mm1 \n"
"pand "MANGLE(mask32g)", %%mm2 \n\t" # ifdef HAVE_MMX2
"pand "MANGLE(mask32b)", %%mm1 \n\t" " pshufw $177, %%mm0, %%mm3 \n"
"por %%mm0, %%mm2 \n\t" " pshufw $177, %%mm1, %%mm5 \n"
"por %%mm1, %%mm2 \n\t" " pand %%mm7, %%mm0 \n"
MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t" " pand %%mm6, %%mm3 \n"
"add $8, %%"REG_a" \n\t" " pand %%mm7, %%mm1 \n"
"cmp %2, %%"REG_a" \n\t" " pand %%mm6, %%mm5 \n"
" jb 1b \n\t" " por %%mm3, %%mm0 \n"
:: "r" (src), "r"(dst), "r" (src_size-7) " por %%mm5, %%mm1 \n"
: "%"REG_a # else
); " movq %%mm0, %%mm2 \n"
" movq %%mm1, %%mm4 \n"
__asm __volatile(SFENCE:::"memory"); " pand %%mm7, %%mm0 \n"
__asm __volatile(EMMS:::"memory"); " pand %%mm6, %%mm2 \n"
#else " pand %%mm7, %%mm1 \n"
unsigned i; " pand %%mm6, %%mm4 \n"
unsigned num_pixels = src_size >> 2; " movq %%mm2, %%mm3 \n"
for(i=0; i<num_pixels; i++) " movq %%mm4, %%mm5 \n"
{ " pslld $16, %%mm2 \n"
#ifdef WORDS_BIGENDIAN " psrld $16, %%mm3 \n"
dst[4*i + 1] = src[4*i + 3]; " pslld $16, %%mm4 \n"
dst[4*i + 2] = src[4*i + 2]; " psrld $16, %%mm5 \n"
dst[4*i + 3] = src[4*i + 1]; " por %%mm2, %%mm0 \n"
#else " por %%mm4, %%mm1 \n"
dst[4*i + 0] = src[4*i + 2]; " por %%mm3, %%mm0 \n"
dst[4*i + 1] = src[4*i + 1]; " por %%mm5, %%mm1 \n"
dst[4*i + 2] = src[4*i + 0]; # endif
" "MOVNTQ" %%mm0, (%0) \n"
" "MOVNTQ" %%mm1, 8(%0) \n"
" add $16, %0 \n"
" add $16, %1 \n"
"2: \n"
" cmp %1, %2 \n"
" ja 1b \n"
" "SFENCE" \n"
" "EMMS" \n"
: "+r"(d), "+r"(s)
: "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
: "memory");
#endif #endif
for (; s<end; s+=4, d+=4) {
int v = *(uint32_t *)s, g = v & 0xff00;
v &= 0xff00ff;
*(uint32_t *)d = (v>>16) + g + (v<<16);
} }
#endif
} }
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size) static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)