overread in the mmx2 horizontal scaler fixed

2% faster horizontal mmx2 scaler

Originally committed as revision 5453 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2002-04-01 14:01:22 +00:00
parent 0344cd0a7c
commit b7dc6f6628
3 changed files with 194 additions and 122 deletions

View File

@ -117,10 +117,6 @@ untested special converters
extern int verbose; // defined in mplayer.c
/*
NOTES
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
TODO
@ -1020,12 +1016,17 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
}
#ifdef ARCH_X86
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
uint8_t *fragment;
int imm8OfPShufW1;
int imm8OfPShufW2;
int fragmentLength;
uint8_t *fragmentA;
int imm8OfPShufW1A;
int imm8OfPShufW2A;
int fragmentLengthA;
uint8_t *fragmentB;
int imm8OfPShufW1B;
int imm8OfPShufW2B;
int fragmentLengthB;
int fragmentPos;
int xpos, i;
@ -1037,22 +1038,18 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%esi), %%mm0 \n\t" //FIXME Alignment
"movq %%mm0, %%mm1 \n\t"
"psrlq $8, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"addw %%bx, %%cx \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
"movq (%%edx, %%eax), %%mm3 \n\t"
"movd (%%ecx, %%esi), %%mm0 \n\t"
"movd 1(%%ecx, %%esi), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"adcl %%edx, %%esi \n\t" //xx+= (4*lumXInc)>>16 + carry
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psrlw $9, %%mm3 \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%ebx, %%eax), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
@ -1071,13 +1068,54 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
"=r" (fragmentLength)
:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
"=r" (fragmentLengthA)
);
asm volatile(
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%edx, %%eax), %%mm3 \n\t"
"movd (%%ecx, %%esi), %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%ebx, %%eax), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%edi, %%eax) \n\t"
"addl $8, %%eax \n\t"
// End
"9: \n\t"
// "int $3\n\t"
"leal 0b, %0 \n\t"
"leal 1b, %1 \n\t"
"leal 2b, %2 \n\t"
"decl %1 \n\t"
"decl %2 \n\t"
"subl %0, %1 \n\t"
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
"=r" (fragmentLengthB)
);
xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
for(i=0; i<dstW/8; i++)
fragmentPos=0;
for(i=0; i<dstW/numSplits; i++)
{
int xx=xpos>>16;
@ -1088,20 +1126,65 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
int c=((xpos+xInc*2)>>16) - xx;
int d=((xpos+xInc*3)>>16) - xx;
memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
filterPos[i/2]= xx;
funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
a | (b<<2) | (c<<4) | (d<<6);
if(d+1<4)
{
int maxShift= 3-(d+1);
int shift=0;
// if we don't need to read 8 bytes then don't :), reduces the chance of
// crossing a cache line
if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
funnyCode[fragmentLength*(i+4)/4]= RET;
funnyCode[fragmentPos + imm8OfPShufW1B]=
(a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
funnyCode[fragmentPos + imm8OfPShufW2B]=
a | (b<<2) | (c<<4) | (d<<6);
if(i+3>=dstW) shift=maxShift; //avoid overread
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
if(shift && i>=shift)
{
funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
filterPos[i/2]-=shift;
}
fragmentPos+= fragmentLengthB;
}
else
{
int maxShift= 3-d;
int shift=0;
memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
funnyCode[fragmentPos + imm8OfPShufW1A]=
funnyCode[fragmentPos + imm8OfPShufW2A]=
a | (b<<2) | (c<<4) | (d<<6);
if(i+4>=dstW) shift=maxShift; //avoid overread
else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
if(shift && i>=shift)
{
funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
filterPos[i/2]-=shift;
}
fragmentPos+= fragmentLengthA;
}
funnyCode[fragmentPos]= RET;
}
xpos+=xInc;
}
filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
#endif // ARCH_X86
@ -1565,8 +1648,13 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
// cant downscale !!!
if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
{
initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode);
initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
c->lumMmx2Filter = (int16_t*)memalign(8, (dstW /8+8)*sizeof(int16_t));
c->chrMmx2Filter = (int16_t*)memalign(8, (c->chrDstW /4+8)*sizeof(int16_t));
c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW /2/8+8)*sizeof(int32_t));
c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
}
#endif
} // Init Horizontal stuff
@ -2014,6 +2102,15 @@ void freeSwsContext(SwsContext *c){
if(c->chrMmxFilter) free(c->chrMmxFilter);
c->chrMmxFilter = NULL;
if(c->lumMmx2Filter) free(c->lumMmx2Filter);
c->lumMmx2Filter=NULL;
if(c->chrMmx2Filter) free(c->chrMmx2Filter);
c->chrMmx2Filter=NULL;
if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
c->lumMmx2FilterPos=NULL;
if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
c->chrMmx2FilterPos=NULL;
free(c);
}

View File

@ -69,6 +69,10 @@ typedef struct SwsContext{
uint8_t __attribute__((aligned(32))) funnyYCode[10000];
uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
int32_t *lumMmx2FilterPos;
int32_t *chrMmx2FilterPos;
int16_t *lumMmx2Filter;
int16_t *chrMmx2Filter;
int canMMX2BeUsed;

View File

@ -2238,7 +2238,8 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
int flags, int canMMX2BeUsed, int16_t *hLumFilter,
int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
int srcFormat, uint8_t *formatConvBuffer)
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
@ -2294,35 +2295,21 @@ static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, in
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFF
"movq %%mm2, %%mm4 \n\t"
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movl %0, %%ecx \n\t"
"movl %1, %%edi \n\t"
"movl %2, %%edx \n\t"
"movl %3, %%ebx \n\t"
"xorl %%eax, %%eax \n\t" // i
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (xInc*4)>>16
"xorl %%ecx, %%ecx \n\t"
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
#define FUNNY_Y_CODE \
PREFETCH" 1024(%%esi) \n\t"\
PREFETCH" 1056(%%esi) \n\t"\
PREFETCH" 1088(%%esi) \n\t"\
"call *%6 \n\t"\
"movq %%mm4, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
"movl (%%ebx), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%ebx, %%eax), %%ecx \n\t"\
"addl %%eax, %%edi \n\t"\
"xorl %%eax, %%eax \n\t"\
FUNNY_Y_CODE
FUNNY_Y_CODE
@ -2333,8 +2320,8 @@ FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
:: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
"m" (funnyYCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
@ -2402,7 +2389,8 @@ FUNNY_Y_CODE
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
int srcFormat, uint8_t *formatConvBuffer)
int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
int32_t *mmx2FilterPos)
{
if(srcFormat==IMGFMT_YUY2)
{
@ -2469,65 +2457,44 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, u
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // xInc&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=xInc&0xFFFF
"movq %%mm2, %%mm4 \n\t"
"movd %4, %%mm6 \n\t" //(xInc*4)&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (xInc*4)>>16
"xorl %%ecx, %%ecx \n\t"
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (xInc*4)&0xFFFF
"pxor %%mm7, %%mm7 \n\t"
"movl %0, %%ecx \n\t"
"movl %1, %%edi \n\t"
"movl %2, %%edx \n\t"
"movl %3, %%ebx \n\t"
"xorl %%eax, %%eax \n\t" // i
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
#define FUNNYUVCODE \
PREFETCH" 1024(%%esi) \n\t"\
PREFETCH" 1056(%%esi) \n\t"\
PREFETCH" 1088(%%esi) \n\t"\
"call *%7 \n\t"\
"movq %%mm4, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
#define FUNNY_UV_CODE \
"movl (%%ebx), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%ebx, %%eax), %%ecx \n\t"\
"addl %%eax, %%edi \n\t"\
"xorl %%eax, %%eax \n\t"\
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
"xorl %%eax, %%eax \n\t" // i
"movl %5, %%ecx \n\t" // src
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
PREFETCH" (%%ecx) \n\t"
PREFETCH" 32(%%ecx) \n\t"
PREFETCH" 64(%%ecx) \n\t"
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
"xorl %%eax, %%eax \n\t" // i
"movl %6, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
:: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
"m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
"m" (funnyUVCode), "m" (src2)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
{
// printf("%d %d %d\n", dstWidth, i, srcW);
@ -2749,7 +2716,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
// printf("%d %d\n", lumBufIndex, vLumBufSize);
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode, c->srcFormat, formatConvBuffer);
funnyYCode, c->srcFormat, formatConvBuffer,
c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf < lastChrSrcY)
@ -2763,7 +2731,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
//FIXME replace parameters through context struct (some at least)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode, c->srcFormat, formatConvBuffer);
funnyUVCode, c->srcFormat, formatConvBuffer,
c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer
@ -2787,7 +2756,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
funnyYCode, c->srcFormat, formatConvBuffer);
funnyYCode, c->srcFormat, formatConvBuffer,
c->lumMmx2Filter, c->lumMmx2FilterPos);
lastInLumBuf++;
}
while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
@ -2800,7 +2770,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
funnyUVCode, c->srcFormat, formatConvBuffer);
funnyUVCode, c->srcFormat, formatConvBuffer,
c->chrMmx2Filter, c->chrMmx2FilterPos);
lastInChrBuf++;
}
//wrap buf index around to stay inside the ring buffer