From 428098165de4c3edfe42c1b7f00627d287015863 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sun, 29 Apr 2007 21:36:43 +0000 Subject: [PATCH] cosmetics attack, part III: Remove all tabs and prettyprint/reindent the code. Originally committed as revision 23175 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale --- libswscale/yuv2rgb.c | 782 ++++++++++---------- libswscale/yuv2rgb_altivec.c | 1272 +++++++++++++++++---------------- libswscale/yuv2rgb_mlib.c | 36 +- libswscale/yuv2rgb_template.c | 652 ++++++++--------- 4 files changed, 1373 insertions(+), 1369 deletions(-) diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c index db6534287e..98a3569d85 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -156,7 +156,7 @@ const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ #ifdef HAVE_MMX /* hope these constant values are cache line aligned */ -static uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ffULL; +static uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ffULL; static uint64_t attribute_used __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8ULL; static uint64_t attribute_used __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfcULL; @@ -172,12 +172,12 @@ static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither; static uint64_t __attribute__((aligned(8))) dither4[2]={ - 0x0103010301030103LL, - 0x0200020002000200LL,}; + 0x0103010301030103LL, + 0x0200020002000200LL,}; static uint64_t __attribute__((aligned(8))) dither8[2]={ - 0x0602060206020602LL, - 0x0004000400040004LL,}; + 0x0602060206020602LL, + 0x0004000400040004LL,}; #undef HAVE_MMX @@ -210,404 +210,404 @@ const int32_t Inverse_Table_6_9[8][4] = { {117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */ }; -#define RGB(i) \ - U = pu[i]; \ - V = pv[i]; \ - r = (void *)c->table_rV[V]; \ - g = (void *)(c->table_gU[U] + c->table_gV[V]); \ - b = (void *)c->table_bU[U]; +#define RGB(i) \ + U = pu[i]; \ + V = pv[i]; \ + r = (void *)c->table_rV[V]; \ + g = (void *)(c->table_gU[U] + c->table_gV[V]); \ + b = (void *)c->table_bU[U]; -#define DST1(i) \ - Y = py_1[2*i]; \ - dst_1[2*i] = r[Y] + g[Y] + b[Y]; \ - Y = py_1[2*i+1]; \ - dst_1[2*i+1] = r[Y] + g[Y] + b[Y]; +#define DST1(i) \ + Y = py_1[2*i]; \ + dst_1[2*i] = r[Y] + g[Y] + b[Y]; \ + Y = py_1[2*i+1]; \ + dst_1[2*i+1] = r[Y] + g[Y] + b[Y]; -#define DST2(i) \ - Y = py_2[2*i]; \ - dst_2[2*i] = r[Y] + g[Y] + b[Y]; \ - Y = py_2[2*i+1]; \ - dst_2[2*i+1] = r[Y] + g[Y] + b[Y]; +#define DST2(i) \ + Y = py_2[2*i]; \ + dst_2[2*i] = r[Y] + g[Y] + b[Y]; \ + Y = py_2[2*i+1]; \ + dst_2[2*i+1] = r[Y] + g[Y] + b[Y]; -#define DST1RGB(i) \ - Y = py_1[2*i]; \ - dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \ - Y = py_1[2*i+1]; \ - dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y]; +#define DST1RGB(i) \ + Y = py_1[2*i]; \ + dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \ + Y = py_1[2*i+1]; \ + dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y]; -#define DST2RGB(i) \ - Y = py_2[2*i]; \ - dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \ - Y = py_2[2*i+1]; \ - dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y]; +#define DST2RGB(i) \ + Y = py_2[2*i]; \ + dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \ + Y = py_2[2*i+1]; \ + dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y]; -#define DST1BGR(i) \ - Y = py_1[2*i]; \ - dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \ - Y = py_1[2*i+1]; \ - dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y]; +#define DST1BGR(i) \ + Y = py_1[2*i]; \ + dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \ + Y = py_1[2*i+1]; \ + dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y]; -#define DST2BGR(i) \ - Y = py_2[2*i]; \ - dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \ - Y = py_2[2*i+1]; \ - dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y]; +#define DST2BGR(i) \ + Y = py_2[2*i]; \ + dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \ + Y = py_2[2*i+1]; \ + dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y]; #define PROLOG(func_name, dst_type) \ static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \ - int srcSliceH, uint8_t* dst[], int dstStride[]){\ + int srcSliceH, uint8_t* dst[], int dstStride[]){\ int y;\ \ - if(c->srcFormat == PIX_FMT_YUV422P){\ - srcStride[1] *= 2;\ - srcStride[2] *= 2;\ + if (c->srcFormat == PIX_FMT_YUV422P){\ + srcStride[1] *= 2;\ + srcStride[2] *= 2;\ }\ - for(y=0; y>1)*srcStride[1];\ - uint8_t *pv= src[2] + (y>>1)*srcStride[2];\ - unsigned int h_size= c->dstW>>3;\ - while (h_size--) {\ - int attribute_unused U, V;\ - int Y;\ + for (y=0; y>1)*srcStride[1];\ + uint8_t *pv= src[2] + (y>>1)*srcStride[2];\ + unsigned int h_size= c->dstW>>3;\ + while (h_size--) {\ + int attribute_unused U, V;\ + int Y;\ #define EPILOG(dst_delta)\ - pu += 4;\ - pv += 4;\ - py_1 += 8;\ - py_2 += 8;\ - dst_1 += dst_delta;\ - dst_2 += dst_delta;\ - }\ + pu += 4;\ + pv += 4;\ + py_1 += 8;\ + py_2 += 8;\ + dst_1 += dst_delta;\ + dst_2 += dst_delta;\ + }\ }\ return srcSliceH;\ } PROLOG(yuv2rgb_c_32, uint32_t) - RGB(0); - DST1(0); - DST2(0); + RGB(0); + DST1(0); + DST2(0); - RGB(1); - DST2(1); - DST1(1); + RGB(1); + DST2(1); + DST1(1); - RGB(2); - DST1(2); - DST2(2); + RGB(2); + DST1(2); + DST2(2); - RGB(3); - DST2(3); - DST1(3); + RGB(3); + DST2(3); + DST1(3); EPILOG(8) PROLOG(yuv2rgb_c_24_rgb, uint8_t) - RGB(0); - DST1RGB(0); - DST2RGB(0); + RGB(0); + DST1RGB(0); + DST2RGB(0); - RGB(1); - DST2RGB(1); - DST1RGB(1); + RGB(1); + DST2RGB(1); + DST1RGB(1); - RGB(2); - DST1RGB(2); - DST2RGB(2); + RGB(2); + DST1RGB(2); + DST2RGB(2); - RGB(3); - DST2RGB(3); - DST1RGB(3); + RGB(3); + DST2RGB(3); + DST1RGB(3); EPILOG(24) // only trivial mods from yuv2rgb_c_24_rgb PROLOG(yuv2rgb_c_24_bgr, uint8_t) - RGB(0); - DST1BGR(0); - DST2BGR(0); + RGB(0); + DST1BGR(0); + DST2BGR(0); - RGB(1); - DST2BGR(1); - DST1BGR(1); + RGB(1); + DST2BGR(1); + DST1BGR(1); - RGB(2); - DST1BGR(2); - DST2BGR(2); + RGB(2); + DST1BGR(2); + DST2BGR(2); - RGB(3); - DST2BGR(3); - DST1BGR(3); + RGB(3); + DST2BGR(3); + DST1BGR(3); EPILOG(24) // This is exactly the same code as yuv2rgb_c_32 except for the types of // r, g, b, dst_1, dst_2 PROLOG(yuv2rgb_c_16, uint16_t) - RGB(0); - DST1(0); - DST2(0); + RGB(0); + DST1(0); + DST2(0); - RGB(1); - DST2(1); - DST1(1); + RGB(1); + DST2(1); + DST1(1); - RGB(2); - DST1(2); - DST2(2); + RGB(2); + DST1(2); + DST2(2); - RGB(3); - DST2(3); - DST1(3); + RGB(3); + DST2(3); + DST1(3); EPILOG(8) // This is exactly the same code as yuv2rgb_c_32 except for the types of // r, g, b, dst_1, dst_2 PROLOG(yuv2rgb_c_8, uint8_t) - RGB(0); - DST1(0); - DST2(0); + RGB(0); + DST1(0); + DST2(0); - RGB(1); - DST2(1); - DST1(1); + RGB(1); + DST2(1); + DST1(1); - RGB(2); - DST1(2); - DST2(2); + RGB(2); + DST1(2); + DST2(2); - RGB(3); - DST2(3); - DST1(3); + RGB(3); + DST2(3); + DST1(3); EPILOG(8) // r, g, b, dst_1, dst_2 PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t) - const uint8_t *d32= dither_8x8_32[y&7]; - const uint8_t *d64= dither_8x8_73[y&7]; -#define DST1bpp8(i,o) \ - Y = py_1[2*i]; \ - dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \ - Y = py_1[2*i+1]; \ - dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]]; + const uint8_t *d32= dither_8x8_32[y&7]; + const uint8_t *d64= dither_8x8_73[y&7]; +#define DST1bpp8(i,o) \ + Y = py_1[2*i]; \ + dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \ + Y = py_1[2*i+1]; \ + dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]]; -#define DST2bpp8(i,o) \ - Y = py_2[2*i]; \ - dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \ - Y = py_2[2*i+1]; \ - dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]]; +#define DST2bpp8(i,o) \ + Y = py_2[2*i]; \ + dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \ + Y = py_2[2*i+1]; \ + dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]]; - RGB(0); - DST1bpp8(0,0); - DST2bpp8(0,0); + RGB(0); + DST1bpp8(0,0); + DST2bpp8(0,0); - RGB(1); - DST2bpp8(1,2); - DST1bpp8(1,2); + RGB(1); + DST2bpp8(1,2); + DST1bpp8(1,2); - RGB(2); - DST1bpp8(2,4); - DST2bpp8(2,4); + RGB(2); + DST1bpp8(2,4); + DST2bpp8(2,4); - RGB(3); - DST2bpp8(3,6); - DST1bpp8(3,6); + RGB(3); + DST2bpp8(3,6); + DST1bpp8(3,6); EPILOG(8) // This is exactly the same code as yuv2rgb_c_32 except for the types of // r, g, b, dst_1, dst_2 PROLOG(yuv2rgb_c_4, uint8_t) - int acc; -#define DST1_4(i) \ - Y = py_1[2*i]; \ - acc = r[Y] + g[Y] + b[Y]; \ - Y = py_1[2*i+1]; \ - acc |= (r[Y] + g[Y] + b[Y])<<4;\ - dst_1[i] = acc; + int acc; +#define DST1_4(i) \ + Y = py_1[2*i]; \ + acc = r[Y] + g[Y] + b[Y]; \ + Y = py_1[2*i+1]; \ + acc |= (r[Y] + g[Y] + b[Y])<<4; \ + dst_1[i] = acc; -#define DST2_4(i) \ - Y = py_2[2*i]; \ - acc = r[Y] + g[Y] + b[Y]; \ - Y = py_2[2*i+1]; \ - acc |= (r[Y] + g[Y] + b[Y])<<4;\ - dst_2[i] = acc; +#define DST2_4(i) \ + Y = py_2[2*i]; \ + acc = r[Y] + g[Y] + b[Y]; \ + Y = py_2[2*i+1]; \ + acc |= (r[Y] + g[Y] + b[Y])<<4; \ + dst_2[i] = acc; - RGB(0); - DST1_4(0); - DST2_4(0); + RGB(0); + DST1_4(0); + DST2_4(0); - RGB(1); - DST2_4(1); - DST1_4(1); + RGB(1); + DST2_4(1); + DST1_4(1); - RGB(2); - DST1_4(2); - DST2_4(2); + RGB(2); + DST1_4(2); + DST2_4(2); - RGB(3); - DST2_4(3); - DST1_4(3); + RGB(3); + DST2_4(3); + DST1_4(3); EPILOG(4) PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t) - const uint8_t *d64= dither_8x8_73[y&7]; - const uint8_t *d128=dither_8x8_220[y&7]; - int acc; + const uint8_t *d64= dither_8x8_73[y&7]; + const uint8_t *d128=dither_8x8_220[y&7]; + int acc; -#define DST1bpp4(i,o) \ - Y = py_1[2*i]; \ - acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ - Y = py_1[2*i+1]; \ - acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;\ - dst_1[i]= acc; +#define DST1bpp4(i,o) \ + Y = py_1[2*i]; \ + acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ + Y = py_1[2*i+1]; \ + acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4; \ + dst_1[i]= acc; -#define DST2bpp4(i,o) \ - Y = py_2[2*i]; \ - acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ - Y = py_2[2*i+1]; \ - acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4;\ - dst_2[i]= acc; +#define DST2bpp4(i,o) \ + Y = py_2[2*i]; \ + acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ + Y = py_2[2*i+1]; \ + acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4; \ + dst_2[i]= acc; - RGB(0); - DST1bpp4(0,0); - DST2bpp4(0,0); + RGB(0); + DST1bpp4(0,0); + DST2bpp4(0,0); - RGB(1); - DST2bpp4(1,2); - DST1bpp4(1,2); + RGB(1); + DST2bpp4(1,2); + DST1bpp4(1,2); - RGB(2); - DST1bpp4(2,4); - DST2bpp4(2,4); + RGB(2); + DST1bpp4(2,4); + DST2bpp4(2,4); - RGB(3); - DST2bpp4(3,6); - DST1bpp4(3,6); + RGB(3); + DST2bpp4(3,6); + DST1bpp4(3,6); EPILOG(4) // This is exactly the same code as yuv2rgb_c_32 except for the types of // r, g, b, dst_1, dst_2 PROLOG(yuv2rgb_c_4b, uint8_t) - RGB(0); - DST1(0); - DST2(0); + RGB(0); + DST1(0); + DST2(0); - RGB(1); - DST2(1); - DST1(1); + RGB(1); + DST2(1); + DST1(1); - RGB(2); - DST1(2); - DST2(2); + RGB(2); + DST1(2); + DST2(2); - RGB(3); - DST2(3); - DST1(3); + RGB(3); + DST2(3); + DST1(3); EPILOG(8) PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t) - const uint8_t *d64= dither_8x8_73[y&7]; - const uint8_t *d128=dither_8x8_220[y&7]; + const uint8_t *d64= dither_8x8_73[y&7]; + const uint8_t *d128=dither_8x8_220[y&7]; -#define DST1bpp4b(i,o) \ - Y = py_1[2*i]; \ - dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ - Y = py_1[2*i+1]; \ - dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]]; +#define DST1bpp4b(i,o) \ + Y = py_1[2*i]; \ + dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ + Y = py_1[2*i+1]; \ + dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]]; -#define DST2bpp4b(i,o) \ - Y = py_2[2*i]; \ - dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ - Y = py_2[2*i+1]; \ - dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]]; +#define DST2bpp4b(i,o) \ + Y = py_2[2*i]; \ + dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ + Y = py_2[2*i+1]; \ + dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]]; - RGB(0); - DST1bpp4b(0,0); - DST2bpp4b(0,0); + RGB(0); + DST1bpp4b(0,0); + DST2bpp4b(0,0); - RGB(1); - DST2bpp4b(1,2); - DST1bpp4b(1,2); + RGB(1); + DST2bpp4b(1,2); + DST1bpp4b(1,2); - RGB(2); - DST1bpp4b(2,4); - DST2bpp4b(2,4); + RGB(2); + DST1bpp4b(2,4); + DST2bpp4b(2,4); - RGB(3); - DST2bpp4b(3,6); - DST1bpp4b(3,6); + RGB(3); + DST2bpp4b(3,6); + DST1bpp4b(3,6); EPILOG(8) PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t) - const uint8_t *d128=dither_8x8_220[y&7]; - char out_1=0, out_2=0; - g= c->table_gU[128] + c->table_gV[128]; + const uint8_t *d128=dither_8x8_220[y&7]; + char out_1=0, out_2=0; + g= c->table_gU[128] + c->table_gV[128]; -#define DST1bpp1(i,o) \ - Y = py_1[2*i]; \ - out_1+= out_1 + g[Y+d128[0+o]]; \ - Y = py_1[2*i+1]; \ - out_1+= out_1 + g[Y+d128[1+o]]; +#define DST1bpp1(i,o) \ + Y = py_1[2*i]; \ + out_1+= out_1 + g[Y+d128[0+o]]; \ + Y = py_1[2*i+1]; \ + out_1+= out_1 + g[Y+d128[1+o]]; -#define DST2bpp1(i,o) \ - Y = py_2[2*i]; \ - out_2+= out_2 + g[Y+d128[8+o]]; \ - Y = py_2[2*i+1]; \ - out_2+= out_2 + g[Y+d128[9+o]]; +#define DST2bpp1(i,o) \ + Y = py_2[2*i]; \ + out_2+= out_2 + g[Y+d128[8+o]]; \ + Y = py_2[2*i+1]; \ + out_2+= out_2 + g[Y+d128[9+o]]; - DST1bpp1(0,0); - DST2bpp1(0,0); + DST1bpp1(0,0); + DST2bpp1(0,0); - DST2bpp1(1,2); - DST1bpp1(1,2); + DST2bpp1(1,2); + DST1bpp1(1,2); - DST1bpp1(2,4); - DST2bpp1(2,4); + DST1bpp1(2,4); + DST2bpp1(2,4); - DST2bpp1(3,6); - DST1bpp1(3,6); + DST2bpp1(3,6); + DST1bpp1(3,6); - dst_1[0]= out_1; - dst_2[0]= out_2; + dst_1[0]= out_1; + dst_2[0]= out_2; EPILOG(1) SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) { #if defined(HAVE_MMX2) || defined(HAVE_MMX) - if(c->flags & SWS_CPU_CAPS_MMX2){ - switch(c->dstFormat){ - case PIX_FMT_RGB32: return yuv420_rgb32_MMX2; - case PIX_FMT_BGR24: return yuv420_rgb24_MMX2; - case PIX_FMT_BGR565: return yuv420_rgb16_MMX2; - case PIX_FMT_BGR555: return yuv420_rgb15_MMX2; - } + if (c->flags & SWS_CPU_CAPS_MMX2){ + switch(c->dstFormat){ + case PIX_FMT_RGB32: return yuv420_rgb32_MMX2; + case PIX_FMT_BGR24: return yuv420_rgb24_MMX2; + case PIX_FMT_BGR565: return yuv420_rgb16_MMX2; + case PIX_FMT_BGR555: return yuv420_rgb15_MMX2; + } } - if(c->flags & SWS_CPU_CAPS_MMX){ - switch(c->dstFormat){ - case PIX_FMT_RGB32: return yuv420_rgb32_MMX; - case PIX_FMT_BGR24: return yuv420_rgb24_MMX; - case PIX_FMT_BGR565: return yuv420_rgb16_MMX; - case PIX_FMT_BGR555: return yuv420_rgb15_MMX; - } + if (c->flags & SWS_CPU_CAPS_MMX){ + switch(c->dstFormat){ + case PIX_FMT_RGB32: return yuv420_rgb32_MMX; + case PIX_FMT_BGR24: return yuv420_rgb24_MMX; + case PIX_FMT_BGR565: return yuv420_rgb16_MMX; + case PIX_FMT_BGR555: return yuv420_rgb15_MMX; + } } #endif #ifdef HAVE_MLIB { - SwsFunc t= yuv2rgb_init_mlib(c); - if(t) return t; + SwsFunc t= yuv2rgb_init_mlib(c); + if (t) return t; } #endif #ifdef HAVE_ALTIVEC if (c->flags & SWS_CPU_CAPS_ALTIVEC) { - SwsFunc t = yuv2rgb_init_altivec(c); - if(t) return t; + SwsFunc t = yuv2rgb_init_altivec(c); + if (t) return t; } #endif @@ -630,7 +630,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) case PIX_FMT_BGR4_BYTE: return yuv2rgb_c_4b_ordered_dither; case PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither; default: - assert(0); + assert(0); } return NULL; } @@ -638,9 +638,9 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) static int div_round (int dividend, int divisor) { if (dividend > 0) - return (dividend + (divisor>>1)) / divisor; + return (dividend + (divisor>>1)) / divisor; else - return -((-dividend + (divisor>>1)) / divisor); + return -((-dividend + (divisor>>1)) / divisor); } int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation) @@ -667,9 +667,9 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int64_t oy = 0; //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); - if(!fullRange){ - cy= (cy*255) / 219; - oy= 16<<16; + if (!fullRange){ + cy= (cy*255) / 219; + oy= 16<<16; }else{ crv= (crv*224) / 255; cbu= (cbu*224) / 255; @@ -686,163 +686,163 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, oy -= 256*brightness; for (i = 0; i < 1024; i++) { - int j; + int j; - j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32; - j = (j < 0) ? 0 : ((j > 255) ? 255 : j); - table_Y[i] = j; + j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32; + j = (j < 0) ? 0 : ((j > 255) ? 255 : j); + table_Y[i] = j; } switch (bpp) { case 32: - table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t)); + table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t)); - entry_size = sizeof (uint32_t); - table_r = table_32 + 197; - table_b = table_32 + 197 + 685; - table_g = table_32 + 197 + 2*682; + entry_size = sizeof (uint32_t); + table_r = table_32 + 197; + table_b = table_32 + 197 + 685; + table_g = table_32 + 197 + 2*682; - for (i = -197; i < 256+197; i++) - ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0); - for (i = -132; i < 256+132; i++) - ((uint32_t *)table_g)[i] = table_Y[i+384] << 8; - for (i = -232; i < 256+232; i++) - ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16); - break; + for (i = -197; i < 256+197; i++) + ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0); + for (i = -132; i < 256+132; i++) + ((uint32_t *)table_g)[i] = table_Y[i+384] << 8; + for (i = -232; i < 256+232; i++) + ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16); + break; case 24: - table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t)); + table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t)); - entry_size = sizeof (uint8_t); - table_r = table_g = table_b = table_8 + 232; + entry_size = sizeof (uint8_t); + table_r = table_g = table_b = table_8 + 232; - for (i = -232; i < 256+232; i++) - ((uint8_t * )table_b)[i] = table_Y[i+384]; - break; + for (i = -232; i < 256+232; i++) + ((uint8_t * )table_b)[i] = table_Y[i+384]; + break; case 15: case 16: - table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t)); + table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t)); - entry_size = sizeof (uint16_t); - table_r = table_16 + 197; - table_b = table_16 + 197 + 685; - table_g = table_16 + 197 + 2*682; + entry_size = sizeof (uint16_t); + table_r = table_16 + 197; + table_b = table_16 + 197 + 685; + table_g = table_16 + 197 + 2*682; - for (i = -197; i < 256+197; i++) { - int j = table_Y[i+384] >> 3; + for (i = -197; i < 256+197; i++) { + int j = table_Y[i+384] >> 3; - if (isRgb) - j <<= ((bpp==16) ? 11 : 10); + if (isRgb) + j <<= ((bpp==16) ? 11 : 10); - ((uint16_t *)table_r)[i] = j; - } - for (i = -132; i < 256+132; i++) { - int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3); + ((uint16_t *)table_r)[i] = j; + } + for (i = -132; i < 256+132; i++) { + int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3); - ((uint16_t *)table_g)[i] = j << 5; - } - for (i = -232; i < 256+232; i++) { - int j = table_Y[i+384] >> 3; + ((uint16_t *)table_g)[i] = j << 5; + } + for (i = -232; i < 256+232; i++) { + int j = table_Y[i+384] >> 3; - if (!isRgb) - j <<= ((bpp==16) ? 11 : 10); + if (!isRgb) + j <<= ((bpp==16) ? 11 : 10); - ((uint16_t *)table_b)[i] = j; - } - break; + ((uint16_t *)table_b)[i] = j; + } + break; case 8: - table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); + table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); - entry_size = sizeof (uint8_t); - table_r = table_332 + 197; - table_b = table_332 + 197 + 685; - table_g = table_332 + 197 + 2*682; + entry_size = sizeof (uint8_t); + table_r = table_332 + 197; + table_b = table_332 + 197 + 685; + table_g = table_332 + 197 + 2*682; - for (i = -197; i < 256+197; i++) { - int j = (table_Y[i+384 - 16] + 18)/36; + for (i = -197; i < 256+197; i++) { + int j = (table_Y[i+384 - 16] + 18)/36; - if (isRgb) - j <<= 5; + if (isRgb) + j <<= 5; - ((uint8_t *)table_r)[i] = j; - } - for (i = -132; i < 256+132; i++) { - int j = (table_Y[i+384 - 16] + 18)/36; + ((uint8_t *)table_r)[i] = j; + } + for (i = -132; i < 256+132; i++) { + int j = (table_Y[i+384 - 16] + 18)/36; - if (!isRgb) - j <<= 1; + if (!isRgb) + j <<= 1; - ((uint8_t *)table_g)[i] = j << 2; - } - for (i = -232; i < 256+232; i++) { - int j = (table_Y[i+384 - 37] + 43)/85; + ((uint8_t *)table_g)[i] = j << 2; + } + for (i = -232; i < 256+232; i++) { + int j = (table_Y[i+384 - 37] + 43)/85; - if (!isRgb) - j <<= 6; + if (!isRgb) + j <<= 6; - ((uint8_t *)table_b)[i] = j; - } - break; + ((uint8_t *)table_b)[i] = j; + } + break; case 4: case 4|128: - table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); + table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); - entry_size = sizeof (uint8_t); - table_r = table_121 + 197; - table_b = table_121 + 197 + 685; - table_g = table_121 + 197 + 2*682; + entry_size = sizeof (uint8_t); + table_r = table_121 + 197; + table_b = table_121 + 197 + 685; + table_g = table_121 + 197 + 2*682; - for (i = -197; i < 256+197; i++) { - int j = table_Y[i+384 - 110] >> 7; + for (i = -197; i < 256+197; i++) { + int j = table_Y[i+384 - 110] >> 7; - if (isRgb) - j <<= 3; + if (isRgb) + j <<= 3; - ((uint8_t *)table_r)[i] = j; - } - for (i = -132; i < 256+132; i++) { - int j = (table_Y[i+384 - 37]+ 43)/85; + ((uint8_t *)table_r)[i] = j; + } + for (i = -132; i < 256+132; i++) { + int j = (table_Y[i+384 - 37]+ 43)/85; - ((uint8_t *)table_g)[i] = j << 1; - } - for (i = -232; i < 256+232; i++) { - int j =table_Y[i+384 - 110] >> 7; + ((uint8_t *)table_g)[i] = j << 1; + } + for (i = -232; i < 256+232; i++) { + int j =table_Y[i+384 - 110] >> 7; - if (!isRgb) - j <<= 3; + if (!isRgb) + j <<= 3; - ((uint8_t *)table_b)[i] = j; - } - break; + ((uint8_t *)table_b)[i] = j; + } + break; case 1: - table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t)); + table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t)); - entry_size = sizeof (uint8_t); - table_g = table_1; - table_r = table_b = NULL; + entry_size = sizeof (uint8_t); + table_g = table_1; + table_r = table_b = NULL; - for (i = 0; i < 256+256; i++) { - int j = table_Y[i + 384 - 110]>>7; + for (i = 0; i < 256+256; i++) { + int j = table_Y[i + 384 - 110]>>7; - ((uint8_t *)table_g)[i] = j; - } - break; + ((uint8_t *)table_g)[i] = j; + } + break; default: - table_start= NULL; - av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp); - //free mem? - return -1; + table_start= NULL; + av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp); + //free mem? + return -1; } for (i = 0; i < 256; i++) { - c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309); - c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309); - c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); - c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309); + c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309); + c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309); + c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); + c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309); } av_free(c->yuvTable); diff --git a/libswscale/yuv2rgb_altivec.c b/libswscale/yuv2rgb_altivec.c index 60cc0a4ac1..04e8e1711e 100644 --- a/libswscale/yuv2rgb_altivec.c +++ b/libswscale/yuv2rgb_altivec.c @@ -139,70 +139,70 @@ typedef signed char sbyte; static const vector unsigned char perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, - 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), + 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, - 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), + 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, - 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), + 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, - 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); + 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); -#define vec_merge3(x2,x1,x0,y0,y1,y2) \ -do { \ - typeof(x0) o0,o2,o3; \ - o0 = vec_mergeh (x0,x1); \ - y0 = vec_perm (o0, x2, perm_rgb_0);\ - o2 = vec_perm (o0, x2, perm_rgb_1);\ - o3 = vec_mergel (x0,x1); \ - y1 = vec_perm (o3,o2,perm_rgb_2); \ - y2 = vec_perm (o3,o2,perm_rgb_3); \ +#define vec_merge3(x2,x1,x0,y0,y1,y2) \ +do { \ + typeof(x0) o0,o2,o3; \ + o0 = vec_mergeh (x0,x1); \ + y0 = vec_perm (o0, x2, perm_rgb_0); \ + o2 = vec_perm (o0, x2, perm_rgb_1); \ + o3 = vec_mergel (x0,x1); \ + y1 = vec_perm (o3,o2,perm_rgb_2); \ + y2 = vec_perm (o3,o2,perm_rgb_3); \ } while(0) -#define vec_mstbgr24(x0,x1,x2,ptr) \ -do { \ - typeof(x0) _0,_1,_2; \ - vec_merge3 (x0,x1,x2,_0,_1,_2); \ - vec_st (_0, 0, ptr++); \ - vec_st (_1, 0, ptr++); \ - vec_st (_2, 0, ptr++); \ +#define vec_mstbgr24(x0,x1,x2,ptr) \ +do { \ + typeof(x0) _0,_1,_2; \ + vec_merge3 (x0,x1,x2,_0,_1,_2); \ + vec_st (_0, 0, ptr++); \ + vec_st (_1, 0, ptr++); \ + vec_st (_2, 0, ptr++); \ } while (0); -#define vec_mstrgb24(x0,x1,x2,ptr) \ -do { \ - typeof(x0) _0,_1,_2; \ - vec_merge3 (x2,x1,x0,_0,_1,_2); \ - vec_st (_0, 0, ptr++); \ - vec_st (_1, 0, ptr++); \ - vec_st (_2, 0, ptr++); \ +#define vec_mstrgb24(x0,x1,x2,ptr) \ +do { \ + typeof(x0) _0,_1,_2; \ + vec_merge3 (x2,x1,x0,_0,_1,_2); \ + vec_st (_0, 0, ptr++); \ + vec_st (_1, 0, ptr++); \ + vec_st (_2, 0, ptr++); \ } while (0); /* pack the pixels in rgb0 format msb R lsb 0 */ -#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ -do { \ - T _0,_1,_2,_3; \ - _0 = vec_mergeh (x0,x1); \ - _1 = vec_mergeh (x2,x3); \ - _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ - _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ - vec_st (_2, 0*16, (T *)ptr); \ - vec_st (_3, 1*16, (T *)ptr); \ - _0 = vec_mergel (x0,x1); \ - _1 = vec_mergel (x2,x3); \ - _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ - _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ - vec_st (_2, 2*16, (T *)ptr); \ - vec_st (_3, 3*16, (T *)ptr); \ - ptr += 4; \ +#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ +do { \ + T _0,_1,_2,_3; \ + _0 = vec_mergeh (x0,x1); \ + _1 = vec_mergeh (x2,x3); \ + _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ + _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ + vec_st (_2, 0*16, (T *)ptr); \ + vec_st (_3, 1*16, (T *)ptr); \ + _0 = vec_mergel (x0,x1); \ + _1 = vec_mergel (x2,x3); \ + _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ + _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ + vec_st (_2, 2*16, (T *)ptr); \ + vec_st (_3, 3*16, (T *)ptr); \ + ptr += 4; \ } while (0); /* | 1 0 1.4021 | | Y | | 1 -0.3441 -0.7142 |x| Cb| - | 1 1.7718 0 | | Cr| + | 1 1.7718 0 | | Cr| Y: [-128 127] @@ -216,51 +216,51 @@ do { \ #define vec_unh(x) \ - (vector signed short) \ - vec_perm(x,(typeof(x))AVV(0),\ - (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ - 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)) + (vector signed short) \ + vec_perm(x,(typeof(x))AVV(0),\ + (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ + 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)) #define vec_unl(x) \ - (vector signed short) \ - vec_perm(x,(typeof(x))AVV(0),\ - (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ - 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)) + (vector signed short) \ + vec_perm(x,(typeof(x))AVV(0),\ + (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ + 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)) #define vec_clip_s16(x) \ - vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\ - (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )) + vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\ + (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16)) #define vec_packclp(x,y) \ - (vector unsigned char)vec_packs \ - ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \ - (vector unsigned short)vec_max (y,(vector signed short) AVV(0))) + (vector unsigned char)vec_packs \ + ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \ + (vector unsigned short)vec_max (y,(vector signed short) AVV(0))) //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr) static inline void cvtyuvtoRGB (SwsContext *c, - vector signed short Y, vector signed short U, vector signed short V, - vector signed short *R, vector signed short *G, vector signed short *B) + vector signed short Y, vector signed short U, vector signed short V, + vector signed short *R, vector signed short *G, vector signed short *B) { - vector signed short vx,ux,uvx; + vector signed short vx,ux,uvx; - Y = vec_mradds (Y, c->CY, c->OY); - U = vec_sub (U,(vector signed short) - vec_splat((vector signed short)AVV(128),0)); - V = vec_sub (V,(vector signed short) - vec_splat((vector signed short)AVV(128),0)); + Y = vec_mradds (Y, c->CY, c->OY); + U = vec_sub (U,(vector signed short) + vec_splat((vector signed short)AVV(128),0)); + V = vec_sub (V,(vector signed short) + vec_splat((vector signed short)AVV(128),0)); - // ux = (CBU*(u<CSHIFT)+0x4000)>>15; - ux = vec_sl (U, c->CSHIFT); - *B = vec_mradds (ux, c->CBU, Y); + // ux = (CBU*(u<CSHIFT)+0x4000)>>15; + ux = vec_sl (U, c->CSHIFT); + *B = vec_mradds (ux, c->CBU, Y); - // vx = (CRV*(v<CSHIFT)+0x4000)>>15; - vx = vec_sl (V, c->CSHIFT); - *R = vec_mradds (vx, c->CRV, Y); + // vx = (CRV*(v<CSHIFT)+0x4000)>>15; + vx = vec_sl (V, c->CSHIFT); + *R = vec_mradds (vx, c->CRV, Y); - // uvx = ((CGU*u) + (CGV*v))>>15; - uvx = vec_mradds (U, c->CGU, Y); - *G = vec_mradds (V, c->CGV, uvx); + // uvx = ((CGU*u) + (CGV*v))>>15; + uvx = vec_mradds (U, c->CGU, Y); + *G = vec_mradds (V, c->CGV, uvx); } @@ -271,164 +271,168 @@ static inline void cvtyuvtoRGB (SwsContext *c, */ -#define DEFCSP420_CVT(name,out_pixels) \ -static int altivec_##name (SwsContext *c, \ - unsigned char **in, int *instrides, \ - int srcSliceY, int srcSliceH, \ - unsigned char **oplanes, int *outstrides) \ -{ \ - int w = c->srcW; \ - int h = srcSliceH; \ - int i,j; \ - int instrides_scl[3]; \ - vector unsigned char y0,y1; \ - \ - vector signed char u,v; \ - \ - vector signed short Y0,Y1,Y2,Y3; \ - vector signed short U,V; \ - vector signed short vx,ux,uvx; \ - vector signed short vx0,ux0,uvx0; \ - vector signed short vx1,ux1,uvx1; \ - vector signed short R0,G0,B0; \ - vector signed short R1,G1,B1; \ - vector unsigned char R,G,B; \ - \ - vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ - vector unsigned char align_perm; \ - \ - vector signed short \ - lCY = c->CY, \ - lOY = c->OY, \ - lCRV = c->CRV, \ - lCBU = c->CBU, \ - lCGU = c->CGU, \ - lCGV = c->CGV; \ - \ - vector unsigned short lCSHIFT = c->CSHIFT; \ - \ - ubyte *y1i = in[0]; \ - ubyte *y2i = in[0]+instrides[0]; \ - ubyte *ui = in[1]; \ - ubyte *vi = in[2]; \ - \ - vector unsigned char *oute \ - = (vector unsigned char *) \ - (oplanes[0]+srcSliceY*outstrides[0]); \ - vector unsigned char *outo \ - = (vector unsigned char *) \ - (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ - \ - \ - instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ - instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ - instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ - \ - \ - for (i=0;i>15 */ \ - ux = vec_sl (U, lCSHIFT); \ - ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \ - ux0 = vec_mergeh (ux,ux); \ - ux1 = vec_mergel (ux,ux); \ - \ - /* vx = (CRV*(v<>15; */ \ - vx = vec_sl (V, lCSHIFT); \ - vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \ - vx0 = vec_mergeh (vx,vx); \ - vx1 = vec_mergel (vx,vx); \ - \ - /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ - uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \ - uvx = vec_mradds (V, lCGV, uvx); \ - uvx0 = vec_mergeh (uvx,uvx); \ - uvx1 = vec_mergel (uvx,uvx); \ - \ - R0 = vec_add (Y0,vx0); \ - G0 = vec_add (Y0,uvx0); \ - B0 = vec_add (Y0,ux0); \ - R1 = vec_add (Y1,vx1); \ - G1 = vec_add (Y1,uvx1); \ - B1 = vec_add (Y1,ux1); \ - \ - R = vec_packclp (R0,R1); \ - G = vec_packclp (G0,G1); \ - B = vec_packclp (B0,B1); \ - \ - out_pixels(R,G,B,oute); \ - \ - R0 = vec_add (Y2,vx0); \ - G0 = vec_add (Y2,uvx0); \ - B0 = vec_add (Y2,ux0); \ - R1 = vec_add (Y3,vx1); \ - G1 = vec_add (Y3,uvx1); \ - B1 = vec_add (Y3,ux1); \ - R = vec_packclp (R0,R1); \ - G = vec_packclp (G0,G1); \ - B = vec_packclp (B0,B1); \ - \ - \ - out_pixels(R,G,B,outo); \ - \ - y1i += 16; \ - y2i += 16; \ - ui += 8; \ - vi += 8; \ - \ - } \ - \ - outo += (outstrides[0])>>4; \ - oute += (outstrides[0])>>4; \ - \ - ui += instrides_scl[1]; \ - vi += instrides_scl[2]; \ - y1i += instrides_scl[0]; \ - y2i += instrides_scl[0]; \ - } \ - return srcSliceH; \ +#define DEFCSP420_CVT(name,out_pixels) \ +static int altivec_##name (SwsContext *c, \ + unsigned char **in, int *instrides, \ + int srcSliceY, int srcSliceH, \ + unsigned char **oplanes, int *outstrides) \ +{ \ + int w = c->srcW; \ + int h = srcSliceH; \ + int i,j; \ + int instrides_scl[3]; \ + vector unsigned char y0,y1; \ + \ + vector signed char u,v; \ + \ + vector signed short Y0,Y1,Y2,Y3; \ + vector signed short U,V; \ + vector signed short vx,ux,uvx; \ + vector signed short vx0,ux0,uvx0; \ + vector signed short vx1,ux1,uvx1; \ + vector signed short R0,G0,B0; \ + vector signed short R1,G1,B1; \ + vector unsigned char R,G,B; \ + \ + vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ + vector unsigned char align_perm; \ + \ + vector signed short \ + lCY = c->CY, \ + lOY = c->OY, \ + lCRV = c->CRV, \ + lCBU = c->CBU, \ + lCGU = c->CGU, \ + lCGV = c->CGV; \ + \ + vector unsigned short lCSHIFT = c->CSHIFT; \ + \ + ubyte *y1i = in[0]; \ + ubyte *y2i = in[0]+instrides[0]; \ + ubyte *ui = in[1]; \ + ubyte *vi = in[2]; \ + \ + vector unsigned char *oute \ + = (vector unsigned char *) \ + (oplanes[0]+srcSliceY*outstrides[0]); \ + vector unsigned char *outo \ + = (vector unsigned char *) \ + (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ + \ + \ + instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ + instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ + instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ + \ + \ + for (i=0;i>15 */ \ + ux = vec_sl (U, lCSHIFT); \ + ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \ + ux0 = vec_mergeh (ux,ux); \ + ux1 = vec_mergel (ux,ux); \ + \ + /* vx = (CRV*(v<>15; */ \ + vx = vec_sl (V, lCSHIFT); \ + vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \ + vx0 = vec_mergeh (vx,vx); \ + vx1 = vec_mergel (vx,vx); \ + \ + /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ + uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \ + uvx = vec_mradds (V, lCGV, uvx); \ + uvx0 = vec_mergeh (uvx,uvx); \ + uvx1 = vec_mergel (uvx,uvx); \ + \ + R0 = vec_add (Y0,vx0); \ + G0 = vec_add (Y0,uvx0); \ + B0 = vec_add (Y0,ux0); \ + R1 = vec_add (Y1,vx1); \ + G1 = vec_add (Y1,uvx1); \ + B1 = vec_add (Y1,ux1); \ + \ + R = vec_packclp (R0,R1); \ + G = vec_packclp (G0,G1); \ + B = vec_packclp (B0,B1); \ + \ + out_pixels(R,G,B,oute); \ + \ + R0 = vec_add (Y2,vx0); \ + G0 = vec_add (Y2,uvx0); \ + B0 = vec_add (Y2,ux0); \ + R1 = vec_add (Y3,vx1); \ + G1 = vec_add (Y3,uvx1); \ + B1 = vec_add (Y3,ux1); \ + R = vec_packclp (R0,R1); \ + G = vec_packclp (G0,G1); \ + B = vec_packclp (B0,B1); \ + \ + \ + out_pixels(R,G,B,outo); \ + \ + y1i += 16; \ + y2i += 16; \ + ui += 8; \ + vi += 8; \ + \ + } \ + \ + outo += (outstrides[0])>>4; \ + oute += (outstrides[0])>>4; \ + \ + ui += instrides_scl[1]; \ + vi += instrides_scl[2]; \ + y1i += instrides_scl[0]; \ + y2i += instrides_scl[0]; \ + } \ + return srcSliceH; \ } @@ -444,150 +448,150 @@ DEFCSP420_CVT (yuv2_abgr, out_abgr) DEFCSP420_CVT (yuv2_bgra, out_bgra) #else static int altivec_yuv2_bgra32 (SwsContext *c, - unsigned char **in, int *instrides, - int srcSliceY, int srcSliceH, - unsigned char **oplanes, int *outstrides) + unsigned char **in, int *instrides, + int srcSliceY, int srcSliceH, + unsigned char **oplanes, int *outstrides) { - int w = c->srcW; - int h = srcSliceH; - int i,j; - int instrides_scl[3]; - vector unsigned char y0,y1; + int w = c->srcW; + int h = srcSliceH; + int i,j; + int instrides_scl[3]; + vector unsigned char y0,y1; - vector signed char u,v; + vector signed char u,v; - vector signed short Y0,Y1,Y2,Y3; - vector signed short U,V; - vector signed short vx,ux,uvx; - vector signed short vx0,ux0,uvx0; - vector signed short vx1,ux1,uvx1; - vector signed short R0,G0,B0; - vector signed short R1,G1,B1; - vector unsigned char R,G,B; + vector signed short Y0,Y1,Y2,Y3; + vector signed short U,V; + vector signed short vx,ux,uvx; + vector signed short vx0,ux0,uvx0; + vector signed short vx1,ux1,uvx1; + vector signed short R0,G0,B0; + vector signed short R1,G1,B1; + vector unsigned char R,G,B; - vector unsigned char *uivP, *vivP; - vector unsigned char align_perm; + vector unsigned char *uivP, *vivP; + vector unsigned char align_perm; - vector signed short - lCY = c->CY, - lOY = c->OY, - lCRV = c->CRV, - lCBU = c->CBU, - lCGU = c->CGU, - lCGV = c->CGV; + vector signed short + lCY = c->CY, + lOY = c->OY, + lCRV = c->CRV, + lCBU = c->CBU, + lCGU = c->CGU, + lCGV = c->CGV; - vector unsigned short lCSHIFT = c->CSHIFT; + vector unsigned short lCSHIFT = c->CSHIFT; - ubyte *y1i = in[0]; - ubyte *y2i = in[0]+w; - ubyte *ui = in[1]; - ubyte *vi = in[2]; + ubyte *y1i = in[0]; + ubyte *y2i = in[0]+w; + ubyte *ui = in[1]; + ubyte *vi = in[2]; - vector unsigned char *oute - = (vector unsigned char *) - (oplanes[0]+srcSliceY*outstrides[0]); - vector unsigned char *outo - = (vector unsigned char *) - (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); + vector unsigned char *oute + = (vector unsigned char *) + (oplanes[0]+srcSliceY*outstrides[0]); + vector unsigned char *outo + = (vector unsigned char *) + (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); - instrides_scl[0] = instrides[0]; - instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ - instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ + instrides_scl[0] = instrides[0]; + instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ + instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ - for (i=0;i>15 */ - ux = vec_sl (U, lCSHIFT); - ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); - ux0 = vec_mergeh (ux,ux); - ux1 = vec_mergel (ux,ux); + /* ux = (CBU*(u<>15 */ + ux = vec_sl (U, lCSHIFT); + ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); + ux0 = vec_mergeh (ux,ux); + ux1 = vec_mergel (ux,ux); - /* vx = (CRV*(v<>15; */ - vx = vec_sl (V, lCSHIFT); - vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); - vx0 = vec_mergeh (vx,vx); - vx1 = vec_mergel (vx,vx); - /* uvx = ((CGU*u) + (CGV*v))>>15 */ - uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); - uvx = vec_mradds (V, lCGV, uvx); - uvx0 = vec_mergeh (uvx,uvx); - uvx1 = vec_mergel (uvx,uvx); - R0 = vec_add (Y0,vx0); - G0 = vec_add (Y0,uvx0); - B0 = vec_add (Y0,ux0); - R1 = vec_add (Y1,vx1); - G1 = vec_add (Y1,uvx1); - B1 = vec_add (Y1,ux1); - R = vec_packclp (R0,R1); - G = vec_packclp (G0,G1); - B = vec_packclp (B0,B1); + /* vx = (CRV*(v<>15; */ + vx = vec_sl (V, lCSHIFT); + vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); + vx0 = vec_mergeh (vx,vx); + vx1 = vec_mergel (vx,vx); + /* uvx = ((CGU*u) + (CGV*v))>>15 */ + uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); + uvx = vec_mradds (V, lCGV, uvx); + uvx0 = vec_mergeh (uvx,uvx); + uvx1 = vec_mergel (uvx,uvx); + R0 = vec_add (Y0,vx0); + G0 = vec_add (Y0,uvx0); + B0 = vec_add (Y0,ux0); + R1 = vec_add (Y1,vx1); + G1 = vec_add (Y1,uvx1); + B1 = vec_add (Y1,ux1); + R = vec_packclp (R0,R1); + G = vec_packclp (G0,G1); + B = vec_packclp (B0,B1); - out_argb(R,G,B,oute); - R0 = vec_add (Y2,vx0); - G0 = vec_add (Y2,uvx0); - B0 = vec_add (Y2,ux0); - R1 = vec_add (Y3,vx1); - G1 = vec_add (Y3,uvx1); - B1 = vec_add (Y3,ux1); - R = vec_packclp (R0,R1); - G = vec_packclp (G0,G1); - B = vec_packclp (B0,B1); + out_argb(R,G,B,oute); + R0 = vec_add (Y2,vx0); + G0 = vec_add (Y2,uvx0); + B0 = vec_add (Y2,ux0); + R1 = vec_add (Y3,vx1); + G1 = vec_add (Y3,uvx1); + B1 = vec_add (Y3,ux1); + R = vec_packclp (R0,R1); + G = vec_packclp (G0,G1); + B = vec_packclp (B0,B1); - out_argb(R,G,B,outo); - y1i += 16; - y2i += 16; - ui += 8; - vi += 8; + out_argb(R,G,B,outo); + y1i += 16; + y2i += 16; + ui += 8; + vi += 8; + } + + outo += (outstrides[0])>>4; + oute += (outstrides[0])>>4; + + ui += instrides_scl[1]; + vi += instrides_scl[2]; + y1i += instrides_scl[0]; + y2i += instrides_scl[0]; } - - outo += (outstrides[0])>>4; - oute += (outstrides[0])>>4; - - ui += instrides_scl[1]; - vi += instrides_scl[2]; - y1i += instrides_scl[0]; - y2i += instrides_scl[0]; - } - return srcSliceH; + return srcSliceH; } #endif @@ -603,77 +607,77 @@ DEFCSP420_CVT (yuv2_bgr24, out_bgr24) // 0123 4567 89ab cdef static const vector unsigned char - demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, - 0x10,0x04,0x10,0x04, - 0x10,0x08,0x10,0x08, - 0x10,0x0c,0x10,0x0c), - demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, - 0x10,0x06,0x10,0x06, - 0x10,0x0A,0x10,0x0A, - 0x10,0x0E,0x10,0x0E), - demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, - 0x10,0x05,0x10,0x07, - 0x10,0x09,0x10,0x0B, - 0x10,0x0D,0x10,0x0F); + demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, + 0x10,0x04,0x10,0x04, + 0x10,0x08,0x10,0x08, + 0x10,0x0c,0x10,0x0c), + demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, + 0x10,0x06,0x10,0x06, + 0x10,0x0A,0x10,0x0A, + 0x10,0x0E,0x10,0x0E), + demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, + 0x10,0x05,0x10,0x07, + 0x10,0x09,0x10,0x0B, + 0x10,0x0D,0x10,0x0F); /* this is so I can play live CCIR raw video */ static int altivec_uyvy_rgb32 (SwsContext *c, - unsigned char **in, int *instrides, - int srcSliceY, int srcSliceH, - unsigned char **oplanes, int *outstrides) + unsigned char **in, int *instrides, + int srcSliceY, int srcSliceH, + unsigned char **oplanes, int *outstrides) { - int w = c->srcW; - int h = srcSliceH; - int i,j; - vector unsigned char uyvy; - vector signed short Y,U,V; - vector signed short R0,G0,B0,R1,G1,B1; - vector unsigned char R,G,B; - vector unsigned char *out; - ubyte *img; + int w = c->srcW; + int h = srcSliceH; + int i,j; + vector unsigned char uyvy; + vector signed short Y,U,V; + vector signed short R0,G0,B0,R1,G1,B1; + vector unsigned char R,G,B; + vector unsigned char *out; + ubyte *img; - img = in[0]; - out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); + img = in[0]; + out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); - for (i=0;iflags & SWS_CPU_CAPS_ALTIVEC)) + if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) + return NULL; + + /* + and this seems not to matter too much I tried a bunch of + videos with abnormal widths and mplayer crashes else where. + mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv + boom with X11 bad match. + + */ + if ((c->srcW & 0xf) != 0) return NULL; + + switch (c->srcFormat) { + case PIX_FMT_YUV410P: + case PIX_FMT_YUV420P: + /*case IMGFMT_CLPL: ??? */ + case PIX_FMT_GRAY8: + case PIX_FMT_NV12: + case PIX_FMT_NV21: + if ((c->srcH & 0x1) != 0) + return NULL; + + switch(c->dstFormat){ + case PIX_FMT_RGB24: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); + return altivec_yuv2_rgb24; + case PIX_FMT_BGR24: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); + return altivec_yuv2_bgr24; + case PIX_FMT_ARGB: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); + return altivec_yuv2_argb; + case PIX_FMT_ABGR: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); + return altivec_yuv2_abgr; + case PIX_FMT_RGBA: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); + return altivec_yuv2_rgba; + case PIX_FMT_BGRA: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); + return altivec_yuv2_bgra; + default: return NULL; + } + break; + + case PIX_FMT_UYVY422: + switch(c->dstFormat){ + case PIX_FMT_BGR32: + av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); + return altivec_uyvy_rgb32; + default: return NULL; + } + break; + + } return NULL; - - /* - and this seems not to matter too much I tried a bunch of - videos with abnormal widths and mplayer crashes else where. - mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv - boom with X11 bad match. - - */ - if ((c->srcW & 0xf) != 0) return NULL; - - switch (c->srcFormat) { - case PIX_FMT_YUV410P: - case PIX_FMT_YUV420P: - /*case IMGFMT_CLPL: ??? */ - case PIX_FMT_GRAY8: - case PIX_FMT_NV12: - case PIX_FMT_NV21: - if ((c->srcH & 0x1) != 0) - return NULL; - - switch(c->dstFormat){ - case PIX_FMT_RGB24: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); - return altivec_yuv2_rgb24; - case PIX_FMT_BGR24: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); - return altivec_yuv2_bgr24; - case PIX_FMT_ARGB: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); - return altivec_yuv2_argb; - case PIX_FMT_ABGR: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); - return altivec_yuv2_abgr; - case PIX_FMT_RGBA: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); - return altivec_yuv2_rgba; - case PIX_FMT_BGRA: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); - return altivec_yuv2_bgra; - default: return NULL; - } - break; - - case PIX_FMT_UYVY422: - switch(c->dstFormat){ - case PIX_FMT_BGR32: - av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); - return altivec_uyvy_rgb32; - default: return NULL; - } - break; - - } - return NULL; } static uint16_t roundToInt16(int64_t f){ - int r= (f + (1<<15))>>16; - if(r<-0x7FFF) return 0x8000; - else if(r> 0x7FFF) return 0x7FFF; - else return r; + int r= (f + (1<<15))>>16; + if (r<-0x7FFF) return 0x8000; + else if (r> 0x7FFF) return 0x7FFF; + else return r; } void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) { - union { - signed short tmp[8] __attribute__ ((aligned(16))); - vector signed short vec; - } buf; + union { + signed short tmp[8] __attribute__ ((aligned(16))); + vector signed short vec; + } buf; - buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy - buf.tmp[1] = -256*brightness; //oy - buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv - buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu - buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu - buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv + buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy + buf.tmp[1] = -256*brightness; //oy + buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv + buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu + buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu + buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv - c->CSHIFT = (vector unsigned short)vec_splat_u16(2); - c->CY = vec_splat ((vector signed short)buf.vec, 0); - c->OY = vec_splat ((vector signed short)buf.vec, 1); - c->CRV = vec_splat ((vector signed short)buf.vec, 2); - c->CBU = vec_splat ((vector signed short)buf.vec, 3); - c->CGU = vec_splat ((vector signed short)buf.vec, 4); - c->CGV = vec_splat ((vector signed short)buf.vec, 5); + c->CSHIFT = (vector unsigned short)vec_splat_u16(2); + c->CY = vec_splat ((vector signed short)buf.vec, 0); + c->OY = vec_splat ((vector signed short)buf.vec, 1); + c->CRV = vec_splat ((vector signed short)buf.vec, 2); + c->CBU = vec_splat ((vector signed short)buf.vec, 3); + c->CGU = vec_splat ((vector signed short)buf.vec, 4); + c->CGV = vec_splat ((vector signed short)buf.vec, 5); #if 0 -{ -int i; -char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; -for (i=0; i<6;i++) - printf("%s %d ", v[i],buf.tmp[i] ); - printf("\n"); -} + { + int i; + char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; + for (i=0; i<6; i++) + printf("%s %d ", v[i],buf.tmp[i] ); + printf("\n"); + } #endif - return; + return; } void altivec_yuv2packedX (SwsContext *c, - int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, - int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, int dstW, int dstY) + int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, + int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, + uint8_t *dest, int dstW, int dstY) { - int i,j; - vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; - vector signed short R0,G0,B0,R1,G1,B1; + int i,j; + vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; + vector signed short R0,G0,B0,R1,G1,B1; - vector unsigned char R,G,B; - vector unsigned char *out,*nout; + vector unsigned char R,G,B; + vector unsigned char *out,*nout; - vector signed short RND = vec_splat_s16(1<<3); - vector unsigned short SCL = vec_splat_u16(4); - unsigned long scratch[16] __attribute__ ((aligned (16))); + vector signed short RND = vec_splat_s16(1<<3); + vector unsigned short SCL = vec_splat_u16(4); + unsigned long scratch[16] __attribute__ ((aligned (16))); - vector signed short *YCoeffs, *CCoeffs; + vector signed short *YCoeffs, *CCoeffs; - YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; - CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; + YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; + CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; - out = (vector unsigned char *)dest; + out = (vector unsigned char *)dest; - for(i=0; idstFormat) { - case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; - case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; - case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; - case PIX_FMT_ARGB: out_argb (R,G,B,out); break; - case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; - case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; - default: - { - /* If this is reached, the caller should have called yuv2packedXinC - instead. */ - static int printed_error_message; - if(!printed_error_message) { - av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", - sws_format_name(c->dstFormat)); - printed_error_message=1; - } - return; + switch(c->dstFormat) { + case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; + case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; + case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; + case PIX_FMT_ARGB: out_argb (R,G,B,out); break; + case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; + case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; + default: + { + /* If this is reached, the caller should have called yuv2packedXinC + instead. */ + static int printed_error_message; + if (!printed_error_message) { + av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", + sws_format_name(c->dstFormat)); + printed_error_message=1; + } + return; + } } } - } - if (i < dstW) { - i -= 16; + if (i < dstW) { + i -= 16; - Y0 = RND; - Y1 = RND; - /* extract 16 coeffs from lumSrc */ - for(j=0; jdstFormat) { + case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; + case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; + case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; + case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; + case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; + case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; + default: + /* Unreachable, I think. */ + av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", + sws_format_name(c->dstFormat)); + return; + } + + memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); } - U = RND; - V = RND; - /* extract 8 coeffs from U,V */ - for(j=0; jdstFormat) { - case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; - case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; - case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; - case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; - case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; - case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; - default: - /* Unreachable, I think. */ - av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", - sws_format_name(c->dstFormat)); - return; - } - - memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); - } - } diff --git a/libswscale/yuv2rgb_mlib.c b/libswscale/yuv2rgb_mlib.c index 6d540a38b4..e80f1ad318 100644 --- a/libswscale/yuv2rgb_mlib.c +++ b/libswscale/yuv2rgb_mlib.c @@ -32,55 +32,55 @@ #include "swscale.h" static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } assert(srcStride[1] == srcStride[2]); mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, - srcSliceH, dstStride[0], srcStride[0], srcStride[1]); + srcSliceH, dstStride[0], srcStride[0], srcStride[1]); return srcSliceH; } static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } assert(srcStride[1] == srcStride[2]); mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, - srcSliceH, dstStride[0], srcStride[0], srcStride[1]); + srcSliceH, dstStride[0], srcStride[0], srcStride[1]); return srcSliceH; } static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } assert(srcStride[1] == srcStride[2]); mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, - srcSliceH, dstStride[0], srcStride[0], srcStride[1]); + srcSliceH, dstStride[0], srcStride[0], srcStride[1]); return srcSliceH; } SwsFunc yuv2rgb_init_mlib(SwsContext *c) { - switch(c->dstFormat){ - case PIX_FMT_RGB24: return mlib_YUV2RGB420_24; - case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32; - case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32; - default: return NULL; - } + switch(c->dstFormat){ + case PIX_FMT_RGB24: return mlib_YUV2RGB420_24; + case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32; + case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32; + default: return NULL; + } } diff --git a/libswscale/yuv2rgb_template.c b/libswscale/yuv2rgb_template.c index 2a725eee7a..66dbf72f18 100644 --- a/libswscale/yuv2rgb_template.c +++ b/libswscale/yuv2rgb_template.c @@ -47,169 +47,169 @@ #endif #define YUV2RGB \ - /* Do the multiply part of the conversion for even and odd pixels, - register usage: - mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, - mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, - mm6 -> Y even, mm7 -> Y odd */\ - /* convert the chroma part */\ - "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ - "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ + /* Do the multiply part of the conversion for even and odd pixels, + register usage: + mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, + mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, + mm6 -> Y even, mm7 -> Y odd */\ + /* convert the chroma part */\ + "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ + "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ \ - "psllw $3, %%mm0;" /* Promote precision */ \ - "psllw $3, %%mm1;" /* Promote precision */ \ + "psllw $3, %%mm0;" /* Promote precision */ \ + "psllw $3, %%mm1;" /* Promote precision */ \ \ - "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \ - "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \ + "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \ + "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \ \ - "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ - "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ + "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ + "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ \ - "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \ - "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \ + "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \ + "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \ \ - "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\ - "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\ + "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\ + "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\ \ - "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\ + "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\ \ - /* convert the luma part */\ - "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\ - "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\ + /* convert the luma part */\ + "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\ + "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\ \ - "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\ + "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\ \ - "psllw $3, %%mm6;" /* Promote precision */\ - "psllw $3, %%mm7;" /* Promote precision */\ + "psllw $3, %%mm6;" /* Promote precision */\ + "psllw $3, %%mm7;" /* Promote precision */\ \ - "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\ - "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\ + "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\ + "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\ \ - "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\ - "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\ + "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\ + "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\ \ - /* Do the addition part of the conversion for even and odd pixels, - register usage: - mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, - mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, - mm6 -> Y even, mm7 -> Y odd */\ - "movq %%mm0, %%mm3;" /* Copy Cblue */\ - "movq %%mm1, %%mm4;" /* Copy Cred */\ - "movq %%mm2, %%mm5;" /* Copy Cgreen */\ + /* Do the addition part of the conversion for even and odd pixels, + register usage: + mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, + mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, + mm6 -> Y even, mm7 -> Y odd */\ + "movq %%mm0, %%mm3;" /* Copy Cblue */\ + "movq %%mm1, %%mm4;" /* Copy Cred */\ + "movq %%mm2, %%mm5;" /* Copy Cgreen */\ \ - "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\ - "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\ + "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\ + "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\ \ - "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\ - "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\ + "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\ + "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\ \ - "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\ - "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\ + "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\ + "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\ \ - /* Limit RGB even to 0..255 */\ - "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\ - "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\ - "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\ + /* Limit RGB even to 0..255 */\ + "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\ + "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\ + "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\ \ - /* Limit RGB odd to 0..255 */\ - "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\ - "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\ - "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\ + /* Limit RGB odd to 0..255 */\ + "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\ + "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\ + "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\ \ - /* Interleave RGB even and odd */\ - "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\ - "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\ - "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\ + /* Interleave RGB even and odd */\ + "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\ + "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\ + "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ int y, h_size; if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } h_size= (c->dstW+7)&~7; if(h_size*2 > FFABS(dstStride[0])) h_size-=8; __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); -//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], -//srcStride[0],srcStride[1],srcStride[2],dstStride[0]); + //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], + //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); for (y= 0; y>1)*srcStride[1]; - uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; - long index= -h_size/2; + uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; + uint8_t *_py = src[0] + y*srcStride[0]; + uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; + uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; + long index= -h_size/2; - b5Dither= dither8[y&1]; - g6Dither= dither4[y&1]; - g5Dither= dither8[y&1]; - r5Dither= dither8[(y+1)&1]; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ - __asm__ __volatile__ ( - /* load data for start of next scan line */ - "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ -// ".balign 16 \n\t" - "1: \n\t" -/* no speed diference on my p3@500 with prefetch, - * if it is faster for anyone with -benchmark then tell me - PREFETCH" 64(%0) \n\t" - PREFETCH" 64(%1) \n\t" - PREFETCH" 64(%2) \n\t" -*/ + b5Dither= dither8[y&1]; + g6Dither= dither4[y&1]; + g5Dither= dither8[y&1]; + r5Dither= dither8[(y+1)&1]; + /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 + pixels in each iteration */ + __asm__ __volatile__ ( + /* load data for start of next scan line */ + "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + //".balign 16 \n\t" + "1: \n\t" + /* no speed diference on my p3@500 with prefetch, + * if it is faster for anyone with -benchmark then tell me + PREFETCH" 64(%0) \n\t" + PREFETCH" 64(%1) \n\t" + PREFETCH" 64(%2) \n\t" + */ YUV2RGB #ifdef DITHER1XBPP - "paddusb "MANGLE(b5Dither)", %%mm0;" - "paddusb "MANGLE(g6Dither)", %%mm2;" - "paddusb "MANGLE(r5Dither)", %%mm1;" + "paddusb "MANGLE(b5Dither)", %%mm0;" + "paddusb "MANGLE(g6Dither)", %%mm2;" + "paddusb "MANGLE(r5Dither)", %%mm1;" #endif - /* mask unneeded bits off */ - "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ - "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ - "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ + /* mask unneeded bits off */ + "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ + "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ + "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ - "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ - "pxor %%mm4, %%mm4;" /* zero mm4 */ + "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ + "pxor %%mm4, %%mm4;" /* zero mm4 */ - "movq %%mm0, %%mm5;" /* Copy B7-B0 */ - "movq %%mm2, %%mm7;" /* Copy G7-G0 */ + "movq %%mm0, %%mm5;" /* Copy B7-B0 */ + "movq %%mm2, %%mm7;" /* Copy G7-G0 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ - "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ - "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ + /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ + "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ - "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ - "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ + "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ + "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ - "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ - MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ + "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ - "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ - "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ + /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ + "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ - "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ - "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ + "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ - "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ + "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ + MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ - "add $16, %1 \n\t" - "add $4, %0 \n\t" - " js 1b \n\t" + "add $16, %1 \n\t" + "add $4, %0 \n\t" + " js 1b \n\t" - : "+r" (index), "+r" (_image) - : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) - ); + : "+r" (index), "+r" (_image) + : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) + ); } __asm__ __volatile__ (EMMS); @@ -218,88 +218,88 @@ YUV2RGB } static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ int y, h_size; if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } h_size= (c->dstW+7)&~7; if(h_size*2 > FFABS(dstStride[0])) h_size-=8; __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); -//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], -//srcStride[0],srcStride[1],srcStride[2],dstStride[0]); + //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], + //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); for (y= 0; y>1)*srcStride[1]; - uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; - long index= -h_size/2; + uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; + uint8_t *_py = src[0] + y*srcStride[0]; + uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; + uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; + long index= -h_size/2; - b5Dither= dither8[y&1]; - g6Dither= dither4[y&1]; - g5Dither= dither8[y&1]; - r5Dither= dither8[(y+1)&1]; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ - __asm__ __volatile__ ( - /* load data for start of next scan line */ - "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ -// ".balign 16 \n\t" - "1: \n\t" + b5Dither= dither8[y&1]; + g6Dither= dither4[y&1]; + g5Dither= dither8[y&1]; + r5Dither= dither8[(y+1)&1]; + /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 + pixels in each iteration */ + __asm__ __volatile__ ( + /* load data for start of next scan line */ + "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + //".balign 16 \n\t" + "1: \n\t" YUV2RGB #ifdef DITHER1XBPP - "paddusb "MANGLE(b5Dither)", %%mm0 \n\t" - "paddusb "MANGLE(g5Dither)", %%mm2 \n\t" - "paddusb "MANGLE(r5Dither)", %%mm1 \n\t" + "paddusb "MANGLE(b5Dither)", %%mm0 \n\t" + "paddusb "MANGLE(g5Dither)", %%mm2 \n\t" + "paddusb "MANGLE(r5Dither)", %%mm1 \n\t" #endif - /* mask unneeded bits off */ - "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ - "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ - "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ + /* mask unneeded bits off */ + "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ + "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ + "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ - "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ - "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ - "pxor %%mm4, %%mm4;" /* zero mm4 */ + "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ + "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ + "pxor %%mm4, %%mm4;" /* zero mm4 */ - "movq %%mm0, %%mm5;" /* Copy B7-B0 */ - "movq %%mm2, %%mm7;" /* Copy G7-G0 */ + "movq %%mm0, %%mm5;" /* Copy B7-B0 */ + "movq %%mm2, %%mm7;" /* Copy G7-G0 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ - "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ - "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ + /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ + "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ - "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ - "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ + "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ + "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ - "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ - MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ + "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ - /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ - "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ - "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ + /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ + "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ + "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ - "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ - "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ + "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ - "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ + "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ + MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ - "add $16, %1 \n\t" - "add $4, %0 \n\t" - " js 1b \n\t" - : "+r" (index), "+r" (_image) - : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) - ); + "add $16, %1 \n\t" + "add $4, %0 \n\t" + " js 1b \n\t" + : "+r" (index), "+r" (_image) + : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) + ); } __asm__ __volatile__ (EMMS); @@ -307,12 +307,12 @@ YUV2RGB } static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ int y, h_size; if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } h_size= (c->dstW+7)&~7; @@ -321,131 +321,131 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); for (y= 0; y>1)*srcStride[1]; - uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; - long index= -h_size/2; + uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; + uint8_t *_py = src[0] + y*srcStride[0]; + uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; + uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; + long index= -h_size/2; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ - __asm__ __volatile__ ( - /* load data for start of next scan line */ - "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ -// ".balign 16 \n\t" - "1: \n\t" + /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 + pixels in each iteration */ + __asm__ __volatile__ ( + /* load data for start of next scan line */ + "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + //".balign 16 \n\t" + "1: \n\t" YUV2RGB - /* mm0=B, %%mm2=G, %%mm1=R */ + /* mm0=B, %%mm2=G, %%mm1=R */ #ifdef HAVE_MMX2 - "movq "MANGLE(M24A)", %%mm4 \n\t" - "movq "MANGLE(M24C)", %%mm7 \n\t" - "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ - "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ - "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ + "movq "MANGLE(M24A)", %%mm4 \n\t" + "movq "MANGLE(M24C)", %%mm7 \n\t" + "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ + "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ + "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ - "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ - "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ - "pand %%mm7, %%mm6 \n\t" /* R1 R0 */ + "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ + "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ + "pand %%mm7, %%mm6 \n\t" /* R1 R0 */ - "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ - "por %%mm5, %%mm6 \n\t" - "por %%mm3, %%mm6 \n\t" - MOVNTQ" %%mm6, (%1) \n\t" + "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ + "por %%mm5, %%mm6 \n\t" + "por %%mm3, %%mm6 \n\t" + MOVNTQ" %%mm6, (%1) \n\t" - "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ - "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ - "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ - "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ + "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ + "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ + "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ + "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ - "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */ - "pand %%mm7, %%mm3 \n\t" /* G4 G3 */ - "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ + "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */ + "pand %%mm7, %%mm3 \n\t" /* G4 G3 */ + "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ - "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ - "por %%mm3, %%mm6 \n\t" - MOVNTQ" %%mm6, 8(%1) \n\t" + "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ + "por %%mm3, %%mm6 \n\t" + MOVNTQ" %%mm6, 8(%1) \n\t" - "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */ - "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ - "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ - "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */ + "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ + "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ + "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "pand %%mm7, %%mm5 \n\t" /* B7 B6 */ - "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ - "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */ - "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "pand %%mm7, %%mm5 \n\t" /* B7 B6 */ + "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ + "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */ + "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \ - "por %%mm5, %%mm3 \n\t" - "por %%mm3, %%mm6 \n\t" - MOVNTQ" %%mm6, 16(%1) \n\t" - "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ - "pxor %%mm4, %%mm4 \n\t" + "por %%mm5, %%mm3 \n\t" + "por %%mm3, %%mm6 \n\t" + MOVNTQ" %%mm6, 16(%1) \n\t" + "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + "pxor %%mm4, %%mm4 \n\t" #else - "pxor %%mm4, %%mm4 \n\t" - "movq %%mm0, %%mm5 \n\t" /* B */ - "movq %%mm1, %%mm6 \n\t" /* R */ - "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ - "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ - "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ - "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ - "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ - "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ - "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ - "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ - "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ - "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ + "pxor %%mm4, %%mm4 \n\t" + "movq %%mm0, %%mm5 \n\t" /* B */ + "movq %%mm1, %%mm6 \n\t" /* R */ + "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ + "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ + "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ + "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ + "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ + "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ + "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ + "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ + "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ + "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ - "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ - "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ - "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ - "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ + "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ + "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ + "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ + "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ - "psllq $40, %%mm7 \n\t" /* RGB00000 0 */ - "psllq $40, %%mm0 \n\t" /* RGB00000 1 */ - "psllq $40, %%mm5 \n\t" /* RGB00000 2 */ - "psllq $40, %%mm3 \n\t" /* RGB00000 3 */ + "psllq $40, %%mm7 \n\t" /* RGB00000 0 */ + "psllq $40, %%mm0 \n\t" /* RGB00000 1 */ + "psllq $40, %%mm5 \n\t" /* RGB00000 2 */ + "psllq $40, %%mm3 \n\t" /* RGB00000 3 */ - "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ - "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ - "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ - "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ + "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ + "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ + "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ + "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ - "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ - "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ - "psllq $40, %%mm0 \n\t" /* GB000000 1 */ - "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ - MOVNTQ" %%mm7, (%1) \n\t" + "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ + "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ + "psllq $40, %%mm0 \n\t" /* GB000000 1 */ + "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ + MOVNTQ" %%mm7, (%1) \n\t" - "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ - "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ - "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ - "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ - MOVNTQ" %%mm6, 8(%1) \n\t" + "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ + "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ + "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ + "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ + MOVNTQ" %%mm6, 8(%1) \n\t" - "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ - "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ - "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ - "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ - MOVNTQ" %%mm1, 16(%1) \n\t" + "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ + "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ + "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ + MOVNTQ" %%mm1, 16(%1) \n\t" - "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "pxor %%mm4, %%mm4 \n\t" + "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "pxor %%mm4, %%mm4 \n\t" #endif - "add $24, %1 \n\t" - "add $4, %0 \n\t" - " js 1b \n\t" + "add $24, %1 \n\t" + "add $4, %0 \n\t" + " js 1b \n\t" - : "+r" (index), "+r" (_image) - : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) - ); + : "+r" (index), "+r" (_image) + : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) + ); } __asm__ __volatile__ (EMMS); @@ -453,12 +453,12 @@ YUV2RGB } static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ + int srcSliceH, uint8_t* dst[], int dstStride[]){ int y, h_size; if(c->srcFormat == PIX_FMT_YUV422P){ - srcStride[1] *= 2; - srcStride[2] *= 2; + srcStride[1] *= 2; + srcStride[2] *= 2; } h_size= (c->dstW+7)&~7; @@ -467,71 +467,71 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); for (y= 0; y>1)*srcStride[1]; - uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; - long index= -h_size/2; + uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; + uint8_t *_py = src[0] + y*srcStride[0]; + uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; + uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; + long index= -h_size/2; - /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 - pixels in each iteration */ - __asm__ __volatile__ ( - /* load data for start of next scan line */ - "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ -// ".balign 16 \n\t" - "1: \n\t" + /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 + pixels in each iteration */ + __asm__ __volatile__ ( + /* load data for start of next scan line */ + "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + //".balign 16 \n\t" + "1: \n\t" YUV2RGB - /* convert RGB plane to RGB packed format, - mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, - mm4 -> GB, mm5 -> AR pixel 4-7, - mm6 -> GB, mm7 -> AR pixel 0-3 */ - "pxor %%mm3, %%mm3;" /* zero mm3 */ + /* convert RGB plane to RGB packed format, + mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, + mm4 -> GB, mm5 -> AR pixel 4-7, + mm6 -> GB, mm7 -> AR pixel 0-3 */ + "pxor %%mm3, %%mm3;" /* zero mm3 */ - "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ - "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ + "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ + "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ - "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ - "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ + "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ + "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ - "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ - "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ + "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ + "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ - "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */ - MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ + "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */ + MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ - "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ - "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ + "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ + "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ - "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */ - MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ + "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */ + MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ - "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ - "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ + "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ + "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ - "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */ - MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ + "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */ + MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ - "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ - "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ + "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ + "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ - "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */ - MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ + "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */ + MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ - "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ - "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ + "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ + "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ - "pxor %%mm4, %%mm4;" /* zero mm4 */ - "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ + "pxor %%mm4, %%mm4;" /* zero mm4 */ + "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ - "add $32, %1 \n\t" - "add $4, %0 \n\t" - " js 1b \n\t" + "add $32, %1 \n\t" + "add $4, %0 \n\t" + " js 1b \n\t" - : "+r" (index), "+r" (_image) - : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) - ); + : "+r" (index), "+r" (_image) + : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) + ); } __asm__ __volatile__ (EMMS);