From 4e3e333a79272944b40695166438359b376d7864 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 5 Jul 2011 12:49:11 -0700 Subject: [PATCH] swscale: error dithering for 16/9/10-bit to 8-bit. Based on a somewhat similar idea in FFmpeg's swscale copy. --- libswscale/swscale.c | 49 ++++++++--- libswscale/swscale_internal.h | 6 ++ libswscale/x86/swscale_template.c | 135 ++++++++++++++++++++++++++---- 3 files changed, 160 insertions(+), 30 deletions(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index db4d231e13..dd9f4a108f 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -182,6 +182,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={ { 77, 23, 60, 15, 72, 21, 56, 14, }, }; #endif +DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = { +{ 36, 68, 60, 92, 34, 66, 58, 90,}, +{ 100, 4,124, 28, 98, 2,122, 26,}, +{ 52, 84, 44, 76, 50, 82, 42, 74,}, +{ 116, 20,108, 12,114, 18,106, 10,}, +{ 32, 64, 56, 88, 38, 70, 62, 94,}, +{ 96, 0,120, 24,102, 6,126, 30,}, +{ 48, 80, 40, 72, 54, 86, 46, 78,}, +{ 112, 16,104, 8,118, 22,110, 14,}, +}; +DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = +{ 64, 64, 64, 64, 64, 64, 64, 64 }; static av_always_inline void yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, @@ -285,10 +297,11 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; int i; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; //FIXME Optimize (just quickly written not optimized..) 
for (i=0; ilumDither8, *chrDither = c->chrDither8; for (i=0; i>7; + int val = (lumSrc[i]+ lumDither[i & 7]) >> 7; yDest[i]= av_clip_uint8(val); } if (uDest) for (i=0; i>7; - int v=(chrVSrc[i]+64)>>7; + int u = (chrUSrc[i] + chrDither[i & 7]) >> 7; + int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7; uDest[i]= av_clip_uint8(u); vDest[i]= av_clip_uint8(v); } if (CONFIG_SWSCALE_ALPHA && aDest) for (i=0; i>7; + int val = (alpSrc[i] + lumDither[i & 7]) >> 7; aDest[i]= av_clip_uint8(val); } } @@ -359,11 +373,12 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, { uint8_t *yDest = dest[0], *uDest = dest[1]; enum PixelFormat dstFormat = c->dstFormat; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; //FIXME Optimize (just quickly written not optimized..) int i; for (i=0; iyuv2packed1; yuv2packed2_fn yuv2packed2 = c->yuv2packed2; yuv2packedX_fn yuv2packedX = c->yuv2packedX; + int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat); /* vars which will change and which we need to store back in the context */ int dstY= c->dstY; @@ -2401,6 +2417,9 @@ static int swScale(SwsContext *c, const uint8_t* src[], lastInChrBuf= -1; } + if (!should_dither) { + c->chrDither8 = c->lumDither8 = ff_sws_pb_64; + } lastDstY= dstY; for (;dstY < dstH; dstY++) { @@ -2490,6 +2509,10 @@ static int swScale(SwsContext *c, const uint8_t* src[], #if HAVE_MMX updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf); #endif + if (should_dither) { + c->chrDither8 = dither_8x8_128[chrDstY & 7]; + c->lumDither8 = dither_8x8_128[dstY & 7]; + } if (dstY >= dstH-2) { // hmm looks like we can't use MMX here without overwriting this array's tail find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX, diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index b3698a3d94..efb8aff088 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -321,6 +321,8 @@ typedef struct SwsContext { 
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48" #define UV_OFF "11*8+4*4*256*3+48" #define UV_OFFx2 "11*8+4*4*256*3+56" +#define DITHER16 "11*8+4*4*256*3+64" +#define DITHER32 "11*8+4*4*256*3+80" DECLARE_ALIGNED(8, uint64_t, redDither); DECLARE_ALIGNED(8, uint64_t, greenDither); @@ -345,6 +347,10 @@ typedef struct SwsContext { int32_t alpMmxFilter[4*MAX_FILTER_SIZE]; DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes + uint16_t dither16[8]; + uint32_t dither32[8]; + + const uint8_t *chrDither8, *lumDither8; #if HAVE_ALTIVEC vector signed short CY; diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 26cd2742a3..fd6ec3a793 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -37,8 +37,8 @@ #define YSCALEYUV2YV12X(offset, dest, end, pos) \ __asm__ volatile(\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ ".p2align 4 \n\t" /* FIXME Unroll? 
*/\ @@ -60,8 +60,8 @@ MOVNTQ(%%mm3, (%1, %3))\ "add $8, %3 \n\t"\ "cmp %2, %3 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ "jb 1b \n\t"\ @@ -70,6 +70,42 @@ : "%"REG_d, "%"REG_S\ ); +#if !COMPILE_TEMPLATE_MMX2 +static av_always_inline void +dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + "movq %%mm3, "DITHER16"+0(%1)\n\t" + "movq %%mm4, "DITHER16"+8(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + "movq %%mm3, "DITHER16"+0(%1)\n\t" + "movq %%mm4, "DITHER16"+8(%1)\n\t" + :: "r"(srcDither), "r"(&c->redDither) + ); + } +} +#endif + static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -79,12 +115,16 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, { uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? 
dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     if (uDest) {
         x86_reg uv_off = c->uv_offx2 >> 1;
+        dither_8to16(c, chrDither, 0);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+        dither_8to16(c, chrDither, 1);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
     }
+    dither_8to16(c, lumDither, 0);
     if (CONFIG_SWSCALE_ALPHA && aDest) {
         YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
     }
@@ -95,10 +135,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
 #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
     __asm__ volatile(\
         "lea " offset "(%0), %%"REG_d" \n\t"\
-        "pxor %%mm4, %%mm4 \n\t"\
-        "pxor %%mm5, %%mm5 \n\t"\
-        "pxor %%mm6, %%mm6 \n\t"\
-        "pxor %%mm7, %%mm7 \n\t"\
+        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
         "mov (%%"REG_d"), %%"REG_S" \n\t"\
         ".p2align 4 \n\t"\
         "1: \n\t"\
@@ -142,10 +182,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         "add $8, %3 \n\t"\
         "cmp %2, %3 \n\t"\
         "lea " offset "(%0), %%"REG_d" \n\t"\
-        "pxor %%mm4, %%mm4 \n\t"\
-        "pxor %%mm5, %%mm5 \n\t"\
-        "pxor %%mm6, %%mm6 \n\t"\
-        "pxor %%mm7, %%mm7 \n\t"\
+        "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+        "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+        "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+        "movq "DITHER32"+24(%0), %%mm7 \n\t"\
         "mov (%%"REG_d"), %%"REG_S" \n\t"\
         "jb 1b \n\t"\
         :: "r" (&c->redDither),\
@@ -153,6 +193,62 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         : "%"REG_a, "%"REG_d, "%"REG_S\
     );
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void /* Widens the 8 dither bytes at srcDither to eight 32-bit values */
+dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) /* (byte << 12), stored at DITHER32+0..+24 from &c->redDither (presumably c->dither32 - confirm) */
+{
+    if (rot) { /* rot != 0: rotate the bytes first, so output i is built from srcDither[(i + 3) & 7] */
+        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+                         "movq (%0), %%mm4\n\t"
+                         "movq %%mm4, %%mm5\n\t"
+                         "psrlq $24, %%mm4\n\t"
+                         "psllq $40, %%mm5\n\t"
+                         "por %%mm5, %%mm4\n\t"
+                         "movq %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t" /* bytes 0-3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm6\n\t" /* bytes 4-7 -> 16-bit words */
+                         "movq %%mm4, %%mm5\n\t"
+                         "movq %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t" /* entries 0,1 as 32-bit dwords */
+                         "punpckhwd %%mm0, %%mm5\n\t" /* entries 2,3 */
+                         "punpcklwd %%mm0, %%mm6\n\t" /* entries 4,5 */
+                         "punpckhwd %%mm0, %%mm7\n\t" /* entries 6,7 */
+                         "pslld $12, %%mm4\n\t" /* dword shift: the lanes are 32 bits wide here, a word shift (psllw) would truncate */
+                         "pslld $12, %%mm5\n\t"
+                         "pslld $12, %%mm6\n\t"
+                         "pslld $12, %%mm7\n\t"
+                         "movq %%mm4, "DITHER32"+0(%1)\n\t" /* store all four result registers, in entry order */
+                         "movq %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    } else { /* rot == 0: same widening, bytes taken in their original order */
+        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+                         "movq (%0), %%mm4\n\t"
+                         "movq %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t" /* bytes 0-3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm6\n\t" /* bytes 4-7 -> 16-bit words */
+                         "movq %%mm4, %%mm5\n\t"
+                         "movq %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t" /* entries 0,1 as 32-bit dwords */
+                         "punpckhwd %%mm0, %%mm5\n\t" /* entries 2,3 */
+                         "punpcklwd %%mm0, %%mm6\n\t" /* entries 4,5 */
+                         "punpckhwd %%mm0, %%mm7\n\t" /* entries 6,7 */
+                         "pslld $12, %%mm4\n\t" /* dword shift: the lanes are 32 bits wide here, a word shift (psllw) would truncate */
+                         "pslld $12, %%mm5\n\t"
+                         "pslld $12, %%mm6\n\t"
+                         "pslld $12, %%mm7\n\t"
+                         "movq %%mm4, "DITHER32"+0(%1)\n\t" /* store all four result registers, in entry order */
+                         "movq %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
@@ -162,12 +258,16 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ?
dest[3] : NULL; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; if (uDest) { x86_reg uv_off = c->uv_offx2 >> 1; + dither_8to32(c, chrDither, 0); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) + dither_8to32(c, chrDither, 1); YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } + dither_8to32(c, lumDither, 0); if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) } @@ -220,19 +320,20 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, chrVSrc + chrDstW, alpSrc + dstW }; x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW }; + const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; while (p--) { if (dst[p]) { + dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2); __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "psllw $6, %%mm7 \n\t" + "movq "DITHER16"+0(%3), %%mm6 \n\t" + "movq "DITHER16"+8(%3), %%mm7 \n\t" ".p2align 4 \n\t" /* FIXME Unroll? */ "1: \n\t" "movq (%0, %%"REG_a", 2), %%mm0 \n\t" "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "paddsw %%mm7, %%mm0 \n\t" + "paddsw %%mm6, %%mm0 \n\t" "paddsw %%mm7, %%mm1 \n\t" "psraw $7, %%mm0 \n\t" "psraw $7, %%mm1 \n\t" @@ -241,7 +342,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, "add $8, %%"REG_a" \n\t" "jnc 1b \n\t" :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) + "g" (-counter[p]), "r"(&c->redDither) : "%"REG_a ); }