swscale: error dithering for 16/9/10-bit to 8-bit.

Based on a somewhat similar idea in FFmpeg's swscale copy.
2011-07-05 12:49:11 -07:00 · 2011-07-05 12:49:11 -07:00 · 4e3e333a79
parent 7d7bacf0f1
commit 4e3e333a79
3 changed files with 160 additions and 30 deletions
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@ -182,6 +182,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 77,  23,  60,  15,  72,  21,  56,  14, },
 };
 #endif
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = { /* 8x8 dither matrix: every even value 0..126 occurs exactly once */
+{  36, 68, 60, 92, 34, 66, 58, 90,},
+{ 100,  4,124, 28, 98,  2,122, 26,},
+{  52, 84, 44, 76, 50, 82, 42, 74,},
+{ 116, 20,108, 12,114, 18,106, 10,},
+{  32, 64, 56, 88, 38, 70, 62, 94,},
+{  96,  0,120, 24,102,  6,126, 30,},
+{  48, 80, 40, 72, 54, 86, 46, 78,},
+{ 112, 16,104,  8,118, 22,110, 14,},
+}; /* swScale() selects one 8-entry row per output line with (dstY & 7) / (chrDstY & 7) */
+DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = /* flat 8-entry row used for lumDither8/chrDither8 when should_dither is 0 */
+{  64, 64, 64, 64, 64, 64, 64, 64 }; /* 64 (and 64 << 12 == 1 << 18) reproduces the fixed rounders the C writers used before */

 static av_always_inline void
 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
@ -285,10 +297,11 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
    int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    //FIXME Optimize (just quickly written not optimized..)
    for (i=0; i<dstW; i++) {
-        int val=1<<18;
+        int val = lumDither[i & 7] << 12;
        int j;
        for (j=0; j<lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];
@ -298,8 +311,8 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,

    if (uDest)
        for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
            int j;
            for (j=0; j<chrFilterSize; j++) {
                u += chrUSrc[j][i] * chrFilter[j];
@ -312,7 +325,7 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
-            int val=1<<18;
+            int val = lumDither[i & 7] << 12;
            int j;
            for (j=0; j<lumFilterSize; j++)
                val += alpSrc[j][i] * lumFilter[j];
@ -329,23 +342,24 @@ static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
    int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+64)>>7;
+        int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
        yDest[i]= av_clip_uint8(val);
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
-            int u=(chrUSrc[i]+64)>>7;
-            int v=(chrVSrc[i]+64)>>7;
+            int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
+            int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
            uDest[i]= av_clip_uint8(u);
            vDest[i]= av_clip_uint8(v);
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+64)>>7;
+            int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
            aDest[i]= av_clip_uint8(val);
        }
 }
@ -359,11 +373,12 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 {
    uint8_t *yDest = dest[0], *uDest = dest[1];
    enum PixelFormat dstFormat = c->dstFormat;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    //FIXME Optimize (just quickly written not optimized..)
    int i;
    for (i=0; i<dstW; i++) {
-        int val=1<<18;
+        int val = lumDither[i & 7] << 12;
        int j;
        for (j=0; j<lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];
@ -376,8 +391,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,

    if (dstFormat == PIX_FMT_NV12)
        for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
            int j;
            for (j=0; j<chrFilterSize; j++) {
                u += chrUSrc[j][i] * chrFilter[j];
@ -389,8 +404,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
        }
    else
        for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
            int j;
            for (j=0; j<chrFilterSize; j++) {
                u += chrUSrc[j][i] * chrFilter[j];
@ -2352,6 +2367,7 @@ static int swScale(SwsContext *c, const uint8_t* src[],
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;
+    int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
@ -2401,6 +2417,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
        lastInChrBuf= -1;
    }

+    if (!should_dither) {
+        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
+    }
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
@ -2490,6 +2509,10 @@ static int swScale(SwsContext *c, const uint8_t* src[],
 #if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
 #endif
+        if (should_dither) {
+            c->chrDither8 = dither_8x8_128[chrDstY & 7];
+            c->lumDither8 = dither_8x8_128[dstY & 7];
+        }
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@ -321,6 +321,8 @@ typedef struct SwsContext {
 #define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
 #define UV_OFF                "11*8+4*4*256*3+48"
 #define UV_OFFx2              "11*8+4*4*256*3+56"
+#define DITHER16              "11*8+4*4*256*3+64"
+#define DITHER32              "11*8+4*4*256*3+80"

    DECLARE_ALIGNED(8, uint64_t, redDither);
    DECLARE_ALIGNED(8, uint64_t, greenDither);
@ -345,6 +347,10 @@ typedef struct SwsContext {
    int32_t  alpMmxFilter[4*MAX_FILTER_SIZE];
    DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
    DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
+    DECLARE_ALIGNED(8, uint16_t, dither16)[8]; ///< MMX scratch addressed via DITHER16; 8-byte aligned so it sits at that offset even when the preceding ptrdiff_t is 4 bytes wide
+    DECLARE_ALIGNED(8, uint32_t, dither32)[8]; ///< MMX scratch addressed via DITHER32 (= DITHER16 + 16); 32 bytes, must not overlap the pointers that follow
+
+    const uint8_t *chrDither8, *lumDither8;

 #if HAVE_ALTIVEC
    vector signed short   CY;
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@ -37,8 +37,8 @@

 #define YSCALEYUV2YV12X(offset, dest, end, pos) \
    __asm__ volatile(\
-        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
-        "movq                             %%mm3, %%mm4      \n\t"\
+        "movq                  "DITHER16"+0(%0), %%mm3      \n\t"\
+        "movq                  "DITHER16"+8(%0), %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
@ -60,8 +60,8 @@
        MOVNTQ(%%mm3, (%1, %3))\
        "add                                 $8, %3         \n\t"\
        "cmp                                 %2, %3         \n\t"\
-        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
-        "movq                             %%mm3, %%mm4      \n\t"\
+        "movq                  "DITHER16"+0(%0), %%mm3      \n\t"\
+        "movq                  "DITHER16"+8(%0), %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
@ -70,6 +70,42 @@
        : "%"REG_d, "%"REG_S\
    );

+#if !COMPILE_TEMPLATE_MMX2 /* NOTE(review): presumably keeps this helper from being defined twice when the template is compiled again for MMX2 - confirm */
+static av_always_inline void
+dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) /* widen 8 dither bytes to 8 words, each >> 4, and store them at DITHER16 in *c */
+{
+    if (rot) { /* rot != 0: rotate the row by 3 bytes first, so out[i] = srcDither[(i + 3) & 7] as in the C chroma-V path */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, used for zero-extension */
+                         "movq       (%0), %%mm3\n\t" /* load the 8 source bytes */
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t" /* bytes 3..7 move down to positions 0..4 */
+                         "psllq       $40, %%mm4\n\t" /* bytes 0..2 move up to positions 5..7 */
+                         "por       %%mm4, %%mm3\n\t" /* mm3 = source rotated by 3 bytes */
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t" /* entries 0..3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm4\n\t" /* entries 4..7 -> 16-bit words */
+                         "psraw        $4, %%mm3\n\t" /* scale each word down by 16 */
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t" /* offsets are relative to &c->redDither (operand %1) */
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither) /* no clobber list: mm0, mm3 and mm4 are overwritten */
+                         );
+    } else { /* rot == 0: same conversion without the byte rotation */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, used for zero-extension */
+                         "movq       (%0), %%mm3\n\t" /* load the 8 source bytes */
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t" /* entries 0..3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm4\n\t" /* entries 4..7 -> 16-bit words */
+                         "psraw        $4, %%mm3\n\t" /* scale each word down by 16 */
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t" /* offsets are relative to &c->redDither (operand %1) */
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither) /* no clobber list: mm0, mm3 and mm4 are overwritten */
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
@ -79,12 +115,16 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
 {
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    if (uDest) {
        x86_reg uv_off = c->uv_offx2 >> 1;
+        dither_8to16(c, chrDither, 0);
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+        dither_8to16(c, chrDither, 1);
        YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
+    dither_8to16(c, lumDither, 0);
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
@ -95,10 +135,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
 #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
-        "pxor                             %%mm4, %%mm4      \n\t"\
-        "pxor                             %%mm5, %%mm5      \n\t"\
-        "pxor                             %%mm6, %%mm6      \n\t"\
-        "pxor                             %%mm7, %%mm7      \n\t"\
+        "movq                  "DITHER32"+0(%0), %%mm4      \n\t"\
+        "movq                  "DITHER32"+8(%0), %%mm5      \n\t"\
+        "movq                 "DITHER32"+16(%0), %%mm6      \n\t"\
+        "movq                 "DITHER32"+24(%0), %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
@ -142,10 +182,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
        "add                                 $8, %3         \n\t"\
        "cmp                                 %2, %3         \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
-        "pxor                             %%mm4, %%mm4      \n\t"\
-        "pxor                             %%mm5, %%mm5      \n\t"\
-        "pxor                             %%mm6, %%mm6      \n\t"\
-        "pxor                             %%mm7, %%mm7      \n\t"\
+        "movq                  "DITHER32"+0(%0), %%mm4      \n\t"\
+        "movq                  "DITHER32"+8(%0), %%mm5      \n\t"\
+        "movq                 "DITHER32"+16(%0), %%mm6      \n\t"\
+        "movq                 "DITHER32"+24(%0), %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
@ -153,6 +193,62 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );

+#if !COMPILE_TEMPLATE_MMX2 /* NOTE(review): presumably keeps this helper from being defined twice when the template is compiled again for MMX2 - confirm */
+static av_always_inline void
+dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) /* widen 8 dither bytes to 8 dwords, each << 12, and store them at DITHER32 in *c */
+{
+    if (rot) { /* rot != 0: rotate the row by 3 bytes first, so out[i] = srcDither[(i + 3) & 7] as in the C chroma-V path */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, used for zero-extension */
+                         "movq       (%0), %%mm4\n\t" /* load the 8 source bytes */
+                         "movq      %%mm4, %%mm5\n\t"
+                         "psrlq       $24, %%mm4\n\t" /* bytes 3..7 move down to positions 0..4 */
+                         "psllq       $40, %%mm5\n\t" /* bytes 0..2 move up to positions 5..7 */
+                         "por       %%mm5, %%mm4\n\t" /* mm4 = source rotated by 3 bytes */
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t" /* entries 0..3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm6\n\t" /* entries 4..7 -> 16-bit words */
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t" /* entries 0,1 -> 32-bit dwords */
+                         "punpckhwd %%mm0, %%mm5\n\t" /* entries 2,3 */
+                         "punpcklwd %%mm0, %%mm6\n\t" /* entries 4,5 */
+                         "punpckhwd %%mm0, %%mm7\n\t" /* entries 6,7 */
+                         "pslld       $12, %%mm4\n\t" /* dword shift: a word shift would truncate value << 12 to 16 bits */
+                         "pslld       $12, %%mm5\n\t"
+                         "pslld       $12, %%mm6\n\t"
+                         "pslld       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t" /* store all four result registers, in entry order */
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither) /* offsets are relative to &c->redDither; no clobber list, mm0 and mm4..mm7 are overwritten */
+                         );
+    } else { /* rot == 0: same conversion without the byte rotation */
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, used for zero-extension */
+                         "movq       (%0), %%mm4\n\t" /* load the 8 source bytes */
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t" /* entries 0..3 -> 16-bit words */
+                         "punpckhbw %%mm0, %%mm6\n\t" /* entries 4..7 -> 16-bit words */
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t" /* entries 0,1 -> 32-bit dwords */
+                         "punpckhwd %%mm0, %%mm5\n\t" /* entries 2,3 */
+                         "punpcklwd %%mm0, %%mm6\n\t" /* entries 4,5 */
+                         "punpckhwd %%mm0, %%mm7\n\t" /* entries 6,7 */
+                         "pslld       $12, %%mm4\n\t" /* dword shift: a word shift would truncate value << 12 to 16 bits */
+                         "pslld       $12, %%mm5\n\t"
+                         "pslld       $12, %%mm6\n\t"
+                         "pslld       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t" /* store all four result registers, in entry order */
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither) /* offsets are relative to &c->redDither; no clobber list, mm0 and mm4..mm7 are overwritten */
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
@ -162,12 +258,16 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
 {
    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    if (uDest) {
        x86_reg uv_off = c->uv_offx2 >> 1;
+        dither_8to32(c, chrDither, 0);
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+        dither_8to32(c, chrDither, 1);
        YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
    }
+    dither_8to32(c, lumDither, 0);
    if (CONFIG_SWSCALE_ALPHA && aDest) {
        YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
    }
@ -220,19 +320,20 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
        chrVSrc + chrDstW, alpSrc + dstW
    };
    x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;

    while (p--) {
        if (dst[p]) {
+            dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
            __asm__ volatile(
                "mov %2, %%"REG_a"                    \n\t"
-                "pcmpeqw %%mm7, %%mm7                 \n\t"
-                "psrlw                 $15, %%mm7     \n\t"
-                "psllw                  $6, %%mm7     \n\t"
+                "movq    "DITHER16"+0(%3), %%mm6      \n\t"
+                "movq    "DITHER16"+8(%3), %%mm7      \n\t"
                ".p2align                4            \n\t" /* FIXME Unroll? */
                "1:                                   \n\t"
                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
-                "paddsw             %%mm7, %%mm0      \n\t"
+                "paddsw             %%mm6, %%mm0      \n\t"
                "paddsw             %%mm7, %%mm1      \n\t"
                "psraw                 $7, %%mm0      \n\t"
                "psraw                 $7, %%mm1      \n\t"
@ -241,7 +342,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                "add                   $8, %%"REG_a"  \n\t"
                "jnc                   1b             \n\t"
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                   "g" (-counter[p])
+                   "g" (-counter[p]), "r"(&c->redDither)
                : "%"REG_a
            );
        }