From c59f9a68e2eda847538df367b5b9e7256247ef78 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 5 Jul 2011 12:49:11 -0700
Subject: [PATCH] swscale: error dithering for 16/9/10-bit to 8-bit.

Based on a somewhat similar idea in FFmpeg's swscale copy.
---
 libswscale/swscale.c              |  64 +++++++++-----
 libswscale/swscale_internal.h     |   9 +-
 libswscale/x86/swscale_template.c | 136 +++++++++++++++++++++++++-----
 3 files changed, 160 insertions(+), 49 deletions(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index b6455bb5e7..45098ef4c2 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -193,6 +193,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 77,  23,  60,  15,  72,  21,  56,  14, },
 };
 #endif
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
+{  36, 68, 60, 92, 34, 66, 58, 90,},
+{ 100,  4,124, 28, 98,  2,122, 26,},
+{  52, 84, 44, 76, 50, 82, 42, 74,},
+{ 116, 20,108, 12,114, 18,106, 10,},
+{  32, 64, 56, 88, 38, 70, 62, 94,},
+{  96,  0,120, 24,102,  6,126, 30,},
+{  48, 80, 40, 72, 54, 86, 46, 78,},
+{ 112, 16,104,  8,118, 22,110, 14,},
+};
+DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
+{  64, 64, 64, 64, 64, 64, 64, 64 };
 
 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 {
@@ -387,16 +399,16 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
                        const int16_t *chrFilter, const int16_t **chrUSrc,
                        const int16_t **chrVSrc,
                        int chrFilterSize, const int16_t **alpSrc,
-                       uint8_t *dest[4], int dstW, int chrDstW,
-                       const uint8_t *lumDither, const uint8_t *chrDither)
+                       uint8_t *dest[4], int dstW, int chrDstW)
 {
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
     int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     //FIXME Optimize (just quickly written not optimized..)
     for (i=0; i<dstW; i++) {
-        int val = lumDither[i&7] << 12;
+        int val = lumDither[i & 7] << 12;
         int j;
         for (j=0; j<lumFilterSize; j++)
             val += lumSrc[j][i] * lumFilter[j];
@@ -406,8 +418,8 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 
     if (uDest)
         for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i&7] << 12;
-            int v = chrDither[(i+3)&7] << 12;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
             int j;
             for (j=0; j<chrFilterSize; j++) {
                 u += chrUSrc[j][i] * chrFilter[j];
@@ -420,7 +432,7 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 
     if (CONFIG_SWSCALE_ALPHA && aDest)
         for (i=0; i<dstW; i++) {
-            int val = lumDither[i&7] << 12;
+            int val = lumDither[i & 7] << 12;
             int j;
             for (j=0; j<lumFilterSize; j++)
                 val += alpSrc[j][i] * lumFilter[j];
@@ -432,29 +444,29 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
                        const int16_t *chrUSrc, const int16_t *chrVSrc,
                        const int16_t *alpSrc,
-                       uint8_t *dest[4], int dstW, int chrDstW,
-                       const uint8_t *lumDither, const uint8_t *chrDither)
+                       uint8_t *dest[4], int dstW, int chrDstW)
 {
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
     int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+lumDither[i&7])>>7;
+        int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
         yDest[i]= av_clip_uint8(val);
     }
 
     if (uDest)
         for (i=0; i<chrDstW; i++) {
-            int u=(chrUSrc[i]+chrDither[i&7])>>7;
-            int v=(chrVSrc[i]+chrDither[(i+3)&7])>>7;
+            int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
+            int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
             uDest[i]= av_clip_uint8(u);
             vDest[i]= av_clip_uint8(v);
         }
 
     if (CONFIG_SWSCALE_ALPHA && aDest)
         for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+lumDither[i&7])>>7;
+            int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
             aDest[i]= av_clip_uint8(val);
         }
 }
@@ -464,16 +476,16 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
                         const int16_t *chrFilter, const int16_t **chrUSrc,
                         const int16_t **chrVSrc, int chrFilterSize,
                         const int16_t **alpSrc, uint8_t *dest[4],
-                        int dstW, int chrDstW,
-                        const uint8_t *lumDither, const uint8_t *chrDither)
+                        int dstW, int chrDstW)
 {
     uint8_t *yDest = dest[0], *uDest = dest[1];
     enum PixelFormat dstFormat = c->dstFormat;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     //FIXME Optimize (just quickly written not optimized..)
     int i;
     for (i=0; i<dstW; i++) {
-        int val = lumDither[i&7]<<12;
+        int val = lumDither[i & 7] << 12;
         int j;
         for (j=0; j<lumFilterSize; j++)
             val += lumSrc[j][i] * lumFilter[j];
@@ -486,8 +498,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 
     if (dstFormat == PIX_FMT_NV12)
         for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i&7]<<12;
-            int v = chrDither[(i+3)&7]<<12;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
             int j;
             for (j=0; j<chrFilterSize; j++) {
                 u += chrUSrc[j][i] * chrFilter[j];
@@ -499,8 +511,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
         }
     else
         for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i&7]<<12;
-            int v = chrDither[(i+3)&7]<<12;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
             int j;
             for (j=0; j<chrFilterSize; j++) {
                 u += chrUSrc[j][i] * chrFilter[j];
@@ -2523,6 +2535,7 @@ static int swScale(SwsContext *c, const uint8_t* src[],
     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
     int lastDstY;
     uint32_t *pal=c->pal_yuv;
+
     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
@@ -2578,6 +2591,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
         lastInChrBuf= -1;
     }
 
+    if (!should_dither) {
+        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
+    }
     lastDstY= dstY;
 
     for (;dstY < dstH; dstY++) {
@@ -2588,8 +2604,6 @@ static int swScale(SwsContext *c, const uint8_t* src[],
             dst[2] + dstStride[2] * chrDstY,
             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
         };
-        const uint8_t *lumDither= should_dither ? dithers[7][dstY   &7] : flat64;
-        const uint8_t *chrDither= should_dither ? dithers[7][chrDstY&7] : flat64;
 
         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
@@ -2669,6 +2683,10 @@ static int swScale(SwsContext *c, const uint8_t* src[],
 #if HAVE_MMX
         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
 #endif
+        if (should_dither) {
+            c->chrDither8 = dither_8x8_128[chrDstY & 7];
+            c->lumDither8 = dither_8x8_128[dstY & 7];
+        }
         if (dstY >= dstH-2) {
             // hmm looks like we can't use MMX here without overwriting this array's tail
             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
@@ -2689,13 +2707,13 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
-                             dest, dstW, chrDstW, lumDither, chrDither);
+                             dest, dstW, chrDstW);
                 } else { //General YV12
                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
                              lumSrcPtr, vLumFilterSize,
                              vChrFilter + chrDstY * vChrFilterSize,
                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                             alpSrcPtr, dest, dstW, chrDstW, lumDither, chrDither);
+                             alpSrcPtr, dest, dstW, chrDstW);
                 }
             } else {
                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index e19956c079..09e6ad4b01 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -75,8 +75,7 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[],
 typedef void (*yuv2planar1_fn) (struct SwsContext *c,
                                 const int16_t *lumSrc, const int16_t *chrUSrc,
                                 const int16_t *chrVSrc, const int16_t *alpSrc,
-                                uint8_t *dest[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither);
+                                uint8_t *dest[4], int dstW, int chrDstW);
 /**
  * Write one line of horizontally scaled Y/U/V/A to planar output
  * with multi-point vertical scaling between input pixels.
@@ -99,7 +98,7 @@ typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,  int chrFilterSize,
                                 const int16_t **alpSrc, uint8_t *dest[4],
-                                int dstW, int chrDstW, const uint8_t *lumDither, const uint8_t *chrDither);
+                                int dstW, int chrDstW);
 /**
  * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
  * output without any additional vertical scaling (or point-scaling). Note
@@ -323,7 +322,7 @@ typedef struct SwsContext {
 #define UV_OFF                "11*8+4*4*256*3+48"
 #define UV_OFFx2              "11*8+4*4*256*3+56"
 #define DITHER16              "11*8+4*4*256*3+64"
-#define DITHER32              "11*8+4*4*256*3+64+16"
+#define DITHER32              "11*8+4*4*256*3+80"
 
     DECLARE_ALIGNED(8, uint64_t, redDither);
     DECLARE_ALIGNED(8, uint64_t, greenDither);
@@ -351,6 +350,8 @@ typedef struct SwsContext {
     uint16_t dither16[8];
     uint32_t dither32[8];
 
+    const uint8_t *chrDither8, *lumDither8;
+
 #if HAVE_ALTIVEC
     vector signed short   CY;
     vector signed short   CRV;
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 23bc5148c7..87248e76fa 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -70,26 +70,62 @@
         : "%"REG_d, "%"REG_S\
     );
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t"
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         "psraw        $4, %%mm3\n\t"
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         "psraw        $4, %%mm3\n\t"
+                         "psraw        $4, %%mm4\n\t"
+                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
+                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,
-                             uint8_t *dest[4], int dstW, int chrDstW,
-                             const uint8_t *lumDither, const uint8_t *chrDither)
+                             uint8_t *dest[4], int dstW, int chrDstW)
 {
     int i;
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     if (uDest) {
         x86_reg uv_off = c->uv_offx2 >> 1;
-        for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
+        dither_8to16(c, chrDither, 0);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
-        for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
+        dither_8to16(c, chrDither, 1);
         YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
     }
-    for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
+    dither_8to16(c, lumDither, 0);
     if (CONFIG_SWSCALE_ALPHA && aDest) {
         YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
     }
@@ -104,10 +140,6 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         "movq                  "DITHER32"+8(%0), %%mm5      \n\t"\
         "movq                 "DITHER32"+16(%0), %%mm6      \n\t"\
         "movq                 "DITHER32"+24(%0), %%mm7      \n\t"\
-        "pxor                             %%mm4, %%mm4      \n\t"\
-        "pxor                             %%mm5, %%mm5      \n\t"\
-        "pxor                             %%mm6, %%mm6      \n\t"\
-        "pxor                             %%mm7, %%mm7      \n\t"\
         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
         ".p2align                             4             \n\t"\
         "1:                                                 \n\t"\
@@ -157,26 +189,87 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
         : "%"REG_a, "%"REG_d, "%"REG_S\
     );
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+int i;
+if(rot) for(i=0; i<8; i++) c->dither32[i] = srcDither[(i+3)&7]<<12;
+else    for(i=0; i<8; i++) c->dither32[i] = srcDither[i&7]<<12;
+return;
+
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm4\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "psrlq       $24, %%mm4\n\t"
+                         "psllq       $40, %%mm5\n\t"
+                         "por       %%mm5, %%mm4\n\t"
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t"
+                         "punpckhbw %%mm0, %%mm6\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t"
+                         "punpckhwd %%mm0, %%mm5\n\t"
+                         "punpcklwd %%mm0, %%mm6\n\t"
+                         "punpckhwd %%mm0, %%mm7\n\t"
+                         "psllw       $12, %%mm4\n\t"
+                         "psllw       $12, %%mm5\n\t"
+                         "psllw       $12, %%mm6\n\t"
+                         "psllw       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t"
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm4\n\t"
+                         "movq      %%mm4, %%mm6\n\t"
+                         "punpcklbw %%mm0, %%mm4\n\t"
+                         "punpckhbw %%mm0, %%mm6\n\t"
+                         "movq      %%mm4, %%mm5\n\t"
+                         "movq      %%mm6, %%mm7\n\t"
+                         "punpcklwd %%mm0, %%mm4\n\t"
+                         "punpckhwd %%mm0, %%mm5\n\t"
+                         "punpcklwd %%mm0, %%mm6\n\t"
+                         "punpckhwd %%mm0, %%mm7\n\t"
+                         "psllw       $12, %%mm4\n\t"
+                         "psllw       $12, %%mm5\n\t"
+                         "psllw       $12, %%mm6\n\t"
+                         "psllw       $12, %%mm7\n\t"
+                         "movq      %%mm4, "DITHER32"+0(%1)\n\t"
+                         "movq      %%mm5, "DITHER32"+8(%1)\n\t"
+                         "movq      %%mm6, "DITHER32"+16(%1)\n\t"
+                         "movq      %%mm7, "DITHER32"+24(%1)\n\t"
+                         :: "r"(srcDither), "r"(&c->redDither)
+                         );
+    }
+}
+#endif
+
 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
-                                uint8_t *dest[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither)
+                                uint8_t *dest[4], int dstW, int chrDstW)
 {
     int i;
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     if (uDest) {
         x86_reg uv_off = c->uv_offx2 >> 1;
-        for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
+        dither_8to32(c, chrDither, 0);
         YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
-        for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
+        dither_8to32(c, chrDither, 1);
         YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
     }
-    for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
+    dither_8to32(c, lumDither, 0);
     if (CONFIG_SWSCALE_ALPHA && aDest) {
         YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
     }
@@ -187,8 +280,7 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
 static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                              const int16_t *chrUSrc, const int16_t *chrVSrc,
                              const int16_t *alpSrc,
-                             uint8_t *dst[4], int dstW, int chrDstW,
-                             const uint8_t *lumDither, const uint8_t *chrDither)
+                             uint8_t *dst[4], int dstW, int chrDstW)
 {
     int p= 4;
     const int16_t *src[4]= {
@@ -222,8 +314,7 @@ static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
 static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                 const int16_t *chrUSrc, const int16_t *chrVSrc,
                                 const int16_t *alpSrc,
-                                uint8_t *dst[4], int dstW, int chrDstW,
-                                const uint8_t *lumDither, const uint8_t *chrDither)
+                                uint8_t *dst[4], int dstW, int chrDstW)
 {
     int p= 4;
     const int16_t *src[4]= {
@@ -231,15 +322,16 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
         chrVSrc + chrDstW, alpSrc + dstW
     };
     x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 
     while (p--) {
         if (dst[p]) {
             int i;
-            for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
+            for(i=0; i<8; i++) c->dither16[i] = (p == 2 || p == 3) ? lumDither[i] : chrDither[i];
             __asm__ volatile(
                 "mov %2, %%"REG_a"                    \n\t"
-                "movq               0(%3), %%mm6      \n\t"
-                "movq               8(%3), %%mm7      \n\t"
+                "movq    "DITHER16"+0(%3), %%mm6      \n\t"
+                "movq    "DITHER16"+8(%3), %%mm7      \n\t"
                 ".p2align                4            \n\t" /* FIXME Unroll? */
                 "1:                                   \n\t"
                 "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
@@ -253,7 +345,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                 "add                   $8, %%"REG_a"  \n\t"
                 "jnc                   1b             \n\t"
                 :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                   "g" (-counter[p]), "r"(c->dither16)
+                   "g" (-counter[p]), "r"(&c->redDither)
                 : "%"REG_a
             );
         }