Blackfin optimized YUV420 to RGB CSC Color Space Converters.

YUV2 -> RGB BGR for 565, 555 and 888 a.k.a. 24bit color. Speed-up compared to C version compiled with -O3 187.28% Patch by Marc Hoffman %mmh A pleasantst P com% Original thread: Date: May 9, 2007 2:46 AM Subject: [FFmpeg-devel] PATCH BlackFin yuv2rgb color space conversion Originally committed as revision 23307 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
2007-05-13 19:22:32 +00:00 · 2007-05-13 19:22:32 +00:00 · d3f3eea92d
parent 79d4c96a1a
commit d3f3eea92d
7 changed files with 690 additions and 1 deletions
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@ -12,6 +12,9 @@ OBJS= swscale.o rgb2rgb.o
 OBJS-$(TARGET_ALTIVEC)     +=  yuv2rgb_altivec.o
 OBJS-$(CONFIG_GPL)         +=  yuv2rgb.o

+OBJS-$(TARGET_ARCH_BFIN)     +=  yuv2rgb_bfin.o
+ASM_OBJS-$(TARGET_ARCH_BFIN) += internal_bfin.o
+
 HEADERS = swscale.h rgb2rgb.h

 include ../common.mak
--- a/libswscale/internal_bfin.S
+++ b/libswscale/internal_bfin.S
@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *                    April 20, 2007
+ *
+ * Blackfin Video Color Space Converters Operations
+ *  convert I420 YV12 to RGB in various formats,
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/*
+    YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
+    and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
+
+
+    The following calculation is used for the conversion:
+
+      r = clipz((y-oy)*cy  + crv*(v-128))
+      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
+      b = clipz((y-oy)*cy  + cbu*(u-128))
+
+    y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
+
+
+    New factorization to eliminate the truncation error which was
+    occurring due to the byteop3p.
+
+
+  1) use the byteop16m to subtract quad bytes; we operate in U8 here,
+   so the offsets need to be renormalized to 8 bits.
+
+  2) scale operands up by a factor of 4 not 8 because Blackfin
+     multiplies include a shift.
+
+  3) compute into the accumulators cy*yx0, cy*yx1
+
+  4) compute each of the linear equations
+      r = clipz((y-oy)*cy  + crv*(v-128))
+
+      g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
+
+      b = clipz((y-oy)*cy  + cbu*(u-128))
+
+     reuse of the accumulators requires that we actually multiply
+     twice, once with addition and the second time with a subtraction.
+
+     because of this we need to compute the equations in the order R B
+     then G saving the writes for B in the case of 24/32 bit color
+     formats.
+
+    api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
+                       int dW, uint32_t *coeffs);
+
+        A          B
+        ---        ---
+        i2 = cb    i3 = cr
+        i1 = coeff i0 = y
+
+  Where coeffs have the following layout in memory.
+
+  uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
+
+  coeffs is a pointer to oy.
+
+  the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
+  replication is used to simplify the internal algorithms for the dual mac architecture
+  of BlackFin.
+
+  All routines are exported with _ff_bfin_ as a symbol prefix
+
+  rough performance gain compared against -O3:
+
+  2779809/1484290 187.28%
+
+  which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
+  c/pel for the optimized implementations. Not sure why there is such a
+  huge variation on the reference codes on Blackfin; I guess it must have
+  to do with the memory system.
+
+*/
+
+/* Section names: mL1 is .l1.text, mL3 is the ordinary .text section.
+   MEM is the section handed to DEFUN for the routines in this file. */
+#define mL1 .l1.text
+#define mL3 .text
+#define MEM mL1
+
+/* DEFUN(fname, where, interface): switch to section `where`, export
+   _ff_bfin_<fname> as a function symbol, align, and emit the symbol name
+   (the caller appends the ':' that turns it into a label).
+   `interface` is documentation only; it is not used in the expansion. */
+#define DEFUN(fname,where,interface) \
+        .section where;              \
+        .global _ff_bfin_ ## fname;  \
+        .type _ff_bfin_ ## fname, STT_FUNC; \
+        .align 8;                    \
+        _ff_bfin_ ## fname
+
+/* DEFUN_END(fname): record the size of _ff_bfin_<fname> as the distance
+   from its label to the current location.  fname has to be the same name
+   that was given to DEFUN, otherwise the symbol referenced here is not
+   the one that was defined. */
+#define DEFUN_END(fname) \
+        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
+
+
+.text
+
+/* Size in bytes of the coefficient block stepped through with i1:
+   eleven 32-bit words (oy ... gmask). */
+#define COEFF_LEN        11*4
+/* Post-modify (loaded into m0) applied together with the first gmask load;
+   combined with the COEFF_LEN wrap it leaves i1 on cy for the second
+   pixel pair of a loop pass. */
+#define COEFF_REL_CY_OFF 4*4
+
+/* Offsets from fp (after "link 0") at which the routines read their
+   out, dW and coeffs arguments. */
+#define ARG_OUT   20
+#define ARG_W     24
+#define ARG_COEFF 28
+
+/*
+    yuv2rgb565_line: convert one line of planar YUV to packed 16-bit 5:6:5.
+
+    Y, U and V pointers arrive in r0, r1, r2; out, dW and coeffs are read
+    from the frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
+
+    Each loop pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
+    two 32-bit words (four 16-bit pixels).  The pass count is dW>>2, so a
+    width remainder of 1..3 pixels is not converted.
+    NOTE(review): behaviour for dW < 4 (loop count 0) is not visible here;
+    confirm how lsetup treats a zero count.
+
+    i1 steps through the coefficient words with b1/l1 set to the coeffs
+    base and COEFF_LEN, so the walk wraps around to oy again; l1 is
+    cleared before returning.  r7:4 are saved on entry and restored on exit.
+
+    The loads for the next Y/U/V group are issued at the end of every pass,
+    including the last one, so one group beyond the converted data is read.
+*/
+DEFUN(yuv2rgb565_line,MEM,
+   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+        link 0;
+        [--sp] = (r7:4);
+        p1 = [fp+ARG_OUT];
+        r3 = [fp+ARG_W];
+
+        i0 = r0;
+        i2 = r1;
+        i3 = r2;
+
+        r0 = [fp+ARG_COEFF];
+        i1 = r0;
+        b1 = i1;
+        l1 = COEFF_LEN;
+        m0 = COEFF_REL_CY_OFF;
+        p0 = r3;
+
+        r0   = [i0++];         // 4Y
+        r1.l = w[i2++];        // 2u
+        r1.h = w[i3++];        // 2v
+        p0 = p0>>2;
+
+        lsetup (.L0565, .L1565) lc0 = p0;
+
+        /*
+           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
+           r0 -- used to load 4ys
+           r1 -- used to load 2us,2vs
+           r4 -- y3,y2
+           r5 -- y1,y0
+           r6 -- u1,u0
+           r7 -- v1,v0
+        */
+                                                              r2=[i1++]; // oy
+.L0565:
+        /*
+        rrrrrrrr gggggggg bbbbbbbb
+         5432109876543210
+                    bbbbb >>3
+              gggggggg    <<3
+         rrrrrrrr         <<8
+         rrrrrggggggbbbbb
+        */
+        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
+        (r7,r6) = byteop16m (r1:0, r3:2) (r);
+        r5 = r5 << 2 (v);                                                // y1,y0
+        r4 = r4 << 2 (v);                                                // y3,y2
+        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
+        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
+        /* Y' = y*cy */
+        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2 = r2 >> 3 (v);
+        r3 = r2 & r5;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+        r2 = r2 << 8 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
+        r2 = r2 << 3 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+        [p1++]=r3                                          || r1=[i1++]; // cy
+
+        /* Y' = y*cy */
+
+        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2 = r2 >> 3 (v);
+        r3 = r2 & r5;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+        r2 = r2 << 8 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
+        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
+.L1565:                                                       r2=[i1++]; // oy
+
+        l1 = 0;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+DEFUN_END(yuv2rgb565_line)
+
+/*
+    yuv2rgb555_line: convert one line of planar YUV to packed 16-bit 5:5:5.
+
+    Same structure as the 565 routine; only the shift amounts applied before
+    masking differ (<<7 and <<2 here instead of <<8 and <<3).
+
+    Y, U and V pointers arrive in r0, r1, r2; out, dW and coeffs are read
+    from the frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
+
+    Each loop pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
+    two 32-bit words (four 16-bit pixels).  The pass count is dW>>2, so a
+    width remainder of 1..3 pixels is not converted.
+    NOTE(review): behaviour for dW < 4 (loop count 0) is not visible here;
+    confirm how lsetup treats a zero count.
+
+    i1 steps through the coefficient words with b1/l1 set to the coeffs
+    base and COEFF_LEN; l1 is cleared before returning.  r7:4 are saved on
+    entry and restored on exit.
+*/
+DEFUN(yuv2rgb555_line,MEM,
+   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+        link 0;
+        [--sp] = (r7:4);
+        p1 = [fp+ARG_OUT];
+        r3 = [fp+ARG_W];
+
+        i0 = r0;
+        i2 = r1;
+        i3 = r2;
+
+        r0 = [fp+ARG_COEFF];
+        i1 = r0;
+        b1 = i1;
+        l1 = COEFF_LEN;
+        m0 = COEFF_REL_CY_OFF;
+        p0 = r3;
+
+        r0   = [i0++];         // 4Y
+        r1.l = w[i2++];        // 2u
+        r1.h = w[i3++];        // 2v
+        p0 = p0>>2;
+
+        lsetup (.L0555, .L1555) lc0 = p0;
+
+        /*
+           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
+           r0 -- used to load 4ys
+           r1 -- used to load 2us,2vs
+           r4 -- y3,y2
+           r5 -- y1,y0
+           r6 -- u1,u0
+           r7 -- v1,v0
+        */
+                                                              r2=[i1++]; // oy
+.L0555:
+        /*
+        rrrrrrrr gggggggg bbbbbbbb
+         5432109876543210
+                    bbbbb >>3
+               gggggggg   <<2
+          rrrrrrrr        <<7
+         xrrrrrgggggbbbbb
+        */
+
+        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
+        (r7,r6) = byteop16m (r1:0, r3:2) (r);
+        r5 = r5 << 2 (v);                                                // y1,y0
+        r4 = r4 << 2 (v);                                                // y3,y2
+        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
+        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
+        /* Y' = y*cy */
+        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2 = r2 >> 3 (v);
+        r3 = r2 & r5;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+        r2 = r2 << 7 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
+        r2 = r2 << 2 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+        [p1++]=r3                                          || r1=[i1++]; // cy
+
+        /* Y' = y*cy */
+
+        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2 = r2 >> 3 (v);
+        r3 = r2 & r5;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+        r2 = r2 << 7 (v);
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
+        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
+        r2 = r2 & r5;
+        r3 = r3 | r2;
+        [p1++]=r3                                          || r1.h=w[i3++]; // 2v
+
+.L1555:                                                       r2=[i1++]; // oy
+
+        l1 = 0;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+DEFUN_END(yuv2rgb555_line)
+
+/*
+    yuv2rgb24_line: convert one line of planar YUV to packed 24-bit output,
+    three bytes per pixel.
+
+    Y, U and V pointers arrive in r0, r1, r2; out, dW and coeffs are read
+    from the frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
+
+    Two byte-store pointers are used: p1 starts at out and p2 at out+3, i.e.
+    they address two neighbouring pixels.  After each pixel pair both are
+    advanced by a further 3 bytes so they skip the pixel the other pointer
+    has just written.
+
+    Each loop pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
+    four pixels (12 bytes).  The pass count is dW>>2, so a width remainder
+    of 1..3 pixels is not converted.
+    NOTE(review): behaviour for dW < 4 (loop count 0) is not visible here;
+    confirm how lsetup treats a zero count.
+
+    i1 steps through the coefficient words with b1/l1 set to the coeffs
+    base and COEFF_LEN; the mask words are loaded to keep i1 in step but
+    are not applied in this routine.  l1 is cleared before returning and
+    r7:4 are saved on entry and restored on exit.
+
+    The name passed to DEFUN_END must be the one passed to DEFUN so that
+    .size refers to the symbol that is actually defined here.
+*/
+DEFUN(yuv2rgb24_line,MEM,
+   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
+        link 0;
+        [--sp] = (r7:4);
+        p1 = [fp+ARG_OUT];
+        r3 = [fp+ARG_W];
+        p2 = p1;
+        p2 += 3;
+
+        i0 = r0;
+        i2 = r1;
+        i3 = r2;
+
+        r0 = [fp+ARG_COEFF]; // coeff buffer
+        i1 = r0;
+        b1 = i1;
+        l1 = COEFF_LEN;
+        m0 = COEFF_REL_CY_OFF;
+        p0 = r3;
+
+        r0   = [i0++];         // 4Y
+        r1.l = w[i2++];        // 2u
+        r1.h = w[i3++];        // 2v
+        p0 = p0>>2;
+
+        lsetup (.L0888, .L1888) lc0 = p0;
+
+        /*
+           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
+           r0 -- used to load 4ys
+           r1 -- used to load 2us,2vs
+           r4 -- y3,y2
+           r5 -- y1,y0
+           r6 -- u1,u0
+           r7 -- v1,v0
+        */
+                                                              r2=[i1++]; // oy
+.L0888:
+        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
+        (r7,r6) = byteop16m (r1:0, r3:2) (r);
+        r5 = r5 << 2 (v);               // y1,y0
+        r4 = r4 << 2 (v);               // y3,y2
+        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
+        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
+
+        /* Y' = y*cy */
+        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2=r2>>16 || B[p1++]=r2;
+                     B[p2++]=r2;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
+                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
+        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero
+
+        r2=r2>>16 || B[p1++]=r2;
+                     B[p2++]=r2;
+
+        r3=r3>>16 || B[p1++]=r3;
+                     B[p2++]=r3                            || r1=[i1++]; // cy
+
+        p1+=3;
+        p2+=3;
+        /* Y' = y*cy */
+        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
+
+        /* R = Y+ crv*(Cr-128) */
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
+        r2=r2>>16 || B[p1++]=r2;
+        B[p2++]=r2;
+
+        /* B = Y+ cbu*(Cb-128) */
+        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
+                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
+        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
+
+        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
+                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
+        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
+        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
+        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
+                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
+        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
+                     B[p2++]=r3 || r2=[i1++];      // oy
+
+        p1+=3;
+.L1888: p2+=3;
+
+        l1 = 0;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+DEFUN_END(yuv2rgb24_line)
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@ -1992,7 +1992,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
 #endif

 #if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off
-    flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC);
+    flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN);
 #ifdef HAVE_MMX2
    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
 #elif defined (HAVE_3DNOW)
@ -2001,6 +2001,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
    flags |= SWS_CPU_CAPS_MMX;
 #elif defined (HAVE_ALTIVEC)
    flags |= SWS_CPU_CAPS_ALTIVEC;
+#elif defined (ARCH_BFIN)
+    flags |= SWS_CPU_CAPS_BFIN;
 #endif
 #endif /* RUNTIME_CPUDETECT */
    if (clip_table[512] != 255) globalInit();
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@ -74,6 +74,7 @@ extern "C" {
 #define SWS_CPU_CAPS_MMX2     0x20000000
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
+#define SWS_CPU_CAPS_BFIN     0x01000000

 #define SWS_MAX_REDUCE_CUTOFF 0.002

--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@ -162,6 +162,22 @@ typedef struct SwsContext{

 #endif

+
+#ifdef ARCH_BFIN
+    /* Coefficient words for the Blackfin line converters.  The assembly is
+       handed &oy and steps through these as eleven consecutive 32-bit
+       words, so the order and number of the fields below have to stay in
+       step with internal_bfin.S (COEFF_LEN and the load sequence there). */
+    uint32_t oy           __attribute__((aligned(4)));
+    uint32_t oc           __attribute__((aligned(4)));
+    uint32_t zero         __attribute__((aligned(4)));
+    uint32_t cy           __attribute__((aligned(4)));
+    uint32_t crv          __attribute__((aligned(4)));
+    uint32_t rmask        __attribute__((aligned(4)));
+    uint32_t cbu          __attribute__((aligned(4)));
+    uint32_t bmask        __attribute__((aligned(4)));
+    uint32_t cgu          __attribute__((aligned(4)));
+    uint32_t cgv          __attribute__((aligned(4)));
+    uint32_t gmask        __attribute__((aligned(4)));
+#endif
+
+
 } SwsContext;
 //FIXME check init (where 0)

--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@ -611,6 +611,14 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
    }
 #endif

+#ifdef ARCH_BFIN
+    if (c->flags & SWS_CPU_CAPS_BFIN)
+    {
+        SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c);
+        if (t) return t;
+    }
+#endif
+
    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n");

    switch(c->dstFormat){
--- a/libswscale/yuv2rgb_bfin.c
+++ b/libswscale/yuv2rgb_bfin.c
@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *                    April 20, 2007
+ *
+ * Blackfin Video Color Space Converters Operations
+ *  convert I420 YV12 to RGB in various formats,
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+#include "config.h"
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#include <unistd.h>
+#include <bits/bfin_sram.h>
+#include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
+
+
+/* Attribute attached to the assembly routines' declarations below
+   (presumably matching the .l1.text placement used in internal_bfin.S --
+   confirm against the toolchain documentation). */
+#define L1CODE __attribute__ ((l1_text))
+
+/* Line converters implemented in internal_bfin.S.  Each takes the Y, U and
+   V line pointers, the output line, a width and a pointer to the
+   coefficient words (core_yuv420_rgb passes &c->oy). */
+extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                                     int w, uint32_t *coeffs) L1CODE;
+
+extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                                     int w, uint32_t *coeffs) L1CODE;
+
+extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                                    int w, uint32_t *coeffs) L1CODE;
+
+/* Common signature of the line converters, used to pass one of them to
+   core_yuv420_rgb. */
+typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                              int w, uint32_t *coeffs);
+
+
+/**
+ * Fill the Blackfin coefficient words of the context (oy ... gmask).
+ *
+ * @param c     context whose yOffset/yCoeff/vrCoeff/ubCoeff/ugCoeff/vgCoeff
+ *              are read and whose Blackfin words are written
+ * @param rgb   non-zero: crv/cbu/cgu/cgv take vr/ub/ug/vg;
+ *              zero: the red/blue and the two green coefficients are swapped
+ * @param masks 555 or 565 selects the channel masks; any other value
+ *              leaves rmask/gmask/bmask untouched
+ */
+static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
+{
+    /* keep everything U8.0 for offset calculation */
+    const int luma_offset = (c->yOffset & 0xffff) >> 3;
+
+    /* replicate the 8-bit offsets into all four bytes of the word */
+    c->oy   = luma_offset*0x01010101U;
+    c->oc   = 128*0x01010101U;
+    c->zero = 0;
+
+    /* copy 64bit vector coeffs down to 32bit vector coeffs */
+    c->cy  = c->yCoeff;
+    c->crv = rgb ? c->vrCoeff : c->ubCoeff;
+    c->cbu = rgb ? c->ubCoeff : c->vrCoeff;
+    c->cgu = rgb ? c->ugCoeff : c->vgCoeff;
+    c->cgv = rgb ? c->vgCoeff : c->ugCoeff;
+
+    /* each 16-bit mask is duplicated into both halves of the word */
+    switch (masks) {
+    case 555:
+        c->rmask = 0x001f * 0x00010001U;
+        c->gmask = 0x03e0 * 0x00010001U;
+        c->bmask = 0x7c00 * 0x00010001U;
+        break;
+    case 565:
+        c->rmask = 0x001f * 0x00010001U;
+        c->gmask = 0x07e0 * 0x00010001U;
+        c->bmask = 0xf800 * 0x00010001U;
+        break;
+    default:
+        break;
+    }
+}
+
+/**
+ * Shared driver for all Blackfin converters: prepares the coefficient
+ * words, then runs the line routine over the slice two lines at a time,
+ * both lines of a pair using the same chroma line.
+ *
+ * instrides[0] is what gets passed to the line routine as its width, and
+ * srcSliceH>>1 line pairs are processed.
+ *
+ * @param lcscf line routine to call
+ * @param rgb   forwarded to bfin_prepare_coefficients; also selects which
+ *              of in[1]/in[2] is handed over as U and which as V
+ * @param masks forwarded to bfin_prepare_coefficients
+ * @return srcSliceH
+ */
+static int core_yuv420_rgb (SwsContext *c,
+                            uint8_t **in, int *instrides,
+                            int srcSliceY, int srcSliceH,
+                            uint8_t **oplanes, int *outstrides,
+                            ltransform_t lcscf, int rgb, int masks)
+{
+    const int width = instrides[0];
+    int pairs_left  = srcSliceH>>1;
+    uint8_t *luma, *chroma_a, *chroma_b, *dst;
+
+    bfin_prepare_coefficients (c, rgb, masks);
+
+    luma     = in[0];
+    chroma_a = in[1+(1^rgb)];
+    chroma_b = in[1+(0^rgb)];
+    dst      = oplanes[0] + srcSliceY*outstrides[0];
+
+    while (pairs_left-- > 0) {
+        /* first line of the pair */
+        lcscf (luma,chroma_a,chroma_b,dst,width,&c->oy);
+        luma += instrides[0];
+        dst  += outstrides[0];
+
+        /* second line, same chroma; chroma advances afterwards */
+        lcscf (luma,chroma_a,chroma_b,dst,width,&c->oy);
+        luma     += instrides[0];
+        chroma_a += instrides[1];
+        chroma_b += instrides[2];
+        dst      += outstrides[0];
+    }
+
+    return srcSliceH;
+}
+
+
+/* Slice converter: 555 line routine, rgb = 1, 555 masks. */
+static int bfin_yuv420_rgb555 (SwsContext *c,
+                               uint8_t **in, int *instrides,
+                               int srcSliceY, int srcSliceH,
+                               uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb555_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 1, 555);
+}
+
+/* Slice converter: 555 line routine, rgb = 0, 555 masks. */
+static int bfin_yuv420_bgr555 (SwsContext *c,
+                               uint8_t **in, int *instrides,
+                               int srcSliceY, int srcSliceH,
+                               uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb555_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 0, 555);
+}
+
+/* Slice converter: 24-bit line routine, rgb = 1; 888 selects no masks. */
+static int bfin_yuv420_rgb24 (SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb24_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 1, 888);
+}
+
+/* Slice converter: 24-bit line routine, rgb = 0; 888 selects no masks. */
+static int bfin_yuv420_bgr24 (SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb24_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 0, 888);
+}
+
+/* Slice converter: 565 line routine, rgb = 1, 565 masks. */
+static int bfin_yuv420_rgb565 (SwsContext *c,
+                               uint8_t **in, int *instrides,
+                               int srcSliceY, int srcSliceH,
+                               uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb565_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 1, 565);
+}
+
+/* Slice converter: 565 line routine, rgb = 0, 565 masks. */
+static int bfin_yuv420_bgr565 (SwsContext *c,
+                               uint8_t **in, int *instrides,
+                               int srcSliceY, int srcSliceH,
+                               uint8_t **oplanes, int *outstrides)
+{
+    const ltransform_t line_fn = ff_bfin_yuv2rgb565_line;
+
+    return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH,
+                            oplanes, outstrides, line_fn, 0, 565);
+}
+
+
+/**
+ * Pick the Blackfin slice converter for c->dstFormat.
+ *
+ * @return the converter for RGB555/BGR555/RGB565/BGR565/RGB24/BGR24, with
+ *         an informational log line; 0 (and no log output) for any other
+ *         destination format
+ */
+SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
+{
+    SwsFunc converter;
+
+    if      (c->dstFormat == PIX_FMT_RGB555) converter = bfin_yuv420_rgb555;
+    else if (c->dstFormat == PIX_FMT_BGR555) converter = bfin_yuv420_bgr555;
+    else if (c->dstFormat == PIX_FMT_RGB565) converter = bfin_yuv420_rgb565;
+    else if (c->dstFormat == PIX_FMT_BGR565) converter = bfin_yuv420_bgr565;
+    else if (c->dstFormat == PIX_FMT_RGB24)  converter = bfin_yuv420_rgb24;
+    else if (c->dstFormat == PIX_FMT_BGR24)  converter = bfin_yuv420_bgr24;
+    else
+        return 0;
+
+    av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
+           sws_format_name (c->dstFormat));
+
+    return converter;
+}