Better ARM support for mplayer/ffmpeg, ported from atty fork

while playing with some new hardware, I found it's running a forked mplayer
 -- and it looks like they're following the GPL.

 The maintainer's page is here: http://atty.jp/?Zaurus/mplayer
 Unfortunately it's mostly in Japanese, so it's hard to figure out any
  details.

  Their code looks quite interesting (at least to those of us w/ ARM CPUs).

  The patches I've attached are the patches from atty.jp with a couple of
  modifications by myself:
  - ported to current CVS
  - reverted their change of removing SNOW support from ffmpeg
  - cleaned up their bswap mess
  - removed DOS-style linebreaks from various files

patch by (Bernhard Rosenkraenzer: bero, arklinux org)

Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Bernhard Rosenkränzer 2005-05-26 14:32:46 +00:00 committed by Michael Niedermayer
parent c66a443401
commit 6ad1fa5a49
9 changed files with 2294 additions and 1 deletions

View File

@ -316,8 +316,11 @@ endif
# armv4l specific stuff
ifeq ($(TARGET_ARCH_ARMV4L),yes)
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o
ifeq ($(TARGET_IWMMXT),yes)
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o
endif
endif
# sun mediaLib specific stuff
@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o
CFLAGS += $(MLIB_INC)
endif
# Intel IPP specific stuff
# currently only works when libavcodec is used in mplayer
ifeq ($(HAVE_IPP),yes)
CFLAGS += $(IPP_INC)
endif
# alpha specific stuff
ifeq ($(TARGET_ARCH_ALPHA),yes)
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \

View File

@ -18,6 +18,13 @@
*/
#include "../dsputil.h"
#ifdef HAVE_IPP
#include "ipp.h"
#endif
#ifdef HAVE_IWMMXT
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
#endif
extern void j_rev_dct_ARM(DCTELEM *data);
extern void simple_idct_ARM(DCTELEM *data);
@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data);
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_pixels8_x2_arm(block, pixels, line_size, h);
put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
}
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_pixels8_y2_arm(block, pixels, line_size, h);
put_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
}
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_pixels8_xy2_arm(block, pixels, line_size, h);
put_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
}
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_no_rnd_pixels8_x2_arm(block, pixels, line_size, h);
put_no_rnd_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
}
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_no_rnd_pixels8_y2_arm(block, pixels, line_size, h);
put_no_rnd_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
}
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
put_no_rnd_pixels8_xy2_arm(block, pixels, line_size, h);
put_no_rnd_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
}
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
{
asm volatile (
"mov r10, #8 \n\t"
"1: \n\t"
/* load dest */
"ldr r4, [%1] \n\t"
/* block[0] and block[1]*/
"ldrsh r5, [%0] \n\t"
"ldrsh r7, [%0, #2] \n\t"
"and r6, r4, #0xFF \n\t"
"and r8, r4, #0xFF00 \n\t"
"add r6, r5, r6 \n\t"
"add r8, r7, r8, lsr #8 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"mov r9, r6 \n\t"
"ldrsh r5, [%0, #4] \n\t" /* moved form [A] */
"orr r9, r9, r8, lsl #8 \n\t"
/* block[2] and block[3] */
/* [A] */
"ldrsh r7, [%0, #6] \n\t"
"and r6, r4, #0xFF0000 \n\t"
"and r8, r4, #0xFF000000 \n\t"
"add r6, r5, r6, lsr #16 \n\t"
"add r8, r7, r8, lsr #24 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"orr r9, r9, r6, lsl #16 \n\t"
"ldr r4, [%1, #4] \n\t" /* moved form [B] */
"orr r9, r9, r8, lsl #24 \n\t"
/* store dest */
"ldrsh r5, [%0, #8] \n\t" /* moved form [C] */
"str r9, [%1] \n\t"
/* load dest */
/* [B] */
/* block[4] and block[5] */
/* [C] */
"ldrsh r7, [%0, #10] \n\t"
"and r6, r4, #0xFF \n\t"
"and r8, r4, #0xFF00 \n\t"
"add r6, r5, r6 \n\t"
"add r8, r7, r8, lsr #8 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"mov r9, r6 \n\t"
"ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
"orr r9, r9, r8, lsl #8 \n\t"
/* block[6] and block[7] */
/* [D] */
"ldrsh r7, [%0, #14] \n\t"
"and r6, r4, #0xFF0000 \n\t"
"and r8, r4, #0xFF000000 \n\t"
"add r6, r5, r6, lsr #16 \n\t"
"add r8, r7, r8, lsr #24 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"orr r9, r9, r6, lsl #16 \n\t"
"add %0, %0, #16 \n\t" /* moved from [E] */
"orr r9, r9, r8, lsl #24 \n\t"
"subs r10, r10, #1 \n\t" /* moved from [F] */
/* store dest */
"str r9, [%1, #4] \n\t"
/* [E] */
/* [F] */
"add %1, %1, %2 \n\t"
"bne 1b \n\t"
:
: "r"(block),
"r"(dest),
"r"(line_size)
: "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
simple_idct_ARM (block);
ff_add_pixels_clamped(block, dest, line_size);
}
static void simple_idct_ipp(DCTELEM *block)
{
#ifdef HAVE_IPP
ippiDCT8x8Inv_Video_16s_C1I(block);
#endif
}
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#ifdef HAVE_IPP
ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#endif
}
#ifdef HAVE_IWMMXT
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
#endif
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#ifdef HAVE_IPP
ippiDCT8x8Inv_Video_16s_C1I(block);
#ifdef HAVE_IWMMXT
add_pixels_clamped_iwmmxt(block, dest, line_size);
#else
add_pixels_clamped_ARM(block, dest, line_size);
#endif
#endif
}
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
{
@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
ff_put_pixels_clamped = c->put_pixels_clamped;
ff_add_pixels_clamped = c->add_pixels_clamped;
#ifdef HAVE_IPP
if(idct_algo==FF_IDCT_ARM){
#else
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
#endif
c->idct_put= j_rev_dct_ARM_put;
c->idct_add= j_rev_dct_ARM_add;
c->idct = j_rev_dct_ARM;
@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
c->idct_add= simple_idct_ARM_add;
c->idct = simple_idct_ARM;
c->idct_permutation_type= FF_NO_IDCT_PERM;
#ifdef HAVE_IPP
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
#else
} else if (idct_algo==FF_IDCT_IPP){
#endif
c->idct_put= simple_idct_ipp_put;
c->idct_add= simple_idct_ipp_add;
c->idct = simple_idct_ipp;
c->idct_permutation_type= FF_NO_IDCT_PERM;
}
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(»È¤ï¤ì¤Ê¤¤) */
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
#if 1
#ifdef HAVE_IWMMXT
dsputil_init_iwmmxt(c, avctx);
#endif
#endif
}

View File

@ -0,0 +1,694 @@
@
@ ARMv4L optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This library is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2 of the License, or (at your option) any later version.
@
@ This library is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with this library; if not, write to the Free Software
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
mov \Rd0, \Rn0, lsr #(\shift * 8)
mov \Rd1, \Rn1, lsr #(\shift * 8)
mov \Rd2, \Rn2, lsr #(\shift * 8)
mov \Rd3, \Rn3, lsr #(\shift * 8)
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
mov \R0, \R0, lsr #(\shift * 8)
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
mov \R1, \R1, lsr #(\shift * 8)
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroy
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
orr \Rn0, \Rn0, \Rm0
orr \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
sub \Rd0, \Rn0, \Rd0, lsr #1
sub \Rd1, \Rn1, \Rd1, lsr #1
.endm
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroy
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
and \Rn0, \Rn0, \Rm0
and \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
add \Rd0, \Rn0, \Rd0, lsr #1
add \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ ----------------------------------------------------------------
.align 8
.global put_pixels16_arm
put_pixels16_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r11, lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
bic r1, r1, #3
add r5, r5, r4, lsl #2
ldrne pc, [r5]
1:
ldmia r1, {r4-r7}
add r1, r1, r2
stmia r0, {r4-r7}
pld [r1]
subs r3, r3, #1
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r11, pc}
.align 8
2:
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r11, pc}
.align 8
3:
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r11, pc}
.align 8
4:
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
.word 1b
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
.align 8
.global put_pixels8_arm
put_pixels8_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r5,lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
bic r1, r1, #3
add r5, r5, r4, lsl #2
ldrne pc, [r5]
1:
ldmia r1, {r4-r5}
add r1, r1, r2
subs r3, r3, #1
pld [r1]
stmia r0, {r4-r5}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r5,pc}
.align 8
2:
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r5,pc}
.align 8
3:
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r5,pc}
.align 8
4:
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r5,pc}
.align 8
5:
.word 1b
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
.align 8
.global put_pixels8_x2_arm
put_pixels8_x2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r10,pc}
.align 8
2:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r10,pc}
.align 8
3:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r10,pc}
.align 8
4:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
.align 8
5:
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
.align 8
.global put_no_rnd_pixels8_x2_arm
put_no_rnd_pixels8_x2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r10,pc}
.align 8
2:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r10,pc}
.align 8
3:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r10,pc}
.align 8
4:
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
.align 8
5:
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
.align 8
.global put_pixels8_y2_arm
put_pixels8_y2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
mov r3, r3, lsr #1
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
ldmia r1, {r4-r5}
add r1, r1, r2
6: ldmia r1, {r6-r7}
add r1, r1, r2
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldmia r1, {r4-r5}
add r1, r1, r2
stmia r0, {r8-r9}
add r0, r0, r2
pld [r1]
RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
2:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
3:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
4:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
.align 8
.global put_no_rnd_pixels8_y2_arm
put_no_rnd_pixels8_y2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adr r5, 5f
ands r4, r1, #3
mov r3, r3, lsr #1
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
ldmia r1, {r4-r5}
add r1, r1, r2
6: ldmia r1, {r6-r7}
add r1, r1, r2
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldmia r1, {r4-r5}
add r1, r1, r2
stmia r0, {r8-r9}
add r0, r0, r2
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
2:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
3:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
4:
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
.macro RND_XY2_IT align, rnd
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
ldmia r1, {r6-r8}
.elseif \align == 3
ldmia r1, {r5-r7}
.else
ldmia r1, {r8-r10}
.endif
add r1, r1, r2
pld [r1]
.if \align == 0
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
.endif
ldr r14, [r12, #0] @ 0x03030303
tst r3, #1
and r8, r4, r14
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
.if \rnd == 1
ldreq r14, [r12, #16] @ 0x02020202
.else
ldreq r14, [r12, #28] @ 0x01010101
.endif
add r8, r8, r10
add r9, r9, r11
addeq r8, r8, r14
addeq r9, r9, r14
ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
and r6, r14, r6, lsr #2
and r7, r14, r7, lsr #2
add r10, r4, r6
add r11, r5, r7
.endm
.macro RND_XY2_EXPAND align, rnd
RND_XY2_IT \align, \rnd
6: stmfd sp!, {r8-r11}
RND_XY2_IT \align, \rnd
ldmfd sp!, {r4-r7}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
ldr r14, [r12, #24] @ 0x0F0F0F0F
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
add r4, r4, r6
add r5, r5, r7
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.endm
.align 8
.global put_pixels8_xy2_arm
put_pixels8_xy2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adrl r12, 5f
ands r4, r1, #3
add r5, r12, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
RND_XY2_EXPAND 0, 1
.align 8
2:
RND_XY2_EXPAND 1, 1
.align 8
3:
RND_XY2_EXPAND 2, 1
.align 8
4:
RND_XY2_EXPAND 3, 1
5:
.word 0x03030303
.word 2b
.word 3b
.word 4b
.word 0x02020202
.word 0xFCFCFCFC >> 2
.word 0x0F0F0F0F
.word 0x01010101
.align 8
.global put_no_rnd_pixels8_xy2_arm
put_no_rnd_pixels8_xy2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixles = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adrl r12, 5f
ands r4, r1, #3
add r5, r12, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
RND_XY2_EXPAND 0, 0
.align 8
2:
RND_XY2_EXPAND 1, 0
.align 8
3:
RND_XY2_EXPAND 2, 0
.align 8
4:
RND_XY2_EXPAND 3, 0
5:
.word 0x03030303
.word 2b
.word 3b
.word 4b
.word 0x02020202
.word 0xFCFCFCFC >> 2
.word 0x0F0F0F0F
.word 0x01010101

View File

@ -0,0 +1,168 @@
/*
* iWMMXt optimized DSP utils
* Copyright (c) 2004 AGAWA Koji
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "../dsputil.h"
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2BR
// need scheduling
#define OP(AVG) \
asm volatile ( \
/* alignment */ \
"and r12, %[pixels], #7 \n\t" \
"bic %[pixels], %[pixels], #7 \n\t" \
"tmcr wcgr1, r12 \n\t" \
\
"wldrd wr0, [%[pixels]] \n\t" \
"wldrd wr1, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"walignr1 wr4, wr0, wr1 \n\t" \
\
"1: \n\t" \
\
"wldrd wr2, [%[pixels]] \n\t" \
"wldrd wr3, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"pld [%[pixels]] \n\t" \
"walignr1 wr5, wr2, wr3 \n\t" \
AVG " wr6, wr4, wr5 \n\t" \
"wstrd wr6, [%[block]] \n\t" \
"add %[block], %[block], %[line_size] \n\t" \
\
"wldrd wr0, [%[pixels]] \n\t" \
"wldrd wr1, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"walignr1 wr4, wr0, wr1 \n\t" \
"pld [%[pixels]] \n\t" \
AVG " wr6, wr4, wr5 \n\t" \
"wstrd wr6, [%[block]] \n\t" \
"add %[block], %[block], %[line_size] \n\t" \
\
"subs %[h], %[h], #2 \n\t" \
"bne 1b \n\t" \
: [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
: [line_size]"r"(line_size) \
: "memory", "r12");
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
OP("wavg2br");
}
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
OP("wavg2b");
}
#undef OP
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
uint8_t *pixels2 = pixels + line_size;
__asm__ __volatile__ (
"mov r12, #4 \n\t"
"1: \n\t"
"pld [%[pixels], %[line_size2]] \n\t"
"pld [%[pixels2], %[line_size2]] \n\t"
"wldrd wr4, [%[pixels]] \n\t"
"wldrd wr5, [%[pixels2]] \n\t"
"pld [%[block], #32] \n\t"
"wunpckelub wr6, wr4 \n\t"
"wldrd wr0, [%[block]] \n\t"
"wunpckehub wr7, wr4 \n\t"
"wldrd wr1, [%[block], #8] \n\t"
"wunpckelub wr8, wr5 \n\t"
"wldrd wr2, [%[block], #16] \n\t"
"wunpckehub wr9, wr5 \n\t"
"wldrd wr3, [%[block], #24] \n\t"
"add %[block], %[block], #32 \n\t"
"waddhss wr10, wr0, wr6 \n\t"
"waddhss wr11, wr1, wr7 \n\t"
"waddhss wr12, wr2, wr8 \n\t"
"waddhss wr13, wr3, wr9 \n\t"
"wpackhus wr14, wr10, wr11 \n\t"
"wpackhus wr15, wr12, wr13 \n\t"
"wstrd wr14, [%[pixels]] \n\t"
"add %[pixels], %[pixels], %[line_size2] \n\t"
"subs r12, r12, #1 \n\t"
"wstrd wr15, [%[pixels2]] \n\t"
"add %[pixels2], %[pixels2], %[line_size2] \n\t"
"bne 1b \n\t"
: [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
: [line_size2]"r"(line_size << 1)
: "cc", "memory", "r12");
}
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
return;
}
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}

File diff suppressed because it is too large Load Diff

View File

@ -21,6 +21,13 @@
#include "../mpegvideo.h"
#include "../avcodec.h"
#ifdef HAVE_IWMMXT
extern void MPV_common_init_iwmmxt(MpegEncContext *s);
#endif
void MPV_common_init_armv4l(MpegEncContext *s)
{
#ifdef HAVE_IWMMXT
MPV_common_init_iwmmxt(s);
#endif
}

View File

@ -0,0 +1,97 @@
#include "../dsputil.h"
#include "../mpegvideo.h"
#include "../avcodec.h"
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int level, qmul, qadd;
int nCoeffs;
DCTELEM *block_orig = block;
assert(s->block_last_index[n]>=0);
qmul = qscale << 1;
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level = block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
__asm__ __volatile__ (
/* "movd %1, %%mm6 \n\t" //qmul */
/* "packssdw %%mm6, %%mm6 \n\t" */
/* "packssdw %%mm6, %%mm6 \n\t" */
"tbcsth wr6, %[qmul] \n\t"
/* "movd %2, %%mm5 \n\t" //qadd */
/* "packssdw %%mm5, %%mm5 \n\t" */
/* "packssdw %%mm5, %%mm5 \n\t" */
"tbcsth wr5, %[qadd] \n\t"
"wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */
"wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */
"wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */
"1: \n\t"
"wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */
"wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */
"wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */
"wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */
/* "movq (%0, %3), %%mm2 \n\t" */
/* "movq 8(%0, %3), %%mm3 \n\t" */
"wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */
"wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */
"wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */
"wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */
"waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */
"waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */
"wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */
"wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */
"wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */
"wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */
"wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */
"wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */
"wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */
"wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */
"add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */
"subs %[i], %[i], #1 \n\t"
"bne 1b \n\t" /* "jng 1b \n\t" */
:[block]"+r"(block)
:[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd)
:"memory");
block_orig[0] = level;
}
#if 0
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int nCoeffs;
assert(s->block_last_index[n]>=0);
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
}
#endif
void MPV_common_init_iwmmxt(MpegEncContext *s)
{
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
#if 0
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
#endif
}

View File

@ -1180,6 +1180,7 @@ typedef struct AVCodecContext {
#define FF_IDCT_SIMPLEARM 10
#define FF_IDCT_H264 11
#define FF_IDCT_VP3 12
#define FP_IDCT_IPP 13
/**
* slice count.

View File

@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){
return (x>>8) | (x<<8);
}
#ifdef ARCH_ARM
static always_inline uint32_t bswap_32(uint32_t x){
uint32_t t;
__asm__ (
"eor %1, %0, %0, ror #16 \n\t"
"bic %1, %1, #0xFF0000 \n\t"
"mov %0, %0, ror #8 \n\t"
"eor %0, %0, %1, lsr #8 \n\t"
: "+r"(x), "+r"(t));
return x;
}
#else
static always_inline uint32_t bswap_32(uint32_t x){
x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
return (x>>16) | (x<<16);
}
#endif
static inline uint64_t bswap_64(uint64_t x)
{