avcodec/mips: [loongson] reoptimize simple idct with mmi.

MPEG-4 decoding performance improved by about 23% (from 128 fps to 158 fps, tested on a Loongson 3A3000).
The following functions were reoptimized with MMI:
1. ff_simple_idct_put_8_mmi
2. ff_simple_idct_add_8_mmi
3. ff_simple_idct_8_mmi

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Shiyou Yin 2018-08-31 21:41:49 +08:00 committed by Michael Niedermayer
parent 1124df0397
commit df13b75aa1
4 changed files with 424 additions and 765 deletions

View File

@ -20,6 +20,7 @@
*/
#include "idctdsp_mips.h"
#include "xvididct_mips.h"
#if HAVE_MSA
static av_cold void idctdsp_init_msa(IDCTDSPContext *c, AVCodecContext *avctx,
@ -48,8 +49,10 @@ static av_cold void idctdsp_init_mmi(IDCTDSPContext *c, AVCodecContext *avctx,
if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
(avctx->bits_per_raw_sample != 10) &&
(avctx->bits_per_raw_sample != 12) &&
(avctx->idct_algo == FF_IDCT_AUTO)) {
c->idct = ff_simple_idct_mmi;
((avctx->idct_algo == FF_IDCT_AUTO) || (avctx->idct_algo == FF_IDCT_SIMPLE))) {
c->idct_put = ff_simple_idct_put_8_mmi;
c->idct_add = ff_simple_idct_add_8_mmi;
c->idct = ff_simple_idct_8_mmi;
c->perm_type = FF_IDCT_PERM_NONE;
}

View File

@ -46,8 +46,8 @@ void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
uint8_t *av_restrict pixels, ptrdiff_t line_size);
void ff_add_pixels_clamped_mmi(const int16_t *block,
uint8_t *av_restrict pixels, ptrdiff_t line_size);
void ff_simple_idct_mmi(int16_t *block);
void ff_simple_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_8_mmi(int16_t *block);
void ff_simple_idct_put_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_8_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
#endif // #ifndef AVCODEC_MIPS_IDCTDSP_MIPS_H

File diff suppressed because it is too large Load Diff

View File

@ -201,6 +201,55 @@
#endif /* HAVE_LOONGSON2 */
/**
 * Back up floating-point registers into a local buffer.
 *
 * Declares an array named temp_backup_reg in the enclosing scope and stores
 * register pairs into it with the Loongson gssqc1 instruction:
 *   - when _MIPS_SIM == _ABI64: $f24 through $f31 (four pairs, 64 bytes);
 *   - otherwise: the even-numbered registers $f20 through $f30
 *     (three pairs, 48 bytes).
 * NOTE(review): these are presumably the callee-saved FP registers of the
 * respective ABI -- confirm against the MIPS ABI documents.
 *
 * The buffer is forced to 16-byte alignment because gssqc1 stores a
 * quadword; a plain double array is only guaranteed 8-byte alignment, so
 * the store could hit a misaligned address.
 *
 * Because the macro introduces a declaration, it must be used at block
 * scope, and RECOVER_REG must be used in the same scope so that it can see
 * temp_backup_reg.
 */
#define BACKUP_REG \
    double temp_backup_reg[8] __attribute__((aligned(16))); \
    if (_MIPS_SIM == _ABI64) { \
        __asm__ volatile ( \
            "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
            "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
            "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
            "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
            : \
            : [temp]"r"(temp_backup_reg) \
            : "memory" \
        ); \
    } else { \
        __asm__ volatile ( \
            "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
            "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
            "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
            : \
            : [temp]"r"(temp_backup_reg) \
            : "memory" \
        ); \
    }
/**
 * Reload the floating-point registers from temp_backup_reg.
 *
 * Counterpart of BACKUP_REG: reads the buffer named temp_backup_reg, which
 * must be visible in the enclosing scope, using the Loongson gslqc1
 * instruction. When _MIPS_SIM == _ABI64 the pairs $f24 through $f31 are
 * loaded; otherwise the even-numbered registers $f20 through $f30 are
 * loaded. The offsets mirror the ones used by BACKUP_REG.
 */
#define RECOVER_REG \
    if (_MIPS_SIM != _ABI64) { \
        /* non-_ABI64 layout: three register pairs */ \
        __asm__ volatile ( \
            "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
            "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
            "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
            : /* no outputs */ \
            : [temp]"r"(temp_backup_reg) \
            : "memory" \
        ); \
    } else { \
        /* _ABI64 layout: four register pairs */ \
        __asm__ volatile ( \
            "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
            "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
            "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
            "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
            : /* no outputs */ \
            : [temp]"r"(temp_backup_reg) \
            : "memory" \
        ); \
    }
#define TRANSPOSE_4H(m1, m2, m3, m4, t1, t2, t3, t4, t5, r1, zero, shift) \
"li "#r1", 0x93 \n\t" \
"xor "#zero","#zero","#zero" \n\t" \