flac/x86: add ff_flac_lpc_32_sse4()

benchmarked on sandybridge x86_64:
1358232 decicycles in flac_lpc_32_c
1244575 decicycles in flac_lpc_32_sse4, James Almer's patch
 650045 decicycles in flac_lpc_32_sse4, this patch

I haven't tested the edge cases, such as odd block lengths.

odd block length tested-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Loren Merritt 2014-02-03 23:17:04 +00:00 committed by Michael Niedermayer
parent 4a37e2977c
commit 9c978f243a
5 changed files with 115 additions and 0 deletions

View File

@ -128,4 +128,6 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
if (ARCH_ARM)
ff_flacdsp_init_arm(c, fmt, bps);
if (ARCH_X86)
ff_flacdsp_init_x86(c, fmt, bps);
}

View File

@ -33,5 +33,6 @@ typedef struct FLACDSPContext {
/**
 * Set up the FLAC DSP context. After the generic setup this also calls the
 * ARM and/or x86 initialisers below when ARCH_ARM / ARCH_X86 are enabled.
 */
void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
/* ARM-specific initialiser; invoked from ff_flacdsp_init() when ARCH_ARM is set. */
void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
/*
 * x86-specific initialiser; invoked from ff_flacdsp_init() when ARCH_X86 is
 * set. It may replace c->lpc with an SSE4 routine when bps > 16.
 */
void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
#endif /* AVCODEC_FLACDSP_H */

View File

@ -12,6 +12,7 @@ OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
x86/fdct.o \
x86/motion_est.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
@ -70,6 +71,7 @@ YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o

View File

@ -0,0 +1,71 @@
;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
INIT_XMM sse4
;------------------------------------------------------------------------------
; void ff_flac_lpc_32(int32_t *decoded, const int coeffs[32], int pred_order,
;                     int qlevel, int len)
;
; In-place LPC reconstruction using 64-bit products and sums.  For every
; i in [pred_order, len):
;
;     decoded[i] += (sum_{k=0}^{pred_order-1}
;                        (int64)coeffs[k] * decoded[i - pred_order + k]) >> qlevel
;
; Only the low 32 bits of the shifted sum are added to the sample.
;
; Two output samples are produced per outer iteration.  Their dot products
; are interleaved in the inner loop:
;   m2 accumulates the sum for sample i,
;   m3 accumulates the sum for sample i+1 (minus its last term, which needs
;      the freshly reconstructed sample i and is added after the inner loop).
; m4 holds the shift count (qlevel).
;
; Register use: 5 arguments, 6 GPRs (j is the inner-loop index), 5 XMM regs.
;------------------------------------------------------------------------------
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd            ; len = number of samples to reconstruct
    jle .ret                            ; nothing to do
    ; pred_order is a 32-bit int; the upper half of its register is not
    ; guaranteed to be defined on x86-64, so extend it before it is used
    ; as a 64-bit index below (expands to nothing on x86-32).
    movsxdifnidn pred_orderq, pred_orderd
    ; Point both arrays past their last used element so that they can be
    ; indexed with a negative counter running up towards zero.  decoded is
    ; biased by -8 because the outer loop advances it by 8 before use.
    lea    decodedq, [decodedq+pred_orderq*4-8]
    lea    coeffsq, [coeffsq+pred_orderq*4]
    neg    pred_orderq                  ; pred_order = -order
    movd   m4, qlevelm                  ; shift count, upper bits zeroed by movd
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8] ; oldest history sample for sample i
    add    decodedq, 8                  ; decodedq -> sample i (first pass: decoded[order])
    movd   m1, [coeffsq+pred_orderq*4]  ; coeffs[0]
    pxor   m2, m2                       ; sum for sample i
    pxor   m3, m3                       ; sum for sample i+1
    lea    jq, [pred_orderq+1]          ; j = 1 - order
    test   jq, jq
    jz .end_order                       ; order == 1: only the tail term remains
.loop_order:
    pmuldq m0, m1                       ; history * coeff (signed 32x32 -> 64)
    paddq  m2, m0                       ; accumulate into sum for sample i
    movd   m0, [decodedq+jq*4]          ; next history sample
    pmuldq m1, m0                       ; same coeff, sample shifted by one ...
    paddq  m3, m1                       ; ... accumulates into sum for sample i+1
    movd   m1, [coeffsq+jq*4]           ; next coefficient
    inc    jq
    jl .loop_order
.end_order:
    ; Here m0 = decoded[i-1] and m1 = coeffs[order-1].
    pmuldq m0, m1
    paddq  m2, m0                       ; final term of the sum for sample i
    ; Logical shift: only the low 32 bits are consumed below, so the bits
    ; shifted in at the top do not matter as long as qlevel <= 32.
    psrlq  m2, m4
    movd   m0, [decodedq]               ; residual stored at sample i
    paddd  m0, m2                       ; residual + prediction
    movd   [decodedq], m0
    sub  lend, 2
    jl .ret                             ; odd count: sample i was the last one
    ; Finish sample i+1 with coeffs[order-1] * reconstructed sample i
    ; (pmuldq reads only the low dword of each qword).
    pmuldq m1, m0
    paddq  m3, m1
    psrlq  m3, m4
    movd   m1, [decodedq+4]             ; residual stored at sample i+1
    paddd  m1, m3
    movd   [decodedq+4], m1
    ; The SIMD instructions above leave EFLAGS untouched, so this branch
    ; still tests the result of "sub lend, 2".
    jg .loop_sample
.ret:
    REP_RET

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2014 James Almer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "libavutil/x86/cpu.h"
#include "config.h"
/*
 * SSE4 LPC routine; per the commit title it is provided by the flac_lpc_32
 * function in the x86 assembly file (INIT_XMM sse4), which names the
 * arguments decoded, coeffs, pred_order, qlevel and len.
 */
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
/**
 * Install x86-optimised routines into a FLAC DSP context.
 *
 * When external assembly is available (HAVE_YASM), the running CPU reports
 * SSE4 and the stream uses more than 16 bits per sample, the lpc function
 * pointer is replaced with the SSE4 implementation. Otherwise the context
 * is left untouched.
 *
 * @param c    DSP context whose function pointers may be overridden
 * @param fmt  sample format (not consulted here)
 * @param bps  bits per sample of the stream
 */
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
                                 int bps)
{
#if HAVE_YASM
    const int flags = av_get_cpu_flags();

    /* The SSE4 routine is only selected for the > 16-bit path. */
    if (EXTERNAL_SSE4(flags) && bps > 16)
        c->lpc = ff_flac_lpc_32_sse4;
#endif
}