From b385c4c6a316f798b6a14418f09e545cda7fef7f Mon Sep 17 00:00:00 2001 From: James Almer Date: Thu, 6 Nov 2014 20:43:06 -0300 Subject: [PATCH] x86/swr: replace sse4 instructions in pack_6ch with sse ones There's no benefit from using blendps here except on CPUs with AVX, where it's faster than shufps according to Intel's documentation. As such, rename the sse4 functions to sse/sse2 and use shufps instead. Reviewed-by: Michael Niedermayer Signed-off-by: James Almer --- libswresample/x86/audio_convert.asm | 31 +++++++++++++++++--------- libswresample/x86/audio_convert_init.c | 23 ++++++++++--------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm index b6e9e5d79d..d77e93439b 100644 --- a/libswresample/x86/audio_convert.asm +++ b/libswresample/x86/audio_convert.asm @@ -245,15 +245,27 @@ pack_6ch_%2_to_%1_u_int %+ SUFFIX mov%3 m4, [srcq+src4q] mov%3 m5, [srcq+src5q] %7 x,x,x,x,m7,x -%if cpuflag(sse4) +%if cpuflag(sse) SBUTTERFLYPS 0, 1, 6 SBUTTERFLYPS 2, 3, 6 SBUTTERFLYPS 4, 5, 6 +%if cpuflag(avx) blendps m6, m4, m0, 1100b +%else + movaps m6, m4 + shufps m4, m0, q3210 + SWAP 4,6 +%endif movlhps m0, m2 movhlps m4, m2 +%if cpuflag(avx) blendps m2, m5, m1, 1100b +%else + movaps m2, m5 + shufps m5, m1, q3210 + SWAP 2,5 +%endif movlhps m1, m3 movhlps m5, m3 @@ -380,6 +392,10 @@ CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N +INIT_XMM sse +PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N +PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N + INIT_XMM sse2 CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N @@ -431,6 +447,10 @@ UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT +PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT +PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT +PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT INIT_XMM ssse3 UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N @@ -440,15 +460,6 @@ UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT -INIT_XMM sse4 -PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N -PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N - -PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT -PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT -PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT -PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT - %if HAVE_AVX_EXTERNAL INIT_XMM avx PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N diff --git a/libswresample/x86/audio_convert_init.c b/libswresample/x86/audio_convert_init.c index a26cdf6ea6..769575d0fc 100644 --- a/libswresample/x86/audio_convert_init.c +++ b/libswresample/x86/audio_convert_init.c @@ -58,7 +58,12 @@ MULTI_CAPS_FUNC(SSE2, sse2) ac->simd_f = ff_pack_6ch_float_to_float_a_mmx; } } - + if(EXTERNAL_SSE(mm_flags)) { + if(channels == 6) { + if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) + ac->simd_f = ff_pack_6ch_float_to_float_a_sse; + } + } if(EXTERNAL_SSE2(mm_flags)) { if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) ac->simd_f = ff_int32_to_float_a_sse2; @@ -105,6 +110,12 @@ MULTI_CAPS_FUNC(SSE2, sse2) if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLT) ac->simd_f = ff_unpack_2ch_float_to_int16_a_sse2; } + if(channels == 6) { + if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) + ac->simd_f = ff_pack_6ch_int32_to_float_a_sse2; + if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) + ac->simd_f = ff_pack_6ch_float_to_int32_a_sse2; + } } if(EXTERNAL_SSSE3(mm_flags)) { if(channels == 2) { @@ -116,16 +127,6 @@ MULTI_CAPS_FUNC(SSE2, sse2) ac->simd_f = ff_unpack_2ch_int16_to_float_a_ssse3; } } - if(EXTERNAL_SSE4(mm_flags)) { - if(channels == 6) { - if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) - ac->simd_f = ff_pack_6ch_float_to_float_a_sse4; - if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) - ac->simd_f = ff_pack_6ch_int32_to_float_a_sse4; - if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) - ac->simd_f = ff_pack_6ch_float_to_int32_a_sse4; - } - } if(EXTERNAL_AVX(mm_flags)) { if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) ac->simd_f = ff_int32_to_float_a_avx;