diff --git a/configure b/configure index 0bd20e7e65..8a1a1b8584 100755 --- a/configure +++ b/configure @@ -136,13 +136,9 @@ Component options: --disable-w32threads disable Win32 threads [autodetect] --disable-os2threads disable OS/2 threads [autodetect] --disable-network disable network support [no] - --disable-dct disable DCT code --disable-dwt disable DWT code --disable-error-resilience disable error resilience code --disable-lsp disable LSP code - --disable-mdct disable MDCT code - --disable-rdft disable RDFT code - --disable-fft disable FFT code --disable-faan disable floating point AAN (I)DCT code --disable-pixelutils disable pixel utils in libavutil @@ -2004,17 +2000,13 @@ PROGRAM_LIST=" " SUBSYSTEM_LIST=" - dct dwt error_resilience faan fast_unaligned - fft lsp - mdct pixelutils network - rdft " # COMPONENT_LIST needs to come last to ensure correct dependency checking @@ -2766,7 +2758,6 @@ cbs_h266_select="cbs" cbs_jpeg_select="cbs" cbs_mpeg2_select="cbs" cbs_vp9_select="cbs" -dct_select="rdft" deflate_wrapper_deps="zlib" dirac_parse_select="golomb" dovi_rpu_select="golomb" @@ -2786,7 +2777,6 @@ frame_thread_encoder_deps="encoders threads" inflate_wrapper_deps="zlib" intrax8_select="blockdsp wmv2dsp" iso_media_select="mpeg4audio" -mdct_select="fft" me_cmp_select="idctdsp" mpeg_er_select="error_resilience" mpegaudio_select="mpegaudiodsp mpegaudioheader" @@ -2796,7 +2786,6 @@ mpegvideoenc_select="aandcttables fdctdsp me_cmp mpegvideo pixblockdsp" msmpeg4dec_select="h263_decoder" msmpeg4enc_select="h263_encoder" vc1dsp_select="h264chroma qpeldsp startcode" -rdft_select="fft" # decoders / encoders aac_decoder_select="adts_header mpeg4audio sinewin" diff --git a/doc/mips.txt b/doc/mips.txt index a84e89ae79..d66ce3b447 100644 --- a/doc/mips.txt +++ b/doc/mips.txt @@ -48,11 +48,6 @@ Files that have MIPS copyright notice in them: float_dsp_mips.c libm_mips.h softfloat_tables.h -* libavcodec/ - fft_fixed_32.c - fft_init_table.c - fft_table.h - mdct_fixed_32.c * libavcodec/mips/ aacdec_fixed.c aacsbr_fixed.c @@ -70,9 +65,6 @@ Files that have MIPS copyright notice in them: compute_antialias_float.h lsp_mips.h dsputil_mips.c - fft_mips.c - fft_table.h - fft_init_table.c fmtconvert_mips.c iirfilter_mips.c mpegaudiodsp_mips_fixed.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 0a3a8fcdf9..42af4e49aa 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -32,6 +32,7 @@ OBJS = ac3_parser.o \ allcodecs.o \ avcodec.o \ avdct.o \ + avfft.o \ avpacket.o \ bitstream.o \ bitstream_filters.o \ @@ -81,7 +82,6 @@ OBJS-$(CONFIG_CBS_JPEG) += cbs_jpeg.o OBJS-$(CONFIG_CBS_MPEG2) += cbs_mpeg2.o OBJS-$(CONFIG_CBS_VP9) += cbs_vp9.o OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o -OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o OBJS-$(CONFIG_DEFLATE_WRAPPER) += zlib_wrapper.o OBJS-$(CONFIG_DOVI_RPU) += dovi_rpu.o OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o @@ -90,9 +90,6 @@ OBJS-$(CONFIG_EXIF) += exif.o tiff_common.o OBJS-$(CONFIG_FAANDCT) += faandct.o OBJS-$(CONFIG_FAANIDCT) += faanidct.o OBJS-$(CONFIG_FDCTDSP) += fdctdsp.o jfdctfst.o jfdctint.o -FFT-OBJS-$(CONFIG_HARDCODED_TABLES) += cos_tables.o -OBJS-$(CONFIG_FFT) += avfft.o fft_float.o fft_fixed_32.o \ - fft_init_table.o $(FFT-OBJS-yes) OBJS-$(CONFIG_FMTCONVERT) += fmtconvert.o OBJS-$(CONFIG_GOLOMB) += golomb.o OBJS-$(CONFIG_H263DSP) += h263dsp.o @@ -125,7 +122,6 @@ OBJS-$(CONFIG_LLVIDENCDSP) += lossless_videoencdsp.o OBJS-$(CONFIG_LPC) += lpc.o OBJS-$(CONFIG_LSP) += lsp.o OBJS-$(CONFIG_LZF) += lzf.o -OBJS-$(CONFIG_MDCT) += mdct_float.o mdct_fixed_32.o OBJS-$(CONFIG_ME_CMP) += me_cmp.o OBJS-$(CONFIG_MEDIACODEC) += mediacodecdec_common.o mediacodec_surface.o mediacodec_wrapper.o mediacodec_sw_buffer.o OBJS-$(CONFIG_MPEG_ER) += mpeg_er.o @@ -157,7 +153,6 @@ OBJS-$(CONFIG_QSV) += qsv.o OBJS-$(CONFIG_QSVDEC) += qsvdec.o OBJS-$(CONFIG_QSVENC) += qsvenc.o OBJS-$(CONFIG_RANGECODER) += rangecoder.o -OBJS-$(CONFIG_RDFT) += rdft.o OBJS-$(CONFIG_RV34DSP) += rv34dsp.o OBJS-$(CONFIG_SINEWIN) += sinewin.o OBJS-$(CONFIG_SNAPPY) += snappy.o @@ -1326,8 +1321,6 @@ TESTPROGS = avcodec \ TESTPROGS-$(CONFIG_AV1_VAAPI_ENCODER) += av1_levels TESTPROGS-$(CONFIG_CABAC) += cabac -TESTPROGS-$(CONFIG_DCT) += avfft -TESTPROGS-$(CONFIG_FFT) += fft fft-fixed32 TESTPROGS-$(CONFIG_GOLOMB) += golomb TESTPROGS-$(CONFIG_IDCTDSP) += dct TESTPROGS-$(CONFIG_IIRFILTER) += iirfilter @@ -1347,7 +1340,6 @@ HOSTPROGS = aacps_tablegen \ aacps_fixed_tablegen \ cbrt_tablegen \ cbrt_fixed_tablegen \ - cos_tablegen \ dv_tablegen \ motionpixels_tablegen \ mpegaudio_tablegen \ @@ -1362,12 +1354,6 @@ CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF) $(SUBDIR)tests/dct$(EXESUF): $(SUBDIR)dctref.o $(SUBDIR)aandcttab.o $(SUBDIR)dv_tablegen$(HOSTEXESUF): $(SUBDIR)dvdata_host.o -TRIG_TABLES = cos cos_fixed sin -TRIG_TABLES := $(TRIG_TABLES:%=$(SUBDIR)%_tables.c) - -$(TRIG_TABLES): $(SUBDIR)%_tables.c: $(SUBDIR)cos_tablegen$(HOSTEXESUF) - $(M)./$< $* > $@ - ifdef CONFIG_SMALL $(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=1 else diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index cb428b49e0..beb6a02f5f 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -1,5 +1,4 @@ # subsystems -OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o @@ -36,7 +35,6 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o # subsystems NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o -NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ @@ -47,7 +45,6 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ aarch64/simple_idct_neon.o -NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h deleted file mode 100644 index fc38eed298..0000000000 --- a/libavcodec/aarch64/asm-offsets.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H -#define AVCODEC_AARCH64_ASM_OFFSETS_H - -/* FFTContext */ -#define IMDCT_HALF 0x48 - -#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */ diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c deleted file mode 100644 index 77f5607960..0000000000 --- a/libavcodec/aarch64/fft_init_aarch64.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/aarch64/cpu.h" - -#include "libavcodec/fft.h" - -void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); -void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -av_cold void ff_fft_init_aarch64(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - if (s->nbits < 17) { - s->fft_permute = ff_fft_permute_neon; - s->fft_calc = ff_fft_calc_neon; - } -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; -#endif - } -} diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S deleted file mode 100644 index d7225511dd..0000000000 --- a/libavcodec/aarch64/fft_neon.S +++ /dev/null @@ -1,447 +0,0 @@ -/* - * ARM NEON optimised FFT - * - * Copyright (c) 2009 Mans Rullgard - * Copyright (c) 2009 Naotoshi Nojiri - * Copyright (c) 2014 Janne Grunau - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/aarch64/asm.S" - -#define M_SQRT1_2 0.70710678118654752440 - -.macro transpose d0, d1, s0, s1 - trn1 \d0, \s0, \s1 - trn2 \d1, \s0, \s1 -.endm - - -function fft4_neon - AARCH64_VALID_JUMP_TARGET - ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] - - fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1 - fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1 - - ext v16.8b, v2.8b, v3.8b, #4 - ext v17.8b, v3.8b, v2.8b, #4 - - fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3 - fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3 - - fadd v0.2s, v4.2s, v5.2s - fsub v2.2s, v4.2s, v5.2s - fadd v1.2s, v6.2s, v7.2s - fsub v3.2s, v6.2s, v7.2s - - st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] - - ret -endfunc - -function fft8_neon - AARCH64_VALID_JUMP_TARGET - mov x1, x0 - ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 - ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] - ext v22.8b, v2.8b, v3.8b, #4 - ext v23.8b, v3.8b, v2.8b, #4 - fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 - fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 - fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 - fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 - rev64 v27.2s, v28.2s // ??? - fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 - fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 - fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w - ext v6.8b, v4.8b, v5.8b, #4 - ext v7.8b, v5.8b, v4.8b, #4 - fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w - fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 - fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 - fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w - fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w - fadd v0.2s, v20.2s, v21.2s - fsub v2.2s, v20.2s, v21.2s - fadd v1.2s, v22.2s, v23.2s - rev64 v26.2s, v26.2s - rev64 v27.2s, v27.2s - fsub v3.2s, v22.2s, v23.2s - fsub v6.2s, v6.2s, v7.2s - fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 - fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 - fadd v7.2s, v4.2s, v5.2s - fsub v18.2s, v2.2s, v6.2s - ext v26.8b, v24.8b, v25.8b, #4 - ext v27.8b, v25.8b, v24.8b, #4 - fadd v2.2s, v2.2s, v6.2s - fsub v16.2s, v0.2s, v7.2s - fadd v5.2s, v25.2s, v24.2s - fsub v4.2s, v26.2s, v27.2s - fadd v0.2s, v0.2s, v7.2s - fsub v17.2s, v1.2s, v5.2s - fsub v19.2s, v3.2s, v4.2s - fadd v3.2s, v3.2s, v4.2s - fadd v1.2s, v1.2s, v5.2s - - st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] - st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] - - ret -endfunc - -function fft16_neon - AARCH64_VALID_JUMP_TARGET - mov x1, x0 - ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 - ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 - ext v22.8b, v2.8b, v3.8b, #4 - ext v23.8b, v3.8b, v2.8b, #4 - fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 - fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 - fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 - fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 - rev64 v27.2s, v28.2s // ??? - fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 - fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 - fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w - ext v6.8b, v4.8b, v5.8b, #4 - ext v7.8b, v5.8b, v4.8b, #4 - fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w - fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 - fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 - fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w - fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w - fadd v0.2s, v20.2s, v21.2s - fsub v2.2s, v20.2s, v21.2s - fadd v1.2s, v22.2s, v23.2s - rev64 v26.2s, v26.2s - rev64 v27.2s, v27.2s - fsub v3.2s, v22.2s, v23.2s - fsub v6.2s, v6.2s, v7.2s - fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 - fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 - fadd v7.2s, v4.2s, v5.2s - fsub v18.2s, v2.2s, v6.2s - ld1 {v20.4s,v21.4s}, [x0], #32 - ld1 {v22.4s,v23.4s}, [x0], #32 - ext v26.8b, v24.8b, v25.8b, #4 - ext v27.8b, v25.8b, v24.8b, #4 - fadd v2.2s, v2.2s, v6.2s - fsub v16.2s, v0.2s, v7.2s - fadd v5.2s, v25.2s, v24.2s - fsub v4.2s, v26.2s, v27.2s - transpose v24.2d, v25.2d, v20.2d, v22.2d - transpose v26.2d, v27.2d, v21.2d, v23.2d - fadd v0.2s, v0.2s, v7.2s - fsub v17.2s, v1.2s, v5.2s - fsub v19.2s, v3.2s, v4.2s - fadd v3.2s, v3.2s, v4.2s - fadd v1.2s, v1.2s, v5.2s - ext v20.16b, v21.16b, v21.16b, #4 - ext v21.16b, v23.16b, v23.16b, #4 - - zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]} - zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]} - zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]} - zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]} - - // 2 x fft4 - transpose v22.2d, v23.2d, v20.2d, v21.2d - - fadd v4.4s, v24.4s, v25.4s - fadd v5.4s, v26.4s, v27.4s - fsub v6.4s, v24.4s, v25.4s - fsub v7.4s, v22.4s, v23.4s - - ld1 {v23.4s}, [x14] - - fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]} - fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]} - fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]} - fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]} - - //fft_pass_neon_16 - rev64 v7.4s, v25.4s - fmul v25.4s, v25.4s, v23.s[1] - fmul v7.4s, v7.4s, v29.4s - fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a} - - zip1 v20.4s, v24.4s, v25.4s - zip2 v21.4s, v24.4s, v25.4s - fneg v22.4s, v20.4s - fadd v4.4s, v21.4s, v20.4s - fsub v6.4s, v20.4s, v21.4s // just the second half - fadd v5.4s, v21.4s, v22.4s // just the first half - - tbl v4.16b, {v4.16b}, v30.16b // trans4_float - tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float - - fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]} - fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]} - fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]} - fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]} - -//second half - rev64 v6.4s, v26.4s - fmul v26.4s, v26.4s, v23.s[2] - rev64 v7.4s, v27.4s - fmul v27.4s, v27.4s, v23.s[3] - fmul v6.4s, v6.4s, v29.4s - fmul v7.4s, v7.4s, v29.4s - fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6} - fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a} - - zip1 v24.4s, v26.4s, v27.4s - zip2 v25.4s, v26.4s, v27.4s - fneg v26.4s, v24.4s - fadd v4.4s, v25.4s, v24.4s - fsub v6.4s, v24.4s, v25.4s // just the second half - fadd v5.4s, v25.4s, v26.4s // just the first half - - tbl v4.16b, {v4.16b}, v30.16b // trans4_float - tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float - - fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]} - fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]} - fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]} - fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]} - - st1 {v16.4s,v17.4s}, [x1], #32 - st1 {v18.4s,v19.4s}, [x1], #32 - st1 {v20.4s,v21.4s}, [x1], #32 - st1 {v22.4s,v23.4s}, [x1], #32 - - ret -endfunc - - -const trans4_float, align=4 - .byte 0, 1, 2, 3 - .byte 8, 9, 10, 11 - .byte 4, 5, 6, 7 - .byte 12, 13, 14, 15 -endconst - -const trans8_float, align=4 - .byte 24, 25, 26, 27 - .byte 0, 1, 2, 3 - .byte 28, 29, 30, 31 - .byte 4, 5, 6, 7 -endconst - -function fft_pass_neon - sub x6, x2, #1 // n - 1, loop counter - lsl x5, x2, #3 // 2 * n * sizeof FFTSample - lsl x1, x2, #4 // 2 * n * sizeof FFTComplex - add x5, x4, x5 // wim - add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex - add x2, x0, x2, lsl #5 // &z[o2] - add x3, x0, x3 // &z[o3] - add x1, x0, x1 // &z[o1] - ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} - ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} - ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} - trn2 v25.2d, v20.2d, v22.2d - sub x5, x5, #4 // wim-- - trn1 v24.2d, v20.2d, v22.2d - ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1] - rev64 v7.4s, v25.4s - fmul v25.4s, v25.4s, v4.s[1] - ld1 {v16.4s}, [x0] // {z[0],z[1]} - fmul v7.4s, v7.4s, v29.4s - ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]} - prfm pldl1keep, [x2, #16] - prfm pldl1keep, [x3, #16] - fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} - prfm pldl1keep, [x0, #16] - prfm pldl1keep, [x1, #16] - - zip1 v20.4s, v24.4s, v25.4s - zip2 v21.4s, v24.4s, v25.4s - fneg v22.4s, v20.4s - fadd v4.4s, v21.4s, v20.4s - fsub v6.4s, v20.4s, v21.4s // just the second half - fadd v5.4s, v21.4s, v22.4s // just the first half - - tbl v4.16b, {v4.16b}, v30.16b // trans4_float - tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float - - fadd v20.4s, v16.4s, v4.4s - fsub v22.4s, v16.4s, v4.4s - fadd v21.4s, v17.4s, v5.4s - st1 {v20.4s}, [x0], #16 // {z[0], z[1]} - fsub v23.4s, v17.4s, v5.4s - - st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} - st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} - st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} -1: - ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} - ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} - ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} - transpose v26.2d, v27.2d, v20.2d, v22.2d - ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]} - rev64 v6.4s, v26.4s - fmul v26.4s, v26.4s, v4.s[0] - rev64 v7.4s, v27.4s - fmul v27.4s, v27.4s, v4.s[1] - fmul v6.4s, v6.4s, v29.4s - fmul v7.4s, v7.4s, v29.4s - ld1 {v16.4s},[x0] // {z[0],z[1]} - fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6} - fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} - ld1 {v17.4s},[x1] // {z[o1],z[o1+1]} - - subs x6, x6, #1 // n-- - - zip1 v20.4s, v26.4s, v27.4s - zip2 v21.4s, v26.4s, v27.4s - fneg v22.4s, v20.4s - fadd v4.4s, v21.4s, v20.4s - fsub v6.4s, v20.4s, v21.4s // just the second half - fadd v5.4s, v21.4s, v22.4s // just the first half - - tbl v4.16b, {v4.16b}, v30.16b // trans4_float - tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float - - fadd v20.4s, v16.4s, v4.4s - fsub v22.4s, v16.4s, v4.4s - fadd v21.4s, v17.4s, v5.4s - st1 {v20.4s}, [x0], #16 // {z[0], z[1]} - fsub v23.4s, v17.4s, v5.4s - - st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} - st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} - st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} - b.ne 1b - - ret -endfunc - -.macro def_fft n, n2, n4 -function fft\n\()_neon, align=6 - AARCH64_VALID_JUMP_TARGET - AARCH64_SIGN_LINK_REGISTER - stp x28, x30, [sp, #-16]! - add x28, x0, #\n4*2*8 - bl fft\n2\()_neon - mov x0, x28 - bl fft\n4\()_neon - add x0, x28, #\n4*1*8 - bl fft\n4\()_neon - sub x0, x28, #\n4*2*8 - ldp x28, x30, [sp], #16 - AARCH64_VALIDATE_LINK_REGISTER - movrel x4, X(ff_cos_\n) - mov x2, #\n4>>1 - b fft_pass_neon -endfunc -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 - -function ff_fft_calc_neon, export=1 - prfm pldl1keep, [x1] - movrel x10, trans4_float - ldr w2, [x0] - movrel x11, trans8_float - sub w2, w2, #2 - movrel x3, fft_tab_neon - ld1 {v30.16b}, [x10] - mov x7, #-8 - movrel x12, pmmp - ldr x3, [x3, x2, lsl #3] - movrel x13, mppm - movrel x14, X(ff_cos_16) - ld1 {v31.16b}, [x11] - mov x0, x1 - ld1 {v29.4s}, [x12] // pmmp - ld1 {v28.4s}, [x13] - br x3 -endfunc - -function ff_fft_permute_neon, export=1 - mov x6, #1 - ldr w2, [x0] // nbits - ldr x3, [x0, #16] // tmp_buf - ldr x0, [x0, #8] // revtab - lsl x6, x6, x2 - mov x2, x6 -1: - ld1 {v0.2s,v1.2s}, [x1], #16 - ldr w4, [x0], #4 - uxth w5, w4 - lsr w4, w4, #16 - add x5, x3, x5, lsl #3 - add x4, x3, x4, lsl #3 - st1 {v0.2s}, [x5] - st1 {v1.2s}, [x4] - subs x6, x6, #2 - b.gt 1b - - sub x1, x1, x2, lsl #3 -1: - ld1 {v0.4s,v1.4s}, [x3], #32 - st1 {v0.4s,v1.4s}, [x1], #32 - subs x2, x2, #4 - b.gt 1b - - ret -endfunc - -const fft_tab_neon, relocate=1 - .quad fft4_neon - .quad fft8_neon - .quad fft16_neon - .quad fft32_neon - .quad fft64_neon - .quad fft128_neon - .quad fft256_neon - .quad fft512_neon - .quad fft1024_neon - .quad fft2048_neon - .quad fft4096_neon - .quad fft8192_neon - .quad fft16384_neon - .quad fft32768_neon - .quad fft65536_neon -endconst - -const pmmp, align=4 - .float +1.0, -1.0, -1.0, +1.0 -endconst - -const mppm, align=4 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -endconst diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S deleted file mode 100644 index 98b09bf1ab..0000000000 --- a/libavcodec/aarch64/mdct_neon.S +++ /dev/null @@ -1,326 +0,0 @@ -/* - * AArch64 NEON optimised MDCT - * Copyright (c) 2009 Mans Rullgard - * Copyright (c) 2014 Janne Grunau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/aarch64/asm.S" - -function ff_imdct_half_neon, export=1 - stp x19, x20, [sp, #-32]! - AARCH64_SIGN_LINK_REGISTER - str x30, [sp, #16] - mov x12, #1 - ldr w14, [x0, #28] // mdct_bits - ldr x4, [x0, #32] // tcos - ldr x3, [x0, #8] // revtab - lsl x12, x12, x14 // n = 1 << nbits - lsr x14, x12, #2 // n4 = n >> 2 - add x7, x2, x12, lsl #1 - mov x12, #-16 - sub x7, x7, #16 - - ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0 - ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x - rev64 v17.2s, v17.2s - ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 - fmul v6.2s, v17.2s, v2.2s - fmul v7.2s, v0.2s, v2.2s -1: - subs x14, x14, #2 - ldr w6, [x3], #4 - fmul v4.2s, v0.2s, v3.2s - fmul v5.2s, v17.2s, v3.2s - fsub v4.2s, v6.2s, v4.2s - fadd v5.2s, v5.2s, v7.2s - ubfm x8, x6, #16, #31 - ubfm x6, x6, #0, #15 - add x8, x1, x8, lsl #3 - add x6, x1, x6, lsl #3 - b.eq 2f - ld2 {v16.2s,v17.2s}, [x7], x12 - ld2 {v0.2s,v1.2s}, [x2], #16 - rev64 v17.2s, v17.2s - ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 - fmul v6.2s, v17.2s, v2.2s - fmul v7.2s, v0.2s, v2.2s - st2 {v4.s,v5.s}[0], [x6] - st2 {v4.s,v5.s}[1], [x8] - b 1b -2: - st2 {v4.s,v5.s}[0], [x6] - st2 {v4.s,v5.s}[1], [x8] - - mov x19, x0 - mov x20, x1 - bl X(ff_fft_calc_neon) - - mov x12, #1 - ldr w14, [x19, #28] // mdct_bits - ldr x4, [x19, #32] // tcos - lsl x12, x12, x14 // n = 1 << nbits - lsr x14, x12, #3 // n8 = n >> 3 - - add x4, x4, x14, lsl #3 - add x6, x20, x14, lsl #3 - sub x1, x4, #16 - sub x3, x6, #16 - - mov x7, #-16 - mov x8, x6 - mov x0, x3 - - ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0 - ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3 - ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 -3: - subs x14, x14, #2 - fmul v7.2s, v0.2s, v17.2s - ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3 - fmul v4.2s, v1.2s, v17.2s - fmul v6.2s, v21.2s, v19.2s - fmul v5.2s, v20.2s, v19.2s - fmul v22.2s, v1.2s, v16.2s - fmul v23.2s, v21.2s, v18.2s - fmul v24.2s, v0.2s, v16.2s - fmul v25.2s, v20.2s, v18.2s - fadd v7.2s, v7.2s, v22.2s - fadd v5.2s, v5.2s, v23.2s - fsub v4.2s, v4.2s, v24.2s - fsub v6.2s, v6.2s, v25.2s - b.eq 4f - ld2 {v0.2s,v1.2s}, [x3], x7 - ld2 {v20.2s,v21.2s},[x6], #16 - ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 - rev64 v5.2s, v5.2s - rev64 v7.2s, v7.2s - st2 {v4.2s,v5.2s}, [x0], x7 - st2 {v6.2s,v7.2s}, [x8], #16 - b 3b -4: - rev64 v5.2s, v5.2s - rev64 v7.2s, v7.2s - st2 {v4.2s,v5.2s}, [x0] - st2 {v6.2s,v7.2s}, [x8] - - ldr x30, [sp, #16] - AARCH64_VALIDATE_LINK_REGISTER - ldp x19, x20, [sp], #32 - - ret -endfunc - -function ff_imdct_calc_neon, export=1 - stp x19, x20, [sp, #-32]! - AARCH64_SIGN_LINK_REGISTER - str x30, [sp, #16] - ldr w3, [x0, #28] // mdct_bits - mov x19, #1 - mov x20, x1 - lsl x19, x19, x3 - add x1, x1, x19 - - bl X(ff_imdct_half_neon) - - add x0, x20, x19, lsl #2 - add x1, x20, x19, lsl #1 - sub x0, x0, #8 - sub x2, x1, #16 - mov x3, #-16 - mov x6, #-8 -1: - ld1 {v0.4s}, [x2], x3 - prfum pldl1keep, [x0, #-16] - rev64 v0.4s, v0.4s - ld1 {v2.2s,v3.2s}, [x1], #16 - fneg v4.4s, v0.4s - prfum pldl1keep, [x2, #-16] - rev64 v2.2s, v2.2s - rev64 v3.2s, v3.2s - ext v4.16b, v4.16b, v4.16b, #8 - st1 {v2.2s}, [x0], x6 - st1 {v3.2s}, [x0], x6 - st1 {v4.4s}, [x20], #16 - subs x19, x19, #16 - b.gt 1b - - ldr x30, [sp, #16] - AARCH64_VALIDATE_LINK_REGISTER - ldp x19, x20, [sp], #32 - - ret -endfunc - - -function ff_mdct_calc_neon, export=1 - stp x19, x20, [sp, #-32]! - AARCH64_SIGN_LINK_REGISTER - str x30, [sp, #16] - - mov x12, #1 - ldr w14, [x0, #28] // mdct_bits - ldr x4, [x0, #32] // tcos - ldr x3, [x0, #8] // revtab - lsl x14, x12, x14 // n = 1 << nbits - add x7, x2, x14 // in4u - sub x9, x7, #16 // in4d - add x2, x7, x14, lsl #1 // in3u - add x8, x9, x14, lsl #1 // in3d - add x5, x4, x14, lsl #1 - sub x5, x5, #16 - sub x3, x3, #4 - mov x12, #-16 - lsr x13, x14, #1 - - ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 - ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 - ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 - rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 - rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 - ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 - fsub v0.2s, v17.2s, v0.2s // in4d-in4u I - ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 - rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 - rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 - ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 - fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R - fsub v16.2s, v16.2s, v1.2s // in0u-in2d R - fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I -1: - fmul v7.2s, v0.2s, v21.2s // I*s - ldr w10, [x3, x13] - fmul v6.2s, v2.2s, v20.2s // -R*c - ldr w6, [x3, #4]! - fmul v4.2s, v2.2s, v21.2s // -R*s - fmul v5.2s, v0.2s, v20.2s // I*c - fmul v24.2s, v16.2s, v30.2s // R*c - fmul v25.2s, v18.2s, v31.2s // -I*s - fmul v22.2s, v16.2s, v31.2s // R*s - fmul v23.2s, v18.2s, v30.2s // I*c - subs x14, x14, #16 - subs x13, x13, #8 - fsub v6.2s, v6.2s, v7.2s // -R*c-I*s - fadd v7.2s, v4.2s, v5.2s // -R*s+I*c - fsub v24.2s, v25.2s, v24.2s // I*s-R*c - fadd v25.2s, v22.2s, v23.2s // R*s-I*c - b.eq 1f - mov x12, #-16 - ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 - ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 - fneg v7.2s, v7.2s // R*s-I*c - ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 - rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 - rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 - ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 - fsub v0.2s, v17.2s, v0.2s // in4d-in4u I - ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 - rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 - rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 - ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 - fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R - fsub v16.2s, v16.2s, v1.2s // in0u-in2d R - fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I - ubfm x12, x6, #16, #31 - ubfm x6, x6, #0, #15 - add x12, x1, x12, lsl #3 - add x6, x1, x6, lsl #3 - st2 {v6.s,v7.s}[0], [x6] - st2 {v6.s,v7.s}[1], [x12] - ubfm x6, x10, #16, #31 - ubfm x10, x10, #0, #15 - add x6 , x1, x6, lsl #3 - add x10, x1, x10, lsl #3 - st2 {v24.s,v25.s}[0], [x10] - st2 {v24.s,v25.s}[1], [x6] - b 1b -1: - fneg v7.2s, v7.2s // R*s-I*c - ubfm x12, x6, #16, #31 - ubfm x6, x6, #0, #15 - add x12, x1, x12, lsl #3 - add x6, x1, x6, lsl #3 - st2 {v6.s,v7.s}[0], [x6] - st2 {v6.s,v7.s}[1], [x12] - ubfm x6, x10, #16, #31 - ubfm x10, x10, #0, #15 - add x6 , x1, x6, lsl #3 - add x10, x1, x10, lsl #3 - st2 {v24.s,v25.s}[0], [x10] - st2 {v24.s,v25.s}[1], [x6] - - mov x19, x0 - mov x20, x1 - bl X(ff_fft_calc_neon) - - mov x12, #1 - ldr w14, [x19, #28] // mdct_bits - ldr x4, [x19, #32] // tcos - lsl x12, x12, x14 // n = 1 << nbits - lsr x14, x12, #3 // n8 = n >> 3 - - add x4, x4, x14, lsl #3 - add x6, x20, x14, lsl #3 - sub x1, x4, #16 - sub x3, x6, #16 - - mov x7, #-16 - mov x8, x6 - mov x0, x3 - - ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0 - ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3 - ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 -1: - subs x14, x14, #2 - fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0 - ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3 - fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0 - fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3 - fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3 - fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0 - fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3 - fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3 - fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0 - fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0 - fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3 - fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3 - fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0 - fneg v4.2s, v4.2s - fneg v6.2s, v6.2s - b.eq 1f - ld2 {v0.2s, v1.2s}, [x3], x7 - ld2 {v20.2s,v21.2s}, [x6], #16 - ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 - rev64 v5.2s, v5.2s - rev64 v7.2s, v7.2s - st2 {v4.2s,v5.2s}, [x0], x7 - st2 {v6.2s,v7.2s}, [x8], #16 - b 1b -1: - rev64 v5.2s, v5.2s - rev64 v7.2s, v7.2s - st2 {v4.2s,v5.2s}, [x0] - st2 {v6.2s,v7.2s}, [x8] - - ldr x30, [sp, #16] - AARCH64_VALIDATE_LINK_REGISTER - ldp x19, x20, [sp], #32 - - ret -endfunc diff --git a/libavcodec/aarch64/synth_filter_init.c b/libavcodec/aarch64/synth_filter_init.c index 6b6da35b54..aea6aaf419 100644 --- a/libavcodec/aarch64/synth_filter_init.c +++ b/libavcodec/aarch64/synth_filter_init.c @@ -23,15 +23,8 @@ #include "libavutil/aarch64/cpu.h" #include "libavutil/attributes.h" #include "libavutil/internal.h" -#include "libavcodec/fft.h" #include "libavcodec/synth_filter.h" -#include "asm-offsets.h" - -#if HAVE_NEON -AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); -#endif - void ff_synth_filter_float_neon(AVTXContext *imdct, float *synth_buf_ptr, int *synth_buf_offset, float synth_buf2[32], const float window[512], diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S index 259fa6e66c..63aefcb56e 100644 --- a/libavcodec/aarch64/synth_filter_neon.S +++ b/libavcodec/aarch64/synth_filter_neon.S @@ -19,8 +19,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "asm-offsets.h" - #include "libavutil/aarch64/asm.S" .macro inner_loop diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 5d284bdc01..becf316eb6 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -5,7 +5,6 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ arm/ac3dsp_arm.o OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o -OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o @@ -25,7 +24,6 @@ OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o -OBJS-$(CONFIG_RDFT) += arm/rdft_init_arm.o OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_arm.o OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o @@ -90,9 +88,7 @@ ARMV6-OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_armv6.o # VFP optimizations # subsystems -VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o -VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o # decoders/encoders VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o @@ -107,7 +103,6 @@ NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \ arm/int_neon.o NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \ arm/blockdsp_neon.o -NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o @@ -121,10 +116,8 @@ NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \ NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \ arm/idctdsp_neon.o \ arm/simple_idct_neon.o -NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_neon.o -NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o NEON-OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_neon.o \ arm/vc1dsp_neon.o NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c deleted file mode 100644 index 8ae22dfb4e..0000000000 --- a/libavcodec/arm/fft_init_arm.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" - -#include "libavcodec/fft.h" - -void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); - -void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); -void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); - -void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -av_cold void ff_fft_init_arm(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp_vm(cpu_flags)) { - s->fft_calc = ff_fft_calc_vfp; -#if CONFIG_MDCT - s->imdct_half = ff_imdct_half_vfp; -#endif - } - - if (have_neon(cpu_flags)) { -#if CONFIG_FFT - if (s->nbits < 17) { - s->fft_permute = ff_fft_permute_neon; - s->fft_calc = ff_fft_calc_neon; - } -#endif -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; -#endif - } -} diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S deleted file mode 100644 index 48f8dfc424..0000000000 --- a/libavcodec/arm/fft_neon.S +++ /dev/null @@ -1,375 +0,0 @@ -/* - * ARM NEON optimised FFT - * - * Copyright (c) 2009 Mans Rullgard - * Copyright (c) 2009 Naotoshi Nojiri - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define M_SQRT1_2 0.70710678118654752440 - - -function fft4_neon - vld1.32 {d0-d3}, [r0,:128] - - vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 - vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 - vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 - vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 - vadd.f32 d1, d6, d7 - vsub.f32 d3, d6, d7 - vadd.f32 d0, d4, d5 - vsub.f32 d2, d4, d5 - - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft8_neon - mov r1, r0 - vld1.32 {d0-d3}, [r1,:128]! - vld1.32 {d16-d19}, [r1,:128] - - movw r2, #0x04f3 @ sqrt(1/2) - movt r2, #0x3f35 - eor r3, r2, #1<<31 - vdup.32 d31, r2 - - vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 - vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 - vmov d28, r3, r2 - vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 - vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 - vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 - vrev64.32 d29, d28 - vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 - vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w - vext.32 q3, q2, q2, #1 - vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w - vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 - vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 - vmul.f32 d24, d17, d31 @ a2r*w,a2i*w - vmul.f32 d25, d19, d31 @ a3r*w,a3i*w - vadd.f32 d0, d20, d21 - vsub.f32 d2, d20, d21 - vadd.f32 d1, d22, d23 - vrev64.32 q13, q13 - vsub.f32 d3, d22, d23 - vsub.f32 d6, d6, d7 - vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 - vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 - vadd.f32 d7, d4, d5 - vsub.f32 d18, d2, d6 - vext.32 q13, q12, q12, #1 - vadd.f32 d2, d2, d6 - vsub.f32 d16, d0, d7 - vadd.f32 d5, d25, d24 - vsub.f32 d4, d26, d27 - vadd.f32 d0, d0, d7 - vsub.f32 d17, d1, d5 - vsub.f32 d19, d3, d4 - vadd.f32 d3, d3, d4 - vadd.f32 d1, d1, d5 - - vst1.32 {d16-d19}, [r1,:128] - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft16_neon - movrel r1, mppm - vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} - pld [r0, #32] - vld1.32 {d2-d3}, [r1,:128] - vext.32 q13, q9, q9, #1 - vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} - vadd.f32 d4, d16, d17 - vsub.f32 d5, d16, d17 - vadd.f32 d18, d18, d19 - vsub.f32 d19, d26, d27 - - vadd.f32 d20, d22, d23 - vsub.f32 d22, d22, d23 - vsub.f32 d23, d24, d25 - vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} - vadd.f32 d21, d24, d25 - vmul.f32 d24, d22, d2 - vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} - vmul.f32 d25, d23, d3 - vuzp.32 d16, d17 @ {r0,r1,i0,i1} - vmul.f32 q1, q11, d2[1] - vuzp.32 d18, d19 @ {r2,r3,i2,i3} - vrev64.32 q12, q12 - vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} - vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} - vzip.32 q10, q11 - vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - sub r0, r0, #96 - vext.32 q13, q13, q13, #1 - vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vext.32 q15, q15, q15, #1 - vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} - vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} - vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} - vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} - vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} - vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} - movrelx r2, X(ff_cos_16) - vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} - vrev64.32 d1, d1 - vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} - vrev64.32 d3, d3 - movrel r3, pmmp - vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} - vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} - vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} - vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} - vld1.32 {d4-d5}, [r2,:64] - vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} - vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} - vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} - vld1.32 {d6-d7}, [r3,:128] - vrev64.32 q1, q14 - vmul.f32 q14, q14, d4[1] - vmul.f32 q1, q1, q3 - vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} - vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} - vzip.32 q12, q14 - vadd.f32 d0, d28, d24 - vadd.f32 d1, d25, d29 - vsub.f32 d2, d25, d29 - vsub.f32 d3, d28, d24 - vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} - mov r1, #32 - vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} - vrev64.32 q0, q13 - vmul.f32 q13, q13, d5[0] - vrev64.32 q1, q15 - vmul.f32 q15, q15, d5[1] - vst2.32 {d16-d17},[r0,:128], r1 - vmul.f32 q0, q0, q3 - vst2.32 {d20-d21},[r0,:128], r1 - vmul.f32 q1, q1, q3 - vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} - vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} - vst2.32 {d24-d25},[r0,:128], r1 - vst2.32 {d28-d29},[r0,:128] - vzip.32 q13, q15 - sub r0, r0, #80 - vadd.f32 d0, d30, d26 - vadd.f32 d1, d27, d31 - vsub.f32 d2, d27, d31 - vsub.f32 d3, d30, d26 - vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} - vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} - vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} - vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} - vst2.32 {d18-d19},[r0,:128], r1 - vst2.32 {d22-d23},[r0,:128], r1 - vst2.32 {d26-d27},[r0,:128], r1 - vst2.32 {d30-d31},[r0,:128] - bx lr -endfunc - -function fft_pass_neon - push {r4-r6,lr} - mov r6, r2 @ n - lsl r5, r2, #3 @ 2 * n * sizeof FFTSample - lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex - lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex - add r3, r2, r4 - add r4, r4, r0 @ &z[o1] - add r2, r2, r0 @ &z[o2] - add r3, r3, r0 @ &z[o3] - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - movrel r12, pmmp - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - add r5, r5, r1 @ wim - vld1.32 {d6-d7}, [r12,:128] @ pmmp - vswp d21, d22 - vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]} - sub r5, r5, #4 @ wim-- - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vmul.f32 q1, q1, q3 - vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - sub r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} - vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} - sub r5, r5, #8 @ wim -= 2 -1: - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - vswp d21, d22 - vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} - vrev64.32 q0, q10 - vmul.f32 q10, q10, d4[0] - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} - vmul.f32 q0, q0, q3 - sub r5, r5, #8 @ wim -= 2 - vmul.f32 q1, q1, q3 - vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - subs r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} - vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]} - bne 1b - - pop {r4-r6,pc} -endfunc - -.macro def_fft n, n2, n4 - .align 6 -function fft\n\()_neon - push {r4, lr} - mov r4, r0 - bl fft\n2\()_neon - add r0, r4, #\n4*2*8 - bl fft\n4\()_neon - add r0, r4, #\n4*3*8 - bl fft\n4\()_neon - mov r0, r4 - pop {r4, lr} - movrelx r1, X(ff_cos_\n) - mov r2, #\n4/2 - b fft_pass_neon -endfunc -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 - -function ff_fft_calc_neon, export=1 - ldr r2, [r0] - sub r2, r2, #2 - movrel r3, fft_tab_neon - ldr r3, [r3, r2, lsl #2] - mov r0, r1 - bx r3 -endfunc - -function ff_fft_permute_neon, export=1 - push {r4,lr} - mov r12, #1 - ldr r2, [r0] @ nbits - ldr r3, [r0, #12] @ tmp_buf - ldr r0, [r0, #8] @ revtab - lsl r12, r12, r2 - mov r2, r12 -1: - vld1.32 {d0-d1}, [r1,:128]! - ldr r4, [r0], #4 - uxth lr, r4 - uxth r4, r4, ror #16 - add lr, r3, lr, lsl #3 - add r4, r3, r4, lsl #3 - vst1.32 {d0}, [lr,:64] - vst1.32 {d1}, [r4,:64] - subs r12, r12, #2 - bgt 1b - - sub r1, r1, r2, lsl #3 -1: - vld1.32 {d0-d3}, [r3,:128]! - vst1.32 {d0-d3}, [r1,:128]! - subs r2, r2, #4 - bgt 1b - - pop {r4,pc} -endfunc - -const fft_tab_neon, relocate=1 - .word fft4_neon - .word fft8_neon - .word fft16_neon - .word fft32_neon - .word fft64_neon - .word fft128_neon - .word fft256_neon - .word fft512_neon - .word fft1024_neon - .word fft2048_neon - .word fft4096_neon - .word fft8192_neon - .word fft16384_neon - .word fft32768_neon - .word fft65536_neon -endconst - -const pmmp, align=4 - .float +1.0, -1.0, -1.0, +1.0 -endconst - -const mppm, align=4 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -endconst diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S deleted file mode 100644 index ac601325f2..0000000000 --- a/libavcodec/arm/fft_vfp.S +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2013 RISC OS Open Ltd - * Author: Ben Avison - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: -@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and -@ all single-precision VFP registers may be corrupted on exit. The a2 -@ register may not be clobbered in these functions, as it holds the -@ stored original FPSCR. - -function ff_fft_calc_vfp, export=1 - ldr ip, [a1, #0] @ nbits - mov a1, a2 - movrel a2, (fft_tab_vfp - 8) - ldr pc, [a2, ip, lsl #2] -endfunc -const fft_tab_vfp, relocate=1 - .word fft4_vfp - .word fft8_vfp - .word X(ff_fft16_vfp) @ this one alone is exported - .word fft32_vfp - .word fft64_vfp - .word fft128_vfp - .word fft256_vfp - .word fft512_vfp - .word fft1024_vfp - .word fft2048_vfp - .word fft4096_vfp - .word fft8192_vfp - .word fft16384_vfp - .word fft32768_vfp - .word fft65536_vfp -endconst - -function fft4_vfp - vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] - vldr d4, [a1, #1*2*4] @ s8,s9 = z[1] - vldr d1, [a1, #2*2*4] @ s2,s3 = z[2] - vldr d5, [a1, #3*2*4] @ s10,s11 = z[3] - @ stall - vadd.f s12, s0, s8 @ i0 - vadd.f s13, s1, s9 @ i1 - vadd.f s14, s2, s10 @ i2 - vadd.f s15, s3, s11 @ i3 - vsub.f s8, s0, s8 @ i4 - vsub.f s9, s1, s9 @ i5 - vsub.f s10, s2, s10 @ i6 - vsub.f s11, s3, s11 @ i7 - @ stall - @ stall - vadd.f s0, s12, s14 @ z[0].re - vsub.f s4, s12, s14 @ z[2].re - vadd.f s1, s13, s15 @ z[0].im - vsub.f s5, s13, s15 @ z[2].im - vadd.f s7, s9, s10 @ z[3].im - vsub.f s3, s9, s10 @ z[1].im - vadd.f s2, s8, s11 @ z[1].re - vsub.f s6, s8, s11 @ z[3].re - @ stall - @ stall - vstr d0, [a1, #0*2*4] - vstr d2, [a1, #2*2*4] - @ stall - @ stall - vstr d1, [a1, #1*2*4] - vstr d3, [a1, #3*2*4] - - bx lr -endfunc - -.macro macro_fft8_head - @ FFT4 - vldr d4, [a1, #0 * 2*4] - vldr d6, [a1, #1 * 2*4] - vldr d5, [a1, #2 * 2*4] - vldr d7, [a1, #3 * 2*4] - @ BF - vldr d12, [a1, #4 * 2*4] - vadd.f s16, s8, s12 @ vector op - vldr d14, [a1, #5 * 2*4] - vldr d13, [a1, #6 * 2*4] - vldr d15, [a1, #7 * 2*4] - vsub.f s20, s8, s12 @ vector op - vadd.f s0, s16, s18 - vsub.f s2, s16, s18 - vadd.f s1, s17, s19 - vsub.f s3, s17, s19 - vadd.f s7, s21, s22 - vsub.f s5, s21, s22 - vadd.f s4, s20, s23 - vsub.f s6, s20, s23 - vsub.f s20, s24, s28 @ vector op - vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory - vstr d1, [a1, #1 * 2*4] - vldr s0, cos1pi4 - vadd.f s16, s24, s28 @ vector op - vstr d2, [a1, #2 * 2*4] - vstr d3, [a1, #3 * 2*4] - vldr d12, [a1, #0 * 2*4] - @ TRANSFORM - vmul.f s20, s20, s0 @ vector x scalar op - vldr d13, [a1, #1 * 2*4] - vldr d14, [a1, #2 * 2*4] - vldr d15, [a1, #3 * 2*4] - @ BUTTERFLIES - vadd.f s0, s18, s16 - vadd.f s1, s17, s19 - vsub.f s2, s17, s19 - vsub.f s3, s18, s16 - vadd.f s4, s21, s20 - vsub.f s5, s21, s20 - vadd.f s6, s22, s23 - vsub.f s7, s22, s23 - vadd.f s8, s0, s24 @ vector op - vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory - vstr d1, [a1, #1 * 2*4] - vldr d6, [a1, #0 * 2*4] - vldr d7, [a1, #1 * 2*4] - vadd.f s1, s5, s6 - vadd.f s0, s7, s4 - vsub.f s2, s5, s6 - vsub.f s3, s7, s4 - vsub.f s12, s24, s12 @ vector op - vsub.f s5, s29, s1 - vsub.f s4, s28, s0 - vsub.f s6, s30, s2 - vsub.f s7, s31, s3 - vadd.f s16, s0, s28 @ vector op - vstr d6, [a1, #4 * 2*4] - vstr d7, [a1, #6 * 2*4] - vstr d4, [a1, #0 * 2*4] - vstr d5, [a1, #2 * 2*4] - vstr d2, [a1, #5 * 2*4] - vstr d3, [a1, #7 * 2*4] -.endm - -.macro macro_fft8_tail - vstr d8, [a1, #1 * 2*4] - vstr d9, [a1, #3 * 2*4] -.endm - -function .Lfft8_internal_vfp - macro_fft8_head - macro_fft8_tail - bx lr -endfunc - -function fft8_vfp - ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 - fmrx a2, FPSCR - fmxr FPSCR, a3 - vpush {s16-s31} - mov ip, lr - bl .Lfft8_internal_vfp - vpop {s16-s31} - fmxr FPSCR, a2 - bx ip -endfunc - -.align 3 -cos1pi4: @ cos(1*pi/4) = sqrt(2) - .float 0.707106769084930419921875 -cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 - .float 0.92387950420379638671875 -cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 - .float 0.3826834261417388916015625 - -function .Lfft16_internal_vfp - macro_fft8_head - @ FFT4(z+8) - vldr d10, [a1, #8 * 2*4] - vldr d12, [a1, #9 * 2*4] - vldr d11, [a1, #10 * 2*4] - vldr d13, [a1, #11 * 2*4] - macro_fft8_tail - vadd.f s16, s20, s24 @ vector op - @ FFT4(z+12) - vldr d4, [a1, #12 * 2*4] - vldr d6, [a1, #13 * 2*4] - vldr d5, [a1, #14 * 2*4] - vsub.f s20, s20, s24 @ vector op - vldr d7, [a1, #15 * 2*4] - vadd.f s0, s16, s18 - vsub.f s4, s16, s18 - vadd.f s1, s17, s19 - vsub.f s5, s17, s19 - vadd.f s7, s21, s22 - vsub.f s3, s21, s22 - vadd.f s2, s20, s23 - vsub.f s6, s20, s23 - vadd.f s16, s8, s12 @ vector op - vstr d0, [a1, #8 * 2*4] - vstr d2, [a1, #10 * 2*4] - vstr d1, [a1, #9 * 2*4] - vsub.f s20, s8, s12 - vstr d3, [a1, #11 * 2*4] - @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4) - vldr d12, [a1, #10 * 2*4] - vadd.f s0, s16, s18 - vadd.f s1, s17, s19 - vsub.f s6, s16, s18 - vsub.f s7, s17, s19 - vsub.f s3, s21, s22 - vadd.f s2, s20, s23 - vadd.f s5, s21, s22 - vsub.f s4, s20, s23 - vstr d0, [a1, #12 * 2*4] - vmov s0, s6 - @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8) - vldr d6, [a1, #9 * 2*4] - vstr d1, [a1, #13 * 2*4] - vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 - vstr d2, [a1, #15 * 2*4] - vldr d7, [a1, #13 * 2*4] - vadd.f s4, s25, s24 - vsub.f s5, s25, s24 - vsub.f s6, s0, s7 - vadd.f s7, s0, s7 - vmul.f s20, s12, s3 @ vector op - @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8) - vldr d4, [a1, #11 * 2*4] - vldr d5, [a1, #15 * 2*4] - vldr s1, cos3pi8 - vmul.f s24, s4, s2 @ vector * scalar op - vmul.f s28, s12, s1 @ vector * scalar op - vmul.f s12, s8, s1 @ vector * scalar op - vadd.f s4, s20, s29 - vsub.f s5, s21, s28 - vsub.f s6, s22, s31 - vadd.f s7, s23, s30 - vmul.f s8, s8, s3 @ vector * scalar op - vldr d8, [a1, #1 * 2*4] - vldr d9, [a1, #5 * 2*4] - vldr d10, [a1, #3 * 2*4] - vldr d11, [a1, #7 * 2*4] - vldr d14, [a1, #2 * 2*4] - vadd.f s0, s6, s4 - vadd.f s1, s5, s7 - vsub.f s2, s5, s7 - vsub.f s3, s6, s4 - vadd.f s4, s12, s9 - vsub.f s5, s13, s8 - vsub.f s6, s14, s11 - vadd.f s7, s15, s10 - vadd.f s12, s0, s16 @ vector op - vstr d0, [a1, #1 * 2*4] - vstr d1, [a1, #5 * 2*4] - vldr d4, [a1, #1 * 2*4] - vldr d5, [a1, #5 * 2*4] - vadd.f s0, s6, s4 - vadd.f s1, s5, s7 - vsub.f s2, s5, s7 - vsub.f s3, s6, s4 - vsub.f s8, s16, s8 @ vector op - vstr d6, [a1, #1 * 2*4] - vstr d7, [a1, #5 * 2*4] - vldr d15, [a1, #6 * 2*4] - vsub.f s4, s20, s0 - vsub.f s5, s21, s1 - vsub.f s6, s22, s2 - vsub.f s7, s23, s3 - vadd.f s20, s0, s20 @ vector op - vstr d4, [a1, #9 * 2*4] - @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]) - vldr d6, [a1, #8 * 2*4] - vstr d5, [a1, #13 * 2*4] - vldr d7, [a1, #12 * 2*4] - vstr d2, [a1, #11 * 2*4] - vldr d8, [a1, #0 * 2*4] - vstr d3, [a1, #15 * 2*4] - vldr d9, [a1, #4 * 2*4] - vadd.f s0, s26, s24 - vadd.f s1, s25, s27 - vsub.f s2, s25, s27 - vsub.f s3, s26, s24 - vadd.f s4, s14, s12 - vadd.f s5, s13, s15 - vsub.f s6, s13, s15 - vsub.f s7, s14, s12 - vadd.f s8, s0, s28 @ vector op - vstr d0, [a1, #3 * 2*4] - vstr d1, [a1, #7 * 2*4] - vldr d6, [a1, #3 * 2*4] - vldr d7, [a1, #7 * 2*4] - vsub.f s0, s16, s4 - vsub.f s1, s17, s5 - vsub.f s2, s18, s6 - vsub.f s3, s19, s7 - vsub.f s12, s28, s12 @ vector op - vadd.f s16, s4, s16 @ vector op - vstr d10, [a1, #3 * 2*4] - vstr d11, [a1, #7 * 2*4] - vstr d4, [a1, #2 * 2*4] - vstr d5, [a1, #6 * 2*4] - vstr d0, [a1, #8 * 2*4] - vstr d1, [a1, #12 * 2*4] - vstr d6, [a1, #10 * 2*4] - vstr d7, [a1, #14 * 2*4] - vstr d8, [a1, #0 * 2*4] - vstr d9, [a1, #4 * 2*4] - - bx lr -endfunc - -function ff_fft16_vfp, export=1 - ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 - fmrx a2, FPSCR - fmxr FPSCR, a3 - vpush {s16-s31} - mov ip, lr - bl .Lfft16_internal_vfp - vpop {s16-s31} - fmxr FPSCR, a2 - bx ip -endfunc - -.macro pass n, z0, z1, z2, z3 - add v6, v5, #4*2*\n - @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) - @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) - @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) - @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) - vldr d8, [\z2, #8*(o2+1)] @ s16,s17 - vldmdb v6!, {s2} - vldr d9, [\z3, #8*(o3+1)] @ s18,s19 - vldmia v5!, {s0,s1} @ s0 is unused - vldr s7, [\z2, #8*o2] @ t1 - vmul.f s20, s16, s2 @ vector * scalar - vldr s0, [\z3, #8*o3] @ t5 - vldr s6, [\z2, #8*o2+4] @ t2 - vldr s3, [\z3, #8*o3+4] @ t6 - vmul.f s16, s16, s1 @ vector * scalar - ldr a4, =\n-1 -1: add \z0, \z0, #8*2 - .if \n*4*2 >= 512 - add \z1, \z1, #8*2 - .endif - .if \n*4*2 >= 256 - add \z2, \z2, #8*2 - .endif - .if \n*4*2 >= 512 - add \z3, \z3, #8*2 - .endif - @ up to 2 stalls (VFP vector issuing / waiting for s0) - @ depending upon whether this is the first iteration and - @ how many add instructions are inserted above - vadd.f s4, s0, s7 @ t5 - vadd.f s5, s6, s3 @ t6 - vsub.f s6, s6, s3 @ t4 - vsub.f s7, s0, s7 @ t3 - vldr d6, [\z0, #8*0-8*2] @ s12,s13 - vadd.f s0, s16, s21 @ t1 - vldr d7, [\z1, #8*o1-8*2] @ s14,s15 - vsub.f s1, s18, s23 @ t5 - vadd.f s8, s4, s12 @ vector + vector - @ stall (VFP vector issuing) - @ stall (VFP vector issuing) - @ stall (VFP vector issuing) - vsub.f s4, s12, s4 - vsub.f s5, s13, s5 - vsub.f s6, s14, s6 - vsub.f s7, s15, s7 - vsub.f s2, s17, s20 @ t2 - vadd.f s3, s19, s22 @ t6 - vstr d4, [\z0, #8*0-8*2] @ s8,s9 - vstr d5, [\z1, #8*o1-8*2] @ s10,s11 - @ stall (waiting for s5) - vstr d2, [\z2, #8*o2-8*2] @ s4,s5 - vadd.f s4, s1, s0 @ t5 - vstr d3, [\z3, #8*o3-8*2] @ s6,s7 - vsub.f s7, s1, s0 @ t3 - vadd.f s5, s2, s3 @ t6 - vsub.f s6, s2, s3 @ t4 - vldr d6, [\z0, #8*1-8*2] @ s12,s13 - vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 - vldr d4, [\z2, #8*o2] @ s8,s9 - vldmdb v6!, {s2,s3} - vldr d5, [\z3, #8*o3] @ s10,s11 - vadd.f s20, s4, s12 @ vector + vector - vldmia v5!, {s0,s1} - vldr d8, [\z2, #8*(o2+1)] @ s16,s17 - @ stall (VFP vector issuing) - vsub.f s4, s12, s4 - vsub.f s5, s13, s5 - vsub.f s6, s14, s6 - vsub.f s7, s15, s7 - vmul.f s12, s8, s3 @ vector * scalar - vstr d10, [\z0, #8*1-8*2] @ s20,s21 - vldr d9, [\z3, #8*(o3+1)] @ s18,s19 - vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 - vmul.f s8, s8, s0 @ vector * scalar - vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 - @ stall (waiting for s7) - vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 - vmul.f s20, s16, s2 @ vector * scalar - @ stall (VFP vector issuing) - @ stall (VFP vector issuing) - @ stall (VFP vector issuing) - vadd.f s7, s8, s13 @ t1 - vsub.f s6, s9, s12 @ t2 - vsub.f s0, s10, s15 @ t5 - vadd.f s3, s11, s14 @ t6 - vmul.f s16, s16, s1 @ vector * scalar - subs a4, a4, #1 - bne 1b - @ What remains is identical to the first two indentations of - @ the above, but without the increment of z - vadd.f s4, s0, s7 @ t5 - vadd.f s5, s6, s3 @ t6 - vsub.f s6, s6, s3 @ t4 - vsub.f s7, s0, s7 @ t3 - vldr d6, [\z0, #8*0] @ s12,s13 - vadd.f s0, s16, s21 @ t1 - vldr d7, [\z1, #8*o1] @ s14,s15 - vsub.f s1, s18, s23 @ t5 - vadd.f s8, s4, s12 @ vector + vector - vsub.f s4, s12, s4 - vsub.f s5, s13, s5 - vsub.f s6, s14, s6 - vsub.f s7, s15, s7 - vsub.f s2, s17, s20 @ t2 - vadd.f s3, s19, s22 @ t6 - vstr d4, [\z0, #8*0] @ s8,s9 - vstr d5, [\z1, #8*o1] @ s10,s11 - vstr d2, [\z2, #8*o2] @ s4,s5 - vadd.f s4, s1, s0 @ t5 - vstr d3, [\z3, #8*o3] @ s6,s7 - vsub.f s7, s1, s0 @ t3 - vadd.f s5, s2, s3 @ t6 - vsub.f s6, s2, s3 @ t4 - vldr d6, [\z0, #8*1] @ s12,s13 - vldr d7, [\z1, #8*(o1+1)] @ s14,s15 - vadd.f s20, s4, s12 @ vector + vector - vsub.f s4, s12, s4 - vsub.f s5, s13, s5 - vsub.f s6, s14, s6 - vsub.f s7, s15, s7 - vstr d10, [\z0, #8*1] @ s20,s21 - vstr d11, [\z1, #8*(o1+1)] @ s22,s23 - vstr d2, [\z2, #8*(o2+1)] @ s4,s5 - vstr d3, [\z3, #8*(o3+1)] @ s6,s7 -.endm - -.macro def_fft n, n2, n4 -function .Lfft\n\()_internal_vfp - .if \n >= 512 - push {v1-v6,lr} - .elseif \n >= 256 - push {v1-v2,v5-v6,lr} - .else - push {v1,v5-v6,lr} - .endif - mov v1, a1 - bl .Lfft\n2\()_internal_vfp - add a1, v1, #8*(\n/4)*2 - bl .Lfft\n4\()_internal_vfp - movrelx v5, X(ff_cos_\n), a1 - add a1, v1, #8*(\n/4)*3 - bl .Lfft\n4\()_internal_vfp - .if \n >= 512 - .set o1, 0*(\n/4/2) - .set o2, 0*(\n/4/2) - .set o3, 0*(\n/4/2) - add v2, v1, #8*2*(\n/4/2) - add v3, v1, #8*4*(\n/4/2) - add v4, v1, #8*6*(\n/4/2) - pass (\n/4/2), v1, v2, v3, v4 - pop {v1-v6,pc} - .elseif \n >= 256 - .set o1, 2*(\n/4/2) - .set o2, 0*(\n/4/2) - .set o3, 2*(\n/4/2) - add v2, v1, #8*4*(\n/4/2) - pass (\n/4/2), v1, v1, v2, v2 - pop {v1-v2,v5-v6,pc} - .else - .set o1, 2*(\n/4/2) - .set o2, 4*(\n/4/2) - .set o3, 6*(\n/4/2) - pass (\n/4/2), v1, v1, v1, v1 - pop {v1,v5-v6,pc} - .endif -endfunc - -function fft\n\()_vfp - ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ - fmrx a2, FPSCR - fmxr FPSCR, a3 - vpush {s16-s31} - mov ip, lr - bl .Lfft\n\()_internal_vfp - vpop {s16-s31} - fmxr FPSCR, a2 - bx ip -endfunc - -.ltorg -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S deleted file mode 100644 index a6952fa571..0000000000 --- a/libavcodec/arm/mdct_neon.S +++ /dev/null @@ -1,301 +0,0 @@ -/* - * ARM NEON optimised MDCT - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define ff_fft_calc_neon X(ff_fft_calc_neon) - -function ff_imdct_half_neon, export=1 - push {r4-r8,lr} - - mov r12, #1 - ldr lr, [r0, #20] @ mdct_bits - ldr r4, [r0, #24] @ tcos - ldr r3, [r0, #8] @ revtab - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #2 @ n4 = n >> 2 - add r7, r2, r12, lsl #1 - mov r12, #-16 - sub r7, r7, #16 - - vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 - vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 -1: - subs lr, lr, #2 - ldr r6, [r3], #4 - vmul.f32 d4, d0, d3 - vmul.f32 d5, d17, d3 - vsub.f32 d4, d6, d4 - vadd.f32 d5, d5, d7 - uxth r8, r6, ror #16 - uxth r6, r6 - add r8, r1, r8, lsl #3 - add r6, r1, r6, lsl #3 - beq 1f - vld2.32 {d16-d17},[r7,:128],r12 - vld2.32 {d0-d1}, [r2,:128]! - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - b 1b -1: - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #20] @ mdct_bits - ldr r4, [r4, #24] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 - vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 - vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 - vmul.f32 d4, d1, d18 - vmul.f32 d5, d21, d19 - vmul.f32 d6, d20, d19 - vmul.f32 d22, d1, d16 - vmul.f32 d23, d21, d17 - vmul.f32 d24, d0, d16 - vmul.f32 d25, d20, d17 - vadd.f32 d7, d7, d22 - vadd.f32 d6, d6, d23 - vsub.f32 d4, d4, d24 - vsub.f32 d5, d5, d25 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! - b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r8,pc} -endfunc - -function ff_imdct_calc_neon, export=1 - push {r4-r6,lr} - - ldr r3, [r0, #20] - mov r4, #1 - mov r5, r1 - lsl r4, r4, r3 - add r1, r1, r4 - - bl X(ff_imdct_half_neon) - - add r0, r5, r4, lsl #2 - add r1, r5, r4, lsl #1 - sub r0, r0, #8 - sub r2, r1, #16 - mov r3, #-16 - mov r6, #-8 - vmov.i32 d30, #1<<31 -1: - vld1.32 {d0-d1}, [r2,:128], r3 - pld [r0, #-16] - vrev64.32 q0, q0 - vld1.32 {d2-d3}, [r1,:128]! - veor d4, d1, d30 - pld [r2, #-16] - vrev64.32 q1, q1 - veor d5, d0, d30 - vst1.32 {d2}, [r0,:64], r6 - vst1.32 {d3}, [r0,:64], r6 - vst1.32 {d4-d5}, [r5,:128]! - subs r4, r4, #16 - bgt 1b - - pop {r4-r6,pc} -endfunc - -function ff_mdct_calc_neon, export=1 - push {r4-r10,lr} - - mov r12, #1 - ldr lr, [r0, #20] @ mdct_bits - ldr r4, [r0, #24] @ tcos - ldr r3, [r0, #8] @ revtab - lsl lr, r12, lr @ n = 1 << nbits - add r7, r2, lr @ in4u - sub r9, r7, #16 @ in4d - add r2, r7, lr, lsl #1 @ in3u - add r8, r9, lr, lsl #1 @ in3d - add r5, r4, lr, lsl #1 - sub r5, r5, #16 - sub r3, r3, #4 - mov r12, #-16 - - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I -1: - vmul.f32 d7, d0, d21 @ I*s -A ldr r10, [r3, lr, lsr #1] -T lsr r10, lr, #1 -T ldr r10, [r3, r10] - vmul.f32 d6, d1, d20 @ -R*c - ldr r6, [r3, #4]! - vmul.f32 d4, d1, d21 @ -R*s - vmul.f32 d5, d0, d20 @ I*c - vmul.f32 d24, d16, d30 @ R*c - vmul.f32 d25, d17, d31 @ -I*s - vmul.f32 d22, d16, d31 @ R*s - vmul.f32 d23, d17, d30 @ I*c - subs lr, lr, #16 - vsub.f32 d6, d6, d7 @ -R*c-I*s - vadd.f32 d7, d4, d5 @ -R*s+I*c - vsub.f32 d24, d25, d24 @ I*s-R*c - vadd.f32 d25, d22, d23 @ R*s-I*c - beq 1f - mov r12, #-16 - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vneg.f32 d7, d7 @ R*s-I*c - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - b 1b -1: - vneg.f32 d7, d7 @ R*s-I*c - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #20] @ mdct_bits - ldr r4, [r4, #24] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 - vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 - vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 - vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 - vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 - vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 - vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 - vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 - vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 - vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 - vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 - vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 - vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 - vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 - vneg.f32 q2, q2 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! - b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r10,pc} -endfunc diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S deleted file mode 100644 index 43f6d14c0c..0000000000 --- a/libavcodec/arm/mdct_vfp.S +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright (c) 2013 RISC OS Open Ltd - * Author: Ben Avison - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -CONTEXT .req a1 -ORIGOUT .req a2 -IN .req a3 -OUT .req v1 -REVTAB .req v2 -TCOS .req v3 -TSIN .req v4 -OLDFPSCR .req v5 -J0 .req a2 -J1 .req a4 -J2 .req ip -J3 .req lr -REVTAB_HI .req v5 -IN_HI .req v6 -OUT_HI .req v6 -TCOS_HI .req sl -TSIN_HI .req fp - -.macro prerotation_innerloop - .set trig_lo, k - .set trig_hi, n4 - k - 2 - .set in_lo, trig_lo * 2 - .set in_hi, trig_hi * 2 - vldr d8, [TCOS, #trig_lo*4] @ s16,s17 - vldr d9, [TCOS, #trig_hi*4] @ s18,s19 - vldr s0, [IN, #in_hi*4 + 12] - vldr s1, [IN, #in_hi*4 + 4] - vldr s2, [IN, #in_lo*4 + 12] - vldr s3, [IN, #in_lo*4 + 4] - vmul.f s8, s0, s16 @ vector operation - vldr d10, [TSIN, #trig_lo*4] @ s20,s21 - vldr d11, [TSIN, #trig_hi*4] @ s22,s23 - vldr s4, [IN, #in_lo*4] - vldr s5, [IN, #in_lo*4 + 8] - vldr s6, [IN, #in_hi*4] - vldr s7, [IN, #in_hi*4 + 8] - ldr J0, [REVTAB, #trig_lo*2] - vmul.f s12, s0, s20 @ vector operation - ldr J2, [REVTAB, #trig_hi*2] - mov J1, J0, lsr #16 - and J0, J0, #255 @ halfword value will be < n4 - vmls.f s8, s4, s20 @ vector operation - mov J3, J2, lsr #16 - and J2, J2, #255 @ halfword value will be < n4 - add J0, OUT, J0, lsl #3 - vmla.f s12, s4, s16 @ vector operation - add J1, OUT, J1, lsl #3 - add J2, OUT, J2, lsl #3 - add J3, OUT, J3, lsl #3 - vstr s8, [J0] - vstr s9, [J1] - vstr s10, [J2] - vstr s11, [J3] - vstr s12, [J0, #4] - vstr s13, [J1, #4] - vstr s14, [J2, #4] - vstr s15, [J3, #4] - .set k, k + 2 -.endm - -.macro prerotation_innerloop_rolled - vldmia TCOS!, {s16,s17} - vldmdb TCOS_HI!, {s18,s19} - vldr s0, [IN_HI, #-4] - vldr s1, [IN_HI, #-12] - vldr s2, [IN, #12] - vldr s3, [IN, #4] - vmul.f s8, s0, s16 @ vector operation - vldmia TSIN!, {s20,s21} - vldmdb TSIN_HI!, {s22,s23} - vldr s4, [IN] - vldr s5, [IN, #8] - vldr s6, [IN_HI, #-16] - vldr s7, [IN_HI, #-8] - vmul.f s12, s0, s20 @ vector operation - add IN, IN, #16 - sub IN_HI, IN_HI, #16 - ldrh J0, [REVTAB], #2 - ldrh J1, [REVTAB], #2 - vmls.f s8, s4, s20 @ vector operation - ldrh J3, [REVTAB_HI, #-2]! - ldrh J2, [REVTAB_HI, #-2]! - add J0, OUT, J0, lsl #3 - vmla.f s12, s4, s16 @ vector operation - add J1, OUT, J1, lsl #3 - add J2, OUT, J2, lsl #3 - add J3, OUT, J3, lsl #3 - vstr s8, [J0] - vstr s9, [J1] - vstr s10, [J2] - vstr s11, [J3] - vstr s12, [J0, #4] - vstr s13, [J1, #4] - vstr s14, [J2, #4] - vstr s15, [J3, #4] -.endm - -.macro postrotation_innerloop tail, head - .set trig_lo_head, n8 - k - 2 - .set trig_hi_head, n8 + k - .set out_lo_head, trig_lo_head * 2 - .set out_hi_head, trig_hi_head * 2 - .set trig_lo_tail, n8 - (k - 2) - 2 - .set trig_hi_tail, n8 + (k - 2) - .set out_lo_tail, trig_lo_tail * 2 - .set out_hi_tail, trig_hi_tail * 2 - .if (k & 2) == 0 - TCOS_D0_HEAD .req d10 @ s20,s21 - TCOS_D1_HEAD .req d11 @ s22,s23 - TCOS_S0_TAIL .req s24 - .else - TCOS_D0_HEAD .req d12 @ s24,s25 - TCOS_D1_HEAD .req d13 @ s26,s27 - TCOS_S0_TAIL .req s20 - .endif - .ifnc "\tail","" - vmls.f s8, s0, TCOS_S0_TAIL @ vector operation - .endif - .ifnc "\head","" - vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17 - vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19 - vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4] - .endif - .ifnc "\tail","" - vmla.f s12, s4, TCOS_S0_TAIL @ vector operation - .endif - .ifnc "\head","" - vldr s0, [OUT, #out_lo_head*4] - vldr s1, [OUT, #out_lo_head*4 + 8] - vldr s2, [OUT, #out_hi_head*4] - vldr s3, [OUT, #out_hi_head*4 + 8] - vldr s4, [OUT, #out_lo_head*4 + 4] - vldr s5, [OUT, #out_lo_head*4 + 12] - vldr s6, [OUT, #out_hi_head*4 + 4] - vldr s7, [OUT, #out_hi_head*4 + 12] - .endif - .ifnc "\tail","" - vstr s8, [OUT, #out_lo_tail*4] - vstr s9, [OUT, #out_lo_tail*4 + 8] - vstr s10, [OUT, #out_hi_tail*4] - vstr s11, [OUT, #out_hi_tail*4 + 8] - .endif - .ifnc "\head","" - vmul.f s8, s4, s16 @ vector operation - .endif - .ifnc "\tail","" - vstr s12, [OUT, #out_hi_tail*4 + 12] - vstr s13, [OUT, #out_hi_tail*4 + 4] - vstr s14, [OUT, #out_lo_tail*4 + 12] - vstr s15, [OUT, #out_lo_tail*4 + 4] - .endif - .ifnc "\head","" - vmul.f s12, s0, s16 @ vector operation - vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4] - .endif - .unreq TCOS_D0_HEAD - .unreq TCOS_D1_HEAD - .unreq TCOS_S0_TAIL - .ifnc "\head","" - .set k, k + 2 - .endif -.endm - -.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail - .ifnc "\tail","" - vmls.f s8, s0, \tcos_s0_tail @ vector operation - .endif - .ifnc "\head","" - vldmia TSIN!, {s16,s17} - vldmdb TSIN_HI!, {s18,s19} - vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} - .endif - .ifnc "\tail","" - vmla.f s12, s4, \tcos_s0_tail @ vector operation - .endif - .ifnc "\head","" - vldr s0, [OUT, #+\out_offset_head+0] - vldr s1, [OUT, #+\out_offset_head+8] - vldr s2, [OUT_HI, #-\out_offset_head-16] - vldr s3, [OUT_HI, #-\out_offset_head-8] - vldr s4, [OUT, #+\out_offset_head+4] - vldr s5, [OUT, #+\out_offset_head+12] - vldr s6, [OUT_HI, #-\out_offset_head-12] - vldr s7, [OUT_HI, #-\out_offset_head-4] - .endif - .ifnc "\tail","" - vstr s8, [OUT, #+\out_offset_tail+0] - vstr s9, [OUT, #+\out_offset_tail+8] - vstr s10, [OUT_HI, #-\out_offset_tail-16] - vstr s11, [OUT_HI, #-\out_offset_tail-8] - .endif - .ifnc "\head","" - vmul.f s8, s4, s16 @ vector operation - .endif - .ifnc "\tail","" - vstr s12, [OUT_HI, #-\out_offset_tail-4] - vstr s13, [OUT_HI, #-\out_offset_tail-12] - vstr s14, [OUT, #+\out_offset_tail+12] - vstr s15, [OUT, #+\out_offset_tail+4] - .endif - .ifnc "\head","" - vmul.f s12, s0, s16 @ vector operation - vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head} - .endif -.endm - - -/* void ff_imdct_half_vfp(FFTContext *s, - * FFTSample *output, - * const FFTSample *input) - */ -function ff_imdct_half_vfp, export=1 - ldr ip, [CONTEXT, #5*4] @ mdct_bits - teq ip, #6 - bne 10f - - .set n, 1<<6 - .set n2, n/2 - .set n4, n/4 - .set n8, n/8 - - push {v1-v5,lr} - vpush {s16-s27} - fmrx OLDFPSCR, FPSCR - ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 - fmxr FPSCR, lr - mov OUT, ORIGOUT - ldr REVTAB, [CONTEXT, #2*4] - ldr TCOS, [CONTEXT, #6*4] - ldr TSIN, [CONTEXT, #7*4] - - .set k, 0 - .rept n8/2 - prerotation_innerloop - .endr - - fmxr FPSCR, OLDFPSCR - mov a1, OUT - bl X(ff_fft16_vfp) - ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 - fmxr FPSCR, lr - - .set k, 0 - postrotation_innerloop , head - .rept n8/2 - 1 - postrotation_innerloop tail, head - .endr - postrotation_innerloop tail - - fmxr FPSCR, OLDFPSCR - vpop {s16-s27} - pop {v1-v5,pc} - -10: - push {v1-v6,sl,fp,lr} - vpush {s16-s27} - fmrx OLDFPSCR, FPSCR - ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 - fmxr FPSCR, lr - mov lr, #1 - mov OUT, ORIGOUT - ldr REVTAB, [CONTEXT, #2*4] - ldr TCOS, [CONTEXT, #6*4] - ldr TSIN, [CONTEXT, #7*4] - mov lr, lr, lsl ip - - push {CONTEXT,OLDFPSCR} - add IN_HI, IN, lr, lsl #1 - add REVTAB_HI, REVTAB, lr, lsr #1 - add TCOS_HI, TCOS, lr - add TSIN_HI, TSIN, lr -0: prerotation_innerloop_rolled - teq IN, IN_HI - bne 0b - ldmia sp, {CONTEXT,OLDFPSCR} - - mov ORIGOUT, OUT - fmxr FPSCR, OLDFPSCR - ldr ip, [CONTEXT, #9*4] - blx ip @ s->fft_calc(s, output) - - pop {CONTEXT,OLDFPSCR} - ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 - ldr ip, [CONTEXT, #5*4] @ mdct_bits - fmxr FPSCR, lr - mov lr, #1 - mov lr, lr, lsl ip - sub TCOS, TCOS, lr, lsr #1 - sub TSIN, TSIN, lr, lsr #1 - add OUT_HI, OUT, lr, lsl #1 - add TCOS_HI, TCOS, lr - add TSIN_HI, TSIN, lr - postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0 - b 1f -0: add OUT, OUT, #32 - sub OUT_HI, OUT_HI, #32 - postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16 -1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0 - teq TSIN, TSIN_HI - bne 0b - postrotation_innerloop_rolled tail,,,,,, s24,, 16 - - fmxr FPSCR, OLDFPSCR - vpop {s16-s27} - pop {v1-v6,sl,fp,pc} -endfunc - - .unreq CONTEXT - .unreq ORIGOUT - .unreq IN - .unreq OUT - .unreq REVTAB - .unreq TCOS - .unreq TSIN - .unreq OLDFPSCR - .unreq J0 - .unreq J1 - .unreq J2 - .unreq J3 - .unreq REVTAB_HI - .unreq IN_HI - .unreq OUT_HI - .unreq TCOS_HI - .unreq TSIN_HI diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c deleted file mode 100644 index 1c5d8beb61..0000000000 --- a/libavcodec/arm/rdft_init_arm.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" - -#include "libavcodec/rdft.h" - -void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); - -av_cold void ff_rdft_init_arm(RDFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - s->rdft_calc = ff_rdft_calc_neon; -} diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S deleted file mode 100644 index eabb92b4bd..0000000000 --- a/libavcodec/arm/rdft_neon.S +++ /dev/null @@ -1,155 +0,0 @@ -/* - * ARM NEON optimised RDFT - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_rdft_calc_neon, export=1 - push {r4-r8,lr} - - ldr r6, [r0, #4] @ inverse - mov r4, r0 - mov r5, r1 - - lsls r6, r6, #31 - bne 1f - add r0, r4, #24 - bl X(ff_fft_permute_neon) - add r0, r4, #24 - mov r1, r5 - bl X(ff_fft_calc_neon) -1: - ldr r12, [r4, #0] @ nbits - mov r2, #1 - ldr r8, [r4, #20] @ negative_sin - lsl r12, r2, r12 - add r0, r5, #8 - lsl r8, r8, #31 - add r1, r5, r12, lsl #2 - lsr r12, r12, #2 - vdup.32 d26, r8 - ldr r2, [r4, #12] @ tcos - sub r12, r12, #2 - ldr r3, [r4, #16] @ tsin - mov r7, r0 - sub r1, r1, #8 - mov lr, r1 - mov r8, #-8 - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - vld1.32 {d5}, [r3,:64]! @ tsin[i] - vmov.f32 d18, #0.5 @ k1 - vdup.32 d19, r6 - veor d5, d26, d5 - pld [r0, #32] - veor d19, d18, d19 @ k2 - vmov.i32 d16, #0 - vmov.i32 d17, #1<<31 - pld [r1, #-32] - vtrn.32 d16, d17 - pld [r2, #32] - vrev64.32 d16, d16 @ d16=1,0 d17=0,1 - pld [r3, #32] -2: - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vld1.32 {d24}, [r0,:64]! @ d1[0,1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] - pld [r0, #32] - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - pld [r1, #-32] - vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] - vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - veor d2, d3, d16 @ -od.re, od.im - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - veor d7, d23, d16 @ -od.im, od.re - vld1.32 {d5}, [r3,:64]! @ tsin[i] - veor d24, d22, d17 @ ev.re,-ev.im - vrev64.32 d3, d23 @ od.re, od.im - veor d5, d26, d5 - pld [r2, #32] - veor d2, d3, d16 @ -od.re, od.im - pld [r3, #32] - vmla.f32 d22, d3, d4[0] - vmla.f32 d22, d7, d5[0] - vmla.f32 d24, d2, d4[0] - vmla.f32 d24, d23, d5[0] - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vst1.32 {d20}, [r7,:64]! - vst1.32 {d6}, [lr,:64], r8 - vst1.32 {d22}, [r7,:64]! - vst1.32 {d24}, [lr,:64], r8 - subs r12, r12, #2 - bgt 2b - - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - ldr r2, [r4, #8] @ sign_convention - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - add r0, r0, #4 - bfc r2, #0, #31 - vld1.32 {d0[0]}, [r0,:32] - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - vld1.32 {d22}, [r5,:64] - vdup.32 d1, r2 - vmov d23, d22 - veor d2, d3, d16 @ -od.re, od.im - vtrn.32 d22, d23 - veor d0, d0, d1 - veor d23, d23, d17 - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vadd.f32 d22, d22, d23 - vst1.32 {d20}, [r7,:64] - vst1.32 {d6}, [lr,:64] - vst1.32 {d0[0]}, [r0,:32] - vst1.32 {d22}, [r5,:64] - - cmp r6, #0 - it eq - popeq {r4-r8,pc} - - vmul.f32 d22, d22, d18 - vst1.32 {d22}, [r5,:64] - add r0, r4, #24 - mov r1, r5 - bl X(ff_fft_permute_neon) - add r0, r4, #24 - mov r1, r5 - pop {r4-r8,lr} - b X(ff_fft_calc_neon) -endfunc diff --git a/libavcodec/arm/synth_filter_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c index 858c117d39..10689b62e6 100644 --- a/libavcodec/arm/synth_filter_init_arm.c +++ b/libavcodec/arm/synth_filter_init_arm.c @@ -23,7 +23,6 @@ #include "libavutil/arm/cpu.h" #include "libavutil/attributes.h" #include "libavutil/internal.h" -#include "libavcodec/fft.h" #include "libavcodec/synth_filter.h" void ff_synth_filter_float_vfp(AVTXContext *imdct, diff --git a/libavcodec/cos_tablegen.c b/libavcodec/cos_tablegen.c deleted file mode 100644 index 7206aad5dd..0000000000 --- a/libavcodec/cos_tablegen.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Generate a header file for hardcoded ff_cos_* tables - * - * Copyright (c) 2009 Reimar Döffinger - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include - -#include "libavutil/mathematics.h" - -#define BITS 17 -#define FLOATFMT "%.18e" -#define FIXEDFMT "%6d" - -static int clip_f15(int v) -{ - return v < -32767 ? -32767 : - v > 32767 ? 32767 : - v; -} - -static void printval(double val, int fixed) -{ - if (fixed) { - /* lrint() isn't always available, so round and cast manually. */ - double new_val = val * (double) (1 << 15); - - new_val = new_val >= 0 ? floor(new_val + 0.5) : ceil(new_val - 0.5); - - printf(" "FIXEDFMT",", clip_f15((long int) new_val)); - } else { - printf(" "FLOATFMT",", val); - } -} - -int main(int argc, char *argv[]) -{ - int i, j; - int do_sin = argc > 1 && !strcmp(argv[1], "sin"); - int fixed = argc > 1 && strstr(argv[1], "fixed"); - double (*func)(double) = do_sin ? sin : cos; - - printf("/* This file was automatically generated. */\n"); - printf("#define FFT_FLOAT %d\n", !fixed); - printf("#include \"libavcodec/%s\"\n", do_sin ? "rdft.h" : "fft.h"); - for (i = 4; i <= BITS; i++) { - int m = 1 << i; - double freq = 2*M_PI/m; - printf("%s(%i) = {\n ", do_sin ? "SINTABLE" : "COSTABLE", m); - for (j = 0; j < m/2 - 1; j++) { - int idx = j > m/4 ? m/2 - j : j; - if (do_sin && j >= m/4) - idx = m/4 - j; - printval(func(idx*freq), fixed); - if ((j & 3) == 3) - printf("\n "); - } - printval(func(do_sin ? -(m/4 - 1)*freq : freq), fixed); - printf("\n};\n"); - } - return 0; -} diff --git a/libavcodec/dct.c b/libavcodec/dct.c deleted file mode 100644 index eeb4d154e0..0000000000 --- a/libavcodec/dct.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * (I)DCT Transforms - * Copyright (c) 2009 Peter Ross - * Copyright (c) 2010 Alex Converse - * Copyright (c) 2010 Vitor Sessak - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * (Inverse) Discrete Cosine Transforms. These are also known as the - * type II and type III DCTs respectively. - */ - -#include -#include - -#include "libavutil/error.h" -#include "libavutil/mathematics.h" -#include "libavutil/mem.h" -#include "dct.h" -#include "dct32.h" - -/* sin((M_PI * x / (2 * n)) */ -#define SIN(s, n, x) (s->costab[(n) - (x)]) - -/* cos((M_PI * x / (2 * n)) */ -#define COS(s, n, x) (s->costab[x]) - -static void dst_calc_I_c(DCTContext *ctx, FFTSample *data) -{ - int n = 1 << ctx->nbits; - int i; - - data[0] = 0; - for (i = 1; i < n / 2; i++) { - float tmp1 = data[i ]; - float tmp2 = data[n - i]; - float s = SIN(ctx, n, 2 * i); - - s *= tmp1 + tmp2; - tmp1 = (tmp1 - tmp2) * 0.5f; - data[i] = s + tmp1; - data[n - i] = s - tmp1; - } - - data[n / 2] *= 2; - ctx->rdft.rdft_calc(&ctx->rdft, data); - - data[0] *= 0.5f; - - for (i = 1; i < n - 2; i += 2) { - data[i + 1] += data[i - 1]; - data[i] = -data[i + 2]; - } - - data[n - 1] = 0; -} - -static void dct_calc_I_c(DCTContext *ctx, FFTSample *data) -{ - int n = 1 << ctx->nbits; - int i; - float next = -0.5f * (data[0] - data[n]); - - for (i = 0; i < n / 2; i++) { - float tmp1 = data[i]; - float tmp2 = data[n - i]; - float s = SIN(ctx, n, 2 * i); - float c = COS(ctx, n, 2 * i); - - c *= tmp1 - tmp2; - s *= tmp1 - tmp2; - - next += c; - - tmp1 = (tmp1 + tmp2) * 0.5f; - data[i] = tmp1 - s; - data[n - i] = tmp1 + s; - } - - ctx->rdft.rdft_calc(&ctx->rdft, data); - data[n] = data[1]; - data[1] = next; - - for (i = 3; i <= n; i += 2) - data[i] = data[i - 2] - data[i]; -} - -static void dct_calc_III_c(DCTContext *ctx, FFTSample *data) -{ - int n = 1 << ctx->nbits; - int i; - - float next = data[n - 1]; - float inv_n = 1.0f / n; - - for (i = n - 2; i >= 2; i -= 2) { - float val1 = data[i]; - float val2 = data[i - 1] - data[i + 1]; - float c = COS(ctx, n, i); - float s = SIN(ctx, n, i); - - data[i] = c * val1 + s * val2; - data[i + 1] = s * val1 - c * val2; - } - - data[1] = 2 * next; - - ctx->rdft.rdft_calc(&ctx->rdft, data); - - for (i = 0; i < n / 2; i++) { - float tmp1 = data[i] * inv_n; - float tmp2 = data[n - i - 1] * inv_n; - float csc = ctx->csc2[i] * (tmp1 - tmp2); - - tmp1 += tmp2; - data[i] = tmp1 + csc; - data[n - i - 1] = tmp1 - csc; - } -} - -static void dct_calc_II_c(DCTContext *ctx, FFTSample *data) -{ - int n = 1 << ctx->nbits; - int i; - float next; - - for (i = 0; i < n / 2; i++) { - float tmp1 = data[i]; - float tmp2 = data[n - i - 1]; - float s = SIN(ctx, n, 2 * i + 1); - - s *= tmp1 - tmp2; - tmp1 = (tmp1 + tmp2) * 0.5f; - - data[i] = tmp1 + s; - data[n-i-1] = tmp1 - s; - } - - ctx->rdft.rdft_calc(&ctx->rdft, data); - - next = data[1] * 0.5; - data[1] *= -1; - - for (i = n - 2; i >= 0; i -= 2) { - float inr = data[i ]; - float ini = data[i + 1]; - float c = COS(ctx, n, i); - float s = SIN(ctx, n, i); - - data[i] = c * inr + s * ini; - data[i + 1] = next; - - next += s * inr - c * ini; - } -} - -static void dct32_func(DCTContext *ctx, FFTSample *data) -{ - ctx->dct32(data, data); -} - -av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse) -{ - int n = 1 << nbits; - int i; - int ret; - - memset(s, 0, sizeof(*s)); - - s->nbits = nbits; - s->inverse = inverse; - - if (inverse == DCT_II && nbits == 5) { - s->dct_calc = dct32_func; - } else { - ff_init_ff_cos_tabs(nbits + 2); - - s->costab = ff_cos_tabs[nbits + 2]; - s->csc2 = av_malloc_array(n / 2, sizeof(FFTSample)); - if (!s->csc2) - return AVERROR(ENOMEM); - - if ((ret = ff_rdft_init(&s->rdft, nbits, inverse == DCT_III)) < 0) { - av_freep(&s->csc2); - return ret; - } - - for (i = 0; i < n / 2; i++) - s->csc2[i] = 0.5 / sin((M_PI / (2 * n) * (2 * i + 1))); - - switch (inverse) { - case DCT_I : s->dct_calc = dct_calc_I_c; break; - case DCT_II : s->dct_calc = dct_calc_II_c; break; - case DCT_III: s->dct_calc = dct_calc_III_c; break; - case DST_I : s->dct_calc = dst_calc_I_c; break; - } - } - - s->dct32 = ff_dct32_float; -#if ARCH_X86 - ff_dct_init_x86(s); -#endif - - return 0; -} - -av_cold void ff_dct_end(DCTContext *s) -{ - ff_rdft_end(&s->rdft); - av_freep(&s->csc2); -} diff --git a/libavcodec/dct.h b/libavcodec/dct.h index 75f40b9f84..17c881a695 100644 --- a/libavcodec/dct.h +++ b/libavcodec/dct.h @@ -21,37 +21,12 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT) +#ifndef AVCODEC_DCT_H #define AVCODEC_DCT_H #include #include -#include "rdft.h" - -struct DCTContext { - int nbits; - int inverse; - RDFTContext rdft; - const float *costab; - FFTSample *csc2; - void (*dct_calc)(struct DCTContext *s, FFTSample *data); - void (*dct32)(FFTSample *out, const FFTSample *in); -}; - -/** - * Set up DCT. - * @param nbits size of the input array: - * (1 << nbits) for DCT-II, DCT-III and DST-I - * (1 << nbits) + 1 for DCT-I - * - * @note the first element of the input of DST-I is ignored - */ -int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type); -void ff_dct_end (DCTContext *s); - -void ff_dct_init_x86(DCTContext *s); - void ff_j_rev_dct(int16_t *data); void ff_j_rev_dct4(int16_t *data); void ff_j_rev_dct2(int16_t *data); diff --git a/libavcodec/fft-internal.h b/libavcodec/fft-internal.h deleted file mode 100644 index d89a3e38ca..0000000000 --- a/libavcodec/fft-internal.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_FFT_INTERNAL_H -#define AVCODEC_FFT_INTERNAL_H - -#include "libavutil/mathematics.h" -#include "fft.h" - -#if FFT_FLOAT - -#define FIX15(v) (v) -#define sqrthalf (float)M_SQRT1_2 - -#define BF(x, y, a, b) do { \ - x = a - b; \ - y = a + b; \ - } while (0) - -#define CMUL(dre, dim, are, aim, bre, bim) do { \ - (dre) = (are) * (bre) - (aim) * (bim); \ - (dim) = (are) * (bim) + (aim) * (bre); \ - } while (0) - -#else /* FFT_FLOAT */ - -#define CMUL(dre, dim, are, aim, bre, bim) do { \ - int64_t accu; \ - (accu) = (int64_t)(bre) * (are); \ - (accu) -= (int64_t)(bim) * (aim); \ - (dre) = (int)(((accu) + 0x40000000) >> 31); \ - (accu) = (int64_t)(bre) * (aim); \ - (accu) += (int64_t)(bim) * (are); \ - (dim) = (int)(((accu) + 0x40000000) >> 31); \ - } while (0) - -#endif /* FFT_FLOAT */ - -#define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c) -#define ff_imdct_half_c FFT_NAME(ff_imdct_half_c) -#define ff_mdct_calc_c FFT_NAME(ff_mdct_calc_c) - -void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); - -#endif /* AVCODEC_FFT_INTERNAL_H */ diff --git a/libavcodec/fft.h b/libavcodec/fft.h deleted file mode 100644 index d46e5a3f0b..0000000000 --- a/libavcodec/fft.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2000, 2001, 2002 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_FFT_H -#define AVCODEC_FFT_H - -#ifndef FFT_FLOAT -#define FFT_FLOAT 1 -#endif - -#include -#include "config.h" - -#include "libavutil/attributes_internal.h" -#include "libavutil/mem_internal.h" - -#if FFT_FLOAT - -#include "avfft.h" - -#define FFT_NAME(x) x - -typedef float FFTDouble; - -#else - -#define Q31(x) (int)((x)*2147483648.0 + 0.5) -#define FFT_NAME(x) x ## _fixed_32 - -typedef int32_t FFTSample; - -typedef struct FFTComplex { - FFTSample re, im; -} FFTComplex; - -typedef int FFTDouble; -typedef struct FFTContext FFTContext; - -#endif /* FFT_FLOAT */ - -typedef struct FFTDComplex { - FFTDouble re, im; -} FFTDComplex; - -/* FFT computation */ - -enum fft_permutation_type { - FF_FFT_PERM_DEFAULT, - FF_FFT_PERM_SWAP_LSBS, - FF_FFT_PERM_AVX, -}; - -enum mdct_permutation_type { - FF_MDCT_PERM_NONE, - FF_MDCT_PERM_INTERLEAVE, -}; - -struct FFTContext { - int nbits; - int inverse; - uint16_t *revtab; - FFTComplex *tmp_buf; - int mdct_size; /* size of MDCT (i.e. number of input data * 2) */ - int mdct_bits; /* n = 2^nbits */ - /* pre/post rotation tables */ - FFTSample *tcos; - FFTSample *tsin; - /** - * Do the permutation needed BEFORE calling fft_calc(). - */ - void (*fft_permute)(struct FFTContext *s, FFTComplex *z); - /** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before. No 1.0/sqrt(n) normalization is done. - */ - void (*fft_calc)(struct FFTContext *s, FFTComplex *z); - void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - enum fft_permutation_type fft_permutation; - enum mdct_permutation_type mdct_permutation; - uint32_t *revtab32; -}; - -#if CONFIG_HARDCODED_TABLES -#define COSTABLE_CONST const -#define ff_init_ff_cos_tabs(index) -#else -#define COSTABLE_CONST -#define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs) - -/** - * Initialize the cosine table in ff_cos_tabs[index] - * @param index index in ff_cos_tabs array of the table to initialize - */ -void ff_init_ff_cos_tabs(int index); -#endif - -#define COSTABLE(size) \ - COSTABLE_CONST attribute_visibility_hidden DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2] - -extern COSTABLE(16); -extern COSTABLE(32); -extern COSTABLE(64); -extern COSTABLE(128); -extern COSTABLE(256); -extern COSTABLE(512); -extern COSTABLE(1024); -extern COSTABLE(2048); -extern COSTABLE(4096); -extern COSTABLE(8192); -extern COSTABLE(16384); -extern COSTABLE(32768); -extern COSTABLE(65536); -extern COSTABLE(131072); -extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18]; - -#define ff_fft_init FFT_NAME(ff_fft_init) -#define ff_fft_end FFT_NAME(ff_fft_end) - -/** - * Set up a complex FFT. - * @param nbits log2 of the length of the input array - * @param inverse if 0 perform the forward transform, if 1 perform the inverse - */ -int ff_fft_init(FFTContext *s, int nbits, int inverse); - -void ff_fft_init_aarch64(FFTContext *s); -void ff_fft_init_x86(FFTContext *s); -void ff_fft_init_arm(FFTContext *s); -void ff_fft_init_mips(FFTContext *s); -void ff_fft_init_ppc(FFTContext *s); - -void ff_fft_end(FFTContext *s); - -#define ff_mdct_init FFT_NAME(ff_mdct_init) -#define ff_mdct_end FFT_NAME(ff_mdct_end) - -int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); -void ff_mdct_end(FFTContext *s); - -#endif /* AVCODEC_FFT_H */ diff --git a/libavcodec/fft_fixed_32.c b/libavcodec/fft_fixed_32.c deleted file mode 100644 index e18dc83891..0000000000 --- a/libavcodec/fft_fixed_32.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2012 - * MIPS Technologies, Inc., California. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Authors: Stanislav Ocovaj (socovaj@mips.com) - * Goran Cordasic (goran@mips.com) - * Djordje Pesut (djordje@mips.com) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define FFT_FLOAT 0 -#include "fft_template.c" diff --git a/libavcodec/fft_float.c b/libavcodec/fft_float.c deleted file mode 100644 index a9fd01978d..0000000000 --- a/libavcodec/fft_float.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define FFT_FLOAT 1 -#include "fft_template.c" diff --git a/libavcodec/fft_init_table.c b/libavcodec/fft_init_table.c deleted file mode 100644 index 83e35ffb7c..0000000000 --- a/libavcodec/fft_init_table.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (c) 2012 - * MIPS Technologies, Inc., California. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Authors: Stanislav Ocovaj (socovaj@mips.com) - * Goran Cordasic (goran@mips.com) - * Djordje Pesut (djordje@mips.com) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * definitions and initialization of LUT table for FFT - */ -#include "libavutil/thread.h" - -#include "libavcodec/fft_table.h" - -const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = { -2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697, -2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527, -2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511, -2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656, -2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973, -2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474, -2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176, -2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098, -2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264, -2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697, -2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427, -2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486, -2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908, -2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730, -2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994, -2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743, -2137142927, 2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023, -2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885, -2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381, -2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567, -2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502, -2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248, -2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870, -2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435, -2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015, -2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683, -2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516, -2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595, -2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001, -2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822, -2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146, -2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065, -2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674, -2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071, -2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357, -2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635, -2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 2093116901, 2092748012, -2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599, -2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508, -2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854, -2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757, -2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338, -2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721, -2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035, -2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410, -2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978, -2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877, -2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246, -2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227, -2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965, -2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608, -2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306, -2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215, -2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490, -2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291, -2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780, -2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123, -2017473321, 2016908331, 2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489, -2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047, -2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973, -2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442, -1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635, -1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733, -1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923, -1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392, -1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331, -1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933, -1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396, -1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919, -1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703, -1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954, -1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880, -1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689, -1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597, -1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818, -1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571, -1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078, -1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 1906949562, -1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251, -1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374, -1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163, -1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853, -1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681, -1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888, -1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717, -1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413, -1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224, -1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401, -1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197, -1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869, -1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675, -1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877, -1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737, -1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523, -1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504, -1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950, -1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137, -1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340, -1770792044, 1769859707, 1768926328, 1767991909, 1767056450, 1766119952, 1765182414, 1764243838, -1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914, -1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851, -1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936, -1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458, -1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708, -1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981, -1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573, -1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783, -1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912, -1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263, -1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144, -1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862, -1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729, -1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057, -1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162, -1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362, -1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978, -1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332, -1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749, -1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555, -1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082, -1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660, -1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623, -1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307, -1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052, -1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197, -1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087, -1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064, -1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478, -1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678, -1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014, -1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842, -1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516, -1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394, -1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838, -1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208, -1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870, -1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189, -1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534, -1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275, -1412629117, 1411388129, 1410146309, 1408903661, 1407660183, 1406415878, 1405170745, 1403924785, -1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438, -1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610, -1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680, -1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028, -1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037, -1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090, -1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574, -1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877, -1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388, -1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500, -1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606, -1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102, -1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384, -1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853, -1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909, -1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955, -1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395, -1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636, -1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085, -1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153, -1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250, -1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790, -1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188, -1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860, -1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224, -1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701, -1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710, -1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676, -1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023, -1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176, -1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564, -1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616, -1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763, -1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436, -1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069, -1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098, -1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959, -1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091, -988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932, -977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923, -965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 955220508, -953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129, -941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231, -930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262, -918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667, -906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898, -894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402, -882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633, -870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043, -858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086, -846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217, -833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893, -821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571, -809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709, -797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769, -785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210, -772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496, -760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089, -748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453, -735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055, -723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360, -711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836, -698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952, -686125387, 684564417, 683003045, 681441272, 679879097, 678316522, 676753549, 675190177, -673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981, -661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837, -648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215, -635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590, -623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436, -610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227, -598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440, -585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552, -572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039, -560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382, -547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058, -534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547, -521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332, -509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892, -496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710, -483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270, -470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055, -457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549, -444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236, -431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604, -418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137, -406021865, 404404353, 402786604, 401168618, 399550396, 397931939, 396313247, 394694323, -393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649, -380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603, -367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673, -354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348, -341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119, -328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474, -315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904, -302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901, -289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955, -275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558, -262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203, -249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382, -236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587, -223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312, -210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051, -197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297, -184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544, -171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287, -157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021, -144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239, -131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438, -118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112, -105372028, 103726882, 102081675, 100436408, 98791081, 97145697, 95500255, 93854758, - 92209205, 90563597, 88917937, 87272224, 85626460, 83980645, 82334782, 80688869, - 79042909, 77396903, 75750851, 74104755, 72458615, 70812432, 69166208, 67519943, - 65873638, 64227295, 62580914, 60934496, 59288042, 57641553, 55995030, 54348475, - 52701887, 51055268, 49408620, 47761942, 46115236, 44468503, 42821744, 41174960, - 39528151, 37881320, 36234466, 34587590, 32940695, 31293780, 29646846, 27999895, - 26352928, 24705945, 23058947, 21411936, 19764913, 18117878, 16470832, 14823776, - 13176712, 11529640, 9882561, 8235476, 6588387, 4941294, 3294197, 1647099 -}; - -uint16_t ff_fft_offsets_lut[21845]; - -static void fft_lut_init(uint16_t *table, int off, int size, int *index) -{ - if (size < 16) { - table[*index] = off >> 2; - (*index)++; - } - else { - fft_lut_init(table, off, size >> 1, index); - fft_lut_init(table, off + (size >> 1), size >> 2, index); - fft_lut_init(table, off + 3 * (size >> 2), size >> 2, index); - } -} - -static void fft_lut_init_start(void) -{ - int n = 0; - - fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n); -} - -void ff_fft_lut_init(void) -{ - static AVOnce init_once = AV_ONCE_INIT; - - ff_thread_once(&init_once, fft_lut_init_start); -} diff --git a/libavcodec/fft_table.h b/libavcodec/fft_table.h deleted file mode 100644 index 09df49f2b8..0000000000 --- a/libavcodec/fft_table.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2012 - * MIPS Technologies, Inc., California. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Authors: Stanislav Ocovaj (socovaj@mips.com) - * Goran Cordasic (goran@mips.com) - * Djordje Pesut (djordje@mips.com) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * definitions and tables for FFT - */ -#ifndef AVCODEC_FFT_TABLE_H -#define AVCODEC_FFT_TABLE_H - -#include "libavcodec/fft.h" - -#define MAX_LOG2_NFFT 17 //!< Specifies maximum allowed fft size -#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT) - -extern const int32_t ff_w_tab_sr[]; -extern uint16_t ff_fft_offsets_lut[]; -void ff_fft_lut_init(void); - -#endif /* AVCODEC_FFT_TABLE_H */ diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c deleted file mode 100644 index f2742a3ae8..0000000000 --- a/libavcodec/fft_template.c +++ /dev/null @@ -1,628 +0,0 @@ -/* - * FFT/IFFT transforms - * Copyright (c) 2008 Loren Merritt - * Copyright (c) 2002 Fabrice Bellard - * Partly based on libdjbfft by D. J. Bernstein - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * FFT/IFFT transforms. - */ - -#include -#include -#include "libavutil/mathematics.h" -#include "libavutil/thread.h" -#include "fft.h" -#include "fft-internal.h" - -#if !FFT_FLOAT -#include "fft_table.h" -#else /* !FFT_FLOAT */ - -/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ -#if !CONFIG_HARDCODED_TABLES -COSTABLE(16); -COSTABLE(32); -COSTABLE(64); -COSTABLE(128); -COSTABLE(256); -COSTABLE(512); -COSTABLE(1024); -COSTABLE(2048); -COSTABLE(4096); -COSTABLE(8192); -COSTABLE(16384); -COSTABLE(32768); -COSTABLE(65536); -COSTABLE(131072); - -static av_cold void init_ff_cos_tabs(int index) -{ - int i; - int m = 1<> 1; - if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; - m >>= 1; - if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; - else return split_radix_permutation(i, m, inverse)*4 - 1; -} - - -static const int avx_tab[] = { - 0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15 -}; - -static int is_second_half_of_fft32(int i, int n) -{ - if (n <= 32) - return i >= 16; - else if (i < n/2) - return is_second_half_of_fft32(i, n/2); - else if (i < 3*n/4) - return is_second_half_of_fft32(i - n/2, n/4); - else - return is_second_half_of_fft32(i - 3*n/4, n/4); -} - -static av_cold void fft_perm_avx(FFTContext *s) -{ - int i; - int n = 1 << s->nbits; - - for (i = 0; i < n; i += 16) { - int k; - if (is_second_half_of_fft32(i, n)) { - for (k = 0; k < 16; k++) - s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = - i + avx_tab[k]; - - } else { - for (k = 0; k < 16; k++) { - int j = i + k; - j = (j & ~7) | ((j >> 1) & 3) | ((j << 2) & 4); - s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = j; - } - } - } -} - -av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) -{ - int i, j, n; - - s->revtab = NULL; - s->revtab32 = NULL; - - if (nbits < 2 || nbits > 17) - goto fail; - s->nbits = nbits; - n = 1 << nbits; - - if (nbits <= 16) { - s->revtab = av_malloc(n * sizeof(uint16_t)); - if (!s->revtab) - goto fail; - } else { - s->revtab32 = av_malloc(n * sizeof(uint32_t)); - if (!s->revtab32) - goto fail; - } - s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); - if (!s->tmp_buf) - goto fail; - s->inverse = inverse; - s->fft_permutation = FF_FFT_PERM_DEFAULT; - - s->fft_permute = fft_permute_c; - s->fft_calc = fft_calc_c; -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_c; - s->imdct_half = ff_imdct_half_c; - s->mdct_calc = ff_mdct_calc_c; -#endif - -#if FFT_FLOAT -#if ARCH_AARCH64 - ff_fft_init_aarch64(s); -#elif ARCH_ARM - ff_fft_init_arm(s); -#elif ARCH_PPC - ff_fft_init_ppc(s); -#elif ARCH_X86 - ff_fft_init_x86(s); -#endif -#if HAVE_MIPSFPU - ff_fft_init_mips(s); -#endif - for(j=4; j<=nbits; j++) { - ff_init_ff_cos_tabs(j); - } -#else /* FFT_FLOAT */ - ff_fft_lut_init(); -#endif - - - if (ARCH_X86 && FFT_FLOAT && s->fft_permutation == FF_FFT_PERM_AVX) { - fft_perm_avx(s); - } else { -#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\ - for(i = 0; i < n; i++) {\ - int k;\ - j = i;\ - j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\ - k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\ - s->revtab##num[k] = j;\ - } \ -} while(0); - -#define PROCESS_FFT_PERM_DEFAULT(num) do {\ - for(i = 0; i < n; i++) {\ - int k;\ - j = i;\ - k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\ - s->revtab##num[k] = j;\ - } \ -} while(0); - -#define SPLIT_RADIX_PERMUTATION(num) do { \ - if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\ - PROCESS_FFT_PERM_SWAP_LSBS(num) \ - } else {\ - PROCESS_FFT_PERM_DEFAULT(num) \ - }\ -} while(0); - - if (s->revtab) - SPLIT_RADIX_PERMUTATION() - if (s->revtab32) - SPLIT_RADIX_PERMUTATION(32) - -#undef PROCESS_FFT_PERM_DEFAULT -#undef PROCESS_FFT_PERM_SWAP_LSBS -#undef SPLIT_RADIX_PERMUTATION - } - - return 0; - fail: - av_freep(&s->revtab); - av_freep(&s->revtab32); - av_freep(&s->tmp_buf); - return -1; -} - -static void fft_permute_c(FFTContext *s, FFTComplex *z) -{ - int j, np; - const uint16_t *revtab = s->revtab; - const uint32_t *revtab32 = s->revtab32; - np = 1 << s->nbits; - /* TODO: handle split-radix permute in a more optimal way, probably in-place */ - if (revtab) { - for(j=0;jtmp_buf[revtab[j]] = z[j]; - } else - for(j=0;jtmp_buf[revtab32[j]] = z[j]; - - memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); -} - -av_cold void ff_fft_end(FFTContext *s) -{ - av_freep(&s->revtab); - av_freep(&s->revtab32); - av_freep(&s->tmp_buf); -} - -#if !FFT_FLOAT - -static void fft_calc_c(FFTContext *s, FFTComplex *z) { - - int nbits, i, n, num_transforms, offset, step; - int n4, n2, n34; - unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - FFTComplex *tmpz; - const int fft_size = (1 << s->nbits); - int64_t accu; - - num_transforms = (0x2aab >> (16 - s->nbits)) | 1; - - for (n=0; n> 1) | 1; - - for (n=0; n> 31); - accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4); - tmp7 = (int32_t)((accu + 0x40000000) >> 31); - accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1); - tmp6 = (int32_t)((accu + 0x40000000) >> 31); - accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4); - tmp8 = (int32_t)((accu + 0x40000000) >> 31); - tmp1 = tmp5 + tmp7; - tmp3 = tmp5 - tmp7; - tmp2 = tmp6 + tmp8; - tmp4 = tmp6 - tmp8; - - tmpz[5].re = tmpz[1].re - tmp1; - tmpz[1].re = tmpz[1].re + tmp1; - tmpz[5].im = tmpz[1].im - tmp2; - tmpz[1].im = tmpz[1].im + tmp2; - tmpz[7].re = tmpz[3].re - tmp4; - tmpz[3].re = tmpz[3].re + tmp4; - tmpz[7].im = tmpz[3].im + tmp3; - tmpz[3].im = tmpz[3].im - tmp3; - } - - step = 1 << ((MAX_LOG2_NFFT-4) - 4); - n4 = 4; - - for (nbits=4; nbits<=s->nbits; nbits++){ - n2 = 2*n4; - n34 = 3*n4; - num_transforms = (num_transforms >> 1) | 1; - - for (n=0; n> 31); - accu = (int64_t)w_re*tmpz[ n2+i].im; - accu -= (int64_t)w_im*tmpz[ n2+i].re; - tmp2 = (int32_t)((accu + 0x40000000) >> 31); - accu = (int64_t)w_re*tmpz[n34+i].re; - accu -= (int64_t)w_im*tmpz[n34+i].im; - tmp3 = (int32_t)((accu + 0x40000000) >> 31); - accu = (int64_t)w_re*tmpz[n34+i].im; - accu += (int64_t)w_im*tmpz[n34+i].re; - tmp4 = (int32_t)((accu + 0x40000000) >> 31); - - tmp5 = tmp1 + tmp3; - tmp1 = tmp1 - tmp3; - tmp6 = tmp2 + tmp4; - tmp2 = tmp2 - tmp4; - - tmpz[ n2+i].re = tmpz[ i].re - tmp5; - tmpz[ i].re = tmpz[ i].re + tmp5; - tmpz[ n2+i].im = tmpz[ i].im - tmp6; - tmpz[ i].im = tmpz[ i].im + tmp6; - tmpz[n34+i].re = tmpz[n4+i].re - tmp2; - tmpz[ n4+i].re = tmpz[n4+i].re + tmp2; - tmpz[n34+i].im = tmpz[n4+i].im + tmp1; - tmpz[ n4+i].im = tmpz[n4+i].im - tmp1; - - w_re_ptr += step; - w_im_ptr -= step; - } - } - step >>= 1; - n4 <<= 1; - } -} - -#else /* !FFT_FLOAT */ - -#define BUTTERFLIES(a0,a1,a2,a3) {\ - BF(t3, t5, t5, t1);\ - BF(a2.re, a0.re, a0.re, t5);\ - BF(a3.im, a1.im, a1.im, t3);\ - BF(t4, t6, t2, t6);\ - BF(a3.re, a1.re, a1.re, t4);\ - BF(a2.im, a0.im, a0.im, t6);\ -} - -// force loading all the inputs before storing any. -// this is slightly slower for small data, but avoids store->load aliasing -// for addresses separated by large powers of 2. -#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ - FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ - BF(t3, t5, t5, t1);\ - BF(a2.re, a0.re, r0, t5);\ - BF(a3.im, a1.im, i1, t3);\ - BF(t4, t6, t2, t6);\ - BF(a3.re, a1.re, r1, t4);\ - BF(a2.im, a0.im, i0, t6);\ -} - -#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ - CMUL(t1, t2, a2.re, a2.im, wre, -wim);\ - CMUL(t5, t6, a3.re, a3.im, wre, wim);\ - BUTTERFLIES(a0,a1,a2,a3)\ -} - -#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ - t1 = a2.re;\ - t2 = a2.im;\ - t5 = a3.re;\ - t6 = a3.im;\ - BUTTERFLIES(a0,a1,a2,a3)\ -} - -/* z[0...8n-1], w[1...2n-1] */ -#define PASS(name)\ -static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ -{\ - FFTDouble t1, t2, t3, t4, t5, t6;\ - int o1 = 2*n;\ - int o2 = 4*n;\ - int o3 = 6*n;\ - const FFTSample *wim = wre+o1;\ - n--;\ -\ - TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ - TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ - do {\ - z += 2;\ - wre += 2;\ - wim -= 2;\ - TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ - TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ - } while(--n);\ -} - -PASS(pass) -#if !CONFIG_SMALL -#undef BUTTERFLIES -#define BUTTERFLIES BUTTERFLIES_BIG -PASS(pass_big) -#endif - -#define DECL_FFT(n,n2,n4)\ -static void fft##n(FFTComplex *z)\ -{\ - fft##n2(z);\ - fft##n4(z+n4*2);\ - fft##n4(z+n4*3);\ - pass(z,FFT_NAME(ff_cos_##n),n4/2);\ -} - -static void fft4(FFTComplex *z) -{ - FFTDouble t1, t2, t3, t4, t5, t6, t7, t8; - - BF(t3, t1, z[0].re, z[1].re); - BF(t8, t6, z[3].re, z[2].re); - BF(z[2].re, z[0].re, t1, t6); - BF(t4, t2, z[0].im, z[1].im); - BF(t7, t5, z[2].im, z[3].im); - BF(z[3].im, z[1].im, t4, t8); - BF(z[3].re, z[1].re, t3, t7); - BF(z[2].im, z[0].im, t2, t5); -} - -static void fft8(FFTComplex *z) -{ - FFTDouble t1, t2, t3, t4, t5, t6; - - fft4(z); - - BF(t1, z[5].re, z[4].re, -z[5].re); - BF(t2, z[5].im, z[4].im, -z[5].im); - BF(t5, z[7].re, z[6].re, -z[7].re); - BF(t6, z[7].im, z[6].im, -z[7].im); - - BUTTERFLIES(z[0],z[2],z[4],z[6]); - TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); -} - -#if !CONFIG_SMALL -static void fft16(FFTComplex *z) -{ - FFTDouble t1, t2, t3, t4, t5, t6; - FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1]; - FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3]; - - fft8(z); - fft4(z+8); - fft4(z+12); - - TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); - TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); - TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3); - TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1); -} -#else -DECL_FFT(16,8,4) -#endif -DECL_FFT(32,16,8) -DECL_FFT(64,32,16) -DECL_FFT(128,64,32) -DECL_FFT(256,128,64) -DECL_FFT(512,256,128) -#if !CONFIG_SMALL -#define pass pass_big -#endif -DECL_FFT(1024,512,256) -DECL_FFT(2048,1024,512) -DECL_FFT(4096,2048,1024) -DECL_FFT(8192,4096,2048) -DECL_FFT(16384,8192,4096) -DECL_FFT(32768,16384,8192) -DECL_FFT(65536,32768,16384) -DECL_FFT(131072,65536,32768) - -static void (* const fft_dispatch[])(FFTComplex*) = { - fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, - fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072 -}; - -static void fft_calc_c(FFTContext *s, FFTComplex *z) -{ - fft_dispatch[s->nbits-2](z); -} -#endif /* !FFT_FLOAT */ diff --git a/libavcodec/mdct_fixed_32.c b/libavcodec/mdct_fixed_32.c deleted file mode 100644 index eaa6355e67..0000000000 --- a/libavcodec/mdct_fixed_32.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2012 - * MIPS Technologies, Inc., California. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Authors: Stanislav Ocovaj (socovaj@mips.com) - * Goran Cordasic (goran@mips.com) - * Djordje Pesut (djordje@mips.com) - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define FFT_FLOAT 0 -#include "mdct_template.c" diff --git a/libavcodec/mdct_float.c b/libavcodec/mdct_float.c deleted file mode 100644 index 3d3d3a5548..0000000000 --- a/libavcodec/mdct_float.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define FFT_FLOAT 1 -#include "mdct_template.c" diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c deleted file mode 100644 index a854ad2700..0000000000 --- a/libavcodec/mdct_template.c +++ /dev/null @@ -1,209 +0,0 @@ -/* - * MDCT/IMDCT transforms - * Copyright (c) 2002 Fabrice Bellard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include "libavutil/common.h" -#include "libavutil/libm.h" -#include "libavutil/mathematics.h" -#include "fft.h" -#include "fft-internal.h" - -/** - * @file - * MDCT/IMDCT transforms. - */ - -#if FFT_FLOAT -# define RSCALE(x, y) ((x) + (y)) -#else -# define RSCALE(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6) -#endif - -/** - * init MDCT or IMDCT computation. - */ -av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) -{ - int n, n4, i; - double alpha, theta; - int tstep; - - memset(s, 0, sizeof(*s)); - n = 1 << nbits; - s->mdct_bits = nbits; - s->mdct_size = n; - n4 = n >> 2; - s->mdct_permutation = FF_MDCT_PERM_NONE; - - if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) - goto fail; - - s->tcos = av_malloc_array(n/2, sizeof(FFTSample)); - if (!s->tcos) - goto fail; - - switch (s->mdct_permutation) { - case FF_MDCT_PERM_NONE: - s->tsin = s->tcos + n4; - tstep = 1; - break; - case FF_MDCT_PERM_INTERLEAVE: - s->tsin = s->tcos + 1; - tstep = 2; - break; - default: - goto fail; - } - - theta = 1.0 / 8.0 + (scale < 0 ? n4 : 0); - scale = sqrt(fabs(scale)); - for(i=0;itcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0); - s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0); -#else - s->tcos[i*tstep] = FIX15(-cos(alpha) * scale); - s->tsin[i*tstep] = FIX15(-sin(alpha) * scale); -#endif - } - return 0; - fail: - ff_mdct_end(s); - return -1; -} - -/** - * Compute the middle half of the inverse MDCT of size N = 2^nbits, - * thus excluding the parts that can be derived by symmetry - * @param output N/2 samples - * @param input N/2 samples - */ -void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k, n8, n4, n2, n, j; - const uint16_t *revtab = s->revtab; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)output; - - n = 1 << s->mdct_bits; - n2 = n >> 1; - n4 = n >> 2; - n8 = n >> 3; - - /* pre rotation */ - in1 = input; - in2 = input + n2 - 1; - for(k = 0; k < n4; k++) { - j=revtab[k]; - CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]); - in1 += 2; - in2 -= 2; - } - s->fft_calc(s, z); - - /* post rotation + reordering */ - for(k = 0; k < n8; k++) { - FFTSample r0, i0, r1, i1; - CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]); - CMUL(r1, i0, z[n8+k ].im, z[n8+k ].re, tsin[n8+k ], tcos[n8+k ]); - z[n8-k-1].re = r0; - z[n8-k-1].im = i0; - z[n8+k ].re = r1; - z[n8+k ].im = i1; - } -} - -/** - * Compute inverse MDCT of size N = 2^nbits - * @param output N samples - * @param input N/2 samples - */ -void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n2 = n >> 1; - int n4 = n >> 2; - - ff_imdct_half_c(s, output+n4, input); - - for(k = 0; k < n4; k++) { - output[k] = -output[n2-k-1]; - output[n-k-1] = output[n2+k]; - } -} - -/** - * Compute MDCT of size N = 2^nbits - * @param input N samples - * @param out N/2 samples - */ -void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input) -{ - int i, j, n, n8, n4, n2, n3; - FFTDouble re, im; - const uint16_t *revtab = s->revtab; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - FFTComplex *x = (FFTComplex *)out; - - n = 1 << s->mdct_bits; - n2 = n >> 1; - n4 = n >> 2; - n8 = n >> 3; - n3 = 3 * n4; - - /* pre rotation */ - for(i=0;ifft_calc(s, x); - - /* post rotation */ - for(i=0;itcos); - ff_fft_end(s); -} diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 05ed63bf3e..45c56e8ad9 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -13,7 +13,6 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \ mips/acelp_vectors_mips.o MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o -MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o MIPSFPU-OBJS-$(CONFIG_FMTCONVERT) += mips/fmtconvert_mips.o OBJS-$(CONFIG_AC3DSP) += mips/ac3dsp_mips.o OBJS-$(CONFIG_AAC_DECODER) += mips/aacdec_mips.o \ diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c deleted file mode 100644 index bf91fc316c..0000000000 --- a/libavcodec/mips/fft_mips.c +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2012 - * MIPS Technologies, Inc., California. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Author: Stanislav Ocovaj (socovaj@mips.com) - * Author: Zoran Lukic (zoranl@mips.com) - * - * Optimized MDCT/IMDCT and FFT transforms - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "config.h" -#include "libavutil/attributes.h" -#include "libavcodec/fft.h" -#include "libavcodec/fft_table.h" -#include "libavutil/mips/asmdefs.h" - -/** - * FFT transform - */ - -#if HAVE_INLINE_ASM -#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 -static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) -{ - int nbits, i, n, num_transforms, offset, step; - int n4, n2, n34; - FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - FFTComplex *tmpz; - float w_re, w_im; - float *w_re_ptr, *w_im_ptr; - const int fft_size = (1 << s->nbits); - float pom, pom1, pom2, pom3; - float temp, temp1, temp3, temp4; - FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4; - FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i; - float f1 = 0.7071067812; - - num_transforms = (21845 >> (17 - s->nbits)) | 1; - - for (n=0; n> 1) | 1; - - for (n=0; nnbits; nbits++) { - num_transforms = (num_transforms >> 1) | 1; - n2 = 2 * n4; - n34 = 3 * n4; - - for (n=0; n>= 1; - n4 <<= 1; - } -} - -/** - * MDCT/IMDCT transforms. - */ - -static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k, n8, n4, n2, n, j; - const uint16_t *revtab = s->revtab; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - const FFTSample *in1, *in2, *in3, *in4; - FFTComplex *z = (FFTComplex *)output; - - int j1; - const float *tcos1, *tsin1, *tcos2, *tsin2; - float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, - temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; - FFTComplex *z1, *z2; - - n = 1 << s->mdct_bits; - n2 = n >> 1; - n4 = n >> 2; - n8 = n >> 3; - - /* pre rotation */ - in1 = input; - in2 = input + n2 - 1; - in3 = input + 2; - in4 = input + n2 - 3; - - tcos1 = tcos; - tsin1 = tsin; - - /* n4 = 64 or 128 */ - for(k = 0; k < n4; k += 2) { - j = revtab[k ]; - j1 = revtab[k + 1]; - - __asm__ volatile ( - "lwc1 %[temp1], 0(%[in2]) \t\n" - "lwc1 %[temp2], 0(%[tcos1]) \t\n" - "lwc1 %[temp3], 0(%[tsin1]) \t\n" - "lwc1 %[temp4], 0(%[in1]) \t\n" - "lwc1 %[temp5], 0(%[in4]) \t\n" - "mul.s %[temp9], %[temp1], %[temp2] \t\n" - "mul.s %[temp10], %[temp1], %[temp3] \t\n" - "lwc1 %[temp6], 4(%[tcos1]) \t\n" - "lwc1 %[temp7], 4(%[tsin1]) \t\n" - "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n" - "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n" - "mul.s %[temp11], %[temp5], %[temp6] \t\n" - "mul.s %[temp12], %[temp5], %[temp7] \t\n" - "lwc1 %[temp8], 0(%[in3]) \t\n" - PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n" - PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n" - PTR_ADDIU " %[in1], %[in1], 16 \t\n" - "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n" - "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n" - PTR_ADDIU " %[in2], %[in2], -16 \t\n" - PTR_ADDIU " %[in3], %[in3], 16 \t\n" - PTR_ADDIU " %[in4], %[in4], -16 \t\n" - - : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), - [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), - [temp5]"=&f"(temp5), [temp6]"=&f"(temp6), - [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), - [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), - [temp11]"=&f"(temp11), [temp12]"=&f"(temp12), - [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1), - [in1]"+r"(in1), [in2]"+r"(in2), - [in3]"+r"(in3), [in4]"+r"(in4) - : - : "memory" - ); - - z[j ].re = temp9; - z[j ].im = temp10; - z[j1].re = temp11; - z[j1].im = temp12; - } - - s->fft_calc(s, z); - - /* post rotation + reordering */ - /* n8 = 32 or 64 */ - for(k = 0; k < n8; k += 2) { - tcos1 = &tcos[n8 - k - 2]; - tsin1 = &tsin[n8 - k - 2]; - tcos2 = &tcos[n8 + k]; - tsin2 = &tsin[n8 + k]; - z1 = &z[n8 - k - 2]; - z2 = &z[n8 + k ]; - - __asm__ volatile ( - "lwc1 %[temp1], 12(%[z1]) \t\n" - "lwc1 %[temp2], 4(%[tsin1]) \t\n" - "lwc1 %[temp3], 4(%[tcos1]) \t\n" - "lwc1 %[temp4], 8(%[z1]) \t\n" - "lwc1 %[temp5], 4(%[z1]) \t\n" - "mul.s %[temp9], %[temp1], %[temp2] \t\n" - "mul.s %[temp10], %[temp1], %[temp3] \t\n" - "lwc1 %[temp6], 0(%[tsin1]) \t\n" - "lwc1 %[temp7], 0(%[tcos1]) \t\n" - "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n" - "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n" - "mul.s %[temp11], %[temp5], %[temp6] \t\n" - "mul.s %[temp12], %[temp5], %[temp7] \t\n" - "lwc1 %[temp8], 0(%[z1]) \t\n" - "lwc1 %[temp1], 4(%[z2]) \t\n" - "lwc1 %[temp2], 0(%[tsin2]) \t\n" - "lwc1 %[temp3], 0(%[tcos2]) \t\n" - "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n" - "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n" - "mul.s %[temp13], %[temp1], %[temp2] \t\n" - "mul.s %[temp14], %[temp1], %[temp3] \t\n" - "lwc1 %[temp4], 0(%[z2]) \t\n" - "lwc1 %[temp5], 12(%[z2]) \t\n" - "lwc1 %[temp6], 4(%[tsin2]) \t\n" - "lwc1 %[temp7], 4(%[tcos2]) \t\n" - "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n" - "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n" - "mul.s %[temp15], %[temp5], %[temp6] \t\n" - "mul.s %[temp16], %[temp5], %[temp7] \t\n" - "lwc1 %[temp8], 8(%[z2]) \t\n" - "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n" - "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n" - : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), - [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), - [temp5]"=&f"(temp5), [temp6]"=&f"(temp6), - [temp7]"=&f"(temp7), [temp8]"=&f"(temp8), - [temp9]"=&f"(temp9), [temp10]"=&f"(temp10), - [temp11]"=&f"(temp11), [temp12]"=&f"(temp12), - [temp13]"=&f"(temp13), [temp14]"=&f"(temp14), - [temp15]"=&f"(temp15), [temp16]"=&f"(temp16) - : [z1]"r"(z1), [z2]"r"(z2), - [tsin1]"r"(tsin1), [tcos1]"r"(tcos1), - [tsin2]"r"(tsin2), [tcos2]"r"(tcos2) - : "memory" - ); - - z1[1].re = temp9; - z1[1].im = temp14; - z2[0].re = temp13; - z2[0].im = temp10; - - z1[0].re = temp11; - z1[0].im = temp16; - z2[1].re = temp15; - z2[1].im = temp12; - } -} - -/** - * Compute inverse MDCT of size N = 2^nbits - * @param output N samples - * @param input N/2 samples - */ -static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n2 = n >> 1; - int n4 = n >> 2; - - ff_imdct_half_mips(s, output+n4, input); - - for(k = 0; k < n4; k+=4) { - output[k] = -output[n2-k-1]; - output[k+1] = -output[n2-k-2]; - output[k+2] = -output[n2-k-3]; - output[k+3] = -output[n2-k-4]; - - output[n-k-1] = output[n2+k]; - output[n-k-2] = output[n2+k+1]; - output[n-k-3] = output[n2+k+2]; - output[n-k-4] = output[n2+k+3]; - } -} -#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */ -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_fft_init_mips(FFTContext *s) -{ - ff_fft_lut_init(); - ff_init_ff_cos_tabs(17); - -#if HAVE_INLINE_ASM -#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6 - s->fft_calc = ff_fft_calc_mips; -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_mips; - s->imdct_half = ff_imdct_half_mips; -#endif -#endif -#endif -} diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index bc13d8a0ce..10b9ca60da 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -1,9 +1,6 @@ # subsystems OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o -OBJS-$(CONFIG_FFT) += ppc/fft_init.o \ - ppc/fft_altivec.o \ - ppc/fft_vsx.o OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o OBJS-$(CONFIG_FMTCONVERT) += ppc/fmtconvert_altivec.o OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o diff --git a/libavcodec/ppc/fft_altivec.S b/libavcodec/ppc/fft_altivec.S deleted file mode 100644 index 8cd68d6a90..0000000000 --- a/libavcodec/ppc/fft_altivec.S +++ /dev/null @@ -1,458 +0,0 @@ -/* - * FFT transform with Altivec optimizations - * Copyright (c) 2009 Loren Merritt - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * These functions are not individually interchangeable with the C versions. - * While C takes arrays of FFTComplex, Altivec leaves intermediate results - * in blocks as convenient to the vector size. - * i.e. {4x real, 4x imaginary, 4x real, ...} - * - * I ignore standard calling convention. - * Instead, the following registers are treated as global constants: - * v14: zero - * v15..v18: cosines - * v19..v29: permutations - * r9: 16 - * r12: ff_cos_tabs - * and the rest are free for local use. - */ - -#include "config.h" - -#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN - -#include "asm.S" - -.text - -.macro addi2 ra, imm // add 32-bit immediate -.if \imm & 0xffff - addi \ra, \ra, \imm@l -.endif -.if (\imm+0x8000)>>16 - addis \ra, \ra, \imm@ha -.endif -.endm - -.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} -.endm - -.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3 - vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2} - vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3} - vperm \b2,\b0,\b1,v20 - vperm \b3,\b0,\b1,v21 - vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5} - vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4} - vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8} - vmrghw \b2,\b0,\b1 - vperm \b3,\b0,\b1,v22 - vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1} - vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3} - vaddfp \b0,\b2,\b3 - vsubfp \b1,\b2,\b3 - vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3} - vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3} - vperm \b2,\b0,\b1,v23 - vperm \b3,\b0,\b1,v24 -.endm - -.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1 - vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6} - vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7} - vperm \a2,\a0,\a1,v20 // FFT4 ... - vperm \a3,\a0,\a1,v21 - vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4} - vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7} - vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2) - vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9} - vmrghw \a2,\a0,\a1 - vperm \a3,\a0,\a1,v22 - vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8} - vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta} - vaddfp \a0,\a2,\a3 - vsubfp \a1,\a2,\a3 - vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta} - vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb} - vperm \a2,\a0,\a1,v23 - vperm \a3,\a0,\a1,v24 - vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb} - vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc} - vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7} - vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7} - vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3} - vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3} -.endm - -.macro BF d0,d1,s0,s1 - vsubfp \d1,\s0,\s1 - vaddfp \d0,\s0,\s1 -.endm - -.macro zip d0,d1,s0,s1 - vmrghw \d0,\s0,\s1 - vmrglw \d1,\s0,\s1 -.endm - -.macro def_fft4 interleave -fft4\interleave\()_altivec: - lvx v0, 0,r3 - lvx v1,r9,r3 - FFT4 v0,v1,v2,v3 -.ifnb \interleave - zip v0,v1,v2,v3 - stvx v0, 0,r3 - stvx v1,r9,r3 -.else - stvx v2, 0,r3 - stvx v3,r9,r3 -.endif - blr -.endm - -.macro def_fft8 interleave -fft8\interleave\()_altivec: - addi r4,r3,32 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 -.ifnb \interleave - zip v4,v5,v0,v1 - zip v6,v7,v2,v3 - stvx v4, 0,r3 - stvx v5,r9,r3 - stvx v6, 0,r4 - stvx v7,r9,r4 -.else - stvx v0, 0,r3 - stvx v1,r9,r3 - stvx v2, 0,r4 - stvx v3,r9,r4 -.endif - blr -.endm - -.macro def_fft16 interleave -fft16\interleave\()_altivec: - addi r5,r3,64 - addi r6,r3,96 - addi r4,r3,32 - lvx v0, 0,r5 - lvx v1,r9,r5 - lvx v2, 0,r6 - lvx v3,r9,r6 - FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7 - lvx v0, 0,r3 - lvx v1,r9,r3 - lvx v2, 0,r4 - lvx v3,r9,r4 - FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12 - vmaddfp v8,v4,v15,v14 // r2*wre - vmaddfp v9,v5,v15,v14 // i2*wre - vmaddfp v10,v6,v15,v14 // r3*wre - vmaddfp v11,v7,v15,v14 // i3*wre - vmaddfp v8,v5,v16,v8 // i2*wim - vnmsubfp v9,v4,v16,v9 // r2*wim - vnmsubfp v10,v7,v16,v10 // i3*wim - vmaddfp v11,v6,v16,v11 // r3*wim - BF v10,v12,v10,v8 - BF v11,v13,v9,v11 - BF v0,v4,v0,v10 - BF v3,v7,v3,v12 - BF v1,v5,v1,v11 - BF v2,v6,v2,v13 -.ifnb \interleave - zip v8, v9,v0,v1 - zip v10,v11,v2,v3 - zip v12,v13,v4,v5 - zip v14,v15,v6,v7 - stvx v8, 0,r3 - stvx v9,r9,r3 - stvx v10, 0,r4 - stvx v11,r9,r4 - stvx v12, 0,r5 - stvx v13,r9,r5 - stvx v14, 0,r6 - stvx v15,r9,r6 -.else - stvx v0, 0,r3 - stvx v4, 0,r5 - stvx v3,r9,r4 - stvx v7,r9,r6 - stvx v1,r9,r3 - stvx v5,r9,r5 - stvx v2, 0,r4 - stvx v6, 0,r6 -.endif - blr -.endm - -// void pass(float *z, float *wre, int n) -.macro PASS interleave, suffix -fft_pass\suffix\()_altivec: - mtctr r5 - slwi r0,r5,4 - slwi r7,r5,6 // o2 - slwi r5,r5,5 // o1 - add r10,r5,r7 // o3 - add r0,r4,r0 // wim - addi r6,r5,16 // o1+16 - addi r8,r7,16 // o2+16 - addi r11,r10,16 // o3+16 -1: - lvx v8, 0,r4 // wre - lvx v10, 0,r0 // wim - sub r0,r0,r9 - lvx v9, 0,r0 - vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3] - lvx v4,r3,r7 // r2 = z[o2] - lvx v5,r3,r8 // i2 = z[o2+16] - lvx v6,r3,r10 // r3 = z[o3] - lvx v7,r3,r11 // i3 = z[o3+16] - vmaddfp v10,v4,v8,v14 // r2*wre - vmaddfp v11,v5,v8,v14 // i2*wre - vmaddfp v12,v6,v8,v14 // r3*wre - vmaddfp v13,v7,v8,v14 // i3*wre - lvx v0, 0,r3 // r0 = z[0] - lvx v3,r3,r6 // i1 = z[o1+16] - vmaddfp v10,v5,v9,v10 // i2*wim - vnmsubfp v11,v4,v9,v11 // r2*wim - vnmsubfp v12,v7,v9,v12 // i3*wim - vmaddfp v13,v6,v9,v13 // r3*wim - lvx v1,r3,r9 // i0 = z[16] - lvx v2,r3,r5 // r1 = z[o1] - BF v12,v8,v12,v10 - BF v13,v9,v11,v13 - BF v0,v4,v0,v12 - BF v3,v7,v3,v8 -.if !\interleave - stvx v0, 0,r3 - stvx v4,r3,r7 - stvx v3,r3,r6 - stvx v7,r3,r11 -.endif - BF v1,v5,v1,v13 - BF v2,v6,v2,v9 -.if !\interleave - stvx v1,r3,r9 - stvx v2,r3,r5 - stvx v5,r3,r8 - stvx v6,r3,r10 -.else - vmrghw v8,v0,v1 - vmrglw v9,v0,v1 - stvx v8, 0,r3 - stvx v9,r3,r9 - vmrghw v8,v2,v3 - vmrglw v9,v2,v3 - stvx v8,r3,r5 - stvx v9,r3,r6 - vmrghw v8,v4,v5 - vmrglw v9,v4,v5 - stvx v8,r3,r7 - stvx v9,r3,r8 - vmrghw v8,v6,v7 - vmrglw v9,v6,v7 - stvx v8,r3,r10 - stvx v9,r3,r11 -.endif - addi r3,r3,32 - addi r4,r4,16 - bdnz 1b - sub r3,r3,r5 - blr -.endm - -#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ - -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d - - .rodata - .align 4 -fft_data: - .float 0, 0, 0, 0 - .float 1, 0.92387953, M_SQRT1_2, 0.38268343 - .float 0, 0.38268343, M_SQRT1_2, 0.92387953 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 - .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 - vcprm(s0,3,2,1) - vcprm(0,1,s2,s1) - vcprm(2,3,s0,s3) - vcprm(2,s3,3,s2) - vcprm(0,1,s0,s1) - vcprm(2,3,s2,s3) - vcprm(2,3,0,1) - vcprm(1,2,s3,s0) - vcprm(0,3,s2,s1) - vcprm(0,2,s1,s3) - vcprm(1,3,s0,s2) - -.macro lvm b, r, regs:vararg - lvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - lvm \b, \regs - .endif -.endm - -.macro stvm b, r, regs:vararg - stvx \r, 0, \b - addi \b, \b, 16 - .ifnb \regs - stvm \b, \regs - .endif -.endm - -.macro fft_calc interleave -extfunc ff_fft_calc\interleave\()_altivec - mflr r0 - stp r0, 2*PS(R(1)) - stpu r1, -(160+16*PS)(R(1)) - get_got r11 - addi r6, r1, 16*PS - stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - mfvrsave r0 - stw r0, 15*PS(R(1)) -#if __APPLE__ - li r6, 0xfffffffc -#else - li r6, -4 -#endif - mtvrsave r6 - - movrel r6, fft_data, r11 - lvm r6, v14, v15, v16, v17, v18, v19, v20, v21 - lvm r6, v22, v23, v24, v25, v26, v27, v28, v29 - - li r9, 16 - movrel r12, X(ff_cos_tabs), r11 - - movrel r6, fft_dispatch_tab\interleave\()_altivec, r11 - lwz r3, 0(R(3)) - subi r3, r3, 2 - slwi r3, r3, 2+ARCH_PPC64 - lpx r3, r3, r6 - mtctr r3 - mr r3, r4 - bctrl - - addi r6, r1, 16*PS - lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 - lwz r6, 15*PS(R(1)) - mtvrsave r6 - lp r1, 0(R(1)) - lp r0, 2*PS(R(1)) - mtlr r0 - blr -.endm - -.macro DECL_FFT suffix, bits, n, n2, n4 -fft\n\suffix\()_altivec: - mflr r0 - stp r0,PS*(\bits-3)(R(1)) - bl fft\n2\()_altivec - addi2 r3,\n*4 - bl fft\n4\()_altivec - addi2 r3,\n*2 - bl fft\n4\()_altivec - addi2 r3,\n*-6 - lp r0,PS*(\bits-3)(R(1)) - lp r4,\bits*PS(R(12)) - mtlr r0 - li r5,\n/16 - b fft_pass\suffix\()_altivec -.endm - -.macro DECL_FFTS interleave, suffix - .text - def_fft4 \suffix - def_fft8 \suffix - def_fft16 \suffix - PASS \interleave, \suffix - DECL_FFT \suffix, 5, 32, 16, 8 - DECL_FFT \suffix, 6, 64, 32, 16 - DECL_FFT \suffix, 7, 128, 64, 32 - DECL_FFT \suffix, 8, 256, 128, 64 - DECL_FFT \suffix, 9, 512, 256, 128 - DECL_FFT \suffix,10, 1024, 512, 256 - DECL_FFT \suffix,11, 2048, 1024, 512 - DECL_FFT \suffix,12, 4096, 2048, 1024 - DECL_FFT \suffix,13, 8192, 4096, 2048 - DECL_FFT \suffix,14,16384, 8192, 4096 - DECL_FFT \suffix,15,32768,16384, 8192 - DECL_FFT \suffix,16,65536,32768,16384 - - fft_calc \suffix - - .rodata - .align 3 -fft_dispatch_tab\suffix\()_altivec: - PTR fft4\suffix\()_altivec - PTR fft8\suffix\()_altivec - PTR fft16\suffix\()_altivec - PTR fft32\suffix\()_altivec - PTR fft64\suffix\()_altivec - PTR fft128\suffix\()_altivec - PTR fft256\suffix\()_altivec - PTR fft512\suffix\()_altivec - PTR fft1024\suffix\()_altivec - PTR fft2048\suffix\()_altivec - PTR fft4096\suffix\()_altivec - PTR fft8192\suffix\()_altivec - PTR fft16384\suffix\()_altivec - PTR fft32768\suffix\()_altivec - PTR fft65536\suffix\()_altivec -.endm - -DECL_FFTS 0 -DECL_FFTS 1, _interleave - -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ diff --git a/libavcodec/ppc/fft_init.c b/libavcodec/ppc/fft_init.c deleted file mode 100644 index 65ce64f6a1..0000000000 --- a/libavcodec/ppc/fft_init.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2009 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/cpu.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" - -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). - * The input data must be permuted before with s->revtab table. - * No 1.0 / sqrt(n) normalization is done. - * AltiVec-enabled: - * This code assumes that the 'z' pointer is 16 bytes-aligned. - * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. - */ - -#if HAVE_VSX -#include "fft_vsx.h" -#else -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); -void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); -#endif - -#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) -static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int j, k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n8 = n >> 3; - int n32 = n >> 5; - const uint16_t *revtabj = s->revtab; - const uint16_t *revtabk = s->revtab+n4; - const vec_f *tcos = (const vec_f*)(s->tcos+n8); - const vec_f *tsin = (const vec_f*)(s->tsin+n8); - const vec_f *pin = (const vec_f*)(input+n4); - vec_f *pout = (vec_f*)(output+n4); - - /* pre rotation */ - k = n32-1; - do { - vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; -#define CMULA(p,o0,o1,o2,o3)\ - a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ - b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ - re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ - im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ - cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ - sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ - r##p = im*cos - re*sin;\ - i##p = re*cos + im*sin; -#define STORE2(v,dst)\ - j = dst;\ - vec_ste(v, 0, output+j*2);\ - vec_ste(v, 4, output+j*2); -#define STORE8(p)\ - a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ - b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ - c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ - d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ - STORE2(a, revtabk[ p*2-4]);\ - STORE2(b, revtabk[ p*2-3]);\ - STORE2(c, revtabj[-p*2+2]);\ - STORE2(d, revtabj[-p*2+3]); - - cos0 = tcos[k]; - sin0 = tsin[k]; - cos1 = tcos[-k-1]; - sin1 = tsin[-k-1]; - CMULA(0, 0,1,2,3); - CMULA(1, 2,3,0,1); - STORE8(0); - STORE8(1); - revtabj += 4; - revtabk -= 4; - k--; - } while(k >= 0); - -#if HAVE_VSX - ff_fft_calc_vsx(s, (FFTComplex*)output); -#else - ff_fft_calc_altivec(s, (FFTComplex*)output); -#endif - - /* post rotation + reordering */ - j = -n32; - k = n32-1; - do { - vec_f cos,sin,re,im,a,b,c,d; -#define CMULB(d0,d1,o)\ - re = pout[o*2];\ - im = pout[o*2+1];\ - cos = tcos[o];\ - sin = tsin[o];\ - d0 = im*sin - re*cos;\ - d1 = re*sin + im*cos; - - CMULB(a,b,j); - CMULB(c,d,k); - pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); - pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); - pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); - pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); - j++; - k--; - } while(k >= 0); -} - -static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) -{ - int k; - int n = 1 << s->mdct_bits; - int n4 = n >> 2; - int n16 = n >> 4; - vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; - vec_u32 *p0 = (vec_u32*)(output+n4); - vec_u32 *p1 = (vec_u32*)(output+n4*3); - - imdct_half_altivec(s, output + n4, input); - - for (k = 0; k < n16; k++) { - vec_u32 a = p0[k] ^ sign; - vec_u32 b = p1[-k-1]; - p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); - p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); - } -} -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */ - -av_cold void ff_fft_init_ppc(FFTContext *s) -{ -#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) - if (!PPC_ALTIVEC(av_get_cpu_flags())) - return; - -#if HAVE_VSX - s->fft_calc = ff_fft_calc_interleave_vsx; -#else - s->fft_calc = ff_fft_calc_interleave_altivec; -#endif - if (s->mdct_bits >= 5) { - s->imdct_calc = imdct_calc_altivec; - s->imdct_half = imdct_half_altivec; - } -#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ -} diff --git a/libavcodec/ppc/fft_vsx.c b/libavcodec/ppc/fft_vsx.c deleted file mode 100644 index c365fa1380..0000000000 --- a/libavcodec/ppc/fft_vsx.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * FFT transform, optimized with VSX built-in functions - * Copyright (c) 2014 Rong Yan - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" -#include "libavcodec/fft-internal.h" -#include "fft_vsx.h" - -#if HAVE_VSX - -static void fft32_vsx_interleave(FFTComplex *z) -{ - fft16_vsx_interleave(z); - fft8_vsx_interleave(z+16); - fft8_vsx_interleave(z+24); - pass_vsx_interleave(z,ff_cos_32,4); -} - -static void fft64_vsx_interleave(FFTComplex *z) -{ - fft32_vsx_interleave(z); - fft16_vsx_interleave(z+32); - fft16_vsx_interleave(z+48); - pass_vsx_interleave(z,ff_cos_64, 8); -} -static void fft128_vsx_interleave(FFTComplex *z) -{ - fft64_vsx_interleave(z); - fft32_vsx_interleave(z+64); - fft32_vsx_interleave(z+96); - pass_vsx_interleave(z,ff_cos_128,16); -} -static void fft256_vsx_interleave(FFTComplex *z) -{ - fft128_vsx_interleave(z); - fft64_vsx_interleave(z+128); - fft64_vsx_interleave(z+192); - pass_vsx_interleave(z,ff_cos_256,32); -} -static void fft512_vsx_interleave(FFTComplex *z) -{ - fft256_vsx_interleave(z); - fft128_vsx_interleave(z+256); - fft128_vsx_interleave(z+384); - pass_vsx_interleave(z,ff_cos_512,64); -} -static void fft1024_vsx_interleave(FFTComplex *z) -{ - fft512_vsx_interleave(z); - fft256_vsx_interleave(z+512); - fft256_vsx_interleave(z+768); - pass_vsx_interleave(z,ff_cos_1024,128); - -} -static void fft2048_vsx_interleave(FFTComplex *z) -{ - fft1024_vsx_interleave(z); - fft512_vsx_interleave(z+1024); - fft512_vsx_interleave(z+1536); - pass_vsx_interleave(z,ff_cos_2048,256); -} -static void fft4096_vsx_interleave(FFTComplex *z) -{ - fft2048_vsx_interleave(z); - fft1024_vsx_interleave(z+2048); - fft1024_vsx_interleave(z+3072); - pass_vsx_interleave(z,ff_cos_4096, 512); -} -static void fft8192_vsx_interleave(FFTComplex *z) -{ - fft4096_vsx_interleave(z); - fft2048_vsx_interleave(z+4096); - fft2048_vsx_interleave(z+6144); - pass_vsx_interleave(z,ff_cos_8192,1024); -} -static void fft16384_vsx_interleave(FFTComplex *z) -{ - fft8192_vsx_interleave(z); - fft4096_vsx_interleave(z+8192); - fft4096_vsx_interleave(z+12288); - pass_vsx_interleave(z,ff_cos_16384,2048); -} -static void fft32768_vsx_interleave(FFTComplex *z) -{ - fft16384_vsx_interleave(z); - fft8192_vsx_interleave(z+16384); - fft8192_vsx_interleave(z+24576); - pass_vsx_interleave(z,ff_cos_32768,4096); -} -static void fft65536_vsx_interleave(FFTComplex *z) -{ - fft32768_vsx_interleave(z); - fft16384_vsx_interleave(z+32768); - fft16384_vsx_interleave(z+49152); - pass_vsx_interleave(z,ff_cos_65536,8192); -} - -static void fft32_vsx(FFTComplex *z) -{ - fft16_vsx(z); - fft8_vsx(z+16); - fft8_vsx(z+24); - pass_vsx(z,ff_cos_32,4); -} - -static void fft64_vsx(FFTComplex *z) -{ - fft32_vsx(z); - fft16_vsx(z+32); - fft16_vsx(z+48); - pass_vsx(z,ff_cos_64, 8); -} -static void fft128_vsx(FFTComplex *z) -{ - fft64_vsx(z); - fft32_vsx(z+64); - fft32_vsx(z+96); - pass_vsx(z,ff_cos_128,16); -} -static void fft256_vsx(FFTComplex *z) -{ - fft128_vsx(z); - fft64_vsx(z+128); - fft64_vsx(z+192); - pass_vsx(z,ff_cos_256,32); -} -static void fft512_vsx(FFTComplex *z) -{ - fft256_vsx(z); - fft128_vsx(z+256); - fft128_vsx(z+384); - pass_vsx(z,ff_cos_512,64); -} -static void fft1024_vsx(FFTComplex *z) -{ - fft512_vsx(z); - fft256_vsx(z+512); - fft256_vsx(z+768); - pass_vsx(z,ff_cos_1024,128); - -} -static void fft2048_vsx(FFTComplex *z) -{ - fft1024_vsx(z); - fft512_vsx(z+1024); - fft512_vsx(z+1536); - pass_vsx(z,ff_cos_2048,256); -} -static void fft4096_vsx(FFTComplex *z) -{ - fft2048_vsx(z); - fft1024_vsx(z+2048); - fft1024_vsx(z+3072); - pass_vsx(z,ff_cos_4096, 512); -} -static void fft8192_vsx(FFTComplex *z) -{ - fft4096_vsx(z); - fft2048_vsx(z+4096); - fft2048_vsx(z+6144); - pass_vsx(z,ff_cos_8192,1024); -} -static void fft16384_vsx(FFTComplex *z) -{ - fft8192_vsx(z); - fft4096_vsx(z+8192); - fft4096_vsx(z+12288); - pass_vsx(z,ff_cos_16384,2048); -} -static void fft32768_vsx(FFTComplex *z) -{ - fft16384_vsx(z); - fft8192_vsx(z+16384); - fft8192_vsx(z+24576); - pass_vsx(z,ff_cos_32768,4096); -} -static void fft65536_vsx(FFTComplex *z) -{ - fft32768_vsx(z); - fft16384_vsx(z+32768); - fft16384_vsx(z+49152); - pass_vsx(z,ff_cos_65536,8192); -} - -static void (* const fft_dispatch_vsx[])(FFTComplex*) = { - fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx, - fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx, -}; -static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = { - fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave, - fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave, - fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave, -}; -void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z) -{ - fft_dispatch_vsx_interleave[s->nbits-2](z); -} -void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z) -{ - fft_dispatch_vsx[s->nbits-2](z); -} -#endif /* HAVE_VSX */ diff --git a/libavcodec/ppc/fft_vsx.h b/libavcodec/ppc/fft_vsx.h deleted file mode 100644 index 1e44031aa5..0000000000 --- a/libavcodec/ppc/fft_vsx.h +++ /dev/null @@ -1,829 +0,0 @@ -#ifndef AVCODEC_PPC_FFT_VSX_H -#define AVCODEC_PPC_FFT_VSX_H -/* - * FFT transform, optimized with VSX built-in functions - * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - -#include "config.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/fft.h" -#include "libavcodec/fft-internal.h" - -#if HAVE_VSX - -void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z); -void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z); - - -#define byte_2complex (2*sizeof(FFTComplex)) -#define byte_4complex (4*sizeof(FFTComplex)) -#define byte_6complex (6*sizeof(FFTComplex)) -#define byte_8complex (8*sizeof(FFTComplex)) -#define byte_10complex (10*sizeof(FFTComplex)) -#define byte_12complex (12*sizeof(FFTComplex)) -#define byte_14complex (14*sizeof(FFTComplex)) - -inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n) -{ - int o1 = n<<1; - int o2 = n<<2; - int o3 = o1+o2; - int i1, i2, i3; - FFTSample* out = (FFTSample*)z; - const FFTSample *wim = wre+o1; - vec_f vz0, vzo1, vzo2, vzo3; - vec_f x0, x1, x2, x3; - vec_f x4, x5, x6, x7; - vec_f x8, x9, x10, x11; - vec_f x12, x13, x14, x15; - vec_f x16, x17, x18, x19; - vec_f x20, x21, x22, x23; - vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1; - vec_f y0, y1, y2, y3; - vec_f y4, y5, y8, y9; - vec_f y10, y13, y14, y15; - vec_f y16, y17, y18, y19; - vec_f y20, y21, y22, y23; - vec_f wr1, wi1, wr0, wi0; - vec_f wr2, wi2, wr3, wi3; - vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3; - - n = n-2; - i1 = o1*sizeof(FFTComplex); - i2 = o2*sizeof(FFTComplex); - i3 = o3*sizeof(FFTComplex); - vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i - vzo2plus1 = vec_ld(i2+16, &(out[0])); - vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i - vzo3plus1 = vec_ld(i3+16, &(out[0])); - vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i - vz0plus1 = vec_ld(16, &(out[0])); - vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i - vzo1plus1 = vec_ld(i1+16, &(out[0])); - - x0 = vec_add(vzo2, vzo3); - x1 = vec_sub(vzo2, vzo3); - y0 = vec_add(vzo2plus1, vzo3plus1); - y1 = vec_sub(vzo2plus1, vzo3plus1); - - wr1 = vec_splats(wre[1]); - wi1 = vec_splats(wim[-1]); - wi2 = vec_splats(wim[-2]); - wi3 = vec_splats(wim[-3]); - wr2 = vec_splats(wre[2]); - wr3 = vec_splats(wre[3]); - - x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); - x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); - - y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); - y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); - y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); - y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); - - ymulwi2 = vec_mul(y4, wi2); - ymulwi3 = vec_mul(y5, wi3); - x4 = vec_mul(x2, wr1); - x5 = vec_mul(x3, wi1); - y8 = vec_madd(y2, wr2, ymulwi2); - y9 = vec_msub(y2, wr2, ymulwi2); - x6 = vec_add(x4, x5); - x7 = vec_sub(x4, x5); - y13 = vec_madd(y3, wr3, ymulwi3); - y14 = vec_msub(y3, wr3, ymulwi3); - - x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3)); - y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); - y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); - - x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2)); - x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1)); - - y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); - y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); - - x11 = vec_add(vz0, x9); - x12 = vec_sub(vz0, x9); - x13 = vec_add(vzo1, x10); - x14 = vec_sub(vzo1, x10); - - y18 = vec_add(vz0plus1, y16); - y19 = vec_sub(vz0plus1, y16); - y20 = vec_add(vzo1plus1, y17); - y21 = vec_sub(vzo1plus1, y17); - - x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3)); - x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3)); - y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); - y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); - - - vec_st(x11, 0, &(out[0])); - vec_st(y18, 16, &(out[0])); - vec_st(x15, i1, &(out[0])); - vec_st(y22, i1+16, &(out[0])); - vec_st(x12, i2, &(out[0])); - vec_st(y19, i2+16, &(out[0])); - vec_st(x16, i3, &(out[0])); - vec_st(y23, i3+16, &(out[0])); - - do { - out += 8; - wre += 4; - wim -= 4; - wr0 = vec_splats(wre[0]); - wr1 = vec_splats(wre[1]); - wi0 = vec_splats(wim[0]); - wi1 = vec_splats(wim[-1]); - - wr2 = vec_splats(wre[2]); - wr3 = vec_splats(wre[3]); - wi2 = vec_splats(wim[-2]); - wi3 = vec_splats(wim[-3]); - - vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i - vzo2plus1 = vec_ld(i2+16, &(out[0])); - vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i - vzo3plus1 = vec_ld(i3+16, &(out[0])); - vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i - vz0plus1 = vec_ld(16, &(out[0])); - vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i - vzo1plus1 = vec_ld(i1+16, &(out[0])); - - x0 = vec_add(vzo2, vzo3); - x1 = vec_sub(vzo2, vzo3); - - y0 = vec_add(vzo2plus1, vzo3plus1); - y1 = vec_sub(vzo2plus1, vzo3plus1); - - x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0)); - x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2)); - x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1)); - x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3)); - - y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1)); - y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3)); - xmulwi0 = vec_mul(x4, wi0); - xmulwi1 = vec_mul(x5, wi1); - - y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0)); - y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2)); - - x8 = vec_madd(x2, wr0, xmulwi0); - x9 = vec_msub(x2, wr0, xmulwi0); - ymulwi2 = vec_mul(y4, wi2); - ymulwi3 = vec_mul(y5, wi3); - - x13 = vec_madd(x3, wr1, xmulwi1); - x14 = vec_msub(x3, wr1, xmulwi1); - - y8 = vec_madd(y2, wr2, ymulwi2); - y9 = vec_msub(y2, wr2, ymulwi2); - y13 = vec_madd(y3, wr3, ymulwi3); - y14 = vec_msub(y3, wr3, ymulwi3); - - x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3)); - x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3)); - - y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3)); - y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3)); - - x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2)); - x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1)); - - y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2)); - y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1)); - - x18 = vec_add(vz0, x16); - x19 = vec_sub(vz0, x16); - x20 = vec_add(vzo1, x17); - x21 = vec_sub(vzo1, x17); - - y18 = vec_add(vz0plus1, y16); - y19 = vec_sub(vz0plus1, y16); - y20 = vec_add(vzo1plus1, y17); - y21 = vec_sub(vzo1plus1, y17); - - x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3)); - x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3)); - - y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3)); - y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3)); - - vec_st(x18, 0, &(out[0])); - vec_st(y18, 16, &(out[0])); - vec_st(x22, i1, &(out[0])); - vec_st(y22, i1+16, &(out[0])); - vec_st(x19, i2, &(out[0])); - vec_st(y19, i2+16, &(out[0])); - vec_st(x23, i3, &(out[0])); - vec_st(y23, i3+16, &(out[0])); - } while (n-=2); -} - -inline static void fft2_vsx_interleave(FFTComplex *z) -{ - FFTSample r1, i1; - - r1 = z[0].re - z[1].re; - z[0].re += z[1].re; - z[1].re = r1; - - i1 = z[0].im - z[1].im; - z[0].im += z[1].im; - z[1].im = i1; - } - -inline static void fft4_vsx_interleave(FFTComplex *z) -{ - vec_f a, b, c, d; - float* out= (float*)z; - a = vec_ld(0, &(out[0])); - b = vec_ld(byte_2complex, &(out[0])); - - c = vec_perm(a, b, vcprm(0,1,s2,s1)); - d = vec_perm(a, b, vcprm(2,3,s0,s3)); - a = vec_add(c, d); - b = vec_sub(c, d); - - c = vec_perm(a, b, vcprm(0,1,s0,s1)); - d = vec_perm(a, b, vcprm(2,3,s3,s2)); - - a = vec_add(c, d); - b = vec_sub(c, d); - vec_st(a, 0, &(out[0])); - vec_st(b, byte_2complex, &(out[0])); -} - -inline static void fft8_vsx_interleave(FFTComplex *z) -{ - vec_f vz0, vz1, vz2, vz3; - vec_f x0, x1, x2, x3; - vec_f x4, x5, x6, x7; - vec_f x8, x9, x10, x11; - vec_f x12, x13, x14, x15; - vec_f x16, x17, x18, x19; - vec_f x20, x21, x22, x23; - vec_f x24, x25, x26, x27; - vec_f x28, x29, x30, x31; - vec_f x32, x33, x34; - - float* out= (float*)z; - vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; - - vz0 = vec_ld(0, &(out[0])); - vz1 = vec_ld(byte_2complex, &(out[0])); - vz2 = vec_ld(byte_4complex, &(out[0])); - vz3 = vec_ld(byte_6complex, &(out[0])); - - x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); - x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); - x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1)); - x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3)); - - x4 = vec_add(x0, x1); - x5 = vec_sub(x0, x1); - x6 = vec_add(x2, x3); - x7 = vec_sub(x2, x3); - - x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1)); - x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2)); - x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1)); - x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3)); - - x12 = vec_add(x8, x9); - x13 = vec_sub(x8, x9); - x14 = vec_add(x10, x11); - x15 = vec_sub(x10, x11); - x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1)); - x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1)); - x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1)); - x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i - x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i - - x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3)); - x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3)); - x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2)); - x24 = vec_add(x22, x23); - x25 = vec_sub(x22, x23); - x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1); - - x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i - x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i - - x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i - x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i - x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i - x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i - x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i - x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i - - vec_st(x29, 0, &(out[0])); - vec_st(x33, byte_2complex, &(out[0])); - vec_st(x31, byte_4complex, &(out[0])); - vec_st(x34, byte_6complex, &(out[0])); -} - -inline static void fft16_vsx_interleave(FFTComplex *z) -{ - float* out= (float*)z; - vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; - vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]}; - vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]}; - vec_f vz0, vz1, vz2, vz3; - vec_f vz4, vz5, vz6, vz7; - vec_f x0, x1, x2, x3; - vec_f x4, x5, x6, x7; - vec_f x8, x9, x10, x11; - vec_f x12, x13, x14, x15; - vec_f x16, x17, x18, x19; - vec_f x20, x21, x22, x23; - vec_f x24, x25, x26, x27; - vec_f x28, x29, x30, x31; - vec_f x32, x33, x34, x35; - vec_f x36, x37, x38, x39; - vec_f x40, x41, x42, x43; - vec_f x44, x45, x46, x47; - vec_f x48, x49, x50, x51; - vec_f x52, x53, x54, x55; - vec_f x56, x57, x58, x59; - vec_f x60, x61, x62, x63; - vec_f x64, x65, x66, x67; - vec_f x68, x69, x70, x71; - vec_f x72, x73, x74, x75; - vec_f x76, x77, x78, x79; - vec_f x80, x81, x82, x83; - vec_f x84, x85, x86; - - vz0 = vec_ld(0, &(out[0])); - vz1 = vec_ld(byte_2complex, &(out[0])); - vz2 = vec_ld(byte_4complex, &(out[0])); - vz3 = vec_ld(byte_6complex, &(out[0])); - vz4 = vec_ld(byte_8complex, &(out[0])); - vz5 = vec_ld(byte_10complex, &(out[0])); - vz6 = vec_ld(byte_12complex, &(out[0])); - vz7 = vec_ld(byte_14complex, &(out[0])); - - x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); - x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); - x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); - x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); - - x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1)); - x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3)); - x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1)); - x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3)); - - x8 = vec_add(x0, x1); - x9 = vec_sub(x0, x1); - x10 = vec_add(x2, x3); - x11 = vec_sub(x2, x3); - - x12 = vec_add(x4, x5); - x13 = vec_sub(x4, x5); - x14 = vec_add(x6, x7); - x15 = vec_sub(x6, x7); - - x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1)); - x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2)); - x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2)); - x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3)); - x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1)); - x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3)); - x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1)); - x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2)); - - x24 = vec_add(x16, x17); - x25 = vec_sub(x16, x17); - x26 = vec_add(x18, x19); - x27 = vec_sub(x18, x19); - x28 = vec_add(x20, x21); - x29 = vec_sub(x20, x21); - x30 = vec_add(x22, x23); - x31 = vec_sub(x22, x23); - - x32 = vec_add(x24, x26); - x33 = vec_sub(x24, x26); - x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1)); - - x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2)); - x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3)); - x37 = vec_add(x35, x36); - x38 = vec_sub(x35, x36); - x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0)); - - x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3)); - x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2)); - x42 = vec_add(x40, x41); - x43 = vec_sub(x40, x41); - x44 = vec_mul(x42, vc0); - x45 = vec_mul(x43, vc0); - - x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i - x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i - - x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2)); - x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0)); - x50 = vec_add(x48, x49); - x51 = vec_sub(x48, x49); - x52 = vec_mul(x50, vc1); - x53 = vec_mul(x50, vc2); - x54 = vec_mul(x51, vc1); - x55 = vec_mul(x51, vc2); - - x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3)); - x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0)); - x58 = vec_add(x56, x57); - x59 = vec_sub(x56, x57); - - x60 = vec_perm(x54, x55, vcprm(1,0,3,2)); - x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2)); - x62 = vec_add(x52, x61); - x63 = vec_sub(x52, x61); - x64 = vec_add(x60, x53); - x65 = vec_sub(x60, x53); - x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2)); - x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2)); - - x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i - x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i - x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i - x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i - - x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3)); - x73 = vec_add(x25, x72); - x74 = vec_sub(x25, x72); - x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1)); - x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3)); - x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i - x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i - - x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i - x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i - x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i - x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i - vec_st(x79, 0, &(out[0])); - vec_st(x80, byte_2complex, &(out[0])); - vec_st(x81, byte_4complex, &(out[0])); - vec_st(x82, byte_6complex, &(out[0])); - x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i - x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i - x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i - x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i - vec_st(x83, byte_8complex, &(out[0])); - vec_st(x84, byte_10complex, &(out[0])); - vec_st(x85, byte_12complex, &(out[0])); - vec_st(x86, byte_14complex, &(out[0])); -} - -inline static void fft4_vsx(FFTComplex *z) -{ - vec_f a, b, c, d; - float* out= (float*)z; - a = vec_ld(0, &(out[0])); - b = vec_ld(byte_2complex, &(out[0])); - - c = vec_perm(a, b, vcprm(0,1,s2,s1)); - d = vec_perm(a, b, vcprm(2,3,s0,s3)); - a = vec_add(c, d); - b = vec_sub(c, d); - - c = vec_perm(a,b, vcprm(0,s0,1,s1)); - d = vec_perm(a, b, vcprm(2,s3,3,s2)); - - a = vec_add(c, d); - b = vec_sub(c, d); - - c = vec_perm(a, b, vcprm(0,1,s0,s1)); - d = vec_perm(a, b, vcprm(2,3,s2,s3)); - - vec_st(c, 0, &(out[0])); - vec_st(d, byte_2complex, &(out[0])); - return; -} - -inline static void fft8_vsx(FFTComplex *z) -{ - vec_f vz0, vz1, vz2, vz3; - vec_f vz4, vz5, vz6, vz7, vz8; - - float* out= (float*)z; - vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; - vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; - vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; - - vz0 = vec_ld(0, &(out[0])); - vz1 = vec_ld(byte_2complex, &(out[0])); - vz2 = vec_ld(byte_4complex, &(out[0])); - vz3 = vec_ld(byte_6complex, &(out[0])); - - vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); - vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); - vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); - - vz2 = vec_add(vz6, vz7); - vz3 = vec_sub(vz6, vz7); - vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); - - vz0 = vec_add(vz4, vz5); - vz1 = vec_sub(vz4, vz5); - - vz3 = vec_madd(vz3, vc1, vc0); - vz3 = vec_madd(vz8, vc2, vz3); - - vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); - vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); - vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); - - vz0 = vec_add(vz4, vz5); - vz1 = vec_sub(vz4, vz5); - vz2 = vec_add(vz6, vz7); - vz3 = vec_sub(vz6, vz7); - - vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); - vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); - vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); - - - vz2 = vec_sub(vz4, vz6); - vz3 = vec_sub(vz5, vz7); - - vz0 = vec_add(vz4, vz6); - vz1 = vec_add(vz5, vz7); - - vec_st(vz0, 0, &(out[0])); - vec_st(vz1, byte_2complex, &(out[0])); - vec_st(vz2, byte_4complex, &(out[0])); - vec_st(vz3, byte_6complex, &(out[0])); - return; -} - -inline static void fft16_vsx(FFTComplex *z) -{ - float* out= (float*)z; - vec_f vc0 = {0.0, 0.0, 0.0, 0.0}; - vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf}; - vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf}; - vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343}; - vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953}; - vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953}; - - vec_f vz0, vz1, vz2, vz3; - vec_f vz4, vz5, vz6, vz7; - vec_f vz8, vz9, vz10, vz11; - vec_f vz12, vz13; - - vz0 = vec_ld(byte_8complex, &(out[0])); - vz1 = vec_ld(byte_10complex, &(out[0])); - vz2 = vec_ld(byte_12complex, &(out[0])); - vz3 = vec_ld(byte_14complex, &(out[0])); - - vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); - vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1)); - vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3)); - - vz0 = vec_add(vz4, vz5); - vz1= vec_sub(vz4, vz5); - vz2 = vec_add(vz6, vz7); - vz3 = vec_sub(vz6, vz7); - - vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); - vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); - vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2)); - - vz0 = vec_add(vz4, vz5); - vz1 = vec_sub(vz4, vz5); - vz2 = vec_add(vz6, vz7); - vz3 = vec_sub(vz6, vz7); - - vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); - vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); - - vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1)); - vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3)); - - vz0 = vec_ld(0, &(out[0])); - vz1 = vec_ld(byte_2complex, &(out[0])); - vz2 = vec_ld(byte_4complex, &(out[0])); - vz3 = vec_ld(byte_6complex, &(out[0])); - vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1)); - vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3)); - vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1)); - vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3)); - - vz2 = vec_add(vz10, vz11); - vz3 = vec_sub(vz10, vz11); - vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); - vz0 = vec_add(vz8, vz9); - vz1 = vec_sub(vz8, vz9); - - vz3 = vec_madd(vz3, vc1, vc0); - vz3 = vec_madd(vz12, vc2, vz3); - vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1)); - vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2)); - vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0)); - vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1)); - - vz0 = vec_add(vz8, vz9); - vz1 = vec_sub(vz8, vz9); - vz2 = vec_add(vz10, vz11); - vz3 = vec_sub(vz10, vz11); - - vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1)); - vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3)); - vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3)); - vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2)); - - vz2 = vec_sub(vz8, vz10); - vz3 = vec_sub(vz9, vz11); - vz0 = vec_add(vz8, vz10); - vz1 = vec_add(vz9, vz11); - - vz8 = vec_madd(vz4, vc3, vc0); - vz9 = vec_madd(vz5, vc3, vc0); - vz10 = vec_madd(vz6, vc3, vc0); - vz11 = vec_madd(vz7, vc3, vc0); - - vz8 = vec_madd(vz5, vc4, vz8); - vz9 = vec_madd(vz4, vc5, vz9); - vz10 = vec_madd(vz7, vc5, vz10); - vz11 = vec_madd(vz6, vc4, vz11); - - vz12 = vec_sub(vz10, vz8); - vz10 = vec_add(vz10, vz8); - - vz13 = vec_sub(vz9, vz11); - vz11 = vec_add(vz9, vz11); - - vz4 = vec_sub(vz0, vz10); - vz0 = vec_add(vz0, vz10); - - vz7= vec_sub(vz3, vz12); - vz3= vec_add(vz3, vz12); - - vz5 = vec_sub(vz1, vz11); - vz1 = vec_add(vz1, vz11); - - vz6 = vec_sub(vz2, vz13); - vz2 = vec_add(vz2, vz13); - - vec_st(vz0, 0, &(out[0])); - vec_st(vz1, byte_2complex, &(out[0])); - vec_st(vz2, byte_4complex, &(out[0])); - vec_st(vz3, byte_6complex, &(out[0])); - vec_st(vz4, byte_8complex, &(out[0])); - vec_st(vz5, byte_10complex, &(out[0])); - vec_st(vz6, byte_12complex, &(out[0])); - vec_st(vz7, byte_14complex, &(out[0])); - return; - -} -inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n) -{ - int o1 = n<<1; - int o2 = n<<2; - int o3 = o1+o2; - int i1, i2, i3; - FFTSample* out = (FFTSample*)z; - const FFTSample *wim = wre+o1; - vec_f v0, v1, v2, v3; - vec_f v4, v5, v6, v7; - vec_f v8, v9, v10, v11; - vec_f v12, v13; - - n = n-2; - i1 = o1*sizeof(FFTComplex); - i2 = o2*sizeof(FFTComplex); - i3 = o3*sizeof(FFTComplex); - - v8 = vec_ld(0, &(wre[0])); - v10 = vec_ld(0, &(wim[0])); - v9 = vec_ld(0, &(wim[-4])); - v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); - - v4 = vec_ld(i2, &(out[0])); - v5 = vec_ld(i2+16, &(out[0])); - v6 = vec_ld(i3, &(out[0])); - v7 = vec_ld(i3+16, &(out[0])); - v10 = vec_mul(v4, v8); // r2*wre - v11 = vec_mul(v5, v8); // i2*wre - v12 = vec_mul(v6, v8); // r3*wre - v13 = vec_mul(v7, v8); // i3*wre - - v0 = vec_ld(0, &(out[0])); // r0 - v3 = vec_ld(i1+16, &(out[0])); // i1 - v10 = vec_madd(v5, v9, v10); // r2*wim - v11 = vec_nmsub(v4, v9, v11); // i2*wim - v12 = vec_nmsub(v7, v9, v12); // r3*wim - v13 = vec_madd(v6, v9, v13); // i3*wim - - v1 = vec_ld(16, &(out[0])); // i0 - v2 = vec_ld(i1, &(out[0])); // r1 - v8 = vec_sub(v12, v10); - v12 = vec_add(v12, v10); - v9 = vec_sub(v11, v13); - v13 = vec_add(v11, v13); - v4 = vec_sub(v0, v12); - v0 = vec_add(v0, v12); - v7 = vec_sub(v3, v8); - v3 = vec_add(v3, v8); - - vec_st(v0, 0, &(out[0])); // r0 - vec_st(v3, i1+16, &(out[0])); // i1 - vec_st(v4, i2, &(out[0])); // r2 - vec_st(v7, i3+16, &(out[0]));// i3 - - v5 = vec_sub(v1, v13); - v1 = vec_add(v1, v13); - v6 = vec_sub(v2, v9); - v2 = vec_add(v2, v9); - - vec_st(v1, 16, &(out[0])); // i0 - vec_st(v2, i1, &(out[0])); // r1 - vec_st(v5, i2+16, &(out[0])); // i2 - vec_st(v6, i3, &(out[0])); // r3 - - do { - out += 8; - wre += 4; - wim -= 4; - - v8 = vec_ld(0, &(wre[0])); - v10 = vec_ld(0, &(wim[0])); - v9 = vec_ld(0, &(wim[-4])); - v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); - - v4 = vec_ld(i2, &(out[0])); // r2 - v5 = vec_ld(i2+16, &(out[0])); // i2 - v6 = vec_ld(i3, &(out[0])); // r3 - v7 = vec_ld(i3+16, &(out[0]));// i3 - v10 = vec_mul(v4, v8); // r2*wre - v11 = vec_mul(v5, v8); // i2*wre - v12 = vec_mul(v6, v8); // r3*wre - v13 = vec_mul(v7, v8); // i3*wre - - v0 = vec_ld(0, &(out[0])); // r0 - v3 = vec_ld(i1+16, &(out[0])); // i1 - v10 = vec_madd(v5, v9, v10); // r2*wim - v11 = vec_nmsub(v4, v9, v11); // i2*wim - v12 = vec_nmsub(v7, v9, v12); // r3*wim - v13 = vec_madd(v6, v9, v13); // i3*wim - - v1 = vec_ld(16, &(out[0])); // i0 - v2 = vec_ld(i1, &(out[0])); // r1 - v8 = vec_sub(v12, v10); - v12 = vec_add(v12, v10); - v9 = vec_sub(v11, v13); - v13 = vec_add(v11, v13); - v4 = vec_sub(v0, v12); - v0 = vec_add(v0, v12); - v7 = vec_sub(v3, v8); - v3 = vec_add(v3, v8); - - vec_st(v0, 0, &(out[0])); // r0 - vec_st(v3, i1+16, &(out[0])); // i1 - vec_st(v4, i2, &(out[0])); // r2 - vec_st(v7, i3+16, &(out[0])); // i3 - - v5 = vec_sub(v1, v13); - v1 = vec_add(v1, v13); - v6 = vec_sub(v2, v9); - v2 = vec_add(v2, v9); - - vec_st(v1, 16, &(out[0])); // i0 - vec_st(v2, i1, &(out[0])); // r1 - vec_st(v5, i2+16, &(out[0])); // i2 - vec_st(v6, i3, &(out[0])); // r3 - } while (n-=2); -} - -#endif - -#endif /* AVCODEC_PPC_FFT_VSX_H */ diff --git a/libavcodec/rdft.c b/libavcodec/rdft.c deleted file mode 100644 index ac6f5d6781..0000000000 --- a/libavcodec/rdft.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * (I)RDFT transforms - * Copyright (c) 2009 Alex Converse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include -#include -#include "libavutil/error.h" -#include "libavutil/mathematics.h" -#include "rdft.h" - -/** - * @file - * (Inverse) Real Discrete Fourier Transforms. - */ - -/** Map one real FFT into two parallel real even and odd FFTs. Then interleave - * the two real FFTs into one complex FFT. Unmangle the results. - * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM - */ -static void rdft_calc_c(RDFTContext *s, FFTSample *data) -{ - int i, i1, i2; - FFTComplex ev, od, odsum; - const int n = 1 << s->nbits; - const float k1 = 0.5; - const float k2 = 0.5 - s->inverse; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - - if (!s->inverse) { - s->fft.fft_permute(&s->fft, (FFTComplex*)data); - s->fft.fft_calc(&s->fft, (FFTComplex*)data); - } - /* i=0 is a special case because of packing, the DC term is real, so we - are going to throw the N/2 term (also real) in with it. */ - ev.re = data[0]; - data[0] = ev.re+data[1]; - data[1] = ev.re-data[1]; - -#define RDFT_UNMANGLE(sign0, sign1) \ - for (i = 1; i < (n>>2); i++) { \ - i1 = 2*i; \ - i2 = n-i1; \ - /* Separate even and odd FFTs */ \ - ev.re = k1*(data[i1 ]+data[i2 ]); \ - od.im = k2*(data[i2 ]-data[i1 ]); \ - ev.im = k1*(data[i1+1]-data[i2+1]); \ - od.re = k2*(data[i1+1]+data[i2+1]); \ - /* Apply twiddle factors to the odd FFT and add to the even FFT */ \ - odsum.re = od.re*tcos[i] sign0 od.im*tsin[i]; \ - odsum.im = od.im*tcos[i] sign1 od.re*tsin[i]; \ - data[i1 ] = ev.re + odsum.re; \ - data[i1+1] = ev.im + odsum.im; \ - data[i2 ] = ev.re - odsum.re; \ - data[i2+1] = odsum.im - ev.im; \ - } - - if (s->negative_sin) { - RDFT_UNMANGLE(+,-) - } else { - RDFT_UNMANGLE(-,+) - } - - data[2*i+1]=s->sign_convention*data[2*i+1]; - if (s->inverse) { - data[0] *= k1; - data[1] *= k1; - s->fft.fft_permute(&s->fft, (FFTComplex*)data); - s->fft.fft_calc(&s->fft, (FFTComplex*)data); - } -} - -av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans) -{ - int n = 1 << nbits; - int ret; - - s->nbits = nbits; - s->inverse = trans == IDFT_C2R || trans == DFT_C2R; - s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1; - s->negative_sin = trans == DFT_C2R || trans == DFT_R2C; - - if (nbits < 4 || nbits > 16) - return AVERROR(EINVAL); - - if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0) - return ret; - - ff_init_ff_cos_tabs(nbits); - s->tcos = ff_cos_tabs[nbits]; - s->tsin = ff_cos_tabs[nbits] + (n >> 2); - s->rdft_calc = rdft_calc_c; - -#if ARCH_ARM - ff_rdft_init_arm(s); -#endif - - return 0; -} - -av_cold void ff_rdft_end(RDFTContext *s) -{ - ff_fft_end(&s->fft); -} diff --git a/libavcodec/rdft.h b/libavcodec/rdft.h deleted file mode 100644 index ffafca7f24..0000000000 --- a/libavcodec/rdft.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * (I)RDFT transforms - * Copyright (c) 2009 Alex Converse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT) -#define AVCODEC_RDFT_H - -#include "config.h" -#include "fft.h" - -struct RDFTContext { - int nbits; - int inverse; - int sign_convention; - - /* pre/post rotation tables */ - const FFTSample *tcos; - const FFTSample *tsin; - int negative_sin; - FFTContext fft; - void (*rdft_calc)(struct RDFTContext *s, FFTSample *z); -}; - -/** - * Set up a real FFT. - * @param nbits log2 of the length of the input array - * @param trans the type of transform - */ -int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans); -void ff_rdft_end(RDFTContext *s); - -void ff_rdft_init_arm(RDFTContext *s); - - -#endif /* AVCODEC_RDFT_H */ diff --git a/libavcodec/tests/.gitignore b/libavcodec/tests/.gitignore index 8e8878a881..0df4ae10a0 100644 --- a/libavcodec/tests/.gitignore +++ b/libavcodec/tests/.gitignore @@ -1,6 +1,5 @@ /av1_levels /avcodec -/avfft /avpacket /bitstream_be /bitstream_le @@ -8,8 +7,6 @@ /celp_math /codec_desc /dct -/fft -/fft-fixed32 /golomb /h264_levels /h265_levels diff --git a/libavcodec/tests/avfft.c b/libavcodec/tests/avfft.c deleted file mode 100644 index 22aa99abca..0000000000 --- a/libavcodec/tests/avfft.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * This test is similar to fft-fixed.c or fft-fixed32.c - */ - -#define AVFFT 1 -#define FFT_FLOAT 1 -#include "fft.c" diff --git a/libavcodec/tests/fft-fixed32.c b/libavcodec/tests/fft-fixed32.c deleted file mode 100644 index 3c50bf1dc1..0000000000 --- a/libavcodec/tests/fft-fixed32.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define FFT_FLOAT 0 -#define AVFFT 0 -#include "fft.c" diff --git a/libavcodec/tests/fft.c b/libavcodec/tests/fft.c deleted file mode 100644 index 0f03c9232d..0000000000 --- a/libavcodec/tests/fft.c +++ /dev/null @@ -1,683 +0,0 @@ -/* - * (c) 2002 Fabrice Bellard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/internal.h" - -FF_DISABLE_DEPRECATION_WARNINGS - -/** - * @file - * FFT and MDCT tests. - */ - -#include "config.h" - -#ifndef AVFFT -#define AVFFT 0 -#endif - -#include -#if HAVE_UNISTD_H -#include -#endif -#include -#include -#include - -#include "libavutil/cpu.h" -#include "libavutil/error.h" -#include "libavutil/lfg.h" -#include "libavutil/log.h" -#include "libavutil/mathematics.h" -#include "libavutil/time.h" - -#if AVFFT -#include "libavcodec/avfft.h" -#else -#include "libavcodec/fft.h" -#endif - -#if FFT_FLOAT -#include "libavcodec/dct.h" -#include "libavcodec/rdft.h" -#endif - -/* reference fft */ - -#define MUL16(a, b) ((a) * (b)) - -#define CMAC(pre, pim, are, aim, bre, bim) \ - { \ - pre += (MUL16(are, bre) - MUL16(aim, bim)); \ - pim += (MUL16(are, bim) + MUL16(bre, aim)); \ - } - -#if FFT_FLOAT || AVFFT -#define RANGE 1.0 -#define REF_SCALE(x, bits) (x) -#define FMT "%10.6f" -#else -#define RANGE 8388608 -#define REF_SCALE(x, bits) (x) -#define FMT "%6d" -#endif - -static struct { - float re, im; -} *exptab; - -static int fft_ref_init(int nbits, int inverse) -{ - int i, n = 1 << nbits; - - exptab = av_malloc_array((n / 2), sizeof(*exptab)); - if (!exptab) - return AVERROR(ENOMEM); - - for (i = 0; i < (n / 2); i++) { - double alpha = 2 * M_PI * (float) i / (float) n; - double c1 = cos(alpha), s1 = sin(alpha); - if (!inverse) - s1 = -s1; - exptab[i].re = c1; - exptab[i].im = s1; - } - return 0; -} - -static void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits) -{ - int i, j; - int n = 1 << nbits; - int n2 = n >> 1; - - for (i = 0; i < n; i++) { - double tmp_re = 0, tmp_im = 0; - FFTComplex *q = tab; - for (j = 0; j < n; j++) { - double s, c; - int k = (i * j) & (n - 1); - if (k >= n2) { - c = -exptab[k - n2].re; - s = -exptab[k - n2].im; - } else { - c = exptab[k].re; - s = exptab[k].im; - } - CMAC(tmp_re, tmp_im, c, s, q->re, q->im); - q++; - } - tabr[i].re = REF_SCALE(tmp_re, nbits); - tabr[i].im = REF_SCALE(tmp_im, nbits); - } -} - -#if CONFIG_MDCT -static void imdct_ref(FFTSample *out, FFTSample *in, int nbits) -{ - int i, k, n = 1 << nbits; - - for (i = 0; i < n; i++) { - double sum = 0; - for (k = 0; k < n / 2; k++) { - int a = (2 * i + 1 + (n / 2)) * (2 * k + 1); - double f = cos(M_PI * a / (double) (2 * n)); - sum += f * in[k]; - } - out[i] = REF_SCALE(-sum, nbits - 2); - } -} - -/* NOTE: no normalisation by 1 / N is done */ -static void mdct_ref(FFTSample *output, FFTSample *input, int nbits) -{ - int i, k, n = 1 << nbits; - - /* do it by hand */ - for (k = 0; k < n / 2; k++) { - double s = 0; - for (i = 0; i < n; i++) { - double a = (2 * M_PI * (2 * i + 1 + n / 2) * (2 * k + 1) / (4 * n)); - s += input[i] * cos(a); - } - output[k] = REF_SCALE(s, nbits - 1); - } -} -#endif /* CONFIG_MDCT */ - -#if FFT_FLOAT -#if CONFIG_DCT -static void idct_ref(FFTSample *output, FFTSample *input, int nbits) -{ - int i, k, n = 1 << nbits; - - /* do it by hand */ - for (i = 0; i < n; i++) { - double s = 0.5 * input[0]; - for (k = 1; k < n; k++) { - double a = M_PI * k * (i + 0.5) / n; - s += input[k] * cos(a); - } - output[i] = 2 * s / n; - } -} - -static void dct_ref(FFTSample *output, FFTSample *input, int nbits) -{ - int i, k, n = 1 << nbits; - - /* do it by hand */ - for (k = 0; k < n; k++) { - double s = 0; - for (i = 0; i < n; i++) { - double a = M_PI * k * (i + 0.5) / n; - s += input[i] * cos(a); - } - output[k] = s; - } -} -#endif /* CONFIG_DCT */ -#endif /* FFT_FLOAT */ - -static FFTSample frandom(AVLFG *prng) -{ - return (int16_t) av_lfg_get(prng) / 32768.0 * RANGE; -} - -static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale) -{ - int i, err = 0; - double error = 0, max = 0; - - for (i = 0; i < n; i++) { - double e = fabs(tab1[i] - (tab2[i] / scale)) / RANGE; - if (e >= 1e-3) { - av_log(NULL, AV_LOG_ERROR, "ERROR %5d: "FMT" "FMT"\n", - i, tab1[i], tab2[i]); - err = 1; - } - error += e * e; - if (e > max) - max = e; - } - av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n)); - return err; -} - -static inline void fft_init(FFTContext **s, int nbits, int inverse) -{ -#if AVFFT - *s = av_fft_init(nbits, inverse); -#else - ff_fft_init(*s, nbits, inverse); -#endif -} - -#if CONFIG_MDCT -static inline void mdct_init(FFTContext **s, int nbits, int inverse, double scale) -{ -#if AVFFT - *s = av_mdct_init(nbits, inverse, scale); -#else - ff_mdct_init(*s, nbits, inverse, scale); -#endif -} - -static inline void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) -{ -#if AVFFT - av_mdct_calc(s, output, input); -#else - s->mdct_calc(s, output, input); -#endif -} - -static inline void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input) -{ -#if AVFFT - av_imdct_calc(s, output, input); -#else - s->imdct_calc(s, output, input); -#endif -} -#endif - -static inline void fft_permute(FFTContext *s, FFTComplex *z) -{ -#if AVFFT - av_fft_permute(s, z); -#else - s->fft_permute(s, z); -#endif -} - -static inline void fft_calc(FFTContext *s, FFTComplex *z) -{ -#if AVFFT - av_fft_calc(s, z); -#else - s->fft_calc(s, z); -#endif -} - -static inline void mdct_end(FFTContext *s) -{ -#if AVFFT - av_mdct_end(s); -#else - ff_mdct_end(s); -#endif -} - -static inline void fft_end(FFTContext *s) -{ -#if AVFFT - av_fft_end(s); -#else - ff_fft_end(s); -#endif -} - -#if FFT_FLOAT -static inline void rdft_init(RDFTContext **r, int nbits, enum RDFTransformType trans) -{ -#if AVFFT - *r = av_rdft_init(nbits, trans); -#else - ff_rdft_init(*r, nbits, trans); -#endif -} - -static inline void dct_init(DCTContext **d, int nbits, enum DCTTransformType trans) -{ -#if AVFFT - *d = av_dct_init(nbits, trans); -#else - ff_dct_init(*d, nbits, trans); -#endif -} - -static inline void rdft_calc(RDFTContext *r, FFTSample *tab) -{ -#if AVFFT - av_rdft_calc(r, tab); -#else - r->rdft_calc(r, tab); -#endif -} - -static inline void dct_calc(DCTContext *d, FFTSample *data) -{ -#if AVFFT - av_dct_calc(d, data); -#else - d->dct_calc(d, data); -#endif -} - -static inline void rdft_end(RDFTContext *r) -{ -#if AVFFT - av_rdft_end(r); -#else - ff_rdft_end(r); -#endif -} - -static inline void dct_end(DCTContext *d) -{ -#if AVFFT - av_dct_end(d); -#else - ff_dct_end(d); -#endif -} -#endif /* FFT_FLOAT */ - -static void help(void) -{ - av_log(NULL, AV_LOG_INFO, - "usage: fft-test [-h] [-s] [-i] [-n b]\n" - "-h print this help\n" - "-s speed test\n" - "-m (I)MDCT test\n" - "-d (I)DCT test\n" - "-r (I)RDFT test\n" - "-i inverse transform test\n" - "-n b set the transform size to 2^b\n" - "-f x set scale factor for output data of (I)MDCT to x\n"); -} - -enum tf_transform { - TRANSFORM_FFT, - TRANSFORM_MDCT, - TRANSFORM_RDFT, - TRANSFORM_DCT, -}; - -#if !HAVE_GETOPT -#include "compat/getopt.c" -#endif - -int main(int argc, char **argv) -{ - FFTComplex *tab, *tab1, *tab_ref; - FFTSample *tab2; - enum tf_transform transform = TRANSFORM_FFT; - FFTContext *m, *s; -#if FFT_FLOAT - RDFTContext *r; - DCTContext *d; -#endif /* FFT_FLOAT */ - int it, i, err = 1; - int do_speed = 0, do_inverse = 0; - int fft_nbits = 9, fft_size; - double scale = 1.0; - AVLFG prng; - -#if !AVFFT - s = av_mallocz(sizeof(*s)); - m = av_mallocz(sizeof(*m)); -#endif - -#if !AVFFT && FFT_FLOAT - r = av_mallocz(sizeof(*r)); - d = av_mallocz(sizeof(*d)); -#endif - - av_lfg_init(&prng, 1); - - for (;;) { - int c = getopt(argc, argv, "hsimrdn:f:c:"); - if (c == -1) - break; - switch (c) { - case 'h': - help(); - return 1; - case 's': - do_speed = 1; - break; - case 'i': - do_inverse = 1; - break; - case 'm': - transform = TRANSFORM_MDCT; - break; - case 'r': - transform = TRANSFORM_RDFT; - break; - case 'd': - transform = TRANSFORM_DCT; - break; - case 'n': - fft_nbits = atoi(optarg); - break; - case 'f': - scale = atof(optarg); - break; - case 'c': - { - unsigned cpuflags = av_get_cpu_flags(); - - if (av_parse_cpu_caps(&cpuflags, optarg) < 0) - return 1; - - av_force_cpu_flags(cpuflags); - break; - } - } - } - - fft_size = 1 << fft_nbits; - tab = av_malloc_array(fft_size, sizeof(FFTComplex)); - tab1 = av_malloc_array(fft_size, sizeof(FFTComplex)); - tab_ref = av_malloc_array(fft_size, sizeof(FFTComplex)); - tab2 = av_malloc_array(fft_size, sizeof(FFTSample)); - - if (!(tab && tab1 && tab_ref && tab2)) - goto cleanup; - - switch (transform) { -#if CONFIG_MDCT - case TRANSFORM_MDCT: - av_log(NULL, AV_LOG_INFO, "Scale factor is set to %f\n", scale); - if (do_inverse) - av_log(NULL, AV_LOG_INFO, "IMDCT"); - else - av_log(NULL, AV_LOG_INFO, "MDCT"); - mdct_init(&m, fft_nbits, do_inverse, scale); - break; -#endif /* CONFIG_MDCT */ - case TRANSFORM_FFT: - if (do_inverse) - av_log(NULL, AV_LOG_INFO, "IFFT"); - else - av_log(NULL, AV_LOG_INFO, "FFT"); - fft_init(&s, fft_nbits, do_inverse); - if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0) - goto cleanup; - break; -#if FFT_FLOAT -# if CONFIG_RDFT - case TRANSFORM_RDFT: - if (do_inverse) - av_log(NULL, AV_LOG_INFO, "IDFT_C2R"); - else - av_log(NULL, AV_LOG_INFO, "DFT_R2C"); - rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C); - if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0) - goto cleanup; - break; -# endif /* CONFIG_RDFT */ -# if CONFIG_DCT - case TRANSFORM_DCT: - if (do_inverse) - av_log(NULL, AV_LOG_INFO, "DCT_III"); - else - av_log(NULL, AV_LOG_INFO, "DCT_II"); - dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II); - break; -# endif /* CONFIG_DCT */ -#endif /* FFT_FLOAT */ - default: - av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n"); - goto cleanup; - } - av_log(NULL, AV_LOG_INFO, " %d test\n", fft_size); - - /* generate random data */ - - for (i = 0; i < fft_size; i++) { - tab1[i].re = frandom(&prng); - tab1[i].im = frandom(&prng); - } - - /* checking result */ - av_log(NULL, AV_LOG_INFO, "Checking...\n"); - - switch (transform) { -#if CONFIG_MDCT - case TRANSFORM_MDCT: - if (do_inverse) { - imdct_ref(&tab_ref->re, &tab1->re, fft_nbits); - imdct_calc(m, tab2, &tab1->re); - err = check_diff(&tab_ref->re, tab2, fft_size, scale); - } else { - mdct_ref(&tab_ref->re, &tab1->re, fft_nbits); - mdct_calc(m, tab2, &tab1->re); - err = check_diff(&tab_ref->re, tab2, fft_size / 2, scale); - } - break; -#endif /* CONFIG_MDCT */ - case TRANSFORM_FFT: - memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); - fft_permute(s, tab); - fft_calc(s, tab); - - fft_ref(tab_ref, tab1, fft_nbits); - err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 1.0); - break; -#if FFT_FLOAT -#if CONFIG_RDFT - case TRANSFORM_RDFT: - { - int fft_size_2 = fft_size >> 1; - if (do_inverse) { - tab1[0].im = 0; - tab1[fft_size_2].im = 0; - for (i = 1; i < fft_size_2; i++) { - tab1[fft_size_2 + i].re = tab1[fft_size_2 - i].re; - tab1[fft_size_2 + i].im = -tab1[fft_size_2 - i].im; - } - - memcpy(tab2, tab1, fft_size * sizeof(FFTSample)); - tab2[1] = tab1[fft_size_2].re; - - rdft_calc(r, tab2); - fft_ref(tab_ref, tab1, fft_nbits); - for (i = 0; i < fft_size; i++) { - tab[i].re = tab2[i]; - tab[i].im = 0; - } - err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 0.5); - } else { - for (i = 0; i < fft_size; i++) { - tab2[i] = tab1[i].re; - tab1[i].im = 0; - } - rdft_calc(r, tab2); - fft_ref(tab_ref, tab1, fft_nbits); - tab_ref[0].im = tab_ref[fft_size_2].re; - err = check_diff(&tab_ref->re, tab2, fft_size, 1.0); - } - break; - } -#endif /* CONFIG_RDFT */ -#if CONFIG_DCT - case TRANSFORM_DCT: - memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); - dct_calc(d, &tab->re); - if (do_inverse) - idct_ref(&tab_ref->re, &tab1->re, fft_nbits); - else - dct_ref(&tab_ref->re, &tab1->re, fft_nbits); - err = check_diff(&tab_ref->re, &tab->re, fft_size, 1.0); - break; -#endif /* CONFIG_DCT */ -#endif /* FFT_FLOAT */ - } - - /* do a speed test */ - - if (do_speed) { - int64_t time_start, duration; - int nb_its; - - av_log(NULL, AV_LOG_INFO, "Speed test...\n"); - /* we measure during about 1 seconds */ - nb_its = 1; - for (;;) { - time_start = av_gettime_relative(); - for (it = 0; it < nb_its; it++) { - switch (transform) { -#if CONFIG_MDCT - case TRANSFORM_MDCT: - if (do_inverse) - imdct_calc(m, &tab->re, &tab1->re); - else - mdct_calc(m, &tab->re, &tab1->re); - break; -#endif - case TRANSFORM_FFT: - memcpy(tab, tab1, fft_size * sizeof(FFTComplex)); - fft_calc(s, tab); - break; -#if FFT_FLOAT - case TRANSFORM_RDFT: - memcpy(tab2, tab1, fft_size * sizeof(FFTSample)); - rdft_calc(r, tab2); - break; - case TRANSFORM_DCT: - memcpy(tab2, tab1, fft_size * sizeof(FFTSample)); - dct_calc(d, tab2); - break; -#endif /* FFT_FLOAT */ - } - } - duration = av_gettime_relative() - time_start; - if (duration >= 1000000) - break; - nb_its *= 2; - } - av_log(NULL, AV_LOG_INFO, - "time: %0.1f us/transform [total time=%0.2f s its=%d]\n", - (double) duration / nb_its, - (double) duration / 1000000.0, - nb_its); - } - - switch (transform) { -#if CONFIG_MDCT - case TRANSFORM_MDCT: - mdct_end(m); - break; -#endif /* CONFIG_MDCT */ - case TRANSFORM_FFT: - fft_end(s); - break; -#if FFT_FLOAT -# if CONFIG_RDFT - case TRANSFORM_RDFT: - rdft_end(r); - break; -# endif /* CONFIG_RDFT */ -# if CONFIG_DCT - case TRANSFORM_DCT: - dct_end(d); - break; -# endif /* CONFIG_DCT */ -#endif /* FFT_FLOAT */ - } - -cleanup: - av_free(tab); - av_free(tab1); - av_free(tab2); - av_free(tab_ref); - av_free(exptab); - -#if !AVFFT - av_free(s); - av_free(m); -#endif - -#if !AVFFT && FFT_FLOAT - av_free(r); - av_free(d); -#endif - - if (err) - printf("Error: %d.\n", err); - - return !!err; -} - -FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2ceb88968f..d5fb30645a 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -5,11 +5,9 @@ OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o -OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ x86/dirac_dwt_init.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o -OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o @@ -98,8 +96,6 @@ X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \ X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o -X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o -X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c deleted file mode 100644 index d0e4b34dd3..0000000000 --- a/libavcodec/x86/dct_init.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/dct.h" - -void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); -void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); - -av_cold void ff_dct_init_x86(DCTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (EXTERNAL_SSE2(cpu_flags)) - s->dct32 = ff_dct32_float_sse2; - if (EXTERNAL_AVX_FAST(cpu_flags)) - s->dct32 = ff_dct32_float_avx; -} diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm deleted file mode 100644 index 34c3fc9a0f..0000000000 --- a/libavcodec/x86/fft.asm +++ /dev/null @@ -1,838 +0,0 @@ -;****************************************************************************** -;* FFT transform with SSE/AVX optimizations -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2011 Vitor Sessak -;* -;* This algorithm (though not any of the implementation details) is -;* based on libdjbfft by D. J. Bernstein. -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -; These functions are not individually interchangeable with the C versions. -; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results -; in blocks as conventient to the vector size. -; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) - -%include "libavutil/x86/x86util.asm" - -%if ARCH_X86_64 -%define pointer resq -%else -%define pointer resd -%endif - -struc FFTContext - .nbits: resd 1 - .reverse: resd 1 - .revtab: pointer 1 - .tmpbuf: pointer 1 - .mdctsize: resd 1 - .mdctbits: resd 1 - .tcos: pointer 1 - .tsin: pointer 1 - .fftperm: pointer 1 - .fftcalc: pointer 1 - .imdctcalc:pointer 1 - .imdcthalf:pointer 1 -endstruc - -SECTION_RODATA 32 - -%define M_SQRT1_2 0.70710678118654752440 -%define M_COS_PI_1_8 0.923879532511287 -%define M_COS_PI_3_8 0.38268343236509 - -ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 -ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 - -ps_root2: times 8 dd M_SQRT1_2 -ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 - -perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 -perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 -ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 -ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 -ps_m1p1: dd 1<<31, 0 - -cextern ps_neg - -%assign i 16 -%rep 14 -cextern cos_ %+ i -%assign i i<<1 -%endrep - -%if ARCH_X86_64 - %define pointer dq -%else - %define pointer dd -%endif - -%macro IF0 1+ -%endmacro -%macro IF1 1+ - %1 -%endmacro - -SECTION .text - -; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} -; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} -; %3, %4, %5 tmp -; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} -; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} -%macro T8_AVX 5 - vsubps %5, %1, %2 ; v = %1 - %2 - vaddps %3, %1, %2 ; w = %1 + %2 - vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 - vpermilps %2, %2, [perm1] - vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} - vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} - vsubps %4, %5, %1 ; s = r - q - vaddps %1, %5, %1 ; u = r + q - vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} - vshufps %5, %4, %1, 0xbb - vshufps %3, %4, %1, 0xee - vperm2f128 %3, %3, %5, 0x13 - vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} - vshufps %2, %1, %4, 0xdd - vshufps %1, %1, %4, 0x88 - vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} - vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} - vsubps %5, %1, %3 - vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} - vsubps %2, %4, %1 ; %2 = v - w - vaddps %1, %4, %1 ; %1 = v + w -%endmacro - -; In SSE mode do one fft4 transforms -; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} -; -; In AVX mode do two fft4 transforms -; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} -%macro T4_SSE 3 - subps %3, %1, %2 ; {t3,t4,-t8,t7} - addps %1, %1, %2 ; {t1,t2,t6,t5} - xorps %3, %3, [ps_p1p1m1p1] - shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} - shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} - subps %3, %1, %2 ; {r2,i2,r3,i3} - addps %1, %1, %2 ; {r0,i0,r1,i1} - shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} - shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} -%endmacro - -; In SSE mode do one FFT8 -; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} -; -; In AVX mode do two FFT8 -; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} -; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} -; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} -; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} -%macro T8_SSE 6 - addps %6, %3, %4 ; {t1,t2,t3,t4} - subps %3, %3, %4 ; {r5,i5,r7,i7} - shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} - mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} - mulps %4, %4, [ps_root2] - addps %3, %3, %4 ; {t8,t7,ta,t9} - shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} - shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} - subps %3, %6, %4 ; {t6,t5,tc,tb} - addps %6, %6, %4 ; {t1,t2,t9,ta} - shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} - shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} - subps %3, %1, %6 ; {r4,r5,r6,r7} - addps %1, %1, %6 ; {r0,r1,r2,r3} - subps %4, %2, %5 ; {i4,i5,i6,i7} - addps %2, %2, %5 ; {i0,i1,i2,i3} -%endmacro - -%macro INTERL 5 -%if cpuflag(avx) - vunpckhps %3, %2, %1 - vunpcklps %2, %2, %1 - vextractf128 %4(%5), %2, 0 - vextractf128 %4 %+ H(%5), %3, 0 - vextractf128 %4(%5 + 1), %2, 1 - vextractf128 %4 %+ H(%5 + 1), %3, 1 -%elif cpuflag(sse) - mova %3, %2 - unpcklps %2, %1 - unpckhps %3, %1 - mova %4(%5), %2 - mova %4(%5+1), %3 -%endif -%endmacro - -; scheduled for cpu-bound sizes -%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim -IF%1 mova m4, Z(4) -IF%1 mova m5, Z(5) - mova m0, %2 ; wre - mova m1, %3 ; wim - mulps m2, m4, m0 ; r2*wre -IF%1 mova m6, Z2(6) - mulps m3, m5, m1 ; i2*wim -IF%1 mova m7, Z2(7) - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m1, m1, m6 ; r3*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 - mova Z2(6), m4 - mova Z(2), m3 - mova m2, Z(3) - addps m3, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 - mova Z2(7), m2 - mova Z(3), m1 - subps m4, m7, m3 ; i2 - addps m3, m3, m7 ; i0 - mova Z(5), m4 - mova Z(1), m3 -%endmacro - -; scheduled to avoid store->load aliasing -%macro PASS_BIG 1 ; (!interleave) - mova m4, Z(4) ; r2 - mova m5, Z(5) ; i2 - mova m0, [wq] ; wre - mova m1, [wq+o1q] ; wim - mulps m2, m4, m0 ; r2*wre - mova m6, Z2(6) ; r3 - mulps m3, m5, m1 ; i2*wim - mova m7, Z2(7) ; i3 - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - mulps m1, m1, m6 ; r3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 -IF%1 mova Z2(6), m4 -IF%1 mova Z(2), m3 - mova m2, Z(3) - addps m5, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 -IF%1 mova Z2(7), m2 -IF%1 mova Z(3), m1 - subps m6, m7, m5 ; i2 - addps m5, m5, m7 ; i0 -IF%1 mova Z(5), m6 -IF%1 mova Z(1), m5 -%if %1==0 - INTERL m1, m3, m7, Z, 2 - INTERL m2, m4, m0, Z2, 6 - - mova m1, Z(0) - mova m2, Z(4) - - INTERL m5, m1, m3, Z, 0 - INTERL m6, m2, m7, Z, 4 -%endif -%endmacro - -%define Z(x) [r0+mmsize*x] -%define Z2(x) [r0+mmsize*x] -%define ZH(x) [r0+mmsize*x+mmsize/2] - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -align 16 -fft8_avx: - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m2, m3, m4 - mova Z(0), m0 - mova Z(1), m1 - ret - - -align 16 -fft16_avx: - mova m2, Z(2) - mova m3, Z(3) - T4_SSE m2, m3, m7 - - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m4, m5, m7 - - mova m4, [ps_cos16_1] - mova m5, [ps_cos16_2] - vmulps m6, m2, m4 - vmulps m7, m3, m5 - vaddps m7, m7, m6 - vmulps m2, m2, m5 - vmulps m3, m3, m4 - vsubps m3, m3, m2 - vblendps m2, m7, m3, 0xf0 - vperm2f128 m3, m7, m3, 0x21 - vaddps m4, m2, m3 - vsubps m2, m3, m2 - vperm2f128 m2, m2, m2, 0x01 - vsubps m3, m1, m2 - vaddps m1, m1, m2 - vsubps m5, m0, m4 - vaddps m0, m0, m4 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - vextractf128 Z(2), m5, 0 - vextractf128 ZH(2), m3, 0 - vextractf128 Z(3), m5, 1 - vextractf128 ZH(3), m3, 1 - ret - -align 16 -fft32_avx: - call fft16_avx - - mova m0, Z(4) - mova m1, Z(5) - - T4_SSE m0, m1, m4 - - mova m2, Z(6) - mova m3, Z(7) - - T8_SSE m0, m1, m2, m3, m4, m6 - ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} - ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} - - vperm2f128 m4, m0, m2, 0x20 - vperm2f128 m5, m1, m3, 0x20 - vperm2f128 m6, m0, m2, 0x31 - vperm2f128 m7, m1, m3, 0x31 - - PASS_SMALL 0, [cos_32], [cos_32+32] - - ret - -fft32_interleave_avx: - call fft32_avx - mov r2d, 32 -.deint_loop: - mova m2, Z(0) - mova m3, Z(1) - vunpcklps m0, m2, m3 - vunpckhps m1, m2, m3 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - add r0, mmsize*2 - sub r2d, mmsize/4 - jg .deint_loop - ret - -%endif - -INIT_XMM sse - -align 16 -fft4_avx: -fft4_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova Z(0), m0 - mova Z(1), m1 - ret - -align 16 -fft8_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - ret - -align 16 -fft16_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova m4, Z(4) - mova m5, Z(5) - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - T4_SSE m4, m5, m6 - mova m6, Z2(6) - mova m7, Z2(7) - T4_SSE m6, m7, m0 - PASS_SMALL 0, [cos_16], [cos_16+16] - ret - - -%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] -%define Z2(x) [zcq + o3q + mmsize*(x&1)] -%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] -%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] - -%macro DECL_PASS 2+ ; name, payload -align 16 -%1: -DEFINE_ARGS zc, w, n, o1, o3 - lea o3q, [nq*3] - lea o1q, [nq*8] - shl o3q, 4 -.loop: - %2 - add zcq, mmsize*2 - add wq, mmsize - sub nd, mmsize/8 - jg .loop - rep ret -%endmacro - -%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs - lea r2, [dispatch_tab%1] - mov r2, [r2 + (%2q-2)*gprsize] -%ifdef PIC - lea r3, [$$] - add r2, r3 -%endif - call r2 -%endmacro ; FFT_DISPATCH - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -DECL_PASS pass_avx, PASS_BIG 1 -DECL_PASS pass_interleave_avx, PASS_BIG 0 - -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - RET - -%endif - -INIT_XMM sse - -DECL_PASS pass_sse, PASS_BIG 1 -DECL_PASS pass_interleave_sse, PASS_BIG 0 - -INIT_XMM sse -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - PUSH r1 - PUSH r3 - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - POP rcx - POP r4 - cmp rcx, 3+(mmsize/16) - jg .end - mov r2, -1 - add rcx, 3 - shl r2, cl - sub r4, r2 -.loop: - movaps xmm0, [r4 + r2] - movaps xmm1, xmm0 - unpcklps xmm0, [r4 + r2 + 16] - unpckhps xmm1, [r4 + r2 + 16] - movaps [r4 + r2], xmm0 - movaps [r4 + r2 + 16], xmm1 - add r2, mmsize*2 - jl .loop -.end: - RET - -cglobal fft_permute, 2,7,1 - mov r4, [r0 + FFTContext.revtab] - mov r5, [r0 + FFTContext.tmpbuf] - mov ecx, [r0 + FFTContext.nbits] - mov r2, 1 - shl r2, cl - xor r0, r0 -%if ARCH_X86_32 - mov r1, r1m -%endif -.loop: - movaps xmm0, [r1 + 8*r0] - movzx r6, word [r4 + 2*r0] - movzx r3, word [r4 + 2*r0 + 2] - movlps [r5 + 8*r6], xmm0 - movhps [r5 + 8*r3], xmm0 - add r0, 2 - cmp r0, r2 - jl .loop - shl r2, 3 - add r1, r2 - add r5, r2 - neg r2 -; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B -.loopcopy: - movaps xmm0, [r5 + r2] - movaps xmm1, [r5 + r2 + 16] - movaps [r1 + r2], xmm0 - movaps [r1 + r2 + 16], xmm1 - add r2, 32 - jl .loopcopy - RET - -INIT_XMM sse -cglobal imdct_calc, 3,5,3 - mov r3d, [r0 + FFTContext.mdctsize] - mov r4, [r0 + FFTContext.imdcthalf] - add r1, r3 - PUSH r3 - PUSH r1 -%if ARCH_X86_32 - push r2 - push r1 - push r0 -%else - sub rsp, 8+32*WIN64 ; allocate win64 shadow space -%endif - call r4 -%if ARCH_X86_32 - add esp, 12 -%else - add rsp, 8+32*WIN64 -%endif - POP r1 - POP r3 - lea r0, [r1 + 2*r3] - mov r2, r3 - sub r3, mmsize - neg r2 - mova m2, [ps_neg] -.loop: - mova m0, [r1 + r3] - mova m1, [r0 + r2] - shufps m0, m0, 0x1b - shufps m1, m1, 0x1b - xorps m0, m2 - mova [r0 + r3], m1 - mova [r1 + r2], m0 - sub r3, mmsize - add r2, mmsize - jl .loop - RET - -%ifdef PIC -%define SECTION_REL - $$ -%else -%define SECTION_REL -%endif - -%macro DECL_FFT 1-2 ; nbits, suffix -%ifidn %0, 1 -%xdefine fullsuffix SUFFIX -%else -%xdefine fullsuffix %2 %+ SUFFIX -%endif -%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL -%if %1>=5 -%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL -%endif -%if %1>=6 -%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL -%endif - -%assign n 1<<%1 -%rep 18-%1 -%assign n2 n/2 -%assign n4 n/4 -%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL - -align 16 -fft %+ n %+ fullsuffix: - call fft %+ n2 %+ SUFFIX - add r0, n*4 - (n&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - add r0, n*2 - (n2&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - sub r0, n*6 + (n2&(-2<<%1)) - lea r1, [cos_ %+ n] - mov r2d, n4/2 - jmp pass %+ fullsuffix - -%assign n n*2 -%endrep -%undef n - -align 8 -dispatch_tab %+ fullsuffix: pointer list_of_fft -%endmacro ; DECL_FFT - -%if HAVE_AVX_EXTERNAL -INIT_YMM avx -DECL_FFT 6 -DECL_FFT 6, _interleave -%endif -INIT_XMM sse -DECL_FFT 5 -DECL_FFT 5, _interleave - -INIT_XMM sse -%undef mulps -%undef addps -%undef subps -%undef unpcklps -%undef unpckhps - -%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 - movaps xmm0, [%3+%2*4] - movaps xmm1, [%3+%1*4-0x10] - movaps xmm2, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm1, xmm2, 0x77 - movlps xmm4, [%4+%2*2] - movlps xmm5, [%5+%2*2+0x0] - movhps xmm4, [%4+%1*2-0x8] - movhps xmm5, [%5+%1*2-0x8] - movaps xmm2, xmm0 - movaps xmm3, xmm1 - mulps xmm0, xmm5 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - mulps xmm3, xmm5 - subps xmm1, xmm0 - addps xmm2, xmm3 - movaps xmm0, xmm1 - unpcklps xmm1, xmm2 - unpckhps xmm0, xmm2 -%endmacro - -%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 - mulps m6, %3, [%5+%1] - mulps m7, %2, [%5+%1] - mulps %2, %2, [%6+%1] - mulps %3, %3, [%6+%1] - subps %2, %2, m6 - addps %3, %3, m7 -%endmacro - -%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: -%if cpuflag(avx) - vmovaps ymm1, [%3+%1*2] - vmovaps ymm0, [%3+%1*2+0x20] - vmovaps ymm3, [%3+%2*2] - vmovaps ymm2, [%3+%2*2+0x20] - - CMUL %1, ymm0, ymm1, %3, %4, %5 - CMUL %2, ymm2, ymm3, %3, %4, %5 - vshufps ymm1, ymm1, ymm1, 0x1b - vshufps ymm3, ymm3, ymm3, 0x1b - vperm2f128 ymm1, ymm1, ymm1, 0x01 - vperm2f128 ymm3, ymm3, ymm3, 0x01 - vunpcklps ymm6, ymm2, ymm1 - vunpckhps ymm4, ymm2, ymm1 - vunpcklps ymm7, ymm0, ymm3 - vunpckhps ymm5, ymm0, ymm3 - - vextractf128 [%3+%1*2], ymm7, 0 - vextractf128 [%3+%1*2+0x10], ymm5, 0 - vextractf128 [%3+%1*2+0x20], ymm7, 1 - vextractf128 [%3+%1*2+0x30], ymm5, 1 - - vextractf128 [%3+%2*2], ymm6, 0 - vextractf128 [%3+%2*2+0x10], ymm4, 0 - vextractf128 [%3+%2*2+0x20], ymm6, 1 - vextractf128 [%3+%2*2+0x30], ymm4, 1 - sub %2, 0x20 - add %1, 0x20 - jl .post -%else - movaps xmm1, [%3+%1*2] - movaps xmm0, [%3+%1*2+0x10] - CMUL %1, xmm0, xmm1, %3, %4, %5 - movaps xmm5, [%3+%2*2] - movaps xmm4, [%3+%2*2+0x10] - CMUL %2, xmm4, xmm5, %3, %4, %5 - shufps xmm1, xmm1, 0x1b - shufps xmm5, xmm5, 0x1b - movaps xmm6, xmm4 - unpckhps xmm4, xmm1 - unpcklps xmm6, xmm1 - movaps xmm2, xmm0 - unpcklps xmm0, xmm5 - unpckhps xmm2, xmm5 - movaps [%3+%2*2], xmm6 - movaps [%3+%2*2+0x10], xmm4 - movaps [%3+%1*2], xmm0 - movaps [%3+%1*2+0x10], xmm2 - sub %2, 0x10 - add %1, 0x10 - jl .post -%endif -%endmacro - -%macro DECL_IMDCT 0 -cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input -%if ARCH_X86_64 -%define rrevtab r7 -%define rtcos r8 -%define rtsin r9 -%else -%define rrevtab r6 -%define rtsin r6 -%define rtcos r5 -%endif - mov r3d, [r0+FFTContext.mdctsize] - add r2, r3 - shr r3, 1 - mov rtcos, [r0+FFTContext.tcos] - mov rtsin, [r0+FFTContext.tsin] - add rtcos, r3 - add rtsin, r3 -%if ARCH_X86_64 == 0 - push rtcos - push rtsin -%endif - shr r3, 1 - mov rrevtab, [r0+FFTContext.revtab] - add rrevtab, r3 -%if ARCH_X86_64 == 0 - push rrevtab -%endif - - sub r3, 4 -%if ARCH_X86_64 - xor r4, r4 - sub r4, r3 -%endif -.pre: -%if ARCH_X86_64 == 0 -;unspill - xor r4, r4 - sub r4, r3 - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - - PREROTATER r4, r3, r2, rtcos, rtsin -%if ARCH_X86_64 - movzx r5, word [rrevtab+r4-4] - movzx r6, word [rrevtab+r4-2] - movzx r10, word [rrevtab+r3] - movzx r11, word [rrevtab+r3+2] - movlps [r1+r5 *8], xmm0 - movhps [r1+r6 *8], xmm0 - movlps [r1+r10*8], xmm1 - movhps [r1+r11*8], xmm1 - add r4, 4 -%else - mov r6, [esp] - movzx r5, word [r6+r4-4] - movzx r4, word [r6+r4-2] - movlps [r1+r5*8], xmm0 - movhps [r1+r4*8], xmm0 - movzx r5, word [r6+r3] - movzx r4, word [r6+r3+2] - movlps [r1+r5*8], xmm1 - movhps [r1+r4*8], xmm1 -%endif - sub r3, 4 - jns .pre - - mov r5, r0 - mov r6, r1 - mov r0, r1 - mov r1d, [r5+FFTContext.nbits] - - FFT_DISPATCH SUFFIX, r1 - - mov r0d, [r5+FFTContext.mdctsize] - add r6, r0 - shr r0, 1 -%if ARCH_X86_64 == 0 -%define rtcos r2 -%define rtsin r3 - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - neg r0 - mov r1, -mmsize - sub r1, r0 - POSROTATESHUF r0, r1, r6, rtcos, rtsin -%if ARCH_X86_64 == 0 - add esp, 12 -%endif - RET -%endmacro - -DECL_IMDCT - -INIT_YMM avx - -%if HAVE_AVX_EXTERNAL -DECL_IMDCT -%endif diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h deleted file mode 100644 index 37418ec1f4..0000000000 --- a/libavcodec/x86/fft.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_FFT_H -#define AVCODEC_X86_FFT_H - -#include "libavcodec/fft.h" - -void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); -void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); -void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); - -void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); - -#endif /* AVCODEC_X86_FFT_H */ diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c deleted file mode 100644 index df79d57dc7..0000000000 --- a/libavcodec/x86/fft_init.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/cpu.h" - -#include "fft.h" - -av_cold void ff_fft_init_x86(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (s->nbits > 16) - return; - - if (EXTERNAL_SSE(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_sse; - s->imdct_half = ff_imdct_half_sse; - s->fft_permute = ff_fft_permute_sse; - s->fft_calc = ff_fft_calc_sse; - s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; - } - - if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { - s->imdct_half = ff_imdct_half_avx; - s->fft_calc = ff_fft_calc_avx; - s->fft_permutation = FF_FFT_PERM_AVX; - } -} diff --git a/tests/Makefile b/tests/Makefile index 7b80762e81..f03cf20d8e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -181,7 +181,6 @@ include $(SRC_PATH)/tests/fate/enc_external.mak # Must be included after lavf-video.mak include $(SRC_PATH)/tests/fate/ffmpeg.mak include $(SRC_PATH)/tests/fate/ffprobe.mak -include $(SRC_PATH)/tests/fate/fft.mak include $(SRC_PATH)/tests/fate/fifo-muxer.mak include $(SRC_PATH)/tests/fate/filter-audio.mak # Must be included after vcodec.mak diff --git a/tests/fate/fft.mak b/tests/fate/fft.mak deleted file mode 100644 index 76701dcce6..0000000000 --- a/tests/fate/fft.mak +++ /dev/null @@ -1,83 +0,0 @@ -define DEF_FFT -FATE_DCT-$(CONFIG_DCT) += fate-dct1d-$(1) fate-idct1d-$(1) -FATE_FFT-$(CONFIG_FFT) += fate-fft-$(1) fate-ifft-$(1) -FATE_MDCT-$(CONFIG_MDCT) += fate-mdct-$(1) fate-imdct-$(1) -FATE_RDFT-$(CONFIG_RDFT) += fate-rdft-$(1) fate-irdft-$(1) - -fate-fft-$(N): ARGS = -n$(1) -fate-ifft-$(N): ARGS = -n$(1) -i -fate-mdct-$(N): ARGS = -n$(1) -m -fate-imdct-$(N): ARGS = -n$(1) -m -i -fate-rdft-$(N): ARGS = -n$(1) -r -fate-irdft-$(N): ARGS = -n$(1) -r -i -fate-dct1d-$(N): ARGS = -n$(1) -d -fate-idct1d-$(N): ARGS = -n$(1) -d -i -endef - -$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT,$(N)))) - -fate-dct-float: $(FATE_DCT-yes) -fate-fft-float: $(FATE_FFT-yes) -fate-mdct-float: $(FATE_MDCT-yes) -fate-rdft-float: $(FATE_RDFT-yes) - -FATE_FFT_ALL = $(FATE_DCT-yes) $(FATE_FFT-yes) $(FATE_MDCT-yes) $(FATE_RDFT-yes) - -$(FATE_FFT_ALL): libavcodec/tests/fft$(EXESUF) -$(FATE_FFT_ALL): CMD = run libavcodec/tests/fft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS) - -$(FATE_FFT_ALL): CMP = null - -define DEF_FFT_FIXED32 -FATE_FFT_FIXED32 += fate-fft-fixed32-$(1) fate-ifft-fixed32-$(1) \ - fate-mdct-fixed32-$(1) fate-imdct-fixed32-$(1) - -fate-fft-fixed32-$(1): ARGS = -n$(1) -fate-ifft-fixed32-$(1): ARGS = -n$(1) -i -#fate-mdct-fixed32-$(1): ARGS = -n$(1) -m -fate-imdct-fixed32-$(1): ARGS = -n$(1) -m -i -endef - -$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT_FIXED32,$(N)))) - -fate-fft-fixed32: $(FATE_FFT_FIXED32) -$(FATE_FFT_FIXED32): libavcodec/tests/fft-fixed32$(EXESUF) -$(FATE_FFT_FIXED32): CMD = run libavcodec/tests/fft-fixed32$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS) -$(FATE_FFT_FIXED32): CMP = null - -define DEF_AV_FFT -FATE_AV_DCT-$(CONFIG_DCT) += fate-av-dct1d-$(1) fate-av-idct1d-$(1) -FATE_AV_FFT-$(CONFIG_FFT) += fate-av-fft-$(1) fate-av-ifft-$(1) -FATE_AV_MDCT-$(CONFIG_MDCT) += fate-av-mdct-$(1) fate-av-imdct-$(1) -FATE_AV_RDFT-$(CONFIG_RDFT) += fate-av-rdft-$(1) fate-av-irdft-$(1) - -fate-av-fft-$(N): ARGS = -n$(1) -fate-av-ifft-$(N): ARGS = -n$(1) -i -fate-av-mdct-$(N): ARGS = -n$(1) -m -fate-av-imdct-$(N): ARGS = -n$(1) -m -i -fate-av-rdft-$(N): ARGS = -n$(1) -r -fate-av-irdft-$(N): ARGS = -n$(1) -r -i -fate-av-dct1d-$(N): ARGS = -n$(1) -d -fate-av-idct1d-$(N): ARGS = -n$(1) -d -i -endef - -$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_AV_FFT,$(N)))) - -fate-av-dct-float: $(FATE_AV_DCT-yes) -fate-av-fft-float: $(FATE_AV_FFT-yes) -fate-av-mdct-float: $(FATE_AV_MDCT-yes) -fate-av-rdft-float: $(FATE_AV_RDFT-yes) - -FATE_AV_FFT_ALL = $(FATE_AV_DCT-yes) $(FATE_AV_FFT-yes) $(FATE_AV_MDCT-yes) $(FATE_AV_RDFT-yes) - -$(FATE_AV_FFT_ALL): libavcodec/tests/avfft$(EXESUF) -$(FATE_AV_FFT_ALL): CMD = run libavcodec/tests/avfft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS) -$(FATE_AV_FFT_ALL): CMP = null - -fate-dct: fate-dct-float -fate-fft: fate-fft-float fate-fft-fixed32 -fate-mdct: fate-mdct-float -fate-rdft: fate-rdft-float - -FATE-$(call ALLYES, AVCODEC FFT MDCT) += $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL) -fate-fft-all: $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL)