ffmpeg/libswscale/aarch64/swscale.c
Swinney, Jonathan 75ffca7eef libswscale/aarch64: add another hscale specialization
This specialization handles the case where filtersize is 4 mod 8, e.g.
12, 20, etc. Aarch64 was previously using the c function for this case.
This implementation speeds up that case significantly.

hscale_8_to_15__fs_12_dstW_512_c: 6234.1
hscale_8_to_15__fs_12_dstW_512_neon: 1505.6

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
2022-08-16 12:08:38 +03:00

72 lines
3.0 KiB
C

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt);
#define ALL_SCALE_FUNCS(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(X8, opt); \
SCALE_FUNCS(X4, opt)
ALL_SCALE_FUNCS(neon);
void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \
if (c->srcBpc == 8 && c->dstBpc <= 14) { \
hscalefn = \
ff_hscale8to15_ ## filtersize ## _ ## opt; \
} \
} while (0)
#define ASSIGN_SCALE_FUNC(hscalefn, filtersize, opt) do { \
if (filtersize == 4) \
ASSIGN_SCALE_FUNC2(hscalefn, 4, opt); \
else if (filtersize % 8 == 0) \
ASSIGN_SCALE_FUNC2(hscalefn, X8, opt); \
else if (filtersize % 4 == 0 && filtersize % 8 != 0) \
ASSIGN_SCALE_FUNC2(hscalefn, X4, opt); \
} while (0)
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
ASSIGN_SCALE_FUNC(c->hcScale, c->hChrFilterSize, neon);
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
}
}
}