avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation

I spotted an interesting pattern that I hadn't seen before, which makes the implementation faster.
The bit-shifting table I was using before is no longer needed, so I was able to remove quite a few lines.
I also added use of FMA in the AVX2 version.

f32 1920x1080 1 thread with prelut
c impl
1434012700 UNITS in lut3d->interp,       1 runs,      0 skips
1434035335 UNITS in lut3d->interp,       2 runs,      0 skips
1423615347 UNITS in lut3d->interp,       4 runs,      0 skips
1426268863 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
905484420 UNITS in lut3d->interp,       1 runs,      0 skips
905659010 UNITS in lut3d->interp,       2 runs,      0 skips
915167140 UNITS in lut3d->interp,       4 runs,      0 skips
915834222 UNITS in lut3d->interp,       8 runs,      0 skips

avx
574794860 UNITS in lut3d->interp,       1 runs,      0 skips
581035090 UNITS in lut3d->interp,       2 runs,      0 skips
584116720 UNITS in lut3d->interp,       4 runs,      0 skips
581460290 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
301698880 UNITS in lut3d->interp,       1 runs,      0 skips
301982880 UNITS in lut3d->interp,       2 runs,      0 skips
306962430 UNITS in lut3d->interp,       4 runs,      0 skips
305472025 UNITS in lut3d->interp,       8 runs,      0 skips

gbrap16 1920x1080 1 thread with prelut
c impl
1480894840 UNITS in lut3d->interp,       1 runs,      0 skips
1502922990 UNITS in lut3d->interp,       2 runs,      0 skips
1496114307 UNITS in lut3d->interp,       4 runs,      0 skips
1492554551 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
980777180 UNITS in lut3d->interp,       1 runs,      0 skips
986121520 UNITS in lut3d->interp,       2 runs,      0 skips
986489840 UNITS in lut3d->interp,       4 runs,      0 skips
998832248 UNITS in lut3d->interp,       8 runs,      0 skips

avx
622212360 UNITS in lut3d->interp,       1 runs,      0 skips
622981160 UNITS in lut3d->interp,       2 runs,      0 skips
645396315 UNITS in lut3d->interp,       4 runs,      0 skips
641057075 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
321336400 UNITS in lut3d->interp,       1 runs,      0 skips
321268920 UNITS in lut3d->interp,       2 runs,      0 skips
323459895 UNITS in lut3d->interp,       4 runs,      0 skips
324949967 UNITS in lut3d->interp,       8 runs,      0 skips
This commit is contained in:
Mark Reid 2021-10-05 20:58:30 -07:00 committed by Paul B Mahol
parent 5133f4c2c1
commit 716b396740
5 changed files with 840 additions and 56 deletions

83
libavfilter/lut3d.h Normal file
View File

@ -0,0 +1,83 @@
/*
* Copyright (c) 2013 Clément Bœsch
* Copyright (c) 2018 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFILTER_LUT3D_H
#define AVFILTER_LUT3D_H
#include "libavutil/pixdesc.h"
#include "framesync.h"
#include "avfilter.h"
/**
 * Interpolation method selector.
 *
 * LUT3DContext.interpolation stores one of these values. The x86 init code
 * only installs assembly for INTERPOLATE_TETRAHEDRAL.
 */
enum interp_mode {
    INTERPOLATE_NEAREST     = 0,
    INTERPOLATE_TRILINEAR   = 1,
    INTERPOLATE_TETRAHEDRAL = 2,
    INTERPOLATE_PYRAMID     = 3,
    INTERPOLATE_PRISM       = 4,
    NB_INTERP_MODE              /* count of the modes above, not a mode itself */
};
/**
 * One RGB triple of floats.
 *
 * The x86 assembly reads LUT entries as three consecutive floats (r at +0,
 * g at +4, b at +8), so the member order and the absence of padding matter.
 */
struct rgbvec {
    float r;
    float g;
    float b;
};
/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
* of 512x512 (64x64x64).
* NOTE(review): the uses of MAX_LEVEL and PRELUT_SIZE are not visible in this
* header; presumably they bound the LUT level and the pre-LUT table length --
* confirm against vf_lut3d.c. */
#define MAX_LEVEL 256
#define PRELUT_SIZE 65536
/**
 * Per-channel 1D table applied to the input before the 3D LUT lookup.
 *
 * The member order is mirrored by the "struc Lut3DPreLut" declaration in the
 * x86 assembly; do not reorder members without updating it.
 */
typedef struct Lut3DPreLut {
    int    size;      /* entries per table; callers treat size <= 0 as "no pre-LUT" */
    float  min[3];    /* per-channel value subtracted from the input before scaling */
    float  max[3];    /* per-channel maximum (not read by the x86 code) */
    float  scale[3];  /* per-channel factor mapping (input - min) to a table index */
    float *lut[3];    /* per-channel tables of floats */
} Lut3DPreLut;
/**
 * Private context of the lut3d filter (reached through AVFilterContext.priv).
 *
 * The members from class through scale are mirrored, in this exact order, by
 * the "struc LUT3DContext" declaration in the x86 assembly. Reordering or
 * inserting members before scale requires updating that declaration too.
 */
typedef struct LUT3DContext {
const AVClass *class;
struct rgbvec *lut;       /* 3D LUT entries, read by the asm as packed r,g,b floats */
int lutsize;              /* the asm clamps indices to lutsize - 1 and uses lutsize as the g-axis stride */
int lutsize2;             /* r-axis stride in entries; presumably lutsize * lutsize -- TODO confirm */
struct rgbvec scale;      /* per-channel factor; the asm multiplies it by (lutsize - 1) */
int interpolation; ///<interp_mode
char *file;
uint8_t rgba_map[4];
int step;
avfilter_action_func *interp; /* slice worker; ff_lut3d_init_x86() may replace it with a SIMD version */
Lut3DPreLut prelut;       /* optional 1D pre-LUT; the x86 wrappers ignore it when prelut.size <= 0 */
#if CONFIG_HALDCLUT_FILTER
uint8_t clut_rgba_map[4];
int clut_step;
int clut_bits;
int clut_planar;
int clut_float;
int clut_width;
FFFrameSync fs;
#endif
} LUT3DContext;
/** Argument handed to the slice workers: the frame pair to process. */
typedef struct ThreadData {
    AVFrame *in;   /* source frame */
    AVFrame *out;  /* destination frame; may be the same frame as in */
} ThreadData;
/**
 * Install an x86 SIMD slice worker into s->interp when one is available.
 *
 * s->interp is left untouched unless the CPU, s->interpolation and the pixel
 * format described by desc all match an assembly implementation.
 *
 * @param s    filter context; s->interpolation is read, s->interp may be written
 * @param desc descriptor of the negotiated pixel format
 */
void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
#endif /* AVFILTER_LUT3D_H */

View File

@ -31,73 +31,18 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/intfloat.h"
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avstring.h"
#include "avfilter.h"
#include "drawutils.h"
#include "formats.h"
#include "framesync.h"
#include "internal.h"
#include "video.h"
#include "lut3d.h"
/* Channel indices. NOTE(review): presumably used to index rgba_map[] --
 * the uses are outside this view, confirm. */
#define R 0
#define G 1
#define B 2
#define A 3
/* Interpolation method selector stored in LUT3DContext.interpolation.
 * NOTE(review): lut3d.h, included above, declares the same enum. */
enum interp_mode {
INTERPOLATE_NEAREST,
INTERPOLATE_TRILINEAR,
INTERPOLATE_TETRAHEDRAL,
INTERPOLATE_PYRAMID,
INTERPOLATE_PRISM,
NB_INTERP_MODE
};
/* One RGB triple of floats. */
struct rgbvec {
float r, g, b;
};
/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
* of 512x512 (64x64x64) */
#define MAX_LEVEL 256
#define PRELUT_SIZE 65536
/* Per-channel 1D table applied to the input before the 3D LUT lookup. */
typedef struct Lut3DPreLut {
int size;       /* entries per table */
float min[3];   /* per-channel input minimum */
float max[3];   /* per-channel input maximum */
float scale[3]; /* per-channel factor mapping (input - min) to a table index */
float* lut[3];  /* per-channel tables */
} Lut3DPreLut;
/* Private context of the lut3d filter.
 * NOTE(review): the member order here differs from the "struc LUT3DContext"
 * declared in the x86 assembly (class, lut, lutsize, lutsize2, scale); only
 * the definition in lut3d.h matches the offsets the assembly uses. */
typedef struct LUT3DContext {
const AVClass *class;
int interpolation; ///<interp_mode
char *file;
uint8_t rgba_map[4];
int step;
avfilter_action_func *interp;
struct rgbvec scale;
struct rgbvec *lut;
int lutsize;
int lutsize2;
Lut3DPreLut prelut;
#if CONFIG_HALDCLUT_FILTER
uint8_t clut_rgba_map[4];
int clut_step;
int clut_bits;
int clut_planar;
int clut_float;
int clut_width;
FFFrameSync fs;
#endif
} LUT3DContext;
/* Argument handed to the slice workers: the frame pair to process. */
typedef struct ThreadData {
AVFrame *in, *out;
} ThreadData;
/* Byte offset of member x inside LUT3DContext. */
#define OFFSET(x) offsetof(LUT3DContext, x)
/* Option flag sets; TFLAGS is FLAGS plus AV_OPT_FLAG_RUNTIME_PARAM. */
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
#define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
@ -1203,6 +1148,10 @@ static int config_input(AVFilterLink *inlink)
av_assert0(0);
}
if (ARCH_X86) {
ff_lut3d_init_x86(lut3d, desc);
}
return 0;
}

View File

@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o
OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o
X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o

View File

@ -0,0 +1,662 @@
;*****************************************************************************
;* x86-optimized functions for lut3d filter
;*
;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Broadcast float constants. Each holds 8 dwords so it can fill a full ymm register.
pd_1f: times 8 dd 1.0
; 3.0: LUT indices are pre-scaled by 3 because each LUT entry is 3 floats (r, g, b)
pd_3f: times 8 dd 3.0
; conversion factors between 16-bit integer samples and normalized floats
pd_65535f: times 8 dd 65535.0
pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
; pshufb mask used by LOAD16 (AVX path): spreads 4 words into 4 dwords;
; the 0x80 selectors write zero bytes, giving a zero extension
pb_shuffle16: db 0, 1, 0x80, 0x80, \
2, 3, 0x80, 0x80, \
4, 5, 0x80, 0x80, \
6, 7, 0x80, 0x80
; pshufb mask used by STORE16 (mmsize > 16): packs the low word of each of
; 4 dwords into the low 8 bytes and zeroes the high 8 bytes
pb_lo_pack_shuffle16: db 0, 1, 4, 5, \
8, 9, 12, 13, \
0x80, 0x80, 0x80, 0x80, \
0x80, 0x80, 0x80, 0x80
; same packing as above, but into the high 8 bytes with the low 8 bytes zeroed
pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
0x80, 0x80, 0x80, 0x80, \
0, 1, 4, 5, \
8, 9, 12, 13
SECTION .text
; Mirror of the C struct Lut3DPreLut (int size; float min[3], max[3], scale[3];
; float *lut[3]). Must be kept in sync with the C definition by hand.
struc Lut3DPreLut
.size: resd 1
.min: resd 3
.max: resd 3
.scale: resd 3
.lut: resq 3
endstruc
; Mirror of the leading members of the C struct LUT3DContext. Only the members
; read by this file are declared; their order must match the C definition.
struc LUT3DContext
.class: resq 1
.lut: resq 1
.lutsize: resd 1
.lutsize2: resd 1
.scale: resd 3
endstruc
; Hand-written mirror of the leading members of AVFrame, used to reach
; data[], linesize[] and width from assembly.
; NOTE(review): this hard-codes the libavutil AVFrame layout; confirm it still
; matches whenever AVFrame changes.
%define AV_NUM_DATA_POINTERS 8
struc AVFrame
.data: resq AV_NUM_DATA_POINTERS
.linesize: resd AV_NUM_DATA_POINTERS
.extended_data: resq 1
.width: resd 1
.height: resd 1
endstruc
; Stack frame layout (the frame is mmsize*16 + 64 bytes, see DEFINE_INTERP_FUNC).
; rm/gm/bm are three mmsize-byte scratch slots used by the scalar gather
; fallbacks. They are bare addresses, so users write [rm], [gm + off], etc.
%define rm rsp
%define gm rsp+mmsize
%define bm rsp+(mmsize*2)
; per-call constants, broadcast once in the prologue and reloaded from the stack
; as needed (these defines already include the brackets)
%define lut3dsizem [rsp+mmsize*3]
%define lut3dsize2m [rsp+mmsize*4]
%define lut3dmaxm [rsp+mmsize*5]
%define prelutmaxm [rsp+mmsize*6]
%define scalerm [rsp+mmsize*7]
%define scalegm [rsp+mmsize*8]
%define scalebm [rsp+mmsize*9]
%define prelutminrm [rsp+mmsize*10]
%define prelutmingm [rsp+mmsize*11]
%define prelutminbm [rsp+mmsize*12]
%define prelutscalerm [rsp+mmsize*13]
%define prelutscalegm [rsp+mmsize*14]
%define prelutscalebm [rsp+mmsize*15]
; data pointers
; current-row pointers for each source/destination plane, 8 bytes each
%define srcrm [rsp+mmsize*16 + 0]
%define srcgm [rsp+mmsize*16 + 8]
%define srcbm [rsp+mmsize*16 + 16]
%define srcam [rsp+mmsize*16 + 24]
%define dstrm [rsp+mmsize*16 + 32]
%define dstgm [rsp+mmsize*16 + 40]
%define dstbm [rsp+mmsize*16 + 48]
%define dstam [rsp+mmsize*16 + 56]
; Scalar gather of one lane for the pre-LUT (fallback when vgatherdps is unavailable).
; Reads the integer indices stored at [rm + %3] and [gm + %3], loads the floats
; at those indices from the table at tmpq, and overwrites the indices in place
; with the loaded values.
; 1 - prev
; 2 - next
; 3 - offset
; clobbers tmp2, tmp3, xm%1, xm%2
%macro FETCH_PRELUT_PN 3
mov tmp2d, [rm + %3]
mov tmp3d, [gm + %3]
movss xm%1, [tmpq + tmp2q*4]
movss xm%2, [tmpq + tmp3q*4]
movss [rm + %3], xm%1
movss [gm + %3], xm%2
%endmacro
; Gather the "prev" and "next" pre-LUT samples for every lane from the table at tmpq.
; 1 - p
; 2 - n
; 3 - p indices
; 4 - n indices
; AVX2 path: uses m7 and m9 as gather masks (clobbered).
; Other paths: spill the indices to the rm/gm stack slots, fetch lane by lane
; with FETCH_PRELUT_PN (clobbers tmp2, tmp3), then reload the results.
%macro GATHER_PRELUT 4
%if cpuflag(avx2)
vpcmpeqb m7, m7
vgatherdps m%1, [tmpq + m%3*4], m7 ; p
vpcmpeqb m9, m9
vgatherdps m%2, [tmpq + m%4*4], m9 ; n
%else
mova [rm], m%3
mova [gm], m%4
FETCH_PRELUT_PN %1, %2, 0
FETCH_PRELUT_PN %1, %2, 4
FETCH_PRELUT_PN %1, %2, 8
FETCH_PRELUT_PN %1, %2, 12
%if mmsize > 16
FETCH_PRELUT_PN %1, %2, 16
FETCH_PRELUT_PN %1, %2, 20
FETCH_PRELUT_PN %1, %2, 24
FETCH_PRELUT_PN %1, %2, 28
%endif
movu m%1, [rm]
movu m%2, [gm]
%endif
%endmacro
; %1 = floor(%2) as floats.
; The 128-bit path truncates through an integer round trip instead of using
; roundps, so it only equals floor for non-negative inputs; the callers in this
; file clamp their values to >= 0 first.
%macro FLOORPS 2
%if mmsize > 16
vroundps %1, %2, 0x01
%else
cvttps2dq %1, %2
cvtdq2ps %1, %1
%endif
%endmacro
; %1 = %2 * %3 + %1
; Uses a fused multiply-add on AVX2 builds. On the other paths %2 is
; overwritten with the product, so callers must not reuse %2 afterwards.
%macro MADD3 3
%if cpuflag(avx2)
vfmadd231ps %1, %2, %3
%else
mulps %2, %2, %3
addps %1, %1, %2
%endif
%endmacro
; Apply the 1D pre-LUT to one channel: map the value to a table position,
; fetch the two neighbouring entries and interpolate linearly between them.
; 1 - dst
; 2 - index
; 3 - min
; 4 - scale
; assumes lut max m13, m14 1.0f, zero m15
; clobbers m3-m6, m8, m10 and tmpq, plus whatever GATHER_PRELUT clobbers
%macro APPLY_PRELUT 4
; scale
subps m5, m%1, %3 ; v - min
mulps m5, m5, %4 ; v * scale
; clamp
maxps m5, m5, m15 ; max zero, Max first, NAN set to zero
minps m5, m5, m13 ; min lut max
FLOORPS m3, m5 ; prev index
subps m5, m5, m3 ; d
addps m4, m3, m14 ; p+1 = n index
minps m4, m4, m13 ; clamp n index
mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
cvttps2dq m6, m3
cvttps2dq m10, m4
GATHER_PRELUT %1, 4, 6, 10
; lerp
; dst = p + (n - p) * d
subps m8, m4, m%1
MADD3 m%1, m8, m5
%endmacro
; Scale one channel into LUT index space and clamp it to [0, lut max].
; 1 - dst
; 2 - scale
; assumes lut max m13, zero m15
%macro APPLY_SCALE 2
mulps m%1, m%1, %2
maxps m%1, m%1, m15 ; Max first, NAN set to zero
minps m%1, m%1, m13
%endmacro
; Per-lane select: %1 = mask ? %3 : %2, with the mask in %4.
; The 128-bit path emulates the blend as ((%2 ^ %3) & %4) ^ %2, which requires
; each mask lane to be all ones or all zeros and %1 to be distinct from %2 and
; %3 (enforced by the %error checks below).
%macro BLEND 4
%if mmsize > 16
vblendvps %1, %2, %3, %4
%else
%ifidni %1,%2
%error operand 1 must not equal operand 2
%endif
%ifidni %1,%3
%error operand 1 must not equal operand 3
%endif
mova %1, %2
xorps %1, %3
andps %1, %4
xorps %1, %2
%endif
%endmacro
; %1 = %2 + %3 + %4
%macro ADD3 4
addps %1, %2, %3
addps %1, %1, %4
%endmacro
; Scalar gather of one lane from the 3D LUT (fallback when vgatherdps is unavailable).
; Reads the pre-scaled integer index stored at [rm + %4], loads the three
; consecutive floats (r, g, b) of that entry from the table at tmpq and stores
; them to the rm/gm/bm slots at the same lane offset. The r value overwrites
; the index it was read from.
; 1 - r tmp, 2 - g tmp, 3 - b tmp, 4 - lane byte offset
; clobbers tmp2, xm%1, xm%2, xm%3
%macro FETCH_LUT3D_RGB 4
mov tmp2d, [rm + %4]
movss xm%1, [tmpq + tmp2q*4 + 0]
movss xm%2, [tmpq + tmp2q*4 + 4]
movss xm%3, [tmpq + tmp2q*4 + 8]
movss [rm + %4], xm%1
movss [gm + %4], xm%2
movss [bm + %4], xm%3
%endmacro
; Gather the r, g and b components of the LUT entries selected by the
; pre-scaled integer indices in m%4, reading from the table at tmpq.
; 1 - dstr
; 2 - dstg
; 3 - dstb
; 4 - indices
; AVX2 path: uses m3, m14 and m15 as gather masks (clobbered), so the 1.0 and
; zero constants normally kept in m14/m15 must be reloaded afterwards.
; Other paths: go through the rm/gm/bm stack slots and clobber tmp2.
%macro GATHER_LUT3D_INDICES 4
%if cpuflag(avx2)
vpcmpeqb m3, m3
vgatherdps m%1, [tmpq + m%4*4 + 0], m3
vpcmpeqb m14, m14
vgatherdps m%2, [tmpq + m%4*4 + 4], m14
vpcmpeqb m15, m15
vgatherdps m%3, [tmpq + m%4*4 + 8], m15
%else
movu [rm], m%4
FETCH_LUT3D_RGB %1, %2, %3, 0
FETCH_LUT3D_RGB %1, %2, %3, 4
FETCH_LUT3D_RGB %1, %2, %3, 8
FETCH_LUT3D_RGB %1, %2, %3, 12
%if mmsize > 16
FETCH_LUT3D_RGB %1, %2, %3, 16
FETCH_LUT3D_RGB %1, %2, %3, 20
FETCH_LUT3D_RGB %1, %2, %3, 24
FETCH_LUT3D_RGB %1, %2, %3, 28
%endif
movu m%1, [rm]
movu m%2, [gm]
movu m%3, [bm]
%endif
%endmacro
; Tetrahedral interpolation of one vector of pixels.
; in:  m0/m1/m2 = r/g/b already scaled and clamped to [0, lut max],
;      m13 = lut max, m14 = 1.0f
; out: m0/m1/m2 = interpolated r/g/b
; The result is c000*(1-x0) + cxxxa*(x0-x1) + cxxxb*(x1-x2) + c111*x2, where
; x0 >= x1 >= x2 are the sorted fractional parts and cxxxa/cxxxb are the two
; intermediate corners chosen by the comparisons below.
; clobbers m3-m15, tmpq and whatever GATHER_LUT3D_INDICES clobbers
%macro interp_tetrahedral 0
%define d_r m0
%define d_g m1
%define d_b m2
%define prev_r m3
%define prev_g m4
%define prev_b m5
%define next_r m6
%define next_g m7
%define next_b m8
; x0..x2 alias prev_g/prev_b/next_r; they are only written after those
; registers have been consumed by the corner sums below
%define x0 m4
%define x1 m5
%define x2 m6
; setup prev index
FLOORPS prev_r, m0
FLOORPS prev_g, m1
FLOORPS prev_b, m2
; setup deltas
subps d_r, m0, prev_r
subps d_g, m1, prev_g
subps d_b, m2, prev_b
; setup next index
addps next_r, prev_r, m14 ; +1
minps next_r, next_r, m13 ; clamp lutmax
addps next_g, prev_g, m14 ; +1
minps next_g, next_g, m13 ; clamp lutmax
addps next_b, prev_b, m14 ; +1
minps next_b, next_b, m13 ; clamp lutmax
; prescale indices
; each axis index is turned into its offset in floats: r * lutsize2*3,
; g * lutsize*3, b * 3, so the corner offset is just the sum of three terms
mulps prev_r, prev_r, lut3dsize2m
mulps next_r, next_r, lut3dsize2m
mulps prev_g, prev_g, lut3dsizem
mulps next_g, next_g, lut3dsizem
mulps prev_b, prev_b, [pd_3f]
mulps next_b, next_b, [pd_3f]
; cxxxa m10
; 1 is the delta that is the largest
; r> == c100 == (r>g && r>b)
; g> == c010 == (g>r && g>b)
; b> == c001 == (b>r && b>g)
; if delta > other 2 use next else prev
; cxxxb m11;
; 0 is the delta that is the smallest
; r< == c011 == (r<=g && r<=b)
; g< == c101 == (g<=r && g<=b)
; b< == c110 == (b<=r && b<=g)
; if delta <= other 2 use prev else next
; NOTE(review): 0x1E is an AVX-only compare predicate (GT_OQ); the legacy SSE
; cmpps encoding only defines predicates 0-7 -- confirm the SSE2 build
; assembles this to the intended greater-than comparison.
; m13/m14 (lut max, 1.0) are no longer needed and are reused as compare masks
cmpps m13, d_r, d_g, 0x1E ; r>g
cmpps m14, d_g, d_b, 0x1E ; g>b
cmpps m15, d_b, d_r, 0x1E ; b>r
; r> !b>r && r>g
andnps m9, m15, m13
BLEND m10, prev_r, next_r, m9
; r< !r>g && b>r
andnps m9, m13, m15
BLEND m11, next_r, prev_r, m9
; g> !r>g && g>b
andnps m9, m13, m14
BLEND m12, prev_g, next_g, m9
addps m10, m12
; g< !g>b && r>g
andnps m9, m14, m13
BLEND m12, next_g, prev_g, m9
addps m11, m12
; b> !g>b && b>r
andnps m9, m14, m15
BLEND m12, prev_b, next_b, m9
addps m10, m12
; b< !b>r && g>b
andnps m9, m15, m14
BLEND m12, next_b, prev_b, m9
addps m11, m12
; c000 m12;
ADD3 m12, prev_r, prev_g, prev_b
; c111 m13;
ADD3 m13, next_r, next_g, next_b
; sort delta r,g,b x0 >= x1 >= x2
minps m7, d_r, d_g
maxps m8, d_r, d_g
minps x2, m7, d_b
maxps m7, m7, d_b
maxps x0, m8, d_b
minps x1, m8, m7
; convert indices to integer
cvttps2dq m12, m12
cvttps2dq m10, m10
cvttps2dq m11, m11
cvttps2dq m13, m13
; now the gathering festival
mov tmpq, [ctxq + LUT3DContext.lut]
; c000 * (1 - x0); the gather overwrites the deltas in m0-m2, which is fine
; because the sorted copies live in x0-x2
GATHER_LUT3D_INDICES 0, 1, 2, 12
movu m14, [pd_1f]
subps m14, m14, x0; 1 - x0
mulps m0, m0, m14
mulps m1, m1, m14
mulps m2, m2, m14
; + cxxxa * (x0 - x1)
GATHER_LUT3D_INDICES 7, 8, 9, 10
subps m14, x0, x1; x0 - x1
MADD3 m0, m7, m14
MADD3 m1, m8, m14
MADD3 m2, m9, m14
; + cxxxb * (x1 - x2)
GATHER_LUT3D_INDICES 7, 8, 9, 11
subps m14, x1, x2; x1 - x2
MADD3 m0, m7, m14
MADD3 m1, m8, m14
MADD3 m2, m9, m14
; + c111 * x2
GATHER_LUT3D_INDICES 7, 8, 9, 13
MADD3 m0, m7, x2
MADD3 m1, m8, x2
MADD3 m2, m9, x2
%endmacro
; Compute the address of the first row of this slice for one plane and store
; it in a stack slot: %1 = frame->data[%3] + frame->linesize[%3] * slice_start
; 1 - destination stack slot (e.g. srcrm)
; 2 - register holding the AVFrame pointer
; 3 - plane index
; linesize and slice_start are 32-bit signed ints. Both are sign-extended
; before the 64-bit multiply/add so that frames with a negative linesize
; (bottom-up layout) yield a correct address instead of one offset by 4 GiB.
; clobbers ptr, tmp, tmp2
%macro INIT_DATA_PTR 3
mov ptrq, [%2 + AVFrame.data + %3 * 8]
movsxd tmpq, dword [%2 + AVFrame.linesize + %3 * 4]
movsxd tmp2q, slice_startd
imul tmpq, tmp2q
add ptrq, tmpq
mov %1, ptrq
%endmacro
; Advance one plane's row pointer by one line: %1 += frame->linesize[%3]
; 1 - stack slot holding the row pointer (e.g. srcrm)
; 2 - register holding the AVFrame pointer
; 3 - plane index
; linesize is a 32-bit signed int; it is sign-extended so that a negative
; stride moves the pointer backwards instead of adding a large positive value.
; clobbers ptr, tmp
%macro INC_DATA_PTR 3
movsxd tmpq, dword [%2 + AVFrame.linesize + %3 * 4]
mov ptrq, %1
add ptrq, tmpq
mov %1, ptrq
%endmacro
; Load mmsize/4 16-bit samples at column x from the row pointer in stack slot
; %2, zero-extend them to dwords and convert to floats scaled by 1/65535.
; 1 - destination register number
; 2 - stack slot holding the row pointer
; expects m7 = pd_65535_invf; the AVX (non-AVX2) path also expects
; xm6 = pb_shuffle16 and clobbers xm4
; clobbers ptr
%macro LOAD16 2
mov ptrq, %2
%if mmsize > 16
movu xm%1, [ptrq + xq*2]
%else
movsd xm%1, [ptrq + xq*2]
%endif
%if cpuflag(avx2)
vpmovzxwd m%1, xm%1
%else
%if mmsize > 16
; swap the two 64-bit halves into xm4, zero-extend 4 words in each half with
; pshufb, then join them into one ymm register
pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
pshufb xm%1, xm6 ; pb_shuffle16
pshufb xm4, xm6 ; pb_shuffle16
vinsertf128 m%1, m%1, xm4, 1
%else
; SSE2 has no pshufb: interleave the 4 loaded words with the zero upper half
; of the register to get 4 zero-extended dwords
pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
%endif
%endif
cvtdq2ps m%1, m%1
mulps m%1, m%1, m7 ; pd_65535_invf
%endmacro
; Scale the floats in m%2 by 65535, clamp them to [0, 65535], convert them to
; integers by truncation, pack them to 16-bit samples and store them at column
; x of the row pointer held in stack slot %1.
; 1 - stack slot holding the row pointer
; 2 - source register number (clobbered)
; expects m5 = pd_65535f and m15 = zero; for mmsize > 16 also
; xm6 = pb_lo_pack_shuffle16 and xm7 = pb_hi_pack_shuffle16, and xm4 is clobbered
; clobbers ptr
%macro STORE16 2
mulps m%2, m%2, m5 ; [pd_65535f]
minps m%2, m%2, m5 ; [pd_65535f]
maxps m%2, m%2, m15 ; zero
cvttps2dq m%2, m%2
%if mmsize > 16
; pack each 128-bit half to 4 words, low half into bytes 0-7 and high half
; into bytes 8-15, then merge
vextractf128 xm4, m%2, 1
pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
pshufb xm4, xm7 ; [pb_hi_pack_shuffle16]
por xm%2, xm4
%else
; SSE2: move the low word of each dword into the low 8 bytes
pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
%endif
mov ptrq, %1
%if mmsize > 16
movu [ptrq + xq*2], xm%2
%else
movsd [ptrq + xq*2], xm%2
%endif
%endmacro
; Emit one slice-processing function interp_<method>_<format> for the current
; instruction set.
; 1 - interp method
; 2 - format_name
; 3 - depth
; 4 - is float format
; Function arguments: ctx (LUT3DContext*), prelut (Lut3DPreLut* or NULL),
; src_image/dst_image (AVFrame*), slice_start, slice_end (rows), has_alpha.
; The function processes rows [slice_start, slice_end) of planar data, mmsize/4
; pixels per inner iteration. The stack frame holds 16 vector slots plus 8 row
; pointers (see the slot defines near the top of the file).
; NOTE(review): the x loop has no scalar tail; when width is not a multiple of
; mmsize/4 the last iteration reads and writes past width. Presumably this
; relies on row padding in the frame buffers -- confirm.
%macro DEFINE_INTERP_FUNC 4
cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
; store lut max and lutsize
; lut3dsizem = lutsize * 3 (g-axis stride in floats), lut3dmaxm = lutsize - 1
mov tmpd, dword [ctxq + LUT3DContext.lutsize]
cvtsi2ss xm0, tmpd
mulss xm0, xm0, [pd_3f]
VBROADCASTSS m0, xm0
mova lut3dsizem, m0
sub tmpd, 1
cvtsi2ss xm0, tmpd
VBROADCASTSS m0, xm0
mova lut3dmaxm, m0
; per-channel input scale = (lutsize - 1) * ctx->scale.{r,g,b}
; scale_r
mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
VBROADCASTSS m1, xm1
mova scalerm, m1
; scale_g
mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
VBROADCASTSS m1, xm1
mova scalegm, m1
; scale_b
mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
VBROADCASTSS m1, xm1
mova scalebm, m1
; store lutsize2
; lut3dsize2m = lutsize2 * 3 (r-axis stride in floats)
cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
mulss xm0, xm0, [pd_3f]
VBROADCASTSS m0, xm0
mova lut3dsize2m, m0
; init prelut values
; skipped entirely when the caller passed a NULL prelut
cmp prelutq, 0
je %%skip_init_prelut
mov tmpd, dword [prelutq + Lut3DPreLut.size]
sub tmpd, 1
cvtsi2ss xm0, tmpd
VBROADCASTSS m0, xm0
mova prelutmaxm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
mova prelutminrm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
mova prelutmingm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
mova prelutminbm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
mova prelutscalerm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
mova prelutscalegm, m0
VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
mova prelutscalebm, m0
%%skip_init_prelut:
mov widthd, [src_imageq + AVFrame.width]
; gbra pixel order
; plane 0 = g, 1 = b, 2 = r, 3 = a
INIT_DATA_PTR srcrm, src_imageq, 2
INIT_DATA_PTR srcgm, src_imageq, 0
INIT_DATA_PTR srcbm, src_imageq, 1
INIT_DATA_PTR srcam, src_imageq, 3
INIT_DATA_PTR dstrm, dst_imageq, 2
INIT_DATA_PTR dstgm, dst_imageq, 0
INIT_DATA_PTR dstbm, dst_imageq, 1
INIT_DATA_PTR dstam, dst_imageq, 3
%%loop_y:
xor xq, xq
%%loop_x:
; m14 = 1.0 and m15 = 0 are reloaded every iteration because the
; interpolation code reuses both registers
movu m14, [pd_1f]
xorps m15, m15, m15
%if %4 ; float
mov ptrq, srcrm
movu m0, [ptrq + xq*4]
mov ptrq, srcgm
movu m1, [ptrq + xq*4]
mov ptrq, srcbm
movu m2, [ptrq + xq*4]
%else
; constants for LOAD16
movu m7, [pd_65535_invf]
%if notcpuflag(avx2) && mmsize >= 32
movu xm6, [pb_shuffle16]
%endif
LOAD16 0, srcrm
LOAD16 1, srcgm
LOAD16 2, srcbm
%endif
cmp prelutq, 0
je %%skip_prelut
mova m13, prelutmaxm
APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
%%skip_prelut:
mova m13, lut3dmaxm
APPLY_SCALE 0, scalerm
APPLY_SCALE 1, scalegm
APPLY_SCALE 2, scalebm
interp_%1
%if %4 ; float
mov ptrq, dstrm
movu [ptrq + xq*4], m0
mov ptrq, dstgm
movu [ptrq + xq*4], m1
mov ptrq, dstbm
movu [ptrq + xq*4], m2
; alpha is copied through unchanged when requested
cmp has_alphad, 0
je %%skip_alphaf
mov ptrq, srcam
movu m0, [ptrq + xq*4]
mov ptrq, dstam
movu [ptrq + xq*4], m0
%%skip_alphaf:
%else
; constants for STORE16
movu m5, [pd_65535f]
%if mmsize > 16
movu xm6, [pb_lo_pack_shuffle16]
movu xm7, [pb_hi_pack_shuffle16]
%endif
xorps m15, m15, m15
STORE16 dstrm, 0
STORE16 dstgm, 1
STORE16 dstbm, 2
; alpha is copied through unchanged when requested
cmp has_alphad, 0
je %%skip_alpha
%if mmsize > 16
mov ptrq, srcam
movu xm0, [ptrq + xq*2]
mov ptrq, dstam
movu [ptrq + xq*2], xm0
%else
mov ptrq, srcam
movsd xm0, [ptrq + xq*2]
mov ptrq, dstam
movsd [ptrq + xq*2], xm0
%endif
%%skip_alpha:
%endif
add xq, mmsize/4
cmp xd, widthd
jl %%loop_x
; step every plane pointer to the next row
INC_DATA_PTR srcrm, src_imageq, 2
INC_DATA_PTR srcgm, src_imageq, 0
INC_DATA_PTR srcbm, src_imageq, 1
INC_DATA_PTR srcam, src_imageq, 3
INC_DATA_PTR dstrm, dst_imageq, 2
INC_DATA_PTR dstgm, dst_imageq, 0
INC_DATA_PTR dstbm, dst_imageq, 1
INC_DATA_PTR dstam, dst_imageq, 3
; slice_start doubles as the current row counter
inc slice_startd
cmp slice_startd, slice_endd
jl %%loop_y
RET
%endmacro
; Instantiate the tetrahedral kernels for each instruction set, in a float
; (pf32) and a 16-bit integer (p16) variant. The code needs 16 vector registers
; and 64-bit pointers, so it is only built for x86-64.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif
INIT_XMM sse2
DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
%endif

View File

@ -0,0 +1,88 @@
/*
* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/lut3d.h"
/*
 * Declare the assembly kernel ff_interp_<name>_<format>_<opt>() and define a
 * static slice worker interp_<name>_<format>_<opt>() that calls it.
 *
 * The worker takes the frame pair from the ThreadData argument, derives the
 * row range [slice_start, slice_end) of job jobnr out of nb_jobs from the
 * input height, passes a NULL pre-LUT when prelut.size is not positive, and
 * requests an alpha copy only when the input has a fourth plane and the
 * filter is not operating in place. It always returns 0.
 */
#define DEFINE_INTERP_FUNC(name, format, opt)                                  \
void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d,                  \
                                         Lut3DPreLut *prelut,                  \
                                         AVFrame *src, AVFrame *dst,           \
                                         int slice_start, int slice_end,       \
                                         int has_alpha);                       \
static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg,   \
                                            int jobnr, int nb_jobs)            \
{                                                                              \
    LUT3DContext *priv     = ctx->priv;                                        \
    ThreadData   *frames   = arg;                                              \
    AVFrame      *src      = frames->in;                                       \
    AVFrame      *dst      = frames->out;                                      \
    Lut3DPreLut  *pre      = NULL;                                             \
    const int     rows     = src->height;                                      \
    const int     first    = (rows *  jobnr     ) / nb_jobs;                   \
    const int     last     = (rows * (jobnr + 1)) / nb_jobs;                   \
    const int     copy_a   = src->linesize[3] && dst != src;                   \
                                                                               \
    if (priv->prelut.size > 0)                                                 \
        pre = &priv->prelut;                                                   \
                                                                               \
    ff_interp_##name##_##format##_##opt(priv, pre, src, dst,                   \
                                        first, last, copy_a);                  \
    return 0;                                                                  \
}
/* Slice workers for every assembly kernel that is built. The kernels exist
 * only for x86-64; the AVX2 and AVX ones additionally depend on assembler
 * support (HAVE_*_EXTERNAL). */
#if ARCH_X86_64
#if HAVE_AVX2_EXTERNAL
DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
DEFINE_INTERP_FUNC(tetrahedral, p16, avx2)
#endif
#if HAVE_AVX_EXTERNAL
DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
DEFINE_INTERP_FUNC(tetrahedral, p16, avx)
#endif
DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
DEFINE_INTERP_FUNC(tetrahedral, p16, sse2)
#endif
/**
 * Install an x86 SIMD slice worker into s->interp when one is available.
 *
 * Assembly exists only for tetrahedral interpolation on planar formats, in a
 * float variant and a 16-bit integer variant. The best instruction set
 * reported by the CPU is chosen (AVX2 with FMA3, then AVX, then SSE2). In
 * every other case s->interp is left untouched.
 *
 * @param s    filter context; s->interpolation is read, s->interp may be written
 * @param desc descriptor of the negotiated pixel format
 */
av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc)
{
    int cpu_flags = av_get_cpu_flags();
    int planar    = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
    int isfloat   = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
    int depth     = desc->comp[0].depth;

#if ARCH_X86_64
    /* Every assembly kernel shares these two requirements. */
    if (s->interpolation != INTERPOLATE_TETRAHEDRAL || !planar) {
        return;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags) && EXTERNAL_FMA3(cpu_flags)) {
#if HAVE_AVX2_EXTERNAL
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_avx2;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_avx2;
        }
#endif
    } else if (EXTERNAL_AVX_FAST(cpu_flags)) {
#if HAVE_AVX_EXTERNAL
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_avx;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_avx;
        }
#endif
    } else if (EXTERNAL_SSE2(cpu_flags)) {
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_sse2;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_sse2;
        }
    }
#endif
}