From d6083f503d5bd7f9a2540c3e30d95e7add765d1e Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Sun, 25 Feb 2024 10:49:35 -0500
Subject: [PATCH] avcodec/x86/hevc: fix luma 12b overflow

Signed-off-by: J. Dekker
---
 libavcodec/x86/hevc_deblock.asm | 40 +++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 85ee4800bb..61b79f8079 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -541,19 +541,41 @@ ALIGN 16
     add              betaq, r13
     shr              betaq, 3; ((beta + (beta >> 1)) >> 3))

-    mova             m13, [pw_8]
     psubw            m12, m4, m3 ; q0 - p0
-    psllw            m10, m12, 3; 8 * (q0 - p0)
-    paddw            m12, m10 ; 9 * (q0 - p0)
-
+    paddw            m10, m12, m12 ; 2 * (q0 - p0)
+    paddw            m12, m10 ; 3 * (q0 - p0)
     psubw            m10, m5, m2 ; q1 - p1
-    psllw            m8, m10, 1; 2 * ( q1 - p1 )
-    paddw            m10, m8; 3 * ( q1 - p1 )
-    psubw            m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
-    paddw            m12, m13; + 8
+    psubw            m12, m10 ; d = 3 * (q0 - p0) - (q1 - p1)
+%if %1 < 12 ; NOTE(review): %1 is presumably the bit depth - confirm against the macro definition
+    paddw            m10, m12, m12 ; 2 * d
+    paddw            m12, [pw_8]; + 8
+    paddw            m12, m10 ; 3 * d + 8 = 9 * (q0 - p0) - 3 * ( q1 - p1 ) + 8
     psraw            m12, 4; >> 4 , delta0
     PABSW            m13, m12; abs(delta0)
-
+%elif cpuflag(ssse3) ; %1 >= 12: scale abs(d) and shift unsigned (psrlw), then re-apply the sign
+    pabsw            m13, m12 ; abs(d)
+    paddw            m10, m13, m13 ; 2 * abs(d)
+    paddw            m13, [pw_8] ; abs(d) + 8
+    paddw            m13, m10 ; 3 * abs(d) + 8 = abs(9 * (q0 - p0) - 3 * ( q1 - p1 )) + 8
+    pxor             m10, m10
+    pcmpgtw          m10, m12 ; m10 = -1 in lanes where d < 0, else 0
+    paddw            m13, m10 ; - 1 where d < 0, so the unsigned shift rounds like (3 * d + 8) >> 4
+    psrlw            m13, 4; >> 4, abs(delta0)
+    psignw           m10, m13, m12 ; m10 = abs(delta0) with the sign of d applied = delta0
+    SWAP             10, 12 ; x86inc register rename: m12 now names delta0
+%else ; %1 >= 12 without ssse3: same computation with abs/sign done via a compare mask
+    pxor             m10, m10
+    pcmpgtw          m10, m12 ; m10 = -1 in lanes where d < 0, else 0
+    pxor             m12, m10
+    psubw            m12, m10 ; (d ^ mask) - mask = abs(d)
+    paddw            m13, m12, m12 ; 2 * abs(d)
+    paddw            m12, [pw_8] ; abs(d) + 8
+    paddw            m13, m12 ; 3 * abs(d) + 8
+    paddw            m13, m10 ; - 1 where d < 0, so the unsigned shift rounds like (3 * d + 8) >> 4
+    psrlw            m13, 4 ; >> 4, abs(delta0)
+    pxor             m12, m13, m10
+    psubw            m12, m10 ; (abs(delta0) ^ mask) - mask = delta0
+%endif
     psllw            m10, m9, 2; 8 * tc
     paddw            m10, m9; 10 * tc