lavu/tx: refactor and separate codelet list and prio code

This commit is contained in:
Lynne 2022-11-17 22:14:53 +01:00
parent 958b3760b5
commit 1c8d77a2bf
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464

View File

@ -300,6 +300,67 @@ static const FFTXCodelet * const ff_tx_null_list[] = {
NULL,
};
/* Array of all compiled codelet lists. Order is irrelevant. */
static const FFTXCodelet * const * const codelet_list[] = {
ff_tx_codelet_list_float_c,
ff_tx_codelet_list_double_c,
ff_tx_codelet_list_int32_c,
ff_tx_null_list,
#if HAVE_X86ASM
ff_tx_codelet_list_float_x86,
#endif
#if ARCH_AARCH64
ff_tx_codelet_list_float_aarch64,
#endif
};
static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
static const int cpu_slow_penalties[][2] = {
{ AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
{ AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
{ AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
{ AV_CPU_FLAG_ATOM, 1 + 128 },
{ AV_CPU_FLAG_AVXSLOW, 1 + 128 },
{ AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
};
static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
{
int prio = cd->prio;
int max_factor = 0;
/* If the CPU has a SLOW flag, and the instruction is also flagged
* as being slow for such, reduce its priority */
for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
prio -= cpu_slow_penalties[i][1];
}
/* Prioritize aligned-only codelets */
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
prio += 64;
/* Codelets for specific lengths are generally faster */
if ((len == cd->min_len) && (len == cd->max_len))
prio += 64;
/* Forward-only or inverse-only transforms are generally better */
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
prio += 64;
/* Larger factors are generally better */
for (int i = 0; i < TX_MAX_SUB; i++)
max_factor = FFMAX(cd->factors[i], max_factor);
if (max_factor)
prio += 16*max_factor;
return prio;
}
#if !CONFIG_SMALL
static void print_flags(AVBPrint *bp, uint64_t f)
{
@ -465,41 +526,15 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
AVTXContext *sub = NULL;
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
unsigned int cd_matches_size = 0;
int codelet_list_idx = codelet_list_num;
int nb_cd_matches = 0;
#if !CONFIG_SMALL
AVBPrint bp = { 0 };
#endif
/* Array of all compiled codelet lists. Order is irrelevant. */
const FFTXCodelet * const * const codelet_list[] = {
ff_tx_codelet_list_float_c,
ff_tx_codelet_list_double_c,
ff_tx_codelet_list_int32_c,
ff_tx_null_list,
#if HAVE_X86ASM
ff_tx_codelet_list_float_x86,
#endif
#if ARCH_AARCH64
ff_tx_codelet_list_float_aarch64,
#endif
};
int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
/* We still accept functions marked with SLOW, even if the CPU is
* marked with the same flag, but we give them lower priority. */
const int cpu_flags = av_get_cpu_flags();
const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
static const int slow_penalties[][2] = {
{ AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
{ AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
{ AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
{ AV_CPU_FLAG_ATOM, 1 + 128 },
{ AV_CPU_FLAG_AVXSLOW, 1 + 128 },
{ AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
};
/* Flags the transform wants */
uint64_t req_flags = flags;
@ -519,13 +554,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
/* Loop through all codelets in all codelet lists to find matches
* to the requirements */
while (codelet_list_num--) {
const FFTXCodelet * const * list = codelet_list[codelet_list_num];
while (codelet_list_idx--) {
const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
const FFTXCodelet *cd = NULL;
while ((cd = *list++)) {
int max_factor = 0;
/* Check if the type matches */
if (cd->type != TX_TYPE_ANY && type != cd->type)
continue;
@ -546,7 +579,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
/* Check if the CPU supports the required ISA */
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
!(cpu_flags & (cd->cpu_flags & ~slow_mask)))
!(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
continue;
/* Check for factors */
@ -563,33 +596,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
cd_matches = cd_tmp;
cd_matches[nb_cd_matches].cd = cd;
cd_matches[nb_cd_matches].prio = cd->prio;
/* If the CPU has a SLOW flag, and the instruction is also flagged
* as being slow for such, reduce its priority */
for (int i = 0; i < FF_ARRAY_ELEMS(slow_penalties); i++) {
if ((cpu_flags & cd->cpu_flags) & slow_penalties[i][0])
cd_matches[nb_cd_matches].prio -= slow_penalties[i][1];
}
/* Prioritize aligned-only codelets */
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
cd_matches[nb_cd_matches].prio += 64;
/* Codelets for specific lengths are generally faster */
if ((len == cd->min_len) && (len == cd->max_len))
cd_matches[nb_cd_matches].prio += 64;
/* Forward-only or inverse-only transforms are generally better */
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
cd_matches[nb_cd_matches].prio += 64;
/* Larger factors are generally better */
for (int i = 0; i < TX_MAX_SUB; i++)
max_factor = FFMAX(cd->factors[i], max_factor);
if (max_factor)
cd_matches[nb_cd_matches].prio += 16*max_factor;
cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
nb_cd_matches++;
}
}