h264: make slice threading work with deblocking_filter=1

In such a case, decode the MBs in parallel without the loop filter, then
execute the filter serially.

The ref2frm array was previously moved to H264SliceContext. That was
incorrect, since it applies to all the slices and should properly be in
H264Context (it did not actually break decoding, since this distinction
only becomes relevant with slice threading and deblocking_filter=1,
which was not implemented before this commit). The ref2frm array is thus
moved back to H264Context.
This commit is contained in:
Anton Khirnov 2016-04-13 13:52:36 +02:00
parent 370ddc7b38
commit b77fffa127
3 changed files with 48 additions and 32 deletions

View File

@ -837,7 +837,6 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size)
nal->ref_idc == 0 && nal->type != NAL_SEI)
continue;
again:
// FIXME these should stop being context-global variables
h->nal_ref_idc = nal->ref_idc;
h->nal_unit_type = nal->type;
@ -947,10 +946,6 @@ again:
if (err < 0) {
av_log(h->avctx, AV_LOG_ERROR, "decode_slice_header error\n");
sl->ref_count[0] = sl->ref_count[1] = sl->list_count = 0;
} else if (err == 1) {
/* Slice could not be decoded in parallel mode, restart. */
sl = &h->slice_ctx[0];
goto again;
}
}
if (context_count) {

View File

@ -392,7 +392,6 @@ typedef struct H264SliceContext {
H264Ref ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs.
* Reordered version of default_ref_list
* according to picture reordering in slice header */
int ref2frm[MAX_SLICES][2][64]; ///< reference to frame number lists, used in the loop filter, the first 2 are for -2,-1
const uint8_t *intra_pcm_ptr;
int16_t *dc_val_base;
@ -470,6 +469,11 @@ typedef struct H264Context {
int context_initialized;
int flags;
int workaround_bugs;
/* Set when slice threading is used and at least one slice uses deblocking
* mode 1 (i.e. across slice boundaries). Then we disable the loop filter
* during normal MB decoding and execute it serially at the end.
*/
int postpone_filter;
int8_t(*intra4x4_pred_mode);
H264PredContext hpc;
@ -591,12 +595,6 @@ typedef struct H264Context {
int slice_context_count;
/**
* 1 if the single thread fallback warning has already been
* displayed, 0 otherwise.
*/
int single_decode_warning;
/** @} */
/**
@ -642,6 +640,7 @@ typedef struct H264Context {
AVBufferPool *mb_type_pool;
AVBufferPool *motion_val_pool;
AVBufferPool *ref_index_pool;
int ref2frm[MAX_SLICES][2][64]; ///< reference to frame number lists, used in the loop filter, the first 2 are for -2,-1
} H264Context;
extern const uint16_t ff_h264_mb_sizes[4];

View File

@ -498,6 +498,8 @@ static int h264_frame_start(H264Context *h)
h->next_output_pic = NULL;
h->postpone_filter = 0;
assert(h->cur_pic_ptr->long_ref == 0);
return 0;
@ -920,7 +922,7 @@ static int h264_slice_header_init(H264Context *h)
*
* @param h h264context
*
* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
* @return 0 if okay, <0 if an error occurred
*/
int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
{
@ -1481,17 +1483,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
* Do not bother to deblock across slices. */
sl->deblocking_filter = 2;
} else {
h->max_contexts = 1;
if (!h->single_decode_warning) {
av_log(h->avctx, AV_LOG_INFO,
"Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
h->single_decode_warning = 1;
}
if (sl != h->slice_ctx) {
av_log(h->avctx, AV_LOG_ERROR,
"Deblocking switched inside frame.\n");
return 1;
}
h->postpone_filter = 1;
}
}
sl->qp_thresh = 15 -
@ -1509,7 +1501,7 @@ int ff_h264_decode_slice_header(H264Context *h, H264SliceContext *sl)
for (j = 0; j < 2; j++) {
int id_list[16];
int *ref2frm = sl->ref2frm[sl->slice_num & (MAX_SLICES - 1)][j];
int *ref2frm = h->ref2frm[sl->slice_num & (MAX_SLICES - 1)][j];
for (i = 0; i < 16; i++) {
id_list[i] = 60;
if (j < sl->list_count && i < sl->ref_count[j] &&
@ -1597,7 +1589,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
if (USES_LIST(top_type, list)) {
const int b_xy = h->mb2b_xy[top_xy] + 3 * b_stride;
const int b8_xy = 4 * top_xy + 2;
int (*ref2frm)[64] = sl->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
int (*ref2frm)[64] = h->ref2frm[h->slice_table[top_xy] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
AV_COPY128(mv_dst - 1 * 8, h->cur_pic.motion_val[list][b_xy + 0]);
ref_cache[0 - 1 * 8] =
ref_cache[1 - 1 * 8] = ref2frm[list][h->cur_pic.ref_index[list][b8_xy + 0]];
@ -1612,7 +1604,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
if (USES_LIST(left_type[LTOP], list)) {
const int b_xy = h->mb2b_xy[left_xy[LTOP]] + 3;
const int b8_xy = 4 * left_xy[LTOP] + 1;
int (*ref2frm)[64] = sl->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
int (*ref2frm)[64] = h->ref2frm[h->slice_table[left_xy[LTOP]] & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
AV_COPY32(mv_dst - 1 + 0, h->cur_pic.motion_val[list][b_xy + b_stride * 0]);
AV_COPY32(mv_dst - 1 + 8, h->cur_pic.motion_val[list][b_xy + b_stride * 1]);
AV_COPY32(mv_dst - 1 + 16, h->cur_pic.motion_val[list][b_xy + b_stride * 2]);
@ -1645,7 +1637,7 @@ static av_always_inline void fill_filter_caches_inter(const H264Context *h,
{
int8_t *ref = &h->cur_pic.ref_index[list][4 * mb_xy];
int (*ref2frm)[64] = sl->ref2frm[sl->slice_num & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
int (*ref2frm)[64] = h->ref2frm[sl->slice_num & (MAX_SLICES - 1)][0] + (MB_MBAFF(sl) ? 20 : 2);
uint32_t ref01 = (pack16to32(ref2frm[list][ref[0]], ref2frm[list][ref[1]]) & 0x00FF00FF) * 0x0101;
uint32_t ref23 = (pack16to32(ref2frm[list][ref[2]], ref2frm[list][ref[3]]) & 0x00FF00FF) * 0x0101;
AV_WN32A(&ref_cache[0 * 8], ref01);
@ -1820,6 +1812,9 @@ static void loop_filter(const H264Context *h, H264SliceContext *sl, int start_x,
const int pixel_shift = h->pixel_shift;
const int block_h = 16 >> h->chroma_y_shift;
if (h->postpone_filter)
return;
if (sl->deblocking_filter) {
for (mb_x = start_x; mb_x < end_x; mb_x++)
for (mb_y = end_mb_y - FRAME_MBAFF(h); mb_y <= end_mb_y; mb_y++) {
@ -1944,6 +1939,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
H264SliceContext *sl = arg;
const H264Context *h = sl->h264;
int lf_x_start = sl->mb_x;
int orig_deblock = sl->deblocking_filter;
int ret;
sl->linesize = h->cur_pic_ptr->f->linesize[0];
@ -1955,6 +1951,9 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
sl->mb_skip_run = -1;
if (h->postpone_filter)
sl->deblocking_filter = 0;
sl->is_complex = FRAME_MBAFF(h) || h->picture_structure != PICT_FRAME ||
avctx->codec_id != AV_CODEC_ID_H264 ||
(CONFIG_GRAY && (h->flags & AV_CODEC_FLAG_GRAY));
@ -2004,7 +2003,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
sl->mb_y, ER_MB_END);
if (sl->mb_x >= lf_x_start)
loop_filter(h, sl, lf_x_start, sl->mb_x + 1);
return 0;
goto finish;
}
if (ret < 0 || sl->cabac.bytestream > sl->cabac.bytestream_end + 2) {
av_log(h->avctx, AV_LOG_ERROR,
@ -2035,7 +2034,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
sl->mb_y, ER_MB_END);
if (sl->mb_x > lf_x_start)
loop_filter(h, sl, lf_x_start, sl->mb_x);
return 0;
goto finish;
}
}
} else {
@ -2089,7 +2088,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
sl->mb_x - 1, sl->mb_y, ER_MB_END);
return 0;
goto finish;
} else {
er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y,
sl->mb_x - 1, sl->mb_y, ER_MB_END);
@ -2109,7 +2108,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
if (sl->mb_x > lf_x_start)
loop_filter(h, sl, lf_x_start, sl->mb_x);
return 0;
goto finish;
} else {
er_add_slice(sl, sl->resync_mb_x, sl->resync_mb_y, sl->mb_x,
sl->mb_y, ER_MB_ERROR);
@ -2119,6 +2118,10 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg)
}
}
}
finish:
sl->deblocking_filter = orig_deblock;
return 0;
}
/**
@ -2139,6 +2142,7 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
int ret;
h->slice_ctx[0].next_slice_idx = h->mb_width * h->mb_height;
h->postpone_filter = 0;
ret = decode_slice(avctx, &h->slice_ctx[0]);
h->mb_y = h->slice_ctx[0].mb_y;
@ -2172,6 +2176,24 @@ int ff_h264_execute_decode_slices(H264Context *h, unsigned context_count)
h->mb_y = sl->mb_y;
for (i = 1; i < context_count; i++)
h->slice_ctx[0].er.error_count += h->slice_ctx[i].er.error_count;
if (h->postpone_filter) {
h->postpone_filter = 0;
for (i = 0; i < context_count; i++) {
int y_end, x_end;
sl = &h->slice_ctx[i];
y_end = FFMIN(sl->mb_y + 1, h->mb_height);
x_end = (sl->mb_y >= h->mb_height) ? h->mb_width : sl->mb_x;
for (j = sl->resync_mb_y; j < y_end; j += 1 + FIELD_OR_MBAFF_PICTURE(h)) {
sl->mb_y = j;
loop_filter(h, sl, j > sl->resync_mb_y ? 0 : sl->resync_mb_x,
j == y_end - 1 ? x_end : h->mb_width);
}
}
}
}
return 0;