Skip to content

Commit

Permalink
Backport cbi buffer optimization from dav1d 1.3.0 (#1121)
Browse files Browse the repository at this point in the history
Account for chroma subsampling when allocating cbi buffers.

Reduces memory usage (by 3 kB per sb128 for 4:2:0) when decoding streams
with subsampled chroma while frame threading is enabled.
  • Loading branch information
fbossen authored May 28, 2024
2 parents a8ee17e + 7b3e1ac commit 1aae512
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 49 deletions.
21 changes: 17 additions & 4 deletions src/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2484,6 +2484,9 @@ static void setup_tile(Dav1dTileState *const ts,
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
NULL;
ts->frame_thread[p].cbi = f->frame_thread.cbi ?
&f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
NULL;
ts->frame_thread[p].cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
Expand Down Expand Up @@ -2855,6 +2858,19 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
}

const int cbi_sz = num_sb128 * size_mul[0];
if (cbi_sz != f->frame_thread.cbi_sz) {
dav1d_free_aligned(f->frame_thread.cbi);
f->frame_thread.cbi =
dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
cbi_sz * 32 * 32 / 4, 64);
if (!f->frame_thread.cbi) {
f->frame_thread.cbi_sz = 0;
return retval;
}
f->frame_thread.cbi_sz = cbi_sz;
}

const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_freep_aligned(&f->frame_thread.cf);
Expand Down Expand Up @@ -3012,12 +3028,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (c->n_fc > 1) {
freep(&f->frame_thread.b);
freep(&f->frame_thread.cbi);
f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
if (!f->frame_thread.b || !f->frame_thread.cbi) {
if (!f->frame_thread.b) {
f->lf.mask_sz = 0;
return retval;
}
Expand Down
19 changes: 14 additions & 5 deletions src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3991,6 +3991,14 @@ fn setup_tile(
},
Ordering::Relaxed,
);
ts.frame_thread[p].cbi_idx.store(
if !frame_thread.cbi.is_empty() {
tile_start_off * size_mul[0] as usize / 64
} else {
0
},
Ordering::Relaxed,
);
ts.frame_thread[p].cf.store(
if !frame_thread.cf.is_empty() {
let bpc = BPC::from_bitdepth_max(bitdepth_max);
Expand Down Expand Up @@ -4472,6 +4480,12 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext)
}
}

let cbi_sz = num_sb128 * size_mul[0] as c_int;
// TODO: Fallible allocation
f.frame_thread
.cbi
.resize_with(cbi_sz as usize * 32 * 32 / 4, Default::default);

let cf_sz = (num_sb128 * size_mul[0] as c_int) << hbd;
// TODO: Fallible allocation
f.frame_thread
Expand Down Expand Up @@ -4608,11 +4622,6 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext)
f.frame_thread
.b
.resize_with(num_sb128 as usize * 32 * 32, Default::default);

// TODO: fallible allocation
f.frame_thread
.cbi
.resize_with(num_sb128 as usize * 32 * 32, Default::default);
}

f.sr_sb128w = f.sr_cur.p.p.w + 127 >> 7;
Expand Down
5 changes: 3 additions & 2 deletions src/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,14 @@ struct Dav1dFrameContext {
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
int16_t (*cbi)[3 /* plane */]; /* bits 0-4: txtp, bits 5-15: eob */
int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
// indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
pixel (*pal)[3 /* plane */][8 /* idx */];
// iterated over inside tile state
uint8_t *pal_idx;
coef *cf;
int prog_sz;
int pal_sz, pal_idx_sz, cf_sz;
int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
// start offsets per tile
unsigned *tile_start_off;
} frame_thread;
Expand Down Expand Up @@ -364,6 +364,7 @@ struct Dav1dTileState {
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct Dav1dTileState_frame_thread {
uint8_t *pal_idx;
int16_t *cbi;
coef *cf;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];

Expand Down
3 changes: 2 additions & 1 deletion src/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ pub struct Rav1dFrameContext_frame_thread {
/// Indexed using `t.b.y * f.b4_stride + t.b.x`.
pub b: DisjointMut<Vec<Av1Block>>,

pub cbi: Vec<[Atomic<CodedBlockInfo>; 3]>,
pub cbi: Vec<Atomic<CodedBlockInfo>>,

/// Indexed using `(t.b.y >> 1) * (f.b4_stride >> 1) + (t.b.x >> 1)`.
/// Inner indices are `[3 plane][8 idx]`.
Expand Down Expand Up @@ -899,6 +899,7 @@ pub struct Rav1dTileState_tiling {
#[repr(C)]
pub struct Rav1dTileState_frame_thread {
pub pal_idx: AtomicUsize, // Offset into `f.frame_thread.pal_idx`
pub cbi_idx: AtomicUsize, // Offset into `f.frame_thread.cbi`
pub cf: AtomicUsize, // Offset into `f.frame_thread.cf`
}

Expand Down
2 changes: 1 addition & 1 deletion src/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,11 +635,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (c->n_fc > 1) {
freep(&f->tile_thread.lowest_pixel_mem);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.cbi);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
Expand Down
38 changes: 19 additions & 19 deletions src/recon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,6 @@ unsafe fn read_coef_tree<BD: BitDepth>(
let mut cf_ctx = 0;
let eob;
let cf;
let mut cbi_idx = 0;

if t.frame_thread.pass != 0 {
let p = t.frame_thread.pass & 1;
Expand All @@ -1707,7 +1706,6 @@ unsafe fn read_coef_tree<BD: BitDepth>(
cf_idx + cmp::min(t_dim.w, 8) as usize * cmp::min(t_dim.h, 8) as usize * 16,
Ordering::Relaxed,
);
cbi_idx = (t.b.y as isize * f.b4_stride + t.b.x as isize) as usize;
} else {
cf = CfSelect::Task;
}
Expand Down Expand Up @@ -1755,11 +1753,13 @@ unsafe fn read_coef_tree<BD: BitDepth>(
}
});
if t.frame_thread.pass == 1 {
f.frame_thread.cbi[cbi_idx][0]
let cbi_idx = ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
}
} else {
let cbi = f.frame_thread.cbi[cbi_idx][0].load(Ordering::Relaxed);
let cbi_idx = ts.frame_thread[0].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
}
Expand Down Expand Up @@ -1875,7 +1875,6 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
y = init_y;
t.b.y += init_y as c_int;
while y < sub_h4 {
let cbi_idx = t.b.y as usize * f.b4_stride as usize;
let mut x_off = (init_x != 0) as c_int;
x = init_x;
t.b.x += init_x as c_int;
Expand Down Expand Up @@ -1928,7 +1927,9 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
intra.tx, txtp, eob, ts_c.msac.rng,
);
}
f.frame_thread.cbi[cbi_idx..][t.b.x as usize][0]
let cbi_idx =
ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
ts.frame_thread[1].cf.store(
cf_idx
Expand Down Expand Up @@ -1971,7 +1972,6 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
y = init_y >> ss_ver;
t.b.y += init_y as c_int;
while y < sub_ch4 {
let cbi_idx = t.b.y as usize * f.b4_stride as usize;
x = init_x >> ss_hor;
t.b.x += init_x as c_int;
while x < sub_cw4 {
Expand Down Expand Up @@ -2016,7 +2016,8 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
pl, b.uvtx, txtp, eob, ts_c.msac.rng,
);
}
f.frame_thread.cbi[cbi_idx..][t.b.x as usize][(1 + pl) as usize]
let cbi_idx = ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
ts.frame_thread[1].cf.store(
cf_idx + uv_t_dim.w as usize * uv_t_dim.h as usize * 16,
Expand Down Expand Up @@ -2694,9 +2695,9 @@ pub(crate) unsafe fn rav1d_recon_b_intra<BD: BitDepth>(
cf_guard = f.frame_thread.cf.mut_slice_as(cf_idx..cf_idx + len);
cf = &mut *cf_guard;
ts.frame_thread[p].cf.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[t.b.y as usize * f.b4_stride as usize + t.b.x as usize][0]
.load(Ordering::Relaxed);
let cbi_idx =
ts.frame_thread[p].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down Expand Up @@ -3128,10 +3129,9 @@ pub(crate) unsafe fn rav1d_recon_b_intra<BD: BitDepth>(
cf_guard = f.frame_thread.cf.mut_slice_as(cf_idx..cf_idx + len);
cf = &mut *cf_guard;
ts.frame_thread[p].cf.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[t.b.y as usize * f.b4_stride as usize + t.b.x as usize]
[pl + 1]
.load(Ordering::Relaxed);
let cbi_idx =
ts.frame_thread[p].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down Expand Up @@ -4083,10 +4083,10 @@ pub(crate) unsafe fn rav1d_recon_b_inter<BD: BitDepth>(
ts.frame_thread[p as usize]
.cf
.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[(t.b.y as isize * f.b4_stride + t.b.x as isize) as usize]
[(1 + pl) as usize]
.load(Ordering::Relaxed);
let cbi_idx = ts.frame_thread[p as usize]
.cbi_idx
.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down
26 changes: 9 additions & 17 deletions src/recon_tmpl.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t,
uint8_t cf_ctx;
int eob;
coef *cf;
int16_t *cbi;

if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
} else {
cf = bitfn(t->cf);
}
Expand All @@ -804,10 +802,11 @@ static void read_coef_tree(Dav1dTaskContext *const t,
case_set_upto16(txw,,,);
#undef set_ctx
if (t->frame_thread.pass == 1)
cbi[0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
} else {
eob = cbi[0] >> 5;
txtp = cbi[0] & 0x1f;
const int cbi = *ts->frame_thread[0].cbi++;
eob = cbi >> 5;
txtp = cbi & 0x1f;
}
if (!(t->frame_thread.pass & 1)) {
assert(dst);
Expand Down Expand Up @@ -872,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h, y_off++)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
int x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w, x_off++)
Expand All @@ -891,7 +888,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx][0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
Expand All @@ -917,8 +914,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
Expand All @@ -936,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx][pl + 1] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
Expand Down Expand Up @@ -1320,10 +1315,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][0];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down Expand Up @@ -1544,10 +1538,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
coef *cf;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down Expand Up @@ -1994,10 +1987,9 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down

0 comments on commit 1aae512

Please sign in to comment.