Skip to content

Commit

Permalink
Backport cbi buffer optimization from dav1d 1.3.0 (#1121)
Browse files Browse the repository at this point in the history
Account for chroma subsampling when allocating cbi buffers.

Reduces memory usage (by 3 kB per sb128 for 4:2:0) when decoding streams
with subsampled chroma while frame threading is enabled.
  • Loading branch information
fbossen authored May 28, 2024
2 parents a8ee17e + 7b3e1ac commit 1aae512
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 49 deletions.
21 changes: 17 additions & 4 deletions src/decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2484,6 +2484,9 @@ static void setup_tile(Dav1dTileState *const ts,
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
NULL;
ts->frame_thread[p].cbi = f->frame_thread.cbi ?
&f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
NULL;
ts->frame_thread[p].cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
Expand Down Expand Up @@ -2855,6 +2858,19 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
}

const int cbi_sz = num_sb128 * size_mul[0];
if (cbi_sz != f->frame_thread.cbi_sz) {
dav1d_free_aligned(f->frame_thread.cbi);
f->frame_thread.cbi =
dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
cbi_sz * 32 * 32 / 4, 64);
if (!f->frame_thread.cbi) {
f->frame_thread.cbi_sz = 0;
return retval;
}
f->frame_thread.cbi_sz = cbi_sz;
}

const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_freep_aligned(&f->frame_thread.cf);
Expand Down Expand Up @@ -3012,12 +3028,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (c->n_fc > 1) {
freep(&f->frame_thread.b);
freep(&f->frame_thread.cbi);
f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
if (!f->frame_thread.b || !f->frame_thread.cbi) {
if (!f->frame_thread.b) {
f->lf.mask_sz = 0;
return retval;
}
Expand Down
19 changes: 14 additions & 5 deletions src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3991,6 +3991,14 @@ fn setup_tile(
},
Ordering::Relaxed,
);
ts.frame_thread[p].cbi_idx.store(
if !frame_thread.cbi.is_empty() {
tile_start_off * size_mul[0] as usize / 64
} else {
0
},
Ordering::Relaxed,
);
ts.frame_thread[p].cf.store(
if !frame_thread.cf.is_empty() {
let bpc = BPC::from_bitdepth_max(bitdepth_max);
Expand Down Expand Up @@ -4472,6 +4480,12 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext)
}
}

let cbi_sz = num_sb128 * size_mul[0] as c_int;
// TODO: Fallible allocation
f.frame_thread
.cbi
.resize_with(cbi_sz as usize * 32 * 32 / 4, Default::default);

let cf_sz = (num_sb128 * size_mul[0] as c_int) << hbd;
// TODO: Fallible allocation
f.frame_thread
Expand Down Expand Up @@ -4608,11 +4622,6 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext)
f.frame_thread
.b
.resize_with(num_sb128 as usize * 32 * 32, Default::default);

// TODO: fallible allocation
f.frame_thread
.cbi
.resize_with(num_sb128 as usize * 32 * 32, Default::default);
}

f.sr_sb128w = f.sr_cur.p.p.w + 127 >> 7;
Expand Down
5 changes: 3 additions & 2 deletions src/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,14 @@ struct Dav1dFrameContext {
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
int16_t (*cbi)[3 /* plane */]; /* bits 0-4: txtp, bits 5-15: eob */
int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
// indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
pixel (*pal)[3 /* plane */][8 /* idx */];
// iterated over inside tile state
uint8_t *pal_idx;
coef *cf;
int prog_sz;
int pal_sz, pal_idx_sz, cf_sz;
int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
// start offsets per tile
unsigned *tile_start_off;
} frame_thread;
Expand Down Expand Up @@ -364,6 +364,7 @@ struct Dav1dTileState {
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct Dav1dTileState_frame_thread {
uint8_t *pal_idx;
int16_t *cbi;
coef *cf;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];

Expand Down
3 changes: 2 additions & 1 deletion src/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ pub struct Rav1dFrameContext_frame_thread {
/// Indexed using `t.b.y * f.b4_stride + t.b.x`.
pub b: DisjointMut<Vec<Av1Block>>,

pub cbi: Vec<[Atomic<CodedBlockInfo>; 3]>,
pub cbi: Vec<Atomic<CodedBlockInfo>>,

/// Indexed using `(t.b.y >> 1) * (f.b4_stride >> 1) + (t.b.x >> 1)`.
/// Inner indices are `[3 plane][8 idx]`.
Expand Down Expand Up @@ -899,6 +899,7 @@ pub struct Rav1dTileState_tiling {
#[repr(C)]
pub struct Rav1dTileState_frame_thread {
pub pal_idx: AtomicUsize, // Offset into `f.frame_thread.pal_idx`
pub cbi_idx: AtomicUsize, // Offset into `f.frame_thread.cbi`
pub cf: AtomicUsize, // Offset into `f.frame_thread.cf`
}

Expand Down
2 changes: 1 addition & 1 deletion src/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,11 +635,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (c->n_fc > 1) {
freep(&f->tile_thread.lowest_pixel_mem);
freep(&f->frame_thread.b);
dav1d_freep_aligned(&f->frame_thread.cbi);
dav1d_freep_aligned(&f->frame_thread.pal_idx);
dav1d_freep_aligned(&f->frame_thread.cf);
freep(&f->frame_thread.tile_start_off);
dav1d_freep_aligned(&f->frame_thread.pal);
freep(&f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
Expand Down
38 changes: 19 additions & 19 deletions src/recon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,6 @@ unsafe fn read_coef_tree<BD: BitDepth>(
let mut cf_ctx = 0;
let eob;
let cf;
let mut cbi_idx = 0;

if t.frame_thread.pass != 0 {
let p = t.frame_thread.pass & 1;
Expand All @@ -1707,7 +1706,6 @@ unsafe fn read_coef_tree<BD: BitDepth>(
cf_idx + cmp::min(t_dim.w, 8) as usize * cmp::min(t_dim.h, 8) as usize * 16,
Ordering::Relaxed,
);
cbi_idx = (t.b.y as isize * f.b4_stride + t.b.x as isize) as usize;
} else {
cf = CfSelect::Task;
}
Expand Down Expand Up @@ -1755,11 +1753,13 @@ unsafe fn read_coef_tree<BD: BitDepth>(
}
});
if t.frame_thread.pass == 1 {
f.frame_thread.cbi[cbi_idx][0]
let cbi_idx = ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
}
} else {
let cbi = f.frame_thread.cbi[cbi_idx][0].load(Ordering::Relaxed);
let cbi_idx = ts.frame_thread[0].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
}
Expand Down Expand Up @@ -1875,7 +1875,6 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
y = init_y;
t.b.y += init_y as c_int;
while y < sub_h4 {
let cbi_idx = t.b.y as usize * f.b4_stride as usize;
let mut x_off = (init_x != 0) as c_int;
x = init_x;
t.b.x += init_x as c_int;
Expand Down Expand Up @@ -1928,7 +1927,9 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
intra.tx, txtp, eob, ts_c.msac.rng,
);
}
f.frame_thread.cbi[cbi_idx..][t.b.x as usize][0]
let cbi_idx =
ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
ts.frame_thread[1].cf.store(
cf_idx
Expand Down Expand Up @@ -1971,7 +1972,6 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
y = init_y >> ss_ver;
t.b.y += init_y as c_int;
while y < sub_ch4 {
let cbi_idx = t.b.y as usize * f.b4_stride as usize;
x = init_x >> ss_hor;
t.b.x += init_x as c_int;
while x < sub_cw4 {
Expand Down Expand Up @@ -2016,7 +2016,8 @@ pub(crate) unsafe fn rav1d_read_coef_blocks<BD: BitDepth>(
pl, b.uvtx, txtp, eob, ts_c.msac.rng,
);
}
f.frame_thread.cbi[cbi_idx..][t.b.x as usize][(1 + pl) as usize]
let cbi_idx = ts.frame_thread[1].cbi_idx.fetch_add(1, Ordering::Relaxed);
f.frame_thread.cbi[cbi_idx]
.store(CodedBlockInfo::new(eob as i16, txtp), Ordering::Relaxed);
ts.frame_thread[1].cf.store(
cf_idx + uv_t_dim.w as usize * uv_t_dim.h as usize * 16,
Expand Down Expand Up @@ -2694,9 +2695,9 @@ pub(crate) unsafe fn rav1d_recon_b_intra<BD: BitDepth>(
cf_guard = f.frame_thread.cf.mut_slice_as(cf_idx..cf_idx + len);
cf = &mut *cf_guard;
ts.frame_thread[p].cf.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[t.b.y as usize * f.b4_stride as usize + t.b.x as usize][0]
.load(Ordering::Relaxed);
let cbi_idx =
ts.frame_thread[p].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down Expand Up @@ -3128,10 +3129,9 @@ pub(crate) unsafe fn rav1d_recon_b_intra<BD: BitDepth>(
cf_guard = f.frame_thread.cf.mut_slice_as(cf_idx..cf_idx + len);
cf = &mut *cf_guard;
ts.frame_thread[p].cf.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[t.b.y as usize * f.b4_stride as usize + t.b.x as usize]
[pl + 1]
.load(Ordering::Relaxed);
let cbi_idx =
ts.frame_thread[p].cbi_idx.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down Expand Up @@ -4083,10 +4083,10 @@ pub(crate) unsafe fn rav1d_recon_b_inter<BD: BitDepth>(
ts.frame_thread[p as usize]
.cf
.store(cf_idx + len, Ordering::Relaxed);
let cbi = f.frame_thread.cbi
[(t.b.y as isize * f.b4_stride + t.b.x as isize) as usize]
[(1 + pl) as usize]
.load(Ordering::Relaxed);
let cbi_idx = ts.frame_thread[p as usize]
.cbi_idx
.fetch_add(1, Ordering::Relaxed);
let cbi = f.frame_thread.cbi[cbi_idx].load(Ordering::Relaxed);
eob = cbi.eob().into();
txtp = cbi.txtp();
} else {
Expand Down
26 changes: 9 additions & 17 deletions src/recon_tmpl.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t,
uint8_t cf_ctx;
int eob;
coef *cf;
int16_t *cbi;

if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
} else {
cf = bitfn(t->cf);
}
Expand All @@ -804,10 +802,11 @@ static void read_coef_tree(Dav1dTaskContext *const t,
case_set_upto16(txw,,,);
#undef set_ctx
if (t->frame_thread.pass == 1)
cbi[0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
} else {
eob = cbi[0] >> 5;
txtp = cbi[0] & 0x1f;
const int cbi = *ts->frame_thread[0].cbi++;
eob = cbi >> 5;
txtp = cbi & 0x1f;
}
if (!(t->frame_thread.pass & 1)) {
assert(dst);
Expand Down Expand Up @@ -872,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h, y_off++)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
int x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w, x_off++)
Expand All @@ -891,7 +888,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx][0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
Expand All @@ -917,8 +914,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
Expand All @@ -936,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx][pl + 1] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
Expand Down Expand Up @@ -1320,10 +1315,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][0];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down Expand Up @@ -1544,10 +1538,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
coef *cf;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down Expand Up @@ -1994,10 +1987,9 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
Expand Down

0 comments on commit 1aae512

Please sign in to comment.