shithub: libvpx

Download patch

ref: 5ade423774690e2bf877559dc1a1f9547db5dac1
parent: e5ed605f01f8b75a8e7db1561bdbb373f8040de3
author: Deb Mukherjee <[email protected]>
date: Tue Nov 5 12:25:38 EST 2013

Removes conditional statements from coefficient band lookup

Implements the scan-order-to-band map as plain array lookups in both the
encoder and decoder, so that getting a band needs no conditional statements.

Encoding seems to be about 1% faster at speed 0, tested on football.
Decoding seems to be about 0.5-1% faster on a set of 25 videos.

Change-Id: Idb233ca0b9e0efd790e30880642e8717e1c5c8dd

--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -127,12 +127,6 @@
 extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
 extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
 
-
-static int get_coef_band(const uint8_t * band_translate, int coef_index) {
-  return (coef_index > MAXBAND_INDEX)
-    ? (COEF_BANDS-1) : band_translate[coef_index];
-}
-
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
@@ -179,11 +173,6 @@
   }
 
   return combine_entropy_contexts(above_ec, left_ec);
-}
-
-static const uint8_t *get_band_translate(TX_SIZE tx_size) {
-  return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
-                           : vp9_coefband_trans_8x8plus;
 }
 
 static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -45,6 +45,7 @@
   DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
+  const uint8_t *band_translate[2];
 } TileWorkerData;
 
 static int read_be32(const uint8_t *p) {
@@ -294,7 +295,8 @@
   VP9_COMMON *cm;
   MACROBLOCKD *xd;
   vp9_reader *r;
-  unsigned char* token_cache;
+  uint8_t *token_cache;
+  const uint8_t *band_translate[2];
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -303,6 +305,9 @@
   struct intra_args *const args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
+  const uint8_t *band_translate[2] = {
+    args->band_translate[0], args->band_translate[1]
+  };
 
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
@@ -324,7 +329,7 @@
 
   if (!mi->mbmi.skip_coeff) {
     vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
-                            args->r, args->token_cache);
+                            args->r, args->token_cache, band_translate);
     inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
   }
 }
@@ -334,7 +339,8 @@
   MACROBLOCKD *xd;
   vp9_reader *r;
   int *eobtotal;
-  unsigned char* token_cache;
+  uint8_t *token_cache;
+  const uint8_t *band_translate[2];
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -343,10 +349,14 @@
   struct inter_args *args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
+  const uint8_t *band_translate[2] = {
+    args->band_translate[0], args->band_translate[1]
+  };
 
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
                                              plane_bsize, tx_size,
-                                             args->r, args->token_cache);
+                                             args->r, args->token_cache,
+                                             band_translate);
   inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
 }
 
@@ -398,7 +408,8 @@
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
                            vp9_reader *r, BLOCK_SIZE bsize,
-                           unsigned char *token_cache) {
+                           uint8_t *token_cache,
+                           const uint8_t *band_translate[2]) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -420,7 +431,9 @@
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = { cm, xd, r, token_cache };
+    struct intra_args arg = {
+      cm, xd, r, token_cache, {band_translate[0], band_translate[1]}
+    };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
   } else {
@@ -438,7 +451,10 @@
     // Reconstruction
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
-      struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+      struct inter_args arg = {
+        cm, xd, r, &eobtotal, token_cache,
+        {band_translate[0], band_translate[1]}
+      };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
         mbmi->skip_coeff = 1;  // skip loopfilter
@@ -478,7 +494,8 @@
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
                             vp9_reader* r, BLOCK_SIZE bsize,
-                            unsigned char *token_cache) {
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -489,33 +506,37 @@
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                   band_translate);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         if (mi_row + hbs < cm->mi_rows)
           decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                         token_cache);
+                         token_cache, band_translate);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         if (mi_col + hbs < cm->mi_cols)
           decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                         token_cache);
+                         token_cache, band_translate);
         break;
       case PARTITION_SPLIT:
         decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         break;
       default:
         assert(!"Invalid partition type");
@@ -798,9 +819,13 @@
     vp9_zero(xd->left_context);
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE)
+         mi_col += MI_BLOCK_SIZE) {
+      const uint8_t *band_translate[2] = {
+        vp9_coefband_trans_4x4, pbi->coefband_trans_8x8plus
+      };
       decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache);
+                      pbi->token_cache, band_translate);
+    }
 
     if (pbi->do_loopfilter_inline) {
       const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -948,7 +973,7 @@
 }
 
 static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *tile_data = (TileWorkerData*)arg1;
+  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
   const TileInfo *const tile = (TileInfo*)arg2;
   int mi_row, mi_col;
 
@@ -960,7 +985,8 @@
          mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
                       mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
-                      tile_data->token_cache);
+                      tile_data->token_cache,
+                      tile_data->band_translate);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1019,6 +1045,8 @@
       tile_data->cm = cm;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
+      tile_data->band_translate[0] = vp9_coefband_trans_4x4;
+      tile_data->band_translate[1] = pbi->coefband_trans_8x8plus;
       vp9_tile_init(tile, tile_data->cm, 0, tile_col);
 
       setup_token_decoder(data, data_end, size, &cm->error,
@@ -1298,6 +1326,13 @@
   const int tile_rows = 1 << cm->log2_tile_rows;
   const int tile_cols = 1 << cm->log2_tile_cols;
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+
+  vpx_memset(pbi->coefband_trans_8x8plus,
+             (COEF_BANDS - 1),
+             sizeof(pbi->coefband_trans_8x8plus));
+  vpx_memcpy(pbi->coefband_trans_8x8plus,
+             vp9_coefband_trans_8x8plus,
+             sizeof(vp9_coefband_trans_8x8plus));
 
   if (!first_partition_size) {
       // showing a frame directly
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -93,7 +93,8 @@
                         vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
                         TX_SIZE tx_size, const int16_t *dq, int pt,
-                        uint8_t *token_cache) {
+                        uint8_t *token_cache,
+                        const uint8_t *band_translate) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -108,22 +109,20 @@
   unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
       counts->eob_branch[tx_size][type][ref];
   const int16_t *scan, *nb;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *cat6;
   get_scan(xd, tx_size, type, block_idx, &scan, &nb);
 
-  while (1) {
+  while (c < seg_eob) {
     int val;
-    const uint8_t *cat6 = cat6_prob;
-    if (c >= seg_eob)
-      break;
     if (c)
       pt = get_coef_context(nb, token_cache, c);
-    band = get_coef_band(band_translate, c);
+    band = *band_translate++;
     prob = coef_probs[band][pt];
     if (!cm->frame_parallel_decoding_mode)
       ++eob_branch_count[band][pt];
     if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
+    goto DECODE_ZERO;
 
   SKIP_START:
     if (c >= seg_eob)
@@ -130,9 +129,10 @@
       break;
     if (c)
       pt = get_coef_context(nb, token_cache, c);
-    band = get_coef_band(band_translate, c);
+    band = *band_translate++;
     prob = coef_probs[band][pt];
 
+  DECODE_ZERO:
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
@@ -200,6 +200,7 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5);
     }
     val = 0;
+    cat6 = cat6_prob;
     while (*cat6) {
       val = (val << 1) | vp9_read(r, *cat6++);
     }
@@ -218,7 +219,8 @@
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache) {
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
@@ -229,7 +231,8 @@
 
   eob = decode_coefs(cm, xd, r, block,
                      pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
-                     tx_size, pd->dequant, pt, token_cache);
+                     tx_size, pd->dequant, pt, token_cache,
+                     band_translate[tx_size != TX_4X4]);
 
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
 
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -18,6 +18,7 @@
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache);
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -54,7 +54,8 @@
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   PARTITION_CONTEXT *above_seg_context;
 
-  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
+  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -184,6 +184,9 @@
   BLOCK_SIZE sb64_partitioning;
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+
+  // band cache
+  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 };
 
 // TODO(jingning): the variables used here are little complicated. need further
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -138,7 +138,9 @@
   uint8_t token_cache[1024];
   const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
   const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
+                                         vp9_coefband_trans_4x4 :
+                                         mb->coefband_trans_8x8plus);
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -179,7 +181,7 @@
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
+        band = band_translate[i + 1];
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
         rate0 +=
           mb->token_costs[tx_size][type][ref][band][0][pt]
@@ -230,7 +232,7 @@
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
+        band = band_translate[i + 1];
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
           rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
@@ -264,7 +266,7 @@
       /* There's no choice to make for a zero coefficient, so we don't
        *  add a new trellis node, but we do need to update the costs.
        */
-      band = get_coef_band(band_translate, i + 1);
+      band = band_translate[i + 1];
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -284,7 +286,7 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(band_translate, i + 1);
+  band = band_translate[i + 1];
   pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1223,6 +1223,13 @@
   cpi->fixed_divide[0] = 0;
   for (i = 1; i < 512; i++)
     cpi->fixed_divide[i] = 0x80000 / i;
+
+  vpx_memset(cpi->mb.coefband_trans_8x8plus,
+             (COEF_BANDS-1),
+             sizeof(cpi->mb.coefband_trans_8x8plus));
+  vpx_memcpy(cpi->mb.coefband_trans_8x8plus,
+             vp9_coefband_trans_8x8plus,
+             sizeof(vp9_coefband_trans_8x8plus));
 }
 
 
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -115,7 +115,9 @@
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
+                                         vp9_coefband_trans_4x4 :
+                                         cpi->mb.coefband_trans_8x8plus);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -127,7 +129,7 @@
   get_scan(xd, tx_size, type, block, &scan, &nb);
   c = 0;
   do {
-    const int band = get_coef_band(band_translate, c);
+    const int band = band_translate[c];
     int token;
     int v = 0;
     rc = scan[c];