shithub: libvpx

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -53,10 +53,6 @@

   { 10, 6 }, { 8, 5 },

};

-#define USE_GREEDY_OPTIMIZE_B 1

-#if USE_GREEDY_OPTIMIZE_B

 // 'num' can be negative, but 'shift' must be non-negative.

 #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \

   ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))

@@ -305,285 +301,6 @@

   return final_eob;

 #undef RIGHT_SHIFT_POSSIBLY_NEGATIVE

-#else

-#define UPDATE_RD_COST()                             \

-  {                                                  \

-    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \

-    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \

-  }

-// This function is a place holder for now but may ultimately need

-// to scan previous tokens to work out the correct context.

-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,

-                                     int idx, int token, uint8_t *token_cache) {

-  int bak = token_cache[scan[idx]], pt;

-  token_cache[scan[idx]] = vp9_pt_energy_class[token];

-  pt = get_coef_context(nb, token_cache, idx + 1);

-  token_cache[scan[idx]] = bak;

-  return pt;

-}

-static const int16_t band_count_table[TX_SIZES][8] = {

-  { 1, 2, 3, 4, 3, 16 - 13, 0 },

-  { 1, 2, 3, 4, 11, 64 - 21, 0 },

-  { 1, 2, 3, 4, 11, 256 - 21, 0 },

-  { 1, 2, 3, 4, 11, 1024 - 21, 0 },

-};

-static const int16_t band_cum_count_table[TX_SIZES][8] = {

-  { 0, 1, 3, 6, 10, 13, 16, 0 },

-  { 0, 1, 3, 6, 10, 21, 64, 0 },

-  { 0, 1, 3, 6, 10, 21, 256, 0 },

-  { 0, 1, 3, 6, 10, 21, 1024, 0 },

-};

-typedef struct vp9_token_state {

-  int64_t error;

-  int rate;

-  int16_t next;

-  int16_t token;

-  tran_low_t qc;

-  tran_low_t dqc;

-  uint8_t best_index;

-} vp9_token_state;

-int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,

-                   int ctx) {

-  MACROBLOCKD *const xd = &mb->e_mbd;

-  struct macroblock_plane *const p = &mb->plane[plane];

-  struct macroblockd_plane *const pd = &xd->plane[plane];

-  const int ref = is_inter_block(xd->mi[0]);

-  vp9_token_state tokens[1025][2];

-  uint8_t token_cache[1024];

-  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);

-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

-  const int eob = p->eobs[block];

-  const PLANE_TYPE type = get_plane_type(plane);

-  const int default_eob = 16 << (tx_size << 1);

-  const int shift = (tx_size == TX_32X32);

-  const int16_t *const dequant_ptr = pd->dequant;

-  const uint8_t *const band_translate = get_band_translate(tx_size);

-  const scan_order *const so = get_scan(xd, tx_size, type, block);

-  const int16_t *const scan = so->scan;

-  const int16_t *const nb = so->neighbors;

-  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };

-  int next = eob, sz = 0;

-  const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1;

-  const int64_t rddiv = mb->rddiv;

-  int64_t rd_cost0, rd_cost1;

-  int rate0, rate1;

-  int64_t error0, error1;

-  int16_t t0, t1;

-  int best, band = (eob < default_eob) ? band_translate[eob]

-                                       : band_translate[eob - 1];

-  int pt, i, final_eob;

-#if CONFIG_VP9_HIGHBITDEPTH

-  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);

-#else

-  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);

-#endif

-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =

-      mb->token_costs[tx_size][type][ref];

-  const int16_t *band_counts = &band_count_table[tx_size][band];

-  int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;

-  token_costs += band;

-  assert((!type && !plane) || (type && plane));

-  assert(eob <= default_eob);

-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */

-  /* Initialize the sentinel node of the trellis. */

-  tokens[eob][0].rate = 0;

-  tokens[eob][0].error = 0;

-  tokens[eob][0].next = default_eob;

-  tokens[eob][0].token = EOB_TOKEN;

-  tokens[eob][0].qc = 0;

-  tokens[eob][1] = tokens[eob][0];

-  for (i = 0; i < eob; i++) {

-    const int rc = scan[i];

-    token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];

-  }

-  for (i = eob; i-- > 0;) {

-    int base_bits, d2, dx;

-    const int rc = scan[i];

-    int x = qcoeff[rc];

-    /* Only add a trellis state for non-zero coefficients. */

-    if (x) {

-      error0 = tokens[next][0].error;

-      error1 = tokens[next][1].error;

-      /* Evaluate the first possibility for this state. */

-      rate0 = tokens[next][0].rate;

-      rate1 = tokens[next][1].rate;

-      base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);

-      /* Consider both possible successor states. */

-      if (next < default_eob) {

-        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);

-        rate0 += (*token_costs)[0][pt][tokens[next][0].token];

-        rate1 += (*token_costs)[0][pt][tokens[next][1].token];

-      }

-      UPDATE_RD_COST();

-      /* And pick the best. */

-      best = rd_cost1 < rd_cost0;

-      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);

-#if CONFIG_VP9_HIGHBITDEPTH

-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

-        dx >>= xd->bd - 8;

-      }

-#endif  // CONFIG_VP9_HIGHBITDEPTH

-      d2 = dx * dx;

-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);

-      tokens[i][0].error = d2 + (best ? error1 : error0);

-      tokens[i][0].next = next;

-      tokens[i][0].token = t0;

-      tokens[i][0].qc = x;

-      tokens[i][0].dqc = dqcoeff[rc];

-      tokens[i][0].best_index = best;

-      /* Evaluate the second possibility for this state. */

-      rate0 = tokens[next][0].rate;

-      rate1 = tokens[next][1].rate;

-      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&

-          (abs(x) * dequant_ptr[rc != 0] <

-           (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {

-        sz = -(x < 0);

-        x -= 2 * sz + 1;

-      } else {

-        tokens[i][1] = tokens[i][0];

-        next = i;

-        if (!(--band_left)) {

-          --band_counts;

-          band_left = *band_counts;

-          --token_costs;

-        }

-        continue;

-      }

-      /* Consider both possible successor states. */

-      if (!x) {

-        /* If we reduced this coefficient to zero, check to see if

-         *  we need to move the EOB back here.

-         */

-        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;

-        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;

-        base_bits = 0;

-      } else {

-        base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);

-        t1 = t0;

-      }

-      if (next < default_eob) {

-        if (t0 != EOB_TOKEN) {

-          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);

-          rate0 += (*token_costs)[!x][pt][tokens[next][0].token];

-        }

-        if (t1 != EOB_TOKEN) {

-          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);

-          rate1 += (*token_costs)[!x][pt][tokens[next][1].token];

-        }

-      }

-      UPDATE_RD_COST();

-      /* And pick the best. */

-      best = rd_cost1 < rd_cost0;

-#if CONFIG_VP9_HIGHBITDEPTH

-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

-        dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;

-      } else {

-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;

-      }

-#else

-      dx -= (dequant_ptr[rc != 0] + sz) ^ sz;

-#endif  // CONFIG_VP9_HIGHBITDEPTH

-      d2 = dx * dx;

-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);

-      tokens[i][1].error = d2 + (best ? error1 : error0);

-      tokens[i][1].next = next;

-      tokens[i][1].token = best ? t1 : t0;

-      tokens[i][1].qc = x;

-      if (x) {

-        tran_low_t offset = dq_step[rc != 0];

-        // The 32x32 transform coefficient uses half quantization step size.

-        // Account for the rounding difference in the dequantized coefficeint

-        // value when the quantization index is dropped from an even number

-        // to an odd number.

-        if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);

-        if (sz == 0)

-          tokens[i][1].dqc = dqcoeff[rc] - offset;

-        else

-          tokens[i][1].dqc = dqcoeff[rc] + offset;

-      } else {

-        tokens[i][1].dqc = 0;

-      }

-      tokens[i][1].best_index = best;

-      /* Finally, make this the new head of the trellis. */

-      next = i;

-    } else {

-      /* There's no choice to make for a zero coefficient, so we don't

-       *  add a new trellis node, but we do need to update the costs.

-       */

-      pt = get_coef_context(nb, token_cache, i + 1);

-      t0 = tokens[next][0].token;

-      t1 = tokens[next][1].token;

-      /* Update the cost of each path if we're past the EOB token. */

-      if (t0 != EOB_TOKEN) {

-        tokens[next][0].rate += (*token_costs)[1][pt][t0];

-        tokens[next][0].token = ZERO_TOKEN;

-      }

-      if (t1 != EOB_TOKEN) {

-        tokens[next][1].rate += (*token_costs)[1][pt][t1];

-        tokens[next][1].token = ZERO_TOKEN;

-      }

-      tokens[i][0].best_index = tokens[i][1].best_index = 0;

-      /* Don't update next, because we didn't add a new node. */

-    }

-    if (!(--band_left)) {

-      --band_counts;

-      band_left = *band_counts;

-      --token_costs;

-    }

-  }

-  /* Now pick the best path through the whole trellis. */

-  rate0 = tokens[next][0].rate;

-  rate1 = tokens[next][1].rate;

-  error0 = tokens[next][0].error;

-  error1 = tokens[next][1].error;

-  t0 = tokens[next][0].token;

-  t1 = tokens[next][1].token;

-  rate0 += (*token_costs)[0][ctx][t0];

-  rate1 += (*token_costs)[0][ctx][t1];

-  UPDATE_RD_COST();

-  best = rd_cost1 < rd_cost0;

-  final_eob = -1;

-  for (i = next; i < eob; i = next) {

-    const int x = tokens[i][best].qc;

-    const int rc = scan[i];

-    if (x) final_eob = i;

-    qcoeff[rc] = x;

-    dqcoeff[rc] = tokens[i][best].dqc;

-    next = tokens[i][best].next;

-    best = tokens[i][best].best_index;

-  }

-  final_eob++;

-  mb->plane[plane].eobs[block] = final_eob;

-  return final_eob;

-}

-#endif  // USE_GREEDY_OPTIMIZE_B

 static INLINE void fdct32x32(int rd_transform, const int16_t *src,

                              tran_low_t *dst, int src_stride) {