ref: e446ffda45b61f0900a8063a004c7ceef0b4ffa2
parent: 4508eb3123bd9b65a099715ea143680d268ad2ff
author: Alex Converse <[email protected]>
date: Fri Jul 29 08:06:49 EDT 2016
Cache optimizations in optimize_b(). Move best index into the token state. Shrink it down to one byte. This is more cache friendly (accesses are grouped together) and uses less total memory. Results in 4% fewer cycles in optimize_b(). Change-Id: I75db484fb3dc82f59928d54b659d79c80ee40452
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -57,6 +57,7 @@
int16_t token;
tran_low_t qc;
tran_low_t dqc;
+ uint8_t best_index;
} vp9_token_state;
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {10, 6}, {8, 5}, };
@@ -87,7 +88,6 @@
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ref = is_inter_block(xd->mi[0]);
vp9_token_state tokens[1025][2];
- unsigned best_index[1025][2];
uint8_t token_cache[1024];
const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -172,7 +172,7 @@
tokens[i][0].token = t0;
tokens[i][0].qc = x;
tokens[i][0].dqc = dqcoeff[rc];
- best_index[i][0] = best;
+ tokens[i][0].best_index = best;
/* Evaluate the second possibility for this state. */
rate0 = tokens[next][0].rate;
@@ -190,7 +190,6 @@
x -= 2 * sz + 1;
} else {
tokens[i][1] = tokens[i][0];
- best_index[i][1] = best_index[i][0];
next = i;
continue;
}
@@ -261,7 +260,7 @@
tokens[i][1].dqc = 0;
}
- best_index[i][1] = best;
+ tokens[i][1].best_index = best;
/* Finally, make this the new head of the trellis. */
next = i;
} else {
@@ -283,7 +282,7 @@
mb->token_costs[tx_size][type][ref][band][1][pt][t1];
tokens[next][1].token = ZERO_TOKEN;
}
- best_index[i][0] = best_index[i][1] = 0;
+ tokens[i][0].best_index = tokens[i][1].best_index = 0;
/* Don't update next, because we didn't add a new node. */
}
}
@@ -309,7 +308,7 @@
qcoeff[rc] = x;
dqcoeff[rc] = tokens[i][best].dqc;
next = tokens[i][best].next;
- best = best_index[i][best];
+ best = tokens[i][best].best_index;
}
final_eob++;