ref: b1df674a99c37ff8dea0609d1f6d7864fa6a05f4
parent: 863204e64dbd8366b7e9a0f67f1f026473a0a145
author: Ronald S. Bultje <[email protected]>
date: Wed Jul 10 05:26:32 EDT 2013
Remove memcpy() in handle_inter_mode() filter selection. Encode time of first 50 frames of bus (speed 0) @ 1500kbps goes from 2min4.9 to 2min3.1, i.e. a 1.4% speedup overall. Change-Id: I9b25e87974430cb942caa276410bb2eda815bd83
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2574,11 +2574,14 @@
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
int64_t this_rd = 0;
- unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
int interpolating_intpel_seen = 0;
int intpel_mv;
int64_t rd, best_rd = INT64_MAX;
+ int best_needs_copy = 0;
+ uint8_t *orig_dst[MAX_MB_PLANE];
+ int orig_dst_stride[MAX_MB_PLANE];
switch (this_mode) {
int rate_mv;
@@ -2635,6 +2638,16 @@
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
+ // Do the first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, if the best predictor is
+ // in tmp_buf, point dst at tmp_buf instead of copying it back.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst[i] = xd->plane[i].dst.buf;
+ orig_dst_stride[i] = xd->plane[i].dst.stride;
+ }
+
/* We don't include the cost of the second reference here, because there
* are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
* words if you present them in that order, the second one is always known
@@ -2662,7 +2675,7 @@
cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- int rs;
+ int rs, j;
int64_t rs_rd;
const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
const int is_intpel_interp = intpel_mv &&
@@ -2684,6 +2697,21 @@
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
+ if ((cm->mcomp_filter_type == SWITCHABLE &&
+ (!i || best_needs_copy)) ||
+ (cm->mcomp_filter_type != SWITCHABLE &&
+ (cm->mcomp_filter_type == mbmi->interp_filter ||
+ (!interpolating_intpel_seen && is_intpel_interp)))) {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = orig_dst[j];
+ xd->plane[j].dst.stride = orig_dst_stride[j];
+ }
+ } else {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+ xd->plane[j].dst.stride = 64;
+ }
+ }
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
@@ -2704,27 +2732,23 @@
if (newbest) {
best_rd = rd;
*best_filter = mbmi->interp_filter;
+ if (cm->mcomp_filter_type == SWITCHABLE && i &&
+ !(interpolating_intpel_seen && is_intpel_interp))
+ best_needs_copy = !best_needs_copy;
}
if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
(cm->mcomp_filter_type != SWITCHABLE &&
cm->mcomp_filter_type == mbmi->interp_filter)) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- struct macroblockd_plane *pd = &xd->plane[p];
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- int i;
-
- for (i = 0; i < bh; i++)
- vpx_memcpy(&tmp_buf[p][64 * i], pd->dst.buf + i * pd->dst.stride,
- bw);
- }
pred_exists = 1;
}
interpolating_intpel_seen |= is_intpel_interp;
}
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
}
// Set the appripriate filter
@@ -2732,18 +2756,13 @@
cm->mcomp_filter_type : *best_filter;
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
if (pred_exists) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- struct macroblockd_plane *pd = &xd->plane[p];
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- int i;
-
- for (i = 0; i < bh; i++)
- vpx_memcpy(pd->dst.buf + i * pd->dst.stride, &tmp_buf[p][64 * i], bw);
+ if (best_needs_copy) {
+ // again temporarily set the buffers to local memory to prevent a memcpy
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+ xd->plane[i].dst.stride = 64;
+ }
}
} else {
// Handles the special case when a filter that is not in the
@@ -2817,6 +2836,10 @@
if (*rate_y == INT_MAX) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
return INT64_MAX;
}
@@ -2838,6 +2861,11 @@
} else {
*mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
}
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
}
return this_rd; // if 0, this will be re-calculated by caller