ref: 8c411f74e000457b43bc75093b5908819b34e394
parent: ba13ff8501c53c0575e7e814c2c4320987d2d24c
author: Jingning Han <[email protected]>
date: Mon Mar 23 06:02:42 EDT 2015
Hadamard transform based coding mode decision process. This commit uses a Hadamard transform based rate-distortion cost estimate for the rtc coding mode decision. It improves the compression performance of speed -6 for many hard clips at lower bit-rates: for example, by 5.5% for jimredvga, 6.7% for mmmoving, and 6.1% for niklas720p. This will introduce extra encoding cycle costs at this point. Change-Id: Iaf70634fa2417a705ee29f2456175b981db3d375
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1109,6 +1109,15 @@
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/;
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
+specialize qw/vp9_hadamard_16x16/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,87 @@
return (sum + 8) >> 4;
}
+/* Eight-point add/subtract butterfly over one strided column.
+ *
+ * Reads the eight samples src_diff[k * src_stride] for k = 0..7, applies
+ * three add/subtract stages, and writes the eight results to coeff[0..7].
+ * Stage 1 (b0..b7) forms the sum and difference of each adjacent pair of
+ * samples. Every intermediate is declared int16_t, so each sum or difference
+ * is narrowed back to 16 bits before the next stage uses it.
+ */ static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+ // Stage 2: within b0..b3 and within b4..b7, combine terms two positions apart.
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+ // Stage 3: combine c[k] with c[k + 4]; results are stored to coeff[] slots 0, 7, 3, 4, 2, 6, 1, 5 in that order.
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+/* Plain-C 8x8 transform built from two passes of hadamard_col8().
+ *
+ * src_diff:   first sample of the 8x8 input block.
+ * src_stride: distance, in int16_t elements, between one input row and the
+ *             next.
+ * coeff:      receives the 64 output values (written as eight runs of 8).
+ */
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];  // holds the first-pass output between the two passes
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf);  // input column idx -> buffer[8 * idx .. 8 * idx + 7]
+ tmp_buf += 8;
+ ++src_diff;
+ }
+ // Second pass: run the same butterfly over the first-pass results, reading buffer with stride 8.
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff);  // buffer[idx + 8 * k], k = 0..7 -> coeff[8 * idx .. 8 * idx + 7]
+ coeff += 8;
+ ++tmp_buf;
+ }
+}
+
+/* In place 16x16 2D Hadamard transform.
+ *
+ * This routine itself performs a two-stage add/subtract across the four
+ * 64-entry groups coeff[0..63], coeff[64..127], coeff[128..191] and
+ * coeff[192..255]: for each idx in 0..63 it combines the four values at
+ * idx, idx + 64, idx + 128 and idx + 192 and writes the halved (>> 1)
+ * results back to the same four positions. coeff must therefore address at
+ * least 256 int16_t values.
+ * NOTE(review): presumably the caller has already filled each 64-entry group
+ * with an 8x8 Hadamard output - confirm against the call site.
+ */
+void vp9_hadamard_16x16_c(int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+ // First stage: sum/difference of groups 0 and 1, and of groups 2 and 3.
+ int16_t b0 = a0 + a1;
+ int16_t b1 = a0 - a1;
+ int16_t b2 = a2 + a3;
+ int16_t b3 = a2 - a3;
+ // Second stage: combine the two pairs and halve. The >> 1 acts on an int that may be negative.
+ coeff[0] = (b0 + b2) >> 1;
+ coeff[64] = (b1 + b3) >> 1;
+ coeff[128] = (b0 - b2) >> 1;
+ coeff[192] = (b1 - b3) >> 1;
+ // Move on to the next position inside each of the four groups.
+ ++coeff;
+ }
+}
+/* Sum of absolute values of coeff[0 .. length - 1].
+ *
+ * The sum is accumulated in an int and then cast to int16_t for the return.
+ * NOTE(review): a total above INT16_MAX does not survive that cast - confirm
+ * callers only pass inputs whose sum fits in 16 bits.
+ */
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i)
+ satd += abs(coeff[i]);
+ // Narrowing conversion: only the int16_t-representable part of the sum is returned.
+ return (int16_t)satd;
+}
+
// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
const int ref_stride, const int height) {
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -188,6 +190,8 @@
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (scaled_ref_frame) {
@@ -198,7 +202,6 @@
return rv;
}
-
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
@@ -312,6 +315,105 @@
*out_dist_sum += dist << 4;
}
+/* Rate/distortion estimate for one plane, obtained by actually transforming
+ * and quantizing the residual of every transform block inside bsize.
+ *
+ * Outputs, all written through the pointers:
+ *   rate      - sum of vp9_satd() over the quantized coefficients of each
+ *               processed transform block, scaled at the end by << 8 and * 6.
+ *   dist      - sum of vp9_block_error(coeff, dqcoeff, ...) >> shift.
+ *   sse       - sum of the this_sse value reported by vp9_block_error(),
+ *               >> shift.
+ *   skippable - 1 only if every processed transform block ended with
+ *               eob == 0.
+ *
+ * plane selects the x->plane[] / xd->plane[] entries that are used; bsize is
+ * the block size and tx_size the transform size.
+ *
+ * When CONFIG_VP9_HIGHBITDEPTH is set, none of the above happens: the
+ * function calls model_rd_for_sb_y() for rate and dist, sets *sse = INT_MAX
+ * and *skippable = 0, and returns. cpi is used only on that path.
+ */ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);  // advance of `block` per transform block
+ const int block_step = (1 << tx_size);  // advance of r and c per transform block
+ int block = 0, r, c;
+ int shift = tx_size == TX_32X32 ? 0 : 2;  // dist/sse are >> 2 for every size except 32x32
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ // max_blocks_wide/high equal the full counts unless mb_to_right_edge / mb_to_bottom_edge is negative, in which case they shrink.
+#if CONFIG_VP9_HIGHBITDEPTH
+ unsigned int var_y, sse_y;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;  // the transform-based path below is not reached in high-bitdepth builds
+#else
+ (void)cpi;  // cpi is only needed by the high-bitdepth path above
+#endif
+ // NOTE(review): presumably fills p->src_diff, which is read below, with the residual for this plane - confirm.
+ vp9_subtract_plane(x, bsize, plane);
+ // Start from "skippable, zero cost" and accumulate per transform block.
+ *skippable = 1;
+ *rate = 0;
+ *dist = 0;
+ *sse = 0;
+
+ // Keep track of the row and column of the blocks we use so that we know if we are in the unrestricted motion border:
+ // rows >= max_blocks_high are never visited; columns >= max_blocks_wide are skipped but still advance `block` by `step`.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ int i, j;
+ const int16_t *src_diff;
+ int64_t this_sse;
+ txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
+ src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+ // Forward transform + fp quantizer per size: TX_8X8 goes through vp9_hadamard_8x8; the others use vp9_fdct32x32_rd, vp9_fdct16x16 and x->fwd_txm4x4.
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_fdct16x16(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);  // cast assumes tran_low_t is int16_t on this path - TODO confirm
+ vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ // step << 4 (16 per unit of `step`) is the element count handed to vp9_block_error() and vp9_satd().
+ *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+ *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+ *sse += (this_sse >> shift);
+ *skippable &= (*eob == 0);
+ }
+ block += step;
+ }
+ }
+ // Final scaling of the accumulated SATD: (rate << 8) * 6, i.e. rate * 1536.
+ *rate <<= 8;
+ *rate *= 6;
+}
+
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
@@ -518,7 +620,9 @@
int i, j;
int rate;
int64_t dist;
- unsigned int var_y, sse_y;
+ int64_t this_sse;
+ int is_skippable;
+
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
assert(plane == 0);
(void) plane;
@@ -533,8 +637,16 @@
x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
i, j, 0);
- // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, tx_size);
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable)
+ rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+ else
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
args->rate += rate;
@@ -602,10 +714,6 @@
*rd_cost = best_rdc;
}
-static const int ref_frame_cost[MAX_REF_FRAMES] = {
- 1235, 229, 530, 615,
-};
-
typedef struct {
MV_REFERENCE_FRAME ref_frame;
PREDICTION_MODE pred_mode;
@@ -682,7 +790,21 @@
int ref_frame_skip_mask = 0;
int idx;
int best_pred_sad = INT_MAX;
+ int ref_frame_cost[MAX_REF_FRAMES];
+ vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+ ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+ ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+ ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+ ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+
if (reuse_inter_pred) {
int i;
for (i = 0; i < 3; i++) {
@@ -773,6 +895,9 @@
int mode_index;
int i;
PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ int64_t this_sse;
+ int is_skippable;
+
if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
continue;
@@ -935,19 +1060,42 @@
vp9_get_switchable_rate(cpi, xd) : 0;
}
+ // TODO(jingning): disable color operations temporarily.
// chroma component rate-distortion cost modeling
- if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
- int uv_rate = 0;
- int64_t uv_dist = 0;
- if (x->color_sensitivity[0])
- vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
- if (x->color_sensitivity[1])
- vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
- model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
- this_rdc.rate += uv_rate;
- this_rdc.dist += uv_dist;
+// if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+// int uv_rate = 0;
+// int64_t uv_dist = 0;
+// if (x->color_sensitivity[0])
+// vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+// if (x->color_sensitivity[1])
+// vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+// model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+// &var_y, &sse_y);
+// this_rdc.rate += uv_rate;
+// this_rdc.dist += uv_dist;
+// }
+
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable, &this_sse,
+ 0, bsize, mbmi->tx_size);
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ }
}
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+ }
+
this_rdc.rate += rate_mv;
this_rdc.rate +=
cpi->inter_mode_cost[mbmi->mode_context[ref_frame]][INTER_OFFSET(
@@ -1042,6 +1190,8 @@
const PREDICTION_MODE this_mode = intra_mode_list[i];
if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
continue;
+ mbmi->mode = this_mode;
+ mbmi->ref_frame[0] = INTRA_FRAME;
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
@@ -1058,17 +1208,17 @@
if (this_rdc.rdcost < best_rdc.rdcost) {
best_rdc = this_rdc;
- mbmi->mode = this_mode;
+ best_mode = this_mode;
best_intra_tx_size = mbmi->tx_size;
- mbmi->ref_frame[0] = INTRA_FRAME;
+ best_ref_frame = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
+ best_mode_skip_txfm = x->skip_txfm[0];
}
}
// Reset mb_mode_info to the best inter mode.
- if (mbmi->ref_frame[0] != INTRA_FRAME) {
- x->skip_txfm[0] = best_mode_skip_txfm;
+ if (best_ref_frame != INTRA_FRAME) {
mbmi->tx_size = best_tx_size;
} else {
mbmi->tx_size = best_intra_tx_size;
@@ -1076,6 +1226,9 @@
}
pd->dst = orig_dst;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ x->skip_txfm[0] = best_mode_skip_txfm;
if (reuse_inter_pred && best_pred != NULL) {
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -57,6 +57,141 @@
return (avg + 8) >> 4;
}
+/* SSE2 add/subtract butterfly over the eight vectors in[0..7], each treated
+ * as eight 16-bit lanes, updated in place.
+ *
+ * Two add/subtract stages always run. The third stage depends on iter:
+ *   iter == 0: the stage-3 results are additionally transposed as an 8x8
+ *              matrix of 16-bit lanes (epi16, epi32, then epi64 unpacks)
+ *              before being stored;
+ *   otherwise: the stage-3 results are stored directly.
+ * In both cases the eight stored vectors go to in[] slots 0, 7, 3, 4, 2, 6,
+ * 1, 5, in that order. All arithmetic uses the 16-bit lane add/sub
+ * intrinsics.
+ */ static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+ // Stage 1: sum and difference of each adjacent pair of input vectors.
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+ // Stage 2: within b0..b3 and within b4..b7, combine terms two positions apart (the a* variables are reused).
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+ // Stage 3: combine a[k] with a[k + 4]; iter == 0 also transposes before storing.
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b1 = _mm_add_epi16(a1, a5);
+ b2 = _mm_add_epi16(a2, a6);
+ b3 = _mm_add_epi16(a3, a7);
+ b4 = _mm_sub_epi16(a0, a4);
+ b5 = _mm_sub_epi16(a1, a5);
+ b6 = _mm_sub_epi16(a2, a6);
+ b7 = _mm_sub_epi16(a3, a7);
+ // Transpose step 1 of 3: interleave 16-bit lanes of neighbouring vectors.
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+ // Transpose step 2 of 3: interleave 32-bit pairs.
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+ // Transpose step 3 of 3: interleave 64-bit halves; transposed columns 0..7 are stored to slots 0, 7, 3, 4, 2, 6, 1, 5.
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[7] = _mm_unpackhi_epi64(b0, b1);
+ in[3] = _mm_unpacklo_epi64(b2, b3);
+ in[4] = _mm_unpackhi_epi64(b2, b3);
+ in[2] = _mm_unpacklo_epi64(b4, b5);
+ in[6] = _mm_unpackhi_epi64(b4, b5);
+ in[1] = _mm_unpacklo_epi64(b6, b7);
+ in[5] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+/* SSE2 version of the 8x8 transform: loads eight rows of eight int16_t
+ * values, runs hadamard_col8_sse2() twice (iter 0, then iter 1), and stores
+ * the eight resulting vectors to coeff as 64 consecutive int16_t values.
+ *
+ * src_diff:   first sample of the 8x8 input block. Rows are read with
+ *             _mm_load_si128, so every row address (src_diff + k * src_stride)
+ *             must be 16-byte aligned.
+ * src_stride: distance, in int16_t elements, between one row and the next.
+ * coeff:      output; written with unaligned stores.
+ */
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ // First call (iter 0) ends with a transpose; second call (iter 1) stores its results directly.
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+ // Write src[0..7] out back-to-back, eight int16_t values per vector.
+ _mm_storeu_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_storeu_si128((__m128i *)coeff, src[7]);
+}
+/* SSE2 sum of absolute values of coeff, processed eight int16_t at a time.
+ *
+ * The first eight values are always loaded, and the loop then advances in
+ * steps of eight while i < length, so length is effectively rounded up to a
+ * multiple of 8 (minimum 8). coeff is read with _mm_load_si128 and must be
+ * 16-byte aligned.
+ * NOTE(review): the running sums are kept in 16-bit lanes (_mm_add_epi16),
+ * so large totals wrap - confirm the expected input range.
+ */
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+ int i;
+ __m128i sum = _mm_load_si128((const __m128i *)coeff);
+ __m128i sign = _mm_srai_epi16(sum, 15);  // all-ones in lanes holding negative values, zero elsewhere
+ __m128i val = _mm_xor_si128(sum, sign);
+ sum = _mm_sub_epi16(val, sign);  // (x ^ sign) - sign: per-lane absolute value
+ coeff += 8;
+ // Accumulate the per-lane absolute values of the remaining groups of eight.
+ for (i = 8; i < length; i += 8) {
+ __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ sign = _mm_srai_epi16(src_line, 15);
+ val = _mm_xor_si128(src_line, sign);
+ val = _mm_sub_epi16(val, sign);
+ sum = _mm_add_epi16(sum, val);
+ coeff += 8;
+ }
+ // Horizontal reduction: fold the upper 64 bits, then the upper 32 bits, then the upper 16 bits onto lane 0.
+ val = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, val);
+ // Lane 0 now holds the 16-bit total of all eight lanes.
+ return _mm_extract_epi16(sum, 0);
+}
+
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
const int ref_stride, const int height) {
int idx;