ref: af6733aec65f49c6dd5306d4f5bca60f7af4824b
parent: 55c6a74bd4f228e48d56de200f25154eb733fc40
author: Parag Salasakar <[email protected]>
date: Sat Jul 25 08:32:26 EDT 2015
mips msa vp8 recon intra optimization average improvement ~3x-5x Change-Id: I73306863e9bf172d5adc06b8dd54e43985d1e063
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -299,6 +299,11 @@
::testing::Values(
vp8_build_intra_predictors_mby_s_neon));
#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IntraPredYTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mby_s_msa));
+#endif
typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
uint8_t *uabove_row,
@@ -391,6 +396,11 @@
INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
::testing::Values(
vp8_build_intra_predictors_mbuv_s_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IntraPredUVTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mbuv_s_msa));
#endif
} // namespace
--- /dev/null
+++ b/vp8/common/mips/msa/reconintra_msa.c
@@ -1,0 +1,342 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+/* V_PRED 8x8: replicate the 8 bytes at src down all 8 rows of dst. */
+static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride)
+{
+    uint64_t above = LD(src);
+    uint8_t *bottom = dst + (4 * dst_stride);
+    SD4(above, above, above, above, dst, dst_stride);
+    SD4(above, above, above, above, bottom, dst_stride);
+}
+
+/* V_PRED 16x16: replicate the 16 bytes at src down all 16 rows of dst. */
+static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride)
+{
+    v16u8 top = LD_UB(src);
+    uint8_t *bottom = dst + (8 * dst_stride);
+    ST_UB8(top, top, top, top, top, top, top, top, dst, dst_stride);
+    ST_UB8(top, top, top, top, top, top, top, top, bottom, dst_stride);
+}
+
+/* H_PRED 8x8: fill row r of dst with the byte at src[r * src_stride]. */
+static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
+                                        uint8_t *dst, int32_t dst_stride)
+{
+    /* Multiplying a byte by this constant copies it into all 8 byte lanes. */
+    const uint64_t splat = 0x0101010101010101ull;
+    uint64_t line[8];
+    int32_t r;
+
+    for (r = 0; r < 8; ++r)
+    {
+        line[r] = src[r * src_stride] * splat;
+    }
+
+    SD4(line[0], line[1], line[2], line[3], dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(line[4], line[5], line[6], line[7], dst, dst_stride);
+}
+
+/*
+ * H_PRED 16x16: fill row r of dst with the byte at src[r * src_stride].
+ *
+ * src, src_stride - left-edge column, one byte every src_stride bytes
+ * dst, dst_stride - 16x16 output block and its row pitch
+ *
+ * Rows are produced in four groups of four.
+ */
+static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
+                                          uint8_t *dst, int32_t dst_stride)
+{
+    uint32_t group;
+    v16u8 row0, row1, row2, row3;
+
+    for (group = 0; group < 4; ++group)
+    {
+        /* Broadcast each of the next four left-edge bytes to all 16 lanes. */
+        row0 = (v16u8)__msa_fill_b(src[0 * src_stride]);
+        row1 = (v16u8)__msa_fill_b(src[1 * src_stride]);
+        row2 = (v16u8)__msa_fill_b(src[2 * src_stride]);
+        row3 = (v16u8)__msa_fill_b(src[3 * src_stride]);
+        src += (4 * src_stride);
+
+        ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/*
+ * DC_PRED 8x8: fill the 8x8 block at dst with the rounded mean of the
+ * available neighbouring pixels.
+ *
+ * src_top          - row above the block (8 bytes read when is_above != 0)
+ * src_left         - column left of the block (8 bytes read, one every
+ *                    src_stride_left bytes, when is_left != 0)
+ * dst, dst_stride  - output block and its row pitch
+ * is_above/is_left - non-zero when the corresponding edge may be used
+ *
+ * With both edges the mean is over 16 pixels, with one edge over 8; with
+ * neither edge every output byte is 128.
+ */
+static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
+                                     int32_t src_stride_left,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row, total = 0, dc = 128;
+    uint64_t top, out;
+    v16u8 above;
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    if (is_above)
+    {
+        /* Load only the 8 bytes that belong to this block: a 16-byte
+         * LD_UB here would read 8 bytes past the end of the above row. */
+        top = LD(src_top);
+        above = (v16u8)__msa_insert_d((v2i64)__msa_ldi_b(0), 0, top);
+
+        /* Pairwise widening adds: element 0 of sum_d ends up holding the
+         * sum of bytes 0..7, i.e. of the 8 above pixels. */
+        sum_h = __msa_hadd_u_h(above, above);
+        sum_w = __msa_hadd_u_w(sum_h, sum_h);
+        sum_d = __msa_hadd_u_d(sum_w, sum_w);
+        total = __msa_copy_u_w((v4i32)sum_d, 0);
+    }
+
+    if (is_left)
+    {
+        for (row = 0; row < 8; ++row)
+        {
+            total += src_left[row * src_stride_left];
+        }
+    }
+
+    /* Round to nearest when dividing by the number of pixels summed. */
+    if (is_above && is_left)
+        dc = (total + 8) >> 4;
+    else if (is_above || is_left)
+        dc = (total + 4) >> 3;
+
+    /* Replicate the DC byte into all 8 byte lanes of a 64-bit row. */
+    out = dc * 0x0101010101010101ull;
+    SD4(out, out, out, out, dst, dst_stride);
+    dst += (4 * dst_stride);
+    SD4(out, out, out, out, dst, dst_stride);
+}
+
+/*
+ * DC_PRED 16x16: fill the 16x16 block at dst with the rounded mean of the
+ * available neighbouring pixels.
+ *
+ * src_top          - row above the block (16 bytes read when is_above != 0)
+ * src_left         - column left of the block (16 bytes read, one every
+ *                    src_stride_left bytes, when is_left != 0)
+ * dst, dst_stride  - output block and its row pitch
+ * is_above/is_left - non-zero when the corresponding edge may be used
+ *
+ * With both edges the mean is over 32 pixels, with one edge over 16; with
+ * neither edge every output byte is 128.
+ */
+static void intra_predict_dc_16x16_msa(uint8_t *src_top, uint8_t *src_left,
+                                       int32_t src_stride_left,
+                                       uint8_t *dst, int32_t dst_stride,
+                                       uint8_t is_above, uint8_t is_left)
+{
+    uint32_t row, total = 0, dc = 128;
+    v16u8 above, fill;
+    v8u16 sum_h;
+    v4u32 sum_w;
+    v2u64 sum_d;
+
+    if (is_above)
+    {
+        above = LD_UB(src_top);
+
+        /* Pairwise widening adds leave one 8-pixel partial sum in each
+         * 64-bit element. */
+        sum_h = __msa_hadd_u_h(above, above);
+        sum_w = __msa_hadd_u_w(sum_h, sum_h);
+        sum_d = __msa_hadd_u_d(sum_w, sum_w);
+
+        /* Pack the low words of both partial sums side by side and add
+         * once more to get the sum of all 16 pixels in element 0. */
+        sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+        sum_d = __msa_hadd_u_d(sum_w, sum_w);
+        total = __msa_copy_u_w((v4i32)sum_d, 0);
+    }
+
+    if (is_left)
+    {
+        for (row = 0; row < 16; ++row)
+        {
+            total += src_left[row * src_stride_left];
+        }
+    }
+
+    /* Round to nearest when dividing by the number of pixels summed;
+     * with no edge available dc keeps its initial value of 128. */
+    if (is_above && is_left)
+        dc = (total + 16) >> 5;
+    else if (is_above || is_left)
+        dc = (total + 8) >> 4;
+
+    /* Broadcast the DC byte to all 16 lanes and store 16 rows. */
+    fill = (v16u8)__msa_fill_b(dc);
+    ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+    dst += (8 * dst_stride);
+    ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+}
+
+/*
+ * Build the 16x16 Y intra prediction selected by
+ * x->mode_info_context->mbmi.mode into ypred_ptr.
+ *
+ * yabove_row         - row above the macroblock; yabove_row[-1] is the
+ *                      top-left pixel
+ * yleft/left_stride  - left-edge column, one pixel every left_stride bytes
+ * ypred_ptr/y_stride - output block and its row pitch
+ *
+ * DC_PRED, V_PRED and H_PRED use the MSA helpers; TM_PRED is computed in
+ * plain C.  Any other mode leaves ypred_ptr untouched.
+ */
+void vp8_build_intra_predictors_mby_s_msa(struct macroblockd *x,
+                                          unsigned char *yabove_row,
+                                          unsigned char *yleft,
+                                          int left_stride,
+                                          unsigned char *ypred_ptr,
+                                          int y_stride)
+{
+    uint32_t r, c;
+    uint8_t ytop_left = yabove_row[-1];
+    int tm;
+
+    switch (x->mode_info_context->mbmi.mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_16x16_msa(yabove_row, yleft, left_stride,
+                                       ypred_ptr, y_stride,
+                                       x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_16x16_msa(yabove_row, ypred_ptr, y_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_16x16_msa(yleft, left_stride, ypred_ptr,
+                                          y_stride);
+            break;
+
+        case TM_PRED:
+            /* pred = left + above - top_left, clamped to [0, 255]. */
+            for (r = 0; r < 16; ++r)
+            {
+                for (c = 0; c < 16; ++c)
+                {
+                    tm = yleft[r * left_stride] + yabove_row[c] - ytop_left;
+                    ypred_ptr[c] = (tm < 0) ? 0 : ((tm > 255) ? 255 : tm);
+                }
+                ypred_ptr += y_stride;
+            }
+            break;
+
+        default:
+            /* B_PRED, the inter modes and MB_MODE_COUNT: nothing to do. */
+            break;
+    }
+}
+
+/*
+ * Build the 8x8 U and V intra predictions selected by
+ * x->mode_info_context->mbmi.uv_mode into upred_ptr and vpred_ptr.
+ *
+ * uabove_row/vabove_row - rows above each block; element [-1] is the
+ *                         top-left pixel
+ * uleft/vleft           - left-edge columns, one pixel every left_stride
+ *                         bytes
+ * upred_ptr/vpred_ptr   - output blocks, both with row pitch pred_stride
+ *
+ * DC_PRED, V_PRED and H_PRED run the MSA helpers once per plane; TM_PRED
+ * is computed in plain C.  Any other mode leaves the outputs untouched.
+ */
+void vp8_build_intra_predictors_mbuv_s_msa(struct macroblockd *x,
+                                           unsigned char *uabove_row,
+                                           unsigned char *vabove_row,
+                                           unsigned char *uleft,
+                                           unsigned char *vleft,
+                                           int left_stride,
+                                           unsigned char *upred_ptr,
+                                           unsigned char *vpred_ptr,
+                                           int pred_stride)
+{
+    uint32_t r, c;
+    uint8_t utop_left = uabove_row[-1];
+    uint8_t vtop_left = vabove_row[-1];
+    int tm_u, tm_v;
+
+    switch (x->mode_info_context->mbmi.uv_mode)
+    {
+        case DC_PRED:
+            intra_predict_dc_8x8_msa(uabove_row, uleft, left_stride,
+                                     upred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            intra_predict_dc_8x8_msa(vabove_row, vleft, left_stride,
+                                     vpred_ptr, pred_stride,
+                                     x->up_available, x->left_available);
+            break;
+
+        case V_PRED:
+            intra_predict_vert_8x8_msa(uabove_row, upred_ptr, pred_stride);
+            intra_predict_vert_8x8_msa(vabove_row, vpred_ptr, pred_stride);
+            break;
+
+        case H_PRED:
+            intra_predict_horiz_8x8_msa(uleft, left_stride, upred_ptr,
+                                        pred_stride);
+            intra_predict_horiz_8x8_msa(vleft, left_stride, vpred_ptr,
+                                        pred_stride);
+            break;
+
+        case TM_PRED:
+            /* Per plane: pred = left + above - top_left, clamped to
+             * [0, 255]. */
+            for (r = 0; r < 8; ++r)
+            {
+                for (c = 0; c < 8; ++c)
+                {
+                    tm_u = uleft[r * left_stride] + uabove_row[c] -
+                           utop_left;
+                    tm_v = vleft[r * left_stride] + vabove_row[c] -
+                           vtop_left;
+
+                    upred_ptr[c] = (tm_u < 0) ? 0 : ((tm_u > 255) ? 255 : tm_u);
+                    vpred_ptr[c] = (tm_v < 0) ? 0 : ((tm_v > 255) ? 255 : tm_v);
+                }
+
+                upred_ptr += pred_stride;
+                vpred_ptr += pred_stride;
+            }
+            break;
+
+        default:
+            /* B_PRED, the inter modes and MB_MODE_COUNT: nothing to do
+             * here. */
+            break;
+    }
+}
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -153,10 +153,10 @@
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon msa/;
add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon msa/;
add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
specialize qw/vp8_intra4x4_predict media/;
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -118,6 +118,7 @@
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/reconintra_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h