ref: 432cd4bfb795534c8ba479fd735cc11dc8562469
parent: 39f03bf9c672b8e7c4b483d4e919a695707c90dd
author: Jingning Han <[email protected]>
date: Mon Jul 6 05:33:27 EDT 2015
Move subtract functions from vp9 to vpx_dsp

Factor out the subtraction operator as a common function.

Change-Id: I526e703477c6a290e0e3e3c8898f8bb1ca82779b
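
A minimal caller sketch (hypothetical buffers and strides, not part of
this change) showing the renamed entry point; the prototype is carried
over unchanged from vp9_subtract_block, as the rtcd hunks below show:

    #include "./vpx_dsp_rtcd.h"
    #include "vpx/vpx_integer.h"

    /* diff[r * 8 + c] = src[r * src_stride + c] - pred[r * pred_stride + c]
     * for an 8x8 block; diff_stride is fixed at 8 here. */
    static void subtract_8x8_example(int16_t *diff, const uint8_t *src,
                                     ptrdiff_t src_stride,
                                     const uint8_t *pred,
                                     ptrdiff_t pred_stride) {
      vpx_subtract_block(8, 8, diff, 8, src, src_stride, pred, pred_stride);
    }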
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
#include "test/register_state_check.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_mem/vpx_mem.h"
@@ -89,15 +90,15 @@
}
INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
- ::testing::Values(vp9_subtract_block_c));
+ ::testing::Values(vpx_subtract_block_c));
#if HAVE_SSE2 && CONFIG_USE_X86INC
INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
- ::testing::Values(vp9_subtract_block_sse2));
+ ::testing::Values(vpx_subtract_block_sse2));
#endif
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
- ::testing::Values(vp9_subtract_block_neon));
+ ::testing::Values(vpx_subtract_block_neon));
#endif
} // namespace vp9
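
(Note on the instantiations above: the fixture's parameter is a plain
function pointer whose type mirrors the rtcd prototype; roughly, assuming
the typedef used by the test:

    typedef void (*SubtractFunc)(int rows, int cols,
                                 int16_t *diff_ptr, ptrdiff_t diff_stride,
                                 const uint8_t *src_ptr, ptrdiff_t src_stride,
                                 const uint8_t *pred_ptr,
                                 ptrdiff_t pred_stride);

so only the symbol names change in this patch, not the call contract.)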
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -19,6 +19,8 @@
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
+// TODO(jingning,johannkoenig): use vpx_subtract_block to replace the
+// codec-specific vp8_subtract_ functions.
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *src_ptr = (*(be->base_src) + be->src);
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -922,9 +922,6 @@
# ENCODEMB INVOKE
-add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc";
-
#
# Denoiser
#
@@ -1327,9 +1324,6 @@
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/;
-
- add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
- specialize qw/vp9_highbd_subtract_block/;
add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp/;
--- a/vp9/encoder/arm/neon/vp9_subtract_neon.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-void vp9_subtract_block_neon(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src, ptrdiff_t src_stride,
- const uint8_t *pred, ptrdiff_t pred_stride) {
- int r, c;
-
- if (cols > 16) {
- for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; c += 32) {
- const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
- const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
- const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
- const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
- const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
- vget_low_u8(v_pred_00));
- const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
- vget_high_u8(v_pred_00));
- const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
- vget_low_u8(v_pred_16));
- const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
- vget_high_u8(v_pred_16));
- vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
- vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
- vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
- vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
- }
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else if (cols > 8) {
- for (r = 0; r < rows; ++r) {
- const uint8x16_t v_src = vld1q_u8(&src[0]);
- const uint8x16_t v_pred = vld1q_u8(&pred[0]);
- const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
- vget_low_u8(v_pred));
- const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
- vget_high_u8(v_pred));
- vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
- vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else if (cols > 4) {
- for (r = 0; r < rows; ++r) {
- const uint8x8_t v_src = vld1_u8(&src[0]);
- const uint8x8_t v_pred = vld1_u8(&pred[0]);
- const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
- vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else {
- for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; ++c)
- diff[c] = src[c] - pred[c];
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- }
-}
--- a/vp9/encoder/mips/msa/vp9_subtract_msa.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t src0, src1, src2, src3;
- uint32_t pred0, pred1, pred2, pred3;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
- INSERT_W4_SB(src0, src1, src2, src3, src);
- INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
-}
-
-static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t loop_cnt;
- uint64_t src0, src1, pred0, pred1;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 4; loop_cnt--;) {
- LD2(src_ptr, src_stride, src0, src1);
- src_ptr += (2 * src_stride);
- LD2(pred_ptr, pred_stride, pred0, pred1);
- pred_ptr += (2 * pred_stride);
-
- INSERT_D2_SB(src0, src1, src);
- INSERT_D2_SB(pred0, pred1, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff_ptr, diff_stride);
- diff_ptr += (2 * diff_stride);
- }
-}
-
-static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- int8_t count;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (count = 2; count--;) {
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- LD_SB8(pred, pred_stride,
- pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
- pred += (8 * pred_stride);
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 8; loop_cnt--;) {
- LD_SB2(src, 16, src0, src1);
- src += src_stride;
- LD_SB2(src, 16, src2, src3);
- src += src_stride;
- LD_SB2(src, 16, src4, src5);
- src += src_stride;
- LD_SB2(src, 16, src6, src7);
- src += src_stride;
-
- LD_SB2(pred, 16, pred0, pred1);
- pred += pred_stride;
- LD_SB2(pred, 16, pred2, pred3);
- pred += pred_stride;
- LD_SB2(pred, 16, pred4, pred5);
- pred += pred_stride;
- LD_SB2(pred, 16, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 32; loop_cnt--;) {
- LD_SB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_SB4(src, 16, src4, src5, src6, src7);
- src += src_stride;
-
- LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
- pred += pred_stride;
- LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
- }
-}
-
-void vp9_subtract_block_msa(int32_t rows, int32_t cols,
- int16_t *diff_ptr, ptrdiff_t diff_stride,
- const uint8_t *src_ptr, ptrdiff_t src_stride,
- const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
- if (rows == cols) {
- switch (rows) {
- case 4:
- sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
- break;
- case 8:
- sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
- break;
- case 16:
- sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
- break;
- case 32:
- sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
- break;
- case 64:
- sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
- diff_ptr, diff_stride);
- break;
- default:
- vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- }
- } else {
- vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
- pred_ptr, pred_stride);
- }
-}
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -11,6 +11,7 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -31,45 +32,6 @@
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
-void vp9_subtract_block_c(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src, ptrdiff_t src_stride,
- const uint8_t *pred, ptrdiff_t pred_stride) {
- int r, c;
-
- for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++)
- diff[c] = src[c] - pred[c];
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_subtract_block_c(int rows, int cols,
- int16_t *diff, ptrdiff_t diff_stride,
- const uint8_t *src8, ptrdiff_t src_stride,
- const uint8_t *pred8, ptrdiff_t pred_stride,
- int bd) {
- int r, c;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- (void) bd;
-
- for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++) {
- diff[c] = src[c] - pred[c];
- }
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -79,13 +41,13 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+ vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
p->src.stride, pd->dst.buf, pd->dst.stride,
x->e_mbd.bd);
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
- vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride);
}
@@ -838,7 +800,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_highbd_subtract_block(32, 32, src_diff, diff_stride,
+ vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
src, src_stride, dst, dst_stride, xd->bd);
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
@@ -859,7 +821,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_highbd_subtract_block(16, 16, src_diff, diff_stride,
+ vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
src, src_stride, dst, dst_stride, xd->bd);
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -881,7 +843,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_highbd_subtract_block(8, 8, src_diff, diff_stride,
+ vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
src, src_stride, dst, dst_stride, xd->bd);
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
@@ -904,7 +866,7 @@
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_highbd_subtract_block(4, 4, src_diff, diff_stride,
+ vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
src, src_stride, dst, dst_stride, xd->bd);
if (tx_type != DCT_DCT)
vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
@@ -946,7 +908,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_subtract_block(32, 32, src_diff, diff_stride,
+ vpx_subtract_block(32, 32, src_diff, diff_stride,
src, src_stride, dst, dst_stride);
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
@@ -966,7 +928,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_subtract_block(16, 16, src_diff, diff_stride,
+ vpx_subtract_block(16, 16, src_diff, diff_stride,
src, src_stride, dst, dst_stride);
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -986,7 +948,7 @@
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_subtract_block(8, 8, src_diff, diff_stride,
+ vpx_subtract_block(8, 8, src_diff, diff_stride,
src, src_stride, dst, dst_stride);
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
@@ -1007,7 +969,7 @@
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
- vp9_subtract_block(4, 4, src_diff, diff_stride,
+ vpx_subtract_block(4, 4, src_diff, diff_stride,
src, src_stride, dst, dst_stride);
if (tx_type != DCT_DCT)
vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -12,6 +12,7 @@
#include <math.h>
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -832,7 +833,7 @@
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, idx, idy, 0);
- vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+ vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
dst, dst_stride, xd->bd);
if (xd->lossless) {
const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -932,7 +933,7 @@
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, idx, idy, 0);
- vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+ vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
if (xd->lossless) {
const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1394,16 +1395,16 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_subtract_block(
+ vpx_highbd_subtract_block(
height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
} else {
- vp9_subtract_block(
+ vpx_subtract_block(
height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
8, src, p->src.stride, dst, pd->dst.stride);
}
#else
- vp9_subtract_block(height, width,
+ vpx_subtract_block(height, width,
vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
8, src, p->src.stride, dst, pd->dst.stride);
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ /dev/null
@@ -1,127 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void vp9_subtract_block(int rows, int cols,
-; int16_t *diff, ptrdiff_t diff_stride,
-; const uint8_t *src, ptrdiff_t src_stride,
-; const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
- rows, cols, diff, diff_stride, src, src_stride, \
- pred, pred_stride
-%define pred_str colsq
- pxor m7, m7 ; dedicated zero register
- cmp colsd, 4
- je .case_4
- cmp colsd, 8
- je .case_8
- cmp colsd, 16
- je .case_16
- cmp colsd, 32
- je .case_32
-
-%macro loop16 6
- mova m0, [srcq+%1]
- mova m4, [srcq+%2]
- mova m1, [predq+%3]
- mova m5, [predq+%4]
- punpckhbw m2, m0, m7
- punpckhbw m3, m1, m7
- punpcklbw m0, m7
- punpcklbw m1, m7
- psubw m2, m3
- psubw m0, m1
- punpckhbw m1, m4, m7
- punpckhbw m3, m5, m7
- punpcklbw m4, m7
- punpcklbw m5, m7
- psubw m1, m3
- psubw m4, m5
- mova [diffq+mmsize*0+%5], m0
- mova [diffq+mmsize*1+%5], m2
- mova [diffq+mmsize*0+%6], m4
- mova [diffq+mmsize*1+%6], m1
-%endmacro
-
- mov pred_str, pred_stridemp
-.loop_64:
- loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
- loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_64
- RET
-
-.case_32:
- mov pred_str, pred_stridemp
-.loop_32:
- loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_32
- RET
-
-.case_16:
- mov pred_str, pred_stridemp
-.loop_16:
- loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
- lea diffq, [diffq+diff_strideq*4]
- lea predq, [predq+pred_str*2]
- lea srcq, [srcq+src_strideq*2]
- sub rowsd, 2
- jg .loop_16
- RET
-
-%macro loop_h 0
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m1, [predq]
- movh m3, [predq+pred_str]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- psubw m0, m1
- psubw m2, m3
- mova [diffq], m0
- mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
- mov pred_str, pred_stridemp
-.loop_8:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_8
- RET
-
-INIT_MMX
-.case_4:
- mov pred_str, pred_stridemp
-.loop_4:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_4
- RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -114,7 +114,6 @@
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -151,7 +150,6 @@
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
@@ -161,7 +159,6 @@
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_subtract_block_neon(int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src, ptrdiff_t src_stride,
+ const uint8_t *pred, ptrdiff_t pred_stride) {
+ int r, c;
+
+ if (cols > 16) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
+ vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
+ vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
+ vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
+ vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 8) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
+ vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
+ vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 4) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c)
+ diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ }
+}
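
(A hypothetical spot check, not part of this change, comparing the NEON
path above against the C reference added in vpx_dsp/subtract.c; both
prototypes come from ./vpx_dsp_rtcd.h:

    #include <assert.h>
    #include <string.h>
    #include "./vpx_dsp_rtcd.h"

    static void check_neon_32x32(void) {
      uint8_t src[32 * 32], pred[32 * 32];
      int16_t d_c[32 * 32], d_neon[32 * 32];
      int i;
      for (i = 0; i < 32 * 32; ++i) {
        src[i] = (uint8_t)(i * 7);   /* arbitrary deterministic fill */
        pred[i] = (uint8_t)(i * 3);
      }
      vpx_subtract_block_c(32, 32, d_c, 32, src, 32, pred, 32);
      vpx_subtract_block_neon(32, 32, d_neon, 32, src, 32, pred, 32);
      assert(memcmp(d_c, d_neon, sizeof(d_c)) == 0);
    }

Note the cols > 16 branch steps 32 columns per iteration, so it relies on
cols being 32 or 64; narrower blocks take the 16-, 8-, or scalar-wide
paths.)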
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -24,6 +24,9 @@
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
#if (__mips_isa_rev >= 6)
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
@@ -38,6 +41,61 @@
\
val_m; \
})
+
+#if (__mips == 64)
+#define LD(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LD(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m); \
+ val1_m = LW(psrc_m + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+})
+#endif // (__mips == 64)
+
+#define SW(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define SD(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sd %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
#else // !(__mips_isa_rev >= 6)
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
@@ -52,6 +110,60 @@
\
val_m; \
})
+
+#define SW(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#if (__mips == 64)
+#define LD(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "uld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LD(psrc) ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+})
+#endif // (__mips == 64)
+
+#define SD(val, pdst) { \
+ uint8_t *pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_m1); \
+ SW(val1_m, pdst_m1 + 4); \
+}
#endif // (__mips_isa_rev >= 6)
/* Description : Load 4 words with stride
@@ -69,6 +181,21 @@
out3 = LW((psrc) + 3 * stride); \
}
+/* Description : Load double words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load double word in 'out0' from (psrc)
+ Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+}
+
/* Description : Load vectors with 16 byte elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@@ -81,6 +208,7 @@
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
@@ -93,6 +221,7 @@
LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
@@ -100,6 +229,14 @@
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_B8(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@@ -271,6 +408,13 @@
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+#define INSERT_D2(RTYPE, in0, in1, out) { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -327,5 +471,54 @@
\
tmp_m = __msa_clti_s_h((v8i16)in, 0); \
ILVRL_H2_SW(tmp_m, in, out0, out1); \
+}
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) { \
+ SD(in0, (pdst)); \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+}
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
}
#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
--- /dev/null
+++ b/vpx_dsp/mips/subtract_msa.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ uint64_t src0, src1, pred0, pred1;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD2(src_ptr, src_stride, src0, src1);
+ src_ptr += (2 * src_stride);
+ LD2(pred_ptr, pred_stride, pred0, pred1);
+ pred_ptr += (2 * pred_stride);
+
+ INSERT_D2_SB(src0, src1, src);
+ INSERT_D2_SB(pred0, pred1, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+ diff_ptr += (2 * diff_stride);
+ }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ int8_t count;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (count = 2; count--;) {
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LD_SB8(pred, pred_stride,
+ pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+ pred += (8 * pred_stride);
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_SB2(src, 16, src2, src3);
+ src += src_stride;
+ LD_SB2(src, 16, src4, src5);
+ src += src_stride;
+ LD_SB2(src, 16, src6, src7);
+ src += src_stride;
+
+ LD_SB2(pred, 16, pred0, pred1);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred2, pred3);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred4, pred5);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_SB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+
+ LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+ pred += pred_stride;
+ LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+ }
+}
+
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+ diff_ptr, diff_stride);
+ break;
+ default:
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
--- /dev/null
+++ b/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src, ptrdiff_t src_stride,
+ const uint8_t *pred, ptrdiff_t pred_stride) {
+ int r, c;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++)
+ diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src8, ptrdiff_t src_stride,
+ const uint8_t *pred8, ptrdiff_t pred_stride,
+ int bd) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ (void) bd;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
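
(Usage note for the high bit depth variant: per the CONVERT_TO_SHORTPTR
convention used above, src8/pred8 are uint8_t pointers that alias
uint16_t sample buffers, so callers wrap real 16-bit pointers with the
companion CONVERT_TO_BYTEPTR macro, assumed here from vpx_ports/mem.h.
Hypothetical 10-bit sketch:

    #include "./vpx_dsp_rtcd.h"
    #include "vpx_ports/mem.h"

    static void highbd_subtract_4x4_example(int16_t *diff,
                                            const uint16_t *src16,
                                            const uint16_t *pred16) {
      vpx_highbd_subtract_block(4, 4, diff, 4,
                                CONVERT_TO_BYTEPTR(src16), 4,
                                CONVERT_TO_BYTEPTR(pred16), 4, /*bd=*/10);
    }

The bd argument is accepted for interface uniformity; the C version above
ignores it.)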
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -12,13 +12,16 @@
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
+DSP_SRCS-yes += subtract.c
DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
@@ -30,6 +33,7 @@
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -37,6 +37,12 @@
if (vpx_config("CONFIG_ENCODERS") eq "yes") {
#
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
+#
# Single block SAD
#
add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -210,6 +216,12 @@
specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Block subtraction
+ #
+ add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block/;
+
#
# Single block SAD
#
--- /dev/null
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vpx_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET