ref: ac7f403cbea3f6ec6f36c0c3e18dce1d0fe1c963
parent: 55c2646666f35bb8841e50cfdafd9c5c695948bc
parent: 6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c
author: Johann <[email protected]>
date: Tue Jul 7 19:57:17 EDT 2015
Merge "Move sub pixel variance to vpx_dsp"
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -21,13 +21,6 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-#endif // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-# include "./vp9_rtcd.h"
-# include "vp9/encoder/vp9_variance.h"
-#endif // CONFIG_VP9_ENCODER
#include "./vpx_dsp_rtcd.h"
namespace {
@@ -39,9 +32,16 @@
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred);
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
@@ -166,8 +166,6 @@
(l2w + l2h)));
}
-typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
public:
SumOfSquaresTest() : func_(GetParam()) {}
@@ -687,9 +685,8 @@
}
}
-#if CONFIG_VP9_ENCODER
template<>
-void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
if (!use_high_bit_depth_) {
@@ -726,11 +723,12 @@
}
}
}
-#endif // CONFIG_VP9_ENCODER
typedef MseTest<Get4x4SseFunc> VpxSseTest;
typedef MseTest<VarianceMxNFunc> VpxMseTest;
typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
@@ -742,6 +740,9 @@
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_c));
@@ -773,7 +774,6 @@
const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
-
INSTANTIATE_TEST_CASE_P(
C, VpxVarianceTest,
::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
@@ -790,9 +790,79 @@
make_tuple(2, 3, variance4x8_c, 0),
make_tuple(2, 2, variance4x4_c, 0)));
+const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0),
+ make_tuple(6, 5, subpel_var64x32_c, 0),
+ make_tuple(5, 6, subpel_var32x64_c, 0),
+ make_tuple(5, 5, subpel_var32x32_c, 0),
+ make_tuple(5, 4, subpel_var32x16_c, 0),
+ make_tuple(4, 5, subpel_var16x32_c, 0),
+ make_tuple(4, 4, subpel_var16x16_c, 0),
+ make_tuple(4, 3, subpel_var16x8_c, 0),
+ make_tuple(3, 4, subpel_var8x16_c, 0),
+ make_tuple(3, 3, subpel_var8x8_c, 0),
+ make_tuple(3, 2, subpel_var8x4_c, 0),
+ make_tuple(2, 3, subpel_var4x8_c, 0),
+ make_tuple(2, 2, subpel_var4x4_c, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_var64x64_c =
+ vpx_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var64x32_c =
+ vpx_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x64_c =
+ vpx_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x32_c =
+ vpx_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x16_c =
+ vpx_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x32_c =
+ vpx_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x16_c =
+ vpx_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x8_c =
+ vpx_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x16_c =
+ vpx_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0),
+ make_tuple(6, 5, subpel_avg_var64x32_c, 0),
+ make_tuple(5, 6, subpel_avg_var32x64_c, 0),
+ make_tuple(5, 5, subpel_avg_var32x32_c, 0),
+ make_tuple(5, 4, subpel_avg_var32x16_c, 0),
+ make_tuple(4, 5, subpel_avg_var16x32_c, 0),
+ make_tuple(4, 4, subpel_avg_var16x16_c, 0),
+ make_tuple(4, 3, subpel_avg_var16x8_c, 0),
+ make_tuple(3, 4, subpel_avg_var8x16_c, 0),
+ make_tuple(3, 3, subpel_avg_var8x8_c, 0),
+ make_tuple(3, 2, subpel_avg_var8x4_c, 0),
+ make_tuple(2, 3, subpel_avg_var4x8_c, 0),
+ make_tuple(2, 2, subpel_avg_var4x4_c, 0)));
+
#if CONFIG_VP9_HIGHBITDEPTH
typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc>
+ VpxHBDSubpelAvgVarianceTest;
TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
@@ -800,6 +870,9 @@
TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
/* TODO(debargha): This test does not support the highbd version
const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
@@ -844,7 +917,6 @@
const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
-
const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
@@ -858,7 +930,6 @@
const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
-
const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
@@ -913,6 +984,247 @@
make_tuple(3, 2, highbd_8_variance8x4_c, 8),
make_tuple(2, 3, highbd_8_variance4x8_c, 8),
make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+
+const SubpixVarMxNFunc highbd_8_subpel_var64x64_c =
+ vpx_highbd_8_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var64x32_c =
+ vpx_highbd_8_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x64_c =
+ vpx_highbd_8_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x32_c =
+ vpx_highbd_8_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x16_c =
+ vpx_highbd_8_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x32_c =
+ vpx_highbd_8_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x16_c =
+ vpx_highbd_8_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x8_c =
+ vpx_highbd_8_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x16_c =
+ vpx_highbd_8_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x8_c =
+ vpx_highbd_8_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x4_c =
+ vpx_highbd_8_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x8_c =
+ vpx_highbd_8_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x4_c =
+ vpx_highbd_8_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x64_c =
+ vpx_highbd_10_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x32_c =
+ vpx_highbd_10_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x64_c =
+ vpx_highbd_10_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x32_c =
+ vpx_highbd_10_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x16_c =
+ vpx_highbd_10_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x32_c =
+ vpx_highbd_10_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x16_c =
+ vpx_highbd_10_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x8_c =
+ vpx_highbd_10_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x16_c =
+ vpx_highbd_10_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x8_c =
+ vpx_highbd_10_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x4_c =
+ vpx_highbd_10_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x8_c =
+ vpx_highbd_10_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x4_c =
+ vpx_highbd_10_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x64_c =
+ vpx_highbd_12_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x32_c =
+ vpx_highbd_12_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x64_c =
+ vpx_highbd_12_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x32_c =
+ vpx_highbd_12_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x16_c =
+ vpx_highbd_12_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x32_c =
+ vpx_highbd_12_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x16_c =
+ vpx_highbd_12_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x8_c =
+ vpx_highbd_12_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x16_c =
+ vpx_highbd_12_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x8_c =
+ vpx_highbd_12_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x4_c =
+ vpx_highbd_12_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x8_c =
+ vpx_highbd_12_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x4_c =
+ vpx_highbd_12_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxHBDSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8),
+ make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8),
+ make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8),
+ make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8),
+ make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8),
+ make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8),
+ make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8),
+ make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8),
+ make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8),
+ make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8),
+ make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8),
+ make_tuple(2, 3, highbd_8_subpel_var4x8_c, 8),
+ make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8),
+ make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10),
+ make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10),
+ make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10),
+ make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10),
+ make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12),
+ make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12),
+ make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12)));
+
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c =
+ vpx_highbd_8_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c =
+ vpx_highbd_8_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c =
+ vpx_highbd_8_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c =
+ vpx_highbd_8_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c =
+ vpx_highbd_10_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c =
+ vpx_highbd_10_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c =
+ vpx_highbd_10_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c =
+ vpx_highbd_10_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c =
+ vpx_highbd_12_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c =
+ vpx_highbd_12_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c =
+ vpx_highbd_12_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c =
+ vpx_highbd_12_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VpxHBDSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8),
+ make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8),
+ make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8),
+ make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8),
+ make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8),
+ make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8),
+ make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8),
+ make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8),
+ make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8),
+ make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8),
+ make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8),
+ make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8),
+ make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8),
+ make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10),
+ make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10),
+ make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10),
+ make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10),
+ make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10),
+ make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10),
+ make_tuple(6, 6, highbd_12_subpel_avg_var64x64_c, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12),
+ make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12),
+ make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12),
+ make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12),
+ make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_MMX
@@ -935,6 +1247,19 @@
make_tuple(3, 4, variance8x16_mmx, 0),
make_tuple(3, 3, variance8x8_mmx, 0),
make_tuple(2, 2, variance4x4_mmx, 0)));
+
+const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx;
+const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx;
+const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx;
+const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx;
+const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+ MMX, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0),
+ make_tuple(4, 3, subpel_var16x8_mmx, 0),
+ make_tuple(3, 4, subpel_var8x16_mmx, 0),
+ make_tuple(3, 3, subpel_var8x8_mmx, 0),
+ make_tuple(2, 2, subpel_var4x4_mmx, 0)));
#endif // HAVE_MMX
#if HAVE_SSE2
@@ -979,6 +1304,90 @@
make_tuple(3, 2, variance8x4_sse2, 0),
make_tuple(2, 3, variance4x8_sse2, 0),
make_tuple(2, 2, variance4x4_sse2, 0)));
+
+#if CONFIG_USE_X86INC
+const SubpixVarMxNFunc subpel_variance64x64_sse2 =
+ vpx_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc subpel_variance64x32_sse2 =
+ vpx_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x64_sse2 =
+ vpx_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc subpel_variance32x32_sse2 =
+ vpx_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x16_sse2 =
+ vpx_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x32_sse2 =
+ vpx_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc subpel_variance16x16_sse2 =
+ vpx_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x8_sse2 =
+ vpx_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x16_sse2 =
+ vpx_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse;
+const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0),
+ make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+ make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+ make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+ make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+ make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+ make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+ make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+ make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+ make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+ make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+ make_tuple(2, 3, subpel_variance4x8_sse, 0),
+ make_tuple(2, 2, subpel_variance4x4_sse, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 =
+ vpx_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 =
+ vpx_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 =
+ vpx_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 =
+ vpx_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 =
+ vpx_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 =
+ vpx_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 =
+ vpx_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 =
+ vpx_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 =
+ vpx_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 =
+ vpx_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 =
+ vpx_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse =
+ vpx_sub_pixel_avg_variance4x8_sse;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse =
+ vpx_sub_pixel_avg_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0),
+ make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+ make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+ make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+ make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+ make_tuple(2, 2, subpel_avg_variance4x4_sse, 0)));
+#endif // CONFIG_USE_X86INC
+
#if CONFIG_VP9_HIGHBITDEPTH
/* TODO(debargha): This test does not support the highbd version
const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
@@ -1103,795 +1512,304 @@
make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // HAVE_SSE2
-#if CONFIG_VP8_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
-
-TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
-
-TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceHighTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
- VP9SubpelAvgVarianceHighTest;
-
-TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-const SubpixVarMxNFunc subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc subpel_variance4x8_c = vp9_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc subpel_variance8x4_c = vp9_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc subpel_variance8x8_c = vp9_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc subpel_variance8x16_c = vp9_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc subpel_variance16x8_c = vp9_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc subpel_variance16x16_c = vp9_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc subpel_variance16x32_c = vp9_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc subpel_variance32x16_c = vp9_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc subpel_variance32x32_c = vp9_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc subpel_variance32x64_c = vp9_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc subpel_variance64x32_c = vp9_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc subpel_variance64x64_c = vp9_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
- make_tuple(2, 3, subpel_variance4x8_c, 0),
- make_tuple(3, 2, subpel_variance8x4_c, 0),
- make_tuple(3, 3, subpel_variance8x8_c, 0),
- make_tuple(3, 4, subpel_variance8x16_c, 0),
- make_tuple(4, 3, subpel_variance16x8_c, 0),
- make_tuple(4, 4, subpel_variance16x16_c, 0),
- make_tuple(4, 5, subpel_variance16x32_c, 0),
- make_tuple(5, 4, subpel_variance32x16_c, 0),
- make_tuple(5, 5, subpel_variance32x32_c, 0),
- make_tuple(5, 6, subpel_variance32x64_c, 0),
- make_tuple(6, 5, subpel_variance64x32_c, 0),
- make_tuple(6, 6, subpel_variance64x64_c, 0)));
-
-#if CONFIG_VP8_ENCODER
-const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
- vp8_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_c = vp8_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_c = vp8_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_c = vp8_sub_pixel_variance4x4_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_c, 0),
- make_tuple(3, 3, vp8_subpel_variance8x8_c, 0),
- make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
- make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif // CONFIG_VP8_ENCODER
-
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
- vp9_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
- vp9_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
- vp9_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
- vp9_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
- vp9_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
- vp9_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
- vp9_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
- vp9_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
- vp9_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
- vp9_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
- vp9_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
- vp9_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
- vp9_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_c, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
- make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_10_subpel_variance4x4_c =
- vp9_highbd_10_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance4x8_c =
- vp9_highbd_10_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_c =
- vp9_highbd_10_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_c =
- vp9_highbd_10_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_c =
- vp9_highbd_10_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_c =
- vp9_highbd_10_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_c =
- vp9_highbd_10_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_c =
- vp9_highbd_10_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_c =
- vp9_highbd_10_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_c =
- vp9_highbd_10_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_c =
- vp9_highbd_10_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_c =
- vp9_highbd_10_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_c =
- vp9_highbd_10_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x4_c =
- vp9_highbd_12_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x8_c =
- vp9_highbd_12_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_c =
- vp9_highbd_12_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_c =
- vp9_highbd_12_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_c =
- vp9_highbd_12_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_c =
- vp9_highbd_12_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_c =
- vp9_highbd_12_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_c =
- vp9_highbd_12_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_c =
- vp9_highbd_12_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_c =
- vp9_highbd_12_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_c =
- vp9_highbd_12_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_c =
- vp9_highbd_12_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x64_c =
- vp9_highbd_12_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x4_c =
- vp9_highbd_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x8_c =
- vp9_highbd_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x4_c =
- vp9_highbd_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_c =
- vp9_highbd_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_c =
- vp9_highbd_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_c =
- vp9_highbd_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_c =
- vp9_highbd_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_c =
- vp9_highbd_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_c =
- vp9_highbd_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_c =
- vp9_highbd_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_c =
- vp9_highbd_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_c =
- vp9_highbd_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_c =
- vp9_highbd_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelVarianceHighTest,
- ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10),
- make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10),
- make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
- make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
- make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
- make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
- make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
- make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
- make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
- make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
- make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
- make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
- make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
- make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12),
- make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
- make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
- make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
- make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
- make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
- make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
- make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
- make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
- make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
- make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
- make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
- make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
- make_tuple(2, 2, highbd_subpel_variance4x4_c, 8),
- make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
- make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
- make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
- make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
- make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
- make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
- make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
- make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
- make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
- make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
- make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
- make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c =
- vp9_highbd_10_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
- vp9_highbd_10_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
- vp9_highbd_10_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
- vp9_highbd_10_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
- vp9_highbd_10_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
- vp9_highbd_10_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
- vp9_highbd_10_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
- vp9_highbd_10_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
- vp9_highbd_10_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
- vp9_highbd_10_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
- vp9_highbd_10_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
- vp9_highbd_10_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
- vp9_highbd_10_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c =
- vp9_highbd_12_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
- vp9_highbd_12_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
- vp9_highbd_12_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
- vp9_highbd_12_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
- vp9_highbd_12_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
- vp9_highbd_12_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
- vp9_highbd_12_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
- vp9_highbd_12_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
- vp9_highbd_12_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
- vp9_highbd_12_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
- vp9_highbd_12_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
- vp9_highbd_12_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
- vp9_highbd_12_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c =
- vp9_highbd_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
- vp9_highbd_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
- vp9_highbd_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
- vp9_highbd_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
- vp9_highbd_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
- vp9_highbd_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
- vp9_highbd_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
- vp9_highbd_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
- vp9_highbd_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
- vp9_highbd_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
- vp9_highbd_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
- vp9_highbd_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
- vp9_highbd_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
- C, VP9SubpelAvgVarianceHighTest,
- ::testing::Values(
- make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10),
- make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10),
- make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
- make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
- make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
- make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
- make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
- make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
- make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
- make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
- make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
- make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
- make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
- make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12),
- make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
- make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
- make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
- make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
- make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
- make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
- make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
- make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
- make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
- make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
- make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
- make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
- make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8),
- make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
- make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
- make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
- make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
- make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
- make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
- make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
- make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
- make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
- make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
- make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
- make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // CONFIG_VP9_ENCODER
-
-#if CONFIG_VP8_ENCODER
-#if HAVE_MMX
-const SubpixVarMxNFunc subpel_variance16x16_mmx =
- vp8_sub_pixel_variance16x16_mmx;
-const SubpixVarMxNFunc subpel_variance16x8_mmx = vp8_sub_pixel_variance16x8_mmx;
-const SubpixVarMxNFunc subpel_variance8x16_mmx = vp8_sub_pixel_variance8x16_mmx;
-const SubpixVarMxNFunc subpel_variance8x8_mmx = vp8_sub_pixel_variance8x8_mmx;
-const SubpixVarMxNFunc subpel_variance4x4_mmx = vp8_sub_pixel_variance4x4_mmx;
-INSTANTIATE_TEST_CASE_P(
- MMX, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 4, subpel_variance16x16_mmx, 0),
- make_tuple(4, 3, subpel_variance16x8_mmx, 0),
- make_tuple(3, 4, subpel_variance8x16_mmx, 0),
- make_tuple(3, 3, subpel_variance8x8_mmx, 0),
- make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
-#endif // HAVE_MMX
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-#if HAVE_SSE2
#if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse;
-const SubpixVarMxNFunc subpel_variance4x8_sse = vp9_sub_pixel_variance4x8_sse;
-const SubpixVarMxNFunc subpel_variance8x4_sse2 = vp9_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc subpel_variance8x8_sse2 = vp9_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc subpel_variance8x16_sse2 =
- vp9_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x8_sse2 =
- vp9_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc subpel_variance16x16_sse2 =
- vp9_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x32_sse2 =
- vp9_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x16_sse2 =
- vp9_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc subpel_variance32x32_sse2 =
- vp9_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x64_sse2 =
- vp9_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc subpel_variance64x32_sse2 =
- vp9_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc subpel_variance64x64_sse2 =
- vp9_sub_pixel_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0),
- make_tuple(2, 3, subpel_variance4x8_sse, 0),
- make_tuple(3, 2, subpel_variance8x4_sse2, 0),
- make_tuple(3, 3, subpel_variance8x8_sse2, 0),
- make_tuple(3, 4, subpel_variance8x16_sse2, 0),
- make_tuple(4, 3, subpel_variance16x8_sse2, 0),
- make_tuple(4, 4, subpel_variance16x16_sse2, 0),
- make_tuple(4, 5, subpel_variance16x32_sse2, 0),
- make_tuple(5, 4, subpel_variance32x16_sse2, 0),
- make_tuple(5, 5, subpel_variance32x32_sse2, 0),
- make_tuple(5, 6, subpel_variance32x64_sse2, 0),
- make_tuple(6, 5, subpel_variance64x32_sse2, 0),
- make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
- vp9_sub_pixel_avg_variance4x4_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
- vp9_sub_pixel_avg_variance4x8_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
- vp9_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
- vp9_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
- vp9_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
- vp9_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
- vp9_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
- vp9_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
- vp9_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
- vp9_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
- vp9_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
- vp9_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
- vp9_sub_pixel_avg_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
- make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_subpel_variance8x4_sse2 =
- vp9_highbd_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_sse2 =
- vp9_highbd_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_sse2 =
- vp9_highbd_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_sse2 =
- vp9_highbd_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_sse2 =
- vp9_highbd_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_sse2 =
- vp9_highbd_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_sse2 =
- vp9_highbd_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_sse2 =
- vp9_highbd_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_sse2 =
- vp9_highbd_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_sse2 =
- vp9_highbd_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_sse2 =
- vp9_highbd_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
- vp9_highbd_10_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
- vp9_highbd_10_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
- vp9_highbd_10_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
- vp9_highbd_10_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
- vp9_highbd_10_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
- vp9_highbd_10_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
- vp9_highbd_10_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
- vp9_highbd_10_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
- vp9_highbd_10_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
- vp9_highbd_10_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
- vp9_highbd_10_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
- vp9_highbd_12_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
- vp9_highbd_12_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
- vp9_highbd_12_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
- vp9_highbd_12_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
- vp9_highbd_12_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
- vp9_highbd_12_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
- vp9_highbd_12_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
- vp9_highbd_12_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
- vp9_highbd_12_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
- vp9_highbd_12_sub_pixel_variance64x32_sse2;
const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 =
- vp9_highbd_12_sub_pixel_variance64x64_sse2;
+ vpx_highbd_12_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
+ vpx_highbd_12_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
+ vpx_highbd_12_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
+ vpx_highbd_10_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
+ vpx_highbd_10_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
+ vpx_highbd_10_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 =
+ vpx_highbd_8_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 =
+ vpx_highbd_8_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 =
+ vpx_highbd_8_sub_pixel_variance8x4_sse2;
INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelVarianceHighTest,
- ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
- make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
- make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
- make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
- make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
- make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
- make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
- make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
- make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
- make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
- make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
- make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
- make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
- make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
- make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
- make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
- make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
- make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
- make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
- make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+ SSE2, VpxHBDSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
- make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
- make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8),
- make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
- make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
- make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
- make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
- make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
- make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
- make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
- make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
- make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
- make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
- vp9_highbd_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
- vp9_highbd_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
- vp9_highbd_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
- vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
- vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+ make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+ make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+ make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+ make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+ make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+ make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+ make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+ make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8),
+ make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8),
+ make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8),
+ make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8),
+ make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8),
+ make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8),
+ make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8),
+ make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8),
+ make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8),
+ make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8),
+ make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8)));
+
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 =
+ vpx_highbd_8_sub_pixel_avg_variance8x4_sse2;
INSTANTIATE_TEST_CASE_P(
- SSE2, VP9SubpelAvgVarianceHighTest,
+ SSE2, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
- make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
- make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
- make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
- make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
- make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
- make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
- make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
- make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
- make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
- make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
- make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
- make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
- make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
- make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
- make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
- make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
- make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
- make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
- make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
- make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
- make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
- make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8),
- make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
- make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
- make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
- make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
- make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
- make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
- make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
- make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
- make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
- make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+ make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+ make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+ make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+ make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+ make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+ make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+ make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+ make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+ make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+ make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+ make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+ make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+ make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+ make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+ make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+ make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+ make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+ make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+ make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+ make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+ make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+ make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8),
+ make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8),
+ make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8),
+ make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8),
+ make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8),
+ make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8),
+ make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8),
+ make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8),
+ make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8),
+ make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8),
+ make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8)));
#endif // CONFIG_USE_X86INC
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
-#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSE2
-const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
- vp8_sub_pixel_variance16x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_sse2 =
- vp8_sub_pixel_variance16x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_sse2 =
- vp8_sub_pixel_variance8x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_sse2 =
- vp8_sub_pixel_variance8x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_sse2 =
- vp8_sub_pixel_variance4x4_wmt;
-INSTANTIATE_TEST_CASE_P(
- SSE2, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_sse2, 0),
- make_tuple(3, 3, vp8_subpel_variance8x8_sse2, 0),
- make_tuple(3, 4, vp8_subpel_variance8x16_sse2, 0),
- make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
-#endif // HAVE_SSE2
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
#if HAVE_SSSE3
#if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
- vp9_sub_pixel_variance4x4_ssse3;
-const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
- vp9_sub_pixel_variance4x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
- vp9_sub_pixel_variance8x4_ssse3;
-const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
- vp9_sub_pixel_variance8x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
- vp9_sub_pixel_variance8x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
- vp9_sub_pixel_variance16x8_ssse3;
-const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
- vp9_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
- vp9_sub_pixel_variance16x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
- vp9_sub_pixel_variance32x16_ssse3;
-const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
- vp9_sub_pixel_variance32x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
- vp9_sub_pixel_variance32x64_ssse3;
-const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
- vp9_sub_pixel_variance64x32_ssse3;
const SubpixVarMxNFunc subpel_variance64x64_ssse3 =
- vp9_sub_pixel_variance64x64_ssse3;
+ vpx_sub_pixel_variance64x64_ssse3;
+const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
+ vpx_sub_pixel_variance64x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
+ vpx_sub_pixel_variance32x64_ssse3;
+const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
+ vpx_sub_pixel_variance32x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
+ vpx_sub_pixel_variance32x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
+ vpx_sub_pixel_variance16x32_ssse3;
+const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
+ vpx_sub_pixel_variance16x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
+ vpx_sub_pixel_variance16x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
+ vpx_sub_pixel_variance8x16_ssse3;
+const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
+ vpx_sub_pixel_variance8x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
+ vpx_sub_pixel_variance8x4_ssse3;
+const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
+ vpx_sub_pixel_variance4x8_ssse3;
+const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
+ vpx_sub_pixel_variance4x4_ssse3;
INSTANTIATE_TEST_CASE_P(
- SSSE3, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
- make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
- make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
- make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
- make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
- make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
- make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
- make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
- make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
- make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
- make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+ SSSE3, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0),
make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
- make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
- vp9_sub_pixel_avg_variance4x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
- vp9_sub_pixel_avg_variance4x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
- vp9_sub_pixel_avg_variance8x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
- vp9_sub_pixel_avg_variance8x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
- vp9_sub_pixel_avg_variance8x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
- vp9_sub_pixel_avg_variance16x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
- vp9_sub_pixel_avg_variance16x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
- vp9_sub_pixel_avg_variance16x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
- vp9_sub_pixel_avg_variance32x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
- vp9_sub_pixel_avg_variance32x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
- vp9_sub_pixel_avg_variance32x64_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
- vp9_sub_pixel_avg_variance64x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
- vp9_sub_pixel_avg_variance64x64_ssse3;
+ make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+ make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+ make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+ make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+ make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+ make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+ make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+ make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+ make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+ make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+ make_tuple(2, 2, subpel_variance4x4_ssse3, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 =
+ vpx_sub_pixel_avg_variance64x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 =
+ vpx_sub_pixel_avg_variance64x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 =
+ vpx_sub_pixel_avg_variance32x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 =
+ vpx_sub_pixel_avg_variance32x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 =
+ vpx_sub_pixel_avg_variance32x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 =
+ vpx_sub_pixel_avg_variance16x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 =
+ vpx_sub_pixel_avg_variance16x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 =
+ vpx_sub_pixel_avg_variance16x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 =
+ vpx_sub_pixel_avg_variance8x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 =
+ vpx_sub_pixel_avg_variance8x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 =
+ vpx_sub_pixel_avg_variance8x4_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 =
+ vpx_sub_pixel_avg_variance4x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 =
+ vpx_sub_pixel_avg_variance4x4_ssse3;
INSTANTIATE_TEST_CASE_P(
- SSSE3, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
- make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
- make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
- make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
- make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
- make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
- make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
- make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
- make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
- make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
- make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+ SSSE3, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0),
make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+ make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+ make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+ make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+ make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+ make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+ make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+ make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+ make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+ make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+ make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0)));
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
-#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSSE3
-const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
- vp8_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_ssse3 =
- vp8_sub_pixel_variance16x8_ssse3;
-INSTANTIATE_TEST_CASE_P(
- SSSE3, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
- make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
-#endif // HAVE_SSSE3
-#endif // CONFIG_VP8_ENCODER
-
#if HAVE_AVX2
const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
@@ -1910,39 +1828,46 @@
make_tuple(5, 4, variance32x16_avx2, 0),
make_tuple(4, 4, variance16x16_avx2, 0)));
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance32x32_avx2 =
- vp9_sub_pixel_variance32x32_avx2;
const SubpixVarMxNFunc subpel_variance64x64_avx2 =
- vp9_sub_pixel_variance64x64_avx2;
+ vpx_sub_pixel_variance64x64_avx2;
+const SubpixVarMxNFunc subpel_variance32x32_avx2 =
+ vpx_sub_pixel_variance32x32_avx2;
INSTANTIATE_TEST_CASE_P(
- AVX2, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
- make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
+ AVX2, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0),
+ make_tuple(5, 5, subpel_variance32x32_avx2, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
- vp9_sub_pixel_avg_variance32x32_avx2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
- vp9_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 =
+ vpx_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 =
+ vpx_sub_pixel_avg_variance32x32_avx2;
INSTANTIATE_TEST_CASE_P(
- AVX2, VP9SubpelAvgVarianceTest,
- ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
- make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
-#endif // CONFIG_VP9_ENCODER
+ AVX2, VpxSubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0),
+ make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0)));
#endif // HAVE_AVX2
-#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+ ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+ MEDIA, VpxVarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+ make_tuple(3, 3, variance8x8_media, 0)));
+
const SubpixVarMxNFunc subpel_variance16x16_media =
- vp8_sub_pixel_variance16x16_armv6;
+ vpx_sub_pixel_variance16x16_media;
const SubpixVarMxNFunc subpel_variance8x8_media =
- vp8_sub_pixel_variance8x8_armv6;
+ vpx_sub_pixel_variance8x8_media;
INSTANTIATE_TEST_CASE_P(
- MEDIA, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
- make_tuple(4, 4, subpel_variance16x16_media, 0)));
+ MEDIA, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0),
+ make_tuple(3, 3, subpel_variance8x8_media, 0)));
#endif // HAVE_MEDIA
-#endif // CONFIG_VP8_ENCODER
#if HAVE_NEON
const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,46 +1897,21 @@
make_tuple(3, 4, variance8x16_neon, 0),
make_tuple(3, 3, variance8x8_neon, 0)));
-#if CONFIG_VP8_ENCODER
-#if HAVE_NEON_ASM
-const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
- vp8_sub_pixel_variance16x16_neon;
-INSTANTIATE_TEST_CASE_P(
- NEON, VP8SubpelVarianceTest,
- ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
-#endif // HAVE_NEON_ASM
-#endif // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
-const SubpixVarMxNFunc subpel_variance16x16_neon =
- vp9_sub_pixel_variance16x16_neon;
-const SubpixVarMxNFunc subpel_variance32x32_neon =
- vp9_sub_pixel_variance32x32_neon;
const SubpixVarMxNFunc subpel_variance64x64_neon =
- vp9_sub_pixel_variance64x64_neon;
+ vpx_sub_pixel_variance64x64_neon;
+const SubpixVarMxNFunc subpel_variance32x32_neon =
+ vpx_sub_pixel_variance32x32_neon;
+const SubpixVarMxNFunc subpel_variance16x16_neon =
+ vpx_sub_pixel_variance16x16_neon;
+const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon;
INSTANTIATE_TEST_CASE_P(
- NEON, VP9SubpelVarianceTest,
- ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
- make_tuple(4, 4, subpel_variance16x16_neon, 0),
+ NEON, VpxSubpelVarianceTest,
+ ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0),
make_tuple(5, 5, subpel_variance32x32_neon, 0),
- make_tuple(6, 6, subpel_variance64x64_neon, 0)));
-#endif // CONFIG_VP9_ENCODER
+ make_tuple(4, 4, subpel_variance16x16_neon, 0),
+ make_tuple(3, 3, subpel_variance8x8_neon, 0)));
#endif // HAVE_NEON
-#if HAVE_MEDIA
-const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
-INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
- ::testing::Values(make_tuple(4, 4, mse16x16_media)));
-
-const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
-const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
-INSTANTIATE_TEST_CASE_P(
- MEDIA, VpxVarianceTest,
- ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
- make_tuple(3, 3, variance8x8_media, 0)));
-#endif // HAVE_MEDIA
-
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_msa));
@@ -2059,29 +1959,28 @@
make_tuple(2, 3, variance4x8_msa, 0),
make_tuple(2, 2, variance4x4_msa, 0)));
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa;
-const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa;
-const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa;
-const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa;
-const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa;
-const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa;
+const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa;
+const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa;
+const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa;
+const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa;
+const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa;
+const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa;
const SubpixVarMxNFunc subpel_variance16x16_msa =
- vp9_sub_pixel_variance16x16_msa;
+ vpx_sub_pixel_variance16x16_msa;
const SubpixVarMxNFunc subpel_variance16x32_msa =
- vp9_sub_pixel_variance16x32_msa;
+ vpx_sub_pixel_variance16x32_msa;
const SubpixVarMxNFunc subpel_variance32x16_msa =
- vp9_sub_pixel_variance32x16_msa;
+ vpx_sub_pixel_variance32x16_msa;
const SubpixVarMxNFunc subpel_variance32x32_msa =
- vp9_sub_pixel_variance32x32_msa;
+ vpx_sub_pixel_variance32x32_msa;
const SubpixVarMxNFunc subpel_variance32x64_msa =
- vp9_sub_pixel_variance32x64_msa;
+ vpx_sub_pixel_variance32x64_msa;
const SubpixVarMxNFunc subpel_variance64x32_msa =
- vp9_sub_pixel_variance64x32_msa;
+ vpx_sub_pixel_variance64x32_msa;
const SubpixVarMxNFunc subpel_variance64x64_msa =
- vp9_sub_pixel_variance64x64_msa;
+ vpx_sub_pixel_variance64x64_msa;
INSTANTIATE_TEST_CASE_P(
- MSA, VP9SubpelVarianceTest,
+ MSA, VpxSubpelVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
make_tuple(2, 3, subpel_variance4x8_msa, 0),
make_tuple(3, 2, subpel_variance8x4_msa, 0),
@@ -2095,6 +1994,5 @@
make_tuple(5, 6, subpel_variance32x64_msa, 0),
make_tuple(6, 5, subpel_variance64x32_msa, 0),
make_tuple(6, 6, subpel_variance64x64_msa, 0)));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_MSA
} // namespace
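The instantiation tuples above encode the block dimensions as log2 values (6, 6 maps to 64x64, 2, 2 to 4x4), and the trailing integer appears to select the bit depth: 0 for the plain 8-bit tests, 8/10/12 for the high-bit-depth variants. The sketch below (plain C, hypothetical names and accumulator values, not part of the test harness) shows how such a tuple decodes and how the log2 dimensions give the normalization shift that turns an accumulated SSE and sum into a variance, matching the sse - ((sum * sum) >> 8) step in the 16x16 assembly removed below.

/* Illustrative sketch only: decode a make_tuple(log2_w, log2_h, fn, bd)
 * entry and apply the variance normalization implied by the block size. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int log2_w = 6, log2_h = 6;   /* e.g. make_tuple(6, 6, ..., 12) */
  const int w = 1 << log2_w;          /* 64 */
  const int h = 1 << log2_h;          /* 64 */
  /* Hypothetical accumulators for a w*h block of pixel differences. */
  const int64_t sum = 1234;
  const uint64_t sse = 98765;
  const uint64_t var = sse - (uint64_t)((sum * sum) >> (log2_w + log2_h));
  printf("%dx%d block: variance = %llu\n", w, h, (unsigned long long)var);
  return 0;
}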
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_h_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
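Expressed as scalar C, the half-pixel horizontal variance that the armv6 routine above implemented averages each source pixel with its right-hand neighbour (rounding with (a + b + 1) >> 1), diffs the result against the reference, accumulates sum and SSE, and returns sse - ((sum * sum) >> 8) for the 16x16 block. The sketch below reconstructs that behaviour from the assembly comments; it is illustrative only, not the library's implementation.

#include <stdint.h>

/* Scalar sketch of the 16x16 half-pixel-horizontal variance above.
 * Like the assembly, it reads one pixel to the right of the block when
 * interpolating the last column. Illustrative only. */
static unsigned int halfpix_h_variance16x16(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            unsigned int *sse) {
  int sum = 0;
  unsigned int sse_acc = 0;
  int x, y;
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) {
      const int half = (src[x] + src[x + 1] + 1) >> 1;  /* (a + b + 1) >> 1 */
      const int diff = half - ref[x];
      sum += diff;
      sse_acc += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sse_acc;
  return sse_acc - (unsigned int)(((int64_t)sum * sum) >> 8);
}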
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_hv_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; pointer to pixels on the next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load source pixels a, row N
- ldr r6, [r0, #1] ; load source pixels b, row N
- ldr r5, [r9, #0] ; load source pixels c, row N+1
- ldr r7, [r9, #1] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #0] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load source pixels a, row N
- ldr r6, [r0, #5] ; load source pixels b, row N
- ldr r5, [r9, #4] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #5] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #4] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load source pixels a, row N
- ldr r6, [r0, #9] ; load source pixels b, row N
- ldr r5, [r9, #8] ; load source pixels c, row N+1
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- ldr r7, [r9, #9] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #8] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load source pixels a, row N
- ldr r6, [r0, #13] ; load source pixels b, row N
- ldr r5, [r9, #12] ; load source pixels c, row N+1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
- ldr r7, [r9, #13] ; load source pixels d, row N+1
-
- ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
- ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
- mvn r7, r7
- uhsub8 r5, r5, r7
- eor r5, r5, r10
- ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
- mvn r5, r5
- uhsub8 r4, r4, r5
- ldr r5, [r2, #12] ; load 4 ref pixels
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- subs r12, r12, #1
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
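The horizontal-and-vertical routine above chains two of these half-pixel steps per output pixel: interpolate horizontally on row N and on row N+1, then interpolate vertically between the two results, each step rounding with (p + q + 1) >> 1. Reconstructed from the assembly comments, the per-pixel filter is simply:

#include <stdint.h>

/* Per-pixel core of the half-pixel h+v filter above (sketch only). */
static uint8_t halfpix_hv(uint8_t a, uint8_t b,   /* row N   */
                          uint8_t c, uint8_t d) { /* row N+1 */
  const int x = (a + b + 1) >> 1;      /* horizontal, row N   */
  const int y = (c + d + 1) >> 1;      /* horizontal, row N+1 */
  return (uint8_t)((x + y + 1) >> 1);  /* vertical            */
}

Note that this two-step rounding matches the assembly exactly; it can differ by one from a single (a + b + c + d + 2) >> 2 average.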
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int source_stride
-; r2 unsigned char *ref_ptr
-; r3 int recon_stride
-; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_v_armv6| PROC
-
- stmfd sp!, {r4-r12, lr}
-
- pld [r0, r1, lsl #0]
- pld [r2, r3, lsl #0]
-
- mov r8, #0 ; initialize sum = 0
- ldr r10, c80808080
- mov r11, #0 ; initialize sse = 0
- mov r12, #16 ; set loop counter to 16 (=block height)
- mov lr, #0 ; constant zero
-loop
- add r9, r0, r1 ; set src pointer to next row
- ; 1st 4 pixels
- ldr r4, [r0, #0] ; load 4 src pixels
- ldr r6, [r9, #0] ; load 4 src pixels from next row
- ldr r5, [r2, #0] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- usub8 r6, r4, r5 ; calculate difference
- pld [r0, r1, lsl #1]
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- pld [r2, r3, lsl #1]
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
- ; calculate total sum
- adds r8, r8, r4 ; add positive differences to sum
- subs r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 2nd 4 pixels
- ldr r4, [r0, #4] ; load 4 src pixels
- ldr r6, [r9, #4] ; load 4 src pixels from next row
- ldr r5, [r2, #4] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 3rd 4 pixels
- ldr r4, [r0, #8] ; load 4 src pixels
- ldr r6, [r9, #8] ; load 4 src pixels from next row
- ldr r5, [r2, #8] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
-
- ; 4th 4 pixels
- ldr r4, [r0, #12] ; load 4 src pixels
- ldr r6, [r9, #12] ; load 4 src pixels from next row
- ldr r5, [r2, #12] ; load 4 ref pixels
-
- ; bilinear interpolation
- mvn r6, r6
- uhsub8 r4, r4, r6
- eor r4, r4, r10
-
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
- usub8 r6, r4, r5 ; calculate difference
- add r0, r0, r1 ; set src_ptr to next row
- sel r7, r6, lr ; select bytes with positive difference
- usub8 r6, r5, r4 ; calculate difference with reversed operands
- add r2, r2, r3 ; set dst_ptr to next row
- sel r6, r6, lr ; select bytes with negative difference
-
- ; calculate partial sums
- usad8 r4, r7, lr ; calculate sum of positive differences
- usad8 r5, r6, lr ; calculate sum of negative differences
- orr r6, r6, r7 ; differences of all 4 pixels
-
- ; calculate total sum
- add r8, r8, r4 ; add positive differences to sum
- sub r8, r8, r5 ; subtract negative differences from sum
-
- ; calculate sse
- uxtb16 r5, r6 ; byte (two pixels) to halfwords
- uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
- smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
- smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
-
-
- subs r12, r12, #1
-
- bne loop
-
- ; return stuff
- ldr r6, [sp, #40] ; get address of sse
- mul r0, r8, r8 ; sum * sum
- str r11, [r6] ; store sse
- sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
- ldmfd sp!, {r4-r12, pc}
-
- ENDP
-
-c80808080
- DCD 0x80808080
-
- END
-
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ /dev/null
@@ -1,1017 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-static const uint8_t bilinear_taps_coeff[8][2] = {
- {128, 0},
- {112, 16},
- { 96, 32},
- { 80, 48},
- { 64, 64},
- { 48, 80},
- { 32, 96},
- { 16, 112}
-};
-
-unsigned int vp8_sub_pixel_variance16x16_neon_func(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse) {
- int i;
- DECLARE_ALIGNED(16, unsigned char, tmp[528]);
- unsigned char *tmpp;
- unsigned char *tmpp2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
- uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
- uint8x8_t d19u8, d20u8, d21u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
- uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
- uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
- uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- tmpp2 = tmp + 272;
- tmpp = tmp;
- if (xoffset == 0) { // secondpass_bfilter16x16_only
- d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
- q11u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- for (i = 4; i > 0; i--) {
- q12u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q13u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q14u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
- q15u8 = vld1q_u8(src_ptr);
- src_ptr += src_pixels_per_line;
-
- __builtin_prefetch(src_ptr);
- __builtin_prefetch(src_ptr + src_pixels_per_line);
- __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
- q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
- q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
- q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
- q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
- q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
- q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
- q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
- q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
- q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
- q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
- q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
- q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
- q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
- q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
- q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
- q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
- d2u8 = vqrshrn_n_u16(q1u16, 7);
- d3u8 = vqrshrn_n_u16(q2u16, 7);
- d4u8 = vqrshrn_n_u16(q3u16, 7);
- d5u8 = vqrshrn_n_u16(q4u16, 7);
- d6u8 = vqrshrn_n_u16(q5u16, 7);
- d7u8 = vqrshrn_n_u16(q6u16, 7);
- d8u8 = vqrshrn_n_u16(q7u16, 7);
- d9u8 = vqrshrn_n_u16(q8u16, 7);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q2u8 = vcombine_u8(d4u8, d5u8);
- q3u8 = vcombine_u8(d6u8, d7u8);
- q4u8 = vcombine_u8(d8u8, d9u8);
-
- q11u8 = q15u8;
-
- vst1q_u8((uint8_t *)tmpp2, q1u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q2u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q3u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q4u8);
- tmpp2 += 16;
- }
- } else if (yoffset == 0) { // firstpass_bfilter16x16_only
- d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
- for (i = 4; i > 0 ; i--) {
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- __builtin_prefetch(src_ptr);
- __builtin_prefetch(src_ptr + src_pixels_per_line);
- __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
- q7u16 = vmull_u8(d2u8, d0u8);
- q8u16 = vmull_u8(d3u8, d0u8);
- q9u16 = vmull_u8(d5u8, d0u8);
- q10u16 = vmull_u8(d6u8, d0u8);
- q11u16 = vmull_u8(d8u8, d0u8);
- q12u16 = vmull_u8(d9u8, d0u8);
- q13u16 = vmull_u8(d11u8, d0u8);
- q14u16 = vmull_u8(d12u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
- d11u8 = vext_u8(d11u8, d12u8, 1);
-
- q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
- q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
- d12u8 = vext_u8(d12u8, d13u8, 1);
-
- q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
- q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
- d14u8 = vqrshrn_n_u16(q7u16, 7);
- d15u8 = vqrshrn_n_u16(q8u16, 7);
- d16u8 = vqrshrn_n_u16(q9u16, 7);
- d17u8 = vqrshrn_n_u16(q10u16, 7);
- d18u8 = vqrshrn_n_u16(q11u16, 7);
- d19u8 = vqrshrn_n_u16(q12u16, 7);
- d20u8 = vqrshrn_n_u16(q13u16, 7);
- d21u8 = vqrshrn_n_u16(q14u16, 7);
-
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
- q10u8 = vcombine_u8(d20u8, d21u8);
-
- vst1q_u8((uint8_t *)tmpp2, q7u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q8u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q9u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q10u8);
- tmpp2 += 16;
- }
- } else {
- d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- // First Pass: output_height lines x output_width columns (17x16)
- for (i = 3; i > 0; i--) {
- q7u16 = vmull_u8(d2u8, d0u8);
- q8u16 = vmull_u8(d3u8, d0u8);
- q9u16 = vmull_u8(d5u8, d0u8);
- q10u16 = vmull_u8(d6u8, d0u8);
- q11u16 = vmull_u8(d8u8, d0u8);
- q12u16 = vmull_u8(d9u8, d0u8);
- q13u16 = vmull_u8(d11u8, d0u8);
- q14u16 = vmull_u8(d12u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
- d11u8 = vext_u8(d11u8, d12u8, 1);
-
- q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
- q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
- d12u8 = vext_u8(d12u8, d13u8, 1);
-
- q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
- q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
- d14u8 = vqrshrn_n_u16(q7u16, 7);
- d15u8 = vqrshrn_n_u16(q8u16, 7);
- d16u8 = vqrshrn_n_u16(q9u16, 7);
- d17u8 = vqrshrn_n_u16(q10u16, 7);
- d18u8 = vqrshrn_n_u16(q11u16, 7);
- d19u8 = vqrshrn_n_u16(q12u16, 7);
- d20u8 = vqrshrn_n_u16(q13u16, 7);
- d21u8 = vqrshrn_n_u16(q14u16, 7);
-
- d2u8 = vld1_u8(src_ptr);
- d3u8 = vld1_u8(src_ptr + 8);
- d4u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr);
- d6u8 = vld1_u8(src_ptr + 8);
- d7u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d8u8 = vld1_u8(src_ptr);
- d9u8 = vld1_u8(src_ptr + 8);
- d10u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
- d11u8 = vld1_u8(src_ptr);
- d12u8 = vld1_u8(src_ptr + 8);
- d13u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
- q10u8 = vcombine_u8(d20u8, d21u8);
-
- vst1q_u8((uint8_t *)tmpp, q7u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q8u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q9u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q10u8);
- tmpp += 16;
- }
-
-    // First-pass filtering for the remaining 5 lines
- d14u8 = vld1_u8(src_ptr);
- d15u8 = vld1_u8(src_ptr + 8);
- d16u8 = vld1_u8(src_ptr + 16);
- src_ptr += src_pixels_per_line;
-
- q9u16 = vmull_u8(d2u8, d0u8);
- q10u16 = vmull_u8(d3u8, d0u8);
- q11u16 = vmull_u8(d5u8, d0u8);
- q12u16 = vmull_u8(d6u8, d0u8);
- q13u16 = vmull_u8(d8u8, d0u8);
- q14u16 = vmull_u8(d9u8, d0u8);
-
- d2u8 = vext_u8(d2u8, d3u8, 1);
- d5u8 = vext_u8(d5u8, d6u8, 1);
- d8u8 = vext_u8(d8u8, d9u8, 1);
-
- q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
- q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
- q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
-
- d3u8 = vext_u8(d3u8, d4u8, 1);
- d6u8 = vext_u8(d6u8, d7u8, 1);
- d9u8 = vext_u8(d9u8, d10u8, 1);
-
- q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
- q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
- q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
-
- q1u16 = vmull_u8(d11u8, d0u8);
- q2u16 = vmull_u8(d12u8, d0u8);
- q3u16 = vmull_u8(d14u8, d0u8);
- q4u16 = vmull_u8(d15u8, d0u8);
-
- d11u8 = vext_u8(d11u8, d12u8, 1);
- d14u8 = vext_u8(d14u8, d15u8, 1);
-
- q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
- q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
-
- d12u8 = vext_u8(d12u8, d13u8, 1);
- d15u8 = vext_u8(d15u8, d16u8, 1);
-
- q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
- q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
-
- d10u8 = vqrshrn_n_u16(q9u16, 7);
- d11u8 = vqrshrn_n_u16(q10u16, 7);
- d12u8 = vqrshrn_n_u16(q11u16, 7);
- d13u8 = vqrshrn_n_u16(q12u16, 7);
- d14u8 = vqrshrn_n_u16(q13u16, 7);
- d15u8 = vqrshrn_n_u16(q14u16, 7);
- d16u8 = vqrshrn_n_u16(q1u16, 7);
- d17u8 = vqrshrn_n_u16(q2u16, 7);
- d18u8 = vqrshrn_n_u16(q3u16, 7);
- d19u8 = vqrshrn_n_u16(q4u16, 7);
-
- q5u8 = vcombine_u8(d10u8, d11u8);
- q6u8 = vcombine_u8(d12u8, d13u8);
- q7u8 = vcombine_u8(d14u8, d15u8);
- q8u8 = vcombine_u8(d16u8, d17u8);
- q9u8 = vcombine_u8(d18u8, d19u8);
-
- vst1q_u8((uint8_t *)tmpp, q5u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q6u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q7u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q8u8);
- tmpp += 16;
- vst1q_u8((uint8_t *)tmpp, q9u8);
-
- // secondpass_filter
- d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
- d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
- tmpp = tmp;
- tmpp2 = tmpp + 272;
- q11u8 = vld1q_u8(tmpp);
- tmpp += 16;
- for (i = 4; i > 0; i--) {
- q12u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q13u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q14u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q15u8 = vld1q_u8(tmpp);
- tmpp += 16;
-
- q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
- q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
- q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
- q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
- q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
- q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
- q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
- q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
- q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
- q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
- q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
- q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
- q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
- q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
- q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
- q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
- d2u8 = vqrshrn_n_u16(q1u16, 7);
- d3u8 = vqrshrn_n_u16(q2u16, 7);
- d4u8 = vqrshrn_n_u16(q3u16, 7);
- d5u8 = vqrshrn_n_u16(q4u16, 7);
- d6u8 = vqrshrn_n_u16(q5u16, 7);
- d7u8 = vqrshrn_n_u16(q6u16, 7);
- d8u8 = vqrshrn_n_u16(q7u16, 7);
- d9u8 = vqrshrn_n_u16(q8u16, 7);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q2u8 = vcombine_u8(d4u8, d5u8);
- q3u8 = vcombine_u8(d6u8, d7u8);
- q4u8 = vcombine_u8(d8u8, d9u8);
-
- q11u8 = q15u8;
-
- vst1q_u8((uint8_t *)tmpp2, q1u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q2u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q3u8);
- tmpp2 += 16;
- vst1q_u8((uint8_t *)tmpp2, q4u8);
- tmpp2 += 16;
- }
- }
-
- // sub_pixel_variance16x16_neon
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- tmpp = tmp + 272;
- for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop
- q0u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q1u8 = vld1q_u8(tmpp);
- tmpp += 16;
- q2u8 = vld1q_u8(dst_ptr);
- dst_ptr += dst_pixels_per_line;
- q3u8 = vld1q_u8(dst_ptr);
- dst_ptr += dst_pixels_per_line;
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
-
- q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
- q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
- q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
- q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
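Each of the removed 16x16 kernels above ends with the same reduction: the per-lane difference sums are folded into a scalar sum and SSE, and the return value is SSE minus sum squared over the pixel count. Because the block holds 16 * 16 = 256 pixels, that division shows up as the right shift by 8 in vshr_n_u32. A minimal scalar sketch of that final step, with an illustrative name that is not part of the patch:

    #include <stdint.h>

    /* Final reduction used by the removed 16x16 kernels:
     * variance = SSE - sum^2 / (16 * 16), and 16 * 16 == 256 == 1 << 8. */
    static unsigned int variance_from_sums_16x16(int32_t sum, uint32_t sse) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> 8);
    }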
-
-unsigned int vp8_variance_halfpixvar16x16_h_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
- uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
- uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q0u8 = vld1q_u8(src_ptr);
- q1u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(src_ptr);
- q3u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- q5u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- q7u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
-
- q11u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q12u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q13u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q14u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q1u8 = vextq_u8(q0u8, q1u8, 1);
- q3u8 = vextq_u8(q2u8, q3u8, 1);
- q5u8 = vextq_u8(q4u8, q5u8, 1);
- q7u8 = vextq_u8(q6u8, q7u8, 1);
-
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- q1u8 = vrhaddq_u8(q2u8, q3u8);
- q2u8 = vrhaddq_u8(q4u8, q5u8);
- q3u8 = vrhaddq_u8(q6u8, q7u8);
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d6u8 = vget_low_u8(q3u8);
- d7u8 = vget_high_u8(q3u8);
-
- q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
- q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
- q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
- q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
- q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
- q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
- q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
- q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
-
- d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
- d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
- q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
- q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
- d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
- d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
- q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
- q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
- d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
- d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
- q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
- q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
- d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
- d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
- q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
- q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
- q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
- q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
- q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
- q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
- d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
- d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
- q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
- q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
- d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
- d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
- q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
- q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
- uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q2u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q15u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
-
- q1u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q5u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q7u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q2u8);
- q2u8 = vrhaddq_u8(q2u8, q4u8);
- q4u8 = vrhaddq_u8(q4u8, q6u8);
- q6u8 = vrhaddq_u8(q6u8, q15u8);
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d8u8 = vget_low_u8(q4u8);
- d9u8 = vget_high_u8(q4u8);
- d12u8 = vget_low_u8(q6u8);
- d13u8 = vget_high_u8(q6u8);
-
- q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
- q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
- q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
- q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
- q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
- q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
- q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
- q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
- q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
- q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
- q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
- q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
- d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
- d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
- q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
- q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
- d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
- d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
- q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
- q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
-
- q0u8 = q15u8;
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_neon(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
- int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64, d2s64, d3s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
- uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
- int32x4_t q13s32, q14s32, q15s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q13s32 = vdupq_n_s32(0);
- q14s32 = vdupq_n_s32(0);
- q15s32 = vdupq_n_s32(0);
-
- q0u8 = vld1q_u8(src_ptr);
- q1u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q1u8 = vextq_u8(q0u8, q1u8, 1);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
- q2u8 = vld1q_u8(src_ptr);
- q3u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q4u8 = vld1q_u8(src_ptr);
- q5u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q6u8 = vld1q_u8(src_ptr);
- q7u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
- q8u8 = vld1q_u8(src_ptr);
- q9u8 = vld1q_u8(src_ptr + 16);
- src_ptr += source_stride;
-
- q3u8 = vextq_u8(q2u8, q3u8, 1);
- q5u8 = vextq_u8(q4u8, q5u8, 1);
- q7u8 = vextq_u8(q6u8, q7u8, 1);
- q9u8 = vextq_u8(q8u8, q9u8, 1);
-
- q1u8 = vrhaddq_u8(q2u8, q3u8);
- q2u8 = vrhaddq_u8(q4u8, q5u8);
- q3u8 = vrhaddq_u8(q6u8, q7u8);
- q4u8 = vrhaddq_u8(q8u8, q9u8);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
- q1u8 = vrhaddq_u8(q1u8, q2u8);
- q2u8 = vrhaddq_u8(q2u8, q3u8);
- q3u8 = vrhaddq_u8(q3u8, q4u8);
-
- q5u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q6u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q7u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q8u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- d0u8 = vget_low_u8(q0u8);
- d1u8 = vget_high_u8(q0u8);
- d2u8 = vget_low_u8(q1u8);
- d3u8 = vget_high_u8(q1u8);
- d4u8 = vget_low_u8(q2u8);
- d5u8 = vget_high_u8(q2u8);
- d6u8 = vget_low_u8(q3u8);
- d7u8 = vget_high_u8(q3u8);
-
- q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
- q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
- q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
- q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
- q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
- q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
- q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
- q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
- q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
- q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
- q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
- q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
- q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
- q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
- q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
- q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
-
- d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
- d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
- q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
- q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
-
- d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
- d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
- q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
- q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
-
- d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
- d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
- q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
- q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
-
- d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
- d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
- q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
- q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
- q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
-
- q0u8 = q4u8;
- }
-
- q15s32 = vaddq_s32(q14s32, q15s32);
- q0s64 = vpaddlq_s32(q13s32);
- q1s64 = vpaddlq_s32(q15s32);
-
- d0s64 = vget_low_s64(q0s64);
- d1s64 = vget_high_s64(q0s64);
- d2s64 = vget_low_s64(q1s64);
- d3s64 = vget_high_s64(q1s64);
- d0s64 = vadd_s64(d0s64, d1s64);
- d1s64 = vadd_s64(d2s64, d3s64);
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
- vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-#define FILTER_BITS 7
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
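The two helpers above collapse a vector of partial sums into a single scalar through pairwise widening adds. A scalar equivalent of the 16-bit variant, shown only to make the lane arithmetic explicit (the name is illustrative):

    #include <stdint.h>

    /* Scalar counterpart of horizontal_add_s16x8(): add all eight int16 lanes. */
    static int horizontal_add_s16x8_ref(const int16_t v[8]) {
      int sum = 0;
      for (int i = 0; i < 8; ++i) sum += v[i];
      return sum;
    }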
-
-static void variance_neon_w8(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo = vmlal_s16(v_sse_lo,
- vget_low_s16(sv_diff),
- vget_low_s16(sv_diff));
- v_sse_hi = vmlal_s16(v_sse_hi,
- vget_high_s16(sv_diff),
- vget_high_s16(sv_diff));
- }
- a += a_stride;
- b += b_stride;
- }
-
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (8 * 8));
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vpx_filter) {
- const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
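var_filter_block2d_bil_w8() above applies a 2-tap bilinear filter with round-to-nearest at FILTER_BITS precision; pixel_step selects horizontal filtering (1) or vertical filtering (the stride). A sketch of the per-pixel arithmetic, assuming taps that sum to 1 << FILTER_BITS (128) as the VP8/vpx bilinear tables do (the helper name is illustrative):

    #include <stdint.h>

    /* out = (f0 * p0 + f1 * p1 + 64) >> 7; the rounding matches
     * vrshrn_n_u16(..., FILTER_BITS) in the NEON loop above, and the
     * result fits in 8 bits when f0 + f1 == 128. */
    static uint8_t bilinear_tap_ref(uint8_t p0, uint8_t p1, int f0, int f1) {
      return (uint8_t)((f0 * p0 + f1 * p1 + (1 << 6)) >> 7);
    }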
-
-unsigned int vp8_sub_pixel_variance8x8_neon(
- const unsigned char *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const unsigned char *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
- if (xoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
- 8, bilinear_taps_coeff[yoffset]);
- } else if (yoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
- 9, 8,
- bilinear_taps_coeff[xoffset]);
- } else {
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
- 9, 8,
- bilinear_taps_coeff[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
- 8, bilinear_taps_coeff[yoffset]);
- }
- return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
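On the buffer sizes above: the 9 * 8 scratch arrays cover the two-pass case, where the horizontal pass must produce output_height + 1 = 9 rows so that the vertical pass has a row below each of the 8 output rows. The same arithmetic explains the removed 16x16 NEON kernel earlier in this file: its first pass emits 17 rows of 16 pixels (the "17x16" comment), and 17 * 16 = 272 is exactly the tmp + 272 offset used for its second-pass output area.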
--- a/vp8/common/arm/variance_arm.c
+++ /dev/null
@@ -1,137 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vp8/common/variance.h"
-#include "vp8/common/filter.h"
-
-// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
-#if CONFIG_VP8_ENCODER
-
-#if HAVE_MEDIA
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-unsigned int vp8_sub_pixel_variance8x8_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned short first_pass[10*8];
- unsigned char second_pass[8*8];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 9, 8, HFilter);
- vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 8, 8, 8, VFilter);
-
- return vpx_variance8x8_media(second_pass, 8, dst_ptr,
- dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_armv6
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned short first_pass[36*16];
- unsigned char second_pass[20*16];
- const short *HFilter, *VFilter;
- unsigned int var;
-
- if (xoffset == 4 && yoffset == 0)
- {
- var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, sse);
- }
- else
- {
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
- src_pixels_per_line,
- 17, 16, HFilter);
- vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
- 16, 16, 16, VFilter);
-
- var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
- dst_pixels_per_line, sse);
- }
- return var;
-}
-
-#endif // HAVE_MEDIA
-
-
-#if HAVE_NEON
-
-extern unsigned int vp8_sub_pixel_variance16x16_neon_func
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-);
-
-unsigned int vp8_sub_pixel_variance16x16_neon
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- if (xoffset == 4 && yoffset == 0)
- return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == 0 && yoffset == 4)
- return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else if (xoffset == 4 && yoffset == 4)
- return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
- else
- return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif // HAVE_NEON
-#endif // CONFIG_VP8_ENCODER
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -20,7 +20,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -238,47 +238,6 @@
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
-# Sub-pixel Variance
-#
-add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
-$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
-$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
-$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
-$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
-$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
-$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
-$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
-$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
-$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
-$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
-$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-
-#
# Encoder functions below this point.
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
--- a/vp8/common/variance.h
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_COMMON_VARIANCE_H_
-#define VP8_COMMON_VARIANCE_H_
-
-#include "vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vpx_sad_fn_t)(
- const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride);
-
-typedef void (*vp8_copy32xn_fn_t)(
- const unsigned char *src_ptr,
- int source_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- int n);
-
-typedef void (*vpx_sad_multi_fn_t)(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_array,
- int ref_stride,
- unsigned int *sad_array);
-
-typedef void (*vpx_sad_multi_d_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char * const ref_array[],
- int ref_stride,
- unsigned int *sad_array
- );
-
-typedef unsigned int (*vpx_variance_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int *sse
- );
-
-typedef unsigned int (*vp8_subpixvariance_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const unsigned char *ref_ptr,
- int Refstride,
- unsigned int *sse
- );
-
-typedef struct variance_vtable
-{
- vpx_sad_fn_t sdf;
- vpx_variance_fn_t vf;
- vp8_subpixvariance_fn_t svf;
- vpx_variance_fn_t svf_halfpix_h;
- vpx_variance_fn_t svf_halfpix_v;
- vpx_variance_fn_t svf_halfpix_hv;
- vpx_sad_multi_fn_t sdx3f;
- vpx_sad_multi_fn_t sdx8f;
- vpx_sad_multi_d_fn_t sdx4df;
-#if ARCH_X86 || ARCH_X86_64
- vp8_copy32xn_fn_t copymem;
-#endif
-} vp8_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP8_COMMON_VARIANCE_H_
--- a/vp8/common/variance_c.c
+++ /dev/null
@@ -1,337 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "filter.h"
-#include "variance.h"
-
-/* This is a bad idea.
- * ctz = count trailing zeros */
-static int ctz(int a) {
- int b = 0;
- while (a != 1) {
- a >>= 1;
- b++;
- }
- return b;
-}
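For the power-of-two block dimensions used here, ctz(n) equals log2(n), so the shift in variance() below is log2(w) + log2(h) = log2(w * h). For a 16x16 block, ctz(16) + ctz(16) = 4 + 4 = 8, and sum * sum >> 8 is sum^2 / 256, which is the same reduction the removed SIMD kernels perform with an explicit shift by 8.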
-
-static unsigned int variance(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- int w,
- int h,
- unsigned int *sse)
-{
- int i, j;
- int diff, sum;
-
- sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++)
- {
- for (j = 0; j < w; j++)
- {
- diff = src_ptr[j] - ref_ptr[j];
- sum += diff;
- *sse += diff * diff;
- }
-
- src_ptr += source_stride;
- ref_ptr += recon_stride;
- }
-
- return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : UINT8 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement first-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
- * Two filter taps should sum to VP8_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-)
-{
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++)
- {
- for (j = 0; j < output_width; j++)
- {
- /* Apply bilinear filter */
- output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
- ((int)src_ptr[pixel_step] * vp8_filter[1]) +
- (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : INT32 *src_ptr : Pointer to source block.
- * UINT32 src_pixels_per_line : Stride of input block.
- * UINT32 pixel_step : Offset between filter input samples (see notes).
- * UINT32 output_height : Input block height.
- * UINT32 output_width : Input block width.
- * INT32 *vp8_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
- * either horizontal or vertical direction to produce the
- * filtered output block. Used to implement second-pass
- * of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP8_FILTER_WEIGHT.
- * pixel_step defines whether the filter is applied
- * horizontally (pixel_step=1) or vertically (pixel_step=stride).
- * It defines the offset required to move from one input
- * to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass
-(
- const unsigned short *src_ptr,
- unsigned char *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-)
-{
- unsigned int i, j;
- int Temp;
-
- for (i = 0; i < output_height; i++)
- {
- for (j = 0; j < output_width; j++)
- {
- /* Apply filter */
- Temp = ((int)src_ptr[0] * vp8_filter[0]) +
- ((int)src_ptr[pixel_step] * vp8_filter[1]) +
- (VP8_FILTER_WEIGHT / 2);
- output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
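A worked example of the tap arithmetic, assuming the standard VP8 bilinear table in filter.h in which the two taps for any offset sum to VP8_FILTER_WEIGHT (128): at the half-pel offset 4 used by the vp8_variance_halfpixvar16x16_* wrappers below, the taps are {64, 64}, so each output is (64*a + 64*b + 64) >> 7, i.e. the rounded average of the two neighbouring pixels. That is why the removed half-pixel NEON and SSE2 kernels could use vrhaddq_u8 and pavgb instead of a general multiply-and-shift.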
-
-
-unsigned int vp8_sub_pixel_variance4x4_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-    unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- /* First filter 1d Horizontal */
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-    /* Now filter vertically */
- var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
-
- return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
- return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
- return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance16x8_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
- return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance8x16_c
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-    unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
- unsigned char temp2[20*16];
- const short *HFilter, *VFilter;
-
-
- HFilter = vp8_bilinear_filters[xoffset];
- VFilter = vp8_bilinear_filters[yoffset];
-
-
- var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
- var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
- return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
-}
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,972 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-;void vp8_filter_block2d_bil_var_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared;;
-;
-;)
-global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp8_filter_block2d_bil_var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- pxor xmm6, xmm6 ;
- pxor xmm7, xmm7 ;
-
- lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
- movdqa xmm4, XMMWORD PTR [rsi]
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_sse2_sp_only
-
- shl rax, 5 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_sse2_fp_only
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
- movdqa xmm5, xmm1
-
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movdqa xmm3, xmm5 ;
- movdqa xmm5, xmm1 ;
-
- pmullw xmm3, [rdx] ;
- pmullw xmm1, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_var_sse2_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je filter_block2d_bil_var_sse2_full_pixel
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
- movq xmm3, QWORD PTR [rsi] ;
- punpcklbw xmm3, xmm0 ;
- movdqa xmm5, xmm3
-
- pmullw xmm1, [rdx] ;
- pmullw xmm3, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- movdqa xmm1, xmm5 ;
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_sp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0 ;
-
-filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movq xmm2, QWORD PTR [rdi] ;
- punpcklbw xmm2, xmm0 ;
-
- psubw xmm1, xmm2 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_full_pixel_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_fp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(7) ; sum
- mov rdi, arg(8) ; sumsquared
-
- movd [rsi], mm2 ; xsum
- movd [rdi], mm4 ; xxsum
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_horiz_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7, xmm7                  ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
-%else
- add rsi, r8
-%endif
-
-vp8_half_horiz_vert_variance8x_h_1:
-
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm2, QWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_horiz_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor            xmm7, xmm7                  ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
- movdqu xmm5, XMMWORD PTR [rsi]
- movdqu xmm3, XMMWORD PTR [rsi+1]
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
-
- lea rsi, [rsi + rax]
-
-vp8_half_horiz_vert_variance16x_h_1:
- movdqu xmm1, XMMWORD PTR [rsi] ;
- movdqu xmm2, XMMWORD PTR [rsi+1] ;
- pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
- pavgb xmm5, xmm1 ; xmm = vertical average of the above
-
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm4, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- psubw xmm5, xmm3 ; xmm5 -= xmm3
-
- movq xmm3, QWORD PTR [rdi+8]
- punpcklbw xmm3, xmm0
- psubw xmm4, xmm3
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm1 ; save xmm1 for use on the next row
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_vert_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
-vp8_half_vert_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
-
- sub rcx, 1 ;
- jnz vp8_half_vert_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_vert_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr
-
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- movdqu xmm5, XMMWORD PTR [rsi]
- lea rsi, [rsi + rax ]
- pxor xmm0, xmm0
-
-vp8_half_vert_variance16x_h_1:
- movdqu xmm3, XMMWORD PTR [rsi]
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm4, xmm5
- punpcklbw xmm5, xmm0
- punpckhbw xmm4, xmm0
-
- movq xmm2, QWORD PTR [rdi]
- punpcklbw xmm2, xmm0
- psubw xmm5, xmm2
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
- psubw xmm4, xmm2
-
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm4
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm4, xmm4
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm4
-
- movdqa xmm5, xmm3
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1
- jnz vp8_half_vert_variance16x_h_1
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_half_horiz_variance8x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance8x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor xmm0, xmm0 ;
-vp8_half_horiz_variance8x_h_1:
- movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
- movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
- add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
- add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
-%else
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz vp8_half_horiz_variance8x_h_1 ;
-
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(5) ; sum
- mov rdi, arg(6) ; sumsquared
-
- movd [rsi], mm2 ;
- movd [rdi], mm4 ;
-
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_half_horiz_variance16x_h_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance16x_h_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6 ; error accumulator
-        pxor xmm7, xmm7                     ; sse accumulator
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
-
- pxor xmm0, xmm0 ;
-
-vp8_half_horiz_variance16x_h_1:
- movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
- movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
-
- pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
- movdqa xmm1, xmm5
- punpcklbw xmm5, xmm0 ; xmm5 = words of above
- punpckhbw xmm1, xmm0
-
- movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
- punpcklbw xmm3, xmm0 ; xmm3 = words of above
- movq xmm2, QWORD PTR [rdi+8]
- punpcklbw xmm2, xmm0
-
- psubw xmm5, xmm3 ; xmm5 -= xmm3
- psubw xmm1, xmm2
- paddw xmm6, xmm5 ; xmm6 += accumulated column differences
- paddw xmm6, xmm1
- pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
- pmaddwd xmm1, xmm1
- paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
- paddd xmm7, xmm1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
-
- sub rcx, 1 ;
- jnz vp8_half_horiz_variance16x_h_1 ;
-
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(5) ;[Sum]
- mov rdi, arg(6) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-vp8_bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
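
The three SSE2 half-pel kernel families removed above (half-horiz, half-vert and half-horiz-vert, in 8- and 16-wide flavours) share one structure: form the half-pel prediction with pavgb, subtract the source row, and accumulate the signed differences and their squares into the sum/sse outputs that the C wrappers later turn into a variance. As a rough scalar model of the horizontal-plus-vertical 8-wide case (the function below is illustrative only, not part of the tree, and it ignores the 16-bit wrap-around of the paddw accumulator):

/* Rough scalar model of vp8_half_horiz_vert_variance8x_h_sse2: each
 * prediction pixel is the rounded average of the horizontal half-pel
 * samples of the current and next reference rows, which is exactly what
 * the two pavgb stages compute. */
static void half_horiz_vert_variance8_model(const unsigned char *ref, int ref_stride,
                                            const unsigned char *src, int src_stride,
                                            unsigned int height,
                                            int *sum, unsigned int *sse) {
  int s = 0;
  unsigned int ss = 0;
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < 8; ++j) {
      const int a = (ref[j] + ref[j + 1] + 1) >> 1;                           /* pavgb, row i   */
      const int b = (ref[ref_stride + j] + ref[ref_stride + j + 1] + 1) >> 1; /* pavgb, row i+1 */
      const int diff = ((a + b + 1) >> 1) - src[j];                           /* vertical pavgb, then subtract source */
      s += diff;
      ss += (unsigned int)(diff * diff);
    }
    ref += ref_stride;
    src += src_stride;
  }
  *sum = s;
  *sse = ss;
}
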
--- a/vp8/common/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,364 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-
-;void vp8_filter_block2d_bil_var_ssse3
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for pmaddubsw is signed bytes, we must calculate the zero offset separately.
-global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp8_filter_block2d_bil_var_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .filter_block2d_bil_var_ssse3_sp_only
-
- shl rax, 4 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je .filter_block2d_bil_var_ssse3_fp_only
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi+1]
- movdqa xmm2, xmm0
-
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, [rax]
- pmaddubsw xmm2, [rax]
-
- paddw xmm0, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm0, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- packuswb xmm0, xmm2
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
- packuswb xmm1, xmm3
-
- movdqa xmm2, xmm0
- movdqa xmm0, xmm1
- movdqa xmm3, xmm2
-
- punpcklbw xmm2, xmm1
- punpckhbw xmm3, xmm1
- pmaddubsw xmm2, [rdx]
- pmaddubsw xmm3, [rdx]
-
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm2, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm1, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm1, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm2, xmm1
- psubw xmm3, xmm5
- paddw xmm6, xmm2
- paddw xmm6, xmm3
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm2
- paddd xmm7, xmm3
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rsi, [rsi + r8]
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_var_ssse3_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je .filter_block2d_bil_var_ssse3_full_pixel
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqa xmm0, xmm1
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- lea rsi, [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
- movdqu xmm3, XMMWORD PTR [rsi]
- movdqa xmm2, xmm1
- movdqa xmm0, xmm3
-
- punpcklbw xmm1, xmm3
- punpckhbw xmm2, xmm3
- pmaddubsw xmm1, [rdx]
- pmaddubsw xmm2, [rdx]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- movq xmm3, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm3, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm3
- psubw xmm2, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- movdqa xmm1, xmm0
- lea rsi, [rsi + rax] ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_sp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- movq xmm2, QWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
-
- movq xmm3, QWORD PTR [rdi]
- punpcklbw xmm3, xmm0
- movq xmm4, QWORD PTR [rdi+8]
- punpcklbw xmm4, xmm0
-
- psubw xmm1, xmm3
- psubw xmm2, xmm4
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rdx] ;src_pixels_per_line
- sub rcx, 1
- jnz .filter_block2d_bil_full_pixel_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm2, XMMWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm2, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm2
- psubw xmm3, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm3
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm1
- paddd xmm7, xmm3
-
- lea rsi, [rsi + rdx]
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_fp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(7) ;[Sum]
- mov rdi, arg(8) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-vp8_bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 112, 16
- times 8 db 96, 32
- times 8 db 80, 48
- times 8 db 64, 64
- times 8 db 48, 80
- times 8 db 32, 96
- times 8 db 16, 112
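
The SSSE3 kernel removed above stores each bilinear filter as interleaved byte pairs (128 - 16*offset, 16*offset) so a single pmaddubsw evaluates the two-tap filter, followed by the +64 rounding constant (xmm_bi_rd) and the 7-bit shift (xmm_filter_shift); the offset-0 pair is special-cased because 128 does not fit in a signed byte. A scalar equivalent of one tap, as a sketch (the helper name is mine):

/* Scalar equivalent of one bilinear tap in the SSSE3 kernel: coefficients
 * follow vp8_bilinear_filters_ssse3, i.e. (128 - 16*offset, 16*offset),
 * with +64 rounding and a shift of xmm_filter_shift (7). */
static unsigned char bilinear_tap(unsigned char p0, unsigned char p1, int offset) {
  const int c0 = 128 - (offset << 4);  /* e.g. offset 3 -> 80 */
  const int c1 = offset << 4;          /* e.g. offset 3 -> 48 */
  return (unsigned char)((c0 * p0 + c1 * p1 + 64) >> 7);
}
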
--- a/vp8/common/x86/variance_ssse3.c
+++ /dev/null
@@ -1,157 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-
-extern void vp8_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_ssse3
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance16x16_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0;
- unsigned int xxsum0;
-
- /* note we could avoid these if statements if the calling function
- * just called the appropriate functions inside.
- */
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-)
-{
- int xsum0;
- unsigned int xxsum0;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
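
Both wrappers in the file above finish by converting the accumulated (sum, sse) pair into a variance with the identity variance = SSE - sum^2 / N, where N is the block's pixel count and the division is a shift: >> 8 for the 256-pixel 16x16 block and >> 7 for the 128-pixel 16x8 block. A minimal sketch of that closing step (illustrative helper, not from the tree):

/* The closing step of every wrapper in this file: block variance from the
 * accumulated sum of differences and sum of squared differences.
 * log2_pixels is 8 for 16x16, 7 for 16x8/8x16, 6 for 8x8, 4 for 4x4. */
static unsigned int variance_from_sums(int sum, unsigned int sse_acc,
                                       unsigned int *sse, int log2_pixels) {
  *sse = sse_acc;
  return sse_acc - (((unsigned int)sum * sum) >> log2_pixels);
}
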
--- a/vp8/common/x86/vp8_variance_impl_mmx.asm
+++ /dev/null
@@ -1,353 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift 7
-
-;void vp8_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;void vp8_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
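
Unlike the SSSE3 path, the MMX kernels removed above evaluate the bilinear filter with 16-bit coefficients (pmullw against the HFilter/VFilter tables, each tap stored replicated four times), again with +64 rounding (mmx_bi_rd) and a 7-bit shift (mmx_filter_shift). A sketch of one output pixel of the two-pass filter, with the taps passed as plain ints and the intermediate byte packing ignored (illustrative only, not the production kernel):

/* One output pixel of the two-pass bilinear filter implemented by
 * vp8_filter_block2d_bil_var_mmx: a horizontal 2-tap filter on two adjacent
 * reference rows, then a vertical 2-tap filter on the two intermediate
 * results, each pass rounded by 64 and shifted by 7. */
static int bilinear_2d_pixel(const unsigned char *row0, const unsigned char *row1,
                             int j, int h0, int h1, int v0, int v1) {
  const int fp0 = (h0 * row0[j] + h1 * row0[j + 1] + 64) >> 7;  /* first pass, row i   */
  const int fp1 = (h0 * row1[j] + h1 * row1[j + 1] + 64) >> 7;  /* first pass, row i+1 */
  return (v0 * fp0 + v1 * fp1 + 64) >> 7;                       /* second pass         */
}
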
--- a/vp8/common/x86/vp8_variance_mmx.c
+++ /dev/null
@@ -1,244 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *filter
-);
-extern void filter_block1d_v6_mmx
-(
- const short *src_ptr,
- unsigned char *output_ptr,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *filter
-);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp8_sub_pixel_variance16x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
- ref_ptr, recon_stride, sse);
-}
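
For 16-wide blocks, the MMX wrappers above simply run the 8-wide bilinear kernel twice, once per half, and add the partial results before applying the variance identity; the halfpixvar helpers are thin wrappers that re-enter the sub-pixel path with offsets of 4. A sketch of the combining step (illustrative helper, not from the tree):

/* How vp8_sub_pixel_variance16x16_mmx combines its two 8-wide passes:
 * partial sums and SSEs add directly, and the variance is then taken over
 * the full 256-pixel block, hence the >> 8. */
static unsigned int combine_16x16_halves(int sum_left, unsigned int sse_left,
                                         int sum_right, unsigned int sse_right,
                                         unsigned int *sse) {
  const int sum = sum_left + sum_right;
  *sse = sse_left + sse_right;
  return *sse - (((unsigned int)sum * sum) >> 8);
}
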
--- a/vp8/common/x86/vp8_variance_sse2.c
+++ /dev/null
@@ -1,403 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
-void vp8_filter_block2d_bil_var_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_vert_variance8x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-void vp8_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
- vp8_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- /* note we could avoid these if statements if the calling function
- * just called the appropriate functions inside.
- */
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0
- );
-
- vp8_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum1, &xxsum1
- );
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
-
- vp8_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum1, &xxsum1);
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == 4 && yoffset == 0)
- {
- vp8_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else if (xoffset == 0 && yoffset == 4)
- {
- vp8_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else if (xoffset == 4 && yoffset == 4)
- {
- vp8_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- }
- else
- {
- vp8_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
- vp8_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-{
- int xsum0;
- unsigned int xxsum0;
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
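
The SSE2 wrappers above special-case offsets of 4 because the offsets index the 8-entry bilinear tables, i.e. they are eighths of a pixel, and the half-pel tap pair (64, 64) with +64 rounding reduces exactly to (p0 + p1 + 1) >> 1, which is what pavgb computes; every other non-zero offset falls through to the coefficient-based filter. A small sketch of that predicate (illustrative only):

/* True for the three offset combinations the wrappers route to the cheaper
 * pavgb-based half-pixel kernels: (4,0), (0,4) and (4,4) in 1/8-pel units. */
static int is_half_pel_only(int xoffset, int yoffset) {
  return (xoffset == 0 || xoffset == 4) &&
         (yoffset == 0 || yoffset == 4) &&
         (xoffset | yoffset) != 0;
}
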
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -16,7 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "vp8/common/systemdependent.h"
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -13,7 +13,7 @@
#define VP8_ENCODER_MCOMP_H_
#include "block.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2132,10 +2132,10 @@
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
- cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
+ cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
@@ -2142,7 +2142,7 @@
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
- cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
@@ -2152,7 +2152,7 @@
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
- cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
@@ -2162,7 +2162,7 @@
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
- cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
@@ -2172,7 +2172,7 @@
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
- cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -22,7 +22,7 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -29,7 +29,7 @@
#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
@@ -500,9 +500,9 @@
if ((mv_row | mv_col) & 7)
{
- vp8_sub_pixel_variance8x8(uptr, pre_stride,
+ vpx_sub_pixel_variance8x8(uptr, pre_stride,
mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
- vp8_sub_pixel_variance8x8(vptr, pre_stride,
+ vpx_sub_pixel_variance8x8(vptr, pre_stride,
mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -63,8 +63,6 @@
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-yes += common/variance_c.c
-VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@@ -86,8 +84,6 @@
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
@@ -96,12 +92,8 @@
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -129,7 +121,6 @@
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
@@ -149,9 +140,6 @@
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
@@ -170,6 +158,5 @@
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -9,7 +9,7 @@
*/
#include <string.h>
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];
--- a/vp9/common/mips/msa/vp9_idct_msa.h
+++ b/vp9/common/mips/msa/vp9_idct_msa.h
@@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
--- a/vp9/common/mips/msa/vp9_intra_predict_msa.c
+++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
out0 = __msa_subs_u_h(out0, in0); \
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vp9/common/mips/msa/vp9_loopfilter_msa.h
@@ -11,7 +11,7 @@
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ /dev/null
@@ -1,1885 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
-})
-#endif // (__mips == 64)
-
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#define LW(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__ ( \
- "ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__ ( \
- "uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r" (val_m) \
- : [psrc_m] "m" (*psrc_m) \
- ); \
- \
- val_m; \
-})
-#else // !(__mips == 64)
-#define LD(psrc) ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
-})
-#endif // (__mips == 64)
-
-#define SH(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
-#define SD(val, pdst) { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
-}
-#endif // (__mips_isa_rev >= 6)
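
For readers unfamiliar with the pre-R6 fallback above: on 32-bit targets the 64-bit LD() is assembled from two 32-bit loads, with the first word placed in the low half. A minimal scalar sketch of that composition (the function name is hypothetical and not part of the header, with memcpy standing in for the LW loads):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical scalar model of the 32-bit LD() path: two 32-bit loads are
 * combined as (val1 << 32) | val0, i.e. the word at psrc fills the low half
 * and the word at psrc + 4 fills the high half. */
static uint64_t load_doubleword_scalar(const uint8_t *psrc) {
  uint32_t val0, val1;
  memcpy(&val0, psrc, sizeof(val0));      /* stands in for LW(psrc)     */
  memcpy(&val1, psrc + 4, sizeof(val1));  /* stands in for LW(psrc + 4) */
  return ((uint64_t)val1 << 32) | (uint64_t)val0;
}

int main(void) {
  const uint8_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  printf("0x%016llx\n", (unsigned long long)load_doubleword_scalar(buf));
  return 0;
}

On a little-endian target the result equals a plain 64-bit load of the same eight bytes.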
-
-/* Description : Load 4 words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1, out2, out3
- Details : Load word in 'out0' from (psrc)
- Load word in 'out1' from (psrc + stride)
- Load word in 'out2' from (psrc + 2 * stride)
- Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) { \
- out0 = LW((psrc)); \
- out1 = LW((psrc) + stride); \
- out2 = LW((psrc) + 2 * stride); \
- out3 = LW((psrc) + 3 * stride); \
-}
-
-/* Description : Load double words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load double word in 'out0' from (psrc)
- Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) { \
- out0 = LD((psrc)); \
- out1 = LD((psrc) + stride); \
-}
-#define LD4(psrc, stride, out0, out1, out2, out3) { \
- LD2((psrc), stride, out0, out1); \
- LD2((psrc) + 2 * stride, stride, out2, out3); \
-}
-
-/* Description : Store 4 words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store word from 'in0' to (pdst)
- Store word from 'in1' to (pdst + stride)
- Store word from 'in2' to (pdst + 2 * stride)
- Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) { \
- SW(in0, (pdst)) \
- SW(in1, (pdst) + stride); \
- SW(in2, (pdst) + 2 * stride); \
- SW(in3, (pdst) + 3 * stride); \
-}
-
-/* Description : Store 4 double words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
-}
-
-/* Description : Load vectors with 16 byte elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Load 16 byte elements in 'out0' from (psrc)
- Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
-}
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
-}
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
-}
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6) { \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
-}
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load 8 halfword elements in 'out0' from (psrc)
- Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
-}
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-}
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
-}
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- LD_H8(RTYPE, (psrc), stride, \
- out0, out1, out2, out3, out4, out5, out6, out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
- out8, out9, out10, out11, out12, out13, out14, out15); \
-}
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
- data into 4 vectors (Each vector with 4 signed halfwords)
- Arguments : Input - psrc
- Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
-}
-
-/* Description : Load 2 vectors of signed word elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
-}
-
-/* Description : Store vectors of 16 byte elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 16 byte elements from 'in0' to (pdst)
- Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
-}
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- pdst, stride) { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
-}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-}
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
-}
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 word elements from 'in0' to (pdst)
- Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
-}
-
-/* Description : Store 2x4 byte block to destination memory from input vector
- Arguments : Inputs - in, stidx, pdst, stride
- Details : Index 'stidx' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst)
- Index 'stidx+1' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + stride)
- Index 'stidx+2' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 2 * stride)
- Index 'stidx+3' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) { \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
-}
-
-/* Description : Store 4x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 word element from 'in' vector is copied to the GP
- register and stored to (pdst)
- Index 1 word element from 'in' vector is copied to the GP
- register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride) { \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in, 0); \
- out1_m = __msa_copy_u_w((v4i32)in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
-}
-
-/* Description : Store 4x4 byte block to destination memory from input vector
- Arguments : Inputs - in0, in1, pdst, stride
- Details : 'Idx0' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst)
- 'Idx1' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + stride)
- 'Idx2' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 2 * stride)
- 'Idx3' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
-}
-#define ST4x8_UB(in0, in1, pdst, stride) { \
- uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
-}
-
-/* Description : Store 8x1 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst) { \
- uint64_t out0_m; \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- SD(out0_m, pdst); \
-}
-
-/* Description : Store 8x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in' vector is copied to the
- GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) { \
- uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- out1_m = __msa_copy_u_d((v2i64)in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
-}
-
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
-}
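
The ST2x4 / ST4x4 / ST8x4 helpers above all follow the same pattern: extract row-sized pieces from a vector into GP registers and store them to a strided destination. A rough scalar equivalent of the 4x4 case, with a hypothetical function name and memcpy standing in for the copy/SW steps:

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar model of ST4x4_UB: four 32-bit rows taken from a
 * 16-byte "vector" are written to four consecutive lines of a strided
 * destination (word indices 0..3 assumed, as in the ST4x8_UB usage above). */
static void store_4x4_scalar(const uint8_t vec[16], uint8_t *dst, int stride) {
  int row;
  for (row = 0; row < 4; ++row) {
    memcpy(dst + row * stride, vec + 4 * row, 4);  /* one SW() per row */
  }
}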
-
-/* Description : Average with rounding: (in0 + in1 + 1) / 2.
- Arguments : Inputs - in0, in1, in2, in3,
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned byte element from 'in0' vector is added with
- each unsigned byte element from 'in1' vector. Then average
- with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
- out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
-}
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
-}
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
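
Per element, AVER_UB2/AVER_UB4 compute the rounded unsigned average described above. A scalar sketch (hypothetical helper, widened so the +1 cannot overflow):

#include <stdint.h>

/* Rounded unsigned average, (a + b + 1) >> 1, computed in 16-bit arithmetic. */
static uint8_t aver_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}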
-
-/* Description : Immediate number of elements to slide with zero
- Arguments : Inputs - in0, in1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
-   Details     : Byte elements from the 'zero_m' vector are slid into 'in0' by
-                 the value specified in 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
- v16i8 zero_m = { 0 }; \
- out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
-}
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
- out0, out1, out2, out3, slide_val) { \
- SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
- SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
-}
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
- Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
-   Details     : Byte elements from the 'in0_0' vector are slid into 'in1_0' by
-                 the value specified in 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
- out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
-}
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
- out0, out1, out2, slide_val) { \
- SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
-}
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
- Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0' & 'in1' are copied selectively to
- 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
- out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
-}
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
- out0, out1, out2, out3) { \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
-}
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Unsigned byte elements from 'mult0' are multiplied with
- unsigned byte elements from 'cnst0' producing a result
- twice the size of input i.e. unsigned halfword.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
- out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
-}
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
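
The dot-product macros all share the shape described in the comments above: each adjacent (even, odd) pair of input elements is multiplied by the matching pair from the constant vector and the two products are summed into one double-width lane. A scalar sketch of the unsigned byte case (hypothetical helper; the store narrows modulo 2^16, mirroring an unsigned halfword lane):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of one DOTP_UB2 output vector: 16 bytes in, 8 halfwords out. */
static void dotp_ub_scalar(const uint8_t mult[16], const uint8_t cnst[16],
                           uint16_t out[8]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out[i] = (uint16_t)(mult[2 * i] * cnst[2 * i] +
                        mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}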
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
-}
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
-}
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, \
- out0, out1, out2, out3) { \
- DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed word elements from 'mult0' are multiplied with
- signed word elements from 'cnst0' producing a result
- twice the size of input i.e. signed double word.
-                 The multiplication results of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
-}
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
-}
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
- cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
- DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
-}
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
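
DPADD_* differs from DOTP_* only in that the pairwise products are accumulated into the existing output lanes instead of overwriting them. A scalar sketch of the signed byte variant (hypothetical helper; lanes are assumed to wrap like 16-bit vector elements):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of DPADD_SB2 on one vector: dot products of adjacent
 * (even, odd) signed byte pairs are added into the accumulator lanes. */
static void dpadd_sb_scalar(const int8_t mult[16], const int8_t cnst[16],
                            int16_t acc[8]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    acc[i] = (int16_t)(acc[i] + mult[2 * i] * cnst[2 * i] +
                       mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}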
-
-/* Description : Dot product & addition of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
-}
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
- Arguments : Inputs - mult0, mult1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of input
- i.e. signed double word
-                 The multiplication results of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
- out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
- out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
-}
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
- either vector are copied to the output vector
- Arguments : Inputs - in0, in1, min_vec
- Outputs - in place operation
- Return Type - as per RTYPE
-   Details     : The minimum of the unsigned halfword element values from
-                 'in0' and 'min_vec' is written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) { \
- in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
- in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
-}
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
- MIN_UH2(RTYPE, in0, in1, min_vec); \
- MIN_UH2(RTYPE, in2, in3, min_vec); \
-}
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
- between 0 & 255
- Arguments : Input - in
- Output - out_m
- Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in) ({ \
- v8i16 max_m = __msa_ldi_h(255); \
- v8i16 out_m; \
- \
- out_m = __msa_maxi_s_h((v8i16)in, 0); \
- out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
- out_m; \
-})
-#define CLIP_SH2_0_255(in0, in1) { \
- in0 = CLIP_SH_0_255(in0); \
- in1 = CLIP_SH_0_255(in1); \
-}
-#define CLIP_SH4_0_255(in0, in1, in2, in3) { \
- CLIP_SH2_0_255(in0, in1); \
- CLIP_SH2_0_255(in2, in3); \
-}
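
The clip helpers reduce, per lane, to an ordinary clamp into the 8-bit pixel range. A scalar sketch (hypothetical helper):

#include <stdint.h>

/* Clamp a signed halfword result into the unsigned 8-bit pixel range,
 * matching the maxi_s_h(.., 0) / min_s_h(.., 255) pair used above. */
static int16_t clip_0_255(int16_t v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return v;
}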
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
- Arguments : Input - in (signed word vector)
- Output - sum_m (i32 sum)
- Return Type - signed word (GP)
- Details : 4 signed word elements of 'in' vector are added together and
- the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in) ({ \
- v2i64 res0_m, res1_m; \
- int32_t sum_m; \
- \
- res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
- res1_m = __msa_splati_d(res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
- sum_m; \
-})
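
HADD_SW_S32 is a horizontal reduction: the four word lanes are summed and the low 32 bits of the total are returned. A scalar sketch (hypothetical helper; the 64-bit intermediate mirrors the hadd_s_d step and the final cast mirrors copy_s_w):

#include <stdint.h>

static int32_t hadd_sw_s32_scalar(const int32_t v[4]) {
  const int64_t sum = (int64_t)v[0] + v[1] + v[2] + v[3];  /* widen first   */
  return (int32_t)sum;                                     /* keep low word */
}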
-
-/* Description : Horizontal addition of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is added to
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
-}
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
- HADD_UB2(RTYPE, in0, in1, out0, out1); \
- HADD_UB2(RTYPE, in2, in3, out2, out3); \
-}
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is subtracted from
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
-}
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of signed halfword vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed odd halfword element from 'in0' is subtracted from
- even signed halfword element from 'in0' (pairwise) and the
- word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
- out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
-}
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n of input vector to GPR value
- Arguments : Inputs - in0, in1, in2, in3
- Output - out
- Return Type - as per RTYPE
- Details : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-}
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
-}
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out) { \
- out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
- out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
-}
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
-}
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
-}
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
- out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
-}
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
- out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
-}
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
-}
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
-}
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
-}
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements of 'in0' and 'in1' are interleaved
- and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
-}
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3); \
- ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
- out4, out5, out6, out7); \
-}
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
-}
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
-}
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of double word elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
-}
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
-}
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements from 'in0' and 'in1' are
- interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
-}
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-}
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-}
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
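
The interleave families above differ only in element width and in whether they take the right (low) or left (high) halves of the sources. As an orientation aid, here is a scalar sketch of ILVRL_B2 on byte elements; 'a' and 'b' stand for the first and second macro operands, and the lane order shown (second operand supplies the even output lanes) is my reading of the MSA interleave instructions, so treat it as an assumption rather than a specification:

#include <stdint.h>
#include <stddef.h>

/* Assumed scalar model of ILVRL_B2(RTYPE, a, b, out_r, out_l). */
static void ilvrl_b_scalar(const uint8_t a[16], const uint8_t b[16],
                           uint8_t out_r[16], uint8_t out_l[16]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out_r[2 * i] = b[i];          /* right interleave weaves the low halves */
    out_r[2 * i + 1] = a[i];
    out_l[2 * i] = b[8 + i];      /* left interleave weaves the high halves */
    out_l[2 * i + 1] = a[8 + i];
  }
}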
-
-/* Description : Saturate the halfword element values to the max
- unsigned value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range.
- The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
-}
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_UH2(RTYPE, in0, in1, sat_val); \
- SAT_UH2(RTYPE, in2, in3, sat_val) \
-}
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 signed value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range
- The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
- in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
-}
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
- SAT_SH2(RTYPE, in0, in1, sat_val); \
- SAT_SH2(RTYPE, in2, in3, sat_val); \
-}
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Indexed halfword element values are replicated to all
- elements in output vector
- Arguments : Inputs - in, idx0, idx1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : 'idx0' element value from 'in' vector is replicated to all
- elements in 'out0' vector
- Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
- out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
- out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
-}
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
- out0, out1, out2, out3) { \
- SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
- SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
-}
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' are copied to the left half of
- 'out0' & even byte elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
-}
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' are copied to the left half of
- 'out0' & even halfword elements of 'in1' are copied to the
- right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
-}
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double elements of 'in0' are copied to the left half of
- 'out0' & even double elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
-}
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
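
Pack-even is the usual way these kernels narrow double-width results back to pixels: keep every even-indexed element of each source and concatenate the two halves. A scalar sketch for bytes, following the operand placement stated in the PCKEV_B comment above (hypothetical helper):

#include <stdint.h>
#include <stddef.h>

/* Even bytes of 'b' fill the right (low) half of the result and even bytes
 * of 'a' fill the left (high) half, as described for PCKEV_B2(.., a, b, ..). */
static void pckev_b_scalar(const uint8_t a[16], const uint8_t b[16],
                           uint8_t out[16]) {
  size_t i;
  for (i = 0; i < 8; ++i) {
    out[i] = b[2 * i];
    out[8 + i] = a[2 * i];
  }
}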
-
-/* Description : Each byte element is logically xor'ed with immediate 128
- Arguments : Inputs - in0, in1
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned byte element from input vector 'in0' is
- logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1) { \
- in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
- in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
-}
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2) { \
- XORI_B2_128(RTYPE, in0, in1); \
- in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
-}
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
- XORI_B2_128(RTYPE, in0, in1); \
- XORI_B2_128(RTYPE, in2, in3); \
-}
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
- XORI_B4_128(RTYPE, in0, in1, in2, in3); \
- XORI_B3_128(RTYPE, in4, in5, in6); \
-}
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
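
A short note on why the XORI_B*_128 helpers exist (my reading, not stated in the header): XOR with 0x80 converts an unsigned pixel in [0, 255] to the two's-complement byte for pixel - 128, which lets unsigned pixel data be fed through the signed dot-product macros and converted back with the same operation:

#include <stdint.h>

/* pixel -> bias-shifted signed byte and back; both directions are the same
 * XOR. Two's-complement behaviour is assumed for the int8_t conversion. */
static int8_t pixel_to_signed(uint8_t px) { return (int8_t)(px ^ 0x80); }
static uint8_t signed_to_pixel(int8_t v) { return (uint8_t)(v ^ 0x80); }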
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- Details : Each signed halfword element from 'in0' is added to each
- signed halfword element of 'in1' with full precision resulting
- in one extra bit in the result. The result is then divided by
- 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
- out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
- out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
-}
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'in0' are added to signed
- halfword elements of 'in1'. The result is then signed saturated
- between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
-}
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is left shifted by 'shift' and
- the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 << shift; \
- in1 = in1 << shift; \
- in2 = in2 << shift; \
- in3 = in3 << shift; \
-}
-
-/* Description : Arithmetic shift right all elements of vector
- (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is right shifted by 'shift' and
- the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) { \
- in0 = in0 >> shift; \
- in1 = in1 >> shift; \
- in2 = in2 >> shift; \
- in3 = in3 >> shift; \
-}
-
-/* Description : Shift right arithmetic rounded words
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the number of bits in the corresponding element in the vector
- 'shift'. The last discarded bit is added to shifted value for
- rounding and the result is written in-place.
- 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
- in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
-}
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRAR_W2(RTYPE, in0, in1, shift) \
- SRAR_W2(RTYPE, in2, in3, shift) \
-}
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the value in 'shift'. The last discarded bit is added to the
- shifted value for rounding and the result is written in-place.
- 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
- in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
-}
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_H2(RTYPE, in0, in1, shift); \
- SRARI_H2(RTYPE, in2, in3, shift); \
-}
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift) { \
- in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
- in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
-}
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
-}
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
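
The "shift right arithmetic rounded" operations above follow the rounding rule in the comments: the last bit shifted out is added back. Per element that is equivalent to adding half of the divisor before the shift, as in this scalar sketch (hypothetical helper; the 64-bit intermediate only guards the rounding add against overflow):

#include <stdint.h>

static int32_t srari_w_scalar(int32_t x, int shift) {
  if (shift == 0) return x;  /* nothing is shifted out, so no rounding */
  return (int32_t)(((int64_t)x + ((int64_t)1 << (shift - 1))) >> shift);
}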
-
-/* Description : Logical shift right all elements of vector (immediate)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written to 'out0'. 'shift' is an immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
- out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
- out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
- out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
- out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
-}
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with elements from 'in1'
- and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
-}
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
-}
-
-/* Description : Addition of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in0' is added to 'in1' and result is written
- to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
-}
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
-}
-
-/* Description : Subtraction of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in1' is subtracted from 'in0' and result is
- written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
-}
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- out2 = in4 - in5; \
- out3 = in6 - in7; \
-}
-
-/* Description : Sign extend halfword elements from right half of the vector
- Arguments : Input - in (halfword vector)
- Output - out (sign extended word vector)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with the same vector 'in' to
-                 generate 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW(in, out) { \
- v8i16 sign_m; \
- \
- sign_m = __msa_clti_s_h((v8i16)in, 0); \
- out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
-}
-
-/* Description : Zero extend unsigned byte elements to halfword elements
- Arguments : Input - in (unsigned byte vector)
- Outputs - out0, out1 (unsigned halfword vectors)
- Return Type - signed halfword
- Details : Zero extended right half of vector is returned in 'out0'
- Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1) { \
- v16i8 zero_m = { 0 }; \
- \
- ILVRL_B2_SH(zero_m, in, out0, out1); \
-}
-
-/* Description : Sign extend halfword elements from input vector and return
- the result in pair of vectors
- Arguments : Input - in (halfword vector)
- Outputs - out0, out1 (sign extended word vectors)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with the same vector 'in' to
-                 generate 4 signed word elements in 'out0',
-                 then interleaved left with the same vector 'in' to
-                 generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1) { \
- v8i16 tmp_m; \
- \
- tmp_m = __msa_clti_s_h((v8i16)in, 0); \
- ILVRL_H2_SW(tmp_m, in, out0, out1); \
-}
-
-/* Description : Butterfly of 4 input vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Details : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
-}
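
Seen per lane, the butterfly macros are the standard add/subtract stage of the transforms: the outermost inputs pair with each other and the innermost inputs pair with each other. A scalar sketch of the 4-input case (hypothetical helper):

#include <stdint.h>

static void butterfly4_scalar(int32_t in0, int32_t in1, int32_t in2,
                              int32_t in3, int32_t out[4]) {
  out[0] = in0 + in3;  /* sums of the (outer, outer) and (inner, inner) pairs */
  out[1] = in1 + in2;
  out[2] = in1 - in2;  /* matching differences, in reverse order */
  out[3] = in0 - in3;
}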
-
-/* Description : Butterfly of 8 input vectors
- Arguments : Inputs - in0 ... in7
- Outputs - out0 .. out7
- Details : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- out0 = in0 + in7; \
- out1 = in1 + in6; \
- out2 = in2 + in5; \
- out3 = in3 + in4; \
- \
- out4 = in3 - in4; \
- out5 = in2 - in5; \
- out6 = in1 - in6; \
- out7 = in0 - in7; \
-}
-
-/* Description : Butterfly of 16 input vectors
- Arguments : Inputs - in0 ... in15
- Outputs - out0 .. out15
- Details : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- out8, out9, out10, out11, out12, out13, out14, out15) { \
- out0 = in0 + in15; \
- out1 = in1 + in14; \
- out2 = in2 + in13; \
- out3 = in3 + in12; \
- out4 = in4 + in11; \
- out5 = in5 + in10; \
- out6 = in6 + in9; \
- out7 = in7 + in8; \
- \
- out8 = in7 - in8; \
- out9 = in6 - in9; \
- out10 = in5 - in10; \
- out11 = in4 - in11; \
- out12 = in3 - in12; \
- out13 = in2 - in13; \
- out14 = in1 - in14; \
- out15 = in0 - in15; \
-}
-
-/* Description : Transpose input 8x8 byte block
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
- tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
- ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
- ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
- ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
- ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
- SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
- SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
-}
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
- ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
- ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
- ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
- \
- tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
- tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
- tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
- tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
- out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
- tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
- out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
- tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
- \
- ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
- out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
- out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
- out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
- out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
-}
-
-/* Description : Transpose 4x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 s0_m, s1_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
- ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
-}
-
-/* Description : Transpose 4x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
- v8i16 zero_m = { 0 }; \
- \
- ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
- tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
- ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
- ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
- \
- out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- \
- out4 = zero_m; \
- out5 = zero_m; \
- out6 = zero_m; \
- out7 = zero_m; \
-}
-
-/* Description : Transpose 8x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
- ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
- ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
-}
-
-/* Description : Transpose 8x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7) { \
- v8i16 s0_m, s1_m; \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
- ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
- ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
- PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
- tmp3_m, tmp7_m, out0, out2, out4, out6); \
- out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
- out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
- out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
- out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
-}
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
- out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
- out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
- out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
-}
-
-/* Description : Add block 4x4
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Least significant 4 bytes from each input vector are added to
- the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
- uint32_t src0_m, src1_m, src2_m, src3_m; \
- v8i16 inp0_m, inp1_m, res0_m, res1_m; \
- v16i8 dst0_m = { 0 }; \
- v16i8 dst1_m = { 0 }; \
- v16i8 zero_m = { 0 }; \
- \
- ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
- LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
- INSERT_W2_SB(src0_m, src1_m, dst0_m); \
- INSERT_W2_SB(src2_m, src3_m, dst1_m); \
- ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
- ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
- CLIP_SH2_0_255(res0_m, res1_m); \
- PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
- ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
-}
-
-/* Description : Pack even elements of input vectors & xor with 128
- Arguments : Inputs - in0, in1
- Output - out_m
- Return Type - unsigned byte
- Details : Signed byte even elements from 'in0' and 'in1' are packed
- together in one vector and the resulting vector is xor'ed with
- 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1) ({ \
- v16u8 out_m; \
- \
- out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
- out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
- out_m; \
-})
-
-/* Description : Converts inputs to unsigned bytes, interleave, average & store
- as 8x4 unsigned byte block
- Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
- pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
- dst0, dst1, dst2, dst3, pdst, stride) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
-}
-
-/* Description : Pack even byte elements and store byte vector in destination
- memory
- Arguments : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst) { \
- v16i8 tmp_m; \
- \
- tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
- ST_SB(tmp_m, (pdst)); \
-}
-
-/* Description : Horizontal 2 tap filter kernel code
- Arguments : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
- v16i8 tmp0_m; \
- v8u16 tmp1_m; \
- \
- tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
- tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
- tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
- tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
- \
- tmp1_m; \
-})
-#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
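Note: the MSA macros deleted above are not lost; equivalent definitions now live in vpx_dsp/mips/macros_msa.h, which the hunks below include in their place. For orientation, a scalar sketch of what two of the relocated macros compute per lane (illustrative only; butterfly_8 and horiz_2tap_px are hypothetical names, and the 255 clamp assumes FILTER_BITS == 7 as used by the callers):

    /* Scalar model of BUTTERFLY_8: pairwise sums in the first half of the
     * outputs, mirrored differences in the second half. */
    static void butterfly_8(const int in[8], int out[8]) {
      int i;
      for (i = 0; i < 4; ++i) {
        out[i]     = in[i] + in[7 - i];
        out[7 - i] = in[i] - in[7 - i];
      }
    }

    /* Scalar model of HORIZ_2TAP_FILT_UH: 2-tap dot product, rounding right
     * shift by FILTER_BITS, then unsigned saturation of the result. */
    static unsigned int horiz_2tap_px(unsigned int a, unsigned int b,
                                      unsigned int f0, unsigned int f1) {
      const unsigned int v = (a * f0 + b * f1 + (1u << 6)) >> 7;
      return v > 255 ? 255 : v;
    }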
--- a/vp9/common/mips/msa/vp9_mfqe_msa.c
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -10,7 +10,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -802,88 +802,6 @@
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-
-# variance
-add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
-
-# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
-add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
-#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
@@ -1084,241 +1002,6 @@
specialize qw/vp9_temporal_filter_apply sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
-
# ENCODEMB INVOKE
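With these prototypes removed from vp9_rtcd_defs.pl, the encoder is expected to reach the sub-pixel variance kernels through the vpx_dsp RTCD layer instead. A minimal sketch of the implied call-site change, assuming the vpx_sub_pixel_variance16x16 entry point dispatched by vpx_dsp_rtcd (the wrapper name below is hypothetical; the argument list is the same as the deleted vp9_ prototype):

    #include "./vpx_dsp_rtcd.h"
    #include "vpx/vpx_integer.h"

    /* Hypothetical helper showing the rename only; no behavior changes. */
    static unsigned int subpel_var_16x16(const uint8_t *src, int src_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *ref, int ref_stride) {
      unsigned int sse;
      /* was: vp9_sub_pixel_variance16x16(...) */
      return vpx_sub_pixel_variance16x16(src, src_stride, xoffset, yoffset,
                                         ref, ref_stride, &sse);
    }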
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_filter.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
- const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
- }
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
- 9, 8,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
- 8, bilinear_filters[yoffset]);
- return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 17, 16,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
- 16, bilinear_filters[yoffset]);
- return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 33, 32,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
- 32, bilinear_filters[yoffset]);
- return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride,
- int xoffset,
- int yoffset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
- 65, 64,
- bilinear_filters[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
- 64, bilinear_filters[yoffset]);
- return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ b/vp9/encoder/mips/msa/vp9_avg_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
uint32_t sum_out;
--- a/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \
--- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
uint32_t stride,
--- a/vp9/encoder/mips/msa/vp9_variance_msa.c
+++ /dev/null
@@ -1,768 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
-}
-
-#define VARIANCE_WxH(sse, diff, shift) \
- sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- CALC_MSE_AVG_B(src0, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 filt0, out, ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3, const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- vec0, vec1, vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
- src0, src1, src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- out0, out1, out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
- out4, out5, out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
- src0, src1, src2, src3);
- CALC_MSE_AVG_B(src0, dst0, var, avg);
- CALC_MSE_AVG_B(src1, dst1, var, avg);
- CALC_MSE_AVG_B(src2, dst2, var, avg);
- CALC_MSE_AVG_B(src3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4, out;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 ref = { 0 };
- v16u8 src2110, src4332;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
- src10_r, src21_r, src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
- vec0, vec1, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
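
The 32- and 64-wide wrappers reuse the 16-wide kernel column by column, summing the per-column SSE and difference totals. A generalized sketch of that split; the kernel typedef is simplified (the real kernels also take the filter pointer) and the names are hypothetical:

#include <stdint.h>

typedef uint32_t (*col16_fn)(const uint8_t *src, int src_stride,
                             const uint8_t *dst, int dst_stride,
                             int height, int32_t *diff);

static uint32_t wide_sse_diff(const uint8_t *src, int src_stride,
                              const uint8_t *dst, int dst_stride,
                              int ncols16, int height, int32_t *diff,
                              col16_fn kernel) {
  uint32_t sse = 0;
  int32_t sum = 0;
  int i;
  for (i = 0; i < ncols16; ++i) {
    int32_t col_diff;
    sse += kernel(src + 16 * i, src_stride, dst + 16 * i, dst_stride,
                  height, &col_diff);
    sum += col_diff;  /* per-column sums are combined, as in diff0[] above */
  }
  *diff = sum;
  return sse;
}
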
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out, ref = { 0 };
- v16u8 filt_vt, filt_hz, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt_vt, filt_hz, vec0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- CALC_MSE_AVG_B(src2, ref2, var, avg);
- CALC_MSE_AVG_B(src3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
- int32_t src_stride,
- const uint8_t *dst,
- int32_t dst_stride,
- const uint8_t *filter_horiz,
- const uint8_t *filter_vert,
- int32_t height,
- int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
-uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
- int32_t src_stride, \
- int32_t xoffset, \
- int32_t yoffset, \
- const uint8_t *ref, \
- int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const uint8_t *h_filter = bilinear_filters[xoffset]; \
- const uint8_t *v_filter = bilinear_filters[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, v_filter, \
- ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
- ref, ref_stride, \
- v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
- ref, ref_stride, \
- h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
- ref, ref_stride, sse); \
- } \
- } \
- \
- return var; \
-}
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
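
The VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA wrapper dispatches on the offsets: both zero falls through to vpx_variance##wd##x##ht##_msa, a single non-zero offset picks the h- or v-only kernel, and both non-zero picks the hv kernel; the VARIANCE_*Wx*H macros then fold the kernel's (sse, diff) pair into a variance. Assuming their third argument is log2(width * height), the fold is the usual var = sse - diff^2 / N, sketched here with a hypothetical helper:

#include <stdint.h>

/* shift == log2(width * height); e.g. 8 for 16x16, 12 for 64x64. */
static uint32_t variance_from_sse_diff(uint32_t sse, int32_t diff, int shift) {
  /* 64-bit product: for 64x64 blocks diff can reach 64 * 64 * 255. */
  return (uint32_t)(sse - (uint32_t)(((int64_t)diff * diff) >> shift));
}
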
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1023,8 +1023,8 @@
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vpx_highbd_8_variance32x16,
- vp9_highbd_sub_pixel_variance32x16,
- vp9_highbd_sub_pixel_avg_variance32x16,
+ vpx_highbd_8_sub_pixel_variance32x16,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits8)
@@ -1033,8 +1033,8 @@
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vpx_highbd_8_variance16x32,
- vp9_highbd_sub_pixel_variance16x32,
- vp9_highbd_sub_pixel_avg_variance16x32,
+ vpx_highbd_8_sub_pixel_variance16x32,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits8)
@@ -1043,8 +1043,8 @@
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vpx_highbd_8_variance64x32,
- vp9_highbd_sub_pixel_variance64x32,
- vp9_highbd_sub_pixel_avg_variance64x32,
+ vpx_highbd_8_sub_pixel_variance64x32,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits8)
@@ -1053,8 +1053,8 @@
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vpx_highbd_8_variance32x64,
- vp9_highbd_sub_pixel_variance32x64,
- vp9_highbd_sub_pixel_avg_variance32x64,
+ vpx_highbd_8_sub_pixel_variance32x64,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits8)
@@ -1063,8 +1063,8 @@
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vpx_highbd_8_variance32x32,
- vp9_highbd_sub_pixel_variance32x32,
- vp9_highbd_sub_pixel_avg_variance32x32,
+ vpx_highbd_8_sub_pixel_variance32x32,
+ vpx_highbd_8_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
@@ -1073,8 +1073,8 @@
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vpx_highbd_8_variance64x64,
- vp9_highbd_sub_pixel_variance64x64,
- vp9_highbd_sub_pixel_avg_variance64x64,
+ vpx_highbd_8_sub_pixel_variance64x64,
+ vpx_highbd_8_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
@@ -1083,8 +1083,8 @@
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vpx_highbd_8_variance16x16,
- vp9_highbd_sub_pixel_variance16x16,
- vp9_highbd_sub_pixel_avg_variance16x16,
+ vpx_highbd_8_sub_pixel_variance16x16,
+ vpx_highbd_8_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
@@ -1093,8 +1093,8 @@
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vpx_highbd_8_variance16x8,
- vp9_highbd_sub_pixel_variance16x8,
- vp9_highbd_sub_pixel_avg_variance16x8,
+ vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
@@ -1103,8 +1103,8 @@
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vpx_highbd_8_variance8x16,
- vp9_highbd_sub_pixel_variance8x16,
- vp9_highbd_sub_pixel_avg_variance8x16,
+ vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
@@ -1113,8 +1113,8 @@
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8,
- vp9_highbd_sub_pixel_variance8x8,
- vp9_highbd_sub_pixel_avg_variance8x8,
+ vpx_highbd_8_sub_pixel_variance8x8,
+ vpx_highbd_8_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
@@ -1123,8 +1123,8 @@
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vpx_highbd_8_variance8x4,
- vp9_highbd_sub_pixel_variance8x4,
- vp9_highbd_sub_pixel_avg_variance8x4,
+ vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
@@ -1133,8 +1133,8 @@
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vpx_highbd_8_variance4x8,
- vp9_highbd_sub_pixel_variance4x8,
- vp9_highbd_sub_pixel_avg_variance4x8,
+ vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
@@ -1143,8 +1143,8 @@
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4,
- vp9_highbd_sub_pixel_variance4x4,
- vp9_highbd_sub_pixel_avg_variance4x4,
+ vpx_highbd_8_sub_pixel_variance4x4,
+ vpx_highbd_8_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
@@ -1155,8 +1155,8 @@
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vpx_highbd_10_variance32x16,
- vp9_highbd_10_sub_pixel_variance32x16,
- vp9_highbd_10_sub_pixel_avg_variance32x16,
+ vpx_highbd_10_sub_pixel_variance32x16,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits10)
@@ -1165,8 +1165,8 @@
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vpx_highbd_10_variance16x32,
- vp9_highbd_10_sub_pixel_variance16x32,
- vp9_highbd_10_sub_pixel_avg_variance16x32,
+ vpx_highbd_10_sub_pixel_variance16x32,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits10)
@@ -1175,8 +1175,8 @@
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vpx_highbd_10_variance64x32,
- vp9_highbd_10_sub_pixel_variance64x32,
- vp9_highbd_10_sub_pixel_avg_variance64x32,
+ vpx_highbd_10_sub_pixel_variance64x32,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits10)
@@ -1185,8 +1185,8 @@
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vpx_highbd_10_variance32x64,
- vp9_highbd_10_sub_pixel_variance32x64,
- vp9_highbd_10_sub_pixel_avg_variance32x64,
+ vpx_highbd_10_sub_pixel_variance32x64,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits10)
@@ -1195,8 +1195,8 @@
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vpx_highbd_10_variance32x32,
- vp9_highbd_10_sub_pixel_variance32x32,
- vp9_highbd_10_sub_pixel_avg_variance32x32,
+ vpx_highbd_10_sub_pixel_variance32x32,
+ vpx_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
@@ -1205,8 +1205,8 @@
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vpx_highbd_10_variance64x64,
- vp9_highbd_10_sub_pixel_variance64x64,
- vp9_highbd_10_sub_pixel_avg_variance64x64,
+ vpx_highbd_10_sub_pixel_variance64x64,
+ vpx_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
@@ -1215,8 +1215,8 @@
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vpx_highbd_10_variance16x16,
- vp9_highbd_10_sub_pixel_variance16x16,
- vp9_highbd_10_sub_pixel_avg_variance16x16,
+ vpx_highbd_10_sub_pixel_variance16x16,
+ vpx_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
@@ -1225,8 +1225,8 @@
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vpx_highbd_10_variance16x8,
- vp9_highbd_10_sub_pixel_variance16x8,
- vp9_highbd_10_sub_pixel_avg_variance16x8,
+ vpx_highbd_10_sub_pixel_variance16x8,
+ vpx_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
@@ -1235,8 +1235,8 @@
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vpx_highbd_10_variance8x16,
- vp9_highbd_10_sub_pixel_variance8x16,
- vp9_highbd_10_sub_pixel_avg_variance8x16,
+ vpx_highbd_10_sub_pixel_variance8x16,
+ vpx_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
@@ -1245,8 +1245,8 @@
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vpx_highbd_10_variance8x8,
- vp9_highbd_10_sub_pixel_variance8x8,
- vp9_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
@@ -1255,8 +1255,8 @@
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vpx_highbd_10_variance8x4,
- vp9_highbd_10_sub_pixel_variance8x4,
- vp9_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_10_sub_pixel_variance8x4,
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
@@ -1265,8 +1265,8 @@
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vpx_highbd_10_variance4x8,
- vp9_highbd_10_sub_pixel_variance4x8,
- vp9_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_10_sub_pixel_variance4x8,
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
@@ -1275,8 +1275,8 @@
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vpx_highbd_10_variance4x4,
- vp9_highbd_10_sub_pixel_variance4x4,
- vp9_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
@@ -1287,8 +1287,8 @@
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vpx_highbd_12_variance32x16,
- vp9_highbd_12_sub_pixel_variance32x16,
- vp9_highbd_12_sub_pixel_avg_variance32x16,
+ vpx_highbd_12_sub_pixel_variance32x16,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits12)
@@ -1297,8 +1297,8 @@
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vpx_highbd_12_variance16x32,
- vp9_highbd_12_sub_pixel_variance16x32,
- vp9_highbd_12_sub_pixel_avg_variance16x32,
+ vpx_highbd_12_sub_pixel_variance16x32,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits12)
@@ -1307,8 +1307,8 @@
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vpx_highbd_12_variance64x32,
- vp9_highbd_12_sub_pixel_variance64x32,
- vp9_highbd_12_sub_pixel_avg_variance64x32,
+ vpx_highbd_12_sub_pixel_variance64x32,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits12)
@@ -1317,8 +1317,8 @@
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vpx_highbd_12_variance32x64,
- vp9_highbd_12_sub_pixel_variance32x64,
- vp9_highbd_12_sub_pixel_avg_variance32x64,
+ vpx_highbd_12_sub_pixel_variance32x64,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits12)
@@ -1327,8 +1327,8 @@
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vpx_highbd_12_variance32x32,
- vp9_highbd_12_sub_pixel_variance32x32,
- vp9_highbd_12_sub_pixel_avg_variance32x32,
+ vpx_highbd_12_sub_pixel_variance32x32,
+ vpx_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
@@ -1337,8 +1337,8 @@
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vpx_highbd_12_variance64x64,
- vp9_highbd_12_sub_pixel_variance64x64,
- vp9_highbd_12_sub_pixel_avg_variance64x64,
+ vpx_highbd_12_sub_pixel_variance64x64,
+ vpx_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
@@ -1347,8 +1347,8 @@
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vpx_highbd_12_variance16x16,
- vp9_highbd_12_sub_pixel_variance16x16,
- vp9_highbd_12_sub_pixel_avg_variance16x16,
+ vpx_highbd_12_sub_pixel_variance16x16,
+ vpx_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
@@ -1357,8 +1357,8 @@
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vpx_highbd_12_variance16x8,
- vp9_highbd_12_sub_pixel_variance16x8,
- vp9_highbd_12_sub_pixel_avg_variance16x8,
+ vpx_highbd_12_sub_pixel_variance16x8,
+ vpx_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
@@ -1367,8 +1367,8 @@
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vpx_highbd_12_variance8x16,
- vp9_highbd_12_sub_pixel_variance8x16,
- vp9_highbd_12_sub_pixel_avg_variance8x16,
+ vpx_highbd_12_sub_pixel_variance8x16,
+ vpx_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
@@ -1377,8 +1377,8 @@
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vpx_highbd_12_variance8x8,
- vp9_highbd_12_sub_pixel_variance8x8,
- vp9_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
@@ -1387,8 +1387,8 @@
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vpx_highbd_12_variance8x4,
- vp9_highbd_12_sub_pixel_variance8x4,
- vp9_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_12_sub_pixel_variance8x4,
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
@@ -1397,8 +1397,8 @@
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vpx_highbd_12_variance4x8,
- vp9_highbd_12_sub_pixel_variance4x8,
- vp9_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_12_sub_pixel_variance4x8,
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
@@ -1407,8 +1407,8 @@
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vpx_highbd_12_variance4x4,
- vp9_highbd_12_sub_pixel_variance4x4,
- vp9_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
@@ -1832,62 +1832,62 @@
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
- vpx_variance32x16, vp9_sub_pixel_variance32x16,
- vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
+ vpx_variance32x16, vpx_sub_pixel_variance32x16,
+ vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
- vpx_variance16x32, vp9_sub_pixel_variance16x32,
- vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
+ vpx_variance16x32, vpx_sub_pixel_variance16x32,
+ vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
- vpx_variance64x32, vp9_sub_pixel_variance64x32,
- vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
+ vpx_variance64x32, vpx_sub_pixel_variance64x32,
+ vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
- vpx_variance32x64, vp9_sub_pixel_variance32x64,
- vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
+ vpx_variance32x64, vpx_sub_pixel_variance32x64,
+ vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
- vpx_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+ vpx_variance32x32, vpx_sub_pixel_variance32x32,
+ vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
- vpx_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+ vpx_variance64x64, vpx_sub_pixel_variance64x64,
+ vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
- vpx_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+ vpx_variance16x16, vpx_sub_pixel_variance16x16,
+ vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
- vpx_variance16x8, vp9_sub_pixel_variance16x8,
- vp9_sub_pixel_avg_variance16x8,
+ vpx_variance16x8, vpx_sub_pixel_variance16x8,
+ vpx_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
- vpx_variance8x16, vp9_sub_pixel_variance8x16,
- vp9_sub_pixel_avg_variance8x16,
+ vpx_variance8x16, vpx_sub_pixel_variance8x16,
+ vpx_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
- vpx_variance8x8, vp9_sub_pixel_variance8x8,
- vp9_sub_pixel_avg_variance8x8,
+ vpx_variance8x8, vpx_sub_pixel_variance8x8,
+ vpx_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
- vpx_variance8x4, vp9_sub_pixel_variance8x4,
- vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
+ vpx_variance8x4, vpx_sub_pixel_variance8x4,
+ vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
- vpx_variance4x8, vp9_sub_pixel_variance4x8,
- vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
+ vpx_variance4x8, vpx_sub_pixel_variance4x8,
+ vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
- vpx_variance4x4, vp9_sub_pixel_variance4x4,
- vp9_sub_pixel_avg_variance4x4,
+ vpx_variance4x4, vpx_sub_pixel_variance4x4,
+ vpx_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
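
The vp9_encoder.c hunks only repoint the per-block-size function tables (the BFP entries and their bit-depth-specific highbd counterparts) from the vp9_-prefixed sub-pixel variance routines to the vpx_dsp ones; the table layout and every caller stay untouched. A toy illustration of why the swap is transparent at the call sites; the struct and helper names here are hypothetical:

#include <stdint.h>

typedef unsigned int (*subpix_var_fn)(const uint8_t *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);

struct toy_fn_ptrs {
  subpix_var_fn svf;  /* sub-pixel variance for one block size */
};

/* Callers only see the pointer, so pointing svf at
 * vpx_sub_pixel_variance16x16 instead of the old vp9_ version changes
 * nothing here. */
static unsigned int toy_subpel_var(const struct toy_fn_ptrs *fp,
                                   const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int xo, int yo, unsigned int *sse) {
  return fp->svf(src, src_stride, xo, yo, ref, ref_stride, sse);
}
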
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -40,7 +40,7 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -35,7 +35,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@@ -298,7 +298,7 @@
}
}
-static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vpx_mse8x8;
@@ -315,13 +315,13 @@
const struct buf_2d *src,
const struct buf_2d *ref) {
unsigned int sse;
- const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
+ const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
-static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
int bd) {
switch (bd) {
default:
@@ -368,7 +368,7 @@
const struct buf_2d *ref,
int bd) {
unsigned int sse;
- const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
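
In vp9_firstpass.c only the typedef changes: get_block_variance_fn() still returns one of the vpx_mse* routines and the caller in the -315 hunk still reads back the filled-in sse as the block's raw prediction error. A small usage sketch against the vpx_variance_fn_t signature seen above; the include lines assume the header layout after this change:

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/variance.h"

/* Raw 8x8 prediction error via the function-pointer type. */
static unsigned int block8x8_sse(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  const vpx_variance_fn_t fn = vpx_mse8x8;
  fn(src, src_stride, ref, ref_stride, &sse);
  return sse;
}
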
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -13,7 +13,7 @@
#define VP9_ENCODER_VP9_MCOMP_H_
#include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -37,7 +37,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
#define RD_THRESH_POW 1.25
#define RD_MULT_EPB_RATIO 64
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -39,7 +39,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
--- a/vp9/encoder/vp9_variance.c
+++ /dev/null
@@ -1,380 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0, },
- { 112, 16, },
- { 96, 32, },
- { 80, 48, },
- { 64, 64, },
- { 48, 80, },
- { 32, 96, },
- { 16, 112, },
-};
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// first-pass of 2-D separable filter.
-//
-// Produces int32_t output to retain precision for next pass. Two filter taps
-// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
-// defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// second-pass of 2-D separable filter.
-//
-// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
-// stride). It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#define SUBPIX_VAR(W, H) \
-unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
-\
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
-}
-
-#define SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
-\
- var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
-\
- return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
-}
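
The only thing SUBPIX_AVG_VAR adds over SUBPIX_VAR is the vpx_comp_avg_pred() step: the bilinearly filtered block is rounded-averaged with a second predictor before the variance is taken. A scalar sketch of that averaging, with a hypothetical name and flattened buffers:

#include <stdint.h>

/* comp[i] = round((pred[i] + second_pred[i]) / 2) over a W*H block. */
static void sk_comp_avg(uint8_t *comp, const uint8_t *pred,
                        const uint8_t *second_pred, int n) {
  int i;
  for (i = 0; i < n; ++i)
    comp[i] = (uint8_t)((pred[i] + second_pred[i] + 1) >> 1);
}
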
-
-SUBPIX_VAR(4, 4)
-SUBPIX_AVG_VAR(4, 4)
-
-SUBPIX_VAR(4, 8)
-SUBPIX_AVG_VAR(4, 8)
-
-SUBPIX_VAR(8, 4)
-SUBPIX_AVG_VAR(8, 4)
-
-SUBPIX_VAR(8, 8)
-SUBPIX_AVG_VAR(8, 8)
-
-SUBPIX_VAR(8, 16)
-SUBPIX_AVG_VAR(8, 16)
-
-SUBPIX_VAR(16, 8)
-SUBPIX_AVG_VAR(16, 8)
-
-SUBPIX_VAR(16, 16)
-SUBPIX_AVG_VAR(16, 16)
-
-SUBPIX_VAR(16, 32)
-SUBPIX_AVG_VAR(16, 32)
-
-SUBPIX_VAR(32, 16)
-SUBPIX_AVG_VAR(32, 16)
-
-SUBPIX_VAR(32, 32)
-SUBPIX_AVG_VAR(32, 32)
-
-SUBPIX_VAR(32, 64)
-SUBPIX_AVG_VAR(32, 64)
-
-SUBPIX_VAR(64, 32)
-SUBPIX_AVG_VAR(64, 32)
-
-SUBPIX_VAR(64, 64)
-SUBPIX_AVG_VAR(64, 64)
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr8,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
-
- src_ptr++;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-static void highbd_var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr,
- uint16_t *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] =
- ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
- (int)src_ptr[pixel_step] * vp9_filter[1],
- FILTER_BITS);
- src_ptr++;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#define HIGHBD_SUBPIX_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
- dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, dst, dst_stride, sse); \
-}
-
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, \
- int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, \
- unsigned int *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
- highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters[xoffset]); \
- highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
-\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
-\
- return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
- W, dst, dst_stride, sse); \
-}
-
-HIGHBD_SUBPIX_VAR(4, 4)
-HIGHBD_SUBPIX_AVG_VAR(4, 4)
-
-HIGHBD_SUBPIX_VAR(4, 8)
-HIGHBD_SUBPIX_AVG_VAR(4, 8)
-
-HIGHBD_SUBPIX_VAR(8, 4)
-HIGHBD_SUBPIX_AVG_VAR(8, 4)
-
-HIGHBD_SUBPIX_VAR(8, 8)
-HIGHBD_SUBPIX_AVG_VAR(8, 8)
-
-HIGHBD_SUBPIX_VAR(8, 16)
-HIGHBD_SUBPIX_AVG_VAR(8, 16)
-
-HIGHBD_SUBPIX_VAR(16, 8)
-HIGHBD_SUBPIX_AVG_VAR(16, 8)
-
-HIGHBD_SUBPIX_VAR(16, 16)
-HIGHBD_SUBPIX_AVG_VAR(16, 16)
-
-HIGHBD_SUBPIX_VAR(16, 32)
-HIGHBD_SUBPIX_AVG_VAR(16, 32)
-
-HIGHBD_SUBPIX_VAR(32, 16)
-HIGHBD_SUBPIX_AVG_VAR(32, 16)
-
-HIGHBD_SUBPIX_VAR(32, 32)
-HIGHBD_SUBPIX_AVG_VAR(32, 32)
-
-HIGHBD_SUBPIX_VAR(32, 64)
-HIGHBD_SUBPIX_AVG_VAR(32, 64)
-
-HIGHBD_SUBPIX_VAR(64, 32)
-HIGHBD_SUBPIX_AVG_VAR(64, 32)
-
-HIGHBD_SUBPIX_VAR(64, 64)
-HIGHBD_SUBPIX_AVG_VAR(64, 64)
-#endif // CONFIG_VP9_HIGHBITDEPTH
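
The file removed above is the plain-C reference for sub-pixel variance: a horizontal 2-tap pass into a (H + 1) x W uint16_t buffer, a vertical 2-tap pass back to 8-bit, then ordinary variance against the reference block (the equivalent vpx_sub_pixel_variance*_c code now lives in vpx_dsp, and the highbd variants follow the same shape on 16-bit pixels). A self-contained scalar sketch for one block size; every sk_-prefixed name is illustrative, and the rounding shift of 7 assumes the taps sum to 128 as in the bilinear_filters table above:

#include <stdint.h>

static const uint8_t sk_bilinear[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

static unsigned int sk_subpel_variance8x8(const uint8_t *src, int src_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse) {
  uint16_t mid[9 * 8];  /* (H + 1) x W first-pass output */
  uint8_t pred[8 * 8];
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;

  /* First pass: horizontal 2-tap filter, one extra row for the second pass. */
  for (r = 0; r < 9; ++r)
    for (c = 0; c < 8; ++c)
      mid[r * 8 + c] =
          (uint16_t)((src[r * src_stride + c] * sk_bilinear[xoffset][0] +
                      src[r * src_stride + c + 1] * sk_bilinear[xoffset][1] +
                      64) >> 7);

  /* Second pass: vertical 2-tap filter down to the 8x8 prediction. */
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      pred[r * 8 + c] =
          (uint8_t)((mid[r * 8 + c] * sk_bilinear[yoffset][0] +
                     mid[(r + 1) * 8 + c] * sk_bilinear[yoffset][1] +
                     64) >> 7);

  /* Variance of the filtered prediction against the reference block. */
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) {
      const int d = pred[r * 8 + c] - ref[r * ref_stride + c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) >> 6));  /* 6 = log2(64) */
}
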
--- a/vp9/encoder/vp9_variance.h
+++ /dev/null
@@ -1,81 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_VARIANCE_H_
-#define VP9_ENCODER_VP9_VARIANCE_H_
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride);
-
-typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- const uint8_t *second_pred);
-
-typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t* const ref_ptr[],
- int ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const uint8_t *ref_ptr,
- int Refstride,
- unsigned int *sse);
-
-typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const uint8_t *ref_ptr,
- int Refstride,
- unsigned int *sse,
- const uint8_t *second_pred);
-
-typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_sad_avg_fn_t sdaf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_subp_avg_variance_fn_t svaf;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
-} vp9_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP9_ENCODER_VP9_VARIANCE_H_
--- a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
+++ /dev/null
@@ -1,1039 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-SECTION .text
-
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- mova %4, %3 ; make copies to manipulate to calc sum
- mova %2, %1 ; use originals for calc sse
- pmaddwd %3, %3
- paddw %4, %2
- pmaddwd %1, %1
- movhlps %2, %4
- paddd %6, %3
- paddw %4, %2
- pxor %2, %2
- pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
- punpcklwd %4, %2 ; sign-extend word to dword
- paddd %6, %1
- paddd %5, %4
-
-%endmacro
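
SUM_SSE consumes two register pairs of 16-bit differences per invocation: the squared terms accumulate into the SSE register (sixth argument, m7 at the call sites) and the word sums are sign-extended to dwords before accumulating into the sum register (fifth argument, m6). The same bookkeeping in scalar form, with a hypothetical helper name and the 16-bit samples this high-bit-depth file operates on:

#include <stdint.h>

static void sum_sse_accumulate(const uint16_t *src, const uint16_t *dst,
                               int n, int32_t *sum, uint32_t *sse) {
  int i;
  for (i = 0; i < n; ++i) {
    const int d = (int)src[i] - (int)dst[i];
    *sum += d;                  /* signed sum of differences (m6) */
    *sse += (uint32_t)(d * d);  /* sum of squared differences (m7) */
  }
}
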
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
- ; and outputing to a dword.
- movhlps m3, m7
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- pshufd m4, m6, 0x1
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- movd [r1], m7 ; store sse
- movd rax, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*2]
-%else
- lea srcq, [srcq + src_strideq*2]
-%endif
-%endmacro
-
-%macro INC_SRC_BY_SRC_2STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- lea srcq, [srcq + src_stridemp*4]
-%else
- lea srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-
-
-%ifdef PIC ; 64bit PIC
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %endif
- %define h heightd
- %define bilin_filter sseq
-%else
- %if ARCH_X86=1 && CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse, g_bilin_filter, g_pw_8
- %define h dword heightm
- %define sec_str sec_stridemp
-
- ; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, height, \
- sse, g_bilin_filter, g_pw_8
- %define h heightd
-
- ; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
- %if ARCH_X86_64
- %define h heightd
- %define sec_str sec_strideq
- %else
- %define h dword heightm
- %define sec_str sec_stridemp
- %endif
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, height, sse
- %define h heightd
- %endif
-
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
-
-%if %1 < 16
- sar h, 1
-%endif
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq + 16]
- mova m1, [dstq]
- mova m3, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m2, [secq+16]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq + src_strideq*2]
- mova m1, [dstq]
- mova m3, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_zero_loop
- STORE_AND_RET
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 8
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pavgw m0, m1
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_half_loop
- STORE_AND_RET
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
- ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m4, m1
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonzero:
- cmp x_offsetd, 8
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- mova m2, [dstq]
- mova m3, [dstq + 16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- mova m2, [dstq]
- mova m3, [dstq + dst_strideq*2]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_zero_loop
- STORE_AND_RET
-
-.x_half_y_nonzero:
- cmp y_offsetd, 8
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m1, m3
- mova m4, [dstq]
- mova m5, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m2, m3
- mova m4, [dstq]
- mova m5, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_half_loop
- STORE_AND_RET
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m1, filter_rnd
- paddw m1, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m1, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m4, filter_rnd
- paddw m4, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m4, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- mova m4, [dstq]
- mova m5, [dstq+16]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m2, [srcq+2]
- movu m3, [srcq+src_strideq*2+2]
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 8
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+16]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m1, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m2, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-; loading the filter - this is the same as in the 8-bit depth version
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [bilin_filter+y_offsetq]
- mova m11, [bilin_filter+y_offsetq+16]
- mova m12, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register, so the src_stride register is used.
-; Later, src_stride has to be reloaded from the stack when it is needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-; end of load filter
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- movu m1, [srcq+16]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- movu m3, [srcq+16]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m1, filter_rnd
- mova m2, [dstq]
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq + dst_strideq * 2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- movu m3, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m4, filter_rnd
- mova m2, [dstq]
- paddw m4, m3
- psrlw m0, 4
- psrlw m4, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- INC_SRC_BY_SRC_2STRIDE
- lea dstq, [dstq + dst_strideq * 4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec h
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-%endmacro
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
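
The high-bit-depth SUBPEL_VARIANCE kernels removed above differ from the plain variant only in the avg path: the bilinear prediction is averaged with a second predictor via pavgw (which rounds up) before SUM_SSE accumulates the error terms. A minimal scalar sketch of that step for 16-bit pixels, with hypothetical helper and buffer names (the real kernels work on whole SIMD rows):

    #include <stdint.h>

    /* Sketch only: pavgw plus SUM_SSE for one row of 16-bit pixels. */
    static void highbd_avg_row_sum_sse(const uint16_t *pred,
                                       const uint16_t *second_pred,
                                       const uint16_t *dst, int width,
                                       int64_t *sum, uint64_t *sse) {
      int i;
      for (i = 0; i < width; ++i) {
        const int comp = (pred[i] + second_pred[i] + 1) >> 1;  /* pavgw */
        const int diff = comp - dst[i];
        *sum += diff;
        *sse += (int64_t)diff * diff;
      }
    }
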
--- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c
+++ /dev/null
@@ -1,349 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse);
-#define DECLS(opt1, opt2) \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-// DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst8, \
- int dst_stride, \
- uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, h, \
- &sse); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
- src_stride, \
- x_offset, y_offset, \
- dst + 16, \
- dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
- dst_stride, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- }\
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-
-FNS(sse2, sse);
-
-#undef FNS
-#undef FN
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint16_t *dst, \
- ptrdiff_t dst_stride, \
- const uint16_t *sec, \
- ptrdiff_t sec_stride, \
- int height, \
- unsigned int *sse);
-#define DECLS(opt1) \
-DECL(16, opt1) \
-DECL(8, opt1)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, sec, w, h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, \
- dst + 16, dst_stride, sec + 16, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, \
- dst + 32, dst_stride, sec + 32, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, \
- dst + 48, dst_stride, sec + 48, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- for (start_row = 0; start_row < h; start_row +=16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + (start_row * dst_stride), dst_stride, \
- sec + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 16 + (start_row * dst_stride), dst_stride, \
- sec + 16 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 32 + (start_row * dst_stride), dst_stride, \
- sec + 32 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, \
- x_offset, y_offset, \
- dst + 48 + (start_row * dst_stride), dst_stride, \
- sec + 48 + (start_row * w), w, height, &sse2); \
- se += se2; \
- long_sse += sse2; \
- } \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-
-#define FNS(opt1) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
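
The FN() wrappers in the file above stitch the 8- and 16-wide column kernels into a whole-block variance: column se/sse results are summed, the 10- and 12-bit variants rescale the totals back to 8-bit units with ROUND_POWER_OF_TWO (the 12-bit path also accumulates sse in 64 bits, 16 rows at a time), and the return value is sse - se*se / (w*h), with w*h written as a shift by wlog2 + hlog2. A sketch of that final combination step, with an illustrative helper name rather than the library API:

    #include <stdint.h>

    /* Matches the library's ROUND_POWER_OF_TWO; relies on arithmetic right
     * shift for a negative sum of errors, as the original code does. */
    #define ROUND_POW2(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    /* Sketch only: the combination performed at the end of each FN() body. */
    static uint32_t highbd_combine_variance(int se, uint64_t sse,
                                            int wlog2, int hlog2,
                                            int bit_depth) {
      if (bit_depth == 10) {
        se = ROUND_POW2(se, 2);
        sse = ROUND_POW2(sse, 4);
      } else if (bit_depth == 12) {
        se = ROUND_POW2(se, 4);
        sse = ROUND_POW2(sse, 8);
      }
      /* variance = SSE - SE^2 / (W*H), with W*H == 1 << (wlog2 + hlog2) */
      return (uint32_t)(sse - (uint64_t)(((int64_t)se * se) >> (wlog2 + hlog2)));
    }
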
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ /dev/null
@@ -1,1396 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-bilin_filter_m_ssse3: times 8 db 16, 0
- times 8 db 14, 2
- times 8 db 12, 4
- times 8 db 10, 6
- times 16 db 8
- times 8 db 6, 10
- times 8 db 4, 12
- times 8 db 2, 14
-
-SECTION .text
-
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- paddw %5, %3
- pmaddwd %3, %3
- paddw %5, %1
- pmaddwd %1, %1
- paddd %6, %3
- paddd %6, %1
-%endmacro
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
-  ; and outputting to a dword.
- pcmpgtw m5, m6 ; mask for 0 > x
- movhlps m3, m7
- punpcklwd m4, m6, m5
- punpckhwd m6, m5 ; sign-extend m6 word->dword
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- pshufd m4, m6, 0x1
- movd [r1], m7 ; store sse
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%else ; mmsize == 8
- pshufw m4, m6, 0xe
- pshufw m3, m7, 0xe
- paddw m6, m4
- paddd m7, m3
- pcmpgtw m5, m6 ; mask for 0 > x
- mov r1, ssem ; r1 = unsigned int *sse
- punpcklwd m6, m5 ; sign-extend m6 word->dword
- movd [r1], m7 ; store sse
- pshufw m4, m6, 0xe
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- add srcq, src_stridemp
-%else
- add srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%ifdef PIC ; 64bit PIC
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %endif
- %define h heightd
- %define bilin_filter sseq
-%else
- %if ARCH_X86=1 && CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse, g_bilin_filter, g_pw_8
- %define h dword heightm
- %define sec_str sec_stridemp
-
- ;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse, \
- g_bilin_filter, g_pw_8
- %define h heightd
-
- ;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
- %if ARCH_X86_64
- %define h heightd
- %define sec_str sec_strideq
- %else
- %define h dword heightm
- %define sec_str sec_stridemp
- %endif
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
- y_offset, dst, dst_stride, height, sse
- %define h heightd
- %endif
-
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
- ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
- ; could perhaps use it for something more productive then
- pxor m5, m5 ; dedicated zero register
-%if %1 < 16
- sar h, 1
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- mova m1, [dstq]
-%if %2 == 1 ; avg
- pavgb m0, [secq]
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%if %2 == 0 ; !avg
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m0, [srcq+src_strideq]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
-%endif
-%else ; !avg
- movh m2, [srcq+src_strideq]
-%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
-%if %2 == 1 ; avg
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_zero_loop
- STORE_AND_RET
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 8
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
- movh m1, [srcq+src_strideq*2]
- punpckldq m2, m1
-%else
- punpckldq m2, [srcq+src_strideq*2]
-%endif
-%endif
- movh m1, [dstq]
-%if mmsize == 16
- movlhps m0, m2
-%else ; mmsize == 8
- punpckldq m0, m2
-%endif
- movh m3, [dstq+dst_strideq]
- pavgb m0, m2
- punpcklbw m1, m5
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m4, [srcq+src_strideq*2]
- movh m1, [dstq]
- pavgb m0, m2
- movh m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_half_loop
- STORE_AND_RET
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq*2]
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movh m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, m1
- paddw m2, filter_rnd
- movh m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonzero:
- cmp x_offsetd, 8
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m4, [srcq+1]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m0, [srcq+src_strideq]
- movhps m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
- punpckldq m0, [srcq+src_strideq]
- punpckldq m4, [srcq+src_strideq+1]
-%endif
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- pavgb m0, m4
- punpcklbw m3, m5
- pavgb m0, [secq]
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m2, [srcq+src_strideq]
- movh m1, [dstq]
- pavgb m0, m4
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_zero_loop
- STORE_AND_RET
-
-.x_half_y_nonzero:
- cmp y_offsetd, 8
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m3
- punpckhbw m3, m1, m5
- pavgb m0, m4
-%if %2 == 1 ; avg
- punpcklbw m1, m5
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movh m2, [srcq]
- movh m3, [srcq+1]
-%if %2 == 1 ; avg
-%if mmsize == 16
- movhps m2, [srcq+src_strideq]
- movhps m3, [srcq+src_strideq+1]
-%else
-%if %1 == 4
- movh m1, [srcq+src_strideq]
- punpckldq m2, m1
- movh m1, [srcq+src_strideq+1]
- punpckldq m3, m1
-%else
- punpckldq m2, [srcq+src_strideq]
- punpckldq m3, [srcq+src_strideq+1]
-%endif
-%endif
- pavgb m2, m3
-%if mmsize == 16
- movlhps m0, m2
- movhlps m4, m2
-%else ; mmsize == 8
- punpckldq m0, m2
- pshufw m4, m2, 0xe
-%endif
- movh m1, [dstq]
- pavgb m0, m2
- movh m3, [dstq+dst_strideq]
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; !avg
- movh m4, [srcq+src_strideq]
- movh m1, [srcq+src_strideq+1]
- pavgb m2, m3
- pavgb m4, m1
- pavgb m0, m2
- pavgb m2, m4
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_half_loop
- STORE_AND_RET
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_other_loop:
- movu m4, [srcq]
- movu m2, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m2
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- punpcklbw m0, m5
- paddw m2, m3
- punpcklbw m3, m4, m5
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
-%endif
- punpckhbw m3, m1, m5
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-%if notcpuflag(ssse3)
- punpcklbw m0, m5
-%endif
-.x_half_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
- pavgb m2, m1
- pavgb m4, m3
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movh m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- paddw m0, m1
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m2, m1
- movh m1, [dstq]
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- pmullw m0, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
- movh m2, [srcq+src_strideq]
- movh m4, [srcq+src_strideq+1]
- movh m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- movh m1, [dstq]
- punpcklbw m2, m4
- pmaddubsw m0, filter_x_a
- pmaddubsw m2, filter_x_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, m1
- paddw m2, filter_rnd
- movh m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 8
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
- add srcq, src_strideq
- packuswb m0, m2
-.x_other_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
-%if cpuflag(ssse3)
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- pavgb m0, m4
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%else
- punpckhbw m2, m4, m5
- punpckhbw m1, m3, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- paddw m4, m3
- paddw m2, m1
- mova m1, [dstq]
- psraw m4, 4
- psraw m2, 4
- punpckhbw m3, m1, m5
- ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
- ; have a 1-register shortage to be able to store the backup of the bilin
-  ; filtered second line as words, as a cache for the next line. Packing into
- ; a byte costs 1 pack and 2 unpacks, but saves a register.
- packuswb m4, m2
- punpcklbw m1, m5
- pavgb m0, m4
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- pavgb m0, [secq]
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- add srcq, src_strideq
- psraw m0, 4
-.x_other_y_half_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movh m1, [dstq]
- movh m3, [dstq+dst_strideq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- movh m1, [dstq]
- paddw m4, m3
- movh m3, [dstq+dst_strideq]
-%endif
- psraw m2, 4
- psraw m4, 4
- pavgw m0, m2
- pavgw m2, m4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline - also consider going to bytes here
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-%ifdef PIC
- lea bilin_filter, [bilin_filter_m]
-%endif
- shl x_offsetd, filter_idx_shift
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m11, [bilin_filter+y_offsetq+16]
-%endif
- mova m12, [pw_8]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register, so the src_stride register is used.
-; Later, src_stride has to be reloaded from the stack when it is needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
-%endif
-%endif
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
-
- INC_SRC_BY_SRC_STRIDE
-
- packuswb m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- punpckhbw m3, m1, m5
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- punpcklbw m1, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
- psraw m0, 4
-%else
- movu m3, [srcq]
- movu m4, [srcq+1]
- punpckhbw m1, m3, m5
- punpckhbw m2, m4, m5
- punpcklbw m3, m5
- punpcklbw m4, m5
- pmullw m3, filter_x_a
- pmullw m4, filter_x_b
- paddw m3, filter_rnd
- pmullw m1, filter_x_a
- pmullw m2, filter_x_b
- paddw m1, filter_rnd
- paddw m3, m4
- paddw m1, m2
- psraw m3, 4
- psraw m1, 4
- packuswb m4, m3, m1
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- pmullw m2, filter_y_a
- pmullw m1, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, m1
- mova m1, [dstq]
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
- punpckhbw m3, m1, m5
- psraw m0, 4
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- add dstq, dst_strideq
-%else ; %1 < 16
- movh m0, [srcq]
- movh m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- psraw m0, 4
-%if cpuflag(ssse3)
- packuswb m0, m0
-%endif
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movh m2, [srcq]
- movh m1, [srcq+1]
-
- INC_SRC_BY_SRC_STRIDE
- movh m4, [srcq]
- movh m3, [srcq+1]
-
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movh m3, [dstq+dst_strideq]
- movh m1, [dstq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m2, m2
- packuswb m4, m4
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- psraw m0, 4
- psraw m2, 4
- punpcklbw m1, m5
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- paddw m4, m3
- psraw m2, 4
- psraw m4, 4
- pmullw m0, filter_y_a
- pmullw m3, m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m0, m3
- movh m3, [dstq+dst_strideq]
- paddw m2, m1
- movh m1, [dstq]
- psraw m0, 4
- psraw m2, 4
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec h
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_MMX sse
-SUBPEL_VARIANCE 4
-INIT_XMM sse2
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4
-INIT_XMM ssse3
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_MMX sse
-SUBPEL_VARIANCE 4, 1
-INIT_XMM sse2
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_MMX ssse3
-SUBPEL_VARIANCE 4, 1
-INIT_XMM ssse3
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
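
Every SUBPEL_VARIANCE variant deleted above implements the same two-pass bilinear scheme: x_offset selects a horizontal (f0, f1) pair and y_offset a vertical one from bilin_filter_m (each pair sums to 16, with pw_8 as the rounding constant and a right shift by 4), the zero and half-pel offsets fall back to plain copies or pavgb, and SUM_SSE accumulates the sum and sum of squares of the prediction error, which STORE_AND_RET reduces to the returned SE and the stored SSE. A scalar model of the whole computation, offered only as a sketch; the filter pairs are passed in directly, so nothing is assumed about how the offsets index the table:

    #include <stdint.h>

    /* Sketch only: scalar equivalent of one NxH sub-pixel variance, with the
     * final variance folded in (the asm returns SE and stores SSE instead).
     * Like the kernels, it reads one extra row and column of the source. */
    static uint32_t subpel_variance_model(const uint8_t *src, int src_stride,
                                          int fx0, int fx1, int fy0, int fy1,
                                          const uint8_t *dst, int dst_stride,
                                          int w, int h, uint32_t *sse_out) {
      int64_t sum = 0;
      uint64_t sse = 0;
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* horizontal bilinear on the current and the next row */
          const int a = (fx0 * src[y * src_stride + x] +
                         fx1 * src[y * src_stride + x + 1] + 8) >> 4;
          const int b = (fx0 * src[(y + 1) * src_stride + x] +
                         fx1 * src[(y + 1) * src_stride + x + 1] + 8) >> 4;
          /* vertical bilinear, then the error against the reference */
          const int pred = (fy0 * a + fy1 * b + 8) >> 4;
          const int diff = pred - dst[y * dst_stride + x];
          sum += diff;
          sse += (int64_t)diff * diff;
        }
      }
      *sse_out = (uint32_t)sse;
      return (uint32_t)(sse - (uint64_t)((sum * sum) / (w * h)));
    }
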
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ /dev/null
@@ -1,525 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/encoder/vp9_variance.h"
-
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-};
-
-#define FILTER_SRC(filter) \
- /* filter the source */ \
- exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
- exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
- \
- /* add 8 to source */ \
- exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
- exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
- \
- /* divide source by 16 */ \
- exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
- exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg) \
- exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
- exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST \
- /* load source and destination */ \
- src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
- dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
- /* average between current and next stride source */ \
- src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *) \
- (src + size_stride)); \
- MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP \
- /* expand each byte to 2 bytes */ \
- exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
- exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
- /* source - dest */ \
- exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
- exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
-  /* calculate sum */ \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
- exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
- exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
- /* calculate sse */ \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation of sum and sse
-#define CALC_SUM_AND_SSE \
- res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
- sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
- sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
- \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
- \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-
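
The macros above carry the per-pixel arithmetic of the AVX2 kernels that follow: FILTER_SRC uses _mm256_maddubs_epi16 to multiply interleaved unsigned source bytes by the signed (f0, f1) filter bytes and sum each pair, then adds 8 and shifts right by 4, and CALC_SUM_SSE_INSIDE_LOOP accumulates the signed error and its square before CALC_SUM_AND_SSE reduces the accumulators to scalars. Per byte pair that amounts to the following sketch (not library code):

    #include <stdint.h>

    /* Sketch only: the value one 16-bit lane holds after FILTER_SRC. */
    static int16_t bilin_filter_pair(uint8_t a, uint8_t b, int f0, int f1) {
      return (int16_t)((f0 * a + f1 * b + 8) >> 4);  /* maddubs, add pw8, srai 4 */
    }
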
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- int height,
- unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
-        // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
-        // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- // save current source average
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sse) {
- __m256i sec_reg;
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expend each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec+= sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average between previous average to current average
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec+= sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height ; i++) {
- // save current source average
- src_avg = src_reg;
- src+= src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height ; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- MERGE_WITH_SRC(src_reg, zero_reg)
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- src+= src_stride;
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average between previous pack to the current
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec+= sec_stride;
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256((__m256i const *)
- (bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load source and another source starting from the next
- // following byte
- src_reg = _mm256_loadu_si256((__m256i const *) (src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // convert each 16 bit to 8 bit to each low and high lane source
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height ; i++) {
- src+= src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // merge previous pack to current pack source
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- sec+= sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst+= dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- return sum;
-}
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ /dev/null
@@ -1,104 +1,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height,
- unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sseptr);
-
-unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- unsigned int sse1;
- const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 64, &sse1);
- unsigned int sse2;
- const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
- x_offset, y_offset,
- dst + 32, dst_stride,
- 64, &sse2);
- const int se = se1 + se2;
- *sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
- unsigned int sse1;
- const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 64, 64, &sse1);
- unsigned int sse2;
- const int se2 =
- vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
- y_offset, dst + 32, dst_stride,
- sec + 32, 64, 64, &sse2);
- const int se = se1 + se2;
-
- *sse = sse1 + sse2;
-
- return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- unsigned int *sse,
- const uint8_t *sec) {
- // processing 32 element in parallel
- const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 32, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
-}
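For readers following the refactor, the closing expression in each of the wrappers above is the usual variance identity: variance = SSE - sum^2 / N, where N is the pixel count of the block, so the shift is 12 for 64x64 (2^12 pixels) and 10 for 32x32. A minimal scalar sketch of that final step (the function name is illustrative, not part of the library):

static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_count) {
  /* variance = SSE - (sum * sum) / N, with N = 1 << log2_count */
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_count);
}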
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ /dev/null
@@ -1,182 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sse_ptr) { \
- unsigned int sse; \
- int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- h, &sse, NULL, NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
-FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
-FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
-FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
-FN(4, 4, 4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- const uint8_t *sec, \
- ptrdiff_t sec_stride, \
- int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
- int src_stride, \
- int x_offset, \
- int y_offset, \
- const uint8_t *dst, \
- int dst_stride, \
- unsigned int *sseptr, \
- const uint8_t *sec) { \
- unsigned int sse; \
- int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
- y_offset, dst, dst_stride, \
- sec, w, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
- x_offset, y_offset, \
- dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
- x_offset, y_offset, \
- dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
- x_offset, y_offset, \
- dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, \
- NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sseptr = sse; \
- return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
-FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
-FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
-FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
-FN(4, 4, 4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
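One detail worth spelling out in the FN()/FNS() machinery above is the cast parameter: sum * sum can overflow 32 bits for the larger blocks, which is why 32x16 and larger use (int64_t) while 16x16 and smaller keep (unsigned int). A quick, hedged bound, assuming 8-bit pixels so each per-pixel difference is at most 255:

#include <stdio.h>

int main(void) {
  const long long max_sum_16x16 = 16 * 16 * 255;  /* 65280: its square still fits in 32 unsigned bits */
  const long long max_sum_32x16 = 32 * 16 * 255;  /* 130560: its square needs 64-bit arithmetic */
  printf("16x16 worst-case sum^2: %lld\n", max_sum_16x16 * max_sum_16x16);
  printf("32x16 worst-case sum^2: %lld\n", max_sum_32x16 * max_sum_32x16);
  return 0;
}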
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -131,7 +131,6 @@
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -58,7 +58,6 @@
VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
-VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@@ -84,7 +83,6 @@
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
-VP9_CX_SRCS-yes += encoder/vp9_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
@@ -103,7 +101,6 @@
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -114,13 +111,7 @@
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
-endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
@@ -143,7 +134,6 @@
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
@@ -150,7 +140,6 @@
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@@ -160,6 +149,5 @@
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vpx_dsp/arm/bilinear_filter_media.asm
@@ -1,0 +1,237 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_filter_block2d_bil_first_pass_media|
+ EXPORT |vpx_filter_block2d_bil_second_pass_media|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vpx_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vpx_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 so the transposed output pitch is (height+1) elements
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vpx_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vpx_filter_block2d_bil_second_pass_media|
+
+ END
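As a reading aid, here is a hedged scalar model of what the first-pass routine above computes (the function name and comments are illustrative, not part of the library): a two-tap horizontal filter whose coefficients sum to 128, rounded by the add-0x40 / shift-by-7 step, with results stored transposed so the second pass can walk columns as rows. The "height*2 + 2" byte pitch set up in the assembly corresponds to a transposed pitch of (height + 1) 16-bit elements.

#include <stdint.h>

static void bil_first_pass_model(const uint8_t *src, uint16_t *dst,
                                 unsigned int src_pitch, unsigned int height,
                                 unsigned int width, const int16_t *filter) {
  unsigned int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      /* two-tap bilinear blend; filter[0] + filter[1] == 128 */
      const int sum = src[c] * filter[0] + src[c + 1] * filter[1];
      /* transposed store: element pitch of (height + 1) per column */
      dst[c * (height + 1) + r] = (uint16_t)((sum + 64) >> 7);
    }
    src += src_pitch;
  }
}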
--- /dev/null
+++ b/vpx_dsp/arm/subpel_variance_media.c
@@ -1,0 +1,105 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = {
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
+ uint16_t *dst_ptr,
+ uint32_t src_pitch,
+ uint32_t height,
+ uint32_t width,
+ const int16_t *filter);
+
+extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
+ uint8_t *dst_ptr,
+ int32_t src_pitch,
+ uint32_t height,
+ uint32_t width,
+ const int16_t *filter);
+
+
+unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ uint16_t first_pass[10*8];
+ uint8_t second_pass[8*8];
+ const int16_t *HFilter, *VFilter;
+
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+ src_pixels_per_line,
+ 9, 8, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+ 8, 8, 8, VFilter);
+
+ return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ uint16_t first_pass[36*16];
+ uint8_t second_pass[20*16];
+ const int16_t *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0) {
+ var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else if (xoffset == 0 && yoffset == 4) {
+ var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else if (xoffset == 4 && yoffset == 4) {
+ var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ sse);
+ } else {
+ HFilter = bilinear_filters_media[xoffset];
+ VFilter = bilinear_filters_media[yoffset];
+
+ vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
+}
+#endif // HAVE_MEDIA
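Note why the first-pass calls above run over one extra row (9 rows for the 8x8 block, 17 for 16x16): producing H vertically filtered rows in the second pass requires H + 1 intermediate rows, because each output pixel blends a row with the row below it. In short (illustrative names, not library identifiers):

    intermediate_rows = output_height + 1    /* 8x8 -> 9, 16x16 -> 17 */
    intermediate_cols = output_width         /* the horizontal pass keeps the block width */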
--- /dev/null
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -1,0 +1,152 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0, },
+ { 112, 16, },
+ { 96, 32, },
+ { 80, 48, },
+ { 64, 64, },
+ { 48, 80, },
+ { 32, 96, },
+ { 16, 112, },
+};
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 16) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, bilinear_filters[yoffset]);
+ return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 17, 16,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+ 16, bilinear_filters[yoffset]);
+ return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 33, 32,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+ 32, bilinear_filters[yoffset]);
+ return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 65, 64,
+ bilinear_filters[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+ 64, bilinear_filters[yoffset]);
+ return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
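For reference, a hedged scalar equivalent of the two NEON helpers above (the function name is illustrative): each output byte is a two-tap bilinear blend of a pixel and the pixel pixel_step bytes away, where vrshrn_n_u16(..., FILTER_BITS) corresponds to a rounding shift by 7, matching filter taps that sum to 128.

#include <stdint.h>

static void var_filter_block2d_bil_model(const uint8_t *src, uint8_t *dst,
                                         unsigned int src_stride,
                                         int pixel_step,
                                         unsigned int out_height,
                                         unsigned int out_width,
                                         const uint8_t *filter) {
  unsigned int i, j;
  for (i = 0; i < out_height; ++i) {
    for (j = 0; j < out_width; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      dst[j] = (uint8_t)((sum + 64) >> 7);  /* rounding shift by FILTER_BITS (7) */
    }
    src += src_stride;
    dst += out_width;
  }
}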
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -1,0 +1,182 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_h_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_h_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
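The mvn/uhsub8/eor sequence used throughout this routine computes a rounded byte-wise average, (a + b + 1) >> 1, without widening to 16 bits. A hedged scalar check of the identity it relies on (UHSUB8 halves the 9-bit signed difference; the final EOR with 0x80808080 adds 128 per byte; the model assumes arithmetic right shift of negative values, as on the ARM target):

#include <assert.h>

int main(void) {
  int a, b;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      const int diff = a - (255 - b);         /* MVN, then subtract            */
      const int halved = diff >> 1;           /* arithmetic halving (UHSUB8)   */
      const int avg = (halved + 128) & 0xff;  /* EOR with 0x80 in each byte    */
      assert(avg == ((a + b + 1) >> 1));
    }
  }
  return 0;
}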
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -1,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_hv_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_hv_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -1,0 +1,184 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_variance_halfpixvar16x16_v_media|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_v_media| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+ subs r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+ sub r8, r8, r5 ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -24,10 +24,34 @@
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
#if (__mips_isa_rev >= 6)
+#define LH(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
@@ -73,6 +97,18 @@
})
#endif // (__mips == 64)
+#define SH(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
#define SW(val, pdst) { \
uint8_t *pdst_m = (uint8_t *)(pdst); \
const uint32_t val_m = (val); \
@@ -97,6 +133,20 @@
); \
}
#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [psrc_m] "m" (*psrc_m) \
+ ); \
+ \
+ val_m; \
+})
+
#define LW(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
@@ -111,18 +161,6 @@
val_m; \
})
-#define SW(val, pdst) { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__ ( \
- "usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m" (*pdst_m) \
- : [val_m] "r" (val_m) \
- ); \
-}
-
#if (__mips == 64)
#define LD(psrc) ({ \
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
@@ -154,6 +192,30 @@
})
#endif // (__mips == 64)
+#define SH(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define SW(val, pdst) { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m" (*pdst_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
#define SD(val, pdst) { \
uint8_t *pdst_m1 = (uint8_t *)(pdst); \
uint32_t val0_m, val1_m; \
@@ -196,6 +258,34 @@
LD2((psrc) + 2 * stride, stride, out2, out3); \
}
+/* Description : Store 4 words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store word from 'in0' to (pdst)
+ Store word from 'in1' to (pdst + stride)
+ Store word from 'in2' to (pdst + 2 * stride)
+ Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) { \
+ SW(in0, (pdst)) \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+}
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) { \
+ SD(in0, (pdst)) \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+}
+
/* Description : Load vectors with 16 byte elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@@ -228,7 +318,15 @@
out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+#define LD_B7(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6) { \
+ LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+}
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
#define LD_B8(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
@@ -247,6 +345,7 @@
out0 = LD_H(RTYPE, (psrc)); \
out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
LD_H2(RTYPE, (psrc), stride, out0, out1); \
@@ -254,6 +353,229 @@
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+#define LD_H8(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+}
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, out15) { \
+ LD_H8(RTYPE, (psrc), stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7); \
+ LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
+ out8, out9, out10, out11, out12, out13, out14, out15); \
+}
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+ data into 4 vectors (Each vector with 4 signed halfwords)
+ Arguments : Input - psrc
+ Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+}
+
+/* Description : Load 2 vectors of signed word elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ pdst, stride) { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
+ ST_H2(RTYPE, in0, in1, (pdst), stride); \
+ ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+}
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
+ ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
+ ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+}
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 word elements from 'in0' to (pdst)
+ Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ Arguments : Inputs - in, stidx, pdst, stride
+ Details : Index 'stidx' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst)
+ Index 'stidx+1' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + stride)
+ Index 'stidx+2' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 2 * stride)
+ Index 'stidx+3' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+}
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to the GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to the GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : 'Idx0' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst)
+ 'Idx1' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + stride)
+ 'Idx2' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ 'Idx3' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+}
+#define ST4x8_UB(in0, in1, pdst, stride) { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+}
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+}
+
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3,
Outputs - out0, out1
@@ -275,6 +597,27 @@
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
+/* Description : Immediate number of elements to slide with zero
+ Arguments : Inputs - in0, in1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'zero_m' vector are slid into 'in0' by
+ the value specified in 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+}
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3, slide_val) { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+}
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
/* Description : Immediate number of elements to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
@@ -287,7 +630,149 @@
out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
+ out0, out1, out2, slide_val) { \
+ SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+}
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
+ out0, out1, out2, out3) { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+}
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Unsigned byte elements from 'mult0' are multiplied with
+ unsigned byte elements from 'cnst0' producing a result
+ twice the size of input i.e. unsigned halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, \
+ out0, out1, out2, out3) { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
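For reference, the per-lane arithmetic behind DOTP_UB2/DOTP_UB4 can be written as a scalar loop. This sketch is not part of the patch; it lets the halfword lanes wrap, which the filter code here never triggers because the taps sum to at most 128.

#include <stdint.h>

/* Scalar model of dotp_u_h: each halfword lane is the sum of the two
 * adjacent even/odd byte products. */
static void dotp_u_h_model(const uint8_t mult[16], const uint8_t cnst[16],
                           uint16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = (uint16_t)(mult[2 * i] * cnst[2 * i] +
                        mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}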
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+}
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, \
+ out0, out1, out2, out3) { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed word elements from 'mult0' are multiplied with
+ signed word elements from 'cnst0' producing a result
+ twice the size of input i.e. signed double word.
+ The multiplication result of adjacent odd-even elements
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication result of adjacent odd-even elements
+ are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
+ cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
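The DPADD_* forms differ from DOTP_* only in accumulating onto the existing output lanes, which is how the multi-tap filters build up their sums. A scalar sketch of DPADD_SB2 per lane, illustrative only:

#include <stdint.h>

/* Scalar model of dpadd_s_h: the signed-byte dot product of each adjacent
 * even/odd pair is added onto the existing halfword accumulator lane. */
static void dpadd_s_h_model(const int8_t mult[16], const int8_t cnst[16],
                            int16_t acc[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    acc[i] = (int16_t)(acc[i] + mult[2 * i] * cnst[2 * i] +
                       mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}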
+
/* Description : Dot product & addition of halfword vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
@@ -309,7 +794,7 @@
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of it
+ producing an intermediate result twice the size of input
i.e. signed double word
The multiplication result of adjacent odd-even elements
are added to the 'out0' vector
@@ -320,6 +805,49 @@
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+/* Description : Element-wise minimum of the unsigned halfword elements of the
+ input vectors and 'min_vec', written back in place
+ Arguments : Inputs - in0, in1, min_vec
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Minimum of unsigned halfword element values from 'in0' and
+ 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+}
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+})
+#define CLIP_SH2_0_255(in0, in1) { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3) { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+}
+
/* Description : Horizontal addition of 4 signed word elements of input vector
Arguments : Input - in (signed word vector)
Output - sum_m (i32 sum)
@@ -358,6 +886,26 @@
sum_m; \
})
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to the
+ adjacent even unsigned byte element from 'in0' (pairwise) and
+ the halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
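Because HADD_UB2 passes the same vector as both operands, each output lane is simply the sum of one adjacent byte pair; HSUB_UH2 below forms the pairwise difference in the same layout. A scalar sketch of the addition case, not part of the patch:

#include <stdint.h>

/* Scalar model of hadd_u_h(in, in): adjacent byte pairs are summed into
 * halfword lanes. */
static void hadd_ub_model(const uint8_t in[16], uint16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i)
    out[i] = (uint16_t)(in[2 * i] + in[2 * i + 1]);
}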
+
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -393,6 +941,20 @@
sad_m; \
})
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed odd halfword element from 'in0' is subtracted from
+ the adjacent even signed halfword element from 'in0' (pairwise)
+ and the word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
/* Description : Set element n input vector to GPR value
Arguments : Inputs - in0, in1, in2, in3
Output - out
@@ -399,6 +961,12 @@
Return Type - as per RTYPE
Details : Set element 0 in vector 'out' to value specified in 'in0'
*/
+#define INSERT_W2(RTYPE, in0, in1, out) { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+}
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
@@ -415,6 +983,211 @@
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+}
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+}
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
+ out4, out5, out6, out7); \
+}
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
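The interleave macros are the workhorses of the transposes and zero extensions below. A scalar sketch of the right-half byte interleave; the operand order shown is my reading, chosen so that interleaving with a zero vector in the first slot behaves as the zero-extension that UNPCK_UB_SH later relies on:

#include <stdint.h>

/* Scalar model of ilvr_b(a, b): the low 8 bytes of the two sources are
 * interleaved; the second source supplies the even output positions, so
 * interleaving with zero in 'a' zero-extends 'b' to halfwords. */
static void ilvr_b_model(const uint8_t a[16], const uint8_t b[16],
                         uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[2 * i] = b[i];
    out[2 * i + 1] = a[i];
  }
}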
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+}
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+}
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -427,13 +1200,138 @@
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+}
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+}
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val) \
+}
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the signed
+ range representable in (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is saturated to the
+ signed range of (sat_val + 1) bits
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
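A scalar sketch of the two saturation flavours, illustrative only and assuming 0 <= sat_val <= 15:

#include <stdint.h>

/* SAT_UH clamps to the largest (sat_val + 1)-bit unsigned value;
 * SAT_SH clamps to the signed (sat_val + 1)-bit range. */
static uint16_t sat_u_h_model(uint16_t x, int sat_val) {
  const uint16_t max = (uint16_t)((1u << (sat_val + 1)) - 1);
  return x > max ? max : x;
}

static int16_t sat_s_h_model(int16_t x, int sat_val) {
  const int32_t max = (1 << sat_val) - 1;
  const int32_t min = -(1 << sat_val);
  return (int16_t)(x > max ? max : (x < min ? min : x));
}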
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+}
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
+ out0, out1, out2, out3) { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+}
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
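Following the description above, a scalar sketch of the even-element pack for bytes; the halfword and double word variants are analogous:

#include <stdint.h>

/* Scalar model of pckev_b(in0, in1): even-indexed bytes of 'in1' fill the
 * low (right) half of the result, even-indexed bytes of 'in0' the high
 * (left) half. */
static void pckev_b_model(const uint8_t in0[16], const uint8_t in1[16],
                          uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = in1[2 * i];
    out[8 + i] = in0[2 * i];
  }
}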
+
/* Description : Pack even double word elements of vector pairs
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
@@ -447,6 +1345,7 @@
out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
@@ -455,6 +1354,256 @@
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+}
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is added to each
+ signed halfword element of 'in1' with full precision resulting
+ in one extra bit in the result. The result is then divided by
+ 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+}
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
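Note the difference from the AVER_* macros at the top of this header: AVE_* performs the plain halving described above, while AVER_* adds a rounding bit first, i.e. (in0 + in1 + 1) / 2. A scalar comparison, illustrative only:

#include <stdint.h>

/* Plain halving (arithmetic shift by one, carry kept in the wide
 * intermediate) versus halving with rounding. */
static int16_t ave_s_h_model(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a + b) >> 1);
}

static uint8_t aver_u_b_model(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}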
+
+/* Description : Addition of signed halfword elements and signed saturation
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'in0' are added to signed
+ halfword elements of 'in1'. The result is then saturated to the
+ signed halfword range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+}
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+}
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRAR_W2(RTYPE, in0, in1, shift) \
+ SRAR_W2(RTYPE, in2, in3, shift) \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
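A scalar sketch of the rounded arithmetic right shift performed by SRARI_* (and by SRAR_* with a per-element vector shift count), assuming the usual arithmetic behaviour of >> on signed values:

#include <stdint.h>

/* Scalar model of srari_w: the last bit shifted out is added back before
 * shifting, i.e. round to nearest; a shift of 0 leaves the value as is. */
static int32_t srari_w_model(int32_t x, int shift) {
  if (shift == 0) return x;
  return (x + (1 << (shift - 1))) >> shift;
}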
+
+/* Description : Logical shift right all elements of vector (immediate)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is logically right shifted by
+ 'shift' and the result is written to 'out0'. 'shift' is an
+ immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+}
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved with the same vector 'in' to generate
+ 4 sign-extended word elements
+*/
+#define UNPCK_R_SH_SW(in, out) { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (unsigned halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+}
+
/* Description : Sign extend halfword elements from input vector and return
the result in pair of vectors
Arguments : Input - in (halfword vector)
@@ -473,52 +1622,312 @@
ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
-/* Description : Store 4 double words with stride
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+}
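BUTTERFLY_4 is the classic transform building block: mirrored ends are summed, then differenced. Since the macro is type-generic, plain integers illustrate it; a worked example, not part of the patch:

/* With (in0..in3) = (1, 2, 3, 4):
 *   out0 = in0 + in3 = 5,  out1 = in1 + in2 = 5,
 *   out2 = in1 - in2 = -1, out3 = in0 - in3 = -3. */
static void butterfly4_example(void) {
  const int in0 = 1, in1 = 2, in2 = 3, in3 = 4;
  const int out0 = in0 + in3, out1 = in1 + in2;
  const int out2 = in1 - in2, out3 = in0 - in3;
  (void)out0; (void)out1; (void)out2; (void)out3;
}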
+
+/* Description : Butterfly of 8 input vectors
+ Arguments : Inputs - in0 ... in7
+ Outputs - out0 .. out7
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+}
+
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, out15) { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+}
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
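The interleave/slide sequence above implements an 8x8 byte transpose. A scalar reference for the end result, assuming each input and output vector carries one 8-byte row in its low half:

#include <stdint.h>

/* Scalar reference: rows and columns of an 8x8 byte block are swapped. */
static void transpose8x8_model(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}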
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+}
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
+ tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+}
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
+ tmp3_m, tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+}
+
+/* Description : Add block 4x4
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
+ Details : Least significant 4 bytes from each input vector are added to
+ the destination bytes, clipped between 0-255 and stored.
*/
-#define SD4(in0, in1, in2, in3, pdst, stride) { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
}
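ADDBLK_ST4x4_UB is the usual reconstruction step: residuals are added to the prediction already in the destination, clipped to pixel range and written back. A scalar sketch, assuming each input vector's low four halfword lanes hold one residual row:

#include <stdint.h>

/* Scalar model: dst = clip(dst + residual) over a 4x4 block. */
static void addblk_4x4_model(const int16_t res[4][4], uint8_t *dst,
                             int stride) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = dst[r * stride + c] + res[r][c];
      dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}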
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
+#define PCKEV_XORI128_UB(in0, in1) ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+})
+
+/* Description : Convert inputs to unsigned bytes, interleave, average and
+                 store as an 8x4 unsigned byte block
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
+ dst0, dst1, dst2, dst3, pdst, stride) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
*/
-#define ST8x4_UB(in0, in1, pdst, stride) { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+#define PCKEV_ST_SB(in0, in1, pdst) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
}
+
+/* Description : Horizontal 2-tap filter kernel
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+                 Return Type - unsigned halfword
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
+ \
+ tmp1_m; \
+})
#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
--- /dev/null
+++ b/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -1,0 +1,767 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_msa[8][2] = {
+ { 128, 0, },
+ { 112, 16, },
+ { 96, 32, },
+ { 80, 48, },
+ { 64, 64, },
+ { 48, 80, },
+ { 32, 96, },
+ { 16, 112, },
+};
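+/* Each filter pair sums to 128 (1 << FILTER_BITS); entry k holds the weights
+   { 128 - 16 * k, 16 * k } for a k/8-pel offset, so entry 0 is a plain copy
+   and entry 4 an exact half-pel average. */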
+
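+/* Accumulate the squared differences between 'src' and 'ref' into the v4i32
+   accumulator 'var' and the signed pixel differences into the v8i16
+   accumulator 'sub'; callers reduce both with horizontal adds at the end. */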
+#define CALC_MSE_AVG_B(src, ref, var, sub) { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+}
+
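+/* variance = sse - sum * sum / (width * height); 'shift' is log2(w * h). The
+   LARGE variant widens the product to int64_t so that the squared sum cannot
+   overflow 32 bits for blocks of 512 pixels or more. */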
+#define VARIANCE_WxH(sse, diff, shift) \
+ sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
+
+static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ vec0, vec1, vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+ src0, src1, src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ CALC_MSE_AVG_B(src0, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 filt0, out, ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3, const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ vec0, vec1, vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+ src0, src1, src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 const255;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ out0, out1, out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+ out4, out5, out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
+ src0, src1, src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4, out;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 ref = { 0 };
+ v16u8 src2110, src4332;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+ src10_r, src21_r, src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+ vec0, vec1, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out, ref = { 0 };
+ v16u8 filt_vt, filt_hz, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt_vt, filt_hz, vec0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
+ int32_t src_stride,
+ const uint8_t *dst,
+ int32_t dst_stride,
+ const uint8_t *filter_horiz,
+ const uint8_t *filter_vert,
+ int32_t height,
+ int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
+ int32_t src_stride, \
+ int32_t xoffset, \
+ int32_t yoffset, \
+ const uint8_t *ref, \
+ int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \
+ ref, ref_stride, \
+ h_filter, v_filter, \
+ ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \
+ ref, ref_stride, \
+ v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \
+ ref, ref_stride, \
+ h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = vpx_variance##wd##x##ht##_msa(src, src_stride, \
+ ref, ref_stride, sse); \
+ } \
+ } \
+ \
+ return var; \
+}
+
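+/* For example, VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) below defines
+   vpx_sub_pixel_variance16x16_msa(): with both offsets zero it falls through
+   to vpx_variance16x16_msa(), otherwise it runs the horizontal, vertical or
+   2-D bilinear path and finishes with VARIANCE_16Wx16H(). */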
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -14,13 +14,26 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride) {
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 },
+};
+
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride) {
int distortion = 0;
int r, c;
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
@@ -32,7 +45,7 @@
return distortion;
}
-unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+uint32_t vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
@@ -42,16 +55,38 @@
return sum;
}
+uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
+ b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
+ b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
+ b, b_stride, sse);
+}
+
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@@ -62,15 +97,113 @@
}
}
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the first pass of the 2-D separable filter.
+//
+// Produces uint16_t output to retain precision for the next pass. The two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// pixel to the next.
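+//
+// For example, with filter = bilinear_filters[2] = { 96, 32 } each output is
+// b[j] = ROUND_POWER_OF_TWO(96 * a[0] + 32 * a[pixel_step], FILTER_BITS),
+// i.e. a 3/4 : 1/4 blend of the two neighboring input pixels.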
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+ (int)a[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or the vertical direction to produce the filtered output block.
+// Used to implement the second pass of the 2-D separable filter.
+//
+// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. The
+// two filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride), i.e. the offset required to move from one input
+// pixel to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
+ (int)a[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
#define VAR(W, H) \
-unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
+#define SUBPIX_VAR(W, H) \
+uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+\
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+}
+
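+/* For example, VARIANCES(16, 16) below expands SUBPIX_VAR(16, 16) into
+ * vpx_sub_pixel_variance16x16_c(). A half-pel horizontal estimate against a
+ * caller-provided reference would look like:
+ *
+ *   uint32_t sse;
+ *   const uint32_t var =
+ *       vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 0,
+ *                                     ref, ref_stride, &sse);
+ *
+ * which is exactly what vpx_variance_halfpixvar16x16_h_c() above does. */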
+#define SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+\
+ var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+\
+ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+}
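+/* SUBPIX_AVG_VAR additionally averages the filtered block with 'second_pred'
+   via vpx_comp_avg_pred() (rounding up) before measuring the variance against
+   'b'. */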
+
/* Identical to the variance call except it takes an additional parameter, sum,
* and returns that value using pass-by-reference instead of returning
* sse - sum^2 / w*h
@@ -78,7 +211,7 @@
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
@@ -87,28 +220,34 @@
* variable.
*/
#define MSE(W, H) \
-unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
-VAR(64, 64)
-VAR(64, 32)
-VAR(32, 64)
-VAR(32, 32)
-VAR(32, 16)
-VAR(16, 32)
-VAR(16, 16)
-VAR(16, 8)
-VAR(8, 16)
-VAR(8, 8)
-VAR(8, 4)
-VAR(4, 8)
-VAR(4, 4)
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
GET_VAR(16, 16)
GET_VAR(8, 8)
@@ -117,12 +256,13 @@
MSE(8, 16)
MSE(8, 8)
-void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride) {
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height,
+ const uint8_t *ref, int ref_stride) {
int i, j;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
@@ -143,8 +283,8 @@
*sum = 0;
*sse = 0;
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@@ -156,60 +296,60 @@
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)sse_long;
+ *sse = (uint32_t)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
-unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
-unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
-unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
- int a_stride, \
- const uint8_t *b, \
- int b_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+ int a_stride, \
+ const uint8_t *b, \
+ int b_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
@@ -217,54 +357,243 @@
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
- unsigned int *sse, int *sum) { \
+ uint32_t *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
-unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
-unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
-unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
- int src_stride, \
- const uint8_t *ref, \
- int ref_stride, \
- unsigned int *sse) { \
+uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+ int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride, \
+ uint32_t *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
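+/* The high-bitdepth sub-pixel paths below mirror the 8-bit ones but operate on
+   uint16_t pixels reached through CONVERT_TO_SHORTPTR()/CONVERT_TO_BYTEPTR()
+   and keep the intermediate filter output in 16 bits. */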
+static void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+ (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+ (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters[xoffset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[yoffset]); \
+\
+ vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+\
+ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+ W, dst, dst_stride, sse); \
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
@@ -273,20 +602,6 @@
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-HIGHBD_VAR(64, 64)
-HIGHBD_VAR(64, 32)
-HIGHBD_VAR(32, 64)
-HIGHBD_VAR(32, 32)
-HIGHBD_VAR(32, 16)
-HIGHBD_VAR(16, 32)
-HIGHBD_VAR(16, 16)
-HIGHBD_VAR(16, 8)
-HIGHBD_VAR(8, 16)
-HIGHBD_VAR(8, 8)
-HIGHBD_VAR(8, 4)
-HIGHBD_VAR(4, 8)
-HIGHBD_VAR(4, 4)
-
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
@@ -293,8 +608,8 @@
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
--- /dev/null
+++ b/vpx_dsp/variance.h
@@ -1,0 +1,94 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VARIANCE_H_
+#define VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
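+/* FILTER_WEIGHT == 1 << FILTER_BITS; the two taps of each bilinear filter pair
+   sum to this value. */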
+
+typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b_ptr, int b_stride);
+
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *second_pred);
+
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
+ uint8_t *b, int b_stride, int n);
+
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sad_array);
+
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride,
+ unsigned int *sad_array);
+
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+ int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b_ptr,
+ int b_stride,
+ unsigned int *sse,
+ const uint8_t *second_pred);
+#if CONFIG_VP8
+typedef struct variance_vtable {
+ vpx_sad_fn_t sdf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_variance_fn_t svf_halfpix_h;
+ vpx_variance_fn_t svf_halfpix_v;
+ vpx_variance_fn_t svf_halfpix_hv;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+ vp8_copy32xn_fn_t copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+#endif // CONFIG_VP8
+
+#if CONFIG_VP9
+typedef struct vp9_variance_vtable {
+ vpx_sad_fn_t sdf;
+ vpx_sad_avg_fn_t sdaf;
+ vpx_variance_fn_t vf;
+ vpx_subpixvariance_fn_t svf;
+ vpx_subp_avg_variance_fn_t svaf;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
+} vp9_variance_fn_ptr_t;
+#endif // CONFIG_VP9
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VARIANCE_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -10,6 +10,8 @@
DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@@ -19,7 +21,6 @@
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
@@ -45,21 +46,36 @@
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
+DSP_SRCS-yes += variance.h
+DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
+endif # CONFIG_USE_X86INC
+
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -412,6 +412,9 @@
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+#
+# Variance
+#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
@@ -451,7 +454,9 @@
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2 msa/;
-
+#
+# Specialty Variance
+#
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
@@ -478,6 +483,99 @@
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+#
+# Subpixel Variance
+#
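+# Each add_proto below declares the C prototype for the run-time dispatch
+# table; the specialize line that follows lists the optimized per-ISA
+# implementations that may replace the _c version.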
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+
+#
+# Specialty Subpixel
+#
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
+
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
@@ -615,6 +713,226 @@
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+ #
+ # Subpixel Variance
+ #
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -1,0 +1,1041 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
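+; The bilinear filter table below holds eight 32-byte entries, one per sub-pel
+; offset step; each entry is a pair of 8-lane word vectors (filter_a, filter_b)
+; whose taps sum to 16, running from (16, 0) down to (2, 14). The code selects
+; an entry with offset << filter_idx_shift (32 bytes per entry, hence a shift
+; of 5).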
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst, ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of pixel differences (SE) and stores the sum
+; of squared differences (SSE) in the given pointer.
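+;
+; A rough sketch of how these two outputs are consumed (see the FN macro added
+; to vpx_dsp/x86/highbd_variance_sse2.c in this change): the C wrappers call
+; the kernel once per 8- or 16-pixel wide column of the block, add up the
+; partial SE/SSE values, and return
+;   variance = SSE - ((int64_t)SE * SE >> (log2(W) + log2(H)))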
+
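+; SUM_SSE below: for the two (src, dst) register pairs, accumulate the signed
+; pixel differences into %5 (sum) and their squares into %6 (sse). Unlike the
+; 8-bit version, differences of up-to-12-bit samples summed over a whole block
+; do not fit in a signed word, so the per-call word sums are sign-extended and
+; accumulated as dwords right here.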
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; For the high-bitdepth path, SUM_SSE has already sign-extended the partial
+ ; sums to dwords, so m6 (sum) and m7 (sse) each hold four dword lanes here;
+ ; just fold the lanes and store the results.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*2]
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro INC_SRC_BY_SRC_2STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*4]
+%else
+ lea srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
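+
+; A note on the two helpers above: on 32-bit PIC builds there is no spare
+; register for src_stride, so it is read via its in-memory argument form
+; (src_stridemp) each time. Strides are in pixels, hence the *2 (one row of
+; 16-bit samples) and *4 (two rows) scaling.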
+
+%macro SUBPEL_VARIANCE 1-2 0 ; %1 = block width, %2 = 1 for the avg (second_pred) variant
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar h, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so it might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the bilinear filter - this is the same as in the 8-bit-depth version
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so use the src_stride register.
+; Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_2STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -8,9 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
@@ -243,3 +241,341 @@
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
+
+#if CONFIG_USE_X86INC
+#define DECL(w, opt) \
+ int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse);
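+// These declare the SSE2 column kernels (8 and 16 pixels wide) defined in
+// vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm; each one handles a single
+// wf-wide column and reports that column's sum of differences and SSE.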
+// TODO(johannkoenig): enable the ssse3 functions or delete them.
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst8, \
+ int dst_stride, \
+ uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, h, \
+ &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ }\
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
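+
+// A sketch of what one expansion of FN provides: FN(64, 32, 16, 6, 5, sse2,
+// (int64_t)) defines vpx_highbd_{8,10,12}_sub_pixel_variance64x32_sse2().
+// Each wrapper covers the block with calls to the 16-wide kernel at column
+// offsets 0, 16, 32 and 48, sums the partial results and returns
+//   sse - ((int64_t)se * se >> (wlog2 + hlog2)),
+// with the 10- and 12-bit versions first scaling se/sse down
+// (ROUND_POWER_OF_TWO) so the totals stay within 32-bit range.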
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) \
+int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint16_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, \
+ unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
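+// The avg wrappers below use the same column splitting; the second predictor
+// is packed at the block width, so the kernels are passed w as its stride and
+// the wrappers advance it by 16 per column (and by start_row * w in the
+// 12-bit strip loop).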
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + (start_row * dst_stride), dst_stride, \
+ sec + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
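+// FN(w, h, wf, wlog2, hlog2, opt, cast): wf is the width handled by each
+// helper call; wlog2/hlog2 give the shift used for sum^2 / (w * h).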
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+#endif // CONFIG_USE_X86INC
--- /dev/null
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -1,0 +1,1398 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
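+; Bilinear coefficient pairs; each pair sums to 16, so filtered values are
+; rounded with pw_8 and shifted right by 4.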
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
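+; SUM_SSE subtracts dst from src for two halves of a row, accumulates the
+; signed differences into the sum register (words) and the squared
+; differences (via pmaddwd) into the sse register (dwords).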
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; mmsize == 8
+ pshufw m4, m6, 0xe
+ pshufw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshufw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
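+; On 32-bit PIC builds the src_stride register is reused for the filter
+; pointers (see .x_nonhalf_y_nonhalf), so the stride is re-read from memory.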
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+    ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+    ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
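+  ; blocks narrower than 16 are processed two rows per loop iteration below,
+  ; so halve the row count (and double the second-pred stride in the avg case)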
+%if %1 < 16
+ sar h, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+%if %1 == 4
+ movh m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%else
+ punpckldq m2, [srcq+src_strideq*2]
+%endif
+%endif
+ movh m1, [dstq]
+%if mmsize == 16
+ movlhps m0, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+%endif
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq*2]
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
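+  ; (The single-multiply form follows from the identity
+  ;  (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1).)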
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+ movh m1, [dstq]
+ pavgb m0, m4
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movh m2, [srcq]
+ movh m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+%if %1 == 4
+ movh m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movh m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%else
+ punpckldq m2, [srcq+src_strideq]
+ punpckldq m3, [srcq+src_strideq+1]
+%endif
+%endif
+ pavgb m2, m3
+%if mmsize == 16
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+ pshufw m4, m2, 0xe
+%endif
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq]
+ movh m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movh m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin
+ ; filtered second line as words as cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+ paddw m4, m3
+ movh m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; used; src_stride then has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m3, [dstq+dst_strideq]
+ movh m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movh m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movh m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 versions. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
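+; The 4-wide variants run on 64-bit mm registers (INIT_MMX with the SSE
+; extensions, e.g. pavgb on mm registers); the 8- and 16-wide variants use
+; full xmm registers. A second macro argument of 1 selects the avg
+; (second-pred) variant.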
+INIT_MMX sse
+SUBPEL_VARIANCE 4
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -91,3 +91,93 @@
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height,
+ unsigned int *sse);
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sseptr);
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
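+  // Split the 64-wide block into two 32-wide halves, run the 32xh kernel on
+  // each half, and combine the partial sums and SSEs.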
+ unsigned int sse1;
+ const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 64, &sse1);
+ unsigned int sse2;
+ const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+ x_offset, y_offset,
+ dst + 32, dst_stride,
+ 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
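+  // variance = SSE - SE^2 / N, with N = 64 * 64 = 2^12, hence the shift.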
+ return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse,
+ const uint8_t *sec) {
+ unsigned int sse1;
+ const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 64, 64, &sse1);
+ unsigned int sse2;
+ const int se2 =
+ vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+ y_offset, dst + 32, dst_stride,
+ sec + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
+
+ *sse = sse1 + sse2;
+
+ return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse,
+ const uint8_t *sec) {
+ // Process 32 elements in parallel.
+ const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 32, 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
+}
--- a/vpx_dsp/x86/variance_impl_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -11,7 +11,28 @@
#include <immintrin.h> // AVX2
#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
+
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
@@ -212,4 +233,495 @@
*((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
+}
+
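+// Each bilinear coefficient pair below sums to 16, so a filtered pixel is
+// (f0 * a + f1 * b + 8) >> 4.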
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *) \
+ (src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *) \
+ (src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// Final reduction: sign-extend the 16-bit sums to 32 bits, then horizontally
+// add the per-lane partial sums and SSEs into scalar results.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ int height,
+ unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+      // save current source average
+      src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+      // average the previous row average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previously packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previously packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
+}
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sse) {
+ __m256i sec_reg;
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec+= sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous row average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ sec+= sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height ; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src+= src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height ; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ src+= src_stride;
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previously packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ sec+= sec_stride;
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256((__m256i const *)
+ (bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source offset by one byte
+ src_reg = _mm256_loadu_si256((__m256i const *) (src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits in each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height ; i++) {
+ src+= src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // interleave the previously packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ sec+= sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst+= dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ return sum;
}
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
+%define mmx_filter_shift 7
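+; The bilinear coefficient pairs used by these routines sum to 128, so each
+; filtered value is rounded with mmx_bi_rd (64) and shifted right by 7.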
+
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
@@ -52,7 +54,6 @@
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
-
; begin epilog
add rsp, 8
pop rdi
@@ -62,7 +63,6 @@
pop rbp
ret
-
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
@@ -83,7 +83,6 @@
sub rsp, 16
; end prolog
-
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
@@ -117,7 +116,6 @@
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
-
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
@@ -298,7 +296,6 @@
mov dword ptr [rdi], edx
xor rax, rax ; return 0
-
; begin epilog
add rsp, 16
pop rbx
@@ -308,8 +305,6 @@
pop rbp
ret
-
-
;void
;vpx_get4x4var_mmx
;(
@@ -331,7 +326,6 @@
sub rsp, 16
; end prolog
-
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
@@ -354,7 +348,6 @@
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
-
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
@@ -393,7 +386,6 @@
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
-
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
@@ -413,7 +405,6 @@
mov dword ptr [rdi], edx
xor rax, rax ; return 0
-
; begin epilog
add rsp, 16
pop rbx
@@ -422,3 +413,332 @@
UNSHADOW_ARGS
pop rbp
ret
+
+;void vpx_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -10,12 +10,45 @@
#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
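+
+// Each row of bilinear_filters_mmx holds the two taps for one eighth-pel
+// offset, replicated four times so a row can be loaded directly into an MMX
+// register. The taps in every row sum to 128, which the rounding constant in
+// the assembly (mmx_bi_rd = 64) and the post-filter shift normalize away.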
+
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const int16_t *HFilter,
+ const int16_t *VFilter,
+ int *sum,
+ unsigned int *sumsquared);
+
+extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const int16_t *HFilter,
+ const int16_t *VFilter,
+ int *sum,
+ unsigned int *sumsquared);
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -25,8 +58,8 @@
return (var - (((unsigned int)avg * avg) >> 4));
}
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -37,8 +70,8 @@
return (var - (((unsigned int)avg * avg) >> 6));
}
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
@@ -55,8 +88,8 @@
return var;
}
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
@@ -74,8 +107,8 @@
return (var - (((unsigned int)avg * avg) >> 8));
}
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -89,8 +122,8 @@
return (var - (((unsigned int)avg * avg) >> 7));
}
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -104,4 +137,113 @@
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ unsigned int xxsum;
+ vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+}
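+
+// The return value is the usual variance identity, variance = SSE - sum^2 / N,
+// with the division done as a shift: >> 4 because a 4x4 block has 16 pixels.
+// The wrappers below use >> 6 (8x8, 64 pixels), >> 8 (16x16, 256 pixels) and
+// >> 7 (16x8 and 8x16, 128 pixels) accordingly.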
+
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ uint32_t xxsum;
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((uint32_t)xsum * xsum) >> 6));
+}
+
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0);
+
+ vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum0, &xxsum0);
+
+ vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum1, &xxsum1);
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int xsum;
+ unsigned int xxsum;
+ vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+ bilinear_filters_mmx[xoffset],
+ bilinear_filters_mmx[yoffset],
+ &xsum, &xxsum);
+ *sse = xxsum;
+ return (xxsum - (((uint32_t)xsum * xsum) >> 7));
+}
+
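+// The half-pixel helpers below pass offset 4 because row 4 of
+// bilinear_filters_mmx holds taps { 64, 64 }, an equal-weight average of the
+// two neighboring pixels, so the generic eighth-pel path covers the half-pel
+// positions as well.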
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
}
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -307,3 +307,171 @@
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
+
+#if CONFIG_USE_X86INC
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm.
+#define DECL(w, opt) \
+ int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+ DECL(4, opt2); \
+ DECL(8, opt1); \
+ DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
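+
+// For reference, DECLS(sse2, sse) above expands DECL(16, sse2) into the strip
+// kernel prototype
+//   int vpx_sub_pixel_variance16xh_sse2(const uint8_t *src,
+//                                       ptrdiff_t src_stride,
+//                                       int x_offset, int y_offset,
+//                                       const uint8_t *dst,
+//                                       ptrdiff_t dst_stride,
+//                                       int height, unsigned int *sse,
+//                                       void *unused0, void *unused);
+// one such kernel exists per width (4, 8, 16) and instruction set.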
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse, NULL, NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
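+
+// FN builds a full w x h block from the wf-wide strip kernels declared above:
+// blocks wider than wf add strips at column offsets +16, +32 and +48, the
+// (se, sse) pairs are summed, and the last line applies
+// variance = sse - se*se / (w*h) via the (wlog2 + hlog2) shift. For example,
+// FN(64, 32, 16, 6, 5, sse2, (int64_t)) emits vpx_sub_pixel_variance64x32_sse2,
+// which calls vpx_sub_pixel_variance16xh_sse2 on four 16-wide columns of 32
+// rows. The int64_t cast is used for block sizes of 512 or more pixels, where
+// se*se can overflow 32 bits.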
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
+FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
+FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
+FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
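+
+// Same column-stitching scheme as the plain sub-pixel variance macro above,
+// except the second predictor is threaded through: sec is advanced by the same
+// column offsets and passed with stride w, and the asm kernel averages it into
+// the filtered source block before taking the differences against dst.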
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
+FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
+FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
+FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
+FN(4, 4, 4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+#endif // CONFIG_USE_X86INC