ref: 873a158f1449c0feb60d928744c478dbf899771b
parent: 8abd0c2a12ffde6ddceba38c981b660ee28273f2
parent: 7186a2dd863028faa54d2e5fc7995693488af63c
author: Zoe Liu <[email protected]>
date: Fri Jul 31 14:20:14 EDT 2015
Merge "Code refactor on InterpKernel"
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
@@ -20,6 +21,8 @@
#include "test/util.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@@ -945,7 +948,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
filter_x_stride, filter_y, filter_y_stride,
w, h, 8);
}
@@ -957,7 +960,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -969,7 +972,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -981,7 +984,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -993,7 +996,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1005,7 +1008,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1017,7 +1020,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1029,7 +1032,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1041,7 +1044,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1053,7 +1056,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1065,7 +1068,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1077,7 +1080,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1089,7 +1092,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1101,7 +1104,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1113,7 +1116,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1125,7 +1128,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1137,7 +1140,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1149,7 +1152,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1162,7 +1165,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1174,7 +1177,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1186,7 +1189,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1198,7 +1201,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1210,7 +1213,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1222,7 +1225,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1234,7 +1237,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1246,7 +1249,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
@@ -1258,7 +1261,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1270,7 +1273,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1282,7 +1285,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1294,7 +1297,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1306,7 +1309,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1318,7 +1321,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1330,7 +1333,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1342,7 +1345,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
@@ -1354,7 +1357,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1366,7 +1369,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1378,7 +1381,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1390,7 +1393,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1402,7 +1405,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1414,7 +1417,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1426,7 +1429,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1438,7 +1441,7 @@
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
@@ -1504,10 +1507,10 @@
#else
const ConvolveFunctions convolve8_c(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
- vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
- vp9_convolve8_c, vp9_convolve8_avg_c, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
+ vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
+ vpx_convolve8_c, vpx_convolve8_avg_c, 0);
INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_c),
@@ -1585,13 +1588,13 @@
#else
const ConvolveFunctions convolve8_sse2(
#if CONFIG_USE_X86INC
- vp9_convolve_copy_sse2, vp9_convolve_avg_sse2,
+ vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
#else
- vp9_convolve_copy_c, vp9_convolve_avg_c,
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
#endif // CONFIG_USE_X86INC
- vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
- vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
- vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0);
+ vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
+ vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
+ vpx_convolve8_sse2, vpx_convolve8_avg_sse2, 0);
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_sse2),
@@ -1612,10 +1615,10 @@
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_ssse3),
@@ -1635,10 +1638,10 @@
#if HAVE_AVX2 && HAVE_SSSE3
const ConvolveFunctions convolve8_avx2(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_avx2),
@@ -1659,16 +1662,16 @@
#if HAVE_NEON
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#else // HAVE_NEON
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#endif // HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
@@ -1689,10 +1692,10 @@
#if HAVE_DSPR2
const ConvolveFunctions convolve8_dspr2(
- vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2,
- vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
- vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
- vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0);
+ vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
+ vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
+ vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2,
+ vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, 0);
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_dspr2),
@@ -1712,10 +1715,10 @@
#if HAVE_MSA
const ConvolveFunctions convolve8_msa(
- vp9_convolve_copy_msa, vp9_convolve_avg_msa,
- vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa,
- vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa,
- vp9_convolve8_msa, vp9_convolve8_avg_msa, 0);
+ vpx_convolve_copy_msa, vpx_convolve_avg_msa,
+ vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
+ vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa,
+ vpx_convolve8_msa, vpx_convolve8_avg_msa, 0);
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_msa),
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
+++ /dev/null
@@ -1,390 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vp9_convolve8_avg_horiz_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- if (x_step_q4 != 16) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- src += 7;
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w;
- width > 0;
- width -= 4, src += 4, dst += 4) { // loop_horiz
- s = src;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(src + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(src + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(src + 64 + src_stride * 2);
-
- d = dst;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(src + 64 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- d = dst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- src += src_stride * 4 - w - 7;
- dst += dst_stride * 4 - w;
- }
- return;
-}
-
-void vp9_convolve8_avg_vert_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- uint8x16_t q1u8, q3u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- if (y_step_q4 != 16) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
- d -= dst_stride * 3;
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
--- a/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,302 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ; These functions are only valid when:
- ; x_step_q4 == 16
- ; w%4 == 0
- ; h%4 == 0
- ; taps == 8
- ; VP9_FILTER_WEIGHT == 128
- ; VP9_FILTER_SHIFT == 7
-
- EXPORT |vp9_convolve8_avg_horiz_neon|
- EXPORT |vp9_convolve8_avg_vert_neon|
- IMPORT |vp9_convolve8_avg_horiz_c|
- IMPORT |vp9_convolve8_avg_vert_c|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Multiply and accumulate by q0
- MACRO
- MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
- vmull.s16 $dst, $src0, d0[0]
- vmlal.s16 $dst, $src1, d0[1]
- vmlal.s16 $dst, $src2, d0[2]
- vmlal.s16 $dst, $src3, d0[3]
- vmlal.s16 $dst, $src4, d1[0]
- vmlal.s16 $dst, $src5, d1[1]
- vmlal.s16 $dst, $src6, d1[2]
- vmlal.s16 $dst, $src7, d1[3]
- MEND
-
-; r0 const uint8_t *src
-; r1 int src_stride
-; r2 uint8_t *dst
-; r3 int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vp9_convolve8_avg_horiz_neon| PROC
- ldr r12, [sp, #4] ; x_step_q4
- cmp r12, #16
- bne vp9_convolve8_avg_horiz_c
-
- push {r4-r10, lr}
-
- sub r0, r0, #3 ; adjust for taps
-
- ldr r5, [sp, #32] ; filter_x
- ldr r6, [sp, #48] ; w
- ldr r7, [sp, #52] ; h
-
- vld1.s16 {q0}, [r5] ; filter_x
-
- sub r8, r1, r1, lsl #2 ; -src_stride * 3
- add r8, r8, #4 ; -src_stride * 3 + 4
-
- sub r4, r3, r3, lsl #2 ; -dst_stride * 3
- add r4, r4, #4 ; -dst_stride * 3 + 4
-
- rsb r9, r6, r1, lsl #2 ; reset src for outer loop
- sub r9, r9, #7
- rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
-
- mov r10, r6 ; w loop counter
-
-vp9_convolve8_avg_loop_horiz_v
- vld1.8 {d24}, [r0], r1
- vld1.8 {d25}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d27}, [r0], r8
-
- vtrn.16 q12, q13
- vtrn.8 d24, d25
- vtrn.8 d26, d27
-
- pld [r0, r1, lsl #2]
-
- vmovl.u8 q8, d24
- vmovl.u8 q9, d25
- vmovl.u8 q10, d26
- vmovl.u8 q11, d27
-
- ; save a few instructions in the inner loop
- vswp d17, d18
- vmov d23, d21
-
- add r0, r0, #3
-
-vp9_convolve8_avg_loop_horiz
- add r5, r0, #64
-
- vld1.32 {d28[]}, [r0], r1
- vld1.32 {d29[]}, [r0], r1
- vld1.32 {d31[]}, [r0], r1
- vld1.32 {d30[]}, [r0], r8
-
- pld [r5]
-
- vtrn.16 d28, d31
- vtrn.16 d29, d30
- vtrn.8 d28, d29
- vtrn.8 d31, d30
-
- pld [r5, r1]
-
- ; extract to s16
- vtrn.32 q14, q15
- vmovl.u8 q12, d28
- vmovl.u8 q13, d29
-
- pld [r5, r1, lsl #1]
-
- ; slightly out of order load to match the existing data
- vld1.u32 {d6[0]}, [r2], r3
- vld1.u32 {d7[0]}, [r2], r3
- vld1.u32 {d6[1]}, [r2], r3
- vld1.u32 {d7[1]}, [r2], r3
-
- sub r2, r2, r3, lsl #2 ; reset for store
-
- ; src[] * filter_x
- MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
- MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
- MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
- MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
-
- pld [r5, -r8]
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; transpose
- vtrn.16 d2, d3
- vtrn.32 d2, d3
- vtrn.8 d2, d3
-
- ; average the new value and the dst value
- vrhadd.u8 q1, q1, q3
-
- vst1.u32 {d2[0]}, [r2@32], r3
- vst1.u32 {d3[0]}, [r2@32], r3
- vst1.u32 {d2[1]}, [r2@32], r3
- vst1.u32 {d3[1]}, [r2@32], r4
-
- vmov q8, q9
- vmov d20, d23
- vmov q11, q12
- vmov q9, q13
-
- subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_horiz
-
- ; outer loop
- mov r6, r10 ; restore w counter
- add r0, r0, r9 ; src += src_stride * 4 - w
- add r2, r2, r12 ; dst += dst_stride * 4 - w
- subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_horiz_v
-
- pop {r4-r10, pc}
-
- ENDP
-
-|vp9_convolve8_avg_vert_neon| PROC
- ldr r12, [sp, #12]
- cmp r12, #16
- bne vp9_convolve8_avg_vert_c
-
- push {r4-r8, lr}
-
- ; adjust for taps
- sub r0, r0, r1
- sub r0, r0, r1, lsl #1
-
- ldr r4, [sp, #32] ; filter_y
- ldr r6, [sp, #40] ; w
- ldr lr, [sp, #44] ; h
-
- vld1.s16 {q0}, [r4] ; filter_y
-
- lsl r1, r1, #1
- lsl r3, r3, #1
-
-vp9_convolve8_avg_loop_vert_h
- mov r4, r0
- add r7, r0, r1, asr #1
- mov r5, r2
- add r8, r2, r3, asr #1
- mov r12, lr ; h loop counter
-
- vld1.u32 {d16[0]}, [r4], r1
- vld1.u32 {d16[1]}, [r7], r1
- vld1.u32 {d18[0]}, [r4], r1
- vld1.u32 {d18[1]}, [r7], r1
- vld1.u32 {d20[0]}, [r4], r1
- vld1.u32 {d20[1]}, [r7], r1
- vld1.u32 {d22[0]}, [r4], r1
-
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
-vp9_convolve8_avg_loop_vert
- ; always process a 4x4 block at a time
- vld1.u32 {d24[0]}, [r7], r1
- vld1.u32 {d26[0]}, [r4], r1
- vld1.u32 {d26[1]}, [r7], r1
- vld1.u32 {d24[1]}, [r4], r1
-
- ; extract to s16
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
-
- vld1.u32 {d6[0]}, [r5@32], r3
- vld1.u32 {d6[1]}, [r8@32], r3
- vld1.u32 {d7[0]}, [r5@32], r3
- vld1.u32 {d7[1]}, [r8@32], r3
-
- pld [r7]
- pld [r4]
-
- ; src[] * filter_y
- MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
-
- pld [r7, r1]
- pld [r4, r1]
-
- MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
-
- pld [r5]
- pld [r8]
-
- MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
-
- pld [r5, r3]
- pld [r8, r3]
-
- MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; average the new value and the dst value
- vrhadd.u8 q1, q1, q3
-
- sub r5, r5, r3, lsl #1 ; reset for store
- sub r8, r8, r3, lsl #1
-
- vst1.u32 {d2[0]}, [r5@32], r3
- vst1.u32 {d2[1]}, [r8@32], r3
- vst1.u32 {d3[0]}, [r5@32], r3
- vst1.u32 {d3[1]}, [r8@32], r3
-
- vmov q8, q10
- vmov d18, d22
- vmov d19, d24
- vmov q10, q13
- vmov d22, d25
-
- subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_vert
-
- ; outer loop
- add r0, r0, #4
- add r2, r2, #4
- subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_vert_h
-
- pop {r4-r8, pc}
-
- ENDP
- END
--- a/vp9/common/arm/neon/vp9_convolve8_neon.c
+++ /dev/null
@@ -1,357 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(
- int16x4_t dsrc0,
- int16x4_t dsrc1,
- int16x4_t dsrc2,
- int16x4_t dsrc3,
- int16x4_t dsrc4,
- int16x4_t dsrc5,
- int16x4_t dsrc6,
- int16x4_t dsrc7,
- int16x8_t q0s16) {
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
-
- d0s16 = vget_low_s16(q0s16);
- d1s16 = vget_high_s16(q0s16);
-
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst;
-}
-
-void vp9_convolve8_horiz_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // unused
- int y_step_q4, // unused
- int w,
- int h) {
- int width;
- uint8_t *s, *d, *psrc, *pdst;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- if (x_step_q4 != 16) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x);
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4,
- src += src_stride * 4,
- dst += dst_stride * 4) { // loop_horiz_v
- s = src;
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
- __builtin_prefetch(src + src_stride * 6);
-
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w, psrc = src + 7, pdst = dst;
- width > 0;
- width -= 4, psrc += 4, pdst += 4) { // loop_horiz
- s = psrc;
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(psrc + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(psrc + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(psrc + 64 + src_stride * 2);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(psrc + 60 + src_stride * 3);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
- d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
- d = pdst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- }
- return;
-}
-
-void vp9_convolve8_vert_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, // unused
- int x_step_q4, // unused
- const int16_t *filter_y,
- int y_step_q4,
- int w,
- int h) {
- int height;
- uint8_t *s, *d;
- uint32x2_t d2u32, d3u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
-
- if (y_step_q4 != 16) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- src -= src_stride * 3;
- q0s16 = vld1q_s16(filter_y);
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
- s += src_stride;
-
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
- d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
-
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
--- a/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,280 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ; These functions are only valid when:
- ; x_step_q4 == 16
- ; w%4 == 0
- ; h%4 == 0
- ; taps == 8
- ; VP9_FILTER_WEIGHT == 128
- ; VP9_FILTER_SHIFT == 7
-
- EXPORT |vp9_convolve8_horiz_neon|
- EXPORT |vp9_convolve8_vert_neon|
- IMPORT |vp9_convolve8_horiz_c|
- IMPORT |vp9_convolve8_vert_c|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Multiply and accumulate by q0
- MACRO
- MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
- vmull.s16 $dst, $src0, d0[0]
- vmlal.s16 $dst, $src1, d0[1]
- vmlal.s16 $dst, $src2, d0[2]
- vmlal.s16 $dst, $src3, d0[3]
- vmlal.s16 $dst, $src4, d1[0]
- vmlal.s16 $dst, $src5, d1[1]
- vmlal.s16 $dst, $src6, d1[2]
- vmlal.s16 $dst, $src7, d1[3]
- MEND
-
-; r0 const uint8_t *src
-; r1 int src_stride
-; r2 uint8_t *dst
-; r3 int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vp9_convolve8_horiz_neon| PROC
- ldr r12, [sp, #4] ; x_step_q4
- cmp r12, #16
- bne vp9_convolve8_horiz_c
-
- push {r4-r10, lr}
-
- sub r0, r0, #3 ; adjust for taps
-
- ldr r5, [sp, #32] ; filter_x
- ldr r6, [sp, #48] ; w
- ldr r7, [sp, #52] ; h
-
- vld1.s16 {q0}, [r5] ; filter_x
-
- sub r8, r1, r1, lsl #2 ; -src_stride * 3
- add r8, r8, #4 ; -src_stride * 3 + 4
-
- sub r4, r3, r3, lsl #2 ; -dst_stride * 3
- add r4, r4, #4 ; -dst_stride * 3 + 4
-
- rsb r9, r6, r1, lsl #2 ; reset src for outer loop
- sub r9, r9, #7
- rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
-
- mov r10, r6 ; w loop counter
-
-vp9_convolve8_loop_horiz_v
- vld1.8 {d24}, [r0], r1
- vld1.8 {d25}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d27}, [r0], r8
-
- vtrn.16 q12, q13
- vtrn.8 d24, d25
- vtrn.8 d26, d27
-
- pld [r0, r1, lsl #2]
-
- vmovl.u8 q8, d24
- vmovl.u8 q9, d25
- vmovl.u8 q10, d26
- vmovl.u8 q11, d27
-
- ; save a few instructions in the inner loop
- vswp d17, d18
- vmov d23, d21
-
- add r0, r0, #3
-
-vp9_convolve8_loop_horiz
- add r5, r0, #64
-
- vld1.32 {d28[]}, [r0], r1
- vld1.32 {d29[]}, [r0], r1
- vld1.32 {d31[]}, [r0], r1
- vld1.32 {d30[]}, [r0], r8
-
- pld [r5]
-
- vtrn.16 d28, d31
- vtrn.16 d29, d30
- vtrn.8 d28, d29
- vtrn.8 d31, d30
-
- pld [r5, r1]
-
- ; extract to s16
- vtrn.32 q14, q15
- vmovl.u8 q12, d28
- vmovl.u8 q13, d29
-
- pld [r5, r1, lsl #1]
-
- ; src[] * filter_x
- MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
- MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
- MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
- MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
-
- pld [r5, -r8]
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; transpose
- vtrn.16 d2, d3
- vtrn.32 d2, d3
- vtrn.8 d2, d3
-
- vst1.u32 {d2[0]}, [r2@32], r3
- vst1.u32 {d3[0]}, [r2@32], r3
- vst1.u32 {d2[1]}, [r2@32], r3
- vst1.u32 {d3[1]}, [r2@32], r4
-
- vmov q8, q9
- vmov d20, d23
- vmov q11, q12
- vmov q9, q13
-
- subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_horiz
-
- ; outer loop
- mov r6, r10 ; restore w counter
- add r0, r0, r9 ; src += src_stride * 4 - w
- add r2, r2, r12 ; dst += dst_stride * 4 - w
- subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_loop_horiz_v
-
- pop {r4-r10, pc}
-
- ENDP
-
-|vp9_convolve8_vert_neon| PROC
- ldr r12, [sp, #12]
- cmp r12, #16
- bne vp9_convolve8_vert_c
-
- push {r4-r8, lr}
-
- ; adjust for taps
- sub r0, r0, r1
- sub r0, r0, r1, lsl #1
-
- ldr r4, [sp, #32] ; filter_y
- ldr r6, [sp, #40] ; w
- ldr lr, [sp, #44] ; h
-
- vld1.s16 {q0}, [r4] ; filter_y
-
- lsl r1, r1, #1
- lsl r3, r3, #1
-
-vp9_convolve8_loop_vert_h
- mov r4, r0
- add r7, r0, r1, asr #1
- mov r5, r2
- add r8, r2, r3, asr #1
- mov r12, lr ; h loop counter
-
- vld1.u32 {d16[0]}, [r4], r1
- vld1.u32 {d16[1]}, [r7], r1
- vld1.u32 {d18[0]}, [r4], r1
- vld1.u32 {d18[1]}, [r7], r1
- vld1.u32 {d20[0]}, [r4], r1
- vld1.u32 {d20[1]}, [r7], r1
- vld1.u32 {d22[0]}, [r4], r1
-
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
-vp9_convolve8_loop_vert
- ; always process a 4x4 block at a time
- vld1.u32 {d24[0]}, [r7], r1
- vld1.u32 {d26[0]}, [r4], r1
- vld1.u32 {d26[1]}, [r7], r1
- vld1.u32 {d24[1]}, [r4], r1
-
- ; extract to s16
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
-
- pld [r5]
- pld [r8]
-
- ; src[] * filter_y
- MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
-
- pld [r5, r3]
- pld [r8, r3]
-
- MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
-
- pld [r7]
- pld [r4]
-
- MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
-
- pld [r7, r1]
- pld [r4, r1]
-
- MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- vst1.u32 {d2[0]}, [r5@32], r3
- vst1.u32 {d2[1]}, [r8@32], r3
- vst1.u32 {d3[0]}, [r5@32], r3
- vst1.u32 {d3[1]}, [r8@32], r3
-
- vmov q8, q10
- vmov d18, d22
- vmov d19, d24
- vmov q10, q13
- vmov d22, d25
-
- subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_loop_vert
-
- ; outer loop
- add r0, r0, #4
- add r2, r2, #4
- subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_vert_h
-
- pop {r4-r8, pc}
-
- ENDP
- END
--- a/vp9/common/arm/neon/vp9_convolve_avg_neon.c
+++ /dev/null
@@ -1,145 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-void vp9_convolve_avg_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8_t *d;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint32x2_t d0u32, d2u32;
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- d = dst;
- if (w > 32) { // avg64
- for (; h > 0; h -= 1) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- q10u8 = vld1q_u8(d + 32);
- q11u8 = vld1q_u8(d + 48);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // avg32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
- q8u8 = vld1q_u8(d);
- q9u8 = vld1q_u8(d + 16);
- d += dst_stride;
- q10u8 = vld1q_u8(d);
- q11u8 = vld1q_u8(d + 16);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q8u8);
- q1u8 = vrhaddq_u8(q1u8, q9u8);
- q2u8 = vrhaddq_u8(q2u8, q10u8);
- q3u8 = vrhaddq_u8(q3u8, q11u8);
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // avg16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
- q2u8 = vld1q_u8(d);
- d += dst_stride;
- q3u8 = vld1q_u8(d);
- d += dst_stride;
-
- q0u8 = vrhaddq_u8(q0u8, q2u8);
- q1u8 = vrhaddq_u8(q1u8, q3u8);
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // avg8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d1u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(d);
- d += dst_stride;
- d3u8 = vld1_u8(d);
- d += dst_stride;
-
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- q0u8 = vrhaddq_u8(q0u8, q1u8);
-
- vst1_u8(dst, vget_low_u8(q0u8));
- dst += dst_stride;
- vst1_u8(dst, vget_high_u8(q0u8));
- dst += dst_stride;
- }
- } else { // avg4
- for (; h > 0; h -= 2) {
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
- src += src_stride;
- d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
- src += src_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
- d += dst_stride;
- d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
- d += dst_stride;
-
- d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
- vreinterpret_u8_u32(d2u32));
-
- d0u32 = vreinterpret_u32_u8(d0u8);
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, d0u32, 1);
- dst += dst_stride;
- }
- }
- return;
-}
--- a/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
+++ /dev/null
@@ -1,116 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_convolve_avg_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_convolve_avg_neon| PROC
- push {r4-r6, lr}
- ldrd r4, r5, [sp, #32]
- mov r6, r2
-
- cmp r4, #32
- bgt avg64
- beq avg32
- cmp r4, #8
- bgt avg16
- beq avg8
- b avg4
-
-avg64
- sub lr, r1, #32
- sub r4, r3, #32
-avg64_h
- pld [r0, r1, lsl #1]
- vld1.8 {q0-q1}, [r0]!
- vld1.8 {q2-q3}, [r0], lr
- pld [r2, r3]
- vld1.8 {q8-q9}, [r6@128]!
- vld1.8 {q10-q11}, [r6@128], r4
- vrhadd.u8 q0, q0, q8
- vrhadd.u8 q1, q1, q9
- vrhadd.u8 q2, q2, q10
- vrhadd.u8 q3, q3, q11
- vst1.8 {q0-q1}, [r2@128]!
- vst1.8 {q2-q3}, [r2@128], r4
- subs r5, r5, #1
- bgt avg64_h
- pop {r4-r6, pc}
-
-avg32
- vld1.8 {q0-q1}, [r0], r1
- vld1.8 {q2-q3}, [r0], r1
- vld1.8 {q8-q9}, [r6@128], r3
- vld1.8 {q10-q11}, [r6@128], r3
- pld [r0]
- vrhadd.u8 q0, q0, q8
- pld [r0, r1]
- vrhadd.u8 q1, q1, q9
- pld [r6]
- vrhadd.u8 q2, q2, q10
- pld [r6, r3]
- vrhadd.u8 q3, q3, q11
- vst1.8 {q0-q1}, [r2@128], r3
- vst1.8 {q2-q3}, [r2@128], r3
- subs r5, r5, #2
- bgt avg32
- pop {r4-r6, pc}
-
-avg16
- vld1.8 {q0}, [r0], r1
- vld1.8 {q1}, [r0], r1
- vld1.8 {q2}, [r6@128], r3
- vld1.8 {q3}, [r6@128], r3
- pld [r0]
- pld [r0, r1]
- vrhadd.u8 q0, q0, q2
- pld [r6]
- pld [r6, r3]
- vrhadd.u8 q1, q1, q3
- vst1.8 {q0}, [r2@128], r3
- vst1.8 {q1}, [r2@128], r3
- subs r5, r5, #2
- bgt avg16
- pop {r4-r6, pc}
-
-avg8
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r0], r1
- vld1.8 {d2}, [r6@64], r3
- vld1.8 {d3}, [r6@64], r3
- pld [r0]
- pld [r0, r1]
- vrhadd.u8 q0, q0, q1
- pld [r6]
- pld [r6, r3]
- vst1.8 {d0}, [r2@64], r3
- vst1.8 {d1}, [r2@64], r3
- subs r5, r5, #2
- bgt avg8
- pop {r4-r6, pc}
-
-avg4
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d2[0]}, [r6@32], r3
- vld1.32 {d2[1]}, [r6@32], r3
- vrhadd.u8 d0, d0, d2
- vst1.32 {d0[0]}, [r2@32], r3
- vst1.32 {d0[1]}, [r2@32], r3
- subs r5, r5, #2
- bgt avg4
- pop {r4-r6, pc}
- ENDP
-
- END
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ /dev/null
@@ -1,82 +1,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
- * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
- */
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-
- // Account for the vertical phase needing 3 lines prior and 4 lines post
- int intermediate_height = h + 7;
-
- if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
- return;
- }
-
- /* Filter starting 3 lines back. The neon implementation will ignore the
- * given height and filter a multiple of 4 lines. Since this goes in to
- * the temp buffer which has lots of extra room and is subsequently discarded
- * this is safe if somewhat less than ideal.
- */
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
-
- /* Step into the temp buffer 3 lines to get the actual frame data */
- vp9_convolve8_vert_neon(temp + 64 * 3, 64,
- dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
-}
-
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
- int intermediate_height = h + 7;
-
- if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_avg_c(src, src_stride,
- dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4,
- w, h);
- return;
- }
-
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
- temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, intermediate_height);
- vp9_convolve8_avg_vert_neon(temp + 64 * 3,
- 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
-}
--- a/vp9/common/arm/neon/vp9_copy_neon.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-void vp9_convolve_copy_neon(
- const uint8_t *src, // r0
- ptrdiff_t src_stride, // r1
- uint8_t *dst, // r2
- ptrdiff_t dst_stride, // r3
- const int16_t *filter_x,
- int filter_x_stride,
- const int16_t *filter_y,
- int filter_y_stride,
- int w,
- int h) {
- uint8x8_t d0u8, d2u8;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- if (w > 32) { // copy64
- for (; h > 0; h--) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- q2u8 = vld1q_u8(src + 32);
- q3u8 = vld1q_u8(src + 48);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- vst1q_u8(dst + 32, q2u8);
- vst1q_u8(dst + 48, q3u8);
- dst += dst_stride;
- }
- } else if (w == 32) { // copy32
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- q1u8 = vld1q_u8(src + 16);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- q3u8 = vld1q_u8(src + 16);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- dst += dst_stride;
- vst1q_u8(dst, q2u8);
- vst1q_u8(dst + 16, q3u8);
- dst += dst_stride;
- }
- } else if (w > 8) { // copy16
- for (; h > 0; h -= 2) {
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(src);
- src += src_stride;
-
- vst1q_u8(dst, q0u8);
- dst += dst_stride;
- vst1q_u8(dst, q1u8);
- dst += dst_stride;
- }
- } else if (w == 8) { // copy8
- for (; h > 0; h -= 2) {
- d0u8 = vld1_u8(src);
- src += src_stride;
- d2u8 = vld1_u8(src);
- src += src_stride;
-
- vst1_u8(dst, d0u8);
- dst += dst_stride;
- vst1_u8(dst, d2u8);
- dst += dst_stride;
- }
- } else { // copy4
- for (; h > 0; h--) {
- *(uint32_t *)dst = *(const uint32_t *)src;
- src += src_stride;
- dst += dst_stride;
- }
- }
- return;
-}
--- a/vp9/common/arm/neon/vp9_copy_neon_asm.asm
+++ /dev/null
@@ -1,84 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_convolve_copy_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_convolve_copy_neon| PROC
- push {r4-r5, lr}
- ldrd r4, r5, [sp, #28]
-
- cmp r4, #32
- bgt copy64
- beq copy32
- cmp r4, #8
- bgt copy16
- beq copy8
- b copy4
-
-copy64
- sub lr, r1, #32
- sub r3, r3, #32
-copy64_h
- pld [r0, r1, lsl #1]
- vld1.8 {q0-q1}, [r0]!
- vld1.8 {q2-q3}, [r0], lr
- vst1.8 {q0-q1}, [r2@128]!
- vst1.8 {q2-q3}, [r2@128], r3
- subs r5, r5, #1
- bgt copy64_h
- pop {r4-r5, pc}
-
-copy32
- pld [r0, r1, lsl #1]
- vld1.8 {q0-q1}, [r0], r1
- pld [r0, r1, lsl #1]
- vld1.8 {q2-q3}, [r0], r1
- vst1.8 {q0-q1}, [r2@128], r3
- vst1.8 {q2-q3}, [r2@128], r3
- subs r5, r5, #2
- bgt copy32
- pop {r4-r5, pc}
-
-copy16
- pld [r0, r1, lsl #1]
- vld1.8 {q0}, [r0], r1
- pld [r0, r1, lsl #1]
- vld1.8 {q1}, [r0], r1
- vst1.8 {q0}, [r2@128], r3
- vst1.8 {q1}, [r2@128], r3
- subs r5, r5, #2
- bgt copy16
- pop {r4-r5, pc}
-
-copy8
- pld [r0, r1, lsl #1]
- vld1.8 {d0}, [r0], r1
- pld [r0, r1, lsl #1]
- vld1.8 {d2}, [r0], r1
- vst1.8 {d0}, [r2@64], r3
- vst1.8 {d2}, [r2@64], r3
- subs r5, r5, #2
- bgt copy8
- pop {r4-r5, pc}
-
-copy4
- ldr r12, [r0], r1
- str r12, [r2], r3
- subs r5, r5, #1
- bgt copy4
- pop {r4-r5, pc}
- ENDP
-
- END
--- a/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c
+++ /dev/null
@@ -1,782 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst0, dst1, dst2, dst3, res2, res3;
- v16u8 mask0, mask1, mask2, mask3;
- v8i16 filt, res0, res1;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, res0, res1);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SRARI_H2_SH(res0, res1, FILTER_BITS);
- SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- XORI_B2_128_UB(res2, res3);
- AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8i16 filt, vec0, vec1, vec2, vec3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, vec0, vec1);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, vec2, vec3);
- SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
- SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
- PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
- res3);
- ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
- XORI_B2_128_UB(res0, res2);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
- AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
- ST4x8_UB(res0, res2, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height >> 1; loop_cnt--;) {
- LD_SB2(src, src_stride, src0, src2);
- LD_SB2(src + 8, src_stride, src1, src3);
- src += (2 * src_stride);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
- vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- LD_UB2(dst, dst_stride, dst0, dst1);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
- dst += dst_stride;
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
- vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- LD_UB2(dst, 16, dst1, dst2);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
- v8i16 filt, out0, out1, out2, out3;
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- for (cnt = 0; cnt < 2; ++cnt) {
- src0 = LD_SB(&src[cnt << 5]);
- src2 = LD_SB(&src[16 + (cnt << 5)]);
- src3 = LD_SB(&src[24 + (cnt << 5)]);
- src1 = __msa_sldi_b(src2, src0, 8);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
- vec12);
- VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
- vec13);
- VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
- vec14);
- VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
- vec15);
- DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
- vec1, vec2, vec3);
- DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
- vec9, vec10, vec11);
- DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
- vec1, vec2, vec3);
- DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
- vec9, vec10, vec11);
- ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
- PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
- PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
- v8u16 vec2, vec3, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
- SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- MIN_UH2_UH(vec2, vec3, const255);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8u16 vec4, vec5, vec6, vec7, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
- vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
- PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
- res3);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
- dst6);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v8u16 vec0, vec1, vec2, vec3, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v8u16 vec0, vec1, vec2, vec3, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- if (16 == height) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
- dst, dst_stride);
- }
-}
-
-static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter, height);
- }
-}
-
-static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(res0, res1, res2, res3, const255);
- MIN_UH4_UH(res4, res5, res6, res7, const255);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
- dst += dst_stride;
-
- for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(res0, res1, res2, res3, const255);
- MIN_UH4_UH(res4, res5, res6, res7, const255);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- dst += dst_stride;
- PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- src4 = LD_SB(src);
- src6 = LD_SB(src + 16);
- src7 = LD_SB(src + 24);
- src5 = __msa_sldi_b(src6, src4, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
- res2, res3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
- res6, res7);
- SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
- SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
- MIN_UH4_UH(res0, res1, res2, res3, const255);
- MIN_UH4_UH(res4, res5, res6, res7, const255);
- LD_UB2(dst, 16, dst0, dst1);
- PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
- PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
- dst += dst_stride;
- LD_UB2(dst, 16, dst2, dst3);
- PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
- PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, dst0, dst1, dst2, dst3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- const255 = (v8u16)__msa_ldi_h(255);
-
- for (loop_cnt = height; loop_cnt--;) {
- LD_SB4(src, 16, src0, src2, src4, src6);
- src7 = LD_SB(src + 56);
- SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
- PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
- PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
- PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
- dst += dst_stride;
- }
-}
-
-void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- int8_t cnt, filt_hor[8];
-
- if (16 != x_step_q4) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0) {
- switch (w) {
- case 4:
- common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 8:
- common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 16:
- common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 32:
- common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 64:
- common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 8:
- common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 16:
- common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 32:
- common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 64:
- common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve8_avg_msa.c
+++ /dev/null
@@ -1,679 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
- vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
- vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
- SRARI_H2_SH(res0, res1, FILTER_BITS);
- SAT_SH2_SH(res0, res1, 7);
- PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
- XORI_B2_128_UB(tmp0, tmp1);
- AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
- ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out5 = hz_out9;
- vec0 = vec2;
- vec1 = vec3;
- vec2 = vec4;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
- v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
- ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
- tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
- tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out6 = hz_out10;
- out0 = out2;
- out1 = out3;
- out2 = out8;
- out4 = out6;
- out5 = out7;
- out6 = out9;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 8; multiple8_cnt--;) {
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 dst0, dst1, dst2, dst3, res0, res1;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
- SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
- hz_out3, hz_out5, 8);
- hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
- dst4, dst6);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
- res2, res3);
- AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
- res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- } else if (8 == height) {
- common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_SB(src);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert);
- } else {
- common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert,
- height);
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
- dst += dst_stride;
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- int8_t cnt, filt_hor[8], filt_ver[8];
-
- if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_x)[1] == 0x800000 &&
- ((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 8:
- common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 16:
- common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 32:
- common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- case 64:
- common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], h);
- break;
- default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- } else {
- switch (w) {
- case 4:
- common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
- break;
- case 8:
- common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
- break;
- case 16:
- common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
- break;
- case 32:
- common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
- break;
- case 64:
- common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, h);
- break;
- default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c
+++ /dev/null
@@ -1,753 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3, out;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
- v16i8 src10998, filt0, filt1, filt2, filt3;
- v8i16 filt, out10, out32;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
- src4332, src6554);
- XORI_B3_128_SB(src2110, src4332, src6554);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
- XORI_B2_128_SB(src8776, src10998);
- out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
- filt1, filt2, filt3);
- out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
- filt1, filt2, filt3);
- SRARI_H2_SH(out10, out32, FILTER_BITS);
- SAT_SH2_SH(out10, out32, 7);
- out = PCKEV_XORI128_UB(out10, out32);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
- out = __msa_aver_u_b(out, dst0);
-
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src2110 = src6554;
- src4332 = src8776;
- src6554 = src10998;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 dst0, dst1, dst2, dst3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
- v8i16 filt, out0, out1, out2, out3;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src7, src8, src9, src10);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height,
- int32_t width) {
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16i8 filt0, filt1, filt2, filt3;
- v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src_tmp += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
- src_tmp += (4 * src_stride);
-
- LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
- XORI_B4_128_SB(src7, src8, src9, src10);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
- out3_r, tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
- dst2, dst3);
- ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
- dst_tmp += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 16);
-}
-
-static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 32);
-}
-
-static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
- filter, height, 64);
-}
-
-static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
- v16i8 src10_r, src32_r, src21_r, src43_r;
- v8i16 filt;
- v8u16 tmp0, tmp1;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- src4 = LD_SB(src);
- src += src_stride;
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
-
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, dst0);
-
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
- v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
- v16u8 src2110, src4332, src6554, src8776, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
- ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
- dst2, dst3);
- ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
- src76_r, src87_r);
- ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
- src87_r, src76_r, src2110, src4332, src6554, src8776);
- DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter) {
- v16u8 src0, src1, src2, src3, src4;
- v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
- ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
- dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
- src += (8 * src_stride);
- LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
-
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
- vec2, vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
- vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
- dst, dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src8;
- }
-}
-
-static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- if (4 == height) {
- common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter, height);
- }
-}
-
-static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
- dst += dst_stride;
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
- dst += dst_stride;
-
- src0 = src4;
- }
-}
-
-static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_UB2(src, 16, src0, src5);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
- LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
- LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
- src += (4 * src_stride);
-
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
-
- ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
- ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
- ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src4;
- src5 = src9;
- }
-}
-
-static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter,
- int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5;
- v16u8 src6, src7, src8, src9, src10, src11, filt0;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8u16 filt;
-
- /* rearranging filter_y */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_UB4(src, 16, src0, src3, src6, src9);
- src += src_stride;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_UB2(src, src_stride, src1, src2);
- LD_UB2(dst, dst_stride, dst0, dst1);
- LD_UB2(src + 16, src_stride, src4, src5);
- LD_UB2(dst + 16, dst_stride, dst2, dst3);
- LD_UB2(src + 32, src_stride, src7, src8);
- LD_UB2(dst + 32, dst_stride, dst4, dst5);
- LD_UB2(src + 48, src_stride, src10, src11);
- LD_UB2(dst + 48, dst_stride, dst6, dst7);
- src += (2 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
- ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
- ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- SAT_UH2_UH(tmp4, tmp5, 7);
- PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- SAT_UH2_UH(tmp6, tmp7, 7);
- PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
- ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
-
- ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
- ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- SAT_UH2_UH(tmp4, tmp5, 7);
- PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- SAT_UH2_UH(tmp6, tmp7, 7);
- PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
- dst += (2 * dst_stride);
-
- src0 = src2;
- src3 = src5;
- src6 = src8;
- src9 = src11;
- }
-}
-
-void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- int8_t cnt, filt_ver[8];
-
- if (16 != y_step_q4) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 8:
- common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 16:
- common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 32:
- common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 64:
- common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 8:
- common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 16:
- common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
-
- break;
- case 32:
- common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 64:
- common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
+++ /dev/null
@@ -1,742 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 mask0, mask1, mask2, mask3, out;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v8i16 filt, out0, out1;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- SRARI_H2_SH(out0, out1, FILTER_BITS);
- SAT_SH2_SH(out0, out1, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src0, src1, src2, src3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1, out2,
- out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-}
-
-static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_SB2(src, src_stride, src0, src2);
- LD_SB2(src + 8, src_stride, src1, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (2 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- dst += dst_stride;
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
-
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
-
- src0 = LD_SB(src + 32);
- src2 = LD_SB(src + 48);
- src3 = LD_SB(src + 56);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst + 32);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 48);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, vec0, vec1, res0, res1;
- v8u16 vec2, vec3, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
- SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- MIN_UH2_UH(vec2, vec3, const255);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 vec0, vec1, vec2, vec3, filt0;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16i8 res0, res1, res2, res3;
- v8u16 vec4, vec5, vec6, vec7, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
- vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
- res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask;
- v8u16 vec0, vec1, vec2, vec3, const255, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
-}
-
-static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask, out0, out1;
- v8u16 vec0, vec1, vec2, vec3, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- if (16 == height) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
- }
-}
-
-static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- loop_cnt = (height >> 2) - 1;
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
-
- for (; loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- for (loop_cnt = height >> 1; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- src4 = LD_SB(src);
- src6 = LD_SB(src + 16);
- src7 = LD_SB(src + 24);
- src5 = __msa_sldi_b(src6, src4, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- PCKEV_ST_SB(out6, out7, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
-
- const255 = (v8u16) __msa_ldi_h(255);
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src4 = LD_SB(src + 32);
- src6 = LD_SB(src + 48);
- src7 = LD_SB(src + 56);
- SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- PCKEV_ST_SB(out4, out5, dst + 32);
- PCKEV_ST_SB(out6, out7, dst + 48);
- dst += dst_stride;
- }
-}
-
-void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- int8_t cnt, filt_hor[8];
-
- if (16 != x_step_q4) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0) {
- switch (w) {
- case 4:
- common_hz_2t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 8:
- common_hz_2t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 16:
- common_hz_2t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 32:
- common_hz_2t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 64:
- common_hz_2t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_hz_8t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 8:
- common_hz_8t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 16:
- common_hz_8t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 32:
- common_hz_8t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 64:
- common_hz_8t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve8_msa.c
+++ /dev/null
@@ -1,654 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-const uint8_t mc_filt_mask_arr[16 * 3] = {
- /* 8 width cases */
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- /* 4 width cases */
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
- /* 4 width cases */
- 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
-};
-
-static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
- out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
- SAT_SH2_SH(tmp0, tmp1, 7);
- out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out5 = hz_out9;
- out0 = out2;
- out1 = out3;
- out2 = out4;
- }
-}
-
-static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
- v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
- v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
- v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
- v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= (3 + 3 * src_stride);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
- hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
- filt_hz1, filt_hz2, filt_hz3);
-
- filt = LD_SH(filter_vert);
- SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
- ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
- ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
- ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- XORI_B4_128_SB(src7, src8, src9, src10);
-
- hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
- tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
- tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
- tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
-
- hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
- filt_hz0, filt_hz1, filt_hz2, filt_hz3);
- out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
- tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
- filt_vt2, filt_vt3);
- SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
- vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
- vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(vec0, vec1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out6 = hz_out10;
- out0 = out2;
- out1 = out3;
- out2 = out8;
- out4 = out6;
- out5 = out7;
- out6 = out9;
- }
-}
-
-static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 8; multiple8_cnt--;) {
- common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 8;
- dst += 8;
- }
-}
-
-static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask;
- v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
- v16i8 res0, res1, res2, res3;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
- v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- filt = LD_UH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- src8 = LD_SB(src);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
- SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
- hz_out3, hz_out5, 8);
- hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
- vec4, vec5, vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
- res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- } else if (8 == height) {
- common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- }
-}
-
-static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert) {
- v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
- v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- int8_t *filter_horiz,
- int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
- v16u8 filt_hz, filt_vt, vec0;
- v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_SB(src);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- SAT_UH2_UH(tmp1, tmp2, 7);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- LD_SB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp4 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
- SAT_UH2_UH(tmp3, tmp4, 7);
- PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp5 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp6 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp7 = __msa_dotp_u_h(vec0, filt_vt);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp8 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
- SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
- PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- if (4 == height) {
- common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert);
- } else {
- common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height);
- }
-}
-
-static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
- v8i16 filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_SH(filter_horiz);
- filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
- filt = LD_SH(filter_vert);
- filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- SAT_UH2_UH(tmp1, tmp2, 7);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- SAT_UH2_UH(tmp1, tmp2, 7);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- SAT_UH2_UH(tmp1, tmp2, 7);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
- SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
- SAT_UH2_UH(tmp1, tmp2, 7);
- PCKEV_ST_SB(tmp1, tmp2, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 2; multiple8_cnt--;) {
- common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter_horiz, int8_t *filter_vert,
- int32_t height) {
- int32_t multiple8_cnt;
- for (multiple8_cnt = 4; multiple8_cnt--;) {
- common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
- filter_vert, height);
- src += 16;
- dst += 16;
- }
-}
-
-void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t x_step_q4,
- const int16_t *filter_y, int32_t y_step_q4,
- int32_t w, int32_t h) {
- int8_t cnt, filt_hor[8], filt_ver[8];
-
- if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_x)[1] == 0x800000 &&
- ((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0 &&
- ((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
- break;
- case 8:
- common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
- break;
- case 16:
- common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
- break;
- case 32:
- common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
- break;
- case 64:
- common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_hor[3], &filt_ver[3], (int32_t)h);
- break;
- default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else if (((const int32_t *)filter_x)[0] == 0 ||
- ((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- } else {
- switch (w) {
- case 4:
- common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
- break;
- case 8:
- common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
- break;
- case 16:
- common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
- break;
- case 32:
- common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
- break;
- case 64:
- common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_hor, filt_ver, (int32_t)h);
- break;
- default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
+++ /dev/null
@@ -1,745 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
-
-static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
- v16i8 src10998, filt0, filt1, filt2, filt3;
- v16u8 out;
- v8i16 filt, out10, out32;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
- src4332, src6554);
- XORI_B3_128_SB(src2110, src4332, src6554);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
- XORI_B2_128_SB(src8776, src10998);
- out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
- filt1, filt2, filt3);
- out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
- filt1, filt2, filt3);
- SRARI_H2_SH(out10, out32, FILTER_BITS);
- SAT_SH2_SH(out10, out32, 7);
- out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src2110 = src6554;
- src4332 = src8776;
- src6554 = src10998;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
- v16u8 tmp0, tmp1;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
- tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
- tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height,
- int32_t width) {
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src_tmp += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
- src32_r, src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
- src32_l, src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src_tmp += (4 * src_stride);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
- out3_r, tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
- dst_tmp += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 32);
-}
-
-static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 64);
-}
-
-static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4;
- v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
- v16u8 filt0;
- v8i16 filt;
- v8u16 tmp0, tmp1;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 filt;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- src8 = LD_SB(src);
- src += src_stride;
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
- src76_r, src87_r);
- ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
- src87_r, src76_r, src2110, src4332, src6554, src8776);
- DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
-}
-
-static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
- ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
- src += (8 * src_stride);
-
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
- vec2, vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
- vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src8;
- }
-}
-
-static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- src0 = src4;
- }
-}
-
-static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src5 = LD_UB(src + 16);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
- LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
- src += (4 * src_stride);
-
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
-
- ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
- ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
- ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src4;
- src5 = src9;
- }
-}
-
-static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB4(src, 16, src0, src3, src6, src9);
- src += src_stride;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_UB2(src, src_stride, src1, src2);
- LD_UB2(src + 16, src_stride, src4, src5);
- LD_UB2(src + 32, src_stride, src7, src8);
- LD_UB2(src + 48, src_stride, src10, src11);
- src += (2 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
- ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- SAT_UH2_UH(tmp4, tmp5, 7);
- PCKEV_ST_SB(tmp4, tmp5, dst + 16);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- SAT_UH2_UH(tmp6, tmp7, 7);
- PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
- ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
- PCKEV_ST_SB(tmp0, tmp1, dst + 32);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
- PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
-
- ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
- ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- SAT_UH2_UH(tmp4, tmp5, 7);
- PCKEV_ST_SB(tmp4, tmp5, dst + 48);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- SAT_UH2_UH(tmp6, tmp7, 7);
- PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
- dst += (2 * dst_stride);
-
- src0 = src2;
- src3 = src5;
- src6 = src8;
- src9 = src11;
- }
-}
-
-void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- int8_t cnt, filt_ver[8];
-
- if (16 != y_step_q4) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- return;
- }
-
- for (cnt = 8; cnt--;) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_vt_2t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 8:
- common_vt_2t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 16:
- common_vt_2t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 32:
- common_vt_2t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 64:
- common_vt_2t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_vt_8t_4w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 8:
- common_vt_8t_8w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 16:
- common_vt_8t_16w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 32:
- common_vt_8t_32w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 64:
- common_vt_8t_64w_msa(src, (int32_t)src_stride,
- dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ /dev/null
@@ -1,232 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_dsp/mips/macros_msa.h"
-
-static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint32_t out0, out1, out2, out3;
- v16u8 src0, src1, src2, src3;
- v16u8 dst0, dst1, dst2, dst3;
-
- if (0 == (height % 4)) {
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
-
- out0 = __msa_copy_u_w((v4i32)dst0, 0);
- out1 = __msa_copy_u_w((v4i32)dst1, 0);
- out2 = __msa_copy_u_w((v4i32)dst2, 0);
- out3 = __msa_copy_u_w((v4i32)dst3, 0);
- SW4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == (height % 2)) {
- for (cnt = (height / 2); cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
-
- LD_UB2(dst, dst_stride, dst0, dst1);
-
- AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-
- out0 = __msa_copy_u_w((v4i32)dst0, 0);
- out1 = __msa_copy_u_w((v4i32)dst1, 0);
- SW(out0, dst);
- dst += dst_stride;
- SW(out1, dst);
- dst += dst_stride;
- }
- }
-}
-
-static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint64_t out0, out1, out2, out3;
- v16u8 src0, src1, src2, src3;
- v16u8 dst0, dst1, dst2, dst3;
-
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
-
- out0 = __msa_copy_u_d((v2i64)dst0, 0);
- out1 = __msa_copy_u_d((v2i64)dst1, 0);
- out2 = __msa_copy_u_d((v2i64)dst2, 0);
- out3 = __msa_copy_u_d((v2i64)dst3, 0);
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
- for (cnt = (height / 8); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
- ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
- dst += (8 * dst_stride);
- }
-}
-
-static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint8_t *dst_dup = dst;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
- for (cnt = (height / 8); cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
- LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
- dst_dup += (4 * dst_stride);
- LD_UB4(src, src_stride, src8, src10, src12, src14);
- LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
- src += (4 * src_stride);
- LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
- LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
- dst_dup += (4 * dst_stride);
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
- dst8, dst9, dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
- dst12, dst13, dst14, dst15);
-
- ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
- ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
- ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint8_t *dst_dup = dst;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
- v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(src, 16, src4, src5, src6, src7);
- src += src_stride;
- LD_UB4(src, 16, src8, src9, src10, src11);
- src += src_stride;
- LD_UB4(src, 16, src12, src13, src14, src15);
- src += src_stride;
-
- LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
- dst_dup += dst_stride;
- LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
- dst_dup += dst_stride;
-
- AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
- dst0, dst1, dst2, dst3);
- AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
- dst4, dst5, dst6, dst7);
- AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
- dst8, dst9, dst10, dst11);
- AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
- dst12, dst13, dst14, dst15);
-
- ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
- dst += dst_stride;
- ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
- dst += dst_stride;
- ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
- dst += dst_stride;
- ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
- dst += dst_stride;
- }
-}
-
-void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
-
- switch (w) {
- case 4: {
- avg_width4_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 8: {
- avg_width8_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 16: {
- avg_width16_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 32: {
- avg_width32_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 64: {
- avg_width64_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- default: {
- int32_t lp, cnt;
- for (cnt = h; cnt--;) {
- for (lp = 0; lp < w; ++lp) {
- dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
- }
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ /dev/null
@@ -1,247 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <string.h>
-#include "vpx_dsp/mips/macros_msa.h"
-
-static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- for (cnt = height >> 3; cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 4) {
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 2) {
- for (cnt = (height / 2); cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
-
- SD(out0, dst);
- dst += dst_stride;
- SD(out1, dst);
- dst += dst_stride;
- }
- }
-}
-
-static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t width) {
- int32_t cnt, loop_cnt;
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src_tmp, src_stride,
- src0, src1, src2, src3, src4, src5, src6, src7);
- src_tmp += (8 * src_stride);
-
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
- dst_tmp, dst_stride);
- dst_tmp += (8 * dst_stride);
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
- dst += (8 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
-}
-
-void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
-
- switch (w) {
- case 4: {
- uint32_t cnt, tmp;
- /* 1 word storage */
- for (cnt = h; cnt--;) {
- tmp = LW(src);
- SW(tmp, dst);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- case 8: {
- copy_width8_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 16: {
- copy_width16_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 32: {
- copy_width32_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 64: {
- copy_width64_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- default: {
- uint32_t cnt;
- for (cnt = h; cnt--;) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- }
-}
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ /dev/null
@@ -1,119 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
-
-#include "vp9/common/vp9_filter.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-extern const uint8_t mc_filt_mask_arr[16 * 3];
-
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
- filt0, filt1, filt2, filt3) ({ \
- v8i16 tmp0, tmp1; \
- \
- tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
- tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
- tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
- tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
- tmp0 = __msa_adds_s_h(tmp0, tmp1); \
- \
- tmp0; \
-})
-
-#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
- filt_h0, filt_h1, filt_h2, filt_h3) ({ \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
- v8i16 hz_out_m; \
- \
- VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
- vec0_m, vec1_m, vec2_m, vec3_m); \
- hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
- filt_h0, filt_h1, filt_h2, filt_h3); \
- \
- hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
- hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
- \
- hz_out_m; \
-})
-
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1) { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
- DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
- DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
- DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
- ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
-}
-
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1, out2, out3) { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
- \
- VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
- res4_m, res5_m, res6_m, res7_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
- res4_m, res5_m, res6_m, res7_m); \
- ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
- res7_m, out0, out1, out2, out3); \
-}
-
-#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
- v16u8 tmp_m; \
- \
- tmp_m = PCKEV_XORI128_UB(in1, in0); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
-}
-
-#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
- v16u8 tmp_m; \
- \
- tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
- ST_UB(tmp_m, (pdst)); \
-}
-
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
- pdst, stride) { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- \
- PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
- PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
-}
-#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
--- a/vp9/common/vp9_convolve.c
+++ /dev/null
@@ -1,557 +1,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/vp9_filter.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- int x, y;
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w, int h) {
- int x, y;
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int x, y;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- int x, y;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[135 * 64];
- int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
- x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h);
-}
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
-
- vp9_convolve8_c(src, src_stride, temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
-}
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- int r;
-
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- int x, y;
-
- (void)filter_x; (void)filter_x_stride;
- (void)filter_y; (void)filter_y_stride;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x)
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_x[k] * x_filter[k];
- dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- int k, sum = 0;
- for (k = 0; k < SUBPEL_TAPS; ++k)
- sum += src_y[k * src_stride] * y_filter[k];
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4,
- int w, int h, int bd) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[64 * 135];
- int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= 64);
- assert(h <= 64);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- x_filters, x0_q4, x_step_q4, w,
- intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
- w, h, bd);
-}
-
-
-void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- (void)filter_y;
- (void)y_step_q4;
-
- highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
-}
-
-void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- (void)filter_y;
- (void)y_step_q4;
-
- highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, w, h, bd);
-}
-
-void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
- (void)filter_x;
- (void)x_step_q4;
-
- highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
- (void)filter_x;
- (void)x_step_q4;
-
- highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
- y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- highbd_convolve(src, src_stride, dst, dst_stride,
- filters_x, x0_q4, x_step_q4,
- filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- // Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
-
- vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
- NULL, 0, NULL, 0, w, h, bd);
-}
-
-void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
- int r;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w * sizeof(uint16_t));
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
- int x, y;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-#endif
--- a/vp9/common/vp9_convolve.h
+++ /dev/null
@@ -1,38 +1,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VP9_COMMON_VP9_CONVOLVE_H_
-#define VP9_COMMON_VP9_CONVOLVE_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd);
-#endif
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP9_COMMON_VP9_CONVOLVE_H_
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -11,9 +11,10 @@
#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
#define VP9_COMMON_VP9_ENTROPYMODE_H_
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
@@ -20,13 +21,6 @@
extern "C" {
#endif
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
#define EIGHTTAP 0
#define EIGHTTAP_SMOOTH 1
#define EIGHTTAP_SHARP 2
@@ -36,9 +30,8 @@
// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
#define SWITCHABLE 4 /* should be the last one */
-typedef uint8_t INTERP_FILTER;
-typedef int16_t InterpKernel[SUBPEL_TAPS];
+typedef uint8_t INTERP_FILTER;
extern const InterpKernel *vp9_filter_kernels[4];
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -15,6 +15,9 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -16,7 +16,6 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -11,8 +11,10 @@
#ifndef VP9_COMMON_VP9_RECONINTER_H_
#define VP9_COMMON_VP9_RECONINTER_H_
-#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -11,6 +11,9 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_once.h"
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -54,12 +54,6 @@
$avx2_x86_64 = 'avx2';
}
-# optimizations which depend on multiple features
-$avx2_ssse3 = '';
-if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
- $avx2_ssse3 = 'avx2';
-}
-
#
# post proc
#
@@ -86,33 +80,6 @@
add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
}
-
-#
-# Sub Pixel Filters
-#
-add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
#
# dct
--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -8,9 +8,10 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
+#include "vpx_dsp/vpx_filter.h"
static INLINE int scaled_x(int val, const struct scale_factors *sf) {
return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
@@ -81,85 +82,85 @@
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->predict[0][0][0] = vp9_convolve_copy;
- sf->predict[0][0][1] = vp9_convolve_avg;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve_copy;
+ sf->predict[0][0][1] = vpx_convolve_avg;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->predict[0][0][0] = vp9_convolve8_vert;
- sf->predict[0][0][1] = vp9_convolve8_avg_vert;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8_vert;
+ sf->predict[0][0][1] = vpx_convolve8_avg_vert;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->predict[0][0][0] = vp9_convolve8_horiz;
- sf->predict[0][0][1] = vp9_convolve8_avg_horiz;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve8_horiz;
+ sf->predict[0][0][1] = vpx_convolve8_avg_horiz;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->predict[0][0][0] = vp9_convolve8;
- sf->predict[0][0][1] = vp9_convolve8_avg;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8;
+ sf->predict[0][0][1] = vpx_convolve8_avg;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- sf->predict[1][1][0] = vp9_convolve8;
- sf->predict[1][1][1] = vp9_convolve8_avg;
+ sf->predict[1][1][0] = vpx_convolve8;
+ sf->predict[1][1][1] = vpx_convolve8_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_highbd) {
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions.
- sf->highbd_predict[1][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
}
#endif
}
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -12,7 +12,7 @@
#define VP9_COMMON_VP9_SCALE_H_
#include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vpx_dsp/vpx_convolve.h"
#ifdef __cplusplus
extern "C" {
--- a/vp9/common/x86/convolve.h
+++ /dev/null
@@ -1,296 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VP9_COMMON_X86_CONVOLVE_H_
-#define VP9_COMMON_X86_CONVOLVE_H_
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
- const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h); \
- } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 7); \
- vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 1); \
- vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } \
- } else { \
- vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
- } \
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-
-typedef void highbd_filter8_1dfunction (
- const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter,
- int bd
-);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
- ptrdiff_t src_stride, \
- uint8_t *dst8, \
- ptrdiff_t dst_stride, \
- const int16_t *filter_x, \
- int x_step_q4, \
- const int16_t *filter_y, \
- int y_step_q4, \
- int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
-}
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 7, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 1, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } \
- } else { \
- vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h, bd); \
- } \
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#endif // VP9_COMMON_X86_CONVOLVE_H_
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ /dev/null
@@ -1,162 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-#include "vp9/common/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction vp9_filter_block1d16_v8_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_sse2;
-filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
-
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
-
-// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_ , sse2);
-
-#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2;
-
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
-
-// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- sse2);
-
-// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_ , sse2);
-#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-#endif // HAVE_SSE2
--- a/vp9/common/x86/vp9_copy_sse2.asm
+++ /dev/null
@@ -1,156 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1
-INIT_XMM sse2
-cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
- fx, fxs, fy, fys, w, h
- mov r4d, dword wm
- cmp r4d, 4
- je .w4
- cmp r4d, 8
- je .w8
- cmp r4d, 16
- je .w16
- cmp r4d, 32
- je .w32
-
- mov r4d, dword hm
-.loop64:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
- add srcq, src_strideq
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
- pavgb m2, [dstq+32]
- pavgb m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- add dstq, dst_strideq
- dec r4d
- jnz .loop64
- RET
-
-.w32:
- mov r4d, dword hm
-.loop32:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+src_strideq]
- movu m3, [srcq+src_strideq+16]
- lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq +16]
- pavgb m2, [dstq+dst_strideq]
- pavgb m3, [dstq+dst_strideq+16]
-%endif
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+dst_strideq ], m2
- mova [dstq+dst_strideq+16], m3
- lea dstq, [dstq+dst_strideq*2]
- sub r4d, 2
- jnz .loop32
- RET
-
-.w16:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop16:
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
- movu m2, [srcq+src_strideq*2]
- movu m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
- pavgb m2, [dstq+dst_strideq*2]
- pavgb m3, [dstq+r6q]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq ], m1
- mova [dstq+dst_strideq*2], m2
- mova [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop16
- RET
-
-INIT_MMX sse
-.w8:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop8:
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
- movu m2, [srcq+src_strideq*2]
- movu m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
- pavgb m2, [dstq+dst_strideq*2]
- pavgb m3, [dstq+r6q]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq ], m1
- mova [dstq+dst_strideq*2], m2
- mova [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop8
- RET
-
-.w4:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop4:
- movh m0, [srcq]
- movh m1, [srcq+src_strideq]
- movh m2, [srcq+src_strideq*2]
- movh m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- movh m4, [dstq]
- movh m5, [dstq+dst_strideq]
- movh m6, [dstq+dst_strideq*2]
- movh m7, [dstq+r6q]
- pavgb m0, m4
- pavgb m1, m5
- pavgb m2, m6
- pavgb m3, m7
-%endif
- movh [dstq ], m0
- movh [dstq+dst_strideq ], m1
- movh [dstq+dst_strideq*2], m2
- movh [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop4
- RET
-%endmacro
-
-convolve_fn copy
-convolve_fn avg
--- a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,962 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro HIGH_GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x00000040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm6
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm4
- punpcklwd xmm1, xmm7
-
- movdqa k0k6, xmm0
- movdqa k2k5, xmm2
- movdqa k3k4, xmm3
- movdqa k1k7, xmm1
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- ;Compute max and min values of a pixel
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm0, rdx
- movq xmm1, rcx
- pshufd xmm0, xmm0, 0b
- movdqa xmm2, xmm0
- psllw xmm0, xmm1
- psubw xmm0, xmm2
- pxor xmm1, xmm1
- movdqa max, xmm0 ;max value (for clamping)
- movdqa min, xmm1 ;min value (for clamping)
-
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
- punpcklwd xmm0, xmm6 ;two row in one register
- punpcklwd xmm1, xmm7
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm4
-
- pmaddwd xmm0, k0k6 ;multiply the filter factors
- pmaddwd xmm1, k1k7
- pmaddwd xmm2, k2k5
- pmaddwd xmm3, k3k4
-
- paddd xmm0, xmm1 ;sum
- paddd xmm0, xmm2
- paddd xmm0, xmm3
-
- paddd xmm0, krd ;rounding
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm0 ;pack to word
-
- ;clamp the values
- pminsw xmm0, max
- pmaxsw xmm0, min
-
-%if %1
- movq xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
- movq [rdi], xmm0
-%endm
-
-%macro HIGH_GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
- punpcklwd xmm0, xmm1
- punpckhwd xmm6, xmm7
- punpckhwd xmm2, xmm5
- punpckhwd xmm3, xmm4
-
- movdqa k0k1, xmm0 ;store filter factors on stack
- movdqa k6k7, xmm6
- movdqa k2k5, xmm2
- movdqa k3k4, xmm3
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- ;Compute max and min values of a pixel
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm0, rdx
- movq xmm1, rcx
- pshufd xmm0, xmm0, 0b
- movdqa xmm2, xmm0
- psllw xmm0, xmm1
- psubw xmm0, xmm2
- pxor xmm1, xmm1
- movdqa max, xmm0 ;max value (for clamping)
- movdqa min, xmm1 ;min value (for clamping)
-%endm
-
-%macro LOAD_VERT_8 1
- movdqu xmm0, [rsi + %1] ;0
- movdqu xmm1, [rsi + rax + %1] ;1
- movdqu xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movdqu xmm7, [rsi + rdx * 2 + %1] ;7
- movdqu xmm2, [rsi + rax + %1] ;2
- movdqu xmm3, [rsi + rax * 2 + %1] ;3
- movdqu xmm4, [rsi + rdx + %1] ;4
- movdqu xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro HIGH_APPLY_FILTER_8 2
- movdqu temp, xmm4
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm1
- punpckhwd xmm4, xmm1
- movdqa xmm1, xmm6
- punpcklwd xmm6, xmm7
- punpckhwd xmm1, xmm7
- movdqa xmm7, xmm2
- punpcklwd xmm2, xmm5
- punpckhwd xmm7, xmm5
-
- movdqu xmm5, temp
- movdqu temp, xmm4
- movdqa xmm4, xmm3
- punpcklwd xmm3, xmm5
- punpckhwd xmm4, xmm5
- movdqu xmm5, temp
-
- pmaddwd xmm0, k0k1
- pmaddwd xmm5, k0k1
- pmaddwd xmm6, k6k7
- pmaddwd xmm1, k6k7
- pmaddwd xmm2, k2k5
- pmaddwd xmm7, k2k5
- pmaddwd xmm3, k3k4
- pmaddwd xmm4, k3k4
-
- paddd xmm0, xmm6
- paddd xmm0, xmm2
- paddd xmm0, xmm3
- paddd xmm5, xmm1
- paddd xmm5, xmm7
- paddd xmm5, xmm4
-
- paddd xmm0, krd ;rounding
- paddd xmm5, krd
- psrad xmm0, 7 ;shift
- psrad xmm5, 7
- packssdw xmm0, xmm5 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, max
- pmaxsw xmm0, min
-
-%if %1
- movdqu xmm1, [rdi + %2]
- pavgw xmm0, xmm1
-%endif
- movdqu [rdi + %2], xmm0
-%endm
-
-;void vp9_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movq xmm0, [rsi] ;load src: row 0
- movq xmm1, [rsi + rax] ;1
- movq xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2] ;7
- movq xmm2, [rsi + rax] ;2
- movq xmm3, [rsi + rax * 2] ;3
- movq xmm4, [rsi + rdx] ;4
- movq xmm5, [rsi + rax * 4] ;5
-
- HIGH_APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 16
- HIGH_APPLY_FILTER_8 0, 16
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movq xmm0, [rsi] ;load src: row 0
- movq xmm1, [rsi + rax] ;1
- movq xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2] ;7
- movq xmm2, [rsi + rax] ;2
- movq xmm3, [rsi + rax * 2] ;3
- movq xmm4, [rsi + rdx] ;4
- movq xmm5, [rsi + rax * 4] ;5
-
- HIGH_APPLY_FILTER_4 1
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 1, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 1, 0
- sub rsi, rax
-
- LOAD_VERT_8 16
- HIGH_APPLY_FILTER_8 1, 16
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm4, [rsi + 2]
- movdqa xmm1, xmm0
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm4
-
- psrldq xmm1, 2
- psrldq xmm6, 4
- psrldq xmm7, 6
- psrldq xmm2, 4
- psrldq xmm3, 6
- psrldq xmm5, 2
-
- HIGH_APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 10] ;load src
- movdqu xmm1, [rsi + 12]
- movdqu xmm2, [rsi + 14]
- movdqu xmm3, [rsi + 16]
- movdqu xmm4, [rsi + 18]
- movdqu xmm5, [rsi + 20]
- movdqu xmm6, [rsi + 22]
- movdqu xmm7, [rsi + 24]
-
- HIGH_APPLY_FILTER_8 0, 16
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm4, [rsi + 2]
- movdqa xmm1, xmm0
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm4
-
- psrldq xmm1, 2
- psrldq xmm6, 4
- psrldq xmm7, 6
- psrldq xmm2, 4
- psrldq xmm3, 6
- psrldq xmm5, 2
-
- HIGH_APPLY_FILTER_4 1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 1, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 1, 0
-
- movdqu xmm0, [rsi + 10] ;load src
- movdqu xmm1, [rsi + 12]
- movdqu xmm2, [rsi + 14]
- movdqu xmm3, [rsi + 16]
- movdqu xmm4, [rsi + 18]
- movdqu xmm5, [rsi + 20]
- movdqu xmm6, [rsi + 22]
- movdqu xmm7, [rsi + 24]
-
- HIGH_APPLY_FILTER_8 1, 16
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
--- a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,494 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro HIGH_GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklwd xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm5, rdx
- movq xmm2, rcx
- pshufd xmm5, xmm5, 0b
- movdqa xmm1, xmm5
- psllw xmm5, xmm2
- psubw xmm5, xmm1 ;max value (for clamping)
- pxor xmm2, xmm2 ;min value (for clamping)
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-
- punpcklwd xmm0, xmm1 ;two row in one register
- pmaddwd xmm0, xmm4 ;multiply the filter factors
-
- paddd xmm0, xmm3 ;rounding
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm0 ;pack to word
-
- ;clamp the values
- pminsw xmm0, xmm5
- pmaxsw xmm0, xmm2
-
-%if %1
- movq xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
-
- movq [rdi], xmm0
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-
-%if ARCH_X86_64
-%macro HIGH_GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm6, [rdx] ;load filters
-
- pshuflw xmm7, xmm6, 11111111b ;k3
- pshufhw xmm6, xmm6, 0b ;k4
- psrldq xmm6, 8
- punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm8, rdx
- movq xmm5, rcx
- pshufd xmm8, xmm8, 0b
- movdqa xmm1, xmm8
- psllw xmm8, xmm5
- psubw xmm8, xmm1 ;max value (for clamping)
- pxor xmm5, xmm5 ;min value (for clamping)
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_8 1
- movdqa xmm6, xmm0
- punpckhwd xmm6, xmm1
- punpcklwd xmm0, xmm1
- pmaddwd xmm6, xmm7
- pmaddwd xmm0, xmm7
-
- paddd xmm6, xmm4 ;rounding
- paddd xmm0, xmm4 ;rounding
- psrad xmm6, 7 ;shift
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm6 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
-
-%if %1
- movdqu xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-
-%macro HIGH_APPLY_FILTER_16 1
- movdqa xmm9, xmm0
- movdqa xmm6, xmm2
- punpckhwd xmm9, xmm1
- punpckhwd xmm6, xmm3
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
-
- pmaddwd xmm9, xmm7
- pmaddwd xmm6, xmm7
- pmaddwd xmm0, xmm7
- pmaddwd xmm2, xmm7
-
- paddd xmm9, xmm4 ;rounding
- paddd xmm6, xmm4
- paddd xmm0, xmm4
- paddd xmm2, xmm4
-
- psrad xmm9, 7 ;shift
- psrad xmm6, 7
- psrad xmm0, 7
- psrad xmm2, 7
-
- packssdw xmm0, xmm9 ;pack back to word
- packssdw xmm2, xmm6 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
- pminsw xmm2, xmm8
- pmaxsw xmm2, xmm5
-
-%if %1
- movdqu xmm1, [rdi]
- movdqu xmm3, [rdi + 16]
- pavgw xmm0, xmm1
- pavgw xmm2, xmm3
-%endif
- movdqu [rdi], xmm0 ;store the result
- movdqu [rdi + 16], xmm2 ;store the result
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-%endif
-
-global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movq xmm0, [rsi] ;load src
- movq xmm1, [rsi + 2*rax]
-
- HIGH_APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + 2*rax] ;1
-
- HIGH_APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm2, [rsi + 16]
- movdqu xmm1, [rsi + 2*rax] ;1
- movdqu xmm3, [rsi + 2*rax + 16]
-
- HIGH_APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
-
-global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movq xmm0, [rsi] ;load src
- movq xmm1, [rsi + 2*rax]
-
- HIGH_APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + 2*rax] ;1
-
- HIGH_APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + 2*rax] ;1
- movdqu xmm2, [rsi + 16]
- movdqu xmm3, [rsi + 2*rax + 16]
-
- HIGH_APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
-
-global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 2
-
- HIGH_APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
-
- HIGH_APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
- movdqu xmm2, [rsi + 16]
- movdqu xmm3, [rsi + 18]
-
- HIGH_APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
-
-global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 2
-
- HIGH_APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
-
- HIGH_APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
- movdqu xmm2, [rsi + 16]
- movdqu xmm3, [rsi + 18]
-
- HIGH_APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,602 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// immintrin.h.
-#include "./vp9_rtcd.h"
-
-#include <immintrin.h>
-
-#include "vp9/common/x86/convolve.h"
-#include "vpx_ports/mem.h"
-
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-#if defined(__clang__)
-# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# else // clang > 3.3, and not 5.0 on macosx.
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // clang <= 3.3
-#elif defined(__GNUC__)
-# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-# define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-# else // gcc > 4.7
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-# endif // gcc <= 4.6
-#else // !(gcc || clang)
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // __clang__
-
-static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, srcReg32b2, filtersReg32;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- // multiple the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i-=2) {
- // load the 2 strides of source
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line-3)), 1);
-
- // filter the source buffer
- srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // reading 2 strides of the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm_loadu_si128((const __m128i *)
- (src_ptr+src_pixels_per_line+5)), 1);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // filter the source buffer
- srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
- _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
-
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
- srcRegFilt32b2_1);
-
- src_ptr+=src_stride;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i*)(output_ptr+output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
- output_ptr+=dst_stride;
- }
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
- __m128i srcRegFilt2, srcRegFilt3;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2= _mm_shuffle_epi8(srcReg1,
- _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
- // reading the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- // filter the source buffer
- srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
- _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm256_castsi256_si128(addFilterReg64));
-
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
- }
-}
-
-static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg64;
- __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
- __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, filtersReg32;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the
- // same data in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32,
- _mm256_set1_epi16(0x706u));
-
- // multiple the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- // load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr)));
- srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
- srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
- srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
- srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
- srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
- // have each consecutive loads on the same 256 register
- srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm256_castsi256_si128(srcReg32b2), 1);
- srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm256_castsi256_si128(srcReg32b3), 1);
- srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
- _mm256_castsi256_si128(srcReg32b4), 1);
- srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
- _mm256_castsi256_si128(srcReg32b5), 1);
- srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
- _mm256_castsi256_si128(srcReg32b6), 1);
- srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
- _mm256_castsi256_si128(srcReg32b7), 1);
-
- // merge every two consecutive registers except the last one
- srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
- srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
- // save
- srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
- // save
- srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
- // save
- srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
-
- for (i = output_height; i > 1; i-=2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
- srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
- // add and saturate the results together
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
- src_ptr+=src_stride;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr,
- _mm256_castsi256_si128(srcReg32b1));
-
- // save the next 16 bits
- _mm_store_si128((__m128i*)(output_ptr+out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
-
- output_ptr+=dst_stride;
-
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b1 = srcReg32b3;
- srcReg32b11 = srcReg32b2;
- srcReg32b3 = srcReg32b5;
- srcReg32b2 = srcReg32b4;
- srcReg32b5 = srcReg32b7;
- srcReg32b7 = srcReg32b9;
- }
- if (i > 0) {
- __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
- __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
- // load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- srcRegFilt4 = _mm_unpacklo_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
- srcRegFilt7 = _mm_unpackhi_epi8(
- _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
- _mm256_castsi256_si128(forthFilters));
- srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
- _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
- _mm256_castsi256_si128(secondFilters));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
- srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_min_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_min_epi16(srcRegFilt5, srcRegFilt7));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_max_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm_max_epi16(srcRegFilt5, srcRegFilt7));
-
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm256_castsi256_si128(addFilterReg64));
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
- _mm256_castsi256_si128(addFilterReg64));
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
- srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
- }
-}
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, avx2);
-#endif // HAVE_AX2 && HAVE_SSSE3
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,601 +1,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// tmmintrin.h.
-#include "./vp9_rtcd.h"
-
-#include <tmmintrin.h>
-
-#include "vp9/common/x86/convolve.h"
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
- // duplicate only the third 16 bit in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
- // duplicate only the seconds 16 bits in the filter into the second lane
- // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
- // duplicate only the forth 16 bits in the filter into the second lane
- // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- // loading the local filters
- shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
- shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr+=src_pixels_per_line;
-
- // save only 4 bytes
- *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pixels_per_line;
-
- // save only 8 bytes
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr,
- ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
- // reading the next 16 bytes.
- // (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- // filter the source buffer
- srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
- src_ptr+=src_pixels_per_line;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
- output_ptr+=output_pitch;
- }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- // load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
- for (i = 0; i < output_height; i++) {
- // load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
-
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
-
-static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
- ptrdiff_t src_pitch,
- uint8_t *output_ptr,
- ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- // load the first 7 rows of 16 bytes
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
- for (i = 0; i < output_height; i++) {
- // load the last 16 bytes
- srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
- srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
- srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
- // merge the result together
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
- srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
- // merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
- srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt3, srcRegFilt7));
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt3, srcRegFilt7));
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_max_epi16(srcRegFilt6, srcRegFilt8));
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
-
- // save 16 bytes convolve result
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
-
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- ssse3);
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
--- a/vp9/common/x86/vp9_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,987 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklqdq xmm0, xmm1
- punpcklqdq xmm2, xmm3
- punpcklqdq xmm5, xmm4
- punpcklqdq xmm6, xmm7
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm2
- movdqa k5k4, xmm5
- movdqa k6k7, xmm6
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
- punpckldq xmm0, xmm1 ;two row in one register
- punpckldq xmm6, xmm7
- punpckldq xmm2, xmm3
- punpckldq xmm5, xmm4
-
- punpcklbw xmm0, zero ;unpack to word
- punpcklbw xmm6, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
-
- pmullw xmm0, k0k1 ;multiply the filter factors
- pmullw xmm6, k6k7
- pmullw xmm2, k2k3
- pmullw xmm5, k5k4
-
- paddsw xmm0, xmm6 ;sum
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- psrldq xmm2, 8
- paddsw xmm0, xmm5
- psrldq xmm5, 8
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm0
- punpcklwd xmm1, xmm1
- punpcklwd xmm2, xmm2
- punpcklwd xmm3, xmm3
- punpckhwd xmm4, xmm4
- punpckhwd xmm5, xmm5
- punpckhwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movdqa k0, xmm0 ;store filter factors on stack
- movdqa k1, xmm1
- movdqa k2, xmm2
- movdqa k3, xmm3
- movdqa k4, xmm4
- movdqa k5, xmm5
- movdqa k6, xmm6
- movdqa k7, xmm7
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
- movq xmm0, [rsi + %1] ;0
- movq xmm1, [rsi + rax + %1] ;1
- movq xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2 + %1] ;7
- movq xmm2, [rsi + rax + %1] ;2
- movq xmm3, [rsi + rax * 2 + %1] ;3
- movq xmm4, [rsi + rdx + %1] ;4
- movq xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro APPLY_FILTER_8 2
- punpcklbw xmm0, zero
- punpcklbw xmm1, zero
- punpcklbw xmm6, zero
- punpcklbw xmm7, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
- punpcklbw xmm3, zero
- punpcklbw xmm4, zero
-
- pmullw xmm0, k0
- pmullw xmm1, k1
- pmullw xmm6, k6
- pmullw xmm7, k7
- pmullw xmm2, k2
- pmullw xmm5, k5
- pmullw xmm3, k3
- pmullw xmm4, k4
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm6
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
- paddsw xmm0, xmm3
- paddsw xmm0, xmm4
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi + %2]
- pavgb xmm0, xmm1
-%endif
- movq [rdi + %2], xmm0
-%endm
-
-;void vp9_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 0, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 1
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 1, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 1, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 1, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 1
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 1, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,1071 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro VERTx4 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
- lea rbx, [rdx + rdx*4]
- add rbx, rdx ;pitch * 6
-
-.loop:
- movd xmm0, [rsi] ;A
- movd xmm1, [rsi + rdx] ;B
- movd xmm2, [rsi + rdx * 2] ;C
- movd xmm3, [rax + rdx * 2] ;D
- movd xmm4, [rsi + rdx * 4] ;E
- movd xmm5, [rax + rdx * 4] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movd xmm6, [rsi + rbx] ;G
- movd xmm7, [rax + rbx] ;H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- punpcklbw xmm6, xmm7 ;G H
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- movdqa xmm1, xmm2
- paddsw xmm0, xmm6
- pmaxsw xmm2, xmm4
- pminsw xmm4, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
-
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- add rsi, rdx
- add rax, rdx
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .loop
-%endm
-
-%macro VERTx8 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
- lea rbx, [rdx + rdx*4]
- add rbx, rdx ;pitch * 6
-
-.loop:
- movq xmm0, [rsi] ;A
- movq xmm1, [rsi + rdx] ;B
- movq xmm2, [rsi + rdx * 2] ;C
- movq xmm3, [rax + rdx * 2] ;D
- movq xmm4, [rsi + rdx * 4] ;E
- movq xmm5, [rax + rdx * 4] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movq xmm6, [rsi + rbx] ;G
- movq xmm7, [rax + rbx] ;H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- punpcklbw xmm6, xmm7 ;G H
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm6
- movdqa xmm1, xmm2
- pmaxsw xmm2, xmm4
- pminsw xmm4, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
-
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- add rsi, rdx
- add rax, rdx
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .loop
-%endm
-
-
-%macro VERTx16 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
- lea rbx, [rdx + rdx*4]
- add rbx, rdx ;pitch * 6
-
-.loop:
- movq xmm0, [rsi] ;A
- movq xmm1, [rsi + rdx] ;B
- movq xmm2, [rsi + rdx * 2] ;C
- movq xmm3, [rax + rdx * 2] ;D
- movq xmm4, [rsi + rdx * 4] ;E
- movq xmm5, [rax + rdx * 4] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movq xmm6, [rsi + rbx] ;G
- movq xmm7, [rax + rbx] ;H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- punpcklbw xmm6, xmm7 ;G H
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm6
- movdqa xmm1, xmm2
- pmaxsw xmm2, xmm4
- pminsw xmm4, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
-
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0
-
- movq xmm0, [rsi + 8] ;A
- movq xmm1, [rsi + rdx + 8] ;B
- movq xmm2, [rsi + rdx * 2 + 8] ;C
- movq xmm3, [rax + rdx * 2 + 8] ;D
- movq xmm4, [rsi + rdx * 4 + 8] ;E
- movq xmm5, [rax + rdx * 4 + 8] ;F
-
- punpcklbw xmm0, xmm1 ;A B
- punpcklbw xmm2, xmm3 ;C D
- punpcklbw xmm4, xmm5 ;E F
-
- movq xmm6, [rsi + rbx + 8] ;G
- movq xmm7, [rax + rbx + 8] ;H
- punpcklbw xmm6, xmm7 ;G H
-
- pmaddubsw xmm0, k0k1
- pmaddubsw xmm2, k2k3
- pmaddubsw xmm4, k4k5
- pmaddubsw xmm6, k6k7
-
- paddsw xmm0, xmm6
- movdqa xmm1, xmm2
- pmaxsw xmm2, xmm4
- pminsw xmm4, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
-
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- add rsi, rdx
- add rax, rdx
-%if %1
- movq xmm1, [rdi+8]
- pavgb xmm0, xmm1
-%endif
-
- movq [rdi+8], xmm0
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .loop
-%endm
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx4 0
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx8 0
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx16 0
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx4 1
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx8 1
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- VERTx16 1
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%macro HORIZx4_ROW 2
- movdqa %2, %1
- pshufb %1, [GLOBAL(shuf_t0t1)]
- pshufb %2, [GLOBAL(shuf_t2t3)]
- pmaddubsw %1, k0k1k4k5
- pmaddubsw %2, k2k3k6k7
-
- movdqa xmm4, %1
- movdqa xmm5, %2
- psrldq %1, 8
- psrldq %2, 8
- movdqa xmm6, xmm5
-
- paddsw xmm4, %2
- pmaxsw xmm5, %1
- pminsw %1, xmm6
- paddsw %1, xmm4
- paddsw %1, xmm5
-
- paddsw %1, krd
- psraw %1, 7
- packuswb %1, %1
-%endm
-
-%macro HORIZx4 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm6, xmm4, 0b ;k0_k1
- pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
- pshuflw xmm7, xmm4, 01010101b ;k2_k3
- pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
- pshufd xmm5, xmm5, 0 ;rounding
-
- movdqa k0k1k4k5, xmm6
- movdqa k2k3k6k7, xmm7
- movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
- shr rcx, 1
-.loop:
- ;Do two rows once
- movq xmm0, [rsi - 3] ;load src
- movq xmm1, [rsi + 5]
- movq xmm2, [rsi + rax - 3]
- movq xmm3, [rsi + rax + 5]
- punpcklqdq xmm0, xmm1
- punpcklqdq xmm2, xmm3
-
- HORIZx4_ROW xmm0, xmm1
- HORIZx4_ROW xmm2, xmm3
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
- movd xmm3, [rdi + rdx]
- pavgb xmm2, xmm3
-%endif
- movd [rdi], xmm0
- movd [rdi +rdx], xmm2
-
- lea rsi, [rsi + rax]
- prefetcht0 [rsi + 4 * rax - 3]
- lea rsi, [rsi + rax]
- lea rdi, [rdi + 2 * rdx]
- prefetcht0 [rsi + 2 * rax - 3]
-
- dec rcx
- jnz .loop
-
- ; Do last row if output_height is odd
- movsxd rcx, dword ptr arg(4) ;output_height
- and rcx, 1
- je .done
-
- movq xmm0, [rsi - 3] ; load src
- movq xmm1, [rsi + 5]
- punpcklqdq xmm0, xmm1
-
- HORIZx4_ROW xmm0, xmm1
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-.done
-%endm
-
-%macro HORIZx8_ROW 4
- movdqa %2, %1
- movdqa %3, %1
- movdqa %4, %1
-
- pshufb %1, [GLOBAL(shuf_t0t1)]
- pshufb %2, [GLOBAL(shuf_t2t3)]
- pshufb %3, [GLOBAL(shuf_t4t5)]
- pshufb %4, [GLOBAL(shuf_t6t7)]
-
- pmaddubsw %1, k0k1
- pmaddubsw %2, k2k3
- pmaddubsw %3, k4k5
- pmaddubsw %4, k6k7
-
- paddsw %1, %4
- movdqa %4, %2
- pmaxsw %2, %3
- pminsw %3, %4
- paddsw %1, %3
- paddsw %1, %2
-
- paddsw %1, krd
- psraw %1, 7
- packuswb %1, %1
-%endm
-
-%macro HORIZx8 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
- shr rcx, 1
-
-.loop:
- movq xmm0, [rsi - 3] ;load src
- movq xmm3, [rsi + 5]
- movq xmm4, [rsi + rax - 3]
- movq xmm7, [rsi + rax + 5]
- punpcklqdq xmm0, xmm3
- punpcklqdq xmm4, xmm7
-
- HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
- HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
-%if %1
- movq xmm1, [rdi]
- movq xmm2, [rdi + rdx]
- pavgb xmm0, xmm1
- pavgb xmm4, xmm2
-%endif
- movq [rdi], xmm0
- movq [rdi + rdx], xmm4
-
- lea rsi, [rsi + rax]
- prefetcht0 [rsi + 4 * rax - 3]
- lea rsi, [rsi + rax]
- lea rdi, [rdi + 2 * rdx]
- prefetcht0 [rsi + 2 * rax - 3]
- dec rcx
- jnz .loop
-
- ;Do last row if output_height is odd
- movsxd rcx, dword ptr arg(4) ;output_height
- and rcx, 1
- je .done
-
- movq xmm0, [rsi - 3]
- movq xmm3, [rsi + 5]
- punpcklqdq xmm0, xmm3
-
- HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0
-.done
-%endm
-
-%macro HORIZx16 1
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movq xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
-
-.loop:
- prefetcht0 [rsi + 2 * rax -3]
-
- movq xmm0, [rsi - 3] ;load src data
- movq xmm4, [rsi + 5]
- movq xmm6, [rsi + 13]
- punpcklqdq xmm0, xmm4
- punpcklqdq xmm4, xmm6
-
- movdqa xmm7, xmm0
-
- punpcklbw xmm7, xmm7
- punpckhbw xmm0, xmm0
- movdqa xmm1, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
-
- palignr xmm0, xmm7, 1
- palignr xmm1, xmm7, 5
- pmaddubsw xmm0, k0k1
- palignr xmm2, xmm7, 9
- pmaddubsw xmm1, k2k3
- palignr xmm3, xmm7, 13
-
- pmaddubsw xmm2, k4k5
- pmaddubsw xmm3, k6k7
- paddsw xmm0, xmm3
-
- movdqa xmm3, xmm4
- punpcklbw xmm3, xmm3
- punpckhbw xmm4, xmm4
-
- movdqa xmm5, xmm4
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
-
- palignr xmm4, xmm3, 1
- palignr xmm5, xmm3, 5
- palignr xmm6, xmm3, 9
- palignr xmm7, xmm3, 13
-
- movdqa xmm3, xmm1
- pmaddubsw xmm4, k0k1
- pmaxsw xmm1, xmm2
- pmaddubsw xmm5, k2k3
- pminsw xmm2, xmm3
- pmaddubsw xmm6, k4k5
- paddsw xmm0, xmm2
- pmaddubsw xmm7, k6k7
- paddsw xmm0, xmm1
-
- paddsw xmm4, xmm7
- movdqa xmm7, xmm5
- pmaxsw xmm5, xmm6
- pminsw xmm6, xmm7
- paddsw xmm4, xmm6
- paddsw xmm4, xmm5
-
- paddsw xmm0, krd
- paddsw xmm4, krd
- psraw xmm0, 7
- psraw xmm4, 7
- packuswb xmm0, xmm0
- packuswb xmm4, xmm4
- punpcklqdq xmm0, xmm4
-%if %1
- movdqa xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
-
- lea rsi, [rsi + rax]
- movdqa [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-%endm
-
-;void vp9_filter_block1d4_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 3
- %define k0k1k4k5 [rsp + 16 * 0]
- %define k2k3k6k7 [rsp + 16 * 1]
- %define krd [rsp + 16 * 2]
-
- HORIZx4 0
-
- add rsp, 16 * 3
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- HORIZx8 0
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- HORIZx16 0
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 3
- %define k0k1k4k5 [rsp + 16 * 0]
- %define k2k3k6k7 [rsp + 16 * 1]
- %define krd [rsp + 16 * 2]
-
- HORIZx4 1
-
- add rsp, 16 * 3
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- HORIZx8 1
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
-
- HORIZx16 1
-
- add rsp, 16*5
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-SECTION_RODATA
-align 16
-shuf_t0t1:
- db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
- db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
- db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
- db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
--- a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,448 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklqdq xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- pxor xmm2, xmm2
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
- punpckldq xmm0, xmm1 ;two row in one register
- punpcklbw xmm0, xmm2 ;unpack to word
- pmullw xmm0, xmm4 ;multiply the filter factors
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
-
- paddsw xmm0, xmm3 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
-
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
-
- pshuflw xmm6, xmm7, 11111111b ;k3
- pshufhw xmm7, xmm7, 0b ;k4
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- pxor xmm5, xmm5
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- punpckhbw xmm3, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- pmullw xmm2, xmm6
- pmullw xmm3, xmm7
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm3
-
- paddsw xmm0, xmm4 ;rounding
- paddsw xmm2, xmm4
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
--- a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
+++ /dev/null
@@ -1,422 +1,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- psrldq xmm3, 6
- packsswb xmm3, xmm3
- pshuflw xmm3, xmm3, 0b ;k3_k4
-
- movq xmm2, rcx ;rounding
- pshufd xmm2, xmm2, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm3
-
- paddsw xmm0, xmm2 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- psrldq xmm7, 6
- packsswb xmm7, xmm7
- pshuflw xmm7, xmm7, 0b ;k3_k4
- punpcklwd xmm7, xmm7
-
- movq xmm6, rcx ;rounding
- pshufd xmm6, xmm6, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm7
-
- paddsw xmm0, xmm6 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm2, xmm7
-
- paddsw xmm0, xmm6 ;rounding
- paddsw xmm2, xmm6
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 1
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -12,6 +12,7 @@
#include <stdlib.h> // qsort()
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/bitreader_buffer.h"
--- a/vp9/encoder/vp9_blockiness.c
+++ b/vp9/encoder/vp9_blockiness.c
@@ -8,12 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
static int horizontal_filter(const uint8_t *s) {
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <limits.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
@@ -336,12 +337,12 @@
}
if (decision == FILTER_BLOCK) {
- vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -12,11 +12,12 @@
#include <stdio.h>
#include <limits.h>
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
@@ -2580,18 +2581,18 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor, bd);
} else {
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
}
#else
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1504,15 +1504,15 @@
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1577,15 +1577,15 @@
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -15,6 +15,9 @@
#include <stdlib.h>
#include <string.h>
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_resize.h"
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -13,14 +13,10 @@
VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
VP9_COMMON_SRCS-yes += common/vp9_blockd.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.h
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
VP9_COMMON_SRCS-yes += common/vp9_entropy.c
VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.h
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
VP9_COMMON_SRCS-yes += common/vp9_idct.c
@@ -31,6 +27,8 @@
VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.c
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
@@ -64,33 +62,16 @@
VP9_COMMON_SRCS-yes += common/vp9_scan.c
VP9_COMMON_SRCS-yes += common/vp9_scan.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
-ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
-endif
-
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
-endif
-
# common (c)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -113,15 +94,6 @@
endif
# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
@@ -151,11 +123,6 @@
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
ifeq ($(HAVE_NEON_ASM), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
@@ -167,11 +134,6 @@
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -1,0 +1,393 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vpx_convolve8_avg_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+}
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vpx_convolve8_avg_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -1,0 +1,302 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; VP9_FILTER_WEIGHT == 128
+ ; VP9_FILTER_SHIFT == 7
+
+ EXPORT |vpx_convolve8_avg_horiz_neon|
+ EXPORT |vpx_convolve8_avg_vert_neon|
+ IMPORT |vpx_convolve8_avg_horiz_c|
+ IMPORT |vpx_convolve8_avg_vert_c|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|vpx_convolve8_avg_horiz_neon| PROC
+ ldr r12, [sp, #4] ; x_step_q4
+ cmp r12, #16
+ bne vpx_convolve8_avg_horiz_c
+
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r5, [sp, #32] ; filter_x
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ sub r8, r1, r1, lsl #2 ; -src_stride * 3
+ add r8, r8, #4 ; -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 ; -dst_stride * 3
+ add r4, r4, #4 ; -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 ; reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+vpx_convolve8_avg_loop_horiz_v
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ ; save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+vpx_convolve8_avg_loop_horiz
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ ; extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ ; slightly out of order load to match the existing data
+ vld1.u32 {d6[0]}, [r2], r3
+ vld1.u32 {d7[0]}, [r2], r3
+ vld1.u32 {d6[1]}, [r2], r3
+ vld1.u32 {d7[1]}, [r2], r3
+
+ sub r2, r2, r3, lsl #2 ; reset for store
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ ; average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ vst1.u32 {d2[0]}, [r2@32], r3
+ vst1.u32 {d3[0]}, [r2@32], r3
+ vst1.u32 {d2[1]}, [r2@32], r3
+ vst1.u32 {d3[1]}, [r2@32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 ; w -= 4
+ bgt vpx_convolve8_avg_loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r9 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt vpx_convolve8_avg_loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+|vpx_convolve8_avg_vert_neon| PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vpx_convolve8_avg_vert_c
+
+ push {r4-r8, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] ; filter_y
+ ldr r6, [sp, #40] ; w
+ ldr lr, [sp, #44] ; h
+
+ vld1.s16 {q0}, [r4] ; filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+vpx_convolve8_avg_loop_vert_h
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr ; h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+vpx_convolve8_avg_loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ ; extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ vld1.u32 {d6[0]}, [r5@32], r3
+ vld1.u32 {d6[1]}, [r8@32], r3
+ vld1.u32 {d7[0]}, [r5@32], r3
+ vld1.u32 {d7[1]}, [r8@32], r3
+
+ pld [r7]
+ pld [r4]
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r5]
+ pld [r8]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; average the new value and the dst value
+ vrhadd.u8 q1, q1, q3
+
+ sub r5, r5, r3, lsl #1 ; reset for store
+ sub r8, r8, r3, lsl #1
+
+ vst1.u32 {d2[0]}, [r5@32], r3
+ vst1.u32 {d2[1]}, [r8@32], r3
+ vst1.u32 {d3[0]}, [r5@32], r3
+ vst1.u32 {d3[1]}, [r8@32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 ; h -= 4
+ bgt vpx_convolve8_avg_loop_vert
+
+ ; outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 ; w -= 4
+ bgt vpx_convolve8_avg_loop_vert_h
+
+ pop {r4-r8, pc}
+
+ ENDP
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -1,0 +1,360 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vpx_convolve8_horiz_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ const uint8_t *s, *psrc;
+ uint8_t *d, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vpx_convolve8_vert_neon(
+ const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -1,0 +1,280 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; VP9_FILTER_WEIGHT == 128
+ ; VP9_FILTER_SHIFT == 7
+
+ EXPORT |vpx_convolve8_horiz_neon|
+ EXPORT |vpx_convolve8_vert_neon|
+ IMPORT |vpx_convolve8_horiz_c|
+ IMPORT |vpx_convolve8_vert_c|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|vpx_convolve8_horiz_neon| PROC
+ ldr r12, [sp, #4] ; x_step_q4
+ cmp r12, #16
+ bne vpx_convolve8_horiz_c
+
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r5, [sp, #32] ; filter_x
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ sub r8, r1, r1, lsl #2 ; -src_stride * 3
+ add r8, r8, #4 ; -src_stride * 3 + 4
+
+ sub r4, r3, r3, lsl #2 ; -dst_stride * 3
+ add r4, r4, #4 ; -dst_stride * 3 + 4
+
+ rsb r9, r6, r1, lsl #2 ; reset src for outer loop
+ sub r9, r9, #7
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+vpx_convolve8_loop_horiz_v
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d27}, [r0], r8
+
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
+ pld [r0, r1, lsl #2]
+
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+
+ ; save a few instructions in the inner loop
+ vswp d17, d18
+ vmov d23, d21
+
+ add r0, r0, #3
+
+vpx_convolve8_loop_horiz
+ add r5, r0, #64
+
+ vld1.32 {d28[]}, [r0], r1
+ vld1.32 {d29[]}, [r0], r1
+ vld1.32 {d31[]}, [r0], r1
+ vld1.32 {d30[]}, [r0], r8
+
+ pld [r5]
+
+ vtrn.16 d28, d31
+ vtrn.16 d29, d30
+ vtrn.8 d28, d29
+ vtrn.8 d31, d30
+
+ pld [r5, r1]
+
+ ; extract to s16
+ vtrn.32 q14, q15
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+
+ pld [r5, r1, lsl #1]
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
+ MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
+ MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
+ MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+ pld [r5, -r8]
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ vst1.u32 {d2[0]}, [r2@32], r3
+ vst1.u32 {d3[0]}, [r2@32], r3
+ vst1.u32 {d2[1]}, [r2@32], r3
+ vst1.u32 {d3[1]}, [r2@32], r4
+
+ vmov q8, q9
+ vmov d20, d23
+ vmov q11, q12
+ vmov q9, q13
+
+ subs r6, r6, #4 ; w -= 4
+ bgt vpx_convolve8_loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r9 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt vpx_convolve8_loop_horiz_v
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+|vpx_convolve8_vert_neon| PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vpx_convolve8_vert_c
+
+ push {r4-r8, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r4, [sp, #32] ; filter_y
+ ldr r6, [sp, #40] ; w
+ ldr lr, [sp, #44] ; h
+
+ vld1.s16 {q0}, [r4] ; filter_y
+
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+
+vpx_convolve8_loop_vert_h
+ mov r4, r0
+ add r7, r0, r1, asr #1
+ mov r5, r2
+ add r8, r2, r3, asr #1
+ mov r12, lr ; h loop counter
+
+ vld1.u32 {d16[0]}, [r4], r1
+ vld1.u32 {d16[1]}, [r7], r1
+ vld1.u32 {d18[0]}, [r4], r1
+ vld1.u32 {d18[1]}, [r7], r1
+ vld1.u32 {d20[0]}, [r4], r1
+ vld1.u32 {d20[1]}, [r7], r1
+ vld1.u32 {d22[0]}, [r4], r1
+
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+
+vpx_convolve8_loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d24[0]}, [r7], r1
+ vld1.u32 {d26[0]}, [r4], r1
+ vld1.u32 {d26[1]}, [r7], r1
+ vld1.u32 {d24[1]}, [r4], r1
+
+ ; extract to s16
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ pld [r5]
+ pld [r8]
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+ pld [r5, r3]
+ pld [r8, r3]
+
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+ pld [r7]
+ pld [r4]
+
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+ pld [r7, r1]
+ pld [r4, r1]
+
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
+
+ vst1.u32 {d2[0]}, [r5@32], r3
+ vst1.u32 {d2[1]}, [r8@32], r3
+ vst1.u32 {d3[0]}, [r5@32], r3
+ vst1.u32 {d3[1]}, [r8@32], r3
+
+ vmov q8, q10
+ vmov d18, d22
+ vmov d19, d24
+ vmov q10, q13
+ vmov d22, d25
+
+ subs r12, r12, #4 ; h -= 4
+ bgt vpx_convolve8_loop_vert
+
+ ; outer loop
+ add r0, r0, #4
+ add r2, r2, #4
+ subs r6, r6, #4 ; w -= 4
+ bgt vpx_convolve8_loop_vert_h
+
+ pop {r4-r8, pc}
+
+ ENDP
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -1,0 +1,147 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -1,0 +1,116 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_avg_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_avg_neon| PROC
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #32]
+ mov r6, r2
+
+ cmp r4, #32
+ bgt avg64
+ beq avg32
+ cmp r4, #8
+ bgt avg16
+ beq avg8
+ b avg4
+
+avg64
+ sub lr, r1, #32
+ sub r4, r3, #32
+avg64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ pld [r2, r3]
+ vld1.8 {q8-q9}, [r6@128]!
+ vld1.8 {q10-q11}, [r6@128], r4
+ vrhadd.u8 q0, q0, q8
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r4
+ subs r5, r5, #1
+ bgt avg64_h
+ pop {r4-r6, pc}
+
+avg32
+ vld1.8 {q0-q1}, [r0], r1
+ vld1.8 {q2-q3}, [r0], r1
+ vld1.8 {q8-q9}, [r6@128], r3
+ vld1.8 {q10-q11}, [r6@128], r3
+ pld [r0]
+ vrhadd.u8 q0, q0, q8
+ pld [r0, r1]
+ vrhadd.u8 q1, q1, q9
+ pld [r6]
+ vrhadd.u8 q2, q2, q10
+ pld [r6, r3]
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg32
+ pop {r4-r6, pc}
+
+avg16
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q2}, [r6@128], r3
+ vld1.8 {q3}, [r6@128], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q2
+ pld [r6]
+ pld [r6, r3]
+ vrhadd.u8 q1, q1, q3
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt avg16
+ pop {r4-r6, pc}
+
+avg8
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d2}, [r6@64], r3
+ vld1.8 {d3}, [r6@64], r3
+ pld [r0]
+ pld [r0, r1]
+ vrhadd.u8 q0, q0, q1
+ pld [r6]
+ pld [r6, r3]
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d1}, [r2@64], r3
+ subs r5, r5, #2
+ bgt avg8
+ pop {r4-r6, pc}
+
+avg4
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[0]}, [r6@32], r3
+ vld1.32 {d2[1]}, [r6@32], r3
+ vrhadd.u8 d0, d0, d2
+ vst1.32 {d0[0]}, [r2@32], r3
+ vst1.32 {d0[1]}, [r2@32], r3
+ subs r5, r5, #2
+ bgt avg4
+ pop {r4-r6, pc}
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -1,0 +1,94 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -1,0 +1,84 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_copy_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_copy_neon| PROC
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #28]
+
+ cmp r4, #32
+ bgt copy64
+ beq copy32
+ cmp r4, #8
+ bgt copy16
+ beq copy8
+ b copy4
+
+copy64
+ sub lr, r1, #32
+ sub r3, r3, #32
+copy64_h
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0]!
+ vld1.8 {q2-q3}, [r0], lr
+ vst1.8 {q0-q1}, [r2@128]!
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #1
+ bgt copy64_h
+ pop {r4-r5, pc}
+
+copy32
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0-q1}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q2-q3}, [r0], r1
+ vst1.8 {q0-q1}, [r2@128], r3
+ vst1.8 {q2-q3}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy32
+ pop {r4-r5, pc}
+
+copy16
+ pld [r0, r1, lsl #1]
+ vld1.8 {q0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {q1}, [r0], r1
+ vst1.8 {q0}, [r2@128], r3
+ vst1.8 {q1}, [r2@128], r3
+ subs r5, r5, #2
+ bgt copy16
+ pop {r4-r5, pc}
+
+copy8
+ pld [r0, r1, lsl #1]
+ vld1.8 {d0}, [r0], r1
+ pld [r0, r1, lsl #1]
+ vld1.8 {d2}, [r0], r1
+ vst1.8 {d0}, [r2@64], r3
+ vst1.8 {d2}, [r2@64], r3
+ subs r5, r5, #2
+ bgt copy8
+ pop {r4-r5, pc}
+
+copy4
+ ldr r12, [r0], r1
+ str r12, [r2], r3
+ subs r5, r5, #1
+ bgt copy4
+ pop {r4-r5, pc}
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -1,0 +1,82 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+ * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
+ */
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ int intermediate_height = h + 7;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_convolve8_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the
+ * given height and filter a multiple of 4 lines. Since this goes in to
+ * the temp buffer which has lots of extra room and is subsequently discarded
+ * this is safe if somewhat less than ideal.
+ */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_convolve8_vert_neon(temp + 64 * 3, 64,
+ dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ int intermediate_height = h + 7;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_convolve8_avg_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+ vpx_convolve8_avg_vert_neon(temp + 64 * 3,
+ 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -1,0 +1,782 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst0, dst1, dst2, dst3, res2, res3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, res0, res1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, res0, res1);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ XORI_B2_128_UB(res2, res3);
+ AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8i16 filt, vec0, vec1, vec2, vec3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec0, vec1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, vec2, vec3);
+ SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
+ res3);
+ ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+ XORI_B2_128_UB(res0, res2);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
+ ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+ AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+ ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ src += (2 * src_stride);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+ vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(dst, 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ for (cnt = 0; cnt < 2; ++cnt) {
+ src0 = LD_SB(&src[cnt << 5]);
+ src2 = LD_SB(&src[16 + (cnt << 5)]);
+ src3 = LD_SB(&src[24 + (cnt << 5)]);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+ vec12);
+ VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+ vec13);
+ VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+ vec14);
+ VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+ vec15);
+ DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+ vec1, vec2, vec3);
+ DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+ vec9, vec10, vec11);
+ DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+ vec1, vec2, vec3);
+ DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+ vec9, vec10, vec11);
+ ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+ PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+ PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ MIN_UH2_UH(vec2, vec3, const255);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 vec4, vec5, vec6, vec7, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
+ PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+ dst6);
+ AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v8u16 vec0, vec1, vec2, vec3, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+ dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v8u16 vec0, vec1, vec2, vec3, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+ dst, dst_stride);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(res0, res1, res2, res3, const255);
+ MIN_UH4_UH(res4, res5, res6, res7, const255);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+
+ for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(res0, res1, res2, res3, const255);
+ MIN_UH4_UH(res4, res5, res6, res7, const255);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ dst += dst_stride;
+ PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+ res2, res3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+ res6, res7);
+ SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+ SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+ MIN_UH4_UH(res0, res1, res2, res3, const255);
+ MIN_UH4_UH(res4, res5, res6, res7, const255);
+ LD_UB2(dst, 16, dst0, dst1);
+ PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+ PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+ dst += dst_stride;
+ LD_UB2(dst, 16, dst2, dst3);
+ PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+ PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, dst0, dst1, dst2, dst3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src2, src4, src6);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+ PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+ PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+ PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_hor[8];
+
+ if (16 != x_step_q4) {
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -1,0 +1,679 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+ SRARI_H2_SH(res0, res1, FILTER_BITS);
+ SAT_SH2_SH(res0, res1, 7);
+ PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+ XORI_B2_128_UB(tmp0, tmp1);
+ AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+ ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ vec0 = vec2;
+ vec1 = vec3;
+ vec2 = vec4;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 dst0, dst1, dst2, dst3, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+ dst4, dst6);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
+ res2, res3);
+ AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+ res2, res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+ dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else {
+ common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert,
+ height);
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ if (16 != x_step_q4 || 16 != y_step_q4) {
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000 &&
+ ((const int32_t *)filter_y)[1] == 0x800000) {
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -1,0 +1,753 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3, out;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+ dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+
+ LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
+ dst2, dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+ v16i8 src10_r, src32_r, src21_r, src43_r;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ src4 = LD_SB(src);
+ src += src_stride;
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, dst0);
+
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+ v16u8 src2110, src4332, src6554, src8776, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+ dst2, dst3);
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+ src87_r, src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+ dst, dst_stride);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+ LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+ vec2, vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+ vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ if (4 == height) {
+ common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter, height);
+ }
+}
+
+static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB2(src, 16, src0, src5);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5;
+ v16u8 src6, src7, src8, src9, src10, src11, filt0;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(dst, dst_stride, dst0, dst1);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(dst + 16, dst_stride, dst2, dst3);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(dst + 32, dst_stride, dst4, dst5);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ LD_UB2(dst + 48, dst_stride, dst6, dst7);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ SAT_UH2_UH(tmp4, tmp5, 7);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ SAT_UH2_UH(tmp6, tmp7, 7);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ SAT_UH2_UH(tmp4, tmp5, 7);
+ PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ SAT_UH2_UH(tmp6, tmp7, 7);
+ PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_ver[8];
+
+ if (16 != y_step_q4) {
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+
+ break;
+ case 32:
+ common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -1,0 +1,742 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ SRARI_H2_SH(out0, out1, FILTER_BITS);
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (2 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+
+ src0 = LD_SB(src + 32);
+ src2 = LD_SB(src + 48);
+ src3 = LD_SB(src + 56);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst + 32);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ MIN_UH2_UH(vec2, vec3, const255);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16i8 res0, res1, res2, res3;
+ v8u16 vec4, vec5, vec6, vec7, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+ res2, res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3, const255, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+ ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask, out0, out1;
+ v8u16 vec0, vec1, vec2, vec3, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ PCKEV_ST_SB(out6, out7, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+
+ const255 = (v8u16) __msa_ldi_h(255);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src4 = LD_SB(src + 32);
+ src6 = LD_SB(src + 48);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ MIN_UH4_UH(out0, out1, out2, out3, const255);
+ MIN_UH4_UH(out4, out5, out6, out7, const255);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ PCKEV_ST_SB(out4, out5, dst + 32);
+ PCKEV_ST_SB(out6, out7, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_hor[8];
+
+ if (16 != x_step_q4) {
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -1,0 +1,654 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+ out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+ SAT_SH2_SH(tmp0, tmp1, 7);
+ out = PCKEV_XORI128_UB(tmp0, tmp1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out5 = hz_out9;
+ out0 = out2;
+ out1 = out3;
+ out2 = out4;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+ v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+ v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+ v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+ hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ filt_hz1, filt_hz2, filt_hz3);
+
+ filt = LD_SH(filter_vert);
+ SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+ ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+ ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+ ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B4_128_SB(src7, src8, src9, src10);
+
+ hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+
+ hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+ filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+ out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+ filt_vt2, filt_vt3);
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(vec0, vec1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out6 = hz_out10;
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16i8 res0, res1, res2, res3;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+ v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LD_UH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LD_SB(src);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+ hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+ hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+ hz_out3, hz_out5, 8);
+ hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+ vec4, vec5, vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+ res2, res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_SB(src);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ SAT_UH2_UH(tmp1, tmp2, 7);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ LD_SB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+ SAT_UH2_UH(tmp3, tmp4, 7);
+ PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+ SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+ PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+ v8i16 filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_SH(filter_horiz);
+ filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LD_SH(filter_vert);
+ filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ SAT_UH2_UH(tmp1, tmp2, 7);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ SAT_UH2_UH(tmp1, tmp2, 7);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ SAT_UH2_UH(tmp1, tmp2, 7);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+ SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+ SAT_UH2_UH(tmp1, tmp2, 7);
+ PCKEV_ST_SB(tmp1, tmp2, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t x_step_q4,
+ const int16_t *filter_y, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ if (16 != x_step_q4 || 16 != y_step_q4) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000 &&
+ ((const int32_t *)filter_y)[1] == 0x800000) {
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ default:
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -1,0 +1,745 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v16u8 out;
+ v8i16 filt, out10, out32;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+ src32_r, src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+ src32_l, src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 64);
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16u8 filt0;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LD_SB(src);
+ src += src_stride;
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+ src87_r, src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+ vec2, vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+ vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src5 = LD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ SAT_UH2_UH(tmp4, tmp5, 7);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ SAT_UH2_UH(tmp6, tmp7, 7);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ SAT_UH2_UH(tmp0, tmp1, 7);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ SAT_UH2_UH(tmp2, tmp3, 7);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ SAT_UH2_UH(tmp4, tmp5, 7);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ SAT_UH2_UH(tmp6, tmp7, 7);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_ver[8];
+
+ if (16 != y_step_q4) {
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -1,0 +1,232 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ out2 = __msa_copy_u_w((v4i32)dst2, 0);
+ out3 = __msa_copy_u_w((v4i32)dst3, 0);
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LD_UB2(dst, dst_stride, dst0, dst1);
+
+ AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ SW(out0, dst);
+ dst += dst_stride;
+ SW(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+
+ out0 = __msa_copy_u_d((v2i64)dst0, 0);
+ out1 = __msa_copy_u_d((v2i64)dst1, 0);
+ out2 = __msa_copy_u_d((v2i64)dst2, 0);
+ out3 = __msa_copy_u_d((v2i64)dst3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+ ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
+ dst += (8 * dst_stride);
+ }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 8); cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+ LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+ dst_dup += (4 * dst_stride);
+ LD_UB4(src, src_stride, src8, src10, src12, src14);
+ LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+ src += (4 * src_stride);
+ LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+ LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+ dst_dup += (4 * dst_stride);
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+ dst8, dst9, dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+ dst12, dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+ ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+ ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+ LD_UB4(src, 16, src8, src9, src10, src11);
+ src += src_stride;
+ LD_UB4(src, 16, src12, src13, src14, src15);
+ src += src_stride;
+
+ LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
+ dst_dup += dst_stride;
+ LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
+ dst_dup += dst_stride;
+
+ AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3);
+ AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7);
+ AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+ dst8, dst9, dst10, dst11);
+ AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+ dst12, dst13, dst14, dst15);
+
+ ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
+ dst += dst_stride;
+ ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ avg_width4_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int32_t lp, cnt;
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -1,0 +1,247 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+ dst_tmp, dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+ dst += (8 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+ /* 1 word storage */
+ for (cnt = h; cnt--;) {
+ tmp = LW(src);
+ SW(tmp, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
--- /dev/null
+++ b/vpx_dsp/mips/vpx_convolve_msa.h
@@ -1,0 +1,119 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
+ filt0, filt1, filt2, filt3) ({ \
+ v8i16 tmp0, tmp1; \
+ \
+ tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
+ tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
+ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
+ \
+ tmp0; \
+})
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 hz_out_m; \
+ \
+ VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
+ vec0_m, vec1_m, vec2_m, vec3_m); \
+ hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
+ filt_h0, filt_h1, filt_h2, filt_h3); \
+ \
+ hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
+ hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
+ \
+ hz_out_m; \
+})
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1) { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = PCKEV_XORI128_UB(in1, in0); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
+ pdst, stride) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+}
+#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
--- /dev/null
+++ b/vpx_dsp/vpx_convolve.c
@@ -1,0 +1,558 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint8_t temp[135 * 64];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ x_filters, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve(src, src_stride, dst, dst_stride,
+ filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_convolve8_c(src, src_stride, temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int r;
+
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x)
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4,
+ int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters,
+ int x0_q4, int x_step_q4,
+ int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h,
+ int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel_highbd(
+ ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h,
+ int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const InterpKernel *const y_filters,
+ int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ uint16_t temp[64 * 135];
+ int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ x_filters, x0_q4, x_step_q4, w,
+ intermediate_height, bd);
+ highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
+ 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
+ w, h, bd);
+}
+
+
+void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ highbd_convolve(src, src_stride, dst, dst_stride,
+ filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ // Fixed size intermediate buffer places limits on parameters.
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
+ assert(w <= 64);
+ assert(h <= 64);
+
+ vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
+ vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+ NULL, 0, NULL, 0, w, h, bd);
+}
+
+void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ int r;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w * sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ int x, y;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+#endif
--- /dev/null
+++ b/vpx_dsp/vpx_convolve.h
@@ -1,0 +1,38 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_DSP_VPX_CONVOLVE_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VPX_CONVOLVE_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -54,6 +54,54 @@
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
endif # CONFIG_VP9
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += vpx_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+endif
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
+endif
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+
# loop filters
DSP_SRCS-yes += loopfilter.c
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -11,8 +11,6 @@
#ifndef VPX_DSP_COMMON_H_
#define VPX_DSP_COMMON_H_
-#include <stdlib.h>
-
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -34,6 +34,12 @@
}
}
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
# functions that are 64 bit only.
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
if ($opts{arch} eq "x86_64") {
@@ -364,6 +370,62 @@
specialize qw/vpx_highbd_dc_128_predictor_32x32/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_copy/;
+
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_avg/;
+
+ add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+} # CONFIG_VP9_HIGHBITDEPTH
#
# Loopfilter
--- /dev/null
+++ b/vpx_dsp/vpx_filter.h
@@ -1,0 +1,34 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VPX_FILTER_H_
+#define VPX_DSP_VPX_FILTER_H_
+
+#include "vpx/vpx_integer.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VPX_FILTER_H_
--- /dev/null
+++ b/vpx_dsp/x86/convolve.h
@@ -1,0 +1,296 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_CONVOLVE_H_
+#define VPX_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+ const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vpx_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h); \
+ } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 7); \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 1); \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
+ } else { \
+ vpx_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+ const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter,
+ int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+ ptrdiff_t src_stride, \
+ uint8_t *dst8, \
+ ptrdiff_t dst_stride, \
+ const int16_t *filter_x, \
+ int x_step_q4, \
+ const int16_t *filter_y, \
+ int y_step_q4, \
+ int w, int h, int bd) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h, int bd) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 7, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+ 64, dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 1, bd); \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+ dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+ } else { \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h, bd); \
+ } \
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_DSP_X86_CONVOLVE_H_
--- /dev/null
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -1,0 +1,162 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
+
+#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ sse2);
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2);
+HIGH_FUN_CONV_2D(avg_ , sse2);
+#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+#endif // HAVE_SSE2
--- /dev/null
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -1,0 +1,158 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+ mov r4d, dword wm
+ cmp r4d, 4
+ je .w4
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+16]
+ pavgb m2, [dstq+32]
+ pavgb m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq +16]
+ pavgb m2, [dstq+dst_strideq]
+ pavgb m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+INIT_MMX sse
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavgb m0, m4
+ pavgb m1, m5
+ pavgb m2, m6
+ pavgb m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg
--- /dev/null
+++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -1,0 +1,962 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+ punpcklwd xmm0, xmm6 ;two row in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+;void vpx_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 1, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 1, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -1,0 +1,494 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+ punpcklwd xmm0, xmm1 ;two row in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm8, rdx
+ movq xmm5, rcx
+ pshufd xmm8, xmm8, 0b
+ movdqa xmm1, xmm8
+ psllw xmm8, xmm5
+ psubw xmm8, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm9, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm9, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm9, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm9, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm9, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+ pminsw xmm2, xmm8
+ pmaxsw xmm2, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -1,0 +1,602 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// immintrin.h.
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_ports/mem.h"
+
+// filters for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+#if defined(__clang__)
+# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
+# define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+# else // clang > 3.3, and not 5.0 on macosx.
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif // clang <= 3.3
+#elif defined(__GNUC__)
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+# define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+# else // gcc > 4.7
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif // gcc <= 4.6
+#else // !(gcc || clang)
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i-=2) {
+ // load the 2 strides of source
+ srcReg32b1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+ srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+ _mm_loadu_si128((const __m128i *)
+ (src_ptr+src_pixels_per_line-3)), 1);
+
+ // filter the source buffer
+ srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+ _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+ srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+ _mm_loadu_si128((const __m128i *)
+ (src_ptr+src_pixels_per_line+5)), 1);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+ _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+ _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+ _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
+ srcRegFilt32b2_1);
+
+ src_ptr+=src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+ // save the next 16 bits
+ _mm_store_si128((__m128i*)(output_ptr+output_pitch),
+ _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ output_ptr+=dst_stride;
+ }
+
+ // if the number of strides is odd.
+ // process only 16 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1,
+ _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+ _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg64;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32,
+ _mm256_set1_epi16(0x706u));
+
+ // multiple the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr)));
+ srcReg32b2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+ srcReg32b3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+ srcReg32b4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+ srcReg32b5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+ srcReg32b6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+ _mm256_castsi256_si128(srcReg32b2), 1);
+ srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+ _mm256_castsi256_si128(srcReg32b3), 1);
+ srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+ _mm256_castsi256_si128(srcReg32b4), 1);
+ srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+ _mm256_castsi256_si128(srcReg32b5), 1);
+ srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+ _mm256_castsi256_si128(srcReg32b6), 1);
+ srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+ _mm256_castsi256_si128(srcReg32b7), 1);
+
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+ // save
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ // save
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+
+ for (i = output_height; i > 1; i-=2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr+=src_stride;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr,
+ _mm256_castsi256_si128(srcReg32b1));
+
+ // save the next 16 bits
+ _mm_store_si128((__m128i*)(output_ptr+out_pitch),
+ _mm256_extractf128_si256(srcReg32b1, 1));
+
+ output_ptr+=dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 = _mm_unpacklo_epi8(
+ _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 = _mm_unpackhi_epi8(
+ _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
+ _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
+ _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm256_castsi256_si128(addFilterReg64));
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+ _mm256_castsi256_si128(addFilterReg64));
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
+#endif // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif // HAVE_AX2 && HAVE_SSSE3
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -1,0 +1,601 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// tmmintrin.h.
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+
+void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+ // duplicate only the third 16 bit in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+ // duplicate only the seconds 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+ // duplicate only the forth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // loading the local filters
+ shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+ shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // extract the higher half of the lane
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+ minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+ // add and saturate all the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr+=src_pixels_per_line;
+
+ // save only 4 bytes
+ *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+ // add and saturate all the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and forth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes.
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ // filter the source buffer
+ srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
+ srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ src_ptr+=src_pixels_per_line;
+
+ // save 16 bytes
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+
+ output_ptr+=output_pitch;
+ }
+}
+
+void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, minReg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the result together
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ // merge the result together
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+ // add and saturate the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
+
+static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+ // duplicate only the forth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ // load the first 7 rows of 16 bytes
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 16 bytes
+ srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the result together
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+
+ // merge the result together
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
+
+ // merge the result together
+ srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
+ srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_min_epi16(srcRegFilt3, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_min_epi16(srcRegFilt6, srcRegFilt8));
+
+ // add and saturate the results together
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
+ _mm_max_epi16(srcRegFilt3, srcRegFilt7));
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+ _mm_max_epi16(srcRegFilt6, srcRegFilt8));
+ srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+ // shift by 7 bit each 16 bit
+ srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
+
+ src_ptr+=src_pitch;
+
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+
+ output_ptr+=out_pitch;
+ }
+}
+
+#if ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
+#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#endif // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ ssse3);
+
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -1,0 +1,987 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+;void vpx_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 1, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 1, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -1,0 +1,1071 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro VERTx4 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.loop:
+ movd xmm0, [rsi] ;A
+ movd xmm1, [rsi + rdx] ;B
+ movd xmm2, [rsi + rdx * 2] ;C
+ movd xmm3, [rax + rdx * 2] ;D
+ movd xmm4, [rsi + rdx * 4] ;E
+ movd xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movd xmm6, [rsi + rbx] ;G
+ movd xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ movdqa xmm1, xmm2
+ paddsw xmm0, xmm6
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .loop
+%endm
+
+%macro VERTx8 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.loop:
+ movq xmm0, [rsi] ;A
+ movq xmm1, [rsi + rdx] ;B
+ movq xmm2, [rsi + rdx * 2] ;C
+ movq xmm3, [rax + rdx * 2] ;D
+ movq xmm4, [rsi + rdx * 4] ;E
+ movq xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movq xmm6, [rsi + rbx] ;G
+ movq xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm6
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .loop
+%endm
+
+
+%macro VERTx16 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.loop:
+ movq xmm0, [rsi] ;A
+ movq xmm1, [rsi + rdx] ;B
+ movq xmm2, [rsi + rdx * 2] ;C
+ movq xmm3, [rax + rdx * 2] ;D
+ movq xmm4, [rsi + rdx * 4] ;E
+ movq xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movq xmm6, [rsi + rbx] ;G
+ movq xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm6
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+
+ movq xmm0, [rsi + 8] ;A
+ movq xmm1, [rsi + rdx + 8] ;B
+ movq xmm2, [rsi + rdx * 2 + 8] ;C
+ movq xmm3, [rax + rdx * 2 + 8] ;D
+ movq xmm4, [rsi + rdx * 4 + 8] ;E
+ movq xmm5, [rax + rdx * 4 + 8] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movq xmm6, [rsi + rbx + 8] ;G
+ movq xmm7, [rax + rbx + 8] ;H
+ punpcklbw xmm6, xmm7 ;G H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm6
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+%if %1
+ movq xmm1, [rdi+8]
+ pavgb xmm0, xmm1
+%endif
+
+ movq [rdi+8], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .loop
+%endm
+
+;void vpx_filter_block1d8_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx4 0
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx8 0
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx16 0
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx4 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx8 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx16 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%macro HORIZx4_ROW 2
+ movdqa %2, %1
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pmaddubsw %1, k0k1k4k5
+ pmaddubsw %2, k2k3k6k7
+
+ movdqa xmm4, %1
+ movdqa xmm5, %2
+ psrldq %1, 8
+ psrldq %2, 8
+ movdqa xmm6, xmm5
+
+ paddsw xmm4, %2
+ pmaxsw xmm5, %1
+ pminsw %1, xmm6
+ paddsw %1, xmm4
+ paddsw %1, xmm5
+
+ paddsw %1, krd
+ psraw %1, 7
+ packuswb %1, %1
+%endm
+
+%macro HORIZx4 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm6, xmm4, 0b ;k0_k1
+ pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
+ pshuflw xmm7, xmm4, 01010101b ;k2_k3
+ pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
+ pshufd xmm5, xmm5, 0 ;rounding
+
+ movdqa k0k1k4k5, xmm6
+ movdqa k2k3k6k7, xmm7
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+ shr rcx, 1
+.loop:
+ ;Do two rows once
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm1, [rsi + 5]
+ movq xmm2, [rsi + rax - 3]
+ movq xmm3, [rsi + rax + 5]
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+
+ HORIZx4_ROW xmm0, xmm1
+ HORIZx4_ROW xmm2, xmm3
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+ movd xmm3, [rdi + rdx]
+ pavgb xmm2, xmm3
+%endif
+ movd [rdi], xmm0
+ movd [rdi +rdx], xmm2
+
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
+
+ dec rcx
+ jnz .loop
+
+ ; Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
+
+ movq xmm0, [rsi - 3] ; load src
+ movq xmm1, [rsi + 5]
+ punpcklqdq xmm0, xmm1
+
+ HORIZx4_ROW xmm0, xmm1
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+.done
+%endm
+
+%macro HORIZx8_ROW 4
+ movdqa %2, %1
+ movdqa %3, %1
+ movdqa %4, %1
+
+ pshufb %1, [GLOBAL(shuf_t0t1)]
+ pshufb %2, [GLOBAL(shuf_t2t3)]
+ pshufb %3, [GLOBAL(shuf_t4t5)]
+ pshufb %4, [GLOBAL(shuf_t6t7)]
+
+ pmaddubsw %1, k0k1
+ pmaddubsw %2, k2k3
+ pmaddubsw %3, k4k5
+ pmaddubsw %4, k6k7
+
+ paddsw %1, %4
+ movdqa %4, %2
+ pmaxsw %2, %3
+ pminsw %3, %4
+ paddsw %1, %3
+ paddsw %1, %2
+
+ paddsw %1, krd
+ psraw %1, 7
+ packuswb %1, %1
+%endm
+
+%macro HORIZx8 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+ shr rcx, 1
+
+.loop:
+ movq xmm0, [rsi - 3] ;load src
+ movq xmm3, [rsi + 5]
+ movq xmm4, [rsi + rax - 3]
+ movq xmm7, [rsi + rax + 5]
+ punpcklqdq xmm0, xmm3
+ punpcklqdq xmm4, xmm7
+
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+ HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
+%if %1
+ movq xmm1, [rdi]
+ movq xmm2, [rdi + rdx]
+ pavgb xmm0, xmm1
+ pavgb xmm4, xmm2
+%endif
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm4
+
+ lea rsi, [rsi + rax]
+ prefetcht0 [rsi + 4 * rax - 3]
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + 2 * rdx]
+ prefetcht0 [rsi + 2 * rax - 3]
+ dec rcx
+ jnz .loop
+
+ ;Do last row if output_height is odd
+ movsxd rcx, dword ptr arg(4) ;output_height
+ and rcx, 1
+ je .done
+
+ movq xmm0, [rsi - 3]
+ movq xmm3, [rsi + 5]
+ punpcklqdq xmm0, xmm3
+
+ HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+.done
+%endm
+
+%macro HORIZx16 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movq xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+.loop:
+ prefetcht0 [rsi + 2 * rax -3]
+
+ movq xmm0, [rsi - 3] ;load src data
+ movq xmm4, [rsi + 5]
+ movq xmm6, [rsi + 13]
+ punpcklqdq xmm0, xmm4
+ punpcklqdq xmm4, xmm6
+
+ movdqa xmm7, xmm0
+
+ punpcklbw xmm7, xmm7
+ punpckhbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+
+ palignr xmm0, xmm7, 1
+ palignr xmm1, xmm7, 5
+ pmaddubsw xmm0, k0k1
+ palignr xmm2, xmm7, 9
+ pmaddubsw xmm1, k2k3
+ palignr xmm3, xmm7, 13
+
+ pmaddubsw xmm2, k4k5
+ pmaddubsw xmm3, k6k7
+ paddsw xmm0, xmm3
+
+ movdqa xmm3, xmm4
+ punpcklbw xmm3, xmm3
+ punpckhbw xmm4, xmm4
+
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+
+ palignr xmm4, xmm3, 1
+ palignr xmm5, xmm3, 5
+ palignr xmm6, xmm3, 9
+ palignr xmm7, xmm3, 13
+
+ movdqa xmm3, xmm1
+ pmaddubsw xmm4, k0k1
+ pmaxsw xmm1, xmm2
+ pmaddubsw xmm5, k2k3
+ pminsw xmm2, xmm3
+ pmaddubsw xmm6, k4k5
+ paddsw xmm0, xmm2
+ pmaddubsw xmm7, k6k7
+ paddsw xmm0, xmm1
+
+ paddsw xmm4, xmm7
+ movdqa xmm7, xmm5
+ pmaxsw xmm5, xmm6
+ pminsw xmm6, xmm7
+ paddsw xmm4, xmm6
+ paddsw xmm4, xmm5
+
+ paddsw xmm0, krd
+ paddsw xmm4, krd
+ psraw xmm0, 7
+ psraw xmm4, 7
+ packuswb xmm0, xmm0
+ packuswb xmm4, xmm4
+ punpcklqdq xmm0, xmm4
+%if %1
+ movdqa xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ lea rsi, [rsi + rax]
+ movdqa [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+%endm
+
+;void vpx_filter_block1d4_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
+ HORIZx4 0
+
+ add rsp, 16 * 3
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d8_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx8 0
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vpx_filter_block1d16_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx16 0
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
+ HORIZx4 1
+
+ add rsp, 16 * 3
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx8 1
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx16 1
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+SECTION_RODATA
+align 16
+shuf_t0t1:
+ db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+align 16
+shuf_t2t3:
+ db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+align 16
+shuf_t4t5:
+ db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+align 16
+shuf_t6t7:
+ db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -1,0 +1,448 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
--- /dev/null
+++ b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -1,0 +1,422 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movq xmm2, rcx ;rounding
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ paddsw xmm0, xmm2 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movq xmm6, rcx ;rounding
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ paddsw xmm0, xmm6 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ paddsw xmm0, xmm6 ;rounding
+ paddsw xmm2, xmm6
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 1
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret