ref: fa96eeb835c88522740a96dcda77f8685fae96a5
parent: 1fa04e1a03c2a77ce1575ef8e7c9d15e2334ad3a
author: Ronald S. Bultje <[email protected]>
date: Tue Jun 11 11:19:14 EDT 2013
Implement SSE version for sad4x8x4d and SSE2 version for sad8x4x4d. Encoding time of crew (CIF, first 50 frames) @ 1500kbps goes from 4min56 to 4min42. Change-Id: I92c0c8b32980d2ae7c6dafc8b883a2c7fcd14a9f
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -332,15 +332,31 @@
#if CONFIG_VP9_ENCODER
const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c;
const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c;
const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c;
const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c;
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c;
const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c;
INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
make_tuple(64, 64, sad_64x64x4d_c),
+ make_tuple(64, 32, sad_64x32x4d_c),
+ make_tuple(32, 64, sad_32x64x4d_c),
make_tuple(32, 32, sad_32x32x4d_c),
+ make_tuple(32, 16, sad_32x16x4d_c),
+ make_tuple(16, 32, sad_16x32x4d_c),
make_tuple(16, 16, sad_16x16x4d_c),
+ make_tuple(16, 8, sad_16x8x4d_c),
+ make_tuple(8, 16, sad_8x16x4d_c),
make_tuple(8, 8, sad_8x8x4d_c),
+ make_tuple(8, 4, sad_8x4x4d_c),
+ make_tuple(4, 8, sad_4x8x4d_c),
make_tuple(4, 4, sad_4x4x4d_c)));
#endif
@@ -407,8 +423,10 @@
INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
make_tuple(4, 4, sad_4x4_sse_vp9)));
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
+ make_tuple(4, 8, sad_4x8x4d_sse),
make_tuple(4, 4, sad_4x4x4d_sse)));
#endif
#endif
@@ -450,18 +468,28 @@
#if CONFIG_VP9_ENCODER
const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
make_tuple(64, 64, sad_64x64x4d_sse2),
+ make_tuple(64, 32, sad_64x32x4d_sse2),
+ make_tuple(32, 64, sad_32x64x4d_sse2),
make_tuple(32, 32, sad_32x32x4d_sse2),
+ make_tuple(32, 16, sad_32x16x4d_sse2),
+ make_tuple(16, 32, sad_16x32x4d_sse2),
make_tuple(16, 16, sad_16x16x4d_sse2),
make_tuple(16, 8, sad_16x8x4d_sse2),
make_tuple(8, 16, sad_8x16x4d_sse2),
- make_tuple(8, 8, sad_8x8x4d_sse2)));
+ make_tuple(8, 8, sad_8x8x4d_sse2),
+ make_tuple(8, 4, sad_8x4x4d_sse2)));
#endif
#endif
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -499,13 +499,14 @@
# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x4x4d
+specialize vp9_sad8x4x4d sse2
prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x8x4d
+specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
+
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -224,6 +224,8 @@
SADNXN4D 16, 8
SADNXN4D 8, 16
SADNXN4D 8, 8
+SADNXN4D 8, 4
INIT_MMX sse
+SADNXN4D 4, 8
SADNXN4D 4, 4