shithub: libvpx

Download patch

ref: fa96eeb835c88522740a96dcda77f8685fae96a5
parent: 1fa04e1a03c2a77ce1575ef8e7c9d15e2334ad3a
author: Ronald S. Bultje <[email protected]>
date: Tue Jun 11 11:19:14 EDT 2013

Implement SSE version for sad4x8x4d and SSE2 version for sad8x4x4d.

Encoding time of crew (CIF, first 50 frames) @ 1500kbps goes from 4min56
to 4min42.

Change-Id: I92c0c8b32980d2ae7c6dafc8b883a2c7fcd14a9f

--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -332,15 +332,31 @@
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c;
 const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c;
 const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c;
 const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c;
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c;
 const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c;
 INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
                         make_tuple(64, 64, sad_64x64x4d_c),
+                        make_tuple(64, 32, sad_64x32x4d_c),
+                        make_tuple(32, 64, sad_32x64x4d_c),
                         make_tuple(32, 32, sad_32x32x4d_c),
+                        make_tuple(32, 16, sad_32x16x4d_c),
+                        make_tuple(16, 32, sad_16x32x4d_c),
                         make_tuple(16, 16, sad_16x16x4d_c),
+                        make_tuple(16, 8, sad_16x8x4d_c),
+                        make_tuple(8, 16, sad_8x16x4d_c),
                         make_tuple(8, 8, sad_8x8x4d_c),
+                        make_tuple(8, 4, sad_8x4x4d_c),
+                        make_tuple(4, 8, sad_4x8x4d_c),
                         make_tuple(4, 4, sad_4x4x4d_c)));
 #endif
 
@@ -407,8 +423,10 @@
 INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
                         make_tuple(4, 4, sad_4x4_sse_vp9)));
 
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
 const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
+                        make_tuple(4, 8, sad_4x8x4d_sse),
                         make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
 #endif
@@ -450,18 +468,28 @@
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
                         make_tuple(64, 64, sad_64x64x4d_sse2),
+                        make_tuple(64, 32, sad_64x32x4d_sse2),
+                        make_tuple(32, 64, sad_32x64x4d_sse2),
                         make_tuple(32, 32, sad_32x32x4d_sse2),
+                        make_tuple(32, 16, sad_32x16x4d_sse2),
+                        make_tuple(16, 32, sad_16x32x4d_sse2),
                         make_tuple(16, 16, sad_16x16x4d_sse2),
                         make_tuple(16, 8, sad_16x8x4d_sse2),
                         make_tuple(8, 16, sad_8x16x4d_sse2),
-                        make_tuple(8, 8, sad_8x8x4d_sse2)));
+                        make_tuple(8, 8, sad_8x8x4d_sse2),
+                        make_tuple(8, 4, sad_8x4x4d_sse2)));
 #endif
 #endif
 
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -499,13 +499,14 @@
 
 # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
 prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x4x4d
+specialize vp9_sad8x4x4d sse2
 
 prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x8x4d
+specialize vp9_sad4x8x4d sse
 
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
+
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
 
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -224,6 +224,8 @@
 SADNXN4D 16,  8
 SADNXN4D  8, 16
 SADNXN4D  8,  8
+SADNXN4D  8,  4
 
 INIT_MMX sse
+SADNXN4D  4,  8
 SADNXN4D  4,  4