shithub: libvpx

Download patch

ref: 7eee487c0036340e99425cc1cf1503e21e70678a
parent: 08d86bc9043f55d86f20f4bab74bc4ca949b3a4c
author: Peter de Rivaz <[email protected]>
date: Thu Oct 16 09:41:55 EDT 2014

Added highbitdepth sse2 SAD acceleration and tests

Change-Id: I1a74a1b032b198793ef9cc526327987f7799125f
(cherry picked from commit b1a6f6b9cb47eafe0ce86eaf0318612806091fe5)

--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -27,6 +27,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_codec.h"
 
 
 #if CONFIG_VP8_ENCODER
@@ -35,7 +36,7 @@
                                    const unsigned char *reference_ptr,
                                    int reference_stride,
                                    unsigned int max_sad);
-typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam;
+typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
 #endif
 #if CONFIG_VP9_ENCODER
 typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr,
@@ -42,15 +43,21 @@
                                       int source_stride,
                                       const unsigned char *reference_ptr,
                                       int reference_stride);
-typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param;
+typedef std::tr1::tuple<int, int, SadMxNVp9Func, int> SadMxNVp9Param;
+typedef uint32_t (*SadMxNAvgVp9Func)(const uint8_t *source_ptr,
+                                     int source_stride,
+                                     const uint8_t *reference_ptr,
+                                     int reference_stride,
+                                     const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgVp9Func, int> SadMxNAvgVp9Param;
 #endif
 
 typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
                              int src_stride,
-                             const unsigned char *const ref_ptr[],
+                             const uint8_t *const ref_ptr[],
                              int ref_stride,
-                             unsigned int *sad_array);
-typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param;
+                             uint32_t *sad_array);
+typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
 
 using libvpx_test::ACMRandom;
 
@@ -57,20 +64,55 @@
 namespace {
 class SADTestBase : public ::testing::Test {
  public:
-  SADTestBase(int width, int height) : width_(width), height_(height) {}
+  SADTestBase(int width, int height, int bit_depth) :
+      width_(width), height_(height), bd_(bit_depth) {}
 
   static void SetUpTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    source_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+    reference_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+    source_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+#else
     source_data_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+#endif
   }
 
   static void TearDownTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(source_data8_);
+    source_data8_ = NULL;
+    vpx_free(reference_data8_);
+    reference_data8_ = NULL;
+    vpx_free(second_pred8_);
+    second_pred8_ = NULL;
+    vpx_free(source_data16_);
+    source_data16_ = NULL;
+    vpx_free(reference_data16_);
+    reference_data16_ = NULL;
+    vpx_free(second_pred16_);
+    second_pred16_ = NULL;
+#else
     vpx_free(source_data_);
     source_data_ = NULL;
     vpx_free(reference_data_);
     reference_data_ = NULL;
+    vpx_free(second_pred_);
+    second_pred_ = NULL;
+#endif
   }
 
   virtual void TearDown() {
@@ -84,13 +126,38 @@
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = VPX_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+    }
+#endif
+    mask_ = (1 << bit_depth_) - 1;
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  virtual uint8_t* GetReference(int block_idx) {
+  virtual uint8_t *GetReference(int block_idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (!use_high_bit_depth_) {
+      return reference_data_ + block_idx * kDataBlockSize;
+    } else {
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+    }
+#else
     return reference_data_ + block_idx * kDataBlockSize;
+#endif
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
@@ -97,12 +164,79 @@
   // difference between two pixels in the same relative location; accumulate.
   unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
     unsigned int sad = 0;
-    const uint8_t* const reference = GetReference(block_idx);
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint8_t *const reference8 = GetReference(block_idx);
+      const uint8_t *const source8 = source_data_;
+      const uint16_t *const reference16 =
+          CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#else
+    const uint8_t *const reference = GetReference(block_idx);
+    const uint8_t *const source = source_data_;
+#endif
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          sad +=
+              abs(source8[h * source_stride_ + w] -
+                  reference8[h * reference_stride_ + w]);
+        } else {
+          sad +=
+              abs(source16[h * source_stride_ + w] -
+                  reference16[h * reference_stride_ + w]);
+        }
+#else
+        sad +=
+            abs(source[h * source_stride_ + w] -
+                reference[h * reference_stride_ + w]);
+#endif
+      }
+      if (sad > max_sad) {
+        break;
+      }
+    }
+    return sad;
+  }
 
+  // Sum of Absolute Differences Average. Given two blocks and a second
+  // prediction, calculate the absolute difference between each source pixel
+  // and the average of the reference and predicted pixels; accumulate.
+  unsigned int ReferenceSADavg(unsigned int max_sad, int block_idx) {
+    unsigned int sad = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint8_t *const reference8 = GetReference(block_idx);
+      const uint8_t *const source8 = source_data_;
+      const uint8_t *const second_pred8 = second_pred_;
+      const uint16_t *const reference16 =
+          CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+      const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#else
+    const uint8_t *const reference = GetReference(block_idx);
+    const uint8_t *const source = source_data_;
+    const uint8_t *const second_pred = second_pred_;
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
-        sad += abs(source_data_[h * source_stride_ + w]
-               - reference[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          const int tmp = second_pred8[h * width_ + w] +
+              reference8[h * reference_stride_ + w];
+          const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source8[h * source_stride_ + w] - comp_pred);
+        } else {
+          const int tmp = second_pred16[h * width_ + w] +
+              reference16[h * reference_stride_ + w];
+          const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source16[h * source_stride_ + w] - comp_pred);
+        }
+#else
+        const int tmp = second_pred[h * width_ + w] +
+            reference[h * reference_stride_ + w];
+        const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+        sad += abs(source[h * source_stride_ + w] - comp_pred);
+#endif
       }
       if (sad > max_sad) {
         break;
@@ -111,26 +245,61 @@
     return sad;
   }
 
-  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = fill_constant;
+        } else {
+          data16[h * stride + w] = fill_constant;
+        }
+#else
         data[h * stride + w] = fill_constant;
+#endif
       }
     }
   }
 
   void FillRandom(uint8_t *data, int stride) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint8_t *data8 = data;
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = rnd_.Rand8();
+        } else {
+          data16[h * stride + w] = rnd_.Rand16() & mask_;
+        }
+#else
         data[h * stride + w] = rnd_.Rand8();
+#endif
       }
     }
   }
 
-  int width_, height_;
-  static uint8_t* source_data_;
+  int width_, height_, mask_, bd_;
+  vpx_bit_depth_t bit_depth_;
+  static uint8_t *source_data_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
   int source_stride_;
-  static uint8_t* reference_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+  bool use_high_bit_depth_;
+  static uint8_t *source_data8_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *source_data16_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
+#endif
   int reference_stride_;
 
   ACMRandom rnd_;
@@ -140,11 +309,11 @@
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNx4Param> {
  public:
-  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   void SADs(unsigned int *results) {
-    const uint8_t* refs[] = {GetReference(0), GetReference(1),
+    const uint8_t *refs[] = {GetReference(0), GetReference(1),
                              GetReference(2), GetReference(3)};
 
     ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
@@ -169,12 +338,12 @@
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNParam> {
  public:
-  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   unsigned int SAD(unsigned int max_sad, int block_idx) {
     unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
+    const uint8_t *const reference = GetReference(block_idx);
 
     ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
@@ -201,12 +370,12 @@
     : public SADTestBase,
       public ::testing::WithParamInterface<SadMxNVp9Param> {
  public:
-  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+  SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
 
  protected:
   unsigned int SAD(int block_idx) {
     unsigned int ret;
-    const uint8_t* const reference = GetReference(block_idx);
+    const uint8_t *const reference = GetReference(block_idx);
 
     ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
@@ -220,20 +389,54 @@
     ASSERT_EQ(reference_sad, exp_sad);
   }
 };
+
+class SADavgVP9Test
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNAvgVp9Param> {
+ public:
+  SADavgVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD_avg(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADavg(UINT_MAX, 0);
+    const unsigned int exp_sad = SAD_avg(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
 #endif  // CONFIG_VP9_ENCODER
 
-uint8_t* SADTestBase::source_data_ = NULL;
-uint8_t* SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+#if CONFIG_VP9_ENCODER && CONFIG_VP9_HIGHBITDEPTH
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+#endif
 
 #if CONFIG_VP8_ENCODER
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, mask_);
   CheckSAD(UINT_MAX);
 }
 
 TEST_P(SADTest, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD(UINT_MAX);
 }
@@ -270,7 +473,7 @@
 TEST_P(SADTest, MaxSAD) {
   // Verify that, when max_sad is set, the implementation does not return a
   // value lower than the reference.
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD(128);
 }
@@ -279,12 +482,12 @@
 #if CONFIG_VP9_ENCODER
 TEST_P(SADVP9Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(reference_data_, reference_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, mask_);
   CheckSAD();
 }
 
 TEST_P(SADVP9Test, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSAD();
 }
@@ -317,19 +520,64 @@
   CheckSAD();
   source_stride_ = tmp_stride;
 }
+
+TEST_P(SADavgVP9Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+TEST_P(SADavgVP9Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADavgVP9Test, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgVP9Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgVP9Test, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
 #endif  // CONFIG_VP9_ENCODER
 
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
-  FillConstant(GetReference(0), reference_stride_, 255);
-  FillConstant(GetReference(1), reference_stride_, 255);
-  FillConstant(GetReference(2), reference_stride_, 255);
-  FillConstant(GetReference(3), reference_stride_, 255);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
   CheckSADs();
 }
 
 TEST_P(SADx4Test, MaxSrc) {
-  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(source_data_, source_stride_, mask_);
   FillConstant(GetReference(0), reference_stride_, 0);
   FillConstant(GetReference(1), reference_stride_, 0);
   FillConstant(GetReference(2), reference_stride_, 0);
@@ -375,6 +623,18 @@
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADx4Test, SrcAlignedByWidth) {
+  uint8_t * tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
 using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
@@ -386,11 +646,11 @@
 const SadMxNFunc sad_8x8_c = vp8_sad8x8_c;
 const SadMxNFunc sad_4x4_c = vp8_sad4x4_c;
 const SadMxNParam c_tests[] = {
-  make_tuple(16, 16, sad_16x16_c),
-  make_tuple(8, 16, sad_8x16_c),
-  make_tuple(16, 8, sad_16x8_c),
-  make_tuple(8, 8, sad_8x8_c),
-  make_tuple(4, 4, sad_4x4_c),
+  make_tuple(16, 16, sad_16x16_c, -1),
+  make_tuple(8, 16, sad_8x16_c, -1),
+  make_tuple(16, 8, sad_16x8_c, -1),
+  make_tuple(8, 8, sad_8x8_c, -1),
+  make_tuple(4, 4, sad_4x4_c, -1),
 };
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 #endif  // CONFIG_VP8_ENCODER
@@ -406,15 +666,15 @@
 const SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c;
 const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c;
 const SadMxNVp9Param c_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_c_vp9),
-  make_tuple(32, 32, sad_32x32_c_vp9),
-  make_tuple(16, 16, sad_16x16_c_vp9),
-  make_tuple(8, 16, sad_8x16_c_vp9),
-  make_tuple(16, 8, sad_16x8_c_vp9),
-  make_tuple(8, 8, sad_8x8_c_vp9),
-  make_tuple(8, 4, sad_8x4_c_vp9),
-  make_tuple(4, 8, sad_4x8_c_vp9),
-  make_tuple(4, 4, sad_4x4_c_vp9),
+  make_tuple(64, 64, sad_64x64_c_vp9, -1),
+  make_tuple(32, 32, sad_32x32_c_vp9, -1),
+  make_tuple(16, 16, sad_16x16_c_vp9, -1),
+  make_tuple(8, 16, sad_8x16_c_vp9, -1),
+  make_tuple(16, 8, sad_16x8_c_vp9, -1),
+  make_tuple(8, 8, sad_8x8_c_vp9, -1),
+  make_tuple(8, 4, sad_8x4_c_vp9, -1),
+  make_tuple(4, 8, sad_4x8_c_vp9, -1),
+  make_tuple(4, 4, sad_4x4_c_vp9, -1),
 };
 INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
 
@@ -432,19 +692,186 @@
 const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c;
 const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c;
 INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
-                        make_tuple(64, 64, sad_64x64x4d_c),
-                        make_tuple(64, 32, sad_64x32x4d_c),
-                        make_tuple(32, 64, sad_32x64x4d_c),
-                        make_tuple(32, 32, sad_32x32x4d_c),
-                        make_tuple(32, 16, sad_32x16x4d_c),
-                        make_tuple(16, 32, sad_16x32x4d_c),
-                        make_tuple(16, 16, sad_16x16x4d_c),
-                        make_tuple(16, 8, sad_16x8x4d_c),
-                        make_tuple(8, 16, sad_8x16x4d_c),
-                        make_tuple(8, 8, sad_8x8x4d_c),
-                        make_tuple(8, 4, sad_8x4x4d_c),
-                        make_tuple(4, 8, sad_4x8x4d_c),
-                        make_tuple(4, 4, sad_4x4x4d_c)));
+                        make_tuple(64, 64, sad_64x64x4d_c, -1),
+                        make_tuple(64, 32, sad_64x32x4d_c, -1),
+                        make_tuple(32, 64, sad_32x64x4d_c, -1),
+                        make_tuple(32, 32, sad_32x32x4d_c, -1),
+                        make_tuple(32, 16, sad_32x16x4d_c, -1),
+                        make_tuple(16, 32, sad_16x32x4d_c, -1),
+                        make_tuple(16, 16, sad_16x16x4d_c, -1),
+                        make_tuple(16, 8, sad_16x8x4d_c, -1),
+                        make_tuple(8, 16, sad_8x16x4d_c, -1),
+                        make_tuple(8, 8, sad_8x8x4d_c, -1),
+                        make_tuple(8, 4, sad_8x4x4d_c, -1),
+                        make_tuple(4, 8, sad_4x8x4d_c, -1),
+                        make_tuple(4, 4, sad_4x4x4d_c, -1)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNVp9Func highbd_sad_64x64_c_vp9 = vp9_highbd_sad64x64_c;
+const SadMxNVp9Func highbd_sad_32x32_c_vp9 = vp9_highbd_sad32x32_c;
+const SadMxNVp9Func highbd_sad_16x16_c_vp9 = vp9_highbd_sad16x16_c;
+const SadMxNVp9Func highbd_sad_8x16_c_vp9 = vp9_highbd_sad8x16_c;
+const SadMxNVp9Func highbd_sad_16x8_c_vp9 = vp9_highbd_sad16x8_c;
+const SadMxNVp9Func highbd_sad_8x8_c_vp9 = vp9_highbd_sad8x8_c;
+const SadMxNVp9Func highbd_sad_8x4_c_vp9 = vp9_highbd_sad8x4_c;
+const SadMxNVp9Func highbd_sad_4x8_c_vp9 = vp9_highbd_sad4x8_c;
+const SadMxNVp9Func highbd_sad_4x4_c_vp9 = vp9_highbd_sad4x4_c;
+const SadMxNVp9Param c_vp9_highbd_8_tests[] = {
+  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 8),
+  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 8),
+  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 8),
+  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 8),
+  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 8),
+  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 8),
+  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 8),
+  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 8),
+  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 8),
+};
+INSTANTIATE_TEST_CASE_P(C_8, SADVP9Test,
+                        ::testing::ValuesIn(c_vp9_highbd_8_tests));
+
+const SadMxNVp9Param c_vp9_highbd_10_tests[] = {
+  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 10),
+  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 10),
+  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 10),
+  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 10),
+  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 10),
+  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 10),
+  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 10),
+  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 10),
+  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 10),
+};
+INSTANTIATE_TEST_CASE_P(C_10, SADVP9Test,
+                        ::testing::ValuesIn(c_vp9_highbd_10_tests));
+
+const SadMxNVp9Param c_vp9_highbd_12_tests[] = {
+  make_tuple(64, 64, highbd_sad_64x64_c_vp9, 12),
+  make_tuple(32, 32, highbd_sad_32x32_c_vp9, 12),
+  make_tuple(16, 16, highbd_sad_16x16_c_vp9, 12),
+  make_tuple(8, 16, highbd_sad_8x16_c_vp9, 12),
+  make_tuple(16, 8, highbd_sad_16x8_c_vp9, 12),
+  make_tuple(8, 8, highbd_sad_8x8_c_vp9, 12),
+  make_tuple(8, 4, highbd_sad_8x4_c_vp9, 12),
+  make_tuple(4, 8, highbd_sad_4x8_c_vp9, 12),
+  make_tuple(4, 4, highbd_sad_4x4_c_vp9, 12),
+};
+INSTANTIATE_TEST_CASE_P(C_12, SADVP9Test,
+                        ::testing::ValuesIn(c_vp9_highbd_12_tests));
+
+const SadMxNAvgVp9Func highbd_sad8x4_avg_c_vp9 = vp9_highbd_sad8x4_avg_c;
+const SadMxNAvgVp9Func highbd_sad8x8_avg_c_vp9 = vp9_highbd_sad8x8_avg_c;
+const SadMxNAvgVp9Func highbd_sad8x16_avg_c_vp9 = vp9_highbd_sad8x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x8_avg_c_vp9 = vp9_highbd_sad16x8_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x16_avg_c_vp9 = vp9_highbd_sad16x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x32_avg_c_vp9 = vp9_highbd_sad16x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x16_avg_c_vp9 = vp9_highbd_sad32x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x32_avg_c_vp9 = vp9_highbd_sad32x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x64_avg_c_vp9 = vp9_highbd_sad32x64_avg_c;
+const SadMxNAvgVp9Func highbd_sad64x32_avg_c_vp9 = vp9_highbd_sad64x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad64x64_avg_c_vp9 = vp9_highbd_sad64x64_avg_c;
+SadMxNAvgVp9Param avg_c_vp9_highbd_8_tests[] = {
+  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 8),
+  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 8),
+  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 8),
+  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 8),
+  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 8),
+  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 8),
+  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 8),
+  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 8),
+  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 8),
+  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 8),
+  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 8)};
+INSTANTIATE_TEST_CASE_P(C_8, SADavgVP9Test,
+                        ::testing::ValuesIn(avg_c_vp9_highbd_8_tests));
+
+SadMxNAvgVp9Param avg_c_vp9_highbd_10_tests[] = {
+  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 10),
+  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 10),
+  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 10),
+  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 10),
+  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 10),
+  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 10),
+  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 10),
+  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 10),
+  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 10),
+  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 10),
+  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 10)};
+INSTANTIATE_TEST_CASE_P(C_10, SADavgVP9Test,
+                        ::testing::ValuesIn(avg_c_vp9_highbd_10_tests));
+
+SadMxNAvgVp9Param avg_c_vp9_highbd_12_tests[] = {
+  make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 12),
+  make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 12),
+  make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 12),
+  make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 12),
+  make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 12),
+  make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 12),
+  make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 12),
+  make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 12),
+  make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 12),
+  make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 12),
+  make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 12)};
+INSTANTIATE_TEST_CASE_P(C_12, SADavgVP9Test,
+                        ::testing::ValuesIn(avg_c_vp9_highbd_12_tests));
+
+const SadMxNx4Func highbd_sad_64x64x4d_c = vp9_highbd_sad64x64x4d_c;
+const SadMxNx4Func highbd_sad_64x32x4d_c = vp9_highbd_sad64x32x4d_c;
+const SadMxNx4Func highbd_sad_32x64x4d_c = vp9_highbd_sad32x64x4d_c;
+const SadMxNx4Func highbd_sad_32x32x4d_c = vp9_highbd_sad32x32x4d_c;
+const SadMxNx4Func highbd_sad_32x16x4d_c = vp9_highbd_sad32x16x4d_c;
+const SadMxNx4Func highbd_sad_16x32x4d_c = vp9_highbd_sad16x32x4d_c;
+const SadMxNx4Func highbd_sad_16x16x4d_c = vp9_highbd_sad16x16x4d_c;
+const SadMxNx4Func highbd_sad_16x8x4d_c  = vp9_highbd_sad16x8x4d_c;
+const SadMxNx4Func highbd_sad_8x16x4d_c  = vp9_highbd_sad8x16x4d_c;
+const SadMxNx4Func highbd_sad_8x8x4d_c   = vp9_highbd_sad8x8x4d_c;
+const SadMxNx4Func highbd_sad_8x4x4d_c   = vp9_highbd_sad8x4x4d_c;
+const SadMxNx4Func highbd_sad_4x8x4d_c   = vp9_highbd_sad4x8x4d_c;
+const SadMxNx4Func highbd_sad_4x4x4d_c   = vp9_highbd_sad4x4x4d_c;
+INSTANTIATE_TEST_CASE_P(C_8, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, highbd_sad_64x64x4d_c, 8),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_c, 8),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_c, 8),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_c, 8),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_c, 8),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_c, 8),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_c, 8),
+                        make_tuple(16, 8,  highbd_sad_16x8x4d_c,  8),
+                        make_tuple(8,  16, highbd_sad_8x16x4d_c,  8),
+                        make_tuple(8,  8,  highbd_sad_8x8x4d_c,   8),
+                        make_tuple(8,  4,  highbd_sad_8x4x4d_c,   8),
+                        make_tuple(4,  8,  highbd_sad_4x8x4d_c,   8),
+                        make_tuple(4,  4,  highbd_sad_4x4x4d_c,   8)));
+
+INSTANTIATE_TEST_CASE_P(C_10, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, highbd_sad_64x64x4d_c, 10),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_c, 10),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_c, 10),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_c, 10),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_c, 10),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_c, 10),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_c, 10),
+                        make_tuple(16, 8,  highbd_sad_16x8x4d_c,  10),
+                        make_tuple(8,  16, highbd_sad_8x16x4d_c,  10),
+                        make_tuple(8,  8,  highbd_sad_8x8x4d_c,   10),
+                        make_tuple(8,  4,  highbd_sad_8x4x4d_c,   10),
+                        make_tuple(4,  8,  highbd_sad_4x8x4d_c,   10),
+                        make_tuple(4,  4,  highbd_sad_4x4x4d_c,   10)));
+
+INSTANTIATE_TEST_CASE_P(C_12, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, highbd_sad_64x64x4d_c, 12),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_c, 12),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_c, 12),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_c, 12),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_c, 12),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_c, 12),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_c, 12),
+                        make_tuple(16, 8,  highbd_sad_16x8x4d_c,  12),
+                        make_tuple(8,  16, highbd_sad_8x16x4d_c,  12),
+                        make_tuple(8,  8,  highbd_sad_8x8x4d_c,   12),
+                        make_tuple(8,  4,  highbd_sad_8x4x4d_c,   12),
+                        make_tuple(4,  8,  highbd_sad_4x8x4d_c,   12),
+                        make_tuple(4,  4,  highbd_sad_4x4x4d_c,   12)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP9_ENCODER
 
 //------------------------------------------------------------------------------
@@ -453,7 +880,7 @@
 #if CONFIG_VP8_ENCODER
 const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6;
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_armv6)));
+                        make_tuple(16, 16, sad_16x16_armv6, -1)));
 #endif  // CONFIG_VP8_ENCODER
 #endif  // HAVE_MEDIA
 
@@ -465,11 +892,11 @@
 const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon;
 const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon;
 INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_neon),
-                        make_tuple(8, 16, sad_8x16_neon),
-                        make_tuple(16, 8, sad_16x8_neon),
-                        make_tuple(8, 8, sad_8x8_neon),
-                        make_tuple(4, 4, sad_4x4_neon)));
+                        make_tuple(16, 16, sad_16x16_neon, -1),
+                        make_tuple(8, 16, sad_8x16_neon, -1),
+                        make_tuple(16, 8, sad_16x8_neon, -1),
+                        make_tuple(8, 8, sad_8x8_neon, -1),
+                        make_tuple(4, 4, sad_4x4_neon, -1)));
 #endif  // CONFIG_VP8_ENCODER
 #if CONFIG_VP9_ENCODER
 const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon;
@@ -477,10 +904,10 @@
 const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon;
 const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon;
 const SadMxNVp9Param neon_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_neon_vp9),
-  make_tuple(32, 32, sad_32x32_neon_vp9),
-  make_tuple(16, 16, sad_16x16_neon_vp9),
-  make_tuple(8, 8, sad_8x8_neon_vp9),
+  make_tuple(64, 64, sad_64x64_neon_vp9, -1),
+  make_tuple(32, 32, sad_32x32_neon_vp9, -1),
+  make_tuple(16, 16, sad_16x16_neon_vp9, -1),
+  make_tuple(8, 8, sad_8x8_neon_vp9, -1),
 };
 INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests));
 #endif  // CONFIG_VP9_ENCODER
@@ -496,11 +923,11 @@
 const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx;
 const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx;
 const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, sad_16x16_mmx),
-  make_tuple(8, 16, sad_8x16_mmx),
-  make_tuple(16, 8, sad_16x8_mmx),
-  make_tuple(8, 8, sad_8x8_mmx),
-  make_tuple(4, 4, sad_4x4_mmx),
+  make_tuple(16, 16, sad_16x16_mmx, -1),
+  make_tuple(8, 16, sad_8x16_mmx, -1),
+  make_tuple(16, 8, sad_16x8_mmx, -1),
+  make_tuple(8, 8, sad_8x8_mmx, -1),
+  make_tuple(4, 4, sad_4x4_mmx, -1),
 };
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
 #endif  // CONFIG_VP8_ENCODER
@@ -513,14 +940,14 @@
 const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse;
 const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
-                        make_tuple(4, 4, sad_4x4_sse_vp9),
-                        make_tuple(4, 8, sad_4x8_sse_vp9)));
+                        make_tuple(4, 4, sad_4x4_sse_vp9, -1),
+                        make_tuple(4, 8, sad_4x8_sse_vp9, -1)));
 
 const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
 const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
-                        make_tuple(4, 8, sad_4x8x4d_sse),
-                        make_tuple(4, 4, sad_4x4x4d_sse)));
+                        make_tuple(4, 8, sad_4x8x4d_sse, -1),
+                        make_tuple(4, 4, sad_4x4x4d_sse, -1)));
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_SSE
@@ -533,11 +960,11 @@
 const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt;
 const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt;
 const SadMxNParam sse2_tests[] = {
-  make_tuple(16, 16, sad_16x16_wmt),
-  make_tuple(8, 16, sad_8x16_wmt),
-  make_tuple(16, 8, sad_16x8_wmt),
-  make_tuple(8, 8, sad_8x8_wmt),
-  make_tuple(4, 4, sad_4x4_wmt),
+  make_tuple(16, 16, sad_16x16_wmt, -1),
+  make_tuple(8, 16, sad_8x16_wmt, -1),
+  make_tuple(16, 8, sad_16x8_wmt, -1),
+  make_tuple(8, 8, sad_8x8_wmt, -1),
+  make_tuple(4, 4, sad_4x4_wmt, -1),
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 #endif  // CONFIG_VP8_ENCODER
@@ -555,20 +982,6 @@
 const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
 const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
 const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-const SadMxNVp9Param sse2_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_sse2_vp9),
-  make_tuple(64, 32, sad_64x32_sse2_vp9),
-  make_tuple(32, 64, sad_32x64_sse2_vp9),
-  make_tuple(32, 32, sad_32x32_sse2_vp9),
-  make_tuple(32, 16, sad_32x16_sse2_vp9),
-  make_tuple(16, 32, sad_16x32_sse2_vp9),
-  make_tuple(16, 16, sad_16x16_sse2_vp9),
-  make_tuple(16, 8, sad_16x8_sse2_vp9),
-  make_tuple(8, 16, sad_8x16_sse2_vp9),
-  make_tuple(8, 8, sad_8x8_sse2_vp9),
-  make_tuple(8, 4, sad_8x4_sse2_vp9),
-};
-INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests));
 
 const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
 const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
@@ -581,18 +994,214 @@
 const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
 const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
 const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNVp9Func highbd_sad8x4_sse2_vp9 = vp9_highbd_sad8x4_sse2;
+const SadMxNVp9Func highbd_sad8x8_sse2_vp9 = vp9_highbd_sad8x8_sse2;
+const SadMxNVp9Func highbd_sad8x16_sse2_vp9 = vp9_highbd_sad8x16_sse2;
+const SadMxNVp9Func highbd_sad16x8_sse2_vp9 = vp9_highbd_sad16x8_sse2;
+const SadMxNVp9Func highbd_sad16x16_sse2_vp9 = vp9_highbd_sad16x16_sse2;
+const SadMxNVp9Func highbd_sad16x32_sse2_vp9 = vp9_highbd_sad16x32_sse2;
+const SadMxNVp9Func highbd_sad32x16_sse2_vp9 = vp9_highbd_sad32x16_sse2;
+const SadMxNVp9Func highbd_sad32x32_sse2_vp9 = vp9_highbd_sad32x32_sse2;
+const SadMxNVp9Func highbd_sad32x64_sse2_vp9 = vp9_highbd_sad32x64_sse2;
+const SadMxNVp9Func highbd_sad64x32_sse2_vp9 = vp9_highbd_sad64x32_sse2;
+const SadMxNVp9Func highbd_sad64x64_sse2_vp9 = vp9_highbd_sad64x64_sse2;
+// Non-highbd (-1) entries duplicate the #else list below; keep them in sync.
+INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values(
+                        make_tuple(64, 64, sad_64x64_sse2_vp9, -1),
+                        make_tuple(64, 32, sad_64x32_sse2_vp9, -1),
+                        make_tuple(32, 64, sad_32x64_sse2_vp9, -1),
+                        make_tuple(32, 32, sad_32x32_sse2_vp9, -1),
+                        make_tuple(32, 16, sad_32x16_sse2_vp9, -1),
+                        make_tuple(16, 32, sad_16x32_sse2_vp9, -1),
+                        make_tuple(16, 16, sad_16x16_sse2_vp9, -1),
+                        make_tuple(16, 8, sad_16x8_sse2_vp9, -1),
+                        make_tuple(8, 16, sad_8x16_sse2_vp9, -1),
+                        make_tuple(8, 8, sad_8x8_sse2_vp9, -1),
+                        make_tuple(8, 4, sad_8x4_sse2_vp9, -1),
+                        make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 8),
+                        make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 8),
+                        make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 8),
+                        make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 8),
+                        make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 8),
+                        make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 8),
+                        make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 8),
+                        make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 8),
+                        make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 8),
+                        make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 8),
+                        make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 8),
+                        make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 10),
+                        make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 10),
+                        make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 10),
+                        make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 10),
+                        make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 10),
+                        make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 10),
+                        make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 10),
+                        make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 10),
+                        make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 10),
+                        make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 10),
+                        make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 10),
+                        make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 12),
+                        make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 12),
+                        make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 12),
+                        make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 12),
+                        make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 12),
+                        make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 12),
+                        make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 12),
+                        make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 12),
+                        make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 12),
+                        make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 12),
+                        make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 12)));
+
+const SadMxNAvgVp9Func highbd_sad8x4_avg_sse2_vp9 = vp9_highbd_sad8x4_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad8x8_avg_sse2_vp9 = vp9_highbd_sad8x8_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad8x16_avg_sse2_vp9 =
+  vp9_highbd_sad8x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x8_avg_sse2_vp9 =
+  vp9_highbd_sad16x8_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x16_avg_sse2_vp9 =
+  vp9_highbd_sad16x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x32_avg_sse2_vp9 =
+  vp9_highbd_sad16x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x16_avg_sse2_vp9 =
+  vp9_highbd_sad32x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x32_avg_sse2_vp9 =
+  vp9_highbd_sad32x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x64_avg_sse2_vp9 =
+  vp9_highbd_sad32x64_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad64x32_avg_sse2_vp9 =
+  vp9_highbd_sad64x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad64x64_avg_sse2_vp9 =
+  vp9_highbd_sad64x64_avg_sse2;
+// highbd *_avg kernels only; the #else branch below has no SADavgVP9Test list.
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgVP9Test, ::testing::Values(
+                        make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 8),
+                        make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 8),
+                        make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 8),
+                        make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 8),
+                        make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 8),
+                        make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 8),
+                        make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 8),
+                        make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 8),
+                        make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 8),
+                        make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 8),
+                        make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 8),
+                        make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 10),
+                        make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 10),
+                        make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 10),
+                        make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 10),
+                        make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 10),
+                        make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 10),
+                        make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 10),
+                        make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 10),
+                        make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 10),
+                        make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 10),
+                        make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 10),
+                        make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 12),
+                        make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 12),
+                        make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 12),
+                        make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 12),
+                        make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 12),
+                        make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 12),
+                        make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 12),
+                        make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 12),
+                        make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 12),
+                        make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 12),
+                        make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 12)));
+
+const SadMxNx4Func highbd_sad_64x64x4d_sse2 = vp9_highbd_sad64x64x4d_sse2;
+const SadMxNx4Func highbd_sad_64x32x4d_sse2 = vp9_highbd_sad64x32x4d_sse2;
+const SadMxNx4Func highbd_sad_32x64x4d_sse2 = vp9_highbd_sad32x64x4d_sse2;
+const SadMxNx4Func highbd_sad_32x32x4d_sse2 = vp9_highbd_sad32x32x4d_sse2;
+const SadMxNx4Func highbd_sad_32x16x4d_sse2 = vp9_highbd_sad32x16x4d_sse2;
+const SadMxNx4Func highbd_sad_16x32x4d_sse2 = vp9_highbd_sad16x32x4d_sse2;
+const SadMxNx4Func highbd_sad_16x16x4d_sse2 = vp9_highbd_sad16x16x4d_sse2;
+const SadMxNx4Func highbd_sad_16x8x4d_sse2 = vp9_highbd_sad16x8x4d_sse2;
+const SadMxNx4Func highbd_sad_8x16x4d_sse2 = vp9_highbd_sad8x16x4d_sse2;
+const SadMxNx4Func highbd_sad_8x8x4d_sse2 = vp9_highbd_sad8x8x4d_sse2;
+const SadMxNx4Func highbd_sad_8x4x4d_sse2 = vp9_highbd_sad8x4x4d_sse2;
+const SadMxNx4Func highbd_sad_4x8x4d_sse2 = vp9_highbd_sad4x8x4d_sse2;
+const SadMxNx4Func highbd_sad_4x4x4d_sse2 = vp9_highbd_sad4x4x4d_sse2;
+// NOTE(review): 50 entries below; older gtest caps Values() at 50 -- verify.
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
-                        make_tuple(64, 64, sad_64x64x4d_sse2),
-                        make_tuple(64, 32, sad_64x32x4d_sse2),
-                        make_tuple(32, 64, sad_32x64x4d_sse2),
-                        make_tuple(32, 32, sad_32x32x4d_sse2),
-                        make_tuple(32, 16, sad_32x16x4d_sse2),
-                        make_tuple(16, 32, sad_16x32x4d_sse2),
-                        make_tuple(16, 16, sad_16x16x4d_sse2),
-                        make_tuple(16, 8, sad_16x8x4d_sse2),
-                        make_tuple(8, 16, sad_8x16x4d_sse2),
-                        make_tuple(8, 8, sad_8x8x4d_sse2),
-                        make_tuple(8, 4, sad_8x4x4d_sse2)));
+                        make_tuple(64, 64, sad_64x64x4d_sse2, -1),
+                        make_tuple(64, 32, sad_64x32x4d_sse2, -1),
+                        make_tuple(32, 64, sad_32x64x4d_sse2, -1),
+                        make_tuple(32, 32, sad_32x32x4d_sse2, -1),
+                        make_tuple(32, 16, sad_32x16x4d_sse2, -1),
+                        make_tuple(16, 32, sad_16x32x4d_sse2, -1),
+                        make_tuple(16, 16, sad_16x16x4d_sse2, -1),
+                        make_tuple(16, 8, sad_16x8x4d_sse2,  -1),
+                        make_tuple(8, 16, sad_8x16x4d_sse2,  -1),
+                        make_tuple(8, 8, sad_8x8x4d_sse2,   -1),
+                        make_tuple(8, 4, sad_8x4x4d_sse2,   -1),
+                        make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 8),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 8),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 8),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 8),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 8),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 8),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 8),
+                        make_tuple(16, 8, highbd_sad_16x8x4d_sse2,  8),
+                        make_tuple(8, 16, highbd_sad_8x16x4d_sse2,  8),
+                        make_tuple(8, 8, highbd_sad_8x8x4d_sse2,   8),
+                        make_tuple(8, 4, highbd_sad_8x4x4d_sse2,   8),
+                        make_tuple(4, 8, highbd_sad_4x8x4d_sse2,   8),
+                        make_tuple(4, 4, highbd_sad_4x4x4d_sse2,   8),
+                        make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 10),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 10),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 10),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 10),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 10),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 10),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 10),
+                        make_tuple(16, 8, highbd_sad_16x8x4d_sse2,  10),
+                        make_tuple(8, 16, highbd_sad_8x16x4d_sse2,  10),
+                        make_tuple(8, 8, highbd_sad_8x8x4d_sse2,   10),
+                        make_tuple(8, 4, highbd_sad_8x4x4d_sse2,   10),
+                        make_tuple(4, 8, highbd_sad_4x8x4d_sse2,   10),
+                        make_tuple(4, 4, highbd_sad_4x4x4d_sse2,   10),
+                        make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 12),
+                        make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 12),
+                        make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 12),
+                        make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 12),
+                        make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 12),
+                        make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 12),
+                        make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 12),
+                        make_tuple(16, 8, highbd_sad_16x8x4d_sse2,  12),
+                        make_tuple(8, 16, highbd_sad_8x16x4d_sse2,  12),
+                        make_tuple(8, 8, highbd_sad_8x8x4d_sse2,   12),
+                        make_tuple(8, 4, highbd_sad_8x4x4d_sse2,   12),
+                        make_tuple(4, 8, highbd_sad_4x8x4d_sse2,   12),
+                        make_tuple(4, 4, highbd_sad_4x4x4d_sse2,   12)));
+#else  // !CONFIG_VP9_HIGHBITDEPTH: register only the non-highbd SSE2 kernels
+INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values(
+                        make_tuple(64, 64, sad_64x64_sse2_vp9, -1),
+                        make_tuple(64, 32, sad_64x32_sse2_vp9, -1),
+                        make_tuple(32, 64, sad_32x64_sse2_vp9, -1),
+                        make_tuple(32, 32, sad_32x32_sse2_vp9, -1),
+                        make_tuple(32, 16, sad_32x16_sse2_vp9, -1),
+                        make_tuple(16, 32, sad_16x32_sse2_vp9, -1),
+                        make_tuple(16, 16, sad_16x16_sse2_vp9, -1),
+                        make_tuple(16, 8, sad_16x8_sse2_vp9, -1),
+                        make_tuple(8, 16, sad_8x16_sse2_vp9, -1),
+                        make_tuple(8, 8, sad_8x8_sse2_vp9, -1),
+                        make_tuple(8, 4, sad_8x4_sse2_vp9, -1)));
+
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, sad_64x64x4d_sse2, -1),
+                        make_tuple(64, 32, sad_64x32x4d_sse2, -1),
+                        make_tuple(32, 64, sad_32x64x4d_sse2, -1),
+                        make_tuple(32, 32, sad_32x32x4d_sse2, -1),
+                        make_tuple(32, 16, sad_32x16x4d_sse2, -1),
+                        make_tuple(16, 32, sad_16x32x4d_sse2, -1),
+                        make_tuple(16, 16, sad_16x16x4d_sse2, -1),
+                        make_tuple(16, 8, sad_16x8x4d_sse2,  -1),
+                        make_tuple(8, 16, sad_8x16x4d_sse2,  -1),
+                        make_tuple(8, 8, sad_8x8x4d_sse2,   -1),
+                        make_tuple(8, 4, sad_8x4x4d_sse2,   -1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_SSE2
@@ -605,11 +1214,11 @@
 const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;
 const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16x4d_sse3),
-                        make_tuple(16, 8, sad_16x8x4d_sse3),
-                        make_tuple(8, 16, sad_8x16x4d_sse3),
-                        make_tuple(8, 8, sad_8x8x4d_sse3),
-                        make_tuple(4, 4, sad_4x4x4d_sse3)));
+                        make_tuple(16, 16, sad_16x16x4d_sse3, -1),
+                        make_tuple(16, 8, sad_16x8x4d_sse3, -1),
+                        make_tuple(8, 16, sad_8x16x4d_sse3, -1),
+                        make_tuple(8, 8, sad_8x8x4d_sse3, -1),
+                        make_tuple(4, 4, sad_4x4x4d_sse3, -1)));
 #endif  // CONFIG_VP8_ENCODER
 #endif  // HAVE_SSE3
 
@@ -618,7 +1227,7 @@
 #if CONFIG_VP8_ENCODER
 const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
-                        make_tuple(16, 16, sad_16x16_sse3)));
+                        make_tuple(16, 16, sad_16x16_sse3, -1)));
 #endif  // CONFIG_VP8_ENCODER
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
@@ -625,25 +1234,11 @@
 
 #if HAVE_AVX2
 #if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2;
-const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2;
-const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2;
-const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2;
-const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2;
-const SadMxNVp9Param avx2_vp9_tests[] = {
-  make_tuple(64, 64, sad_64x64_avx2_vp9),
-  make_tuple(64, 32, sad_64x32_avx2_vp9),
-  make_tuple(32, 64, sad_32x64_avx2_vp9),
-  make_tuple(32, 32, sad_32x32_avx2_vp9),
-  make_tuple(32, 16, sad_32x16_avx2_vp9),
-};
-INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests));
-
 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
 const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
-                        make_tuple(32, 32, sad_32x32x4d_avx2),
-                        make_tuple(64, 64, sad_64x64x4d_avx2)));
+                        make_tuple(32, 32, sad_32x32x4d_avx2, -1),
+                        make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
 #endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_AVX2
 
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1652,37 +1652,37 @@
   specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
 
   add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad64x64/;
+  specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_highbd_sad32x64/;
+  specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_highbd_sad64x32/;
+  specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_highbd_sad32x16/;
+  specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_highbd_sad16x32/;
+  specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad32x32/;
+  specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad16x16/;
+  specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad16x8/;
+  specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad8x16/;
+  specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
-  specialize qw/vp9_highbd_sad8x8/;
+  specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vp9_highbd_sad8x4/;
+  specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";  # the 4x8 and 4x4 entries that follow get no sse2 version in this change
 
   add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
   specialize qw/vp9_highbd_sad4x8/;
@@ -1691,37 +1691,37 @@
   specialize qw/vp9_highbd_sad4x4/;
 
   add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad64x64_avg/;
+  specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad32x64_avg/;
+  specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad64x32_avg/;
+  specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad32x16_avg/;
+  specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad16x32_avg/;
+  specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad32x32_avg/;
+  specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad16x16_avg/;
+  specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad16x8_avg/;
+  specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad8x16_avg/;
+  specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad8x8_avg/;
+  specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
 
   add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sad8x4_avg/;
+  specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";  # the 4x8_avg entry that follows gets no sse2 version in this change
 
   add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vp9_highbd_sad4x8_avg/;
@@ -1778,44 +1778,43 @@
   specialize qw/vp9_highbd_sad4x4x8/;
 
   add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad64x64x4d/;
+  specialize qw/vp9_highbd_sad64x64x4d sse2/;  # NOTE(review): plain sse2 here (and for every x4d entry below) vs "$sse2_x86inc" for the non-4d highbd SADs, although the 4d asm includes x86inc.asm -- confirm this is intended
 
   add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad32x64x4d/;
+  specialize qw/vp9_highbd_sad32x64x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad64x32x4d/;
+  specialize qw/vp9_highbd_sad64x32x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad32x16x4d/;
+  specialize qw/vp9_highbd_sad32x16x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad16x32x4d/;
+  specialize qw/vp9_highbd_sad16x32x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad32x32x4d/;
+  specialize qw/vp9_highbd_sad32x32x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad16x16x4d/;
+  specialize qw/vp9_highbd_sad16x16x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad16x8x4d/;
+  specialize qw/vp9_highbd_sad16x8x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad8x16x4d/;
+  specialize qw/vp9_highbd_sad8x16x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad8x8x4d/;
+  specialize qw/vp9_highbd_sad8x8x4d sse2/;
 
-  # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
   add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad8x4x4d/;
+  specialize qw/vp9_highbd_sad8x4x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad4x8x4d/;
+  specialize qw/vp9_highbd_sad4x8x4d sse2/;
 
   add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-  specialize qw/vp9_highbd_sad4x4x4d/;
+  specialize qw/vp9_highbd_sad4x4x4d sse2/;
 
   add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vp9_highbd_mse16x16/;
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm
@@ -1,0 +1,284 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
+  movh                  m0, [srcq +%2*2]  ; src row 1 -> low half of m0
+%if %1 == 1
+  movh                  m4, [ref1q+%3*2]  ; movh, not movu: a row is 4 words, so
+  movh                  m5, [ref2q+%3*2]  ; do not read 8 bytes past its end; the
+  movh                  m6, [ref3q+%3*2]  ; high halves are filled by movhps below
+  movh                  m7, [ref4q+%3*2]
+  movhps                m0, [srcq +%4*2]  ; src row 2 -> high half of m0
+  movhps                m4, [ref1q+%5*2]  ; row 2 of each ref -> high halves
+  movhps                m5, [ref2q+%5*2]
+  movhps                m6, [ref3q+%5*2]
+  movhps                m7, [ref4q+%5*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4            ; saturating src - ref1
+  psubusw               m2, m5            ; saturating src - ref2
+  psubusw               m4, m0            ; saturating ref1 - src
+  psubusw               m5, m0            ; saturating ref2 - src
+  por                   m4, m3            ; one side is 0, so OR = abs diff
+  por                   m5, m2
+  pmaddwd               m4, m1            ; multiply-add with m1 widens words
+  pmaddwd               m5, m1            ; into dword partial sums
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6            ; same again for ref3 / ref4
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3
+  por                   m7, m2
+  pmaddwd               m6, m1            ; first call: m4-m7 now hold the
+  pmaddwd               m7, m1            ; initial dword sums for ref1-ref4
+%else
+  movh                  m2, [ref1q+%3*2]  ; movh for the same reason as above
+  movhps                m0, [srcq +%4*2]
+  movhps                m2, [ref1q+%5*2]
+  mova                  m3, m0
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3            ; abs diff against ref1
+  pmaddwd               m2, m1
+  paddd                 m4, m2            ; accumulate into the ref1 sums
+
+  movh                  m2, [ref2q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref2q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m5, m2            ; accumulate into the ref2 sums
+
+  movh                  m2, [ref3q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref3q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m6, m2            ; accumulate into the ref3 sums
+
+  movh                  m2, [ref4q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref4q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2            ; accumulate into the ref4 sums
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4]   ; 2 rows of 2-byte samples
+  lea                ref1q, [ref1q+ref_strideq*4]
+  lea                ref2q, [ref2q+ref_strideq*4]
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+  ; 1st 8 px
+  mova                  m0, [srcq +%2*2]  ; aligned load of 8 src words
+%if %1 == 1
+  movu                  m4, [ref1q+%3*2]  ; first call: m4-m7 are initialised
+  movu                  m5, [ref2q+%3*2]  ; from ref1-ref4 instead of being
+  movu                  m6, [ref3q+%3*2]  ; accumulated into
+  movu                  m7, [ref4q+%3*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4            ; saturating src - ref1
+  psubusw               m2, m5
+  psubusw               m4, m0            ; saturating ref1 - src
+  psubusw               m5, m0
+  por                   m4, m3            ; one side is 0, so OR = abs diff
+  por                   m5, m2
+  pmaddwd               m4, m1            ; multiply-add with m1 widens words
+  pmaddwd               m5, m1            ; into dword partial sums
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3
+  por                   m7, m2
+  pmaddwd               m6, m1
+  pmaddwd               m7, m1
+%else
+  mova                  m3, m0
+  movu                  m2, [ref1q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0            ; reload src copy for the next ref
+  pmaddwd               m2, m1
+  paddd                 m4, m2            ; accumulate into the ref1 sums
+  movu                  m2, [ref2q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2            ; accumulate into the ref2 sums
+  movu                  m2, [ref3q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2            ; accumulate into the ref3 sums
+  movu                  m2, [ref4q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2            ; accumulate into the ref4 sums
+%endif
+
+  ; 2nd 8 px
+  mova                  m0, [srcq +(%4)*2]
+  mova                  m3, m0
+  movu                  m2, [ref1q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+  movu                  m2, [ref2q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+  movu                  m2, [ref3q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+  movu                  m2, [ref4q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4]   ; pointer bumps sit between
+  lea                ref1q, [ref1q+ref_strideq*4]   ; the last loads and the
+  lea                ref2q, [ref2q+ref_strideq*4]   ; final ref4 accumulation
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+  por                   m2, m3            ; finish the ref4 abs diff
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)     ; 16 px of first row
+  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6 ; 16 px of second row
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)     ; 32 px, first row
+  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6 ; 32 px, second row
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)     ; 64 px, first row
+  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6 ; 64 px, second row
+%endmacro
+
+; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         unsigned int res[4]);
+; where NxN is any of the sizes instantiated with HIGH_SADNXN4D below
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1]  ; ref1q still holds the ref[]
+  mov                ref3q, [ref1q+gprsize*2]  ; array here, so ref[0] must be
+  mov                ref4q, [ref1q+gprsize*3]  ; fetched last
+  mov                ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+  shl                 srcq, 1
+  shl                ref2q, 1
+  shl                ref3q, 1
+  shl                ref4q, 1
+  shl                ref1q, 1
+
+  ; m1 = eight words of 1, built without a GPR (x86-32 only has r0-r6)
+  pcmpeqw               m1, m1
+  psrlw                 m1, 15
+
+  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
+  movhlps               m0, m4            ; fold the high dword pair of each
+  movhlps               m1, m5            ; accumulator onto its low pair
+  movhlps               m2, m6            ; (m1 is free: the loops are done)
+  movhlps               m3, m7
+  paddd                 m4, m0
+  paddd                 m5, m1
+  paddd                 m6, m2
+  paddd                 m7, m3
+  punpckldq             m4, m5            ; m4 = ref1.0 ref2.0 ref1.1 ref2.1
+  punpckldq             m6, m7            ; m6 = ref3.0 ref4.0 ref3.1 ref4.1
+  movhlps               m0, m4
+  movhlps               m1, m6
+  paddd                 m4, m0            ; low qword = SAD(ref1), SAD(ref2)
+  paddd                 m6, m1            ; low qword = SAD(ref3), SAD(ref4)
+  punpcklqdq            m4, m6            ; m4 = the four SADs in ref order
+  movifnidn             r4, r4mp          ; fetch res where it is not in r4
+  movu                [r4], m4
+  RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64 ; highbd_sad64x64x4d_sse2
+HIGH_SADNXN4D 64, 32 ; highbd_sad64x32x4d_sse2
+HIGH_SADNXN4D 32, 64 ; highbd_sad32x64x4d_sse2
+HIGH_SADNXN4D 32, 32 ; highbd_sad32x32x4d_sse2
+HIGH_SADNXN4D 32, 16 ; highbd_sad32x16x4d_sse2
+HIGH_SADNXN4D 16, 32 ; highbd_sad16x32x4d_sse2
+HIGH_SADNXN4D 16, 16 ; highbd_sad16x16x4d_sse2
+HIGH_SADNXN4D 16,  8 ; highbd_sad16x8x4d_sse2
+HIGH_SADNXN4D  8, 16 ; highbd_sad8x16x4d_sse2
+HIGH_SADNXN4D  8,  8 ; highbd_sad8x8x4d_sse2
+HIGH_SADNXN4D  8,  4 ; highbd_sad8x4x4d_sse2
+HIGH_SADNXN4D  4,  8 ; highbd_sad4x8x4d_sse2
+HIGH_SADNXN4D  4,  4 ; highbd_sad4x4x4d_sse2
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
@@ -1,0 +1,363 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4 ; width, height, GPR count (5 or 7), avg flag (0 or 1)
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea         src_stride3q, [src_strideq*3]  ; 3x strides, used by callers to
+  lea         ref_stride3q, [ref_strideq*3]  ; address a fourth row directly
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+  shl                 srcq, 1
+  shl                 refq, 1
+%if %4 == 1
+  shl         second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+  HIGH_SAD_FN 64, %1, 5, %2
+  mov              n_rowsd, %1            ; one loop iteration per row
+  pxor                  m0, m0            ; m0 = dword SAD accumulator
+  pxor                  m6, m6            ; m6 = 0, used to widen words
+
+.loop:
+  ; first half of each row
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; avg variant: average
+  pavgw                 m2, [second_predq+mmsize*1]  ; ref with second_pred
+  pavgw                 m3, [second_predq+mmsize*2]  ; before differencing
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1            ; saturating src - ref
+  psubusw               m1, [srcq]        ; saturating ref - src
+  por                   m1, m5            ; one side is 0, so OR = abs diff
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2            ; NOTE(review): word sums of 4 diffs;
+  paddw                 m3, m4            ; assumes they fit 16 bits - confirm
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6            ; widen low 4 words to dwords
+  punpcklwd             m3, m6
+  paddd                 m0, m1
+  paddd                 m0, m3
+  ; second half of each row
+  movu                  m1, [refq+64]
+  movu                  m2, [refq+80]
+  movu                  m3, [refq+96]
+  movu                  m4, [refq+112]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq+64]
+  psubusw               m5, m1
+  psubusw               m1, [srcq+64]
+  por                   m1, m5
+  mova                  m5, [srcq+80]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+80]
+  por                   m2, m5
+  mova                  m5, [srcq+96]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+96]
+  por                   m3, m5
+  mova                  m5, [srcq+112]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+112]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]  ; next row (2-byte samples)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0            ; horizontal add of the 4 dwords
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0            ; return the total in eax
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+  HIGH_SAD_FN 32, %1, 5, %2
+  mov              n_rowsd, %1            ; one loop iteration per row
+  pxor                  m0, m0            ; m0 = dword SAD accumulator
+  pxor                  m6, m6            ; m6 = 0, used to widen words
+
+.loop:
+  movu                  m1, [refq]        ; 4 x 16 bytes = one 32 px row
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; avg variant: average
+  pavgw                 m2, [second_predq+mmsize*1]  ; ref with second_pred
+  pavgw                 m3, [second_predq+mmsize*2]  ; before differencing
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1            ; saturating src - ref
+  psubusw               m1, [srcq]        ; saturating ref - src
+  por                   m1, m5            ; one side is 0, so OR = abs diff
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2            ; NOTE(review): word sums of 4 diffs;
+  paddw                 m3, m4            ; assumes they fit 16 bits - confirm
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6            ; widen low 4 words to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]  ; next row (2-byte samples)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0            ; horizontal add of the 4 dwords
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0            ; return the total in eax
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+  HIGH_SAD_FN 16, %1, 5, %2
+  mov              n_rowsd, %1/2          ; two rows per loop iteration
+  pxor                  m0, m0            ; m0 = dword SAD accumulator
+  pxor                  m6, m6            ; m6 = 0, used to widen words
+
+.loop:
+  movu                  m1, [refq]                   ; row 0, px 0-7
+  movu                  m2, [refq+16]                ; row 0, px 8-15
+  movu                  m3, [refq+ref_strideq*2]     ; row 1, px 0-7
+  movu                  m4, [refq+ref_strideq*2+16]  ; row 1, px 8-15
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; avg variant: average
+  pavgw                 m2, [second_predq+16]        ; ref with second_pred
+  pavgw                 m3, [second_predq+mmsize*2]  ; before differencing
+  pavgw                 m4, [second_predq+mmsize*2+16]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1            ; saturating src - ref
+  psubusw               m1, [srcq]        ; saturating ref - src
+  por                   m1, m5            ; one side is 0, so OR = abs diff
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*2]
+  por                   m3, m5
+  mova                  m5, [srcq+src_strideq*2+16]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_strideq*2+16]
+  por                   m4, m5
+  paddw                 m1, m2            ; NOTE(review): word sums of 4 diffs;
+  paddw                 m3, m4            ; assumes they fit 16 bits - confirm
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6            ; widen low 4 words to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*4]  ; skip 2 rows (2-byte samples)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0            ; horizontal add of the 4 dwords
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0            ; return the total in eax
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+  HIGH_SAD_FN 8, %1, 7, %2
+  mov              n_rowsd, %1/4          ; four rows per loop iteration
+  pxor                  m0, m0            ; m0 = dword SAD accumulator
+  pxor                  m6, m6            ; m6 = 0, used to widen words
+
+.loop:
+  movu                  m1, [refq]                 ; row 0
+  movu                  m2, [refq+ref_strideq*2]   ; row 1
+  movu                  m3, [refq+ref_strideq*4]   ; row 2
+  movu                  m4, [refq+ref_stride3q*2]  ; row 3
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; avg variant: average
+  pavgw                 m2, [second_predq+mmsize*1]  ; ref with second_pred
+  pavgw                 m3, [second_predq+mmsize*2]  ; before differencing
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq]
+  psubusw               m5, m1            ; saturating src - ref
+  psubusw               m1, [srcq]        ; saturating ref - src
+  por                   m1, m5            ; one side is 0, so OR = abs diff
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+src_strideq*2]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*4]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*4]
+  por                   m3, m5
+  mova                  m5, [srcq+src_stride3q*2]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_stride3q*2]
+  por                   m4, m5
+  paddw                 m1, m2            ; NOTE(review): word sums of 4 diffs;
+  paddw                 m3, m4            ; assumes they fit 16 bits - confirm
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6            ; widen low 4 words to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*8]  ; skip 4 rows (2-byte samples)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*8]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0            ; horizontal add of the 4 dwords
+  paddd                 m0, m1
+  punpckldq             m0, m6
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0            ; return the total in eax
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,6 +102,9 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -110,6 +113,9 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+endif
 endif
 
 ifeq ($(ARCH_X86_64),yes)