shithub: libvpx

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -5801,9 +5801,9 @@

                          TX_SIZE tx_size) {

   // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.

   switch (tx_size) {

-    case TX_8X8: vpx_hadamard_8x8_c(src_diff, bw, coeff); break;

-    case TX_16X16: vpx_hadamard_16x16_c(src_diff, bw, coeff); break;

-    case TX_32X32: vpx_hadamard_32x32_c(src_diff, bw, coeff); break;

+    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;

+    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;

+    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;

     default: assert(0);

--- a/vpx_dsp/avg.c

+++ b/vpx_dsp/avg.c

@@ -32,6 +32,166 @@

   return (sum + 8) >> 4;

+#if CONFIG_VP9_HIGHBITDEPTH

+// src_diff: 13 bit, dynamic range [-4095, 4095]; 8 samples, src_stride apart

+// coeff: 16 bit range; receives the 8 butterfly outputs, stored as int32_t

+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,

+                                            ptrdiff_t src_stride,

+                                            int32_t *coeff) {

+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1

+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];

+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];

+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];

+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];

+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];

+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];

+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

+  int16_t c0 = b0 + b2;  // stage 2: |c| <= 4 * 4095 for the stated input range

+  int16_t c1 = b1 + b3;

+  int16_t c2 = b0 - b2;

+  int16_t c3 = b1 - b3;

+  int16_t c4 = b4 + b6;

+  int16_t c5 = b5 + b7;

+  int16_t c6 = b4 - b6;

+  int16_t c7 = b5 - b7;

+  coeff[0] = c0 + c4;  // stage 3: outputs are stored in a permuted order

+  coeff[7] = c1 + c5;

+  coeff[3] = c2 + c6;

+  coeff[4] = c3 + c7;

+  coeff[2] = c0 - c4;

+  coeff[6] = c1 - c5;

+  coeff[1] = c2 - c6;

+  coeff[5] = c3 - c7;

+}

+// src_diff: 16 bit, dynamic range [-32760, 32760]; 8 values, src_stride apart

+// coeff: 19 bit range; receives the 8 butterfly outputs of this pass

+static void hadamard_highbd_col8_second_pass(const int32_t *src_diff,

+                                             ptrdiff_t src_stride,

+                                             int32_t *coeff) {

+  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1

+  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];

+  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];

+  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];

+  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];

+  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];

+  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];

+  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];

+  int32_t c0 = b0 + b2;  // stage 2: combine the stage 1 sums and differences

+  int32_t c1 = b1 + b3;

+  int32_t c2 = b0 - b2;

+  int32_t c3 = b1 - b3;

+  int32_t c4 = b4 + b6;

+  int32_t c5 = b5 + b7;

+  int32_t c6 = b4 - b6;

+  int32_t c7 = b5 - b7;

+  coeff[0] = c0 + c4;  // stage 3: same permuted output order as the first pass

+  coeff[7] = c1 + c5;

+  coeff[3] = c2 + c6;

+  coeff[4] = c3 + c7;

+  coeff[2] = c0 - c4;

+  coeff[6] = c1 - c5;

+  coeff[1] = c2 - c6;

+  coeff[5] = c3 - c7;

+}

+// 8x8 2-D Hadamard of src_diff into coeff[0..63]. Output coefficient order is

+// not important; for optimization purposes the final transpose may be skipped.

+void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,

+                               tran_low_t *coeff) {

+  int idx;

+  int32_t buffer[64];   // first-pass results, 8 per source column

+  int32_t buffer2[64];  // second-pass results, copied to coeff at the end

+  int32_t *tmp_buf = &buffer[0];

+  for (idx = 0; idx < 8; ++idx) {

+    // src_diff: 13 bit, one source column per iteration

+    // buffer: 16 bit, dynamic range [-32760, 32760]

+    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);

+    tmp_buf += 8;  // next group of 8 outputs

+    ++src_diff;    // next source column

+  }

+  tmp_buf = &buffer[0];

+  for (idx = 0; idx < 8; ++idx) {

+    // buffer: 16 bit, read with stride 8 starting at offset idx

+    // buffer2: 19 bit, dynamic range [-262080, 262080]

+    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);

+    ++tmp_buf;

+  }

+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];

+}

+// 16x16 2-D Hadamard: four 8x8 transforms into coeff, then combined in place

+void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,

+                                 tran_low_t *coeff) {

+  int idx;

+  for (idx = 0; idx < 4; ++idx) {

+    // src_diff: 13 bit, dynamic range [-4095, 4095]

+    const int16_t *src_ptr =

+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;  // 8x8 quadrant

+    vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);

+  }

+  // coeff: 19 bit, dynamic range [-262080, 262080]

+  for (idx = 0; idx < 64; ++idx) {

+    tran_low_t a0 = coeff[0];  // same position in each of the four 8x8 outputs

+    tran_low_t a1 = coeff[64];

+    tran_low_t a2 = coeff[128];

+    tran_low_t a3 = coeff[192];

+    tran_low_t b0 = (a0 + a1) >> 1;  // halve after the first butterfly stage

+    tran_low_t b1 = (a0 - a1) >> 1;

+    tran_low_t b2 = (a2 + a3) >> 1;

+    tran_low_t b3 = (a2 - a3) >> 1;

+    // new coeff dynamic range: 20 bit

+    coeff[0] = b0 + b2;

+    coeff[64] = b1 + b3;

+    coeff[128] = b0 - b2;

+    coeff[192] = b1 - b3;

+    ++coeff;  // advance to the next of the 64 positions

+  }

+}

+void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,

+                                 tran_low_t *coeff) {  // four 16x16, then merged

+  int idx;

+  for (idx = 0; idx < 4; ++idx) {

+    // src_diff: 13 bit, dynamic range [-4095, 4095]

+    const int16_t *src_ptr =

+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;  // quadrant

+    vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);

+  }

+  // coeff: 20 bit, four groups of 256 values, one per 16x16 quadrant

+  for (idx = 0; idx < 256; ++idx) {

+    tran_low_t a0 = coeff[0];  // same position in each of the four 16x16 outputs

+    tran_low_t a1 = coeff[256];

+    tran_low_t a2 = coeff[512];

+    tran_low_t a3 = coeff[768];

+    tran_low_t b0 = (a0 + a1) >> 2;  // divide by 4 after the first butterfly stage

+    tran_low_t b1 = (a0 - a1) >> 2;

+    tran_low_t b2 = (a2 + a3) >> 2;

+    tran_low_t b3 = (a2 - a3) >> 2;

+    // new coeff dynamic range: 20 bit

+    coeff[0] = b0 + b2;

+    coeff[256] = b1 + b3;

+    coeff[512] = b0 - b2;

+    coeff[768] = b1 - b3;

+    ++coeff;  // advance to the next of the 256 positions

+  }

+}

+#endif  // CONFIG_VP9_HIGHBITDEPTH

 // src_diff: first pass, 9 bit, dynamic range [-255, 255]

 //           second pass, 12 bit, dynamic range [-2040, 2040]

 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -785,6 +785,15 @@

     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

     specialize qw/vpx_hadamard_32x32 sse2 avx2/;

+    add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

+    specialize qw/vpx_highbd_hadamard_8x8/;

+    add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

+    specialize qw/vpx_highbd_hadamard_16x16/;

+    add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

+    specialize qw/vpx_highbd_hadamard_32x32/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

     specialize qw/vpx_satd avx2 sse2 neon/;

   } else {