shithub: libvpx

--- a/test/convolve_test.cc

+++ b/test/convolve_test.cc

@@ -301,9 +301,9 @@

     filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,

                                dst_stride, output_width, output_height);

   } else {

-    highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,

+    highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride,

                                       hfilter, vfilter,

-                                      CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,

+                                      CAST_TO_SHORTPTR(dst_ptr), dst_stride,

                                       output_width, output_height, use_highbd);

 #else

@@ -324,8 +324,8 @@

     filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,

                        dst_stride, output_width, output_height);

   } else {

-    highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter,

-                              vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,

+    highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter,

+                              vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride,

                               output_width, output_height, use_highbd);

 #else

@@ -460,7 +460,7 @@

     if (UUT_->use_highbd_ == 0) {

       return input_ + offset;

     } else {

-      return CONVERT_TO_BYTEPTR(input16_) + offset;

+      return CAST_TO_BYTEPTR(input16_ + offset);

 #else

     return input_ + offset;

@@ -473,7 +473,7 @@

     if (UUT_->use_highbd_ == 0) {

       return output_ + offset;

     } else {

-      return CONVERT_TO_BYTEPTR(output16_) + offset;

+      return CAST_TO_BYTEPTR(output16_ + offset);

 #else

     return output_ + offset;

@@ -486,7 +486,7 @@

     if (UUT_->use_highbd_ == 0) {

       return output_ref_ + offset;

     } else {

-      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;

+      return CAST_TO_BYTEPTR(output16_ref_ + offset);

 #else

     return output_ref_ + offset;

@@ -498,7 +498,7 @@

     if (UUT_->use_highbd_ == 0) {

       return list[index];

     } else {

-      return CONVERT_TO_SHORTPTR(list)[index];

+      return CAST_TO_SHORTPTR(list)[index];

 #else

     return list[index];

@@ -510,7 +510,7 @@

     if (UUT_->use_highbd_ == 0) {

       list[index] = (uint8_t)val;

     } else {

-      CONVERT_TO_SHORTPTR(list)[index] = val;

+      CAST_TO_SHORTPTR(list)[index] = val;

 #else

     list[index] = (uint8_t)val;

@@ -718,7 +718,7 @@

     if (UUT_->use_highbd_ == 0) {

       ref = ref8;

     } else {

-      ref = CONVERT_TO_BYTEPTR(ref16);

+      ref = CAST_TO_BYTEPTR(ref16);

 #else

     uint8_t ref[kOutputStride * kMaxDimension];

@@ -797,7 +797,7 @@

   if (UUT_->use_highbd_ == 0) {

     ref = ref8;

   } else {

-    ref = CONVERT_TO_BYTEPTR(ref16);

+    ref = CAST_TO_BYTEPTR(ref16);

 #else

   uint8_t ref[kOutputStride * kMaxDimension];

--- a/vp9/common/vp9_reconinter.h

+++ b/vp9/common/vp9_reconinter.h

@@ -37,8 +37,9 @@

     const int subpel_x, const int subpel_y, const struct scale_factors *sf,

     int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {

   sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](

-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],

-      ys, w, h, bd);

+      CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src)), src_stride,

+      CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), dst_stride, kernel[subpel_x],

+      xs, kernel[subpel_y], ys, w, h, bd);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -2417,10 +2417,11 @@

         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);

         if (src->flags & YV12_FLAG_HIGHBITDEPTH) {

-          vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,

-                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,

-                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,

-                               16 / factor, 16 / factor, bd);

+          vpx_highbd_convolve8(

+              CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src_ptr)), src_stride,

+              CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst_ptr)), dst_stride,

+              kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf],

+              16 * src_h / dst_h, 16 / factor, 16 / factor, bd);

         } else {

           vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,

                         kernel[x_q4 & 0xf], 16 * src_w / dst_w,

--- a/vp9/encoder/vp9_pickmode.c

+++ b/vp9/encoder/vp9_pickmode.c

@@ -2053,9 +2053,11 @@

         this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];

 #if CONFIG_VP9_HIGHBITDEPTH

         if (cm->use_highbitdepth)

-          vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,

-                                   this_mode_pred->data, this_mode_pred->stride,

-                                   NULL, 0, NULL, 0, bw, bh, xd->bd);

+          vpx_highbd_convolve_copy(

+              CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)),

+              best_pred->stride,

+              CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(this_mode_pred->data)),

+              this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh, xd->bd);

         else

           vpx_convolve_copy(best_pred->data, best_pred->stride,

                             this_mode_pred->data, this_mode_pred->stride, NULL,

@@ -2162,9 +2164,11 @@

     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {

 #if CONFIG_VP9_HIGHBITDEPTH

       if (cm->use_highbitdepth)

-        vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,

-                                 pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0,

-                                 bw, bh, xd->bd);

+        vpx_highbd_convolve_copy(

+            CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)),

+            best_pred->stride,

+            CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(pd->dst.buf)), pd->dst.stride,

+            NULL, 0, NULL, 0, bw, bh, xd->bd);

       else

         vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,

                           pd->dst.stride, NULL, 0, NULL, 0, bw, bh);

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -599,9 +599,10 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

-        recon = CONVERT_TO_BYTEPTR(recon);

-        vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,

-                                 bs, bs, xd->bd);

+        vpx_highbd_convolve_copy(CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)),

+                                 dst_stride, recon, 32, NULL, 0, NULL, 0, bs,

+                                 bs, xd->bd);

+        recon = CONVERT_TO_BYTEPTR(recon16);

         if (xd->lossless) {

           vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);

         } else {

--- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c

@@ -145,8 +145,8 @@

     vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,

                                  x_step_q4, filter_y, y_step_q4, w, h, bd);

   } else {

-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+    const uint16_t *src = CAST_TO_SHORTPTR(src8);

+    uint16_t *dst = CAST_TO_SHORTPTR(dst8);

     const int16x8_t filters = vld1q_s16(filter_x);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     uint16x8_t t0, t1, t2, t3;

@@ -348,8 +348,8 @@

                                      filter_x, x_step_q4, filter_y, y_step_q4,

                                      w, h, bd);

   } else {

-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+    const uint16_t *src = CAST_TO_SHORTPTR(src8);

+    uint16_t *dst = CAST_TO_SHORTPTR(dst8);

     const int16x8_t filters = vld1q_s16(filter_x);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     uint16x8_t t0, t1, t2, t3;

@@ -579,8 +579,8 @@

     vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,

                                 x_step_q4, filter_y, y_step_q4, w, h, bd);

   } else {

-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+    const uint16_t *src = CAST_TO_SHORTPTR(src8);

+    uint16_t *dst = CAST_TO_SHORTPTR(dst8);

     const int16x8_t filters = vld1q_s16(filter_y);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

@@ -748,8 +748,8 @@

                                     filter_x, x_step_q4, filter_y, y_step_q4, w,

                                     h, bd);

   } else {

-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+    const uint16_t *src = CAST_TO_SHORTPTR(src8);

+    uint16_t *dst = CAST_TO_SHORTPTR(dst8);

     const int16x8_t filters = vld1q_s16(filter_y);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

--- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c

@@ -18,8 +18,8 @@

                                   const int16_t *filter_x, int filter_x_stride,

                                   const int16_t *filter_y, int filter_y_stride,

                                   int w, int h, int bd) {

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_x_stride;

--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c

@@ -18,8 +18,8 @@

                                    const int16_t *filter_x, int filter_x_stride,

                                    const int16_t *filter_y, int filter_y_stride,

                                    int w, int h, int bd) {

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_x_stride;

--- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c

@@ -18,7 +18,7 @@

                                const int16_t *filter_x, int x_step_q4,

                                const int16_t *filter_y, int y_step_q4, int w,

                                int h, int bd) {

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

   const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));

   // + 1 to make it divisible by 4

   DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);

@@ -29,13 +29,12 @@

    * height and filter a multiple of 4 lines. Since this goes in to the temp

    * buffer which has lots of extra room and is subsequently discarded this is

    * safe if somewhat less than ideal.   */

-  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),

-                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,

-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,

-                                  intermediate_height, bd);

+  vpx_highbd_convolve8_horiz_neon(

+      CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp),

+      w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd);

   /* Step into the temp buffer 3 lines to get the actual frame data */

-  vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,

+  vpx_highbd_convolve8_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst,

                                  dst_stride, filter_x, x_step_q4, filter_y,

                                  y_step_q4, w, h, bd);

@@ -45,7 +44,7 @@

                                    const int16_t *filter_x, int x_step_q4,

                                    const int16_t *filter_y, int y_step_q4,

                                    int w, int h, int bd) {

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

   const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));

   // + 1 to make it divisible by 4

   DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);

@@ -55,11 +54,10 @@

   /* This implementation has the same issues as above. In addition, we only want

    * to average the values after both passes.

    */

-  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),

-                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,

-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,

-                                  intermediate_height, bd);

-  vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,

+  vpx_highbd_convolve8_horiz_neon(

+      CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp),

+      w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd);

+  vpx_highbd_convolve8_avg_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst,

                                      dst_stride, filter_x, x_step_q4, filter_y,

                                      y_step_q4, w, h, bd);

--- a/vpx_dsp/vpx_convolve.c

+++ b/vpx_dsp/vpx_convolve.c

@@ -324,8 +324,8 @@

                                   const InterpKernel *x_filters, int x0_q4,

                                   int x_step_q4, int w, int h, int bd) {

   int x, y;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   src -= SUBPEL_TAPS / 2 - 1;

   for (y = 0; y < h; ++y) {

@@ -348,8 +348,8 @@

                                       const InterpKernel *x_filters, int x0_q4,

                                       int x_step_q4, int w, int h, int bd) {

   int x, y;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   src -= SUBPEL_TAPS / 2 - 1;

   for (y = 0; y < h; ++y) {

@@ -374,8 +374,8 @@

                                  const InterpKernel *y_filters, int y0_q4,

                                  int y_step_q4, int w, int h, int bd) {

   int x, y;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   src -= src_stride * (SUBPEL_TAPS / 2 - 1);

   for (x = 0; x < w; ++x) {

@@ -400,8 +400,8 @@

                                      const InterpKernel *y_filters, int y0_q4,

                                      int y_step_q4, int w, int h, int bd) {

   int x, y;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   src -= src_stride * (SUBPEL_TAPS / 2 - 1);

   for (x = 0; x < w; ++x) {

@@ -449,12 +449,12 @@

   assert(y_step_q4 <= 32);

   assert(x_step_q4 <= 32);

-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,

-                        CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,

+  highbd_convolve_horiz(CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) -

+                                        src_stride * (SUBPEL_TAPS / 2 - 1)),

+                        src_stride, CAST_TO_BYTEPTR(temp), 64, x_filters, x0_q4,

                         x_step_q4, w, intermediate_height, bd);

-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),

-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,

-                       bd);

+  highbd_convolve_vert(CAST_TO_BYTEPTR(temp + 64 * (SUBPEL_TAPS / 2 - 1)), 64,

+                       dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,

@@ -541,10 +541,10 @@

   assert(w <= 64);

   assert(h <= 64);

-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,

-                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);

-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,

-                            0, NULL, 0, w, h, bd);

+  vpx_highbd_convolve8_c(src, src_stride, CAST_TO_BYTEPTR(temp), 64, filter_x,

+                         x_step_q4, filter_y, y_step_q4, w, h, bd);

+  vpx_highbd_convolve_avg_c(CAST_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, 0,

+                            NULL, 0, w, h, bd);

 void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,

@@ -553,8 +553,8 @@

                                 const int16_t *filter_y, int filter_y_stride,

                                 int w, int h, int bd) {

   int r;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_x_stride;

@@ -575,8 +575,8 @@

                                const int16_t *filter_y, int filter_y_stride,

                                int w, int h, int bd) {

   int x, y;

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_x_stride;

--- a/vpx_dsp/x86/convolve.h

+++ b/vpx_dsp/x86/convolve.h

@@ -107,8 +107,8 @@

       ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,       \

       const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {     \

     if (step_q4 == 16 && filter[3] != 128) {                              \

-      uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \

-      uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                          \

+      uint16_t *src = CAST_TO_SHORTPTR(src8);                             \

+      uint16_t *dst = CAST_TO_SHORTPTR(dst8);                             \

       if (filter[0] | filter[1] | filter[2]) {                            \

         while (w >= 16) {                                                 \

           vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                \

@@ -162,36 +162,37 @@

     }                                                                     \

-#define HIGH_FUN_CONV_2D(avg, opt)                                            \

-  void vpx_highbd_convolve8_##avg##opt(                                       \

-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \

-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \

-    assert(w <= 64);                                                          \

-    assert(h <= 64);                                                          \

-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \

-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {  \

-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                       \

-        vpx_highbd_convolve8_horiz_##opt(                                     \

-            src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \

-            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd);          \

-        vpx_highbd_convolve8_##avg##vert_##opt(                               \

-            CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x,  \

-            x_step_q4, filter_y, y_step_q4, w, h, bd);                        \

-      } else {                                                                \

-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                       \

-        vpx_highbd_convolve8_horiz_##opt(                                     \

-            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x,        \

-            x_step_q4, filter_y, y_step_q4, w, h + 1, bd);                    \

-        vpx_highbd_convolve8_##avg##vert_##opt(                               \

-            CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x,        \

-            x_step_q4, filter_y, y_step_q4, w, h, bd);                        \

-      }                                                                       \

-    } else {                                                                  \

-      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \

-                                    filter_x, x_step_q4, filter_y, y_step_q4, \

-                                    w, h, bd);                                \

-    }                                                                         \

+#define HIGH_FUN_CONV_2D(avg, opt)                                             \

+  void vpx_highbd_convolve8_##avg##opt(                                        \

+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \

+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,            \

+      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {          \

+    assert(w <= 64);                                                           \

+    assert(h <= 64);                                                           \

+    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \

+      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {   \

+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                        \

+        vpx_highbd_convolve8_horiz_##opt(                                      \

+            CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - 3 * src_stride),           \

+            src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4,      \

+            filter_y, y_step_q4, w, h + 7, bd);                                \

+        vpx_highbd_convolve8_##avg##vert_##opt(                                \

+            CAST_TO_BYTEPTR(fdata2 + 192), 64, dst, dst_stride, filter_x,      \

+            x_step_q4, filter_y, y_step_q4, w, h, bd);                         \

+      } else {                                                                 \

+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                        \

+        vpx_highbd_convolve8_horiz_##opt(                                      \

+            src, src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \

+            filter_y, y_step_q4, w, h + 1, bd);                                \

+        vpx_highbd_convolve8_##avg##vert_##opt(                                \

+            CAST_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, x_step_q4, \

+            filter_y, y_step_q4, w, h, bd);                                    \

+      }                                                                        \

+    } else {                                                                   \

+      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,          \

+                                    filter_x, x_step_q4, filter_y, y_step_q4,  \

+                                    w, h, bd);                                 \

+    }                                                                          \

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/x86/highbd_convolve_avx2.c

+++ b/vpx_dsp/x86/highbd_convolve_avx2.c

@@ -21,8 +21,8 @@

                                    const int16_t *filter_x, int filter_x_stride,

                                    const int16_t *filter_y, int filter_y_stride,

                                    int width, int h, int bd) {

-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  const uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_y;

   (void)filter_x_stride;

@@ -104,8 +104,8 @@

                                   const int16_t *filter_x, int filter_x_stride,

                                   const int16_t *filter_y, int filter_y_stride,

                                   int width, int h, int bd) {

-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);

-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

+  uint16_t *src = CAST_TO_SHORTPTR(src8);

+  uint16_t *dst = CAST_TO_SHORTPTR(dst8);

   (void)filter_x;

   (void)filter_y;

   (void)filter_x_stride;

--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm

+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm

@@ -32,9 +32,7 @@

   mov r4d, dword wm

 %ifidn %2, highbd

   shl r4d, 1

-  shl srcq, 1

   shl src_strideq, 1

-  shl dstq, 1

   shl dst_strideq, 1

 %else

   cmp r4d, 4