ref: d5de63d2be0e8547358343c6e123eeb43f3433d9
parent: 081b39f2b765f83406ac41a9ad8c8d6830e1707a
author: Linfeng Zhang <[email protected]>
date: Wed May 3 09:32:08 EDT 2017
Update highbd idct functions arguments to use uint16_t dst BUG=webm:1388 Change-Id: I3581d80d0389b99166e70987d38aba2db6c469d5
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -255,11 +255,11 @@
#if CONFIG_VP9_HIGHBITDEPTH
void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+ vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+ vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@@ -273,36 +273,36 @@
}
void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+ vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+ vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
#if HAVE_SSE2
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+ vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+ vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+ vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+ vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+ vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+ vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -71,11 +71,11 @@
#if CONFIG_VP9_HIGHBITDEPTH
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
+ vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
+ vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -55,36 +55,36 @@
#if CONFIG_VP9_HIGHBITDEPTH
void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
+ vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
+ vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+ vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+ vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+ vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+ vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#if HAVE_SSE2
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
+ vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
+ vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -88,45 +88,45 @@
#if CONFIG_VP9_HIGHBITDEPTH
void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+ vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+ vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+ vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
- vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+ vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
#if HAVE_SSE2
void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
+ vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
+ vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
+ vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
+ vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+ vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
- vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+ vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -43,9 +43,11 @@
}
#if CONFIG_VP9_HIGHBITDEPTH
-template <InvTxfmWithBdFunc fn>
+typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out,
+ int stride, int bd);
+template <InvTxfmHighbdFunc fn>
void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
- fn(in, out, stride, bd);
+ fn(in, CAST_TO_SHORTPTR(out), stride, bd);
}
#endif
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -205,7 +205,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
{ vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0
@@ -213,7 +213,6 @@
{ vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2
{ vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3
};
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i, j;
tran_low_t out[4 * 4];
@@ -245,7 +244,7 @@
{ vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3
};
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[8 * 8];
@@ -252,7 +251,6 @@
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Inverse transform row vectors.
for (i = 0; i < 8; ++i) {
@@ -279,7 +277,7 @@
{ vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3
};
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[16 * 16];
@@ -286,7 +284,6 @@
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 16; ++i) {
@@ -307,7 +304,7 @@
}
// idct
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
@@ -315,7 +312,7 @@
vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
}
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
@@ -323,7 +320,7 @@
vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -340,7 +337,7 @@
}
}
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to separate different cases.
@@ -356,7 +353,7 @@
}
}
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// Non-zero coeff only in upper-left 8x8
if (eob == 1) {
@@ -372,7 +369,7 @@
// iht
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT)
vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
else
@@ -380,7 +377,7 @@
}
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
} else {
@@ -389,7 +386,7 @@
}
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd) {
+ uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
} else {
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -57,22 +57,22 @@
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
- uint8_t *dest, int stride, int eob, int bd);
+ uint16_t *dest, int stride, int eob, int bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -101,11 +101,11 @@
#
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+ add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
}
#
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,7 +189,7 @@
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
@@ -257,7 +257,7 @@
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -184,7 +184,7 @@
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
- void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
#endif
};
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -637,7 +637,7 @@
if (x->skip_encode || p->eobs[block] == 0) return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
@@ -700,8 +700,8 @@
if (p->eobs[block] > 0) {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- x->highbd_itxm_add(dqcoeff, CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)),
- pd->dst.stride, p->eobs[block], xd->bd);
+ x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+ p->eobs[block], xd->bd);
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -801,7 +801,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
if (!x->skip_recode) {
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -601,22 +601,21 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
32, NULL, 0, NULL, 0, bs, bs, xd->bd);
- recon = CAST_TO_BYTEPTR(recon16);
if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
- vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
+ vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
@@ -1005,7 +1004,7 @@
const int block = (row + idy) * 2 + (col + idx);
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1268,10 +1268,8 @@
}
}
-void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1313,10 +1311,8 @@
}
}
-void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1349,10 +1345,8 @@
}
}
-void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
-
if (bd == 8) {
int16_t row_idct_output[4 * 16];
@@ -1414,7 +1408,7 @@
*dest += stride;
}
-void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -1422,7 +1416,6 @@
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
--- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -386,8 +386,8 @@
}
static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
- uint8_t *const dest,
- const int stride, const int bd) {
+ uint16_t *dst, const int stride,
+ const int bd) {
int i, idct32_pass_loop;
int32_t trans_buf[32 * 8];
int32_t pass1[32 * 32];
@@ -394,7 +394,6 @@
int32_t pass2[32 * 32];
int32_t *out;
int32x4x2_t q[16];
- uint16_t *dst = CAST_TO_SHORTPTR(dest);
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++, input = pass1, out = pass2) {
@@ -637,10 +636,10 @@
}
}
-void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
if (bd == 8) {
- vpx_idct32_32_neon(input, dest, stride, 1);
+ vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
} else {
vpx_highbd_idct32_32_neon(input, dest, stride, bd);
}
--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -726,10 +726,9 @@
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 16];
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -594,10 +594,9 @@
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
-void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 8];
--- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -59,7 +59,7 @@
*dest += stride;
}
-void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -67,7 +67,6 @@
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -51,7 +51,7 @@
*dest += stride;
}
-void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 =
@@ -60,7 +60,6 @@
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
@@ -133,7 +132,7 @@
*a3 = vsubq_s32(b0, b3);
}
-void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
int32x4_t c0 = vld1q_s32(input);
@@ -140,7 +139,6 @@
int32x4_t c1 = vld1q_s32(input + 4);
int32x4_t c2 = vld1q_s32(input + 8);
int32x4_t c3 = vld1q_s32(input + 12);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int16x8_t a0, a1;
if (bd == 8) {
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -36,7 +36,7 @@
*dest += stride;
}
-void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -44,7 +44,6 @@
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
const int16x8_t dc = vdupq_n_s16(a1);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (a1 >= 0) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
@@ -292,9 +291,8 @@
vst1q_u16(dest, d7_u16);
}
-void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int32x4_t a0 = vld1q_s32(input);
int32x4_t a1 = vld1q_s32(input + 8);
int32x4_t a2 = vld1q_s32(input + 16);
@@ -553,9 +551,8 @@
*io7 = vsubq_s32(step1[0], step2[7]);
}
-void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int32x4_t a0 = vld1q_s32(input);
int32x4_t a1 = vld1q_s32(input + 4);
int32x4_t a2 = vld1q_s32(input + 8);
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -1290,7 +1290,7 @@
return 0;
}
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
@@ -1299,7 +1299,6 @@
tran_high_t a1, b1, c1, d1, e1;
const tran_low_t *ip = input;
tran_low_t *op = output;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
for (i = 0; i < 4; i++) {
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1348,7 +1347,7 @@
}
}
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1, e1;
@@ -1355,7 +1354,6 @@
tran_low_t tmp[4];
const tran_low_t *ip = in;
tran_low_t *op = tmp;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
(void)bd;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1452,13 +1450,12 @@
output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 4; ++i) {
@@ -1478,13 +1475,12 @@
}
}
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -1636,13 +1632,12 @@
output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// First transform rows
for (i = 0; i < 8; ++i) {
@@ -1662,13 +1657,12 @@
}
}
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// First transform rows
// Only first 4 row has non-zero coefs
@@ -1689,13 +1683,12 @@
}
}
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
@@ -2056,13 +2049,12 @@
output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// First transform rows
for (i = 0; i < 16; ++i) {
@@ -2082,13 +2074,12 @@
}
}
-void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *const dest = CAST_TO_SHORTPTR(dest8);
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 8x8 area, we only need to calculate first 8 rows here.
@@ -2111,13 +2102,12 @@
}
}
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -2138,13 +2128,12 @@
}
}
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_high_t a1;
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -2531,13 +2520,12 @@
output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 32; ++i) {
@@ -2569,13 +2557,12 @@
}
}
-void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *const dest = CAST_TO_SHORTPTR(dest8);
// Rows
// Only upper-left 16x16 has non-zero coeff
@@ -2598,13 +2585,12 @@
}
}
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Rows
// Only upper-left 8x8 has non-zero coeff
@@ -2625,11 +2611,10 @@
}
}
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i, j;
int a1;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
tran_low_t out =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -629,39 +629,39 @@
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
specialize qw/vpx_iwht4x4_16_add sse2/;
- add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct4x4_1_add neon/;
- add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct8x8_1_add neon/;
- add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct16x16_1_add neon/;
- add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
- add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3364,7 +3364,7 @@
return retval;
}
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
@@ -3373,7 +3373,6 @@
__m128i sign_bits[2];
__m128i temp_mm, min_input, max_input;
int test;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int optimised_cols = 0;
const __m128i zero = _mm_set1_epi16(0);
const __m128i eight = _mm_set1_epi16(8);
@@ -3479,7 +3478,7 @@
}
}
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
@@ -3486,7 +3485,6 @@
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
@@ -3579,7 +3577,7 @@
}
}
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
@@ -3586,7 +3584,6 @@
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
@@ -3682,7 +3679,7 @@
}
}
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
@@ -3689,7 +3686,6 @@
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
@@ -3795,7 +3791,7 @@
}
}
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
@@ -3802,7 +3798,6 @@
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
@@ -3913,7 +3908,7 @@
}
}
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
__m128i dc_value, d;
const __m128i zero = _mm_setzero_si128();
@@ -3920,7 +3915,6 @@
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a, i, j;
- uint16_t *dest = CAST_TO_SHORTPTR(dest8);
tran_low_t out;
out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);