ref: f3a73f1277f5b365a3c8965c8c57df7a2bd0a583
parent: ce3f4ade670cf02e05998f4ca50e08736802f5e7
parent: f18322262f212819cde518b64e3cd70471b259b1
author: Debargha Mukherjee <[email protected]>
date: Wed Oct 7 12:28:36 EDT 2015
Merge "Backports highbitdepth accelerations into vp10"
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -87,65 +87,127 @@
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp10_iht4x4_16_add/;
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+ add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp10_iht4x4_16_add/;
- add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp10_iht8x8_64_add/;
+ add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp10_iht8x8_64_add/;
- add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp10_iht16x16_256_add/;
+ add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp10_iht16x16_256_add/;
- add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct4x4 sse2/;
+ add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4/;
- add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct4x4_1 sse2/;
+ add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4_1/;
- add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct8x8 sse2/;
+ add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8/;
- add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct8x8_1 sse2/;
+ add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8_1/;
- add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct16x16 sse2/;
+ add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16/;
- add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct16x16_1 sse2/;
+ add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16_1/;
- add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct32x32 sse2/;
+ add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32/;
- add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct32x32_rd sse2/;
+ add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_rd/;
- add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fdct32x32_1 sse2/;
+ add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_1/;
- add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct4x4 sse2/;
+ add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct4x4/;
- add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct8x8 sse2/;
+ add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8/;
- add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct8x8_1/;
+ add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8_1/;
- add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct16x16 sse2/;
+ add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16/;
- add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct16x16_1/;
+ add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16_1/;
- add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct32x32 sse2/;
+ add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32/;
- add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+ add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_rd/;
- add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_highbd_fdct32x32_1/;
+ add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_1/;
+ } else {
+ add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp10_iht4x4_16_add sse2/;
+
+ add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp10_iht8x8_64_add sse2/;
+
+ add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp10_iht16x16_256_add/;
+
+ add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4 sse2/;
+
+ add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct4x4_1 sse2/;
+
+ add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8 sse2/;
+
+ add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct8x8_1 sse2/;
+
+ add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16 sse2/;
+
+ add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct16x16_1 sse2/;
+
+ add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32 sse2/;
+
+ add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_fdct32x32_1 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct4x4 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct8x8_1/;
+
+ add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct16x16_1/;
+
+ add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32 sse2/;
+
+ add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp10_highbd_fdct32x32_1/;
+ }
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -12,14 +12,14 @@
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
-void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[2];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadu_si128((const __m128i *)(input));
- in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
@@ -77,21 +77,21 @@
}
}
-void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 8 * 1);
+ in[2] = load_input_data(input + 8 * 2);
+ in[3] = load_input_data(input + 8 * 3);
+ in[4] = load_input_data(input + 8 * 4);
+ in[5] = load_input_data(input + 8 * 5);
+ in[6] = load_input_data(input + 8 * 6);
+ in[7] = load_input_data(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
@@ -144,8 +144,8 @@
RECON_AND_STORE(dest + 7 * stride, in[7]);
}
-void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);