ref: fc18cf7a11c3065eb676a134a6c2b6f3160b27d0
parent: 08d5cf226e8a2e6ece6f49276ffddedc91b04ae9
parent: d19033fa4e46a2a97adcf752ccebe79bc86662a9
author: Jingning Han <[email protected]>
date: Tue Jul 28 20:06:37 EDT 2015
Merge "Move DC only forward 2D-DCT functions to vpx_dsp"
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -507,18 +507,6 @@
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
-
- add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4_1 sse2/;
-
- add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8_1 sse2/;
-
- add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16_1 sse2/;
-
- add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_1 sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2 msa/;
@@ -531,18 +519,6 @@
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
-
- add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4_1 sse2/;
-
- add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8_1 sse2 neon msa/;
-
- add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16_1 sse2 msa/;
-
- add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_1 sse2 msa/;
}
#
@@ -595,15 +571,6 @@
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
-
- add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct8x8_1/;
-
- add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct16x16_1/;
-
- add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct32x32_1/;
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -9,29 +9,13 @@
*/
#include <arm_neon.h>
+
#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_dsp/txfm_common.h"
-
-void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
- int r;
- int16x8_t sum = vld1q_s16(&input[0]);
- for (r = 1; r < 8; ++r) {
- const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
- sum = vaddq_s16(sum, input_00);
- }
- {
- const int32x4_t a = vpaddlq_s16(sum);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
- output[1] = 0;
- }
-}
void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
int16_t* coeff_ptr, intptr_t n_coeffs,
--- a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -10,19 +10,9 @@
#include <assert.h>
-#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
#include "vpx_dsp/mips/fwd_txfm_msa.h"
-
-void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[1] = 0;
-
- out[0] = LD_HADD(input, stride);
- out[0] += LD_HADD(input + 8, stride);
- out[0] += LD_HADD(input + 16 * 8, stride);
- out[0] += LD_HADD(input + 16 * 8 + 8, stride);
- out[0] >>= 1;
-}
static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
const int32_t *const0, int16_t *int_buf) {
--- a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
-
-void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[1] = 0;
-
- out[0] = LD_HADD(input, stride);
- out[0] += LD_HADD(input + 8, stride);
- out[0] += LD_HADD(input + 16, stride);
- out[0] += LD_HADD(input + 24, stride);
- out[0] += LD_HADD(input + 32 * 8, stride);
- out[0] += LD_HADD(input + 32 * 8 + 8, stride);
- out[0] += LD_HADD(input + 32 * 8 + 16, stride);
- out[0] += LD_HADD(input + 32 * 8 + 24, stride);
- out[0] += LD_HADD(input + 32 * 16, stride);
- out[0] += LD_HADD(input + 32 * 16 + 8, stride);
- out[0] += LD_HADD(input + 32 * 16 + 16, stride);
- out[0] += LD_HADD(input + 32 * 16 + 24, stride);
- out[0] += LD_HADD(input + 32 * 24, stride);
- out[0] += LD_HADD(input + 32 * 24 + 8, stride);
- out[0] += LD_HADD(input + 32 * 24 + 16, stride);
- out[0] += LD_HADD(input + 32 * 24 + 24, stride);
- out[0] >>= 3;
-}
--- a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -10,7 +10,7 @@
#include <assert.h>
-#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
--- a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
+++ b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -10,13 +10,8 @@
#include <assert.h>
-#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
-
-void vp9_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[0] = LD_HADD(input, stride);
- out[1] = 0;
-}
void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
int32_t tx_type) {
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -81,21 +81,6 @@
out5 = -out5; \
}
-#define LD_HADD(psrc, stride) ({ \
- v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
- v4i32 vec_w_m; \
- \
- LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
- ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
- LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
- ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \
- in4_m, in6_m, in0_m, in4_m); \
- in0_m += in4_m; \
- \
- vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
- HADD_SW_S32(vec_w_m); \
-})
-
#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -529,17 +529,6 @@
{ fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 4; ++r)
- for (c = 0; c < 4; ++c)
- sum += input[r * stride + c];
-
- output[0] = sum << 1;
- output[1] = 0;
-}
-
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -572,17 +561,6 @@
}
}
-void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 8; ++r)
- for (c = 0; c < 8; ++c)
- sum += input[r * stride + c];
-
- output[0] = sum;
- output[1] = 0;
-}
-
void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
@@ -695,17 +673,6 @@
*eob_ptr = eob + 1;
}
-void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 16; ++r)
- for (c = 0; c < 16; ++c)
- sum += input[r * stride + c];
-
- output[0] = sum >> 1;
- output[1] = 0;
-}
-
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -828,16 +795,6 @@
vp9_fht4x4_c(input, output, stride, tx_type);
}
-void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- vp9_fdct8x8_1_c(input, final_output, stride);
-}
-
-void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
- int stride) {
- vp9_fdct16x16_1_c(input, output, stride);
-}
-
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
vp9_fht8x8_c(input, output, stride, tx_type);
@@ -851,10 +808,5 @@
void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
vp9_fht16x16_c(input, output, stride, tx_type);
-}
-
-void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
- int stride) {
- vp9_fdct32x32_1_c(input, out, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -18,35 +18,6 @@
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
-void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0, in1;
- __m128i tmp;
- const __m128i zero = _mm_setzero_si128();
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
- (input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
- (input + 3 * stride)));
-
- tmp = _mm_add_epi16(in0, in1);
- in0 = _mm_unpacklo_epi16(zero, tmp);
- in1 = _mm_unpackhi_epi16(zero, tmp);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(tmp, zero);
- in1 = _mm_unpackhi_epi32(tmp, zero);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(tmp, 8);
-
- in1 = _mm_add_epi32(tmp, in0);
- in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
-}
-
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
@@ -212,46 +183,6 @@
}
}
-void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i u0, u1, sum;
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(u0, u1);
-
- in0 = _mm_add_epi16(in0, in1);
- in2 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, in0);
-
- u0 = _mm_setzero_si128();
- sum = _mm_add_epi16(sum, in2);
-
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
-}
-
void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
int16_t* coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t* zbin_ptr,
@@ -1239,75 +1170,6 @@
}
}
-void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 2; ++i) {
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
-}
-
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
@@ -2193,76 +2055,4 @@
assert(0);
break;
}
-}
-
-void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 8; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
}
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -136,7 +136,6 @@
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -201,3 +201,20 @@
vst1q_s16(&final_output[7 * 8], input_7);
}
}
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+ output[1] = 0;
+ }
+}
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -77,6 +77,17 @@
}
}
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 4; ++r)
+ for (c = 0; c < 4; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum << 1;
+ output[1] = 0;
+}
+
void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
int i, j;
tran_low_t intermediate[64];
@@ -163,6 +174,17 @@
}
}
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 8; ++r)
+ for (c = 0; c < 8; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum;
+ output[1] = 0;
+}
+
void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -343,6 +365,17 @@
}
}
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 16; ++r)
+ for (c = 0; c < 16; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum >> 1;
+ output[1] = 0;
+}
+
static INLINE tran_high_t dct_32_round(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
@@ -679,17 +712,6 @@
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 32; ++r)
- for (c = 0; c < 32; ++c)
- sum += input[r * stride + c];
-
- output[0] = sum >> 3;
- output[1] = 0;
-}
-
void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
tran_high_t output[32 * 32];
@@ -747,6 +769,17 @@
}
}
+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+ int r, c;
+ tran_low_t sum = 0;
+ for (r = 0; r < 32; ++r)
+ for (c = 0; c < 32; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum >> 3;
+ output[1] = 0;
+}
+
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
@@ -758,11 +791,21 @@
vp9_fdct8x8_c(input, final_output, stride);
}
+void vp9_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride) {
vp9_fdct16x16_c(input, output, stride);
}
+void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_1_c(input, output, stride);
+}
+
void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
vp9_fdct32x32_c(input, out, stride);
}
@@ -770,5 +813,10 @@
void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride) {
vp9_fdct32x32_rd_c(input, out, stride);
+}
+
+void vp9_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ vp9_fdct32x32_1_c(input, out, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -931,3 +931,25 @@
out + (8 * i * 32));
}
}
+
+void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[1] = 0;
+
+ out[0] = LD_HADD(input, stride);
+ out[0] += LD_HADD(input + 8, stride);
+ out[0] += LD_HADD(input + 16, stride);
+ out[0] += LD_HADD(input + 24, stride);
+ out[0] += LD_HADD(input + 32 * 8, stride);
+ out[0] += LD_HADD(input + 32 * 8 + 8, stride);
+ out[0] += LD_HADD(input + 32 * 8 + 16, stride);
+ out[0] += LD_HADD(input + 32 * 8 + 24, stride);
+ out[0] += LD_HADD(input + 32 * 16, stride);
+ out[0] += LD_HADD(input + 32 * 16 + 8, stride);
+ out[0] += LD_HADD(input + 32 * 16 + 16, stride);
+ out[0] += LD_HADD(input + 32 * 16 + 24, stride);
+ out[0] += LD_HADD(input + 32 * 24, stride);
+ out[0] += LD_HADD(input + 32 * 24 + 8, stride);
+ out[0] += LD_HADD(input + 32 * 24 + 16, stride);
+ out[0] += LD_HADD(input + 32 * 24 + 24, stride);
+ out[0] >>= 3;
+}
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -215,6 +215,11 @@
ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
}
+void vp9_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[0] = LD_HADD(input, stride);
+ out[1] = 0;
+}
+
void vp9_fdct16x16_msa(const int16_t *input, int16_t *output,
int32_t src_stride) {
int32_t i;
@@ -229,4 +234,14 @@
for (i = 0; i < 2; ++i) {
fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
}
+}
+
+void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[1] = 0;
+
+ out[0] = LD_HADD(input, stride);
+ out[0] += LD_HADD(input + 8, stride);
+ out[0] += LD_HADD(input + 16 * 8, stride);
+ out[0] += LD_HADD(input + 16 * 8 + 8, stride);
+ out[0] >>= 1;
}
--- a/vpx_dsp/mips/fwd_txfm_msa.h
+++ b/vpx_dsp/mips/fwd_txfm_msa.h
@@ -14,6 +14,21 @@
#include "vpx_dsp/mips/txfm_macros_msa.h"
#include "vpx_dsp/txfm_common.h"
+#define LD_HADD(psrc, stride) ({ \
+ v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
+ v4i32 vec_w_m; \
+ \
+ LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
+ ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
+ LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
+ ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \
+ in4_m, in6_m, in0_m, in4_m); \
+ in0_m += in4_m; \
+ \
+ vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
+ HADD_SW_S32(vec_w_m); \
+})
+
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \
v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -450,12 +450,21 @@
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4 sse2/;
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1 sse2/;
+
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8 sse2/;
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1 sse2/;
+
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16 sse2/;
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1 sse2/;
+
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2/;
@@ -462,6 +471,9 @@
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd sse2/;
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1 sse2/;
+
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct4x4 sse2/;
@@ -468,29 +480,50 @@
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8 sse2/;
+ add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_highbd_fdct8x8_1/;
+
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16 sse2/;
+ add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_highbd_fdct16x16_1/;
+
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_rd sse2/;
+
+ add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_highbd_fdct32x32_1/;
} else {
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4 sse2 msa/;
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1 sse2/;
+
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1 sse2 neon msa/;
+
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16 sse2 msa/;
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1 sse2 msa/;
+
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2 avx2 msa/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/;
+
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1 sse2 msa/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -8,7 +8,221 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ store_output(&in0, output);
+}
+
+void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ store_output(&in1, output);
+}
+
+void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ input += 8 * i;
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ store_output(&in1, output);
+}
+
+void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ store_output(&in1, output);
+}
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vp9_fdct4x4_sse2