ref: 1e3a93e72e9fe9048bcff1deeb86afebdbc04921
parent: 469986f96399cbd2cf929e7e6c418196184e7ffa
parent: c338f3635e5b259fec57b8406f1416a863a4b04b
author: Linfeng Zhang <[email protected]>
date: Fri Jun 30 16:49:19 EDT 2017
Merge changes I5d038b4f,I9d00d1dd,I0722841d,I1f640db7

* changes:
  Add vpx_highbd_idct8x8_{12, 64}_add_sse4_1
  sse2: Add transpose_32bit_4x4x2() and update transpose_32bit_4x4()
  Refactor highbd idct 4x4 sse4.1 code and add highbd_inv_txfm_sse4.h
  Refactor vpx_idct8x8_12_add_ssse3() and add inv_txfm_ssse3.h
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -210,6 +210,7 @@
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
@@ -240,7 +241,9 @@
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -655,8 +655,8 @@
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
- specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
- specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
+ specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
+ specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
$vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -105,8 +105,8 @@
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
inptr[i] = _mm_srai_epi16(inptr[i], 6);
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+ d[0] = add_clamp(d[0], inptr[i], bd);
+ d[1] = add_clamp(d[1], inptr[i + 16], bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
@@ -222,8 +222,8 @@
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
inptr[i] = _mm_srai_epi16(inptr[i], 6);
inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+ d[0] = add_clamp(d[0], inptr[i], bd);
+ d[1] = add_clamp(d[1], inptr[i + 16], bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -27,7 +27,7 @@
const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
__m128i temp1[4], temp2[4], step[4];
- transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+ transpose_32bit_4x4(io, io);
// Note: There is no 32-bit signed multiply SIMD instruction in SSE2.
// _mm_mul_epu32() is used which can only guarantee the lower 32-bit
@@ -98,7 +98,7 @@
_mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
__m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4];
- transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+ transpose_32bit_4x4(io, io);
// stage 1
temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
@@ -187,19 +187,15 @@
highbd_idct4_large_sse2(io);
highbd_idct4_large_sse2(io);
}
- io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
- io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
- recon_and_store_4(dest, io, stride, bd);
+ recon_and_store_4(io, dest, stride, bd);
}
void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- const __m128i zero = _mm_setzero_si128();
- // Faster than _mm_set1_epi16((1 << bd) - 1).
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a1, i;
tran_low_t out;
__m128i dc, d;
@@ -211,7 +207,7 @@
for (i = 0; i < 4; ++i) {
d = _mm_loadl_epi64((const __m128i *)dest);
- d = add_dc_clamp(&zero, &max, &dc, &d);
+ d = add_clamp(d, dc, bd);
_mm_storel_epi64((__m128i *)dest, d);
dest += stride;
}
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -12,15 +12,10 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-static INLINE void extend_64bit(const __m128i in,
- __m128i *const out /*out[2]*/) {
- out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
- out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
-}
-
static INLINE void highbd_idct4(__m128i *const io) {
const __m128i cospi_p16_p16 =
_mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
@@ -28,47 +23,20 @@
_mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
const __m128i cospi_p24_p24 =
_mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
- __m128i temp1[4], temp2[4], step[4];
+ __m128i temp1[4], step[4];
- transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+ transpose_32bit_4x4(io, io);
// stage 1
temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
- temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
extend_64bit(temp1[0], temp1);
- extend_64bit(temp2[0], temp2);
- temp1[0] = _mm_mul_epi32(temp1[0], cospi_p16_p16);
- temp1[1] = _mm_mul_epi32(temp1[1], cospi_p16_p16);
- temp2[0] = _mm_mul_epi32(temp2[0], cospi_p16_p16);
- temp2[1] = _mm_mul_epi32(temp2[1], cospi_p16_p16);
- temp1[0] = dct_const_round_shift_64bit(temp1[0]);
- temp1[1] = dct_const_round_shift_64bit(temp1[1]);
- temp2[0] = dct_const_round_shift_64bit(temp2[0]);
- temp2[1] = dct_const_round_shift_64bit(temp2[1]);
- step[0] = pack_4(temp1[0], temp1[1]);
- step[1] = pack_4(temp2[0], temp2[1]);
+ step[0] = multiplication_round_shift(temp1, cospi_p16_p16);
+ temp1[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ extend_64bit(temp1[0], temp1);
+ step[1] = multiplication_round_shift(temp1, cospi_p16_p16);
+ multiplication_and_add_2_ssse4_1(&io[1], &io[3], &cospi_p24_p24,
+ &cospi_p08_p08, &step[2], &step[3]);
- extend_64bit(io[1], temp1);
- extend_64bit(io[3], temp2);
- temp1[2] = _mm_mul_epi32(temp1[0], cospi_p08_p08);
- temp1[3] = _mm_mul_epi32(temp1[1], cospi_p08_p08);
- temp1[0] = _mm_mul_epi32(temp1[0], cospi_p24_p24);
- temp1[1] = _mm_mul_epi32(temp1[1], cospi_p24_p24);
- temp2[2] = _mm_mul_epi32(temp2[0], cospi_p24_p24);
- temp2[3] = _mm_mul_epi32(temp2[1], cospi_p24_p24);
- temp2[0] = _mm_mul_epi32(temp2[0], cospi_p08_p08);
- temp2[1] = _mm_mul_epi32(temp2[1], cospi_p08_p08);
- temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); // [1]*cospi_24 - [3]*cospi_8
- temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8
- temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
- temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
- temp1[0] = dct_const_round_shift_64bit(temp1[0]);
- temp1[1] = dct_const_round_shift_64bit(temp1[1]);
- temp2[0] = dct_const_round_shift_64bit(temp2[0]);
- temp2[1] = dct_const_round_shift_64bit(temp2[1]);
- step[2] = pack_4(temp1[0], temp1[1]);
- step[3] = pack_4(temp2[0], temp2[1]);
-
// stage 2
io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
@@ -99,9 +67,9 @@
} else {
highbd_idct4(io);
highbd_idct4(io);
- io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
- io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+ io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
}
- recon_and_store_4(dest, io, stride, bd);
+ recon_and_store_4(io, dest, stride, bd);
}
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -94,7 +94,7 @@
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ d[i] = add_clamp(d[i], inptr[i], bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
}
@@ -196,7 +196,7 @@
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ d[i] = add_clamp(d[i], inptr[i], bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
}
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
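+// One 1-D pass of the 8-point high bitdepth IDCT over half of the 8x8 block:
+// io[0..7] each hold four 32-bit values; the two 4x4 sub-blocks are
+// transposed first, and the butterflies then process four transform columns
+// in parallel, one per 32-bit lane.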
+static void highbd_idct8x8_half1d(__m128i *const io) {
+ const __m128i cp_4q_4q =
+ _mm_setr_epi32(cospi_4_64 << 2, 0, cospi_4_64 << 2, 0);
+ const __m128i cp_8q_8q =
+ _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+ const __m128i cp_12q_12q =
+ _mm_setr_epi32(cospi_12_64 << 2, 0, cospi_12_64 << 2, 0);
+ const __m128i cp_16q_16q =
+ _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+ const __m128i cp_20q_20q =
+ _mm_setr_epi32(cospi_20_64 << 2, 0, cospi_20_64 << 2, 0);
+ const __m128i cp_24q_24q =
+ _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+ const __m128i cp_28q_28q =
+ _mm_setr_epi32(cospi_28_64 << 2, 0, cospi_28_64 << 2, 0);
+ __m128i temp1[4], temp2[4], step1[8], step2[8];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[2] = io[4];
+ step1[1] = io[2];
+ step1[3] = io[6];
+ multiplication_and_add_2_ssse4_1(&io[1], &io[7], &cp_28q_28q, &cp_4q_4q,
+ &step1[4], &step1[7]);
+ multiplication_and_add_2_ssse4_1(&io[5], &io[3], &cp_12q_12q, &cp_20q_20q,
+ &step1[5], &step1[6]);
+
+ // stage 2
+ temp2[0] = _mm_add_epi32(step1[0], step1[2]);
+ extend_64bit(temp2[0], temp1);
+ step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+ temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
+ extend_64bit(temp2[0], temp1);
+ step2[1] = multiplication_round_shift(temp1, cp_16q_16q);
+ multiplication_and_add_2_ssse4_1(&step1[1], &step1[3], &cp_24q_24q, &cp_8q_8q,
+ &step2[2], &step2[3]);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+ extend_64bit(temp2[0], temp1);
+ step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+ temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+ extend_64bit(temp2[0], temp1);
+ step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+ step1[7] = step2[7];
+
+ // stage 4
+ io[0] = _mm_add_epi32(step1[0], step1[7]);
+ io[1] = _mm_add_epi32(step1[1], step1[6]);
+ io[2] = _mm_add_epi32(step1[2], step1[5]);
+ io[3] = _mm_add_epi32(step1[3], step1[4]);
+ io[4] = _mm_sub_epi32(step1[3], step1[4]);
+ io[5] = _mm_sub_epi32(step1[2], step1[5]);
+ io[6] = _mm_sub_epi32(step1[1], step1[6]);
+ io[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
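+// Same 1-D pass, specialized for the idct8x8_12 case where only the
+// top-left 4x4 of the 8x8 input is nonzero: step2[1] equals step2[0], so
+// several multiplies and additions drop out.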
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+ const __m128i cp_28q_28q =
+ _mm_setr_epi32(cospi_28_64 << 2, 0, cospi_28_64 << 2, 0);
+ const __m128i cp_4q_4q =
+ _mm_setr_epi32(cospi_4_64 << 2, 0, cospi_4_64 << 2, 0);
+ const __m128i cp_n20q_n20q =
+ _mm_setr_epi32(-cospi_20_64 << 2, 0, -cospi_20_64 << 2, 0);
+ const __m128i cp_12q_12q =
+ _mm_setr_epi32(cospi_12_64 << 2, 0, cospi_12_64 << 2, 0);
+ const __m128i cp_16q_16q =
+ _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+ const __m128i cp_8q_8q =
+ _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+ const __m128i cp_24q_24q =
+ _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+ __m128i temp1[4], temp2[4], step1[8], step2[8];
+
+ transpose_32bit_4x4(io, io);
+
+ // stage 1
+ step1[0] = io[0];
+ step1[1] = io[2];
+ extend_64bit(io[1], temp1);
+ step1[4] = multiplication_round_shift(temp1, cp_28q_28q);
+ step1[7] = multiplication_round_shift(temp1, cp_4q_4q);
+ extend_64bit(io[3], temp1);
+ step1[5] = multiplication_round_shift(temp1, cp_n20q_n20q);
+ step1[6] = multiplication_round_shift(temp1, cp_12q_12q);
+
+ // stage 2
+ extend_64bit(step1[0], temp1);
+ step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+ extend_64bit(step1[1], temp1);
+ step2[2] = multiplication_round_shift(temp1, cp_24q_24q);
+ step2[3] = multiplication_round_shift(temp1, cp_8q_8q);
+ step2[4] = _mm_add_epi32(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+ step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi32(step2[0], step2[3]);
+ step1[1] = _mm_add_epi32(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+ step1[4] = step2[4];
+ temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+ extend_64bit(temp2[0], temp1);
+ step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+ temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+ extend_64bit(temp2[0], temp1);
+ step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+ step1[7] = step2[7];
+
+ // stage 4
+ io[0] = _mm_add_epi32(step1[0], step1[7]);
+ io[1] = _mm_add_epi32(step1[1], step1[6]);
+ io[2] = _mm_add_epi32(step1[2], step1[5]);
+ io[3] = _mm_add_epi32(step1[3], step1[4]);
+ io[4] = _mm_sub_epi32(step1[3], step1[4]);
+ io[5] = _mm_sub_epi32(step1[2], step1[5]);
+ io[6] = _mm_sub_epi32(step1[1], step1[6]);
+ io[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
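+ // For bd == 8 the intermediate values fit in 16 bits, so pack the
+ // coefficients and reuse the faster 16-bit SSE2 idct8 path; otherwise run
+ // the 32-bit SSE4.1 passes below.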
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ idct8_sse2(io_short);
+ idct8_sse2(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_half1d(io);
+
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+ highbd_idct8x8_half1d(&io[8]);
+
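+ // Second pass: regroup so that io[0..7] holds columns 0-3 of all eight
+ // rows and io[8..15] holds columns 4-7, then transform each half along
+ // the columns.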
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+ highbd_idct8x8_half1d(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_half1d(&io[8]);
+
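+ // io[i] and io[8 + i] now hold the left and right halves of output row i:
+ // add the rounding of 16, shift right by 5, and pack each row into eight
+ // 16-bit residuals.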
+ io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+ io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+ io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+ io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+ io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+ io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+ io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+ io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+ }
+
+ recon_and_store_8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], zero);
+ io_short[1] = _mm_packs_epi32(io[1], zero);
+ io_short[2] = _mm_packs_epi32(io[2], zero);
+ io_short[3] = _mm_packs_epi32(io[3], zero);
+
+ idct8x8_12_add_kernel_ssse3(io_short);
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ highbd_idct8x8_12_half1d(io);
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ highbd_idct8x8_12_half1d(io);
+
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_idct8x8_12_half1d(&io[8]);
+
+ io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+ io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+ io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+ io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+ io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+ io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+ io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+ io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+ }
+
+ recon_and_store_8(io, dest, stride, bd);
+}
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -17,8 +17,14 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE __m128i wraplow_16bit(const __m128i in0, const __m128i in1,
- const __m128i rounding) {
+static INLINE void extend_64bit(const __m128i in,
+ __m128i *const out /*out[2]*/) {
+ out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
+ out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
+}
+
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
__m128i temp[2];
temp[0] = _mm_add_epi32(in0, rounding);
temp[1] = _mm_add_epi32(in1, rounding);
@@ -27,6 +33,16 @@
return _mm_packs_epi32(temp[0], temp[1]);
}
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 5);
+ temp[1] = _mm_srai_epi32(temp[1], 5);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
const __m128i t = _mm_add_epi64(
in,
@@ -40,24 +56,24 @@
return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
}
-static INLINE __m128i add_dc_clamp(const __m128i *const min,
- const __m128i *const max,
- const __m128i *const dc,
- const __m128i *const in) {
- __m128i out;
- out = _mm_adds_epi16(*in, *dc);
- out = _mm_max_epi16(out, *min);
- out = _mm_min_epi16(out, *max);
- return out;
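+// Saturating add of two vectors of eight 16-bit values, clamped to the
+// valid pixel range [0, (1 << bd) - 1].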
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+ const int bd) {
+ const __m128i zero = _mm_set1_epi16(0);
+ // Faster than _mm_set1_epi16((1 << bd) - 1).
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i d;
+
+ d = _mm_adds_epi16(in0, in1);
+ d = _mm_max_epi16(d, zero);
+ d = _mm_min_epi16(d, max);
+
+ return d;
}
static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
uint16_t *dest, int stride, int bd,
const int size) {
- const __m128i zero = _mm_setzero_si128();
- // Faster than _mm_set1_epi16((1 << bd) - 1).
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a1, i, j;
tran_low_t out;
__m128i dc, d;
@@ -70,7 +86,7 @@
for (i = 0; i < size; ++i) {
for (j = 0; j < (size >> 3); ++j) {
d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
- d = add_dc_clamp(&zero, &max, &dc, &d);
+ d = add_clamp(d, dc, bd);
_mm_store_si128((__m128i *)(&dest[j * 8]), d);
}
dest += stride;
@@ -77,36 +93,47 @@
}
}
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
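+// The low and high 64 bits of `in` carry the residuals for two consecutive
+// 4-pixel rows; load both destination rows, add with clamping, and store
+// them back.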
+static INLINE void recon_and_store_4_dual(const __m128i in,
+ uint16_t *const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ d = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}
-static INLINE void recon_and_store_4(uint16_t *const dest,
- const __m128i *const io, const int stride,
- int bd) {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 =
- _mm_unpacklo_epi64(d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
- _mm_storel_epi64((__m128i *)dest, d0);
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4_dual(in[0], dest, stride, bd);
+ dest += 2 * stride;
+ recon_and_store_4_dual(in[1], dest, stride, bd);
+}
+
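+// Add one row of 16-bit residuals to eight pixels at *dest, clamp to
+// [0, (1 << bd) - 1], store, and advance *dest by stride.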
+static INLINE void recon_and_store_8_kernel(const __m128i in,
+ uint16_t **const dest,
+ const int stride, const int bd) {
+ __m128i d;
+
+ d = _mm_load_si128((const __m128i *)(*dest));
+ d = add_clamp(d, in, bd);
+ _mm_store_si128((__m128i *)(*dest), d);
+ *dest += stride;
+}
+
+static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8_kernel(in[0], &dest, stride, bd);
+ recon_and_store_8_kernel(in[1], &dest, stride, bd);
+ recon_and_store_8_kernel(in[2], &dest, stride, bd);
+ recon_and_store_8_kernel(in[3], &dest, stride, bd);
+ recon_and_store_8_kernel(in[4], &dest, stride, bd);
+ recon_and_store_8_kernel(in[5], &dest, stride, bd);
+ recon_and_store_8_kernel(in[6], &dest, stride, bd);
+ recon_and_store_8_kernel(in[7], &dest, stride, bd);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
--- /dev/null
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h> // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
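+// Multiply both 64-bit-extended halves of a vector (see extend_64bit) by the
+// cospi constant, apply the DCT rounding shift, and pack the four 32-bit
+// results back into one vector.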
+static INLINE __m128i multiplication_round_shift(const __m128i *const in,
+ const __m128i cospi) {
+ __m128i t0, t1;
+ t0 = _mm_mul_epi32(in[0], cospi);
+ t1 = _mm_mul_epi32(in[1], cospi);
+ t0 = dct_const_round_shift_64bit(t0);
+ t1 = dct_const_round_shift_64bit(t1);
+ return pack_4(t0, t1);
+}
+
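+// Butterfly with 64-bit intermediates:
+//   *out0 = *in0 * *cst0 - *in1 * *cst1
+//   *out1 = *in0 * *cst1 + *in1 * *cst0
+// with the DCT rounding shift applied to each result.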
+static INLINE void multiplication_and_add_2_ssse4_1(const __m128i *const in0,
+ const __m128i *const in1,
+ const __m128i *const cst0,
+ const __m128i *const cst1,
+ __m128i *const out0,
+ __m128i *const out1) {
+ __m128i temp1[4], temp2[4];
+ extend_64bit(*in0, temp1);
+ extend_64bit(*in1, temp2);
+ temp1[2] = _mm_mul_epi32(temp1[0], *cst1);
+ temp1[3] = _mm_mul_epi32(temp1[1], *cst1);
+ temp1[0] = _mm_mul_epi32(temp1[0], *cst0);
+ temp1[1] = _mm_mul_epi32(temp1[1], *cst0);
+ temp2[2] = _mm_mul_epi32(temp2[0], *cst0);
+ temp2[3] = _mm_mul_epi32(temp2[1], *cst0);
+ temp2[0] = _mm_mul_epi32(temp2[0], *cst1);
+ temp2[1] = _mm_mul_epi32(temp2[1], *cst1);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ *out0 = pack_4(temp1[0], temp1[1]);
+ *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -152,28 +152,34 @@
_mm_storel_epi64((__m128i *)(dest), d0);
}
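+// Final rounding of the 8x8 inverse transform: add 16 and arithmetic-shift
+// each row right by 5.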
+static INLINE void round_shift_8x8(const __m128i *const in,
+ __m128i *const out) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ out[0] = _mm_add_epi16(in[0], final_rounding);
+ out[1] = _mm_add_epi16(in[1], final_rounding);
+ out[2] = _mm_add_epi16(in[2], final_rounding);
+ out[3] = _mm_add_epi16(in[3], final_rounding);
+ out[4] = _mm_add_epi16(in[4], final_rounding);
+ out[5] = _mm_add_epi16(in[5], final_rounding);
+ out[6] = _mm_add_epi16(in[6], final_rounding);
+ out[7] = _mm_add_epi16(in[7], final_rounding);
+
+ out[0] = _mm_srai_epi16(out[0], 5);
+ out[1] = _mm_srai_epi16(out[1], 5);
+ out[2] = _mm_srai_epi16(out[2], 5);
+ out[3] = _mm_srai_epi16(out[3], 5);
+ out[4] = _mm_srai_epi16(out[4], 5);
+ out[5] = _mm_srai_epi16(out[5], 5);
+ out[6] = _mm_srai_epi16(out[6], 5);
+ out[7] = _mm_srai_epi16(out[7], 5);
+}
+
static INLINE void write_buffer_8x8(const __m128i *const in,
uint8_t *const dest, const int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
__m128i t[8];
- // Final rounding and shift
- t[0] = _mm_adds_epi16(in[0], final_rounding);
- t[1] = _mm_adds_epi16(in[1], final_rounding);
- t[2] = _mm_adds_epi16(in[2], final_rounding);
- t[3] = _mm_adds_epi16(in[3], final_rounding);
- t[4] = _mm_adds_epi16(in[4], final_rounding);
- t[5] = _mm_adds_epi16(in[5], final_rounding);
- t[6] = _mm_adds_epi16(in[6], final_rounding);
- t[7] = _mm_adds_epi16(in[7], final_rounding);
- t[0] = _mm_srai_epi16(t[0], 5);
- t[1] = _mm_srai_epi16(t[1], 5);
- t[2] = _mm_srai_epi16(t[2], 5);
- t[3] = _mm_srai_epi16(t[3], 5);
- t[4] = _mm_srai_epi16(t[4], 5);
- t[5] = _mm_srai_epi16(t[5], 5);
- t[6] = _mm_srai_epi16(t[6], 5);
- t[7] = _mm_srai_epi16(t[7], 5);
+ round_shift_8x8(in, t);
recon_and_store(dest + 0 * stride, t[0]);
recon_and_store(dest + 1 * stride, t[1]);
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -12,103 +12,21 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
- const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
- const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
- const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
- const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
- const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
- const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
- const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
- const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
- const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
- const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
- __m128i in[8], step1[8], step2[8], tmp[4];
+ __m128i io[8];
- in[0] = load_input_data4(input + 0 * 8);
- in[1] = load_input_data4(input + 1 * 8);
- in[2] = load_input_data4(input + 2 * 8);
- in[3] = load_input_data4(input + 3 * 8);
+ io[0] = load_input_data4(input + 0 * 8);
+ io[1] = load_input_data4(input + 1 * 8);
+ io[2] = load_input_data4(input + 2 * 8);
+ io[3] = load_input_data4(input + 3 * 8);
- // pass 1
-
- transpose_16bit_4x4(in, in);
- // in[0]: 00 10 20 30 01 11 21 31
- // in[1]: 02 12 22 32 03 13 23 33
-
- // stage 1
- tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
- tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
- tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
- tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
- step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
- step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
-
- // stage 2
- step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
- step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
- step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
- step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
- step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
-
- // stage 3
- tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
- step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
- tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
- tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
- step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
- step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
-
- // stage 4
- tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
- tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
- tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
- tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
-
- // pass 2
-
- idct8x8_12_transpose_16bit_4x8(tmp, in);
-
- // stage 1
- step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
- step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
- step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
- step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
-
- // stage 2
- step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d); // step2[1] = step2[0]
- step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
- step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
- step2[4] = _mm_add_epi16(step1[4], step1[5]);
- step2[5] = _mm_sub_epi16(step1[4], step1[5]);
- step2[6] = _mm_sub_epi16(step1[7], step1[6]);
- step2[7] = _mm_add_epi16(step1[7], step1[6]);
-
- // stage 3
- step1[0] = _mm_add_epi16(step2[0], step2[3]);
- step1[1] = _mm_add_epi16(step2[0], step2[2]);
- step1[2] = _mm_sub_epi16(step2[0], step2[2]);
- step1[3] = _mm_sub_epi16(step2[0], step2[3]);
- multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
- &step1[5], &step1[6]);
-
- // stage 4
- in[0] = _mm_add_epi16(step1[0], step2[7]);
- in[1] = _mm_add_epi16(step1[1], step1[6]);
- in[2] = _mm_add_epi16(step1[2], step1[5]);
- in[3] = _mm_add_epi16(step1[3], step2[4]);
- in[4] = _mm_sub_epi16(step1[3], step2[4]);
- in[5] = _mm_sub_epi16(step1[2], step1[5]);
- in[6] = _mm_sub_epi16(step1[1], step1[6]);
- in[7] = _mm_sub_epi16(step1[0], step2[7]);
-
- write_buffer_8x8(in, dest, stride);
+ idct8x8_12_add_kernel_ssse3(io);
+ write_buffer_8x8(io, dest, stride);
}
static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
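+// In-place 8x8 IDCT kernel for blocks whose nonzero coefficients are
+// confined to the top-left 4x4. On entry io[0..3] hold the four nonzero
+// coefficient rows (four values each); on exit io[0..7] hold the eight
+// output rows before final rounding.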
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+ const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+ const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+ const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+ const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+ const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+ const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
+ __m128i step1[8], step2[8], tmp[4];
+
+ // pass 1
+
+ transpose_16bit_4x4(io, io);
+ // io[0]: 00 10 20 30 01 11 21 31
+ // io[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+ tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+ tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+ tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
+
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ // pass 2
+
+ idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+ // stage 1
+ step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+ &step1[5], &step1[6]);
+
+ // stage 4
+ io[0] = _mm_add_epi16(step1[0], step2[7]);
+ io[1] = _mm_add_epi16(step1[1], step1[6]);
+ io[2] = _mm_add_epi16(step1[2], step1[5]);
+ io[3] = _mm_add_epi16(step1[3], step2[4]);
+ io[4] = _mm_sub_epi16(step1[3], step2[4]);
+ io[5] = _mm_sub_epi16(step1[2], step1[5]);
+ io[6] = _mm_sub_epi16(step1[1], step1[6]);
+ io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -21,16 +21,16 @@
// in[2]: 20 21 22 23 XX XX XX XX
// in[3]: 30 31 32 33 XX XX XX XX
// to:
- // tr0_0: 00 10 01 11 02 12 03 13
- // tr0_1: 20 30 21 31 22 32 23 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
// Unpack 32 bit elements resulting in:
// out[0]: 00 10 20 30 01 11 21 31
// out[1]: 02 12 22 32 03 13 23 33
- out[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_unpackhi_epi32(a0, a1);
}
static INLINE void transpose_16bit_4x8(const __m128i *const in,
@@ -45,24 +45,24 @@
// in[6]: 60 61 62 63 XX XX XX XX
// in[7]: 70 71 72 73 XX XX XX XX
// to:
- // tr0_0: 00 10 01 11 02 12 03 13
- // tr0_1: 20 30 21 31 22 32 23 33
- // tr0_2: 40 50 41 51 42 52 43 53
- // tr0_3: 60 70 61 71 62 72 63 73
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
// Unpack 32 bit elements resulting in:
- // tr1_0: 00 10 20 30 01 11 21 31
- // tr1_1: 40 50 60 70 41 51 61 71
- // tr1_2: 02 12 22 32 03 13 23 33
- // tr1_3: 42 52 62 72 43 53 63 73
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
// Unpack 64 bit elements resulting in:
// out[0]: 00 10 20 30 40 50 60 70
@@ -69,10 +69,10 @@
// out[1]: 01 11 21 31 41 51 61 71
// out[2]: 02 12 22 32 42 52 62 72
// out[3]: 03 13 23 33 43 53 63 73
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
}
static INLINE void transpose_16bit_8x8(const __m128i *const in,
@@ -87,40 +87,40 @@
// in[6]: 60 61 62 63 64 65 66 67
// in[7]: 70 71 72 73 74 75 76 77
// to:
- // tr0_0: 00 10 01 11 02 12 03 13
- // tr0_1: 20 30 21 31 22 32 23 33
- // tr0_2: 40 50 41 51 42 52 43 53
- // tr0_3: 60 70 61 71 62 72 63 73
- // tr0_4: 04 14 05 15 06 16 07 17
- // tr0_5: 24 34 25 35 26 36 27 37
- // tr0_6: 44 54 45 55 46 56 47 57
- // tr0_7: 64 74 65 75 66 76 67 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_4 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_5 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
// Unpack 32 bit elements resulting in:
- // tr1_0: 00 10 20 30 01 11 21 31
- // tr1_1: 40 50 60 70 41 51 61 71
- // tr1_2: 04 14 24 34 05 15 25 35
- // tr1_3: 44 54 64 74 45 55 65 75
- // tr1_4: 02 12 22 32 03 13 23 33
- // tr1_5: 42 52 62 72 43 53 63 73
- // tr1_6: 06 16 26 36 07 17 27 37
- // tr1_7: 46 56 66 76 47 57 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_3 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_5 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
// Unpack 64 bit elements resulting in:
// out[0]: 00 10 20 30 40 50 60 70
@@ -131,14 +131,14 @@
// out[5]: 05 15 25 35 45 55 65 75
// out[6]: 06 16 26 36 46 56 66 76
// out[7]: 07 17 27 37 47 57 67 77
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- out[2] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- out[3] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- out[4] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- out[5] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- out[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
}
// Transpose in-place
@@ -160,33 +160,81 @@
left[15] = tbuf[7];
}
-static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
- __m128i *const a2, __m128i *const a3) {
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
// Unpack 32 bit elements. Goes from:
- // a0: 00 01 02 03
- // a1: 10 11 12 13
- // a2: 20 21 22 23
- // a3: 30 31 32 33
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
// to:
- // b0: 00 10 01 11
- // b1: 20 30 21 31
- // b2: 02 12 03 13
- // b3: 22 32 23 33
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
- const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
- const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
- const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
- const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
// Unpack 64 bit elements resulting in:
- // a0: 00 10 20 30
- // a1: 01 11 21 31
- // a2: 02 12 22 32
- // a3: 03 13 23 33
- *a0 = _mm_unpacklo_epi64(b0, b1);
- *a1 = _mm_unpackhi_epi64(b0, b1);
- *a2 = _mm_unpacklo_epi64(b2, b3);
- *a3 = _mm_unpackhi_epi64(b2, b3);
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
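+// Transpose each of the two 4x4 sub-blocks of a 4x8 tile of 32-bit elements
+// independently.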
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
}
#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_