shithub: libvpx

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -353,6 +353,13 @@

         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));

 #endif

+#if CONFIG_USE_X86INC && HAVE_MMX  // vp9_dct_mmx.asm is only built when both are set (see vp9cx.mk)

+INSTANTIATE_TEST_CASE_P(

+    MMX, Trans4x4WHT,

+    ::testing::Values(

+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));  // MMX forward WHT paired with the C inverse

+#endif  // CONFIG_USE_X86INC && HAVE_MMX

 #if HAVE_SSE2

 INSTANTIATE_TEST_CASE_P(

     SSE2, Trans4x4DCT,

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -693,7 +693,7 @@

 specialize qw/vp9_fht16x16 sse2 avx2/;

 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";

-specialize qw/vp9_fwht4x4/;

+specialize qw/vp9_fwht4x4/, "$mmx_x86inc";

 add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";

 specialize qw/vp9_fdct4x4 sse2 avx2/;

--- /dev/null

+++ b/vp9/encoder/x86/vp9_dct_mmx.asm

@@ -1,0 +1,70 @@

+;

+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+SECTION .text

+%macro TRANSFORM_COLS 0 ; one 1-D pass on packed 16-bit words; in/out m0..m3, clobbers m4 and m5

+  paddw           m0,        m1 ; a1 += b1

+  movq            m4,        m0 ; m4 = copy of updated a1

+  psubw           m3,        m2 ; d1 -= c1

+  psubw           m4,        m3 ; m4 = a1 - d1

+  psraw           m4,        1 ; m4 = (a1 - d1) >> 1, arithmetic (sign-preserving) shift

+  movq            m5,        m4 ; m5 = copy of the halved difference

+  psubw           m5,        m1 ; new b1 = halved difference - old b1

+  psubw           m4,        m2 ; new c1 = halved difference - old c1

+  psubw           m0,        m4 ; a1 -= new c1

+  paddw           m3,        m5 ; d1 += new b1

+                                ; m0 a1 (already in place; the SWAPs below only rename registers)

+  SWAP            1,         4  ; m1 c1

+  SWAP            2,         3  ; m2 d1

+  SWAP            3,         5  ; m3 b1

+%endmacro

+%macro TRANSPOSE_4X4 0 ; transpose the 4x4 block of words held one row per register in m0..m3; clobbers m4 and m5

+  movq            m4,        m0 ; copy row 0, since m0 itself is reused for the high-half interleave

+  movq            m5,        m2 ; copy row 2, since m2 itself is reused for the high-half interleave

+  punpcklwd       m4,        m1 ; m4 = r0c0 r1c0 r0c1 r1c1 (low word first)

+  punpckhwd       m0,        m1 ; m0 = r0c2 r1c2 r0c3 r1c3

+  punpcklwd       m5,        m3 ; m5 = r2c0 r3c0 r2c1 r3c1

+  punpckhwd       m2,        m3 ; m2 = r2c2 r3c2 r2c3 r3c3

+  movq            m1,        m4 ; copy rows 0/1, columns 0-1 before splitting into two columns

+  movq            m3,        m0 ; copy rows 0/1, columns 2-3 before splitting into two columns

+  punpckldq       m1,        m5 ; m1 = column 0 (r0c0 r1c0 r2c0 r3c0)

+  punpckhdq       m4,        m5 ; m4 = column 1

+  punpckldq       m3,        m2 ; m3 = column 2

+  punpckhdq       m0,        m2 ; m0 = column 3

+  SWAP            2, 3, 0, 1, 4 ; chained renames (2<->3, 3<->0, 0<->1, 1<->4) leave m0..m3 = columns 0..3

+%endmacro

+INIT_MMX mmx ; x86inc: use 64-bit MMX registers for m0..mN and the _mmx symbol suffix

+cglobal fwht4x4, 3, 4, 8, input, output, stride ; 3 args, 4 GPRs (r3 is scratch), 8 vector regs declared (only m0-m5 are touched)

+  lea             r3q,       [inputq + strideq*4] ; r3 = two row steps past input (a row step is strideq*2 bytes below); NOTE(review): stride is declared int but used full-width without sign extension - confirm this is safe on 64-bit targets

+  movq            m0,        [inputq] ; a1: four words at input

+  movq            m1,        [inputq + strideq*2] ; b1: four words one row step further

+  movq            m2,        [r3q] ; c1: four words two row steps from input

+  movq            m3,        [r3q + strideq*2] ; d1: four words three row steps from input

+  TRANSFORM_COLS ; first 1-D pass over the loaded rows

+  TRANSPOSE_4X4 ; swap rows and columns so the second pass runs along the other dimension

+  TRANSFORM_COLS ; second 1-D pass

+  TRANSPOSE_4X4 ; restore the original orientation before storing

+  psllw           m0,        2 ; scale every coefficient by 4 (left shift by 2)

+  psllw           m1,        2

+  psllw           m2,        2

+  psllw           m3,        2

+  movq            [outputq],      m0 ; results are written contiguously: 4 registers x 8 bytes = 32 bytes

+  movq            [outputq + 8],  m1

+  movq            [outputq + 16], m2

+  movq            [outputq + 24], m3

+  RET ; NOTE(review): no emms is issued here; presumably callers reset the MMX/x87 state - confirm

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -101,6 +101,7 @@

 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm

 ifeq ($(CONFIG_USE_X86INC),yes)

+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm