ref: e34e684059399854f414776081c30a38e6df6484
parent: 453926791641a617e17fc8182f9d1763ae070ef5
parent: 462e0ff88b67ffecd3a502a6050e76f2ba8f2094
author: James Zern <[email protected]>
date: Fri Jun 3 20:56:06 EDT 2016
Merge changes If31d36c8,I10b947e7

* changes:
  vpx_dsp,add_noise: remove mmx implementation
  vpx_dsp: remove mmx variance implementations
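With the MMX entries gone from the specialize lines below, vpx_sub_pixel_variance{4x4,8x8,8x16,16x8,16x16}, the halfpixvar16x16 variants and vpx_plane_add_noise now resolve through RTCD to the remaining SSE2/SSSE3/MSA/media specializations or to the C reference. For context, every removed wrapper in vpx_dsp/x86/variance_mmx.c reduced to the same variance identity once the bilinear-filter helpers had produced the block's sum and sse; a minimal C sketch of that math follows (the helper name and shift parameter are illustrative, not code from the tree):

    /* Illustrative sketch, not part of the tree: the computation the removed
     * MMX sub-pixel variance wrappers performed after the bilinear-filter
     * helpers returned sum and sse for the block.
     * shift = log2(w * h): 4 for 4x4, 6 for 8x8, 7 for 8x16/16x8, 8 for 16x16. */
    static uint32_t variance_from_sums(uint32_t sse, int sum, int shift,
                                       uint32_t *sse_out) {
      *sse_out = sse;
      return sse - (((uint32_t)sum * sum) >> shift);
    }

The three vpx_variance_halfpixvar16x16_*_mmx functions were thin wrappers around the 16x16 case with x/y offsets of 4 (half-pel), which is why they drop out of the "Specialty Subpixel" specialize lines in the same change.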
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -185,11 +185,6 @@
INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
::testing::Values(vpx_plane_add_noise_c));
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, AddNoiseTest,
- ::testing::Values(vpx_plane_add_noise_mmx));
-#endif
-
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
::testing::Values(vpx_plane_add_noise_sse2));
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -976,16 +976,6 @@
make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
- MMX, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0)));
-#endif // HAVE_MMX
-
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_sse2));
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -55,7 +55,6 @@
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += add_noise.c
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
-DSP_SRCS-$(HAVE_MMX) += x86/add_noise_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
endif # CONFIG_POSTPROC
@@ -322,8 +321,6 @@
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
-DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
-DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1478,16 +1478,16 @@
specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1496,7 +1496,7 @@
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+ specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1541,13 +1541,13 @@
# Specialty Subpixel
#
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+ specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+ specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+ specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1913,7 +1913,7 @@
#
if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
- specialize qw/vpx_plane_add_noise mmx sse2 msa/;
+ specialize qw/vpx_plane_add_noise sse2 msa/;
}
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
--- a/vpx_dsp/x86/add_noise_mmx.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vpx_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int Width, unsigned int Height, int Pitch)
-global sym(vpx_plane_add_noise_mmx) PRIVATE
-sym(vpx_plane_add_noise_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; get the clamps in registers
- mov rdx, arg(2) ; blackclamp
- movq mm3, [rdx]
- mov rdx, arg(3) ; whiteclamp
- movq mm4, [rdx]
- mov rdx, arg(4) ; bothclamp
- movq mm5, [rdx]
-
-.addnoise_loop:
- call sym(LIBVPX_RAND) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-
-.addnoise_nextset:
- movq mm1,[rsi+rax] ; get the source
-
- psubusb mm1, mm3 ; subtract black clamp
- paddusb mm1, mm5 ; add both clamp
- psubusb mm1, mm4 ; subtract whiteclamp
-
- movq mm2,[rdi+rax] ; get the noise for this line
- paddb mm1,mm2 ; add it in
- movq [rsi+rax],mm1 ; store the result
-
- add rax,8 ; move to the next line
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-Blur:
- times 16 dw 16
- times 8 dw 64
- times 16 dw 16
- times 8 dw 0
-
-rd:
- times 4 dw 0x40
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,343 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift 7
-
-;void vpx_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
--- a/vpx_dsp/x86/variance_mmx.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-
-DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
- { 128, 128, 128, 128, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const int16_t *HFilter,
- const int16_t *VFilter,
- int *sum,
- unsigned int *sumsquared);
-
-extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const int16_t *HFilter,
- const int16_t *VFilter,
- int *sum,
- unsigned int *sumsquared);
-
-
-uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int xsum;
- unsigned int xxsum;
- vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum, &xxsum);
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int xsum;
- uint32_t xxsum;
- vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum, &xxsum);
- *sse = xxsum;
- return (xxsum - (((uint32_t)xsum * xsum) >> 6));
-}
-
-uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0);
-
- vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0);
-
- vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int xsum;
- unsigned int xxsum;
- vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
- bilinear_filters_mmx[xoffset],
- bilinear_filters_mmx[yoffset],
- &xsum, &xxsum);
- *sse = xxsum;
- return (xxsum - (((uint32_t)xsum * xsum) >> 7));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
-}