shithub: libvpx

--- a/test/variance_test.cc

+++ b/test/variance_test.cc

@@ -977,20 +977,6 @@

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_MMX

-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,

-                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));

-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,

-                        ::testing::Values(vpx_get_mb_ss_mmx));

-INSTANTIATE_TEST_CASE_P(

-    MMX, VpxVarianceTest,

-    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),

-                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),

-                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),

-                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),

-                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));

 INSTANTIATE_TEST_CASE_P(

     MMX, VpxSubpelVarianceTest,

     ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -1407,16 +1407,16 @@

   specialize qw/vpx_variance16x32 sse2 msa/;

 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;

+  specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;

 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;

+  specialize qw/vpx_variance16x8 sse2 neon msa/;

 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;

+  specialize qw/vpx_variance8x16 sse2 neon msa/;

 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;

+  specialize qw/vpx_variance8x8 sse2 media neon msa/;

 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

   specialize qw/vpx_variance8x4 sse2 msa/;

@@ -1425,7 +1425,7 @@

   specialize qw/vpx_variance4x8 sse2 msa/;

 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-  specialize qw/vpx_variance4x4 mmx sse2 msa/;

+  specialize qw/vpx_variance4x4 sse2 msa/;

 # Specialty Variance

@@ -1434,10 +1434,10 @@

   specialize qw/vpx_get16x16var sse2 avx2 neon msa/;

 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;

+  specialize qw/vpx_get8x8var sse2 neon msa/;

 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";

-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;

+  specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;

 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";

   specialize qw/vpx_mse16x8 sse2 msa/;

@@ -1449,7 +1449,7 @@

   specialize qw/vpx_mse8x8 sse2 msa/;

 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";

-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;

+  specialize qw/vpx_get_mb_ss sse2 msa/;

 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";

   specialize qw/vpx_get4x4sse_cs neon msa/;

--- a/vpx_dsp/x86/variance_impl_mmx.asm

+++ b/vpx_dsp/x86/variance_impl_mmx.asm

@@ -13,407 +13,6 @@

 %define mmx_filter_shift            7

-;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )

-global sym(vpx_get_mb_ss_mmx) PRIVATE

-sym(vpx_get_mb_ss_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    GET_GOT     rbx

-    push rsi

-    push rdi

-    sub         rsp, 8

-    ; end prolog

-        mov         rax, arg(0) ;src_ptr

-        mov         rcx, 16

-        pxor        mm4, mm4

-.NEXTROW:

-        movq        mm0, [rax]

-        movq        mm1, [rax+8]

-        movq        mm2, [rax+16]

-        movq        mm3, [rax+24]

-        pmaddwd     mm0, mm0

-        pmaddwd     mm1, mm1

-        pmaddwd     mm2, mm2

-        pmaddwd     mm3, mm3

-        paddd       mm4, mm0

-        paddd       mm4, mm1

-        paddd       mm4, mm2

-        paddd       mm4, mm3

-        add         rax, 32

-        dec         rcx

-        ja          .NEXTROW

-        movq        QWORD PTR [rsp], mm4

-        ;return sum[0]+sum[1];

-        movsxd      rax, dword ptr [rsp]

-        movsxd      rcx, dword ptr [rsp+4]

-        add         rax, rcx

-    ; begin epilog

-    add rsp, 8

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vpx_get8x8var_mmx

-;(

-;    unsigned char *src_ptr,

-;    int  source_stride,

-;    unsigned char *ref_ptr,

-;    int  recon_stride,

-;    unsigned int *SSE,

-;    int *Sum

-;)

-global sym(vpx_get8x8var_mmx) PRIVATE

-sym(vpx_get8x8var_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    push rsi

-    push rdi

-    push rbx

-    sub         rsp, 16

-    ; end prolog

-        pxor        mm5, mm5                    ; Blank mmx6

-        pxor        mm6, mm6                    ; Blank mmx7

-        pxor        mm7, mm7                    ; Blank mmx7

-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses

-        mov         rbx, arg(2) ;[ref_ptr]

-        movsxd      rcx, dword ptr arg(1) ;[source_stride]

-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

-        ; Row 1

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 2

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 3

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 4

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 5

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        ;              movq        mm4, [rbx + rdx]

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 6

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 7

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 8

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Now accumulate the final results.

-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory

-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory

-        movsx       rdx, WORD PTR [rsp+8]

-        movsx       rcx, WORD PTR [rsp+10]

-        movsx       rbx, WORD PTR [rsp+12]

-        movsx       rax, WORD PTR [rsp+14]

-        add         rdx, rcx

-        add         rbx, rax

-        add         rdx, rbx    ;XSum

-        movsxd      rax, DWORD PTR [rsp]

-        movsxd      rcx, DWORD PTR [rsp+4]

-        add         rax, rcx    ;XXSum

-        mov         rsi, arg(4) ;SSE

-        mov         rdi, arg(5) ;Sum

-        mov         dword ptr [rsi], eax

-        mov         dword ptr [rdi], edx

-        xor         rax, rax    ; return 0

-    ; begin epilog

-    add rsp, 16

-    pop rbx

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void

-;vpx_get4x4var_mmx

-;(

-;    unsigned char *src_ptr,

-;    int  source_stride,

-;    unsigned char *ref_ptr,

-;    int  recon_stride,

-;    unsigned int *SSE,

-;    int *Sum

-;)

-global sym(vpx_get4x4var_mmx) PRIVATE

-sym(vpx_get4x4var_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    push rsi

-    push rdi

-    push rbx

-    sub         rsp, 16

-    ; end prolog

-        pxor        mm5, mm5                    ; Blank mmx6

-        pxor        mm6, mm6                    ; Blank mmx7

-        pxor        mm7, mm7                    ; Blank mmx7

-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses

-        mov         rbx, arg(2) ;[ref_ptr]

-        movsxd      rcx, dword ptr arg(1) ;[source_stride]

-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

-        ; Row 1

-        movd        mm0, [rax]                  ; Copy four bytes to mm0

-        movd        mm1, [rbx]                  ; Copy four bytes to mm1

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy four bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 2

-        movd        mm0, [rax]                  ; Copy four bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy four bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 3

-        movd        mm0, [rax]                  ; Copy four bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher precision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy four bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 4

-        movd        mm0, [rax]                  ; Copy four bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Now accumulate the final results.

-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory

-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory

-        movsx       rdx, WORD PTR [rsp+8]

-        movsx       rcx, WORD PTR [rsp+10]

-        movsx       rbx, WORD PTR [rsp+12]

-        movsx       rax, WORD PTR [rsp+14]

-        add         rdx, rcx

-        add         rbx, rax

-        add         rdx, rbx    ;XSum

-        movsxd      rax, DWORD PTR [rsp]

-        movsxd      rcx, DWORD PTR [rsp+4]

-        add         rax, rcx    ;XXSum

-        mov         rsi, arg(4) ;SSE

-        mov         rdi, arg(5) ;Sum

-        mov         dword ptr [rsi], eax

-        mov         dword ptr [rdi], edx

-        xor         rax, rax    ; return 0

-    ; begin epilog

-    add rsp, 16

-    pop rbx

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

 ;void vpx_filter_block2d_bil4x4_var_mmx

 ;(

 ;    unsigned char *ref_ptr,

--- a/vpx_dsp/x86/variance_mmx.c

+++ b/vpx_dsp/x86/variance_mmx.c

@@ -23,10 +23,6 @@

   {  16,  16,  16,  16, 112, 112, 112, 112 }

 };

-extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,

-                              const uint8_t *b, int b_stride,

-                              unsigned int *sse, int *sum);

 extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,

                                               int ref_pixels_per_line,

                                               const unsigned char *src_ptr,

@@ -46,98 +42,6 @@

                                            int *sum,

                                            unsigned int *sumsquared);

-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,

-                                 const unsigned char *b, int b_stride,

-                                 unsigned int *sse) {

-    unsigned int var;

-    int avg;

-    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);

-    *sse = var;

-    return (var - (((unsigned int)avg * avg) >> 4));

-}

-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,

-                                 const unsigned char *b, int b_stride,

-                                 unsigned int *sse) {

-    unsigned int var;

-    int avg;

-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);

-    *sse = var;

-    return (var - (((unsigned int)avg * avg) >> 6));

-}

-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,

-                              const unsigned char *b, int b_stride,

-                              unsigned int *sse) {

-    unsigned int sse0, sse1, sse2, sse3, var;

-    int sum0, sum1, sum2, sum3;

-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);

-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,

-                      b + 8 * b_stride, b_stride, &sse2, &sum2);

-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,

-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

-    var = sse0 + sse1 + sse2 + sse3;

-    *sse = var;

-    return var;

-}

-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,

-                                   const unsigned char *b, int b_stride,

-                                   unsigned int *sse) {

-    unsigned int sse0, sse1, sse2, sse3, var;

-    int sum0, sum1, sum2, sum3, avg;

-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);

-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,

-                      b + 8 * b_stride, b_stride, &sse2, &sum2);

-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,

-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

-    var = sse0 + sse1 + sse2 + sse3;

-    avg = sum0 + sum1 + sum2 + sum3;

-    *sse = var;

-    return (var - (((unsigned int)avg * avg) >> 8));

-}

-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,

-                                  const unsigned char *b, int b_stride,

-                                  unsigned int *sse) {

-    unsigned int sse0, sse1, var;

-    int sum0, sum1, avg;

-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);

-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

-    var = sse0 + sse1;

-    avg = sum0 + sum1;

-    *sse = var;

-    return (var - (((unsigned int)avg * avg) >> 7));

-}

-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,

-                                  const unsigned char *b, int b_stride,

-                                  unsigned int *sse) {

-    unsigned int sse0, sse1, var;

-    int sum0, sum1, avg;

-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);

-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,

-                      b + 8 * b_stride, b_stride, &sse1, &sum1);

-    var = sse0 + sse1;

-    avg = sum0 + sum1;

-    *sse = var;

-    return (var - (((unsigned int)avg * avg) >> 7));

-}

 uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,

                                        int xoffset, int yoffset,