shithub: libvpx

--- a/test/variance_test.cc

+++ b/test/variance_test.cc

@@ -485,21 +485,6 @@

                       make_tuple(6, 5, subpel_avg_variance64x32_c),

                       make_tuple(6, 6, subpel_avg_variance64x64_c)));

-#if HAVE_MMX

-const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;

-const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;

-const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;

-const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;

-const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;

-INSTANTIATE_TEST_CASE_P(

-    MMX, VP9VarianceTest,

-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx),

-                      make_tuple(3, 3, variance8x8_mmx),

-                      make_tuple(3, 4, variance8x16_mmx),

-                      make_tuple(4, 3, variance16x8_mmx),

-                      make_tuple(4, 4, variance16x16_mmx)));

-#endif

 #if HAVE_SSE2

 #if CONFIG_USE_X86INC

 const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -420,19 +420,19 @@

 specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";

+specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";

+specialize qw/vp9_variance16x8/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";

+specialize qw/vp9_variance8x16/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";

+specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";

 add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

-specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";

+specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";

 add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

 specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";

@@ -444,7 +444,7 @@

 specialize qw/vp9_variance4x8/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

-specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";

+specialize qw/vp9_variance4x4/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

 specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";

@@ -693,7 +693,7 @@

 specialize qw/vp9_sad4x4x4d sse/;

 add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";

-specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";

+specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";

 add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";

 specialize qw/vp9_mse8x16/;

@@ -705,7 +705,7 @@

 specialize qw/vp9_mse8x8/;

 add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";

-specialize qw/vp9_get_mb_ss mmx sse2/;

+specialize qw/vp9_get_mb_ss sse2/;

 # ENCODEMB INVOKE

 add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";

--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm

+++ /dev/null

@@ -1,510 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )

-global sym(vp9_get_mb_ss_mmx) PRIVATE

-sym(vp9_get_mb_ss_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    GET_GOT     rbx

-    push rsi

-    push rdi

-    sub         rsp, 8

-    ; end prolog

-        mov         rax, arg(0) ;src_ptr

-        mov         rcx, 16

-        pxor        mm4, mm4

-.NEXTROW:

-        movq        mm0, [rax]

-        movq        mm1, [rax+8]

-        movq        mm2, [rax+16]

-        movq        mm3, [rax+24]

-        pmaddwd     mm0, mm0

-        pmaddwd     mm1, mm1

-        pmaddwd     mm2, mm2

-        pmaddwd     mm3, mm3

-        paddd       mm4, mm0

-        paddd       mm4, mm1

-        paddd       mm4, mm2

-        paddd       mm4, mm3

-        add         rax, 32

-        dec         rcx

-        ja          .NEXTROW

-        movq        QWORD PTR [rsp], mm4

-        ;return sum[0]+sum[1];

-        movsxd      rax, dword ptr [rsp]

-        movsxd      rcx, dword ptr [rsp+4]

-        add         rax, rcx

-    ; begin epilog

-    add rsp, 8

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int vp9_get8x8var_mmx

-;(

-;    unsigned char *src_ptr,

-;    int  source_stride,

-;    unsigned char *ref_ptr,

-;    int  recon_stride,

-;    unsigned int *SSE,

-;    int *Sum

-;)

-global sym(vp9_get8x8var_mmx) PRIVATE

-sym(vp9_get8x8var_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    push rsi

-    push rdi

-    push rbx

-    sub         rsp, 16

-    ; end prolog

-        pxor        mm5, mm5                    ; Blank mmx6

-        pxor        mm6, mm6                    ; Blank mmx7

-        pxor        mm7, mm7                    ; Blank mmx7

-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses

-        mov         rbx, arg(2) ;[ref_ptr]

-        movsxd      rcx, dword ptr arg(1) ;[source_stride]

-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

-        ; Row 1

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 2

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 3

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 4

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 5

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        ;              movq        mm4, [rbx + rdx]

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 6

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 7

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Row 8

-        movq        mm0, [rax]                  ; Copy eight bytes to mm0

-        movq        mm2, mm0                    ; Take copies

-        movq        mm3, mm1                    ; Take copies

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision

-        punpckhbw   mm3, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        psubsw      mm2, mm3                    ; A-B (high order) to MM2

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        paddw       mm5, mm2                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        pmaddwd     mm2, mm2                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        paddd       mm7, mm0                    ; accumulate in mm7

-        paddd       mm7, mm2                    ; accumulate in mm7

-        ; Now accumulate the final results.

-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory

-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory

-        movsx       rdx, WORD PTR [rsp+8]

-        movsx       rcx, WORD PTR [rsp+10]

-        movsx       rbx, WORD PTR [rsp+12]

-        movsx       rax, WORD PTR [rsp+14]

-        add         rdx, rcx

-        add         rbx, rax

-        add         rdx, rbx    ;XSum

-        movsxd      rax, DWORD PTR [rsp]

-        movsxd      rcx, DWORD PTR [rsp+4]

-        add         rax, rcx    ;XXSum

-        mov         rsi, arg(4) ;SSE

-        mov         rdi, arg(5) ;Sum

-        mov         dword ptr [rsi], eax

-        mov         dword ptr [rdi], edx

-        xor         rax, rax    ; return 0

-    ; begin epilog

-    add rsp, 16

-    pop rbx

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int

-;vp9_get4x4var_mmx

-;(

-;    unsigned char *src_ptr,

-;    int  source_stride,

-;    unsigned char *ref_ptr,

-;    int  recon_stride,

-;    unsigned int *SSE,

-;    int *Sum

-;)

-global sym(vp9_get4x4var_mmx) PRIVATE

-sym(vp9_get4x4var_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    push rsi

-    push rdi

-    push rbx

-    sub         rsp, 16

-    ; end prolog

-        pxor        mm5, mm5                    ; Blank mmx6

-        pxor        mm6, mm6                    ; Blank mmx7

-        pxor        mm7, mm7                    ; Blank mmx7

-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses

-        mov         rbx, arg(2) ;[ref_ptr]

-        movsxd      rcx, dword ptr arg(1) ;[source_stride]

-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

-        ; Row 1

-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0

-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 2

-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 3

-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 4

-        movd        mm0, [rax]                  ; Copy 4 bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        paddw       mm5, mm0                    ; accumulate differences in mm5

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Now accumulate the final results.

-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory

-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory

-        movsx       rdx, WORD PTR [rsp+8]

-        movsx       rcx, WORD PTR [rsp+10]

-        movsx       rbx, WORD PTR [rsp+12]

-        movsx       rax, WORD PTR [rsp+14]

-        add         rdx, rcx

-        add         rbx, rax

-        add         rdx, rbx    ;XSum

-        movsxd      rax, DWORD PTR [rsp]

-        movsxd      rcx, DWORD PTR [rsp+4]

-        add         rax, rcx    ;XXSum

-        mov         rsi, arg(4) ;SSE

-        mov         rdi, arg(5) ;Sum

-        mov         dword ptr [rsi], eax

-        mov         dword ptr [rdi], edx

-        xor         rax, rax    ; return 0

-    ; begin epilog

-    add rsp, 16

-    pop rbx

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int

-;vp9_get4x4sse_cs_mmx

-;(

-;    unsigned char *src_ptr,

-;    int  source_stride,

-;    unsigned char *ref_ptr,

-;    int  recon_stride

-;)

-global sym(vp9_get4x4sse_cs_mmx) PRIVATE

-sym(vp9_get4x4sse_cs_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    push rsi

-    push rdi

-    push rbx

-    ; end prolog

-        pxor        mm6, mm6                    ; Blank mmx7

-        pxor        mm7, mm7                    ; Blank mmx7

-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses

-        mov         rbx, arg(2) ;[ref_ptr]

-        movsxd      rcx, dword ptr arg(1) ;[source_stride]

-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

-        ; Row 1

-        movd        mm0, [rax]                  ; Copy eight bytes to mm0

-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 2

-        movd        mm0, [rax]                  ; Copy eight bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 3

-        movd        mm0, [rax]                  ; Copy eight bytes to mm0

-        punpcklbw   mm1, mm6

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        add         rbx,rdx                     ; Inc pointer into ref data

-        add         rax,rcx                     ; Inc pointer into the new data

-        movd        mm1, [rbx]                  ; Copy eight bytes to mm1

-        paddd       mm7, mm0                    ; accumulate in mm7

-        ; Row 4

-        movd        mm0, [rax]                  ; Copy eight bytes to mm0

-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision

-        punpcklbw   mm1, mm6

-        psubsw      mm0, mm1                    ; A-B (low order) to MM0

-        pmaddwd     mm0, mm0                    ; square and accumulate

-        paddd       mm7, mm0                    ; accumulate in mm7

-        movq        mm0,    mm7                 ;

-        psrlq       mm7,    32

-        paddd       mm0,    mm7

-        movq        rax,    mm0

-    ; begin epilog

-    pop rbx

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- a/vp9/encoder/x86/vp9_variance_mmx.c

+++ /dev/null

@@ -1,103 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "./vpx_config.h"

-#include "vp9/encoder/vp9_variance.h"

-#include "vpx_ports/mem.h"

-unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,

-                               const uint8_t *ref, int ref_stride,

-                               unsigned int *sse, int *sum);

-unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,

-                               const uint8_t *ref, int ref_stride,

-                               unsigned int *SSE, int *sum);

-unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,

-                                 const uint8_t *ref, int ref_stride,

-                                 unsigned int *sse) {

-  int sum;

-  vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);

-  return *sse - (((unsigned int)sum * sum) >> 4);

-}

-unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,

-                                 const uint8_t *ref, int ref_stride,

-                                 unsigned int *sse) {

-  int sum;

-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);

-  return *sse - (((unsigned int)sum * sum) >> 6);

-}

-unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,

-                              const uint8_t *ref, int ref_stride,

-                              unsigned int *sse) {

-  unsigned int sse0, sse1, sse2, sse3;

-  int sum0, sum1, sum2, sum3;

-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);

-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);

-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,

-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);

-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,

-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);

-  *sse = sse0 + sse1 + sse2 + sse3;

-  return *sse;

-}

-unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,

-                                   const uint8_t *ref, int ref_stride,

-                                   unsigned int *sse) {

-  unsigned int sse0, sse1, sse2, sse3;

-  int sum0, sum1, sum2, sum3, sum;

-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);

-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);

-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,

-                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);

-  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,

-                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);

-  *sse = sse0 + sse1 + sse2 + sse3;

-  sum = sum0 + sum1 + sum2 + sum3;

-  return *sse - (((unsigned int)sum * sum) >> 8);

-}

-unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,

-                                  const uint8_t *ref, int ref_stride,

-                                  unsigned int *sse) {

-  unsigned int sse0, sse1;

-  int sum0, sum1, sum;

-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);

-  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);

-  *sse = sse0 + sse1;

-  sum = sum0 + sum1;

-  return *sse - (((unsigned int)sum * sum) >> 7);

-}

-unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,

-                                  const uint8_t *ref, int ref_stride,

-                                  unsigned int *sse) {

-  unsigned int sse0, sse1;

-  int sum0, sum1, sum;

-  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);

-  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,

-                    ref + 8 * ref_stride, ref_stride, &sse1, &sum1);

-  *sse = sse0 + sse1;

-  sum = sum0 + sum1;

-  return *sse - (((unsigned int)sum * sum) >> 7);

-}

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -93,8 +93,6 @@

 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c

 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h

-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c

-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm

 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm

 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c