shithub: libvpx

--- a/vp8/common/x86/loopfilter_block_sse2.asm

+++ /dev/null

@@ -1,815 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%macro LF_ABS 2

-        ; %1: first operand (v1); overwritten with the result

-        ; %2: second operand (v2); value preserved

-        ; output in %1: per-byte unsigned abs(v1 - v2); scratch1 is clobbered

-        movdqa      scratch1, %2            ; scratch1 = v2

-        psubusb     scratch1, %1            ; saturating v2 - v1 (0 where v1 >= v2)

-        psubusb     %1, %2                  ; saturating v1 - v2 (0 where v2 >= v1)

-        por         %1, scratch1            ; one side is always 0, so OR gives abs(v2 - v1)

-%endmacro

-%macro LF_FILTER_HEV_MASK 8-9

-        LF_ABS      %1, %2                  ; abs(p3 - p2); %1-%8 hold p3, p2, p1, p0, q0, q1, q2, q3

-        LF_ABS      %2, %3                  ; abs(p2 - p1)

-        pmaxub      %1, %2                  ; accumulate mask (running per-byte maximum in %1)

-%if %0 == 8

-        movdqa      scratch2, %3            ; copy p1 so %3 is still p1 for abs(p1 - q1) below

-        LF_ABS      scratch2, %4            ; abs(p1 - p0)

-%endif

-        LF_ABS      %4, %5                  ; abs(p0 - q0)

-        LF_ABS      %5, %6                  ; abs(q0 - q1)

-%if %0 == 8

-        pmaxub      %5, scratch2            ; accumulate hev: max(abs(q0 - q1), abs(p1 - p0))

-%else

-        pmaxub      %5, %9                  ; accumulate hev; %9 is used as abs(p1 - p0) supplied by the caller

-%endif

-        pmaxub      %1, %5                  ; accumulate mask

-        LF_ABS      %3, %6                  ; abs(p1 - q1)

-        LF_ABS      %6, %7                  ; abs(q1 - q2)

-        pmaxub      %1, %6                  ; accumulate mask

-        LF_ABS      %7, %8                  ; abs(q2 - q3); left in %7 when the macro ends

-        pmaxub      %1, %7                  ; accumulate mask

-        paddusb     %4, %4                  ; 2 * abs(p0 - q0)

-        pand        %3, [GLOBAL(tfe)]       ; clear bit 0 of each byte so the word shift below cannot leak into the neighbouring byte

-        psrlw       %3, 1                   ; abs(p1 - q1) / 2

-        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

-        psubusb     %1, [limit]             ; non-zero where the largest neighbour difference exceeds limit

-        psubusb     %4, [blimit]            ; non-zero where the edge difference sum exceeds blimit

-        por         %1, %4                  ; non-zero where either test failed

-        pcmpeqb     %1, zero                ; mask: 0xff in bytes where both tests passed

-        psubusb     %5, [thresh]            ; non-zero where the hev maximum exceeds thresh

-        pcmpeqb     %5, zero                ; ~hev: 0xff where thresh is not exceeded

-%endmacro

-%macro LF_FILTER 6

-        ; %1-%4: p1-q1 (p1, p0, q0, q1); each is overwritten with its filtered value

-        ; %5: mask (overwritten; used as the working filter value)

-        ; %6: ~hev (inverted hev, as produced by LF_FILTER_HEV_MASK); preserved. scratch1/scratch2 are clobbered

-        movdqa      scratch2, %6            ; copy ~hev (pandn below overwrites its destination)

-        pxor        %1, [GLOBAL(t80)]       ; ps1 (flip sign bit: unsigned -> signed)

-        pxor        %4, [GLOBAL(t80)]       ; qs1

-        movdqa      scratch1, %1

-        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)

-        pandn       scratch2, scratch1      ; vp8_filter &= hev (pandn inverts ~hev back to hev)

-        pxor        %2, [GLOBAL(t80)]       ; ps0

-        pxor        %3, [GLOBAL(t80)]       ; qs0

-        movdqa      scratch1, %3

-        psubsb      scratch1, %2            ; qs0 - ps0

-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)

-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)

-        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0), i.e. 3 * (qs0 - ps0) added with saturation at each step

-        pand        %5, scratch2            ; &= mask

-        movdqa      scratch2, %5            ; second copy of the masked filter value for Filter2

-        paddsb      %5, [GLOBAL(t4)]        ; Filter1

-        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

-        ; Filter1 >> 3 (per-byte arithmetic shift emulated with a word shift plus sign fill)

-        movdqa      scratch1, zero

-        pcmpgtb     scratch1, %5            ; 0xff in bytes where Filter1 is negative

-        psrlw       %5, 3

-        pand        scratch1, [GLOBAL(te0)] ; sign fill for the top three bits

-        pand        %5, [GLOBAL(t1f)]       ; drop bits shifted in from the neighbouring byte

-        por         %5, scratch1

-        psubsb      %3, %5                  ; qs0 - Filter1

-        pxor        %3, [GLOBAL(t80)]       ; back to unsigned: filtered q0

-        ; Filter2 >> 3 (same emulated arithmetic shift)

-        movdqa      scratch1, zero

-        pcmpgtb     scratch1, scratch2

-        psrlw       scratch2, 3

-        pand        scratch1, [GLOBAL(te0)]

-        pand        scratch2, [GLOBAL(t1f)]

-        por         scratch2, scratch1

-        paddsb      %2, scratch2            ; ps0 + Filter2

-        pxor        %2, [GLOBAL(t80)]       ; back to unsigned: filtered p0

-        ; outer tap adjustments: vp8_filter = ((Filter1 >> 3) + 1) >> 1, applied only where hev is clear

-        paddsb      %5, [GLOBAL(t1)]

-        movdqa      scratch1, zero

-        pcmpgtb     scratch1, %5            ; 0xff in bytes where the value is negative

-        psrlw       %5, 1

-        pand        scratch1, [GLOBAL(t80)] ; sign fill for the top bit

-        pand        %5, [GLOBAL(t7f)]       ; drop the bit shifted in from the neighbouring byte

-        por         %5, scratch1

-        pand        %5, %6                  ; vp8_filter &= ~hev

-        psubsb      %4, %5                  ; qs1 - vp8_filter

-        pxor        %4, [GLOBAL(t80)]       ; back to unsigned: filtered q1

-        paddsb      %1, %5                  ; ps1 + vp8_filter

-        pxor        %1, [GLOBAL(t80)]       ; back to unsigned: filtered p1

-%endmacro

-;void vp8_loop_filter_bh_y_sse2 -- filters, in place, the edges between rows 3/4, 7/8 and 11/12 of 16 rows of 16 bytes

-;(

-;    unsigned char *src_ptr,         -- row 0; every row is accessed with movdqa (16-byte aligned access)

-;    int            src_pixel_step,  -- distance in bytes between rows. NOTE(review): used as a full 64-bit register without sign extension -- confirm callers guarantee the upper bits

-;    const char    *blimit,

-;    const char    *limit,

-;    const char    *thresh

-;)

-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE

-sym(vp8_loop_filter_bh_y_sse2):

-%if LIBVPX_YASM_WIN64

-    %define src      rcx ; src_ptr

-    %define stride   rdx ; src_pixel_step

-    %define blimit   r8

-    %define limit    r9

-    %define thresh   r10

-    %define spp      rax

-    %define stride3  r11

-    %define stride5  r12

-    %define stride7  r13

-    push    rbp

-    mov     rbp, rsp

-    SAVE_XMM 11

-    push    r12

-    push    r13

-    mov     thresh, arg(4)

-%else

-    %define src      rdi ; src_ptr

-    %define stride   rsi ; src_pixel_step

-    %define blimit   rdx

-    %define limit    rcx

-    %define thresh   r8

-    %define spp      rax

-    %define stride3  r9

-    %define stride5  r10

-    %define stride7  r11

-%endif

-    %define scratch1 xmm5

-    %define scratch2 xmm6

-    %define zero     xmm7

-    %define i0       [src]

-    %define i1       [spp]

-    %define i2       [src + 2 * stride]

-    %define i3       [spp + 2 * stride]

-    %define i4       [src + 4 * stride]

-    %define i5       [spp + 4 * stride]

-    %define i6       [src + 2 * stride3]

-    %define i7       [spp + 2 * stride3]

-    %define i8       [src + 8 * stride]

-    %define i9       [spp + 8 * stride]

-    %define i10      [src + 2 * stride5]

-    %define i11      [spp + 2 * stride5]

-    %define i12      [src + 4 * stride3]

-    %define i13      [spp + 4 * stride3]

-    %define i14      [src + 2 * stride7]

-    %define i15      [spp + 2 * stride7]

-    ; prep work: spp = src + stride and stride3/5/7 = 3x/5x/7x stride, so iN addresses row N

-    lea         spp, [src + stride]

-    lea         stride3, [stride + 2 * stride]

-    lea         stride5, [stride3 + 2 * stride]

-    lea         stride7, [stride3 + 4 * stride]

-    pxor        zero, zero

-        ; load the first set into registers (rows 0-7 = p3, p2, p1, p0, q0, q1, q2, q3)

-        movdqa       xmm0, i0

-        movdqa       xmm1, i1

-        movdqa       xmm2, i2

-        movdqa       xmm3, i3

-        movdqa       xmm4, i4

-        movdqa       xmm8, i5

-        movdqa       xmm9, i6   ; q2; after the macro holds abs(q2 - q3), which is abs(p1-p0) of the second set

-        movdqa       xmm10, i7

-LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

-        movdqa       xmm1, i2   ; reload p1-q1: the mask macro overwrote its inputs

-        movdqa       xmm2, i3

-        movdqa       xmm3, i4

-        movdqa       xmm8, i5

-LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4

-        movdqa       i2, xmm1

-        movdqa       i3, xmm2

-; second set (rows 4-11); filtered rows 4 and 5 stay in xmm3/xmm8 and serve as p3/p2

-        movdqa       i4, xmm3

-        movdqa       i5, xmm8

-        movdqa       xmm0, i6

-        movdqa       xmm1, i7

-        movdqa       xmm2, i8

-        movdqa       xmm4, i9

-        movdqa       xmm10, i10   ; q2; after the macro holds abs(q2 - q3), which is abs(p1-p0) of the last set

-        movdqa       xmm11, i11

-LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

-        movdqa       xmm0, i6

-        movdqa       xmm1, i7

-        movdqa       xmm4, i8

-        movdqa       xmm8, i9

-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

-        movdqa       i6, xmm0

-        movdqa       i7, xmm1

-; last set (rows 8-15); filtered rows 8 and 9 stay in xmm4/xmm8 and serve as p3/p2

-        movdqa       i8, xmm4

-        movdqa       i9, xmm8

-        movdqa       xmm0, i10

-        movdqa       xmm1, i11

-        movdqa       xmm2, i12

-        movdqa       xmm3, i13

-        movdqa       xmm9, i14   ; q2; overwritten with abs(q2 - q3) by the macro (not used afterwards)

-        movdqa       xmm11, i15

-LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

-        movdqa       xmm0, i10

-        movdqa       xmm1, i11

-        movdqa       xmm3, i12

-        movdqa       xmm8, i13

-LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2

-        movdqa       i10, xmm0

-        movdqa       i11, xmm1

-        movdqa       i12, xmm3

-        movdqa       i13, xmm8

-%if LIBVPX_YASM_WIN64

-    pop    r13

-    pop    r12

-    RESTORE_XMM

-    pop    rbp

-%endif

-    ret

-;void vp8_loop_filter_bv_y_sse2 -- transposes 16 rows of 16 bytes onto the stack, filters between columns 3/4, 7/8 and 11/12, transposes back and stores

-;(

-;    unsigned char *src_ptr,         -- row 0; every row is accessed with movdqa (16-byte aligned access)

-;    int            src_pixel_step,  -- distance in bytes between rows. NOTE(review): used as a full 64-bit register without sign extension -- confirm callers guarantee the upper bits

-;    const char    *blimit,

-;    const char    *limit,

-;    const char    *thresh

-;)

-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE

-sym(vp8_loop_filter_bv_y_sse2):

-%if LIBVPX_YASM_WIN64

-    %define src      rcx ; src_ptr

-    %define stride   rdx ; src_pixel_step

-    %define blimit   r8

-    %define limit    r9

-    %define thresh   r10

-    %define spp      rax

-    %define stride3  r11

-    %define stride5  r12

-    %define stride7  r13

-    push    rbp

-    mov     rbp, rsp

-    SAVE_XMM 15

-    push    r12

-    push    r13

-    mov     thresh, arg(4)

-%else

-    %define src      rdi

-    %define stride   rsi

-    %define blimit   rdx

-    %define limit    rcx

-    %define thresh   r8

-    %define spp      rax

-    %define stride3  r9

-    %define stride5  r10

-    %define stride7  r11

-%endif

-    %define scratch1 xmm5

-    %define scratch2 xmm6

-    %define zero     xmm7

-    %define s0       [src]

-    %define s1       [spp]

-    %define s2       [src + 2 * stride]

-    %define s3       [spp + 2 * stride]

-    %define s4       [src + 4 * stride]

-    %define s5       [spp + 4 * stride]

-    %define s6       [src + 2 * stride3]

-    %define s7       [spp + 2 * stride3]

-    %define s8       [src + 8 * stride]

-    %define s9       [spp + 8 * stride]

-    %define s10      [src + 2 * stride5]

-    %define s11      [spp + 2 * stride5]

-    %define s12      [src + 4 * stride3]

-    %define s13      [spp + 4 * stride3]

-    %define s14      [src + 2 * stride7]

-    %define s15      [spp + 2 * stride7]

-    %define i0       [rsp]

-    %define i1       [rsp + 16]

-    %define i2       [rsp + 32]

-    %define i3       [rsp + 48]

-    %define i4       [rsp + 64]

-    %define i5       [rsp + 80]

-    %define i6       [rsp + 96]

-    %define i7       [rsp + 112]

-    %define i8       [rsp + 128]

-    %define i9       [rsp + 144]

-    %define i10      [rsp + 160]

-    %define i11      [rsp + 176]

-    %define i12      [rsp + 192]

-    %define i13      [rsp + 208]

-    %define i14      [rsp + 224]

-    %define i15      [rsp + 240]

-    ALIGN_STACK 16, rax

-    ; reserve stack space: sixteen 16-byte slots (i0-i15) for the transposed block

-    %define      temp_storage  0 ; size is 256 (16*16)

-    %define      stack_size 256

-    sub          rsp, stack_size

-    ; prep work: spp = src + stride and stride3/5/7 = 3x/5x/7x stride, so sN addresses row N

-    lea         spp, [src + stride]

-    lea         stride3, [stride + 2 * stride]

-    lea         stride5, [stride3 + 2 * stride]

-    lea         stride7, [stride3 + 4 * stride]

-        ; 8-f (rows 8-15; in the byte labels below the first hex digit is the row, the second the column)

-        movdqa      xmm0, s8

-        movdqa      xmm1, xmm0

-        punpcklbw   xmm0, s9                ; 80 90

-        punpckhbw   xmm1, s9                ; 88 98

-        movdqa      xmm2, s10

-        movdqa      xmm3, xmm2

-        punpcklbw   xmm2, s11 ; a0 b0

-        punpckhbw   xmm3, s11 ; a8 b8

-        movdqa      xmm4, xmm0

-        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0

-        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

-        movdqa      xmm2, xmm1

-        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8

-        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

-        ; using xmm[0124]

-        ; work on next 4 rows

-        movdqa      xmm3, s12

-        movdqa      xmm5, xmm3

-        punpcklbw   xmm3, s13 ; c0 d0

-        punpckhbw   xmm5, s13 ; c8 d8

-        movdqa      xmm6, s14

-        movdqa      xmm7, xmm6

-        punpcklbw   xmm6, s15 ; e0 f0

-        punpckhbw   xmm7, s15 ; e8 f8

-        movdqa      xmm8, xmm3

-        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0

-        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

-        movdqa      xmm6, xmm5

-        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8

-        punpckhwd   xmm6, xmm7              ; cc dc ec fc

-        ; pull the third and fourth sets together

-        movdqa      xmm7, xmm0

-        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0

-        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

-        movdqa      xmm3, xmm4

-        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4

-        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

-        movdqa      xmm8, xmm1

-        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8

-        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

-        movdqa      xmm5, xmm2

-        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc

-        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

-        ; save the calculations. we only have 16 xmm registers (xmm0-xmm15) ...

-        movdqa      i0, xmm0

-        movdqa      i1, xmm7

-        movdqa      i2, xmm4

-        movdqa      i3, xmm3

-        movdqa      i4, xmm1

-        movdqa      i5, xmm8

-        movdqa      i6, xmm2

-        movdqa      i7, xmm5

-        ; 0-7 (rows 0-7, same interleave sequence as above)

-        movdqa      xmm0, s0

-        movdqa      xmm1, xmm0

-        punpcklbw   xmm0, s1 ; 00 10

-        punpckhbw   xmm1, s1 ; 08 18

-        movdqa      xmm2, s2

-        movdqa      xmm3, xmm2

-        punpcklbw   xmm2, s3 ; 20 30

-        punpckhbw   xmm3, s3 ; 28 38

-        movdqa      xmm4, xmm0

-        punpcklwd   xmm0, xmm2              ; 00 10 20 30

-        punpckhwd   xmm4, xmm2              ; 04 14 24 34

-        movdqa      xmm2, xmm1

-        punpcklwd   xmm1, xmm3              ; 08 18 28 38

-        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

-        ; using xmm[0124]

-        ; work on next 4 rows

-        movdqa      xmm3, s4

-        movdqa      xmm5, xmm3

-        punpcklbw   xmm3, s5 ; 40 50

-        punpckhbw   xmm5, s5 ; 48 58

-        movdqa      xmm6, s6

-        movdqa      xmm7, xmm6

-        punpcklbw   xmm6, s7   ; 60 70

-        punpckhbw   xmm7, s7   ; 68 78

-        movdqa      xmm8, xmm3

-        punpcklwd   xmm3, xmm6              ; 40 50 60 70

-        punpckhwd   xmm8, xmm6              ; 44 54 64 74

-        movdqa      xmm6, xmm5

-        punpcklwd   xmm5, xmm7              ; 48 58 68 78

-        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

-        ; pull the first two sets together

-        movdqa      xmm7, xmm0

-        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70

-        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

-        movdqa      xmm3, xmm4

-        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74

-        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

-        movdqa      xmm8, xmm1

-        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78

-        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

-        movdqa      xmm5, xmm2

-        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c

-        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

-        ; final combination: join the rows 0-7 halves with the rows 8-f halves saved in i0-i7

-        movdqa      xmm6, xmm0

-        punpcklqdq  xmm0, i0                ; column 0

-        punpckhqdq  xmm6, i0                ; column 1

-        movdqa      xmm9, xmm7

-        punpcklqdq  xmm7, i1                ; column 2

-        punpckhqdq  xmm9, i1                ; column 3

-        movdqa      xmm10, xmm4

-        punpcklqdq  xmm4, i2                ; column 4

-        punpckhqdq  xmm10, i2               ; column 5

-        movdqa      xmm11, xmm3

-        punpcklqdq  xmm3, i3                ; column 6

-        punpckhqdq  xmm11, i3               ; column 7

-        movdqa      xmm12, xmm1

-        punpcklqdq  xmm1, i4                ; column 8

-        punpckhqdq  xmm12, i4               ; column 9

-        movdqa      xmm13, xmm8

-        punpcklqdq  xmm8, i5                ; column a

-        punpckhqdq  xmm13, i5               ; column b

-        movdqa      xmm14, xmm2

-        punpcklqdq  xmm2, i6                ; column c

-        punpckhqdq  xmm14, i6               ; column d

-        movdqa      xmm15, xmm5

-        punpcklqdq  xmm5, i7                ; column e

-        punpckhqdq  xmm15, i7               ; column f

-        movdqa      i0, xmm0

-        movdqa      i1, xmm6

-        movdqa      i2, xmm7

-        movdqa      i3, xmm9

-        movdqa      i4, xmm4

-        movdqa      i5, xmm10

-        movdqa      i6, xmm3

-        movdqa      i7, xmm11

-        movdqa      i8, xmm1

-        movdqa      i9, xmm12

-        movdqa      i10, xmm8

-        movdqa      i11, xmm13

-        movdqa      i12, xmm2

-        movdqa      i13, xmm14

-        movdqa      i14, xmm5

-        movdqa      i15, xmm15

-; TRANSPOSED DATA AVAILABLE ON THE STACK (i0-i15 hold columns 0-15)

-        movdqa      xmm12, xmm6             ; copy of i1: xmm6 is scratch2 inside the macros

-        movdqa      xmm13, xmm7             ; copy of i2: xmm7 becomes zero on the next line

-        pxor        zero, zero

-LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

-        movdqa       xmm1, i2   ; reload p1-q1: the mask macro overwrote its inputs

-        movdqa       xmm2, i3

-        movdqa       xmm8, i4

-        movdqa       xmm9, i5

-LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4

-        movdqa       i2, xmm1

-        movdqa       i3, xmm2

-; second set (columns 4-11); filtered columns 4 and 5 stay in xmm8/xmm9 and serve as p3/p2

-        movdqa       i4, xmm8

-        movdqa       i5, xmm9

-        movdqa       xmm0, i6

-        movdqa       xmm1, i7

-        movdqa       xmm2, i8

-        movdqa       xmm4, i9

-        movdqa       xmm10, i10   ; q2; after the macro holds abs(q2 - q3), which is abs(p1-p0) of the last set

-        movdqa       xmm11, i11

-LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

-        movdqa       xmm0, i6

-        movdqa       xmm1, i7

-        movdqa       xmm3, i8

-        movdqa       xmm4, i9

-LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2

-        movdqa       i6, xmm0

-        movdqa       i7, xmm1

-; last set (columns 8-15); filtered columns 8 and 9 stay in xmm3/xmm4 and serve as p3/p2

-        movdqa       i8, xmm3

-        movdqa       i9, xmm4

-        movdqa       xmm0, i10

-        movdqa       xmm1, i11

-        movdqa       xmm2, i12

-        movdqa       xmm8, i13

-        movdqa       xmm9, i14   ; q2; overwritten with abs(q2 - q3) by the macro (not used afterwards)

-        movdqa       xmm11, i15

-LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

-        movdqa       xmm0, i10

-        movdqa       xmm1, i11

-        movdqa       xmm4, i12

-        movdqa       xmm8, i13

-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

-        movdqa       i10, xmm0

-        movdqa       i11, xmm1

-        movdqa       i12, xmm4

-        movdqa       i13, xmm8

-; RESHUFFLE AND WRITE OUT (same interleave sequence, now reading the stack slots i0-i15 and storing to s0-s15)

-        ; 8-f

-        movdqa      xmm0, i8

-        movdqa      xmm1, xmm0

-        punpcklbw   xmm0, i9                ; 80 90

-        punpckhbw   xmm1, i9                ; 88 98

-        movdqa      xmm2, i10

-        movdqa      xmm3, xmm2

-        punpcklbw   xmm2, i11               ; a0 b0

-        punpckhbw   xmm3, i11               ; a8 b8

-        movdqa      xmm4, xmm0

-        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0

-        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

-        movdqa      xmm2, xmm1

-        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8

-        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

-        ; using xmm[0124]

-        ; work on next 4 rows

-        movdqa      xmm3, i12

-        movdqa      xmm5, xmm3

-        punpcklbw   xmm3, i13               ; c0 d0

-        punpckhbw   xmm5, i13               ; c8 d8

-        movdqa      xmm6, i14

-        movdqa      xmm7, xmm6

-        punpcklbw   xmm6, i15               ; e0 f0

-        punpckhbw   xmm7, i15               ; e8 f8

-        movdqa      xmm8, xmm3

-        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0

-        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

-        movdqa      xmm6, xmm5

-        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8

-        punpckhwd   xmm6, xmm7              ; cc dc ec fc

-        ; pull the third and fourth sets together

-        movdqa      xmm7, xmm0

-        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0

-        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

-        movdqa      xmm3, xmm4

-        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4

-        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

-        movdqa      xmm8, xmm1

-        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8

-        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

-        movdqa      xmm5, xmm2

-        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc

-        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

-        ; save the calculations. we only have 16 xmm registers (xmm0-xmm15) ...

-        movdqa      i8, xmm0

-        movdqa      i9, xmm7

-        movdqa      i10, xmm4

-        movdqa      i11, xmm3

-        movdqa      i12, xmm1

-        movdqa      i13, xmm8

-        movdqa      i14, xmm2

-        movdqa      i15, xmm5

-        ; 0-7

-        movdqa      xmm0, i0

-        movdqa      xmm1, xmm0

-        punpcklbw   xmm0, i1                ; 00 10

-        punpckhbw   xmm1, i1                ; 08 18

-        movdqa      xmm2, i2

-        movdqa      xmm3, xmm2

-        punpcklbw   xmm2, i3                ; 20 30

-        punpckhbw   xmm3, i3                ; 28 38

-        movdqa      xmm4, xmm0

-        punpcklwd   xmm0, xmm2              ; 00 10 20 30

-        punpckhwd   xmm4, xmm2              ; 04 14 24 34

-        movdqa      xmm2, xmm1

-        punpcklwd   xmm1, xmm3              ; 08 18 28 38

-        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

-        ; using xmm[0124]

-        ; work on next 4 rows

-        movdqa      xmm3, i4

-        movdqa      xmm5, xmm3

-        punpcklbw   xmm3, i5                ; 40 50

-        punpckhbw   xmm5, i5                ; 48 58

-        movdqa      xmm6, i6

-        movdqa      xmm7, xmm6

-        punpcklbw   xmm6, i7                ; 60 70

-        punpckhbw   xmm7, i7                ; 68 78

-        movdqa      xmm8, xmm3

-        punpcklwd   xmm3, xmm6              ; 40 50 60 70

-        punpckhwd   xmm8, xmm6              ; 44 54 64 74

-        movdqa      xmm6, xmm5

-        punpcklwd   xmm5, xmm7              ; 48 58 68 78

-        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

-        ; pull the first two sets together

-        movdqa      xmm7, xmm0

-        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70

-        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

-        movdqa      xmm3, xmm4

-        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74

-        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

-        movdqa      xmm8, xmm1

-        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78

-        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

-        movdqa      xmm5, xmm2

-        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c

-        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

-        ; final combination: join the 0-7 halves with the 8-f halves saved in i8-i15

-        movdqa      xmm6, xmm0

-        punpcklqdq  xmm0, i8

-        punpckhqdq  xmm6, i8

-        movdqa      xmm9, xmm7

-        punpcklqdq  xmm7, i9

-        punpckhqdq  xmm9, i9

-        movdqa      xmm10, xmm4

-        punpcklqdq  xmm4, i10

-        punpckhqdq  xmm10, i10

-        movdqa      xmm11, xmm3

-        punpcklqdq  xmm3, i11

-        punpckhqdq  xmm11, i11

-        movdqa      xmm12, xmm1

-        punpcklqdq  xmm1, i12

-        punpckhqdq  xmm12, i12

-        movdqa      xmm13, xmm8

-        punpcklqdq  xmm8, i13

-        punpckhqdq  xmm13, i13

-        movdqa      xmm14, xmm2

-        punpcklqdq  xmm2, i14

-        punpckhqdq  xmm14, i14

-        movdqa      xmm15, xmm5

-        punpcklqdq  xmm5, i15

-        punpckhqdq  xmm15, i15

-        movdqa      s0, xmm0

-        movdqa      s1, xmm6

-        movdqa      s2, xmm7

-        movdqa      s3, xmm9

-        movdqa      s4, xmm4

-        movdqa      s5, xmm10

-        movdqa      s6, xmm3

-        movdqa      s7, xmm11

-        movdqa      s8, xmm1

-        movdqa      s9, xmm12

-        movdqa      s10, xmm8

-        movdqa      s11, xmm13

-        movdqa      s12, xmm2

-        movdqa      s13, xmm14

-        movdqa      s14, xmm5

-        movdqa      s15, xmm15

-    ; free stack space

-    add          rsp, stack_size

-    ; un-ALIGN_STACK

-    pop          rsp

-%if LIBVPX_YASM_WIN64

-    pop    r13

-    pop    r12

-    RESTORE_XMM

-    pop    rbp

-%endif

-    ret

-SECTION_RODATA

-align 16

-te0:                    ; top three bits of each byte: sign fill for the emulated >> 3 in LF_FILTER

-    times 16 db 0xe0

-align 16

-t7f:                    ; low seven bits of each byte: kept after the emulated >> 1 in LF_FILTER

-    times 16 db 0x7f

-align 16

-tfe:                    ; clears bit 0 of each byte before the psrlw by 1 in LF_FILTER_HEV_MASK

-    times 16 db 0xfe

-align 16

-t1f:                    ; low five bits of each byte: kept after the emulated >> 3 in LF_FILTER

-    times 16 db 0x1f

-align 16

-t80:                    ; sign bit of each byte: xor converts unsigned <-> signed; also the sign fill for the emulated >> 1

-    times 16 db 0x80

-align 16

-t1:                     ; +1 added before the >> 1 of the outer tap adjustment

-    times 16 db 0x01

-align 16

-t3:                     ; added to the filter value to form Filter2

-    times 16 db 0x03

-align 16

-t4:                     ; added to the filter value to form Filter1

-    times 16 db 0x04

--- /dev/null

+++ b/vp8/common/x86/loopfilter_block_sse2_x86_64.asm

@@ -1,0 +1,815 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+%macro LF_ABS 2

+        ; %1 value not preserved

+        ; %2 value preserved

+        ; output in %1

+        movdqa      scratch1, %2            ; v2

+        psubusb     scratch1, %1            ; v2 - v1

+        psubusb     %1, %2                  ; v1 - v2

+        por         %1, scratch1            ; abs(v2 - v1)

+%endmacro

+%macro LF_FILTER_HEV_MASK 8-9

+        LF_ABS      %1, %2                  ; abs(p3 - p2)

+        LF_ABS      %2, %3                  ; abs(p2 - p1)

+        pmaxub      %1, %2                  ; accumulate mask

+%if %0 == 8

+        movdqa      scratch2, %3            ; save p1

+        LF_ABS      scratch2, %4            ; abs(p1 - p0)

+%endif

+        LF_ABS      %4, %5                  ; abs(p0 - q0)

+        LF_ABS      %5, %6                  ; abs(q0 - q1)

+%if %0 == 8

+        pmaxub      %5, scratch2            ; accumulate hev

+%else

+        pmaxub      %5, %9

+%endif

+        pmaxub      %1, %5                  ; accumulate mask

+        LF_ABS      %3, %6                  ; abs(p1 - q1)

+        LF_ABS      %6, %7                  ; abs(q1 - q2)

+        pmaxub      %1, %6                  ; accumulate mask

+        LF_ABS      %7, %8                  ; abs(q2 - q3)

+        pmaxub      %1, %7                  ; accumulate mask

+        paddusb     %4, %4                  ; 2 * abs(p0 - q0)

+        pand        %3, [GLOBAL(tfe)]

+        psrlw       %3, 1                   ; abs(p1 - q1) / 2

+        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

+        psubusb     %1, [limit]

+        psubusb     %4, [blimit]

+        por         %1, %4

+        pcmpeqb     %1, zero                ; mask

+        psubusb     %5, [thresh]

+        pcmpeqb     %5, zero                ; ~hev

+%endmacro

+%macro LF_FILTER 6

+        ; %1-%4: p1, p0, q0, q1

+        ; %5: mask

+        ; %6: ~hev (inverted hev mask, as produced by LF_FILTER_HEV_MASK)

+        movdqa      scratch2, %6            ; working copy of ~hev (consumed by pandn below)

+        pxor        %1, [GLOBAL(t80)]       ; ps1

+        pxor        %4, [GLOBAL(t80)]       ; qs1

+        movdqa      scratch1, %1

+        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)

+        pandn       scratch2, scratch1      ; vp8_filter &= hev

+        pxor        %2, [GLOBAL(t80)]       ; ps0

+        pxor        %3, [GLOBAL(t80)]       ; qs0

+        movdqa      scratch1, %3

+        psubsb      scratch1, %2            ; qs0 - ps0

+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)

+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)

+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)

+        pand        %5, scratch2            ; &= mask

+        movdqa      scratch2, %5

+        paddsb      %5, [GLOBAL(t4)]        ; Filter1

+        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

+        ; Filter1 >> 3

+        movdqa      scratch1, zero

+        pcmpgtb     scratch1, %5

+        psrlw       %5, 3

+        pand        scratch1, [GLOBAL(te0)]

+        pand        %5, [GLOBAL(t1f)]

+        por         %5, scratch1

+        psubsb      %3, %5                  ; qs0 - Filter1

+        pxor        %3, [GLOBAL(t80)]

+        ; Filter2 >> 3

+        movdqa      scratch1, zero

+        pcmpgtb     scratch1, scratch2

+        psrlw       scratch2, 3

+        pand        scratch1, [GLOBAL(te0)]

+        pand        scratch2, [GLOBAL(t1f)]

+        por         scratch2, scratch1

+        paddsb      %2, scratch2            ; ps0 + Filter2

+        pxor        %2, [GLOBAL(t80)]

+        ; outer tap adjustments

+        paddsb      %5, [GLOBAL(t1)]

+        movdqa      scratch1, zero

+        pcmpgtb     scratch1, %5

+        psrlw       %5, 1

+        pand        scratch1, [GLOBAL(t80)]

+        pand        %5, [GLOBAL(t7f)]

+        por         %5, scratch1

+        pand        %5, %6                  ; vp8_filter &= ~hev

+        psubsb      %4, %5                  ; qs1 - vp8_filter

+        pxor        %4, [GLOBAL(t80)]

+        paddsb      %1, %5                  ; ps1 + vp8_filter

+        pxor        %1, [GLOBAL(t80)]

+%endmacro

+;void vp8_loop_filter_bh_y_sse2

+;(

+;    unsigned char *src_ptr,

+;    int            src_pixel_step,

+;    const char    *blimit,

+;    const char    *limit,

+;    const char    *thresh

+;)

+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE

+sym(vp8_loop_filter_bh_y_sse2):

+%if LIBVPX_YASM_WIN64

+    %define src      rcx ; src_ptr

+    %define stride   rdx ; src_pixel_step

+    %define blimit   r8

+    %define limit    r9

+    %define thresh   r10

+    %define spp      rax

+    %define stride3  r11

+    %define stride5  r12

+    %define stride7  r13

+    push    rbp

+    mov     rbp, rsp

+    SAVE_XMM 11

+    push    r12

+    push    r13

+    mov     thresh, arg(4)

+%else

+    %define src      rdi ; src_ptr

+    %define stride   rsi ; src_pixel_step

+    %define blimit   rdx

+    %define limit    rcx

+    %define thresh   r8

+    %define spp      rax

+    %define stride3  r9

+    %define stride5  r10

+    %define stride7  r11

+%endif

+    %define scratch1 xmm5

+    %define scratch2 xmm6

+    %define zero     xmm7

+    %define i0       [src]

+    %define i1       [spp]

+    %define i2       [src + 2 * stride]

+    %define i3       [spp + 2 * stride]

+    %define i4       [src + 4 * stride]

+    %define i5       [spp + 4 * stride]

+    %define i6       [src + 2 * stride3]

+    %define i7       [spp + 2 * stride3]

+    %define i8       [src + 8 * stride]

+    %define i9       [spp + 8 * stride]

+    %define i10      [src + 2 * stride5]

+    %define i11      [spp + 2 * stride5]

+    %define i12      [src + 4 * stride3]

+    %define i13      [spp + 4 * stride3]

+    %define i14      [src + 2 * stride7]

+    %define i15      [spp + 2 * stride7]

+    ; prep work

+    lea         spp, [src + stride]

+    lea         stride3, [stride + 2 * stride]

+    lea         stride5, [stride3 + 2 * stride]

+    lea         stride7, [stride3 + 4 * stride]

+    pxor        zero, zero

+        ; load the first set into registers

+        movdqa       xmm0, i0

+        movdqa       xmm1, i1

+        movdqa       xmm2, i2

+        movdqa       xmm3, i3

+        movdqa       xmm4, i4

+        movdqa       xmm8, i5

+        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0) of the second set

+        movdqa       xmm10, i7

+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

+        movdqa       xmm1, i2

+        movdqa       xmm2, i3

+        movdqa       xmm3, i4

+        movdqa       xmm8, i5

+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4

+        movdqa       i2, xmm1

+        movdqa       i3, xmm2

+; second set

+        movdqa       i4, xmm3

+        movdqa       i5, xmm8

+        movdqa       xmm0, i6

+        movdqa       xmm1, i7

+        movdqa       xmm2, i8

+        movdqa       xmm4, i9

+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0) of the last set

+        movdqa       xmm11, i11

+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

+        movdqa       xmm0, i6

+        movdqa       xmm1, i7

+        movdqa       xmm4, i8

+        movdqa       xmm8, i9

+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

+        movdqa       i6, xmm0

+        movdqa       i7, xmm1

+; last set

+        movdqa       i8, xmm4

+        movdqa       i9, xmm8

+        movdqa       xmm0, i10

+        movdqa       xmm1, i11

+        movdqa       xmm2, i12

+        movdqa       xmm3, i13

+        movdqa       xmm9, i14   ; q2 (last set: no later set reuses the leftover value)

+        movdqa       xmm11, i15

+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

+        movdqa       xmm0, i10

+        movdqa       xmm1, i11

+        movdqa       xmm3, i12

+        movdqa       xmm8, i13

+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2

+        movdqa       i10, xmm0

+        movdqa       i11, xmm1

+        movdqa       i12, xmm3

+        movdqa       i13, xmm8

+%if LIBVPX_YASM_WIN64

+    pop    r13

+    pop    r12

+    RESTORE_XMM

+    pop    rbp

+%endif

+    ret

+;void vp8_loop_filter_bv_y_sse2

+;(

+;    unsigned char *src_ptr,

+;    int            src_pixel_step,

+;    const char    *blimit,

+;    const char    *limit,

+;    const char    *thresh

+;)

+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE

+sym(vp8_loop_filter_bv_y_sse2):

+%if LIBVPX_YASM_WIN64

+    %define src      rcx ; src_ptr

+    %define stride   rdx ; src_pixel_step

+    %define blimit   r8

+    %define limit    r9

+    %define thresh   r10

+    %define spp      rax

+    %define stride3  r11

+    %define stride5  r12

+    %define stride7  r13

+    push    rbp

+    mov     rbp, rsp

+    SAVE_XMM 15

+    push    r12

+    push    r13

+    mov     thresh, arg(4)

+%else

+    %define src      rdi

+    %define stride   rsi

+    %define blimit   rdx

+    %define limit    rcx

+    %define thresh   r8

+    %define spp      rax

+    %define stride3  r9

+    %define stride5  r10

+    %define stride7  r11

+%endif

+    %define scratch1 xmm5

+    %define scratch2 xmm6

+    %define zero     xmm7

+    %define s0       [src]

+    %define s1       [spp]

+    %define s2       [src + 2 * stride]

+    %define s3       [spp + 2 * stride]

+    %define s4       [src + 4 * stride]

+    %define s5       [spp + 4 * stride]

+    %define s6       [src + 2 * stride3]

+    %define s7       [spp + 2 * stride3]

+    %define s8       [src + 8 * stride]

+    %define s9       [spp + 8 * stride]

+    %define s10      [src + 2 * stride5]

+    %define s11      [spp + 2 * stride5]

+    %define s12      [src + 4 * stride3]

+    %define s13      [spp + 4 * stride3]

+    %define s14      [src + 2 * stride7]

+    %define s15      [spp + 2 * stride7]

+    %define i0       [rsp]

+    %define i1       [rsp + 16]

+    %define i2       [rsp + 32]

+    %define i3       [rsp + 48]

+    %define i4       [rsp + 64]

+    %define i5       [rsp + 80]

+    %define i6       [rsp + 96]

+    %define i7       [rsp + 112]

+    %define i8       [rsp + 128]

+    %define i9       [rsp + 144]

+    %define i10      [rsp + 160]

+    %define i11      [rsp + 176]

+    %define i12      [rsp + 192]

+    %define i13      [rsp + 208]

+    %define i14      [rsp + 224]

+    %define i15      [rsp + 240]

+    ALIGN_STACK 16, rax

+    ; reserve stack space

+    %define      temp_storage  0 ; size is 256 (16*16)

+    %define      stack_size 256

+    sub          rsp, stack_size

+    ; prep work

+    lea         spp, [src + stride]

+    lea         stride3, [stride + 2 * stride]

+    lea         stride5, [stride3 + 2 * stride]

+    lea         stride7, [stride3 + 4 * stride]

+        ; 8-f

+        movdqa      xmm0, s8

+        movdqa      xmm1, xmm0

+        punpcklbw   xmm0, s9                ; 80 90

+        punpckhbw   xmm1, s9                ; 88 98

+        movdqa      xmm2, s10

+        movdqa      xmm3, xmm2

+        punpcklbw   xmm2, s11 ; a0 b0

+        punpckhbw   xmm3, s11 ; a8 b8

+        movdqa      xmm4, xmm0

+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0

+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

+        movdqa      xmm2, xmm1

+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8

+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

+        ; using xmm[0124]

+        ; work on next 4 rows

+        movdqa      xmm3, s12

+        movdqa      xmm5, xmm3

+        punpcklbw   xmm3, s13 ; c0 d0

+        punpckhbw   xmm5, s13 ; c8 d8

+        movdqa      xmm6, s14

+        movdqa      xmm7, xmm6

+        punpcklbw   xmm6, s15 ; e0 f0

+        punpckhbw   xmm7, s15 ; e8 f8

+        movdqa      xmm8, xmm3

+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0

+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

+        movdqa      xmm6, xmm5

+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8

+        punpckhwd   xmm6, xmm7              ; cc dc ec fc

+        ; pull the third and fourth sets together

+        movdqa      xmm7, xmm0

+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0

+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

+        movdqa      xmm3, xmm4

+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4

+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

+        movdqa      xmm8, xmm1

+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8

+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

+        movdqa      xmm5, xmm2

+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc

+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

+        ; save the calculations. we only have 16 xmm registers ...

+        movdqa      i0, xmm0

+        movdqa      i1, xmm7

+        movdqa      i2, xmm4

+        movdqa      i3, xmm3

+        movdqa      i4, xmm1

+        movdqa      i5, xmm8

+        movdqa      i6, xmm2

+        movdqa      i7, xmm5

+        ; 0-7

+        movdqa      xmm0, s0

+        movdqa      xmm1, xmm0

+        punpcklbw   xmm0, s1 ; 00 10

+        punpckhbw   xmm1, s1 ; 08 18

+        movdqa      xmm2, s2

+        movdqa      xmm3, xmm2

+        punpcklbw   xmm2, s3 ; 20 30

+        punpckhbw   xmm3, s3 ; 28 38

+        movdqa      xmm4, xmm0

+        punpcklwd   xmm0, xmm2              ; 00 10 20 30

+        punpckhwd   xmm4, xmm2              ; 04 14 24 34

+        movdqa      xmm2, xmm1

+        punpcklwd   xmm1, xmm3              ; 08 18 28 38

+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

+        ; using xmm[0124]

+        ; work on next 4 rows

+        movdqa      xmm3, s4

+        movdqa      xmm5, xmm3

+        punpcklbw   xmm3, s5 ; 40 50

+        punpckhbw   xmm5, s5 ; 48 58

+        movdqa      xmm6, s6

+        movdqa      xmm7, xmm6

+        punpcklbw   xmm6, s7   ; 60 70

+        punpckhbw   xmm7, s7   ; 68 78

+        movdqa      xmm8, xmm3

+        punpcklwd   xmm3, xmm6              ; 40 50 60 70

+        punpckhwd   xmm8, xmm6              ; 44 54 64 74

+        movdqa      xmm6, xmm5

+        punpcklwd   xmm5, xmm7              ; 48 58 68 78

+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

+        ; pull the first two sets together

+        movdqa      xmm7, xmm0

+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70

+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

+        movdqa      xmm3, xmm4

+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74

+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

+        movdqa      xmm8, xmm1

+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78

+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

+        movdqa      xmm5, xmm2

+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c

+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

+        ; final combination

+        movdqa      xmm6, xmm0

+        punpcklqdq  xmm0, i0

+        punpckhqdq  xmm6, i0

+        movdqa      xmm9, xmm7

+        punpcklqdq  xmm7, i1

+        punpckhqdq  xmm9, i1

+        movdqa      xmm10, xmm4

+        punpcklqdq  xmm4, i2

+        punpckhqdq  xmm10, i2

+        movdqa      xmm11, xmm3

+        punpcklqdq  xmm3, i3

+        punpckhqdq  xmm11, i3

+        movdqa      xmm12, xmm1

+        punpcklqdq  xmm1, i4

+        punpckhqdq  xmm12, i4

+        movdqa      xmm13, xmm8

+        punpcklqdq  xmm8, i5

+        punpckhqdq  xmm13, i5

+        movdqa      xmm14, xmm2

+        punpcklqdq  xmm2, i6

+        punpckhqdq  xmm14, i6

+        movdqa      xmm15, xmm5

+        punpcklqdq  xmm5, i7

+        punpckhqdq  xmm15, i7

+        movdqa      i0, xmm0

+        movdqa      i1, xmm6

+        movdqa      i2, xmm7

+        movdqa      i3, xmm9

+        movdqa      i4, xmm4

+        movdqa      i5, xmm10

+        movdqa      i6, xmm3

+        movdqa      i7, xmm11

+        movdqa      i8, xmm1

+        movdqa      i9, xmm12

+        movdqa      i10, xmm8

+        movdqa      i11, xmm13

+        movdqa      i12, xmm2

+        movdqa      i13, xmm14

+        movdqa      i14, xmm5

+        movdqa      i15, xmm15

+; TRANSPOSED DATA AVAILABLE ON THE STACK

+        movdqa      xmm12, xmm6

+        movdqa      xmm13, xmm7

+        pxor        zero, zero

+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

+        movdqa       xmm1, i2

+        movdqa       xmm2, i3

+        movdqa       xmm8, i4

+        movdqa       xmm9, i5

+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4

+        movdqa       i2, xmm1

+        movdqa       i3, xmm2

+; second set

+        movdqa       i4, xmm8

+        movdqa       i5, xmm9

+        movdqa       xmm0, i6

+        movdqa       xmm1, i7

+        movdqa       xmm2, i8

+        movdqa       xmm4, i9

+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0) of the last set

+        movdqa       xmm11, i11

+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

+        movdqa       xmm0, i6

+        movdqa       xmm1, i7

+        movdqa       xmm3, i8

+        movdqa       xmm4, i9

+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2

+        movdqa       i6, xmm0

+        movdqa       i7, xmm1

+; last set

+        movdqa       i8, xmm3

+        movdqa       i9, xmm4

+        movdqa       xmm0, i10

+        movdqa       xmm1, i11

+        movdqa       xmm2, i12

+        movdqa       xmm8, i13

+        movdqa       xmm9, i14   ; q2 (last set: no later set reuses the leftover value)

+        movdqa       xmm11, i15

+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

+        movdqa       xmm0, i10

+        movdqa       xmm1, i11

+        movdqa       xmm4, i12

+        movdqa       xmm8, i13

+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

+        movdqa       i10, xmm0

+        movdqa       i11, xmm1

+        movdqa       i12, xmm4

+        movdqa       i13, xmm8

+; RESHUFFLE AND WRITE OUT

+        ; 8-f

+        movdqa      xmm0, i8

+        movdqa      xmm1, xmm0

+        punpcklbw   xmm0, i9                ; 80 90

+        punpckhbw   xmm1, i9                ; 88 98

+        movdqa      xmm2, i10

+        movdqa      xmm3, xmm2

+        punpcklbw   xmm2, i11               ; a0 b0

+        punpckhbw   xmm3, i11               ; a8 b8

+        movdqa      xmm4, xmm0

+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0

+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

+        movdqa      xmm2, xmm1

+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8

+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

+        ; using xmm[0124]

+        ; work on next 4 rows

+        movdqa      xmm3, i12

+        movdqa      xmm5, xmm3

+        punpcklbw   xmm3, i13               ; c0 d0

+        punpckhbw   xmm5, i13               ; c8 d8

+        movdqa      xmm6, i14

+        movdqa      xmm7, xmm6

+        punpcklbw   xmm6, i15               ; e0 f0

+        punpckhbw   xmm7, i15               ; e8 f8

+        movdqa      xmm8, xmm3

+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0

+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

+        movdqa      xmm6, xmm5

+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8

+        punpckhwd   xmm6, xmm7              ; cc dc ec fc

+        ; pull the third and fourth sets together

+        movdqa      xmm7, xmm0

+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0

+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

+        movdqa      xmm3, xmm4

+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4

+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

+        movdqa      xmm8, xmm1

+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8

+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

+        movdqa      xmm5, xmm2

+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc

+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

+        ; save the calculations. we only have 16 xmm registers ...

+        movdqa      i8, xmm0

+        movdqa      i9, xmm7

+        movdqa      i10, xmm4

+        movdqa      i11, xmm3

+        movdqa      i12, xmm1

+        movdqa      i13, xmm8

+        movdqa      i14, xmm2

+        movdqa      i15, xmm5

+        ; 0-7

+        movdqa      xmm0, i0

+        movdqa      xmm1, xmm0

+        punpcklbw   xmm0, i1                ; 00 10

+        punpckhbw   xmm1, i1                ; 08 18

+        movdqa      xmm2, i2

+        movdqa      xmm3, xmm2

+        punpcklbw   xmm2, i3                ; 20 30

+        punpckhbw   xmm3, i3                ; 28 38

+        movdqa      xmm4, xmm0

+        punpcklwd   xmm0, xmm2              ; 00 10 20 30

+        punpckhwd   xmm4, xmm2              ; 04 14 24 34

+        movdqa      xmm2, xmm1

+        punpcklwd   xmm1, xmm3              ; 08 18 28 38

+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

+        ; using xmm[0124]

+        ; work on next 4 rows

+        movdqa      xmm3, i4

+        movdqa      xmm5, xmm3

+        punpcklbw   xmm3, i5                ; 40 50

+        punpckhbw   xmm5, i5                ; 48 58

+        movdqa      xmm6, i6

+        movdqa      xmm7, xmm6

+        punpcklbw   xmm6, i7                ; 60 70

+        punpckhbw   xmm7, i7                ; 68 78

+        movdqa      xmm8, xmm3

+        punpcklwd   xmm3, xmm6              ; 40 50 60 70

+        punpckhwd   xmm8, xmm6              ; 44 54 64 74

+        movdqa      xmm6, xmm5

+        punpcklwd   xmm5, xmm7              ; 48 58 68 78

+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

+        ; pull the first two sets together

+        movdqa      xmm7, xmm0

+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70

+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

+        movdqa      xmm3, xmm4

+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74

+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

+        movdqa      xmm8, xmm1

+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78

+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

+        movdqa      xmm5, xmm2

+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c

+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

+        ; final combination

+        movdqa      xmm6, xmm0

+        punpcklqdq  xmm0, i8

+        punpckhqdq  xmm6, i8

+        movdqa      xmm9, xmm7

+        punpcklqdq  xmm7, i9

+        punpckhqdq  xmm9, i9

+        movdqa      xmm10, xmm4

+        punpcklqdq  xmm4, i10

+        punpckhqdq  xmm10, i10

+        movdqa      xmm11, xmm3

+        punpcklqdq  xmm3, i11

+        punpckhqdq  xmm11, i11

+        movdqa      xmm12, xmm1

+        punpcklqdq  xmm1, i12

+        punpckhqdq  xmm12, i12

+        movdqa      xmm13, xmm8

+        punpcklqdq  xmm8, i13

+        punpckhqdq  xmm13, i13

+        movdqa      xmm14, xmm2

+        punpcklqdq  xmm2, i14

+        punpckhqdq  xmm14, i14

+        movdqa      xmm15, xmm5

+        punpcklqdq  xmm5, i15

+        punpckhqdq  xmm15, i15

+        movdqa      s0, xmm0

+        movdqa      s1, xmm6

+        movdqa      s2, xmm7

+        movdqa      s3, xmm9

+        movdqa      s4, xmm4

+        movdqa      s5, xmm10

+        movdqa      s6, xmm3

+        movdqa      s7, xmm11

+        movdqa      s8, xmm1

+        movdqa      s9, xmm12

+        movdqa      s10, xmm8

+        movdqa      s11, xmm13

+        movdqa      s12, xmm2

+        movdqa      s13, xmm14

+        movdqa      s14, xmm5

+        movdqa      s15, xmm15

+    ; free stack space

+    add          rsp, stack_size

+    ; un-ALIGN_STACK

+    pop          rsp

+%if LIBVPX_YASM_WIN64

+    pop    r13

+    pop    r12

+    RESTORE_XMM

+    pop    rbp

+%endif

+    ret

+SECTION_RODATA

+align 16

+te0:

+    times 16 db 0xe0

+align 16

+t7f:

+    times 16 db 0x7f

+align 16

+tfe:

+    times 16 db 0xfe

+align 16

+t1f:

+    times 16 db 0x1f

+align 16

+t80:

+    times 16 db 0x80

+align 16

+t1:

+    times 16 db 0x01

+align 16

+t3:

+    times 16 db 0x03

+align 16

+t4:

+    times 16 db 0x04

--- a/vp8/encoder/x86/ssim_opt.asm

+++ /dev/null

@@ -1,216 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr

-%macro TABULATE_SSIM 0

-        paddusw         xmm15, xmm3  ; sum_s

-        paddusw         xmm14, xmm4  ; sum_r

-        movdqa          xmm1, xmm3

-        pmaddwd         xmm1, xmm1

-        paddd           xmm13, xmm1 ; sum_sq_s

-        movdqa          xmm2, xmm4

-        pmaddwd         xmm2, xmm2

-        paddd           xmm12, xmm2 ; sum_sq_r

-        pmaddwd         xmm3, xmm4

-        paddd           xmm11, xmm3  ; sum_sxr

-%endmacro

-; Sum across the register %1 starting with q words

-%macro SUM_ACROSS_Q 1

-        movdqa          xmm2,%1

-        punpckldq       %1,xmm0

-        punpckhdq       xmm2,xmm0

-        paddq           %1,xmm2

-        movdqa          xmm2,%1

-        punpcklqdq      %1,xmm0

-        punpckhqdq      xmm2,xmm0

-        paddq           %1,xmm2

-%endmacro

-; Sum across the register %1 starting with q words

-%macro SUM_ACROSS_W 1

-        movdqa          xmm1, %1

-        punpcklwd       %1,xmm0

-        punpckhwd       xmm1,xmm0

-        paddd           %1, xmm1

-        SUM_ACROSS_Q    %1

-%endmacro

-;void ssim_parms_sse2(

-;    unsigned char *s,

-;    int sp,

-;    unsigned char *r,

-;    int rp

-;    unsigned long *sum_s,

-;    unsigned long *sum_r,

-;    unsigned long *sum_sq_s,

-;    unsigned long *sum_sq_r,

-;    unsigned long *sum_sxr);

-;

-; TODO: Use parm passing through structure, probably don't need the pxors

-; ( calling app will initialize to 0 ) could easily fit everything in sse2

-; without too much hastle, and can probably do better estimates with psadw

-; or pavgb At this point this is just meant to be first pass for calculating

-; all the parms needed for 16x16 ssim so we can play with dssim as distortion

-; in mode selection code.

-global sym(vp8_ssim_parms_16x16_sse2) PRIVATE

-sym(vp8_ssim_parms_16x16_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 9

-    SAVE_XMM 15

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov             rsi,        arg(0) ;s

-    mov             rcx,        arg(1) ;sp

-    mov             rdi,        arg(2) ;r

-    mov             rax,        arg(3) ;rp

-    pxor            xmm0, xmm0

-    pxor            xmm15,xmm15  ;sum_s

-    pxor            xmm14,xmm14  ;sum_r

-    pxor            xmm13,xmm13  ;sum_sq_s

-    pxor            xmm12,xmm12  ;sum_sq_r

-    pxor            xmm11,xmm11  ;sum_sxr

-    mov             rdx, 16      ;row counter

-.NextRow:

-    ;grab source and reference pixels

-    movdqu          xmm5, [rsi]

-    movdqu          xmm6, [rdi]

-    movdqa          xmm3, xmm5

-    movdqa          xmm4, xmm6

-    punpckhbw       xmm3, xmm0 ; high_s

-    punpckhbw       xmm4, xmm0 ; high_r

-    TABULATE_SSIM

-    movdqa          xmm3, xmm5

-    movdqa          xmm4, xmm6

-    punpcklbw       xmm3, xmm0 ; low_s

-    punpcklbw       xmm4, xmm0 ; low_r

-    TABULATE_SSIM

-    add             rsi, rcx   ; next s row

-    add             rdi, rax   ; next r row

-    dec             rdx        ; counter

-    jnz .NextRow

-    SUM_ACROSS_W    xmm15

-    SUM_ACROSS_W    xmm14

-    SUM_ACROSS_Q    xmm13

-    SUM_ACROSS_Q    xmm12

-    SUM_ACROSS_Q    xmm11

-    mov             rdi,arg(4)

-    movd            [rdi], xmm15;

-    mov             rdi,arg(5)

-    movd            [rdi], xmm14;

-    mov             rdi,arg(6)

-    movd            [rdi], xmm13;

-    mov             rdi,arg(7)

-    movd            [rdi], xmm12;

-    mov             rdi,arg(8)

-    movd            [rdi], xmm11;

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void ssim_parms_sse2(

-;    unsigned char *s,

-;    int sp,

-;    unsigned char *r,

-;    int rp

-;    unsigned long *sum_s,

-;    unsigned long *sum_r,

-;    unsigned long *sum_sq_s,

-;    unsigned long *sum_sq_r,

-;    unsigned long *sum_sxr);

-;

-; TODO: Use parm passing through structure, probably don't need the pxors

-; ( calling app will initialize to 0 ) could easily fit everything in sse2

-; without too much hastle, and can probably do better estimates with psadw

-; or pavgb At this point this is just meant to be first pass for calculating

-; all the parms needed for 16x16 ssim so we can play with dssim as distortion

-; in mode selection code.

-global sym(vp8_ssim_parms_8x8_sse2) PRIVATE

-sym(vp8_ssim_parms_8x8_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 9

-    SAVE_XMM 15

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov             rsi,        arg(0) ;s

-    mov             rcx,        arg(1) ;sp

-    mov             rdi,        arg(2) ;r

-    mov             rax,        arg(3) ;rp

-    pxor            xmm0, xmm0

-    pxor            xmm15,xmm15  ;sum_s

-    pxor            xmm14,xmm14  ;sum_r

-    pxor            xmm13,xmm13  ;sum_sq_s

-    pxor            xmm12,xmm12  ;sum_sq_r

-    pxor            xmm11,xmm11  ;sum_sxr

-    mov             rdx, 8      ;row counter

-.NextRow:

-    ;grab source and reference pixels

-    movq            xmm3, [rsi]

-    movq            xmm4, [rdi]

-    punpcklbw       xmm3, xmm0 ; low_s

-    punpcklbw       xmm4, xmm0 ; low_r

-    TABULATE_SSIM

-    add             rsi, rcx   ; next s row

-    add             rdi, rax   ; next r row

-    dec             rdx        ; counter

-    jnz .NextRow

-    SUM_ACROSS_W    xmm15

-    SUM_ACROSS_W    xmm14

-    SUM_ACROSS_Q    xmm13

-    SUM_ACROSS_Q    xmm12

-    SUM_ACROSS_Q    xmm11

-    mov             rdi,arg(4)

-    movd            [rdi], xmm15;

-    mov             rdi,arg(5)

-    movd            [rdi], xmm14;

-    mov             rdi,arg(6)

-    movd            [rdi], xmm13;

-    mov             rdi,arg(7)

-    movd            [rdi], xmm12;

-    mov             rdi,arg(8)

-    movd            [rdi], xmm11;

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- /dev/null

+++ b/vp8/encoder/x86/ssim_opt_x86_64.asm

@@ -1,0 +1,216 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr

+%macro TABULATE_SSIM 0

+        paddusw         xmm15, xmm3  ; sum_s

+        paddusw         xmm14, xmm4  ; sum_r

+        movdqa          xmm1, xmm3

+        pmaddwd         xmm1, xmm1

+        paddd           xmm13, xmm1 ; sum_sq_s

+        movdqa          xmm2, xmm4

+        pmaddwd         xmm2, xmm2

+        paddd           xmm12, xmm2 ; sum_sq_r

+        pmaddwd         xmm3, xmm4

+        paddd           xmm11, xmm3  ; sum_sxr

+%endmacro

+; Sum the four dwords of register %1 into its low qword (uses xmm2; needs xmm0 == 0)

+%macro SUM_ACROSS_Q 1

+        movdqa          xmm2,%1

+        punpckldq       %1,xmm0

+        punpckhdq       xmm2,xmm0

+        paddq           %1,xmm2

+        movdqa          xmm2,%1

+        punpcklqdq      %1,xmm0

+        punpckhqdq      xmm2,xmm0

+        paddq           %1,xmm2

+%endmacro

+; Sum the eight words of register %1 into its low qword (uses xmm1, xmm2; needs xmm0 == 0)

+%macro SUM_ACROSS_W 1

+        movdqa          xmm1, %1

+        punpcklwd       %1,xmm0

+        punpckhwd       xmm1,xmm0

+        paddd           %1, xmm1

+        SUM_ACROSS_Q    %1

+%endmacro

+;void vp8_ssim_parms_16x16_sse2(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; TODO: Use parm passing through structure, probably don't need the pxors

+; ( calling app will initialize to 0 ) could easily fit everything in sse2

+; without too much hassle, and can probably do better estimates with psadbw

+; or pavgb At this point this is just meant to be first pass for calculating

+; all the parms needed for 16x16 ssim so we can play with dssim as distortion

+; in mode selection code.

+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE

+sym(vp8_ssim_parms_16x16_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM 15

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp

+    pxor            xmm0, xmm0

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 16      ;row counter

+.NextRow:

+    ;grab source and reference pixels

+    movdqu          xmm5, [rsi]

+    movdqu          xmm6, [rdi]

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpckhbw       xmm3, xmm0 ; high_s

+    punpckhbw       xmm4, xmm0 ; high_r

+    TABULATE_SSIM

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz .NextRow

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    mov             rdi,arg(4)

+    movd            [rdi], xmm15;

+    mov             rdi,arg(5)

+    movd            [rdi], xmm14;

+    mov             rdi,arg(6)

+    movd            [rdi], xmm13;

+    mov             rdi,arg(7)

+    movd            [rdi], xmm12;

+    mov             rdi,arg(8)

+    movd            [rdi], xmm11;

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp8_ssim_parms_8x8_sse2(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; TODO: Use parm passing through structure, probably don't need the pxors

+; ( calling app will initialize to 0 ) could easily fit everything in sse2

+; without too much hassle, and can probably do better estimates with psadbw

+; or pavgb At this point this is just meant to be first pass for calculating

+; all the parms needed for 16x16 ssim so we can play with dssim as distortion

+; in mode selection code.

+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE

+sym(vp8_ssim_parms_8x8_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM 15

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp

+    pxor            xmm0, xmm0

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 8      ;row counter

+.NextRow:

+    ;grab source and reference pixels

+    movq            xmm3, [rsi]

+    movq            xmm4, [rdi]

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz .NextRow

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    mov             rdi,arg(4)

+    movd            [rdi], xmm15;

+    mov             rdi,arg(5)

+    movd            [rdi], xmm14;

+    mov             rdi,arg(6)

+    movd            [rdi], xmm13;

+    mov             rdi,arg(7)

+    movd            [rdi], xmm12;

+    mov             rdi,arg(8)

+    movd            [rdi], xmm11;

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -115,7 +115,7 @@

 endif

 ifeq ($(ARCH_X86_64),yes)

-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm

+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm

 endif

 # common (c)

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -100,7 +100,7 @@

 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm

 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm

+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm

 ifeq ($(CONFIG_REALTIME_ONLY),yes)

 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm

--- a/vp9/common/x86/vp9_idct_ssse3.asm

+++ /dev/null

@@ -1,300 +1,0 @@

-;

-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "third_party/x86inc/x86inc.asm"

-; This file provides SSSE3 version of the inverse transformation. Part

-; of the functions are originally derived from the ffmpeg project.

-; Note that the current version applies to x86 64-bit only.

-SECTION_RODATA

-pw_11585x2: times 8 dw 23170

-pd_8192:    times 4 dd 8192

-pw_16:      times 8 dw 16

-%macro TRANSFORM_COEFFS 2

-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2

-pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1

-%endmacro

-TRANSFORM_COEFFS    6270, 15137

-TRANSFORM_COEFFS    3196, 16069

-TRANSFORM_COEFFS   13623,  9102

-%macro PAIR_PP_COEFFS 2

-dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2

-%endmacro

-%macro PAIR_MP_COEFFS 2

-dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2

-%endmacro

-%macro PAIR_MM_COEFFS 2

-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2

-%endmacro

-PAIR_PP_COEFFS     30274, 12540

-PAIR_PP_COEFFS      6392, 32138

-PAIR_MP_COEFFS     18204, 27246

-PAIR_PP_COEFFS     12540, 12540

-PAIR_PP_COEFFS     30274, 30274

-PAIR_PP_COEFFS      6392,  6392

-PAIR_PP_COEFFS     32138, 32138

-PAIR_MM_COEFFS     18204, 18204

-PAIR_PP_COEFFS     27246, 27246

-SECTION .text

-%if ARCH_X86_64

-%macro SUM_SUB 3

-  psubw  m%3, m%1, m%2

-  paddw  m%1, m%2

-  SWAP    %2, %3

-%endmacro

-; butterfly operation

-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2

-  pmaddwd            m%1, m%3, %5

-  pmaddwd            m%2, m%3, %6

-  paddd              m%1,  %4

-  paddd              m%2,  %4

-  psrad              m%1,  14

-  psrad              m%2,  14

-%endmacro

-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2

-  punpckhwd          m%6, m%2, m%1

-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]

-  punpcklwd          m%2, m%1

-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]

-  packssdw           m%1, m%7

-  packssdw           m%2, m%6

-%endmacro

-; matrix transpose

-%macro INTERLEAVE_2X 4

-  punpckh%1          m%4, m%2, m%3

-  punpckl%1          m%2, m%3

-  SWAP               %3,  %4

-%endmacro

-%macro TRANSPOSE8X8 9

-  INTERLEAVE_2X  wd, %1, %2, %9

-  INTERLEAVE_2X  wd, %3, %4, %9

-  INTERLEAVE_2X  wd, %5, %6, %9

-  INTERLEAVE_2X  wd, %7, %8, %9

-  INTERLEAVE_2X  dq, %1, %3, %9

-  INTERLEAVE_2X  dq, %2, %4, %9

-  INTERLEAVE_2X  dq, %5, %7, %9

-  INTERLEAVE_2X  dq, %6, %8, %9

-  INTERLEAVE_2X  qdq, %1, %5, %9

-  INTERLEAVE_2X  qdq, %3, %7, %9

-  INTERLEAVE_2X  qdq, %2, %6, %9

-  INTERLEAVE_2X  qdq, %4, %8, %9

-  SWAP  %2, %5

-  SWAP  %4, %7

-%endmacro

-%macro IDCT8_1D 0

-  SUM_SUB          0,    4,    9

-  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10

-  pmulhrsw        m0,  m12

-  pmulhrsw        m4,  m12

-  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10

-  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10

-  SUM_SUB          1,    5,    9

-  SUM_SUB          7,    3,    9

-  SUM_SUB          0,    6,    9

-  SUM_SUB          4,    2,    9

-  SUM_SUB          3,    5,    9

-  pmulhrsw        m3,  m12

-  pmulhrsw        m5,  m12

-  SUM_SUB          0,    7,    9

-  SUM_SUB          4,    3,    9

-  SUM_SUB          2,    5,    9

-  SUM_SUB          6,    1,    9

-  SWAP             3,    6

-  SWAP             1,    4

-%endmacro

-; This macro handles 8 pixels per line

-%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero

-  paddw           m%1, m11

-  paddw           m%2, m11

-  psraw           m%1, 5

-  psraw           m%2, 5

-  movh            m%3, [outputq]

-  movh            m%4, [outputq + strideq]

-  punpcklbw       m%3, m%5

-  punpcklbw       m%4, m%5

-  paddw           m%3, m%1

-  paddw           m%4, m%2

-  packuswb        m%3, m%5

-  packuswb        m%4, m%5

-  movh               [outputq], m%3

-  movh     [outputq + strideq], m%4

-%endmacro

-INIT_XMM ssse3

-; full inverse 8x8 2D-DCT transform

-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride

-  mova     m8, [pd_8192]

-  mova    m11, [pw_16]

-  mova    m12, [pw_11585x2]

-  lea      r3, [2 * strideq]

-  mova     m0, [inputq +   0]

-  mova     m1, [inputq +  16]

-  mova     m2, [inputq +  32]

-  mova     m3, [inputq +  48]

-  mova     m4, [inputq +  64]

-  mova     m5, [inputq +  80]

-  mova     m6, [inputq +  96]

-  mova     m7, [inputq + 112]

-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

-  IDCT8_1D

-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

-  IDCT8_1D

-  pxor    m12, m12

-  ADD_STORE_8P_2X  0, 1, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  2, 3, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  4, 5, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  6, 7, 9, 10, 12

-  RET

-; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero

-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride

-  mova       m8, [pd_8192]

-  mova      m11, [pw_16]

-  mova      m12, [pw_11585x2]

-  lea        r3, [2 * strideq]

-  mova       m0, [inputq +  0]

-  mova       m1, [inputq + 16]

-  mova       m2, [inputq + 32]

-  mova       m3, [inputq + 48]

-  punpcklwd  m0, m1

-  punpcklwd  m2, m3

-  punpckhdq  m9, m0, m2

-  punpckldq  m0, m2

-  SWAP       2, 9

-  ; m0 -> [0], [0]

-  ; m1 -> [1], [1]

-  ; m2 -> [2], [2]

-  ; m3 -> [3], [3]

-  punpckhqdq m10, m0, m0

-  punpcklqdq m0,  m0

-  punpckhqdq m9,  m2, m2

-  punpcklqdq m2,  m2

-  SWAP       1, 10

-  SWAP       3,  9

-  pmulhrsw   m0, m12

-  pmulhrsw   m2, [dpw_30274_12540]

-  pmulhrsw   m1, [dpw_6392_32138]

-  pmulhrsw   m3, [dpw_m18204_27246]

-  SUM_SUB    0, 2, 9

-  SUM_SUB    1, 3, 9

-  punpcklqdq m9, m3, m3

-  punpckhqdq m5, m3, m9

-  SUM_SUB    3, 5, 9

-  punpckhqdq m5, m3

-  pmulhrsw   m5, m12

-  punpckhqdq m9, m1, m5

-  punpcklqdq m1, m5

-  SWAP       5, 9

-  SUM_SUB    0, 5, 9

-  SUM_SUB    2, 1, 9

-  punpckhqdq m3, m0, m0

-  punpckhqdq m4, m1, m1

-  punpckhqdq m6, m5, m5

-  punpckhqdq m7, m2, m2

-  punpcklwd  m0, m3

-  punpcklwd  m7, m2

-  punpcklwd  m1, m4

-  punpcklwd  m6, m5

-  punpckhdq  m4, m0, m7

-  punpckldq  m0, m7

-  punpckhdq  m10, m1, m6

-  punpckldq  m5, m1, m6

-  punpckhqdq m1, m0, m5

-  punpcklqdq m0, m5

-  punpckhqdq m3, m4, m10

-  punpcklqdq m2, m4, m10

-  pmulhrsw   m0, m12

-  pmulhrsw   m6, m2, [dpw_30274_30274]

-  pmulhrsw   m4, m2, [dpw_12540_12540]

-  pmulhrsw   m7, m1, [dpw_32138_32138]

-  pmulhrsw   m1, [dpw_6392_6392]

-  pmulhrsw   m5, m3, [dpw_m18204_m18204]

-  pmulhrsw   m3, [dpw_27246_27246]

-  mova       m2, m0

-  SUM_SUB    0, 6, 9

-  SUM_SUB    2, 4, 9

-  SUM_SUB    1, 5, 9

-  SUM_SUB    7, 3, 9

-  SUM_SUB    3, 5, 9

-  pmulhrsw   m3, m12

-  pmulhrsw   m5, m12

-  SUM_SUB    0, 7, 9

-  SUM_SUB    2, 3, 9

-  SUM_SUB    4, 5, 9

-  SUM_SUB    6, 1, 9

-  SWAP       3, 6

-  SWAP       1, 2

-  SWAP       2, 4

-  pxor    m12, m12

-  ADD_STORE_8P_2X  0, 1, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  2, 3, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  4, 5, 9, 10, 12

-  lea              outputq, [outputq + r3]

-  ADD_STORE_8P_2X  6, 7, 9, 10, 12

-  RET

-%endif

--- /dev/null

+++ b/vp9/common/x86/vp9_idct_ssse3_x86_64.asm

@@ -1,0 +1,300 @@

+;

+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+; This file provides SSSE3 version of the inverse transformation. Part

+; of the functions are originally derived from the ffmpeg project.

+; Note that the current version applies to x86 64-bit only.

+SECTION_RODATA

+pw_11585x2: times 8 dw 23170

+pd_8192:    times 4 dd 8192

+pw_16:      times 8 dw 16

+%macro TRANSFORM_COEFFS 2

+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2

+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1

+%endmacro

+TRANSFORM_COEFFS    6270, 15137

+TRANSFORM_COEFFS    3196, 16069

+TRANSFORM_COEFFS   13623,  9102

+%macro PAIR_PP_COEFFS 2

+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2

+%endmacro

+%macro PAIR_MP_COEFFS 2

+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2

+%endmacro

+%macro PAIR_MM_COEFFS 2

+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2

+%endmacro

+PAIR_PP_COEFFS     30274, 12540

+PAIR_PP_COEFFS      6392, 32138

+PAIR_MP_COEFFS     18204, 27246

+PAIR_PP_COEFFS     12540, 12540

+PAIR_PP_COEFFS     30274, 30274

+PAIR_PP_COEFFS      6392,  6392

+PAIR_PP_COEFFS     32138, 32138

+PAIR_MM_COEFFS     18204, 18204

+PAIR_PP_COEFFS     27246, 27246

+SECTION .text

+%if ARCH_X86_64

+%macro SUM_SUB 3

+  psubw  m%3, m%1, m%2

+  paddw  m%1, m%2

+  SWAP    %2, %3

+%endmacro

+; butterfly operation

+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2

+  pmaddwd            m%1, m%3, %5

+  pmaddwd            m%2, m%3, %6

+  paddd              m%1,  %4

+  paddd              m%2,  %4

+  psrad              m%1,  14

+  psrad              m%2,  14

+%endmacro

+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2

+  punpckhwd          m%6, m%2, m%1

+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]

+  punpcklwd          m%2, m%1

+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]

+  packssdw           m%1, m%7

+  packssdw           m%2, m%6

+%endmacro

+; matrix transpose

+%macro INTERLEAVE_2X 4

+  punpckh%1          m%4, m%2, m%3

+  punpckl%1          m%2, m%3

+  SWAP               %3,  %4

+%endmacro

+%macro TRANSPOSE8X8 9

+  INTERLEAVE_2X  wd, %1, %2, %9

+  INTERLEAVE_2X  wd, %3, %4, %9

+  INTERLEAVE_2X  wd, %5, %6, %9

+  INTERLEAVE_2X  wd, %7, %8, %9

+  INTERLEAVE_2X  dq, %1, %3, %9

+  INTERLEAVE_2X  dq, %2, %4, %9

+  INTERLEAVE_2X  dq, %5, %7, %9

+  INTERLEAVE_2X  dq, %6, %8, %9

+  INTERLEAVE_2X  qdq, %1, %5, %9

+  INTERLEAVE_2X  qdq, %3, %7, %9

+  INTERLEAVE_2X  qdq, %2, %6, %9

+  INTERLEAVE_2X  qdq, %4, %8, %9

+  SWAP  %2, %5

+  SWAP  %4, %7

+%endmacro

+%macro IDCT8_1D 0

+  SUM_SUB          0,    4,    9

+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10

+  pmulhrsw        m0,  m12

+  pmulhrsw        m4,  m12

+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10

+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10

+  SUM_SUB          1,    5,    9

+  SUM_SUB          7,    3,    9

+  SUM_SUB          0,    6,    9

+  SUM_SUB          4,    2,    9

+  SUM_SUB          3,    5,    9

+  pmulhrsw        m3,  m12

+  pmulhrsw        m5,  m12

+  SUM_SUB          0,    7,    9

+  SUM_SUB          4,    3,    9

+  SUM_SUB          2,    5,    9

+  SUM_SUB          6,    1,    9

+  SWAP             3,    6

+  SWAP             1,    4

+%endmacro

+; This macro handles 8 pixels per line

+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero

+  paddw           m%1, m11

+  paddw           m%2, m11

+  psraw           m%1, 5

+  psraw           m%2, 5

+  movh            m%3, [outputq]

+  movh            m%4, [outputq + strideq]

+  punpcklbw       m%3, m%5

+  punpcklbw       m%4, m%5

+  paddw           m%3, m%1

+  paddw           m%4, m%2

+  packuswb        m%3, m%5

+  packuswb        m%4, m%5

+  movh               [outputq], m%3

+  movh     [outputq + strideq], m%4

+%endmacro

+INIT_XMM ssse3

+; full inverse 8x8 2D-DCT transform

+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride

+  mova     m8, [pd_8192]

+  mova    m11, [pw_16]

+  mova    m12, [pw_11585x2]

+  lea      r3, [2 * strideq]

+  mova     m0, [inputq +   0]

+  mova     m1, [inputq +  16]

+  mova     m2, [inputq +  32]

+  mova     m3, [inputq +  48]

+  mova     m4, [inputq +  64]

+  mova     m5, [inputq +  80]

+  mova     m6, [inputq +  96]

+  mova     m7, [inputq + 112]

+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

+  IDCT8_1D

+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9

+  IDCT8_1D

+  pxor    m12, m12

+  ADD_STORE_8P_2X  0, 1, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  2, 3, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  4, 5, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  6, 7, 9, 10, 12

+  RET

+; inverse 8x8 2D-DCT transform with only the first 12 coeffs non-zero (reads just the top-left 4x4)

+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride

+  mova       m8, [pd_8192]

+  mova      m11, [pw_16]

+  mova      m12, [pw_11585x2]

+  lea        r3, [2 * strideq]

+  mova       m0, [inputq +  0]

+  mova       m1, [inputq + 16]

+  mova       m2, [inputq + 32]

+  mova       m3, [inputq + 48]

+  punpcklwd  m0, m1

+  punpcklwd  m2, m3

+  punpckhdq  m9, m0, m2

+  punpckldq  m0, m2

+  SWAP       2, 9

+  ; m0 -> [0], [0]

+  ; m1 -> [1], [1]

+  ; m2 -> [2], [2]

+  ; m3 -> [3], [3]

+  punpckhqdq m10, m0, m0

+  punpcklqdq m0,  m0

+  punpckhqdq m9,  m2, m2

+  punpcklqdq m2,  m2

+  SWAP       1, 10

+  SWAP       3,  9

+  pmulhrsw   m0, m12

+  pmulhrsw   m2, [dpw_30274_12540]

+  pmulhrsw   m1, [dpw_6392_32138]

+  pmulhrsw   m3, [dpw_m18204_27246]

+  SUM_SUB    0, 2, 9

+  SUM_SUB    1, 3, 9

+  punpcklqdq m9, m3, m3

+  punpckhqdq m5, m3, m9

+  SUM_SUB    3, 5, 9

+  punpckhqdq m5, m3

+  pmulhrsw   m5, m12

+  punpckhqdq m9, m1, m5

+  punpcklqdq m1, m5

+  SWAP       5, 9

+  SUM_SUB    0, 5, 9

+  SUM_SUB    2, 1, 9

+  punpckhqdq m3, m0, m0

+  punpckhqdq m4, m1, m1

+  punpckhqdq m6, m5, m5

+  punpckhqdq m7, m2, m2

+  punpcklwd  m0, m3

+  punpcklwd  m7, m2

+  punpcklwd  m1, m4

+  punpcklwd  m6, m5

+  punpckhdq  m4, m0, m7

+  punpckldq  m0, m7

+  punpckhdq  m10, m1, m6

+  punpckldq  m5, m1, m6

+  punpckhqdq m1, m0, m5

+  punpcklqdq m0, m5

+  punpckhqdq m3, m4, m10

+  punpcklqdq m2, m4, m10

+  pmulhrsw   m0, m12

+  pmulhrsw   m6, m2, [dpw_30274_30274]

+  pmulhrsw   m4, m2, [dpw_12540_12540]

+  pmulhrsw   m7, m1, [dpw_32138_32138]

+  pmulhrsw   m1, [dpw_6392_6392]

+  pmulhrsw   m5, m3, [dpw_m18204_m18204]

+  pmulhrsw   m3, [dpw_27246_27246]

+  mova       m2, m0

+  SUM_SUB    0, 6, 9

+  SUM_SUB    2, 4, 9

+  SUM_SUB    1, 5, 9

+  SUM_SUB    7, 3, 9

+  SUM_SUB    3, 5, 9

+  pmulhrsw   m3, m12

+  pmulhrsw   m5, m12

+  SUM_SUB    0, 7, 9

+  SUM_SUB    2, 3, 9

+  SUM_SUB    4, 5, 9

+  SUM_SUB    6, 1, 9

+  SWAP       3, 6

+  SWAP       1, 2

+  SWAP       2, 4

+  pxor    m12, m12

+  ADD_STORE_8P_2X  0, 1, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  2, 3, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  4, 5, 9, 10, 12

+  lea              outputq, [outputq + r3]

+  ADD_STORE_8P_2X  6, 7, 9, 10, 12

+  RET

+%endif

--- a/vp9/encoder/x86/vp9_dct_ssse3.asm

+++ /dev/null

@@ -1,174 +1,0 @@

-;

-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "third_party/x86inc/x86inc.asm"

-; This file provides SSSE3 version of the forward transformation. Part

-; of the macro definitions are originally derived from the ffmpeg project.

-; The current version applies to x86 64-bit only.

-SECTION_RODATA

-pw_11585x2: times 8 dw 23170

-pd_8192:    times 4 dd 8192

-%macro TRANSFORM_COEFFS 2

-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2

-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1

-%endmacro

-TRANSFORM_COEFFS 15137,   6270

-TRANSFORM_COEFFS 16069,   3196

-TRANSFORM_COEFFS  9102,  13623

-SECTION .text

-%if ARCH_X86_64

-%macro SUM_SUB 3

-  psubw  m%3, m%1, m%2

-  paddw  m%1, m%2

-  SWAP    %2, %3

-%endmacro

-; butterfly operation

-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2

-  pmaddwd            m%1, m%3, %5

-  pmaddwd            m%2, m%3, %6

-  paddd              m%1,  %4

-  paddd              m%2,  %4

-  psrad              m%1,  14

-  psrad              m%2,  14

-%endmacro

-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2

-  punpckhwd          m%6, m%2, m%1

-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]

-  punpcklwd          m%2, m%1

-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]

-  packssdw           m%1, m%7

-  packssdw           m%2, m%6

-%endmacro

-; matrix transpose

-%macro INTERLEAVE_2X 4

-  punpckh%1          m%4, m%2, m%3

-  punpckl%1          m%2, m%3

-  SWAP               %3,  %4

-%endmacro

-%macro TRANSPOSE8X8 9

-  INTERLEAVE_2X  wd, %1, %2, %9

-  INTERLEAVE_2X  wd, %3, %4, %9

-  INTERLEAVE_2X  wd, %5, %6, %9

-  INTERLEAVE_2X  wd, %7, %8, %9

-  INTERLEAVE_2X  dq, %1, %3, %9

-  INTERLEAVE_2X  dq, %2, %4, %9

-  INTERLEAVE_2X  dq, %5, %7, %9

-  INTERLEAVE_2X  dq, %6, %8, %9

-  INTERLEAVE_2X  qdq, %1, %5, %9

-  INTERLEAVE_2X  qdq, %3, %7, %9

-  INTERLEAVE_2X  qdq, %2, %6, %9

-  INTERLEAVE_2X  qdq, %4, %8, %9

-  SWAP  %2, %5

-  SWAP  %4, %7

-%endmacro

-; 1D forward 8x8 DCT transform

-%macro FDCT8_1D 0

-  SUM_SUB            0,  7,  9

-  SUM_SUB            1,  6,  9

-  SUM_SUB            2,  5,  9

-  SUM_SUB            3,  4,  9

-  SUM_SUB            0,  3,  9

-  SUM_SUB            1,  2,  9

-  SUM_SUB            6,  5,  9

-  SUM_SUB            0,  1,  9

-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10

-  pmulhrsw           m6, m12

-  pmulhrsw           m5, m12

-  pmulhrsw           m0, m12

-  pmulhrsw           m1, m12

-  SUM_SUB            4,  5,  9

-  SUM_SUB            7,  6,  9

-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10

-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10

-  SWAP               1,  4

-  SWAP               3,  6

-%endmacro

-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2

-  psraw              m%3, m%1, 15

-  psraw              m%4, m%2, 15

-  psubw              m%1, m%3

-  psubw              m%2, m%4

-  psraw              m%1, 1

-  psraw              m%2, 1

-%endmacro

-INIT_XMM ssse3

-cglobal fdct8x8, 3, 5, 13, input, output, stride

-  mova               m8, [pd_8192]

-  mova              m12, [pw_11585x2]

-  pxor              m11, m11

-  lea                r3, [2 * strideq]

-  lea                r4, [4 * strideq]

-  mova               m0, [inputq]

-  mova               m1, [inputq + r3]

-  lea                inputq, [inputq + r4]

-  mova               m2, [inputq]

-  mova               m3, [inputq + r3]

-  lea                inputq, [inputq + r4]

-  mova               m4, [inputq]

-  mova               m5, [inputq + r3]

-  lea                inputq, [inputq + r4]

-  mova               m6, [inputq]

-  mova               m7, [inputq + r3]

-  ; left shift by 2 to increase forward transformation precision

-  psllw              m0, 2

-  psllw              m1, 2

-  psllw              m2, 2

-  psllw              m3, 2

-  psllw              m4, 2

-  psllw              m5, 2

-  psllw              m6, 2

-  psllw              m7, 2

-  ; column transform

-  FDCT8_1D

-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

-  FDCT8_1D

-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

-  DIVIDE_ROUND_2X   0, 1, 9, 10

-  DIVIDE_ROUND_2X   2, 3, 9, 10

-  DIVIDE_ROUND_2X   4, 5, 9, 10

-  DIVIDE_ROUND_2X   6, 7, 9, 10

-  mova              [outputq +   0], m0

-  mova              [outputq +  16], m1

-  mova              [outputq +  32], m2

-  mova              [outputq +  48], m3

-  mova              [outputq +  64], m4

-  mova              [outputq +  80], m5

-  mova              [outputq +  96], m6

-  mova              [outputq + 112], m7

-  RET

-%endif

--- /dev/null

+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm

@@ -1,0 +1,174 @@

+;

+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+; This file provides SSSE3 version of the forward transformation. Part

+; of the macro definitions are originally derived from the ffmpeg project.

+; The current version applies to x86 64-bit only.

+SECTION_RODATA

+pw_11585x2: times 8 dw 23170

+pd_8192:    times 4 dd 8192

+%macro TRANSFORM_COEFFS 2

+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2

+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1

+%endmacro

+TRANSFORM_COEFFS 15137,   6270

+TRANSFORM_COEFFS 16069,   3196

+TRANSFORM_COEFFS  9102,  13623

+SECTION .text

+%if ARCH_X86_64

+%macro SUM_SUB 3

+  psubw  m%3, m%1, m%2

+  paddw  m%1, m%2

+  SWAP    %2, %3

+%endmacro

+; butterfly operation

+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2

+  pmaddwd            m%1, m%3, %5

+  pmaddwd            m%2, m%3, %6

+  paddd              m%1,  %4

+  paddd              m%2,  %4

+  psrad              m%1,  14

+  psrad              m%2,  14

+%endmacro

+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2

+  punpckhwd          m%6, m%2, m%1

+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]

+  punpcklwd          m%2, m%1

+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]

+  packssdw           m%1, m%7

+  packssdw           m%2, m%6

+%endmacro

+; matrix transpose

+%macro INTERLEAVE_2X 4

+  punpckh%1          m%4, m%2, m%3

+  punpckl%1          m%2, m%3

+  SWAP               %3,  %4

+%endmacro

+%macro TRANSPOSE8X8 9

+  INTERLEAVE_2X  wd, %1, %2, %9

+  INTERLEAVE_2X  wd, %3, %4, %9

+  INTERLEAVE_2X  wd, %5, %6, %9

+  INTERLEAVE_2X  wd, %7, %8, %9

+  INTERLEAVE_2X  dq, %1, %3, %9

+  INTERLEAVE_2X  dq, %2, %4, %9

+  INTERLEAVE_2X  dq, %5, %7, %9

+  INTERLEAVE_2X  dq, %6, %8, %9

+  INTERLEAVE_2X  qdq, %1, %5, %9

+  INTERLEAVE_2X  qdq, %3, %7, %9

+  INTERLEAVE_2X  qdq, %2, %6, %9

+  INTERLEAVE_2X  qdq, %4, %8, %9

+  SWAP  %2, %5

+  SWAP  %4, %7

+%endmacro

+; 1D forward 8x8 DCT transform

+%macro FDCT8_1D 0

+  SUM_SUB            0,  7,  9

+  SUM_SUB            1,  6,  9

+  SUM_SUB            2,  5,  9

+  SUM_SUB            3,  4,  9

+  SUM_SUB            0,  3,  9

+  SUM_SUB            1,  2,  9

+  SUM_SUB            6,  5,  9

+  SUM_SUB            0,  1,  9

+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10

+  pmulhrsw           m6, m12

+  pmulhrsw           m5, m12

+  pmulhrsw           m0, m12

+  pmulhrsw           m1, m12

+  SUM_SUB            4,  5,  9

+  SUM_SUB            7,  6,  9

+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10

+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10

+  SWAP               1,  4

+  SWAP               3,  6

+%endmacro

+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2

+  psraw              m%3, m%1, 15

+  psraw              m%4, m%2, 15

+  psubw              m%1, m%3

+  psubw              m%2, m%4

+  psraw              m%1, 1

+  psraw              m%2, 1

+%endmacro

+INIT_XMM ssse3

+cglobal fdct8x8, 3, 5, 13, input, output, stride

+  mova               m8, [pd_8192]

+  mova              m12, [pw_11585x2]

+  pxor              m11, m11

+  lea                r3, [2 * strideq]

+  lea                r4, [4 * strideq]

+  mova               m0, [inputq]

+  mova               m1, [inputq + r3]

+  lea                inputq, [inputq + r4]

+  mova               m2, [inputq]

+  mova               m3, [inputq + r3]

+  lea                inputq, [inputq + r4]

+  mova               m4, [inputq]

+  mova               m5, [inputq + r3]

+  lea                inputq, [inputq + r4]

+  mova               m6, [inputq]

+  mova               m7, [inputq + r3]

+  ; left shift by 2 to increase forward transformation precision

+  psllw              m0, 2

+  psllw              m1, 2

+  psllw              m2, 2

+  psllw              m3, 2

+  psllw              m4, 2

+  psllw              m5, 2

+  psllw              m6, 2

+  psllw              m7, 2

+  ; column transform

+  FDCT8_1D

+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

+  FDCT8_1D

+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9

+  DIVIDE_ROUND_2X   0, 1, 9, 10

+  DIVIDE_ROUND_2X   2, 3, 9, 10

+  DIVIDE_ROUND_2X   4, 5, 9, 10

+  DIVIDE_ROUND_2X   6, 7, 9, 10

+  mova              [outputq +   0], m0

+  mova              [outputq +  16], m1

+  mova              [outputq +  32], m2

+  mova              [outputq +  48], m3

+  mova              [outputq +  64], m4

+  mova              [outputq +  80], m5

+  mova              [outputq +  96], m6

+  mova              [outputq + 112], m7

+  RET

+%endif

--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm

+++ /dev/null

@@ -1,219 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "third_party/x86inc/x86inc.asm"

-SECTION_RODATA

-pw_1: times 8 dw 1

-SECTION .text

-%macro QUANTIZE_FN 2

-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

-                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \

-                                eob, scan, iscan

-  cmp                    dword skipm, 0

-  jne .blank

-  ; actual quantize loop - setup pointers, rounders, etc.

-  movifnidn                   coeffq, coeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, dequantmp

-  movifnidn                    zbinq, zbinmp

-  movifnidn                   roundq, roundmp

-  movifnidn                   quantq, quantmp

-  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq

-  mova                            m0, [zbinq]              ; m0 = zbin

-  punpcklwd                       m4, m4

-  mova                            m1, [roundq]             ; m1 = round

-  pshufd                          m4, m4, 0

-  mova                            m2, [quantq]             ; m2 = quant

-  paddw                           m0, m4                   ; m0 = zbin + zbin_oq

-%ifidn %1, b_32x32

-  pcmpeqw                         m5, m5

-  psrlw                           m5, 15

-  paddw                           m0, m5

-  paddw                           m1, m5

-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

-%endif

-  mova                            m3, [r2q]                ; m3 = dequant

-  psubw                           m0, [pw_1]

-  mov                             r2, shiftmp

-  mov                             r3, qcoeffmp

-  mova                            m4, [r2]                 ; m4 = shift

-  mov                             r4, dqcoeffmp

-  mov                             r5, iscanmp

-%ifidn %1, b_32x32

-  psllw                           m4, 1

-%endif

-  pxor                            m5, m5                   ; m5 = dedicated zero

-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob

-  lea                         coeffq, [  coeffq+ncoeffq*2]

-  lea                         iscanq, [  iscanq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  neg                        ncoeffq

-  ; get DC and first 15 AC coeffs

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

-  pabsw                           m6, m9                   ; m6 = abs(m9)

-  pabsw                          m11, m10                  ; m11 = abs(m10)

-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

-  punpckhqdq                      m0, m0

-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

-  paddsw                          m6, m1                   ; m6 += round

-  punpckhqdq                      m1, m1

-  paddsw                         m11, m1                   ; m11 += round

-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16

-  punpckhqdq                      m2, m2

-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

-  paddw                           m8, m6                   ; m8 += m6

-  paddw                          m13, m11                  ; m13 += m11

-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16

-  punpckhqdq                      m4, m4

-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

-  psignw                          m8, m9                   ; m8 = reinsert sign

-  psignw                         m13, m10                  ; m13 = reinsert sign

-  pand                            m8, m7

-  pand                           m13, m12

-  mova        [qcoeffq+ncoeffq*2+ 0], m8

-  mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

-  pabsw                           m8, m8

-  pabsw                          m13, m13

-%endif

-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

-  punpckhqdq                      m3, m3

-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

-  psrlw                           m8, 1

-  psrlw                          m13, 1

-  psignw                          m8, m9

-  psignw                         m13, m10

-%endif

-  mova       [dqcoeffq+ncoeffq*2+ 0], m8

-  mova       [dqcoeffq+ncoeffq*2+16], m13

-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0

-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]

-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]

-  psubw                           m6, m7                   ; m6 = scan[i] + 1

-  psubw                          m11, m12                  ; m11 = scan[i] + 1

-  pandn                           m8, m6                   ; m8 = max(eob)

-  pandn                          m13, m11                  ; m13 = max(eob)

-  pmaxsw                          m8, m13

-  add                        ncoeffq, mmsize

-  jz .accumulate_eob

-.ac_only_loop:

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

-  pabsw                           m6, m9                   ; m6 = abs(m9)

-  pabsw                          m11, m10                  ; m11 = abs(m10)

-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin

-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

-%ifidn %1, b_32x32

-  pmovmskb                        r6, m7

-  pmovmskb                        r2, m12

-  or                              r6, r2

-  jz .skip_iter

-%endif

-  paddsw                          m6, m1                   ; m6 += round

-  paddsw                         m11, m1                   ; m11 += round

-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

-  paddw                          m14, m6                   ; m14 += m6

-  paddw                          m13, m11                  ; m13 += m11

-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16

-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

-  psignw                         m14, m9                   ; m14 = reinsert sign

-  psignw                         m13, m10                  ; m13 = reinsert sign

-  pand                           m14, m7

-  pand                           m13, m12

-  mova        [qcoeffq+ncoeffq*2+ 0], m14

-  mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

-  pabsw                          m14, m14

-  pabsw                          m13, m13

-%endif

-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

-  psrlw                          m14, 1

-  psrlw                          m13, 1

-  psignw                         m14, m9

-  psignw                         m13, m10

-%endif

-  mova       [dqcoeffq+ncoeffq*2+ 0], m14

-  mova       [dqcoeffq+ncoeffq*2+16], m13

-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0

-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]

-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]

-  psubw                           m6, m7                   ; m6 = scan[i] + 1

-  psubw                          m11, m12                  ; m11 = scan[i] + 1

-  pandn                          m14, m6                   ; m14 = max(eob)

-  pandn                          m13, m11                  ; m13 = max(eob)

-  pmaxsw                          m8, m14

-  pmaxsw                          m8, m13

-  add                        ncoeffq, mmsize

-  jl .ac_only_loop

-%ifidn %1, b_32x32

-  jmp .accumulate_eob

-.skip_iter:

-  mova        [qcoeffq+ncoeffq*2+ 0], m5

-  mova        [qcoeffq+ncoeffq*2+16], m5

-  mova       [dqcoeffq+ncoeffq*2+ 0], m5

-  mova       [dqcoeffq+ncoeffq*2+16], m5

-  add                        ncoeffq, mmsize

-  jl .ac_only_loop

-%endif

-.accumulate_eob:

-  ; horizontally accumulate/max eobs and write into [eob] memory pointer

-  mov                             r2, eobmp

-  pshufd                          m7, m8, 0xe

-  pmaxsw                          m8, m7

-  pshuflw                         m7, m8, 0xe

-  pmaxsw                          m8, m7

-  pshuflw                         m7, m8, 0x1

-  pmaxsw                          m8, m7

-  pextrw                          r6, m8, 0

-  mov                             [r2], r6

-  RET

-  ; skip-block, i.e. just write all zeroes

-.blank:

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-  mova       [dqcoeffq+ncoeffq*2+ 0], m7

-  mova       [dqcoeffq+ncoeffq*2+16], m7

-  mova        [qcoeffq+ncoeffq*2+ 0], m7

-  mova        [qcoeffq+ncoeffq*2+16], m7

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                    word [eobq], 0

-  RET

-%endmacro

-INIT_XMM ssse3

-QUANTIZE_FN b, 7

-QUANTIZE_FN b_32x32, 7

--- /dev/null

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -1,0 +1,219 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+SECTION_RODATA

+pw_1: times 8 dw 1

+SECTION .text

+%macro QUANTIZE_FN 2 ; args: function name suffix (b or b_32x32), general-purpose register count given to cglobal

+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \

+                                eob, scan, iscan

+  cmp                    dword skipm, 0

+  jne .blank                                                ; skip flag set: only write zeroes

+  ; actual quantize loop - setup pointers, rounders, etc.

+  movifnidn                   coeffq, coeffmp

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, dequantmp

+  movifnidn                    zbinq, zbinmp

+  movifnidn                   roundq, roundmp

+  movifnidn                   quantq, quantmp

+  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq

+  mova                            m0, [zbinq]              ; m0 = zbin

+  punpcklwd                       m4, m4

+  mova                            m1, [roundq]             ; m1 = round

+  pshufd                          m4, m4, 0                ; broadcast zbin_oq to all 8 words

+  mova                            m2, [quantq]             ; m2 = quant

+  paddw                           m0, m4                   ; m0 = zbin + zbin_oq

+%ifidn %1, b_32x32

+  pcmpeqw                         m5, m5

+  psrlw                           m5, 15                   ; m5 = 1 in every word

+  paddw                           m0, m5

+  paddw                           m1, m5

+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

+%endif

+  mova                            m3, [r2q]                ; m3 = dequant

+  psubw                           m0, [pw_1]               ; zbin - 1, so the pcmpgtw tests below act as abs(c) >= zbin

+  mov                             r2, shiftmp

+  mov                             r3, qcoeffmp

+  mova                            m4, [r2]                 ; m4 = shift

+  mov                             r4, dqcoeffmp

+  mov                             r5, iscanmp

+%ifidn %1, b_32x32

+  psllw                           m4, 1

+%endif

+  pxor                            m5, m5                   ; m5 = dedicated zero

+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob ; names now follow r3-r5 as loaded above; d1-d6 pad so eob stays argument 11

+  lea                         coeffq, [  coeffq+ncoeffq*2] ; point all four arrays at their ends ...

+  lea                         iscanq, [  iscanq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  neg                        ncoeffq                        ; ... and index them with a negative offset that counts up to 0

+  ; get DC and first 15 AC coeffs

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin (m0 holds zbin - 1)

+  punpckhqdq                      m0, m0                   ; copy the upper 4 words into both halves for all later coefficients

+  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin

+  paddsw                          m6, m1                   ; m6 += round

+  punpckhqdq                      m1, m1

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16

+  punpckhqdq                      m2, m2

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  paddw                           m8, m6                   ; m8 += m6

+  paddw                          m13, m11                  ; m13 += m11

+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16

+  punpckhqdq                      m4, m4

+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

+  psignw                          m8, m9                   ; m8 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  pand                            m8, m7                   ; zero the lanes that failed the zbin test

+  pand                           m13, m12

+  mova        [qcoeffq+ncoeffq*2+ 0], m8

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                           m8, m8                   ; 32x32: dqc = sign * ((abs(qc) * dequant) >> 1)

+  pabsw                          m13, m13

+%endif

+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

+  punpckhqdq                      m3, m3

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                           m8, 1

+  psrlw                          m13, 1

+  psignw                          m8, m9

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m8

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where the zbin mask is set

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where the zbin mask is set

+  pandn                           m8, m6                   ; m8 = eob candidate, 0 where dqc[i] == 0

+  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc[i] == 0

+  pmaxsw                          m8, m13                  ; m8 carries the running per-lane maximum

+  add                        ncoeffq, mmsize               ; step past the 16 coefficients just handled

+  jz .accumulate_eob

+.ac_only_loop:

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin

+  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin

+%ifidn %1, b_32x32

+  pmovmskb                        r6, m7

+  pmovmskb                        r2, m12

+  or                              r6, r2

+  jz .skip_iter                                             ; no lane passed the zbin test: just store zeroes

+%endif

+  paddsw                          m6, m1                   ; m6 += round

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  paddw                          m14, m6                   ; m14 += m6

+  paddw                          m13, m11                  ; m13 += m11

+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16

+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16

+  psignw                         m14, m9                   ; m14 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  pand                           m14, m7

+  pand                           m13, m12

+  mova        [qcoeffq+ncoeffq*2+ 0], m14

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                          m14, m14

+  pabsw                          m13, m13

+%endif

+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                          m14, 1

+  psrlw                          m13, 1

+  psignw                         m14, m9

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m14

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where the zbin mask is set

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where the zbin mask is set

+  pandn                          m14, m6                   ; m14 = eob candidate, 0 where dqc[i] == 0

+  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc[i] == 0

+  pmaxsw                          m8, m14

+  pmaxsw                          m8, m13

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%ifidn %1, b_32x32

+  jmp .accumulate_eob

+.skip_iter:

+  mova        [qcoeffq+ncoeffq*2+ 0], m5

+  mova        [qcoeffq+ncoeffq*2+16], m5

+  mova       [dqcoeffq+ncoeffq*2+ 0], m5

+  mova       [dqcoeffq+ncoeffq*2+16], m5

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%endif

+.accumulate_eob:

+  ; horizontally accumulate/max eobs and write into [eob] memory pointer

+  mov                             r2, eobmp

+  pshufd                          m7, m8, 0xe              ; upper qword -> lower qword

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0

+  pmaxsw                          m8, m7

+  pextrw                          r6, m8, 0

+  mov                             [r2], r6w                ; eob is a 16-bit slot (.blank stores a word too); a full-register store would overwrite 6 bytes after it

+  RET

+  ; skip-block, i.e. just write all zeroes

+.blank:

+  mov                             r0, r8mp                 ; dqcoeff is argument 8; the name itself was rebound to r4 by the DEFINE_ARGS above

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, r7mp                 ; qcoeff is argument 7; the name itself was rebound to r3 by the DEFINE_ARGS above

+  mov                             r3, eobmp

+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  neg                        ncoeffq

+  pxor                            m7, m7

+.blank_loop:

+  mova       [dqcoeffq+ncoeffq*2+ 0], m7

+  mova       [dqcoeffq+ncoeffq*2+16], m7

+  mova        [qcoeffq+ncoeffq*2+ 0], m7

+  mova        [qcoeffq+ncoeffq*2+16], m7

+  add                        ncoeffq, mmsize

+  jl .blank_loop

+  mov                    word [eobq], 0

+  RET

+%endmacro

+INIT_XMM ssse3

+QUANTIZE_FN b, 7

+QUANTIZE_FN b_32x32, 7

--- a/vp9/encoder/x86/vp9_ssim_opt.asm

+++ /dev/null

@@ -1,216 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr

-%macro TABULATE_SSIM 0

-        paddusw         xmm15, xmm3  ; sum_s

-        paddusw         xmm14, xmm4  ; sum_r

-        movdqa          xmm1, xmm3

-        pmaddwd         xmm1, xmm1

-        paddd           xmm13, xmm1 ; sum_sq_s

-        movdqa          xmm2, xmm4

-        pmaddwd         xmm2, xmm2

-        paddd           xmm12, xmm2 ; sum_sq_r

-        pmaddwd         xmm3, xmm4

-        paddd           xmm11, xmm3  ; sum_sxr

-%endmacro

-; Sum across the register %1 starting with q words

-%macro SUM_ACROSS_Q 1

-        movdqa          xmm2,%1

-        punpckldq       %1,xmm0

-        punpckhdq       xmm2,xmm0

-        paddq           %1,xmm2

-        movdqa          xmm2,%1

-        punpcklqdq      %1,xmm0

-        punpckhqdq      xmm2,xmm0

-        paddq           %1,xmm2

-%endmacro

-; Sum across the register %1 starting with q words

-%macro SUM_ACROSS_W 1

-        movdqa          xmm1, %1

-        punpcklwd       %1,xmm0

-        punpckhwd       xmm1,xmm0

-        paddd           %1, xmm1

-        SUM_ACROSS_Q    %1

-%endmacro

-;void ssim_parms_sse2(

-;    unsigned char *s,

-;    int sp,

-;    unsigned char *r,

-;    int rp

-;    unsigned long *sum_s,

-;    unsigned long *sum_r,

-;    unsigned long *sum_sq_s,

-;    unsigned long *sum_sq_r,

-;    unsigned long *sum_sxr);

-;

-; TODO: Use parm passing through structure, probably don't need the pxors

-; ( calling app will initialize to 0 ) could easily fit everything in sse2

-; without too much hastle, and can probably do better estimates with psadw

-; or pavgb At this point this is just meant to be first pass for calculating

-; all the parms needed for 16x16 ssim so we can play with dssim as distortion

-; in mode selection code.

-global sym(vp9_ssim_parms_16x16_sse2) PRIVATE

-sym(vp9_ssim_parms_16x16_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 9

-    SAVE_XMM 15

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov             rsi,        arg(0) ;s

-    mov             rcx,        arg(1) ;sp

-    mov             rdi,        arg(2) ;r

-    mov             rax,        arg(3) ;rp

-    pxor            xmm0, xmm0

-    pxor            xmm15,xmm15  ;sum_s

-    pxor            xmm14,xmm14  ;sum_r

-    pxor            xmm13,xmm13  ;sum_sq_s

-    pxor            xmm12,xmm12  ;sum_sq_r

-    pxor            xmm11,xmm11  ;sum_sxr

-    mov             rdx, 16      ;row counter

-.NextRow:

-    ;grab source and reference pixels

-    movdqu          xmm5, [rsi]

-    movdqu          xmm6, [rdi]

-    movdqa          xmm3, xmm5

-    movdqa          xmm4, xmm6

-    punpckhbw       xmm3, xmm0 ; high_s

-    punpckhbw       xmm4, xmm0 ; high_r

-    TABULATE_SSIM

-    movdqa          xmm3, xmm5

-    movdqa          xmm4, xmm6

-    punpcklbw       xmm3, xmm0 ; low_s

-    punpcklbw       xmm4, xmm0 ; low_r

-    TABULATE_SSIM

-    add             rsi, rcx   ; next s row

-    add             rdi, rax   ; next r row

-    dec             rdx        ; counter

-    jnz .NextRow

-    SUM_ACROSS_W    xmm15

-    SUM_ACROSS_W    xmm14

-    SUM_ACROSS_Q    xmm13

-    SUM_ACROSS_Q    xmm12

-    SUM_ACROSS_Q    xmm11

-    mov             rdi,arg(4)

-    movd            [rdi], xmm15;

-    mov             rdi,arg(5)

-    movd            [rdi], xmm14;

-    mov             rdi,arg(6)

-    movd            [rdi], xmm13;

-    mov             rdi,arg(7)

-    movd            [rdi], xmm12;

-    mov             rdi,arg(8)

-    movd            [rdi], xmm11;

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void ssim_parms_sse2(

-;    unsigned char *s,

-;    int sp,

-;    unsigned char *r,

-;    int rp

-;    unsigned long *sum_s,

-;    unsigned long *sum_r,

-;    unsigned long *sum_sq_s,

-;    unsigned long *sum_sq_r,

-;    unsigned long *sum_sxr);

-;

-; TODO: Use parm passing through structure, probably don't need the pxors

-; ( calling app will initialize to 0 ) could easily fit everything in sse2

-; without too much hastle, and can probably do better estimates with psadw

-; or pavgb At this point this is just meant to be first pass for calculating

-; all the parms needed for 16x16 ssim so we can play with dssim as distortion

-; in mode selection code.

-global sym(vp9_ssim_parms_8x8_sse2) PRIVATE

-sym(vp9_ssim_parms_8x8_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 9

-    SAVE_XMM 15

-    push        rsi

-    push        rdi

-    ; end prolog

-    mov             rsi,        arg(0) ;s

-    mov             rcx,        arg(1) ;sp

-    mov             rdi,        arg(2) ;r

-    mov             rax,        arg(3) ;rp

-    pxor            xmm0, xmm0

-    pxor            xmm15,xmm15  ;sum_s

-    pxor            xmm14,xmm14  ;sum_r

-    pxor            xmm13,xmm13  ;sum_sq_s

-    pxor            xmm12,xmm12  ;sum_sq_r

-    pxor            xmm11,xmm11  ;sum_sxr

-    mov             rdx, 8      ;row counter

-.NextRow:

-    ;grab source and reference pixels

-    movq            xmm3, [rsi]

-    movq            xmm4, [rdi]

-    punpcklbw       xmm3, xmm0 ; low_s

-    punpcklbw       xmm4, xmm0 ; low_r

-    TABULATE_SSIM

-    add             rsi, rcx   ; next s row

-    add             rdi, rax   ; next r row

-    dec             rdx        ; counter

-    jnz .NextRow

-    SUM_ACROSS_W    xmm15

-    SUM_ACROSS_W    xmm14

-    SUM_ACROSS_Q    xmm13

-    SUM_ACROSS_Q    xmm12

-    SUM_ACROSS_Q    xmm11

-    mov             rdi,arg(4)

-    movd            [rdi], xmm15;

-    mov             rdi,arg(5)

-    movd            [rdi], xmm14;

-    mov             rdi,arg(6)

-    movd            [rdi], xmm13;

-    mov             rdi,arg(7)

-    movd            [rdi], xmm12;

-    mov             rdi,arg(8)

-    movd            [rdi], xmm11;

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- /dev/null

+++ b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm

@@ -1,0 +1,216 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+; TABULATE_SSIM - add 8 source words (xmm3) and 8 reference words (xmm4) into sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr; clobbers xmm1-xmm3

+%macro TABULATE_SSIM 0

+        paddusw         xmm15, xmm3  ; sum_s, saturating 16-bit lanes

+        paddusw         xmm14, xmm4  ; sum_r, saturating 16-bit lanes

+        movdqa          xmm1, xmm3

+        pmaddwd         xmm1, xmm1   ; s*s, adjacent products added into 32-bit lanes

+        paddd           xmm13, xmm1 ; sum_sq_s

+        movdqa          xmm2, xmm4

+        pmaddwd         xmm2, xmm2   ; r*r, adjacent products added into 32-bit lanes

+        paddd           xmm12, xmm2 ; sum_sq_r

+        pmaddwd         xmm3, xmm4   ; s*r, adjacent products added into 32-bit lanes (overwrites xmm3)

+        paddd           xmm11, xmm3  ; sum_sxr

+%endmacro

+; Sum the four 32-bit lanes of register %1 using 64-bit adds; total lands in the low qword, high qword becomes 0. Needs xmm0 == 0, clobbers xmm2.

+%macro SUM_ACROSS_Q 1

+        movdqa          xmm2,%1

+        punpckldq       %1,xmm0         ; lanes 0,1 zero-extended to qwords

+        punpckhdq       xmm2,xmm0       ; lanes 2,3 zero-extended to qwords

+        paddq           %1,xmm2         ; two partial sums

+        movdqa          xmm2,%1

+        punpcklqdq      %1,xmm0         ; keep the low partial sum, clear the high qword

+        punpckhqdq      xmm2,xmm0       ; move the high partial sum down

+        paddq           %1,xmm2         ; low qword = total

+%endmacro

+; Sum the eight 16-bit lanes of register %1: widen to dwords, add, then finish with SUM_ACROSS_Q. Needs xmm0 == 0, clobbers xmm1 and xmm2.

+%macro SUM_ACROSS_W 1

+        movdqa          xmm1, %1

+        punpcklwd       %1,xmm0         ; low 4 words zero-extended to dwords

+        punpckhwd       xmm1,xmm0       ; high 4 words zero-extended to dwords

+        paddd           %1, xmm1        ; four dword partial sums

+        SUM_ACROSS_Q    %1

+%endmacro

+;void vp9_ssim_parms_16x16_sse2(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; TODO: Pass the parameters through a structure; the pxors are probably

+; unnecessary (the calling app will initialize to 0). Everything could easily

+; fit in sse2 without too much hassle, and psadbw or pavgb could probably give

+; better estimates. At this point this is just meant to be a first pass for

+; calculating all the parms needed for 16x16 ssim so we can play with dssim as

+; distortion in mode selection code.

+global sym(vp9_ssim_parms_16x16_sse2) PRIVATE

+sym(vp9_ssim_parms_16x16_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM 15

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp  NOTE(review): 64-bit load of an int argument - confirm the upper half is defined

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp  NOTE(review): same concern as sp

+    pxor            xmm0, xmm0   ;zero, used for unpacking and by the SUM_ACROSS macros

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 16      ;row counter

+.NextRow:

+    ;grab source and reference pixels

+    movdqu          xmm5, [rsi]  ; 16 source bytes, unaligned load

+    movdqu          xmm6, [rdi]  ; 16 reference bytes, unaligned load

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpckhbw       xmm3, xmm0 ; high_s

+    punpckhbw       xmm4, xmm0 ; high_r

+    TABULATE_SSIM

+    movdqa          xmm3, xmm5

+    movdqa          xmm4, xmm6

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz .NextRow

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    mov             rdi,arg(4)

+    movd            [rdi], xmm15; *sum_s: only 32 bits are written. NOTE(review): header says unsigned long - confirm caller zero-initializes

+    mov             rdi,arg(5)

+    movd            [rdi], xmm14; *sum_r, low 32 bits

+    mov             rdi,arg(6)

+    movd            [rdi], xmm13; *sum_sq_s, low 32 bits

+    mov             rdi,arg(7)

+    movd            [rdi], xmm12; *sum_sq_r, low 32 bits

+    mov             rdi,arg(8)

+    movd            [rdi], xmm11; *sum_sxr, low 32 bits

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp9_ssim_parms_8x8_sse2(

+;    unsigned char *s,

+;    int sp,

+;    unsigned char *r,

+;    int rp,

+;    unsigned long *sum_s,

+;    unsigned long *sum_r,

+;    unsigned long *sum_sq_s,

+;    unsigned long *sum_sq_r,

+;    unsigned long *sum_sxr);

+;

+; TODO: Pass the parameters through a structure; the pxors are probably

+; unnecessary (the calling app will initialize to 0). Everything could easily

+; fit in sse2 without too much hassle, and psadbw or pavgb could probably give

+; better estimates. At this point this is just meant to be a first pass for

+; calculating all the parms needed for 8x8 ssim so we can play with dssim as

+; distortion in mode selection code.

+global sym(vp9_ssim_parms_8x8_sse2) PRIVATE

+sym(vp9_ssim_parms_8x8_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 9

+    SAVE_XMM 15

+    push        rsi

+    push        rdi

+    ; end prolog

+    mov             rsi,        arg(0) ;s

+    mov             rcx,        arg(1) ;sp  NOTE(review): 64-bit load of an int argument - confirm the upper half is defined

+    mov             rdi,        arg(2) ;r

+    mov             rax,        arg(3) ;rp  NOTE(review): same concern as sp

+    pxor            xmm0, xmm0   ;zero, used for unpacking and by the SUM_ACROSS macros

+    pxor            xmm15,xmm15  ;sum_s

+    pxor            xmm14,xmm14  ;sum_r

+    pxor            xmm13,xmm13  ;sum_sq_s

+    pxor            xmm12,xmm12  ;sum_sq_r

+    pxor            xmm11,xmm11  ;sum_sxr

+    mov             rdx, 8      ;row counter

+.NextRow:

+    ;grab source and reference pixels

+    movq            xmm3, [rsi]  ; 8 source bytes

+    movq            xmm4, [rdi]  ; 8 reference bytes

+    punpcklbw       xmm3, xmm0 ; low_s

+    punpcklbw       xmm4, xmm0 ; low_r

+    TABULATE_SSIM

+    add             rsi, rcx   ; next s row

+    add             rdi, rax   ; next r row

+    dec             rdx        ; counter

+    jnz .NextRow

+    SUM_ACROSS_W    xmm15

+    SUM_ACROSS_W    xmm14

+    SUM_ACROSS_Q    xmm13

+    SUM_ACROSS_Q    xmm12

+    SUM_ACROSS_Q    xmm11

+    mov             rdi,arg(4)

+    movd            [rdi], xmm15; *sum_s: only 32 bits are written. NOTE(review): header says unsigned long - confirm caller zero-initializes

+    mov             rdi,arg(5)

+    movd            [rdi], xmm14; *sum_r, low 32 bits

+    mov             rdi,arg(6)

+    movd            [rdi], xmm13; *sum_sq_s, low 32 bits

+    mov             rdi,arg(7)

+    movd            [rdi], xmm12; *sum_sq_r, low 32 bits

+    mov             rdi,arg(8)

+    movd            [rdi], xmm11; *sum_sxr, low 32 bits

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -121,7 +121,7 @@

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c

 ifeq ($(ARCH_X86_64), yes)

-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3.asm

+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm

 endif

 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -112,12 +112,12 @@

 endif

 ifeq ($(ARCH_X86_64),yes)

-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm

-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.asm

+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm

+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm

 endif

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm

-VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm

+VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c